mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-12-16 23:50:12 +01:00
Add card browser with similar cards and performance optimizations
This commit is contained in:
parent
a8dc1835eb
commit
c2960c808e
25 changed files with 4841 additions and 1392 deletions
code/web/services/card_similarity.py (new file, 483 lines)
@@ -0,0 +1,483 @@
"""
Card similarity service using Jaccard index on theme tags.

Provides similarity scoring between cards based on theme tag overlap.
Used for "Similar Cards" feature in card browser.

Supports persistent caching for improved performance (2-6s → <500ms).

Uses "signature tags" approach: compares top 5 most frequent tags instead
of all tags, significantly improving performance and quality.
"""

import ast
import logging
import random
from pathlib import Path
from typing import Optional

import pandas as pd

from code.web.services.similarity_cache import SimilarityCache, get_cache

logger = logging.getLogger(__name__)


class CardSimilarity:
    """Calculate card similarity using theme tag overlap (Jaccard index) with caching."""

    def __init__(self, cards_df: Optional[pd.DataFrame] = None, cache: Optional[SimilarityCache] = None):
        """
        Initialize similarity calculator.

        Args:
            cards_df: DataFrame with card data. If None, loads from all_cards.parquet
            cache: SimilarityCache instance. If None, uses global singleton
        """
        if cards_df is None:
            # Load from default location
            parquet_path = Path(__file__).parents[3] / "card_files" / "all_cards.parquet"
            logger.info(f"Loading cards from {parquet_path}")
            self.cards_df = pd.read_parquet(parquet_path)
        else:
            self.cards_df = cards_df

        # Initialize cache
        self.cache = cache if cache is not None else get_cache()

        # Load theme frequencies from catalog
        self.theme_frequencies = self._load_theme_frequencies()

        # Pre-compute cleaned tags (with exclusions) for all cards (one-time cost, huge speedup)
        # This removes "Historics Matter" and "Legends Matter" from all cards
        self.cleaned_tags_cache = self._precompute_cleaned_tags()

        # Pre-compute card metadata (EDHREC rank) for fast lookups
        self._card_metadata = self._precompute_card_metadata()

        # Inverted index (tag -> set of card names) - built lazily on first use
        self._tag_to_cards_index = None

        logger.info(
            f"Initialized CardSimilarity with {len(self.cards_df)} cards "
            f"and {len(self.theme_frequencies)} theme frequencies "
            f"(cache: {'enabled' if self.cache.enabled else 'disabled'})"
        )

    def _load_theme_frequencies(self) -> dict[str, int]:
        """
        Load theme frequencies from theme_catalog.csv.

        Returns:
            Dict mapping theme name to card_count (higher = more common)
        """
        catalog_path = Path(__file__).parents[3] / "config" / "themes" / "theme_catalog.csv"

        try:
            # Read CSV, skipping comment line
            df = pd.read_csv(catalog_path, comment="#")

            # Create dict mapping theme -> card_count
            # Higher card_count = more common/frequent theme
            frequencies = dict(zip(df["theme"], df["card_count"]))

            logger.info(f"Loaded {len(frequencies)} theme frequencies from catalog")
            return frequencies

        except Exception as e:
            logger.warning(f"Failed to load theme frequencies: {e}, using empty dict")
            return {}

    def _precompute_cleaned_tags(self) -> dict[str, set[str]]:
        """
        Pre-compute cleaned tags for all cards.

        Removes overly common tags like "Historics Matter" and "Legends Matter"
        that don't provide meaningful similarity. This is done once during
        initialization to avoid recalculating for every comparison.

        Returns:
            Dict mapping card name -> cleaned tags (full set minus exclusions)
        """
        logger.info("Pre-computing cleaned tags for all cards...")
        excluded_tags = {"Historics Matter", "Legends Matter"}
        cleaned = {}

        for _, row in self.cards_df.iterrows():
            card_name = row["name"]
            tags = self.parse_theme_tags(row["themeTags"])

            if tags:
                # Remove excluded tags
                cleaned_tags = tags - excluded_tags
                if cleaned_tags:  # Only store if card has tags after exclusion
                    cleaned[card_name] = cleaned_tags

        logger.info(f"Pre-computed {len(cleaned)} card tag sets")
        return cleaned

    def _precompute_card_metadata(self) -> dict[str, dict]:
        """
        Pre-compute card metadata (EDHREC rank, etc.) for fast lookups.

        Returns:
            Dict mapping card name -> metadata dict
        """
        logger.info("Pre-computing card metadata...")
        metadata = {}

        for _, row in self.cards_df.iterrows():
            card_name = row["name"]
            edhrec_rank = row.get("edhrecRank")
            # Convert to float, use inf for NaN/None
            edhrec_rank = float(edhrec_rank) if pd.notna(edhrec_rank) else float('inf')

            metadata[card_name] = {
                "edhrecRank": edhrec_rank,
            }

        logger.info(f"Pre-computed metadata for {len(metadata)} cards")
        return metadata

    def _build_tag_index(self) -> None:
        """
        Build inverted index: tag -> set of card names that have this tag.

        This allows fast candidate filtering - instead of checking all 29k cards,
        we only check cards that share at least one tag with the target.

        Performance impact: Reduces 29k comparisons to typically 100-2000 comparisons.
        """
        logger.info("Building inverted tag index...")
        index = {}

        for card_name, tags in self.cleaned_tags_cache.items():
            for tag in tags:
                if tag not in index:
                    index[tag] = set()
                index[tag].add(card_name)

        self._tag_to_cards_index = index

        # Log statistics
        avg_cards_per_tag = sum(len(cards) for cards in index.values()) / len(index) if index else 0
        logger.info(
            f"Built tag index: {len(index)} unique tags, "
            f"avg {avg_cards_per_tag:.1f} cards per tag"
        )

    def get_signature_tags(
        self,
        card_tags: set[str],
        top_n: int = 5,
        random_n: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> set[str]:
        """
        Get signature tags for similarity comparison.

        Takes the most frequent (popular) tags PLUS random tags for diversity.
        This balances defining characteristics with discovery of niche synergies.

        Excludes overly common tags like "Historics Matter" and "Legends Matter"
        that appear on most legendary cards and don't provide meaningful similarity.

        Args:
            card_tags: Full set of card theme tags
            top_n: Number of most frequent tags to use (default 5)
            random_n: Number of random tags to add. If None, auto-scales:
                - 6-10 tags: 1 random
                - 11-15 tags: 2 random
                - 16+ tags: 3 random
            seed: Random seed for reproducibility (default: None)

        Returns:
            Set of signature tags (top_n most frequent + random_n random)
        """
        # Exclude overly common tags that don't provide meaningful similarity
        excluded_tags = {"Historics Matter", "Legends Matter"}
        card_tags = card_tags - excluded_tags

        if len(card_tags) <= top_n:
            return card_tags  # Use all if card has few tags

        # Auto-scale random_n based on total tag count if not specified
        if random_n is None:
            tag_count = len(card_tags)
            if tag_count >= 16:
                random_n = 3
            elif tag_count >= 11:
                random_n = 2
            elif tag_count >= 6:
                random_n = 1
            else:
                random_n = 0  # Very few tags, no random needed

        # Sort tags by frequency (higher card_count = more common = higher priority)
        sorted_tags = sorted(
            card_tags,
            key=lambda t: -self.theme_frequencies.get(t, 0),  # Negate for descending order
        )

        # Take top N most frequent tags
        signature = set(sorted_tags[:top_n])

        # Add random tags from remaining tags
        remaining_tags = card_tags - signature
        if remaining_tags and random_n > 0:
            if seed is not None:
                random.seed(seed)

            # Sample min(random_n, len(remaining_tags)) to avoid errors
            sample_size = min(random_n, len(remaining_tags))
            random_tags = set(random.sample(list(remaining_tags), sample_size))

            signature = signature | random_tags

        return signature

    @staticmethod
    def parse_theme_tags(tags: str | list) -> set[str]:
        """
        Parse theme tags from string or list format.

        Args:
            tags: Theme tags as string representation of list or actual list

        Returns:
            Set of theme tag strings
        """
        # Check for an actual list first: pd.isna() on a list returns an array,
        # which cannot be used directly in a boolean test.
        if isinstance(tags, list):
            return set(tags)

        if pd.isna(tags) or not tags:
            return set()

        if isinstance(tags, str):
            # Handle string representation of list: "['tag1', 'tag2']"
            try:
                parsed = ast.literal_eval(tags)
                if isinstance(parsed, list):
                    return set(parsed)
                return set()
            except (ValueError, SyntaxError):
                # If parsing fails, return empty set
                logger.warning(f"Failed to parse theme tags: {tags[:100]}")
                return set()

        return set()

    @staticmethod
    def calculate_similarity(tags_a: set[str], tags_b: set[str]) -> float:
        """
        Calculate Jaccard similarity between two sets of theme tags.

        Jaccard index = intersection / union

        Args:
            tags_a: First set of theme tags
            tags_b: Second set of theme tags

        Returns:
            Similarity score from 0.0 (no overlap) to 1.0 (identical)
        """
        if not tags_a or not tags_b:
            return 0.0

        intersection = len(tags_a & tags_b)
        union = len(tags_a | tags_b)

        if union == 0:
            return 0.0

        return intersection / union

    def get_card_tags(self, card_name: str) -> Optional[set[str]]:
        """
        Get theme tags for a specific card.

        Args:
            card_name: Name of the card

        Returns:
            Set of theme tags, or None if card not found
        """
        card_row = self.cards_df[self.cards_df["name"] == card_name]

        if card_row.empty:
            return None

        tags = card_row.iloc[0]["themeTags"]
        return self.parse_theme_tags(tags)

    def find_similar(
        self,
        card_name: str,
        threshold: float = 0.8,
        limit: int = 10,
        min_results: int = 3,
        adaptive: bool = True,
        use_cache: bool = True,
    ) -> list[dict]:
        """
        Find cards with similar theme tags.

        Uses adaptive threshold scaling to ensure a minimum number of results.
        Tries 80% → 60% → 50% thresholds until min_results is met (skips 70% for performance).

        Checks cache first for pre-computed results, falls back to real-time calculation.

        Args:
            card_name: Name of the target card
            threshold: Starting similarity threshold (0.0-1.0), default 0.8 (80%)
            limit: Maximum number of results, default 10
            min_results: Minimum desired results for adaptive scaling, default 3
            adaptive: Enable adaptive threshold scaling, default True
            use_cache: Check cache first before calculating, default True

        Returns:
            List of dicts with keys: name, similarity, themeTags, edhrecRank, threshold_used
            Sorted by similarity descending, then by EDHREC rank ascending (more popular first)
            Returns empty list if card not found or has no tags
        """
        # Check cache first
        if use_cache and self.cache.enabled:
            cached_results = self.cache.get_similar(card_name, limit=limit, randomize=True)
            if cached_results is not None:
                logger.info(f"Cache HIT for '{card_name}' ({len(cached_results)} results, randomized)")
                return cached_results
            else:
                logger.info(f"Cache MISS for '{card_name}', calculating...")

        # Get target card tags
        target_tags = self.get_card_tags(card_name)

        if target_tags is None:
            logger.warning(f"Card not found: {card_name}")
            return []

        if not target_tags:
            logger.info(f"Card has no theme tags: {card_name}")
            return []

        # Get signature tags for TARGET card only (top 5 most frequent + 1-3 random)
        # This focuses the search on the target's defining characteristics
        # with some diversity from random tags

        # Use card name hash as seed for reproducible randomness per card
        card_seed = hash(card_name) % (2**31)
        target_signature = self.get_signature_tags(
            target_tags,
            top_n=5,
            seed=card_seed
        )

        logger.debug(
            f"Target '{card_name}': {len(target_tags)} tags → "
            f"{len(target_signature)} signature tags"
        )

        # Try adaptive thresholds if enabled
        thresholds_to_try = [threshold]
        if adaptive:
            # Build list of thresholds to try: 80% → 60% → 50% (skip 70% for performance)
            thresholds_to_try = []
            if threshold >= 0.8:
                thresholds_to_try.append(0.8)
            if threshold >= 0.6:
                thresholds_to_try.append(0.6)
            if threshold >= 0.5:
                thresholds_to_try.append(0.5)

            # Remove duplicates and sort descending
            thresholds_to_try = sorted(set(thresholds_to_try), reverse=True)

        results = []
        threshold_used = threshold

        for current_threshold in thresholds_to_try:
            # Use inverted index for fast candidate filtering
            # Instead of checking all 29k cards, only check cards that share at least one signature tag
            results = []

            # Build inverted index on first use (lazily)
            if self._tag_to_cards_index is None:
                self._build_tag_index()

            # Get candidate cards that share at least one signature tag
            # This drastically reduces the number of cards we need to check
            candidate_cards = set()
            for tag in target_signature:
                if tag in self._tag_to_cards_index:
                    candidate_cards.update(self._tag_to_cards_index[tag])

            # Remove the target card itself
            candidate_cards.discard(card_name)

            if not candidate_cards:
                continue  # No candidates at all, try lower threshold

            # Now calculate scores only for candidates (vectorized where possible)
            # Pre-filter candidates by checking if they meet minimum overlap requirement
            min_overlap = int(len(target_signature) * current_threshold)

            for candidate_name in candidate_cards:
                candidate_tags = self.cleaned_tags_cache.get(candidate_name)

                if not candidate_tags:
                    continue

                # Fast overlap check using set intersection
                overlap = target_signature & candidate_tags
                overlap_count = len(overlap)

                # Quick filter: skip if overlap too small
                if overlap_count < min_overlap:
                    continue

                # Calculate exact containment score
                containment_score = overlap_count / len(target_signature)

                if containment_score >= current_threshold:
                    # Get EDHREC rank efficiently from card metadata
                    edhrec_rank = self._card_metadata.get(candidate_name, {}).get('edhrecRank', float('inf'))

                    results.append({
                        "name": candidate_name,
                        "similarity": containment_score,
                        "themeTags": list(candidate_tags),
                        "edhrecRank": edhrec_rank,
                    })

            # Sort by similarity descending, then by EDHREC rank ascending (lower is better)
            # Unranked cards (inf) will appear last
            results.sort(key=lambda x: (-x["similarity"], x["edhrecRank"]))

            # Check if we have enough results
            if len(results) >= min_results or not adaptive:
                threshold_used = current_threshold
                break

            # Log that we're trying a lower threshold
            logger.info(
                f"Found {len(results)} results at {current_threshold:.0%} "
                f"for '{card_name}', trying lower threshold..."
            )

        # Add threshold_used to results
        for result in results:
            result["threshold_used"] = threshold_used

        logger.info(
            f"Found {len(results)} similar cards for '{card_name}' "
            f"at {threshold_used:.0%} threshold"
        )

        final_results = results[:limit]

        # Cache the results for future lookups
        if use_cache and self.cache.enabled and final_results:
            self.cache.set_similar(card_name, final_results)
            logger.debug(f"Cached {len(final_results)} results for '{card_name}'")

        return final_results
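
To make the scoring above concrete, here is a small, self-contained sketch of the approach the file describes: keep the most frequent tags as a "signature", then score each candidate by containment (shared tags divided by signature size), with Jaccard shown for comparison. The theme names and frequency numbers are invented for illustration and are not taken from the real theme catalog; the signature is trimmed to three tags to keep the arithmetic visible.

# Toy illustration of signature tags + containment scoring (made-up data).
theme_frequencies = {"Tokens Matter": 4200, "Lifegain": 3000, "Sacrifice Matters": 2100, "Aristocrats": 900, "Clones": 150}

target_tags = {"Tokens Matter", "Sacrifice Matters", "Aristocrats", "Lifegain", "Clones", "Historics Matter"}
excluded = {"Historics Matter", "Legends Matter"}

# Signature: most frequent tags after exclusions (the service uses top 5 plus a few random ones).
cleaned = target_tags - excluded
signature = set(sorted(cleaned, key=lambda t: -theme_frequencies.get(t, 0))[:3])
# -> {"Tokens Matter", "Lifegain", "Sacrifice Matters"}

candidate_tags = {"Tokens Matter", "Sacrifice Matters", "Treasure"}
overlap = signature & candidate_tags
containment = len(overlap) / len(signature)            # 2 / 3 ≈ 0.67: passes a 60% threshold, fails 80%
jaccard = len(overlap) / len(signature | candidate_tags)  # 2 / 4 = 0.5
print(containment, jaccard)

Note that find_similar() ranks candidates by this containment score (overlap over signature size), while calculate_similarity() implements the symmetric Jaccard index; the example shows why the two values differ for the same pair of tag sets.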
code/web/services/similarity_cache.py (new file, 386 lines)
@@ -0,0 +1,386 @@
"""
Similarity cache manager for card similarity calculations.

Provides persistent caching of pre-computed card similarity scores to improve
card detail page load times from 2-6s down to <500ms.

Cache format: Parquet file with columnar structure:
- card_name: str (source card)
- similar_name: str (similar card name)
- similarity: float (similarity score)
- edhrecRank: float (EDHREC rank of similar card)
- rank: int (ranking position, 0-19 for top 20)

Metadata stored in separate JSON sidecar file.

Benefits vs JSON:
- 5-10x faster load times
- 50-70% smaller file size
- Better compression for large datasets
- Consistent with other card data storage
"""

import json
import logging
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# Default cache settings
CACHE_VERSION = "2.0"  # Bumped for Parquet format
DEFAULT_CACHE_PATH = Path(__file__).parents[3] / "card_files" / "similarity_cache.parquet"
DEFAULT_METADATA_PATH = Path(__file__).parents[3] / "card_files" / "similarity_cache_metadata.json"


class SimilarityCache:
    """Manages persistent cache for card similarity calculations using Parquet."""

    def __init__(self, cache_path: Optional[Path] = None, enabled: bool = True):
        """
        Initialize similarity cache manager.

        Args:
            cache_path: Path to cache file. If None, uses DEFAULT_CACHE_PATH
            enabled: Whether cache is enabled (can be disabled via env var)
        """
        self.cache_path = cache_path or DEFAULT_CACHE_PATH
        self.metadata_path = self.cache_path.with_name(
            self.cache_path.stem + "_metadata.json"
        )
        self.enabled = enabled and os.getenv("SIMILARITY_CACHE_ENABLED", "1") == "1"
        self._cache_df: Optional[pd.DataFrame] = None
        self._metadata: Optional[dict] = None

        # Ensure cache directory exists
        self.cache_path.parent.mkdir(parents=True, exist_ok=True)

        if self.enabled:
            logger.info(f"SimilarityCache initialized at {self.cache_path}")
        else:
            logger.info("SimilarityCache disabled")

    def load_cache(self) -> pd.DataFrame:
        """
        Load cache from disk.

        Returns:
            DataFrame with columns: card_name, similar_name, similarity, edhrecRank, rank
            Returns empty DataFrame if file doesn't exist or loading fails
        """
        if not self.enabled:
            return self._empty_cache_df()

        if self._cache_df is not None:
            return self._cache_df

        if not self.cache_path.exists():
            logger.info("Cache file not found, returning empty cache")
            self._cache_df = self._empty_cache_df()
            return self._cache_df

        try:
            # Load Parquet file
            self._cache_df = pq.read_table(self.cache_path).to_pandas()

            # Load metadata
            if self.metadata_path.exists():
                with open(self.metadata_path, "r", encoding="utf-8") as f:
                    self._metadata = json.load(f)
            else:
                self._metadata = self._empty_metadata()

            # Validate cache structure
            if not self._validate_cache(self._cache_df):
                logger.warning("Cache validation failed, returning empty cache")
                self._cache_df = self._empty_cache_df()
                return self._cache_df

            total_cards = len(self._cache_df["card_name"].unique()) if len(self._cache_df) > 0 else 0
            logger.info(
                f"Loaded similarity cache v{self._metadata.get('version', 'unknown')} with {total_cards:,} cards ({len(self._cache_df):,} entries)"
            )

            return self._cache_df

        except Exception as e:
            logger.error(f"Failed to load cache: {e}")
            self._cache_df = self._empty_cache_df()
            return self._cache_df

    def save_cache(self, cache_df: pd.DataFrame, metadata: Optional[dict] = None) -> bool:
        """
        Save cache to disk.

        Args:
            cache_df: DataFrame with similarity data
            metadata: Optional metadata dict. If None, uses current metadata with updates.

        Returns:
            True if save successful, False otherwise
        """
        if not self.enabled:
            logger.debug("Cache disabled, skipping save")
            return False

        try:
            # Ensure directory exists
            self.cache_path.parent.mkdir(parents=True, exist_ok=True)

            # Update metadata
            if metadata is None:
                metadata = self._metadata or self._empty_metadata()

            total_cards = len(cache_df["card_name"].unique()) if len(cache_df) > 0 else 0
            metadata["total_cards"] = total_cards
            metadata["last_updated"] = datetime.now().isoformat()
            metadata["total_entries"] = len(cache_df)

            # Write Parquet file (with compression)
            temp_cache = self.cache_path.with_suffix(".tmp")
            pq.write_table(
                pa.table(cache_df),
                temp_cache,
                compression="snappy",
                version="2.6",
            )
            temp_cache.replace(self.cache_path)

            # Write metadata file
            temp_meta = self.metadata_path.with_suffix(".tmp")
            with open(temp_meta, "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)
            temp_meta.replace(self.metadata_path)

            self._cache_df = cache_df
            self._metadata = metadata

            logger.info(f"Saved similarity cache with {total_cards:,} cards ({len(cache_df):,} entries)")

            return True

        except Exception as e:
            logger.error(f"Failed to save cache: {e}")
            return False

    def get_similar(self, card_name: str, limit: int = 5, randomize: bool = True) -> Optional[list[dict]]:
        """
        Get cached similar cards for a given card.

        Args:
            card_name: Name of the card to look up
            limit: Maximum number of results to return
            randomize: If True, randomly sample from cached results; if False, return top by rank

        Returns:
            List of similar cards with similarity scores, or None if not in cache
        """
        if not self.enabled:
            return None

        cache_df = self.load_cache()

        if len(cache_df) == 0:
            return None

        # Filter to this card
        card_data = cache_df[cache_df["card_name"] == card_name]

        if len(card_data) == 0:
            return None

        # Randomly sample if requested and we have more results than limit
        if randomize and len(card_data) > limit:
            card_data = card_data.sample(n=limit, random_state=None)
        else:
            # Sort by rank and take top N
            card_data = card_data.sort_values("rank").head(limit)

        # Convert to list of dicts
        results = []
        for _, row in card_data.iterrows():
            results.append({
                "name": row["similar_name"],
                "similarity": row["similarity"],
                "edhrecRank": row["edhrecRank"],
            })

        return results

    def set_similar(self, card_name: str, similar_cards: list[dict]) -> bool:
        """
        Cache similar cards for a given card.

        Args:
            card_name: Name of the card
            similar_cards: List of similar cards with similarity scores

        Returns:
            True if successful, False otherwise
        """
        if not self.enabled:
            return False

        cache_df = self.load_cache()

        # Remove existing entries for this card
        cache_df = cache_df[cache_df["card_name"] != card_name]

        # Add new entries
        new_rows = []
        for rank, card in enumerate(similar_cards):
            new_rows.append({
                "card_name": card_name,
                "similar_name": card["name"],
                "similarity": card["similarity"],
                "edhrecRank": card.get("edhrecRank", float("inf")),
                "rank": rank,
            })

        if new_rows:
            new_df = pd.DataFrame(new_rows)
            cache_df = pd.concat([cache_df, new_df], ignore_index=True)

        return self.save_cache(cache_df)

    def invalidate(self, card_name: Optional[str] = None) -> bool:
        """
        Invalidate cache entries.

        Args:
            card_name: If provided, invalidate only this card. If None, clear entire cache.

        Returns:
            True if successful, False otherwise
        """
        if not self.enabled:
            return False

        if card_name is None:
            # Clear entire cache
            logger.info("Clearing entire similarity cache")
            self._cache_df = self._empty_cache_df()
            self._metadata = self._empty_metadata()
            return self.save_cache(self._cache_df, self._metadata)

        # Clear specific card
        cache_df = self.load_cache()

        initial_len = len(cache_df)
        cache_df = cache_df[cache_df["card_name"] != card_name]

        if len(cache_df) < initial_len:
            logger.info(f"Invalidated cache for card: {card_name}")
            return self.save_cache(cache_df)

        return False

    def get_stats(self) -> dict:
        """
        Get cache statistics.

        Returns:
            Dictionary with cache stats (version, total_cards, build_date, file_size, etc.)
        """
        if not self.enabled:
            return {"enabled": False}

        cache_df = self.load_cache()
        metadata = self._metadata or self._empty_metadata()

        stats = {
            "enabled": True,
            "version": metadata.get("version", "unknown"),
            "total_cards": len(cache_df["card_name"].unique()) if len(cache_df) > 0 else 0,
            "total_entries": len(cache_df),
            "build_date": metadata.get("build_date"),
            "last_updated": metadata.get("last_updated"),
            "file_exists": self.cache_path.exists(),
            "file_path": str(self.cache_path),
            "format": "parquet",
        }

        if self.cache_path.exists():
            stats["file_size_mb"] = round(
                self.cache_path.stat().st_size / (1024 * 1024), 2
            )

        return stats

    @staticmethod
    def _empty_cache_df() -> pd.DataFrame:
        """
        Create empty cache DataFrame.

        Returns:
            Empty DataFrame with correct schema
        """
        return pd.DataFrame(columns=["card_name", "similar_name", "similarity", "edhrecRank", "rank"])

    @staticmethod
    def _empty_metadata() -> dict:
        """
        Create empty metadata structure.

        Returns:
            Empty metadata dictionary
        """
        return {
            "version": CACHE_VERSION,
            "total_cards": 0,
            "total_entries": 0,
            "build_date": None,
            "last_updated": None,
            "threshold": 0.6,
            "min_results": 3,
        }

    @staticmethod
    def _validate_cache(cache_df: pd.DataFrame) -> bool:
        """
        Validate cache DataFrame structure.

        Args:
            cache_df: DataFrame to validate

        Returns:
            True if valid, False otherwise
        """
        if not isinstance(cache_df, pd.DataFrame):
            return False

        # Check required columns
        required_cols = {"card_name", "similar_name", "similarity", "edhrecRank", "rank"}
        if not required_cols.issubset(cache_df.columns):
            logger.warning(f"Cache missing required columns. Expected: {required_cols}, Got: {set(cache_df.columns)}")
            return False

        return True


# Singleton instance for global access
_cache_instance: Optional[SimilarityCache] = None


def get_cache() -> SimilarityCache:
    """
    Get singleton cache instance.

    Returns:
        Global SimilarityCache instance
    """
    global _cache_instance

    if _cache_instance is None:
        # Check environment variables for custom path
        cache_path_str = os.getenv("SIMILARITY_CACHE_PATH")
        cache_path = Path(cache_path_str) if cache_path_str else None

        _cache_instance = SimilarityCache(cache_path=cache_path)

    return _cache_instance
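
The cache class above lends itself to a simple round trip. The following is a minimal sketch of how the two main methods could be exercised in isolation; the card names are placeholders and the throwaway temp path is only there so the example does not touch the real cache file under card_files/.

from pathlib import Path
import tempfile

from code.web.services.similarity_cache import SimilarityCache

# Point the cache at a temporary file so the example is self-contained.
cache = SimilarityCache(cache_path=Path(tempfile.mkdtemp()) / "similarity_cache.parquet")

# Store a couple of hypothetical similar-card entries for one card.
cache.set_similar("Example Commander", [
    {"name": "Placeholder Card A", "similarity": 0.8, "edhrecRank": 1234.0},
    {"name": "Placeholder Card B", "similarity": 0.6, "edhrecRank": 5678.0},
])

# Read them back; with randomize=False the entries come back in rank order.
print(cache.get_similar("Example Commander", limit=2, randomize=False))
print(cache.get_stats())

In normal operation the services use the get_cache() singleton (optionally redirected via SIMILARITY_CACHE_PATH, or disabled via SIMILARITY_CACHE_ENABLED=0) rather than constructing SimilarityCache directly.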
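Since find_similar() writes fresh results back through set_similar(), the persistent cache can in principle be warmed offline before serving traffic. The loop below is a hypothetical warm-up pass, not something this commit ships: the idea of iterating every card name and the limit of 20 (matching the "top 20" noted in the cache docstring) are assumptions for illustration.

# Hypothetical cache warm-up script (illustrative only, not part of this commit).
import logging

from code.web.services.card_similarity import CardSimilarity

logging.basicConfig(level=logging.INFO)

similarity = CardSimilarity()  # loads all_cards.parquet and builds the tag caches

for i, card_name in enumerate(similarity.cards_df["name"].drop_duplicates()):
    # find_similar() checks the cache first and stores new results via set_similar()
    similarity.find_similar(card_name, threshold=0.8, limit=20, use_cache=True)
    if i % 500 == 0:
        logging.info("Warmed %d cards", i)

Note that set_similar() rewrites the whole Parquet file on every call, so a real batch build would more likely accumulate rows and call save_cache() once at the end; the loop above is only meant to show how the two services connect.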