Add card browser with similar cards and performance optimizations

matt 2025-10-17 16:17:36 -07:00
parent a8dc1835eb
commit c2960c808e
25 changed files with 4841 additions and 1392 deletions

@@ -0,0 +1,386 @@
"""
Similarity cache manager for card similarity calculations.
Provides persistent caching of pre-computed card similarity scores to improve
card detail page load times from 2-6s down to <500ms.
Cache format: Parquet file with columnar structure:
- card_name: str (source card)
- similar_name: str (similar card name)
- similarity: float (similarity score)
- edhrecRank: float (EDHREC rank of similar card)
- rank: int (ranking position, 0-19 for top 20)
Metadata stored in separate JSON sidecar file.
Benefits vs JSON:
- 5-10x faster load times
- 50-70% smaller file size
- Better compression for large datasets
- Consistent with other card data storage
"""
import json
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Optional

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
logger = logging.getLogger(__name__)
# Default cache settings
CACHE_VERSION = "2.0" # Bumped for Parquet format
DEFAULT_CACHE_PATH = Path(__file__).parents[3] / "card_files" / "similarity_cache.parquet"
DEFAULT_METADATA_PATH = Path(__file__).parents[3] / "card_files" / "similarity_cache_metadata.json"
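# With parents[3], both defaults resolve to <repo_root>/card_files/ assuming
# this module sits three directories below the repository root. Note that
# __init__ below derives the metadata sidecar name from whatever cache path is
# in use, so DEFAULT_METADATA_PATH only documents the default location.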
class SimilarityCache:
"""Manages persistent cache for card similarity calculations using Parquet."""
def __init__(self, cache_path: Optional[Path] = None, enabled: bool = True):
"""
Initialize similarity cache manager.
Args:
cache_path: Path to cache file. If None, uses DEFAULT_CACHE_PATH
enabled: Whether cache is enabled (can be disabled via env var)
"""
self.cache_path = cache_path or DEFAULT_CACHE_PATH
self.metadata_path = self.cache_path.with_name(
self.cache_path.stem + "_metadata.json"
)
self.enabled = enabled and os.getenv("SIMILARITY_CACHE_ENABLED", "1") == "1"
self._cache_df: Optional[pd.DataFrame] = None
self._metadata: Optional[dict] = None
# Ensure cache directory exists
self.cache_path.parent.mkdir(parents=True, exist_ok=True)
if self.enabled:
logger.info(f"SimilarityCache initialized at {self.cache_path}")
else:
logger.info("SimilarityCache disabled")
def load_cache(self) -> pd.DataFrame:
"""
Load cache from disk.
Returns:
DataFrame with columns: card_name, similar_name, similarity, edhrecRank, rank
Returns empty DataFrame if file doesn't exist or loading fails
"""
if not self.enabled:
return self._empty_cache_df()
if self._cache_df is not None:
return self._cache_df
if not self.cache_path.exists():
logger.info("Cache file not found, returning empty cache")
self._cache_df = self._empty_cache_df()
return self._cache_df
try:
# Load Parquet file
self._cache_df = pq.read_table(self.cache_path).to_pandas()
# Load metadata
if self.metadata_path.exists():
with open(self.metadata_path, "r", encoding="utf-8") as f:
self._metadata = json.load(f)
else:
self._metadata = self._empty_metadata()
# Validate cache structure
if not self._validate_cache(self._cache_df):
logger.warning("Cache validation failed, returning empty cache")
self._cache_df = self._empty_cache_df()
return self._cache_df
total_cards = self._cache_df["card_name"].nunique()
logger.info(
f"Loaded similarity cache v{self._metadata.get('version', 'unknown')} with {total_cards:,} cards ({len(self._cache_df):,} entries)"
)
return self._cache_df
except Exception as e:
logger.error(f"Failed to load cache: {e}")
self._cache_df = self._empty_cache_df()
return self._cache_df
def save_cache(self, cache_df: pd.DataFrame, metadata: Optional[dict] = None) -> bool:
"""
Save cache to disk.
Args:
cache_df: DataFrame with similarity data
metadata: Optional metadata dict. If None, uses current metadata with updates.
Returns:
True if save successful, False otherwise
"""
if not self.enabled:
logger.debug("Cache disabled, skipping save")
return False
try:
# Ensure directory exists
self.cache_path.parent.mkdir(parents=True, exist_ok=True)
# Update metadata
if metadata is None:
metadata = self._metadata or self._empty_metadata()
total_cards = cache_df["card_name"].nunique()
metadata["total_cards"] = total_cards
metadata["last_updated"] = datetime.now().isoformat()
metadata["total_entries"] = len(cache_df)
# Write the Parquet file atomically (snappy compression): write to a temp
# file, then swap it into place so a failed write never leaves a truncated cache
temp_cache = self.cache_path.with_suffix(".tmp")
pq.write_table(
pa.table(cache_df),
temp_cache,
compression="snappy",
version="2.6",
)
temp_cache.replace(self.cache_path)
# Write metadata file
temp_meta = self.metadata_path.with_suffix(".tmp")
with open(temp_meta, "w", encoding="utf-8") as f:
json.dump(metadata, f, indent=2, ensure_ascii=False)
temp_meta.replace(self.metadata_path)
self._cache_df = cache_df
self._metadata = metadata
logger.info(f"Saved similarity cache with {total_cards:,} cards ({len(cache_df):,} entries)")
return True
except Exception as e:
logger.error(f"Failed to save cache: {e}")
return False
def get_similar(self, card_name: str, limit: int = 5, randomize: bool = True) -> Optional[list[dict]]:
"""
Get cached similar cards for a given card.
Args:
card_name: Name of the card to look up
limit: Maximum number of results to return
randomize: If True, randomly sample from cached results; if False, return top by rank
Returns:
List of similar cards with similarity scores, or None if not in cache
"""
if not self.enabled:
return None
cache_df = self.load_cache()
if len(cache_df) == 0:
return None
# Filter to this card
card_data = cache_df[cache_df["card_name"] == card_name]
if len(card_data) == 0:
return None
# Randomly sample if requested and we have more results than limit
if randomize and len(card_data) > limit:
card_data = card_data.sample(n=limit, random_state=None)
else:
# Sort by rank and take top N
card_data = card_data.sort_values("rank").head(limit)
# Convert to list of dicts
results = []
for _, row in card_data.iterrows():
results.append({
"name": row["similar_name"],
"similarity": row["similarity"],
"edhrecRank": row["edhrecRank"],
})
return results
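# Illustrative call (hypothetical data): get_similar("Sol Ring", limit=2,
# randomize=False) would return the two lowest-rank rows, e.g.
#     [{"name": "Arcane Signet", "similarity": 0.87, "edhrecRank": 3.0},
#      {"name": "Thought Vessel", "similarity": 0.81, "edhrecRank": 112.0}]
# and None when the card has no cached rows.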
def set_similar(self, card_name: str, similar_cards: list[dict]) -> bool:
"""
Cache similar cards for a given card.
Args:
card_name: Name of the card
similar_cards: List of similar cards with similarity scores
Returns:
True if successful, False otherwise
"""
if not self.enabled:
return False
cache_df = self.load_cache()
# Remove existing entries for this card
cache_df = cache_df[cache_df["card_name"] != card_name]
# Add new entries
new_rows = []
for rank, card in enumerate(similar_cards):
new_rows.append({
"card_name": card_name,
"similar_name": card["name"],
"similarity": card["similarity"],
"edhrecRank": card.get("edhrecRank", float("inf")),
"rank": rank,
})
if new_rows:
new_df = pd.DataFrame(new_rows)
cache_df = pd.concat([cache_df, new_df], ignore_index=True)
return self.save_cache(cache_df)
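# Expected input shape (a sketch; values are illustrative):
#     cache.set_similar("Lightning Bolt", [
#         {"name": "Lightning Strike", "similarity": 0.93, "edhrecRank": 2400.0},
#         {"name": "Shock", "similarity": 0.90},  # edhrecRank optional; defaults to inf
#     ])
# Rank is assigned from list order (0 = best match), so callers should pass
# results pre-sorted by similarity.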
def invalidate(self, card_name: Optional[str] = None) -> bool:
"""
Invalidate cache entries.
Args:
card_name: If provided, invalidate only this card. If None, clear entire cache.
Returns:
True if successful, False otherwise
"""
if not self.enabled:
return False
if card_name is None:
# Clear entire cache
logger.info("Clearing entire similarity cache")
self._cache_df = self._empty_cache_df()
self._metadata = self._empty_metadata()
return self.save_cache(self._cache_df, self._metadata)
# Clear specific card
cache_df = self.load_cache()
initial_len = len(cache_df)
cache_df = cache_df[cache_df["card_name"] != card_name]
if len(cache_df) < initial_len:
logger.info(f"Invalidated cache for card: {card_name}")
return self.save_cache(cache_df)
return False
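# Usage sketch: invalidate("Sol Ring") rewrites the cache without that card's
# rows and returns True only if rows were actually removed; invalidate() with
# no argument resets the cache to an empty DataFrame plus fresh metadata.
# Either way the result is persisted immediately via save_cache().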
def get_stats(self) -> dict:
"""
Get cache statistics.
Returns:
Dictionary with cache stats (version, total_cards, build_date, file_size, etc.)
"""
if not self.enabled:
return {"enabled": False}
cache_df = self.load_cache()
metadata = self._metadata or self._empty_metadata()
stats = {
"enabled": True,
"version": metadata.get("version", "unknown"),
"total_cards": len(cache_df["card_name"].unique()) if len(cache_df) > 0 else 0,
"total_entries": len(cache_df),
"build_date": metadata.get("build_date"),
"last_updated": metadata.get("last_updated"),
"file_exists": self.cache_path.exists(),
"file_path": str(self.cache_path),
"format": "parquet",
}
if self.cache_path.exists():
stats["file_size_mb"] = round(
self.cache_path.stat().st_size / (1024 * 1024), 2
)
return stats
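# A stats payload might look like this (all values illustrative):
#     {"enabled": True, "version": "2.0", "total_cards": 28714,
#      "total_entries": 574280, "build_date": None,
#      "last_updated": "2025-10-17T16:20:00", "file_exists": True,
#      "file_path": ".../card_files/similarity_cache.parquet",
#      "format": "parquet", "file_size_mb": 41.3}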
@staticmethod
def _empty_cache_df() -> pd.DataFrame:
"""
Create empty cache DataFrame.
Returns:
Empty DataFrame with correct schema
"""
return pd.DataFrame(columns=["card_name", "similar_name", "similarity", "edhrecRank", "rank"])
@staticmethod
def _empty_metadata() -> dict:
"""
Create empty metadata structure.
Returns:
Empty metadata dictionary
"""
return {
"version": CACHE_VERSION,
"total_cards": 0,
"total_entries": 0,
"build_date": None,
"last_updated": None,
"threshold": 0.6,
"min_results": 3,
}
@staticmethod
def _validate_cache(cache_df: pd.DataFrame) -> bool:
"""
Validate cache DataFrame structure.
Args:
cache_df: DataFrame to validate
Returns:
True if valid, False otherwise
"""
if not isinstance(cache_df, pd.DataFrame):
return False
# Check required columns
required_cols = {"card_name", "similar_name", "similarity", "edhrecRank", "rank"}
if not required_cols.issubset(cache_df.columns):
logger.warning(f"Cache missing required columns. Expected: {required_cols}, Got: {set(cache_df.columns)}")
return False
return True
# Singleton instance for global access
_cache_instance: Optional[SimilarityCache] = None
def get_cache() -> SimilarityCache:
"""
Get singleton cache instance.
Returns:
Global SimilarityCache instance
"""
global _cache_instance
if _cache_instance is None:
# Check environment variable for a custom cache path
cache_path_str = os.getenv("SIMILARITY_CACHE_PATH")
cache_path = Path(cache_path_str) if cache_path_str else None
_cache_instance = SimilarityCache(cache_path=cache_path)
return _cache_instance
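# Configuration sketch: two environment variables are honored (both read
# above). SIMILARITY_CACHE_ENABLED ("1" on, anything else off) gates all cache
# reads/writes, and SIMILARITY_CACHE_PATH overrides the default cache file:
#
#     SIMILARITY_CACHE_PATH=/tmp/sim.parquet SIMILARITY_CACHE_ENABLED=1 python -m app
#
# (`python -m app` is a placeholder for whatever entry point imports this
# module.)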