mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-12-17 08:00:13 +01:00
Add card browser with similar cards and performance optimizations
This commit is contained in:
parent a8dc1835eb
commit c2960c808e
25 changed files with 4841 additions and 1392 deletions
483
code/web/services/card_similarity.py
Normal file
@@ -0,0 +1,483 @@
"""
Card similarity service using Jaccard index on theme tags.

Provides similarity scoring between cards based on theme tag overlap.
Used for the "Similar Cards" feature in the card browser.

Supports persistent caching for improved performance (2-6s → <500ms).

Uses a "signature tags" approach: compares the top 5 most frequent tags
instead of all tags, significantly improving performance and quality.
"""

import ast
import logging
import random
import zlib
from pathlib import Path
from typing import Optional

import pandas as pd

from code.web.services.similarity_cache import SimilarityCache, get_cache

logger = logging.getLogger(__name__)


class CardSimilarity:
    """Calculate card similarity using theme tag overlap (Jaccard index) with caching."""

    def __init__(self, cards_df: Optional[pd.DataFrame] = None, cache: Optional[SimilarityCache] = None):
        """
        Initialize the similarity calculator.

        Args:
            cards_df: DataFrame with card data. If None, loads from all_cards.parquet.
            cache: SimilarityCache instance. If None, uses the global singleton.
        """
        if cards_df is None:
            # Load from the default location
            parquet_path = Path(__file__).parents[3] / "card_files" / "all_cards.parquet"
            logger.info(f"Loading cards from {parquet_path}")
            self.cards_df = pd.read_parquet(parquet_path)
        else:
            self.cards_df = cards_df

        # Initialize cache
        self.cache = cache if cache is not None else get_cache()

        # Load theme frequencies from the catalog
        self.theme_frequencies = self._load_theme_frequencies()

        # Pre-compute cleaned tags (with exclusions) for all cards (one-time cost, huge speedup).
        # This removes "Historics Matter" and "Legends Matter" from all cards.
        self.cleaned_tags_cache = self._precompute_cleaned_tags()

        # Pre-compute card metadata (EDHREC rank) for fast lookups
        self._card_metadata = self._precompute_card_metadata()

        # Inverted index (tag -> set of card names) - built lazily on first use
        self._tag_to_cards_index = None

        logger.info(
            f"Initialized CardSimilarity with {len(self.cards_df)} cards "
            f"and {len(self.theme_frequencies)} theme frequencies "
            f"(cache: {'enabled' if self.cache.enabled else 'disabled'})"
        )

    def _load_theme_frequencies(self) -> dict[str, int]:
        """
        Load theme frequencies from theme_catalog.csv.

        Returns:
            Dict mapping theme name to card_count (higher = more common)
        """
        catalog_path = Path(__file__).parents[3] / "config" / "themes" / "theme_catalog.csv"

        try:
            # Read the CSV, skipping comment lines
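            # Expected columns (per the zip below): "theme" and "card_count";
            # a row like "Tokens,4215" (the count here is illustrative only).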
            df = pd.read_csv(catalog_path, comment="#")

            # Create a dict mapping theme -> card_count.
            # Higher card_count = more common/frequent theme.
            frequencies = dict(zip(df["theme"], df["card_count"]))

            logger.info(f"Loaded {len(frequencies)} theme frequencies from catalog")
            return frequencies

        except Exception as e:
            logger.warning(f"Failed to load theme frequencies: {e}, using empty dict")
            return {}

    def _precompute_cleaned_tags(self) -> dict[str, set[str]]:
        """
        Pre-compute cleaned tags for all cards.

        Removes overly common tags like "Historics Matter" and "Legends Matter"
        that don't provide meaningful similarity. This is done once during
        initialization to avoid recalculating for every comparison.

        Returns:
            Dict mapping card name -> cleaned tags (full set minus exclusions)
        """
        logger.info("Pre-computing cleaned tags for all cards...")
        excluded_tags = {"Historics Matter", "Legends Matter"}
        cleaned = {}
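
        # Note: iterrows() is relatively slow, but this runs once at
        # startup, so the one-time cost is acceptable.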
        for _, row in self.cards_df.iterrows():
            card_name = row["name"]
            tags = self.parse_theme_tags(row["themeTags"])

            if tags:
                # Remove excluded tags
                cleaned_tags = tags - excluded_tags
                if cleaned_tags:  # Only store if the card has tags after exclusion
                    cleaned[card_name] = cleaned_tags

        logger.info(f"Pre-computed {len(cleaned)} card tag sets")
        return cleaned

    def _precompute_card_metadata(self) -> dict[str, dict]:
        """
        Pre-compute card metadata (EDHREC rank, etc.) for fast lookups.

        Returns:
            Dict mapping card name -> metadata dict
        """
        logger.info("Pre-computing card metadata...")
        metadata = {}

        for _, row in self.cards_df.iterrows():
            card_name = row["name"]
            edhrec_rank = row.get("edhrecRank")
            # Convert to float; use inf for NaN/None so unranked cards sort last
            edhrec_rank = float(edhrec_rank) if pd.notna(edhrec_rank) else float("inf")

            metadata[card_name] = {
                "edhrecRank": edhrec_rank,
            }

        logger.info(f"Pre-computed metadata for {len(metadata)} cards")
        return metadata

    def _build_tag_index(self) -> None:
        """
        Build an inverted index: tag -> set of card names that have this tag.

        This allows fast candidate filtering - instead of checking all 29k cards,
        we only check cards that share at least one tag with the target.

        Performance impact: Reduces 29k comparisons to typically 100-2000 comparisons.
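
        Example shape (card names here are purely illustrative):
            index["Tokens"] -> {"Krenko, Mob Boss", "Rhys the Redeemed", ...}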
        """
        logger.info("Building inverted tag index...")
        index = {}

        for card_name, tags in self.cleaned_tags_cache.items():
            for tag in tags:
                if tag not in index:
                    index[tag] = set()
                index[tag].add(card_name)

        self._tag_to_cards_index = index

        # Log statistics
        avg_cards_per_tag = sum(len(cards) for cards in index.values()) / len(index) if index else 0
        logger.info(
            f"Built tag index: {len(index)} unique tags, "
            f"avg {avg_cards_per_tag:.1f} cards per tag"
        )

    def get_signature_tags(
        self,
        card_tags: set[str],
        top_n: int = 5,
        random_n: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> set[str]:
        """
        Get signature tags for similarity comparison.

        Takes the most frequent (popular) tags PLUS random tags for diversity.
        This balances defining characteristics with discovery of niche synergies.

        Excludes overly common tags like "Historics Matter" and "Legends Matter"
        that appear on most legendary cards and don't provide meaningful similarity.

        Args:
            card_tags: Full set of card theme tags
            top_n: Number of most frequent tags to use (default 5)
            random_n: Number of random tags to add. If None, auto-scales:
                - 6-10 tags: 1 random
                - 11-15 tags: 2 random
                - 16+ tags: 3 random
            seed: Random seed for reproducibility (default: None)

        Returns:
            Set of signature tags (top_n most frequent + random_n random)
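
        Example (illustrative): a card with 12 tags yields the 5 most
        frequent tags plus 2 random ones, i.e. 7 signature tags.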
        """
        # Exclude overly common tags that don't provide meaningful similarity
        excluded_tags = {"Historics Matter", "Legends Matter"}
        card_tags = card_tags - excluded_tags

        if len(card_tags) <= top_n:
            return card_tags  # Use all tags if the card has few of them

        # Auto-scale random_n based on total tag count if not specified
        if random_n is None:
            tag_count = len(card_tags)
            if tag_count >= 16:
                random_n = 3
            elif tag_count >= 11:
                random_n = 2
            elif tag_count >= 6:
                random_n = 1
            else:
                random_n = 0  # Very few tags, no random needed

        # Sort tags by frequency (higher card_count = more common = higher priority)
        sorted_tags = sorted(
            card_tags,
            key=lambda t: -self.theme_frequencies.get(t, 0),  # Negate for descending order
        )

        # Take the top N most frequent tags
        signature = set(sorted_tags[:top_n])

        # Add random tags from the remaining tags
        remaining_tags = card_tags - signature
        if remaining_tags and random_n > 0:
            # Use a local Random instance so seeding doesn't mutate global
            # random state, and sample from a sorted list so results are
            # reproducible across processes (set iteration order is not stable).
            rng = random.Random(seed)

            # Sample min(random_n, len(remaining_tags)) to avoid errors
            sample_size = min(random_n, len(remaining_tags))
            random_tags = set(rng.sample(sorted(remaining_tags), sample_size))

            signature = signature | random_tags

        return signature

    @staticmethod
    def parse_theme_tags(tags: str | list) -> set[str]:
        """
        Parse theme tags from string or list format.

        Args:
            tags: Theme tags as a string representation of a list, or an actual list

        Returns:
            Set of theme tag strings
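
        Example:
            >>> sorted(CardSimilarity.parse_theme_tags("['Tokens', 'Lifegain']"))
            ['Lifegain', 'Tokens']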
        """
        # Check for a list first: pd.isna() on a list returns an elementwise
        # array, which would make a plain truthiness check raise ValueError.
        if isinstance(tags, list):
            return set(tags)

        # None, NaN, empty string, or any other non-string value
        if not isinstance(tags, str) or not tags:
            return set()

        # Handle string representation of a list: "['tag1', 'tag2']"
        try:
            parsed = ast.literal_eval(tags)
            if isinstance(parsed, list):
                return set(parsed)
            return set()
        except (ValueError, SyntaxError):
            # If parsing fails, return an empty set
            logger.warning(f"Failed to parse theme tags: {tags[:100]}")
            return set()

    @staticmethod
    def calculate_similarity(tags_a: set[str], tags_b: set[str]) -> float:
        """
        Calculate the Jaccard similarity between two sets of theme tags.

        Jaccard index = intersection / union

        Args:
            tags_a: First set of theme tags
            tags_b: Second set of theme tags

        Returns:
            Similarity score from 0.0 (no overlap) to 1.0 (identical)
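
        Example:
            >>> CardSimilarity.calculate_similarity(
            ...     {"Tokens", "Sacrifice"}, {"Tokens", "Aristocrats"}
            ... )
            0.3333333333333333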
        """
        if not tags_a or not tags_b:
            return 0.0

        intersection = len(tags_a & tags_b)
        union = len(tags_a | tags_b)

        if union == 0:
            return 0.0

        return intersection / union

    def get_card_tags(self, card_name: str) -> Optional[set[str]]:
        """
        Get theme tags for a specific card.

        Args:
            card_name: Name of the card

        Returns:
            Set of theme tags, or None if the card is not found
        """
        card_row = self.cards_df[self.cards_df["name"] == card_name]

        if card_row.empty:
            return None

        tags = card_row.iloc[0]["themeTags"]
        return self.parse_theme_tags(tags)

    def find_similar(
        self,
        card_name: str,
        threshold: float = 0.8,
        limit: int = 10,
        min_results: int = 3,
        adaptive: bool = True,
        use_cache: bool = True,
    ) -> list[dict]:
        """
        Find cards with similar theme tags.

        Uses adaptive threshold scaling to ensure a minimum number of results.
        Tries 80% → 60% → 50% thresholds until min_results is met (skips 70%
        for performance).

        Checks the cache first for pre-computed results, then falls back to
        real-time calculation.

        Note: the reported "similarity" is a containment score (overlap with
        the target's signature tags divided by the signature size), not a
        full Jaccard index over all tags.

        Args:
            card_name: Name of the target card
            threshold: Starting similarity threshold (0.0-1.0), default 0.8 (80%)
            limit: Maximum number of results, default 10
            min_results: Minimum desired results for adaptive scaling, default 3
            adaptive: Enable adaptive threshold scaling, default True
            use_cache: Check the cache before calculating, default True

        Returns:
            List of dicts with keys: name, similarity, themeTags, edhrecRank, threshold_used
            Sorted by similarity descending, then by EDHREC rank ascending (more popular first)
            Returns an empty list if the card is not found or has no tags
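
        Example (illustrative; results depend on the loaded card pool):
            >>> sim = CardSimilarity()                         # doctest: +SKIP
            >>> sim.find_similar("Krenko, Mob Boss", limit=5)  # doctest: +SKIP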
        """
        # Check the cache first
        if use_cache and self.cache.enabled:
            cached_results = self.cache.get_similar(card_name, limit=limit, randomize=True)
            if cached_results is not None:
                logger.info(f"Cache HIT for '{card_name}' ({len(cached_results)} results, randomized)")
                return cached_results
            else:
                logger.info(f"Cache MISS for '{card_name}', calculating...")

        # Get the target card's tags
        target_tags = self.get_card_tags(card_name)

        if target_tags is None:
            logger.warning(f"Card not found: {card_name}")
            return []

        if not target_tags:
            logger.info(f"Card has no theme tags: {card_name}")
            return []

        # Get signature tags for the TARGET card only (top 5 most frequent + 1-3 random).
        # This focuses the search on the target's defining characteristics
        # with some diversity from random tags.

        # Use a stable hash of the card name as the seed for reproducible
        # randomness per card (the builtin hash() is salted per process, so
        # it would not be stable across restarts).
        card_seed = zlib.crc32(card_name.encode("utf-8")) % (2**31)
        target_signature = self.get_signature_tags(
            target_tags,
            top_n=5,
            seed=card_seed,
        )

        logger.debug(
            f"Target '{card_name}': {len(target_tags)} tags → "
            f"{len(target_signature)} signature tags"
        )

        # Try adaptive thresholds if enabled
        thresholds_to_try = [threshold]
        if adaptive:
            # Build the list of thresholds to try: 80% → 60% → 50% (skip 70% for performance)
            thresholds_to_try = []
            if threshold >= 0.8:
                thresholds_to_try.append(0.8)
            if threshold >= 0.6:
                thresholds_to_try.append(0.6)
            if threshold >= 0.5:
                thresholds_to_try.append(0.5)

            # Fall back to the caller's threshold if it is below 0.5
            if not thresholds_to_try:
                thresholds_to_try = [threshold]

            # Remove duplicates and sort descending
            thresholds_to_try = sorted(set(thresholds_to_try), reverse=True)

        # Build the inverted index on first use (lazily)
        if self._tag_to_cards_index is None:
            self._build_tag_index()

        # Use the inverted index for fast candidate filtering: instead of
        # checking all 29k cards, only check cards that share at least one
        # signature tag. The candidate set depends only on the signature,
        # so it is computed once, outside the threshold loop.
        candidate_cards = set()
        for tag in target_signature:
            if tag in self._tag_to_cards_index:
                candidate_cards.update(self._tag_to_cards_index[tag])

        # Remove the target card itself
        candidate_cards.discard(card_name)

        results = []
        threshold_used = threshold

        for current_threshold in thresholds_to_try:
            results = []
            threshold_used = current_threshold

            if not candidate_cards:
                break  # No candidates at all; lowering the threshold won't help

            # Pre-filter candidates by a minimum overlap requirement
            min_overlap = int(len(target_signature) * current_threshold)
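            # Why the floor is safe: containment = overlap / |signature| >= threshold
            # iff overlap >= |signature| * threshold, so int() (floor) gives a
            # slightly loose lower bound; the exact check happens below.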

            for candidate_name in candidate_cards:
                candidate_tags = self.cleaned_tags_cache.get(candidate_name)

                if not candidate_tags:
                    continue

                # Fast overlap check using set intersection
                overlap = target_signature & candidate_tags
                overlap_count = len(overlap)

                # Quick filter: skip if the overlap is too small
                if overlap_count < min_overlap:
                    continue

                # Calculate the exact containment score
                containment_score = overlap_count / len(target_signature)

                if containment_score >= current_threshold:
                    # Get the EDHREC rank efficiently from the pre-computed metadata
                    edhrec_rank = self._card_metadata.get(candidate_name, {}).get("edhrecRank", float("inf"))

                    results.append({
                        "name": candidate_name,
                        "similarity": containment_score,
                        "themeTags": list(candidate_tags),
                        "edhrecRank": edhrec_rank,
                    })

            # Sort by similarity descending, then by EDHREC rank ascending (lower is better).
            # Unranked cards (inf) appear last.
            results.sort(key=lambda x: (-x["similarity"], x["edhrecRank"]))

            # Check whether we have enough results
            if len(results) >= min_results or not adaptive:
                break

            # Log that we're trying a lower threshold
            logger.info(
                f"Found {len(results)} results at {current_threshold:.0%} "
                f"for '{card_name}', trying a lower threshold..."
            )

        # Add threshold_used to the results
        for result in results:
            result["threshold_used"] = threshold_used

        logger.info(
            f"Found {len(results)} similar cards for '{card_name}' "
            f"at {threshold_used:.0%} threshold"
        )

        final_results = results[:limit]

        # Cache the results for future lookups
        if use_cache and self.cache.enabled and final_results:
            self.cache.set_similar(card_name, final_results)
            logger.debug(f"Cached {len(final_results)} results for '{card_name}'")

        return final_results