mtg_python_deckbuilder/code/web/services/card_similarity.py
matt 505bbdf166 fix: handle numpy arrays in card_similarity parse_theme_tags
The similarity cache build was failing because parse_theme_tags() was checking isinstance(tags, list) but Parquet files return numpy.ndarray objects. This caused all cards to be flagged as having no theme tags, resulting in an empty cache.

Changed to use a hasattr(tags, '__len__') check instead, which works for both lists and numpy arrays.
2025-10-19 08:26:20 -07:00
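
A minimal sketch of the failure mode (assuming numpy is installed; the array value stands in for what pandas returns when reading a Parquet list column, and the tag names are illustrative):

    import numpy as np
    tags = np.array(["Tokens Matter", "Sacrifice Matter"])
    isinstance(tags, list)                                   # False - old check dropped all tags
    hasattr(tags, "__len__") and not isinstance(tags, str)   # True  - new check accepts them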

487 lines
18 KiB
Python

"""
Card similarity service using Jaccard index on theme tags.
Provides similarity scoring between cards based on theme tag overlap.
Used for "Similar Cards" feature in card browser.
Supports persistent caching for improved performance (2-6s → <500ms).
Uses a "signature tags" approach: compares the top 5 most frequent tags instead
of all tags, significantly improving performance and quality.
"""
import ast
import logging
import random
import zlib
from pathlib import Path
from typing import Optional
import pandas as pd
from code.web.services.similarity_cache import SimilarityCache, get_cache
logger = logging.getLogger(__name__)
class CardSimilarity:
"""Calculate card similarity using theme tag overlap (Jaccard index) with caching."""
def __init__(self, cards_df: Optional[pd.DataFrame] = None, cache: Optional[SimilarityCache] = None):
"""
Initialize similarity calculator.
Args:
cards_df: DataFrame with card data. If None, loads from processed all_cards.parquet
cache: SimilarityCache instance. If None, uses global singleton
"""
if cards_df is None:
# Load from processed directory (M4 Parquet migration)
from path_util import get_processed_cards_path
parquet_path = get_processed_cards_path()
logger.info(f"Loading cards from {parquet_path}")
self.cards_df = pd.read_parquet(parquet_path)
else:
self.cards_df = cards_df
# Initialize cache
self.cache = cache if cache is not None else get_cache()
# Load theme frequencies from catalog
self.theme_frequencies = self._load_theme_frequencies()
# Pre-compute cleaned tags (with exclusions) for all cards (one-time cost, huge speedup)
# This removes "Historics Matter" and "Legends Matter" from all cards
self.cleaned_tags_cache = self._precompute_cleaned_tags()
# Pre-compute card metadata (EDHREC rank) for fast lookups
self._card_metadata = self._precompute_card_metadata()
# Inverted index (tag -> set of card names) - built lazily on first use
self._tag_to_cards_index = None
logger.info(
f"Initialized CardSimilarity with {len(self.cards_df)} cards "
f"and {len(self.theme_frequencies)} theme frequencies "
f"(cache: {'enabled' if self.cache.enabled else 'disabled'})"
)
def _load_theme_frequencies(self) -> dict[str, int]:
"""
Load theme frequencies from theme_catalog.csv.
Returns:
Dict mapping theme name to card_count (higher = more common)
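        Example (hypothetical row values; the column names match the code below):
            theme,card_count
            Tokens Matter,4182
            -> {"Tokens Matter": 4182, ...}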
"""
catalog_path = Path(__file__).parents[3] / "config" / "themes" / "theme_catalog.csv"
try:
# Read CSV, skipping comment line
df = pd.read_csv(catalog_path, comment="#")
# Create dict mapping theme -> card_count
# Higher card_count = more common/frequent theme
frequencies = dict(zip(df["theme"], df["card_count"]))
logger.info(f"Loaded {len(frequencies)} theme frequencies from catalog")
return frequencies
except Exception as e:
logger.warning(f"Failed to load theme frequencies: {e}, using empty dict")
return {}
def _precompute_cleaned_tags(self) -> dict[str, set[str]]:
"""
Pre-compute cleaned tags for all cards.
Removes overly common tags like "Historics Matter" and "Legends Matter"
that don't provide meaningful similarity. This is done once during
initialization to avoid recalculating for every comparison.
Returns:
Dict mapping card name -> cleaned tags (full set minus exclusions)
"""
logger.info("Pre-computing cleaned tags for all cards...")
excluded_tags = {"Historics Matter", "Legends Matter"}
cleaned = {}
for _, row in self.cards_df.iterrows():
card_name = row["name"]
tags = self.parse_theme_tags(row["themeTags"])
if tags:
# Remove excluded tags
cleaned_tags = tags - excluded_tags
if cleaned_tags: # Only store if card has tags after exclusion
cleaned[card_name] = cleaned_tags
logger.info(f"Pre-computed {len(cleaned)} card tag sets")
return cleaned
def _precompute_card_metadata(self) -> dict[str, dict]:
"""
Pre-compute card metadata (EDHREC rank, etc.) for fast lookups.
Returns:
Dict mapping card name -> metadata dict
"""
logger.info("Pre-computing card metadata...")
metadata = {}
for _, row in self.cards_df.iterrows():
card_name = row["name"]
edhrec_rank = row.get("edhrecRank")
# Convert to float, use inf for NaN/None
edhrec_rank = float(edhrec_rank) if pd.notna(edhrec_rank) else float('inf')
metadata[card_name] = {
"edhrecRank": edhrec_rank,
}
logger.info(f"Pre-computed metadata for {len(metadata)} cards")
return metadata
def _build_tag_index(self) -> None:
"""
Build inverted index: tag -> set of card names that have this tag.
This allows fast candidate filtering - instead of checking all 29k cards,
we only check cards that share at least one tag with the target.
Performance impact: Reduces 29k comparisons to typically 100-2000 comparisons.
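        Example shape (hypothetical tag and card names):
            {"Tokens Matter": {"Krenko, Mob Boss", ...}, "Lifegain": {"Soul Warden", ...}}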
"""
logger.info("Building inverted tag index...")
index = {}
for card_name, tags in self.cleaned_tags_cache.items():
for tag in tags:
if tag not in index:
index[tag] = set()
index[tag].add(card_name)
self._tag_to_cards_index = index
# Log statistics
avg_cards_per_tag = sum(len(cards) for cards in index.values()) / len(index) if index else 0
logger.info(
f"Built tag index: {len(index)} unique tags, "
f"avg {avg_cards_per_tag:.1f} cards per tag"
)
def get_signature_tags(
self,
card_tags: set[str],
top_n: int = 5,
random_n: Optional[int] = None,
seed: Optional[int] = None,
) -> set[str]:
"""
Get signature tags for similarity comparison.
Takes the most frequent (popular) tags PLUS random tags for diversity.
This balances defining characteristics with discovery of niche synergies.
Excludes overly common tags like "Historics Matter" and "Legends Matter"
that appear on most legendary cards and don't provide meaningful similarity.
Args:
card_tags: Full set of card theme tags
top_n: Number of most frequent tags to use (default 5)
random_n: Number of random tags to add. If None, auto-scales:
- 6-10 tags: 1 random
- 11-15 tags: 2 random
- 16+ tags: 3 random
seed: Random seed for reproducibility (default: None)
Returns:
Set of signature tags (top_n most frequent + random_n random)
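        Example (illustrative): a card with 12 cleaned tags auto-scales to
        random_n=2, so the signature is its 5 most frequent tags plus 2
        random ones, 7 tags in total.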
"""
# Exclude overly common tags that don't provide meaningful similarity
excluded_tags = {"Historics Matter", "Legends Matter"}
card_tags = card_tags - excluded_tags
if len(card_tags) <= top_n:
return card_tags # Use all if card has few tags
# Auto-scale random_n based on total tag count if not specified
if random_n is None:
tag_count = len(card_tags)
if tag_count >= 16:
random_n = 3
elif tag_count >= 11:
random_n = 2
elif tag_count >= 6:
random_n = 1
else:
random_n = 0 # Very few tags, no random needed
# Sort tags by frequency (higher card_count = more common = higher priority)
sorted_tags = sorted(
card_tags,
key=lambda t: -self.theme_frequencies.get(t, 0), # Negate for descending order
)
# Take top N most frequent tags
signature = set(sorted_tags[:top_n])
        # Add random tags from the remaining tags
        remaining_tags = card_tags - signature
        if remaining_tags and random_n > 0:
            # Use a local RNG so seeding never perturbs the global random state,
            # and sort the pool so a given seed picks the same tags across runs
            # (iteration order of a set of strings varies with hash randomization)
            rng = random.Random(seed) if seed is not None else random
            # Sample min(random_n, len(remaining_tags)) to avoid errors
            sample_size = min(random_n, len(remaining_tags))
            random_tags = set(rng.sample(sorted(remaining_tags), sample_size))
            signature = signature | random_tags
return signature
@staticmethod
def parse_theme_tags(tags: str | list) -> set[str]:
"""
Parse theme tags from string or list format.
Args:
tags: Theme tags as string representation of list or actual list
Returns:
Set of theme tag strings
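        Examples (a doctest-style sketch; the numpy case mirrors Parquet output):
            >>> CardSimilarity.parse_theme_tags("['Tokens', 'Sacrifice']") == {'Tokens', 'Sacrifice'}
            True
            >>> import numpy as np
            >>> CardSimilarity.parse_theme_tags(np.array(['Tokens'])) == {'Tokens'}
            True
            >>> CardSimilarity.parse_theme_tags(None)
            set()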
"""
        # M4: Handle both scalar NA (CSV) and array values (Parquet)
        if isinstance(tags, (str, float, int, type(None))) and pd.isna(tags):
            return set()
        # M4: Handle lists and numpy arrays from Parquet files.
        # Checking isinstance(tags, list) alone misses numpy.ndarray, which
        # made every card look tagless and left the similarity cache empty.
        if hasattr(tags, '__len__') and not isinstance(tags, str):
            # Parquet format - convert array-like values directly to a set
            return set(tags) if len(tags) > 0 else set()
if isinstance(tags, str):
# Handle string representation of list: "['tag1', 'tag2']"
try:
parsed = ast.literal_eval(tags)
if isinstance(parsed, list):
return set(parsed)
return set()
except (ValueError, SyntaxError):
# If parsing fails, return empty set
logger.warning(f"Failed to parse theme tags: {tags[:100]}")
return set()
return set()
@staticmethod
def calculate_similarity(tags_a: set[str], tags_b: set[str]) -> float:
"""
Calculate Jaccard similarity between two sets of theme tags.
Jaccard index = intersection / union
Args:
tags_a: First set of theme tags
tags_b: Second set of theme tags
Returns:
Similarity score from 0.0 (no overlap) to 1.0 (identical)
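        Example (worked): the intersection {'B', 'C'} has 2 elements and the
        union has 4, so the score is 2/4 = 0.5:
            >>> CardSimilarity.calculate_similarity({'A', 'B', 'C'}, {'B', 'C', 'D'})
            0.5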
"""
if not tags_a or not tags_b:
return 0.0
intersection = len(tags_a & tags_b)
union = len(tags_a | tags_b)
if union == 0:
return 0.0
return intersection / union
def get_card_tags(self, card_name: str) -> Optional[set[str]]:
"""
Get theme tags for a specific card.
Args:
card_name: Name of the card
Returns:
Set of theme tags, or None if card not found
"""
card_row = self.cards_df[self.cards_df["name"] == card_name]
if card_row.empty:
return None
tags = card_row.iloc[0]["themeTags"]
return self.parse_theme_tags(tags)
def find_similar(
self,
card_name: str,
threshold: float = 0.8,
limit: int = 10,
min_results: int = 3,
adaptive: bool = True,
use_cache: bool = True,
) -> list[dict]:
"""
Find cards with similar theme tags.
Uses adaptive threshold scaling to ensure minimum number of results.
        Tries 80% → 60% → 50% thresholds until min_results is met (skips 70% for performance).
Checks cache first for pre-computed results, falls back to real-time calculation.
Args:
card_name: Name of the target card
threshold: Starting similarity threshold (0.0-1.0), default 0.8 (80%)
limit: Maximum number of results, default 10
min_results: Minimum desired results for adaptive scaling, default 3
adaptive: Enable adaptive threshold scaling, default True
use_cache: Check cache first before calculating, default True
Returns:
List of dicts with keys: name, similarity, themeTags, edhrecRank, threshold_used
Sorted by similarity descending, then by EDHREC rank ascending (more popular first)
Returns empty list if card not found or has no tags
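        Example (illustrative; the card name and results depend on the loaded pool):
            sim = CardSimilarity()
            matches = sim.find_similar("Krenko, Mob Boss", limit=5)
            # -> [{"name": ..., "similarity": 0.8, "threshold_used": 0.8, ...}, ...]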
"""
# Check cache first
if use_cache and self.cache.enabled:
cached_results = self.cache.get_similar(card_name, limit=limit, randomize=True)
if cached_results is not None:
logger.info(f"Cache HIT for '{card_name}' ({len(cached_results)} results, randomized)")
return cached_results
else:
logger.info(f"Cache MISS for '{card_name}', calculating...")
# Get target card tags
target_tags = self.get_card_tags(card_name)
if target_tags is None:
logger.warning(f"Card not found: {card_name}")
return []
if not target_tags:
logger.info(f"Card has no theme tags: {card_name}")
return []
# Get signature tags for TARGET card only (top 5 most frequent + 1-3 random)
# This focuses the search on the target's defining characteristics
# with some diversity from random tags
        # Use a stable hash of the card name as seed for reproducible randomness
        # per card (built-in hash() is salted per interpreter run, so it would
        # yield a different signature in every process)
        card_seed = zlib.crc32(card_name.encode("utf-8")) % (2**31)
target_signature = self.get_signature_tags(
target_tags,
top_n=5,
seed=card_seed
)
logger.debug(
f"Target '{card_name}': {len(target_tags)} tags → "
f"{len(target_signature)} signature tags"
)
        # Try adaptive thresholds if enabled
        thresholds_to_try = [threshold]
        if adaptive:
            # Build descending list of thresholds to try: 80% → 60% → 50%
            # (skip 70% for performance)
            thresholds_to_try = [t for t in (0.8, 0.6, 0.5) if threshold >= t]
            if not thresholds_to_try:
                # Starting threshold below 50%: use it as-is rather than
                # leaving the list empty and skipping the search entirely
                thresholds_to_try = [threshold]
results = []
threshold_used = threshold
for current_threshold in thresholds_to_try:
# Use inverted index for fast candidate filtering
# Instead of checking all 29k cards, only check cards that share at least one signature tag
results = []
# Build inverted index on first use (lazily)
if self._tag_to_cards_index is None:
self._build_tag_index()
# Get candidate cards that share at least one signature tag
# This drastically reduces the number of cards we need to check
candidate_cards = set()
for tag in target_signature:
if tag in self._tag_to_cards_index:
candidate_cards.update(self._tag_to_cards_index[tag])
# Remove the target card itself
candidate_cards.discard(card_name)
if not candidate_cards:
continue # No candidates at all, try lower threshold
            # Score only the candidate cards, pre-filtering with a cheap
            # minimum-overlap check (int() floors, so the pre-filter never
            # rejects a card that could still meet the threshold)
            min_overlap = int(len(target_signature) * current_threshold)
for candidate_name in candidate_cards:
candidate_tags = self.cleaned_tags_cache.get(candidate_name)
if not candidate_tags:
continue
# Fast overlap check using set intersection
overlap = target_signature & candidate_tags
overlap_count = len(overlap)
# Quick filter: skip if overlap too small
if overlap_count < min_overlap:
continue
                # Calculate exact containment score: |overlap| / |target signature|.
                # Unlike the symmetric Jaccard index in calculate_similarity, this
                # asks how much of the target's signature the candidate covers
                # (e.g. 4 of 5 signature tags shared -> 0.8)
                containment_score = overlap_count / len(target_signature)
if containment_score >= current_threshold:
# Get EDHREC rank efficiently from card metadata
edhrec_rank = self._card_metadata.get(candidate_name, {}).get('edhrecRank', float('inf'))
results.append({
"name": candidate_name,
"similarity": containment_score,
"themeTags": list(candidate_tags),
"edhrecRank": edhrec_rank,
})
# Sort by similarity descending, then by EDHREC rank ascending (lower is better)
# Unranked cards (inf) will appear last
results.sort(key=lambda x: (-x["similarity"], x["edhrecRank"]))
# Check if we have enough results
if len(results) >= min_results or not adaptive:
threshold_used = current_threshold
break
# Log that we're trying a lower threshold
logger.info(
f"Found {len(results)} results at {current_threshold:.0%} "
f"for '{card_name}', trying lower threshold..."
)
# Add threshold_used to results
for result in results:
result["threshold_used"] = threshold_used
logger.info(
f"Found {len(results)} similar cards for '{card_name}' "
f"at {threshold_used:.0%} threshold"
)
final_results = results[:limit]
# Cache the results for future lookups
if use_cache and self.cache.enabled and final_results:
self.cache.set_similar(card_name, final_results)
logger.debug(f"Cached {len(final_results)} results for '{card_name}'")
return final_results