mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-12-17 08:00:13 +01:00
Add card browser with similar cards and performance optimizations
This commit is contained in:
parent a8dc1835eb
commit c2960c808e
25 changed files with 4841 additions and 1392 deletions
483
code/web/services/card_similarity.py
Normal file
@@ -0,0 +1,483 @@
"""
Card similarity service using Jaccard index on theme tags.

Provides similarity scoring between cards based on theme tag overlap.
Used for the "Similar Cards" feature in the card browser.

Supports persistent caching for improved performance (2-6s → <500ms).

Uses a "signature tags" approach: compares the top 5 most frequent tags
instead of all tags, significantly improving performance and quality.
"""

import ast
import logging
import random
import zlib
from pathlib import Path
from typing import Optional

import pandas as pd

from code.web.services.similarity_cache import SimilarityCache, get_cache

logger = logging.getLogger(__name__)


class CardSimilarity:
    """Calculate card similarity using theme tag overlap (Jaccard index) with caching."""

    def __init__(self, cards_df: Optional[pd.DataFrame] = None, cache: Optional[SimilarityCache] = None):
        """
        Initialize the similarity calculator.

        Args:
            cards_df: DataFrame with card data. If None, loads from all_cards.parquet.
            cache: SimilarityCache instance. If None, uses the global singleton.
        """
        if cards_df is None:
            # Load from the default location
            parquet_path = Path(__file__).parents[3] / "card_files" / "all_cards.parquet"
            logger.info(f"Loading cards from {parquet_path}")
            self.cards_df = pd.read_parquet(parquet_path)
        else:
            self.cards_df = cards_df

        # Initialize cache
        self.cache = cache if cache is not None else get_cache()

        # Load theme frequencies from the catalog
        self.theme_frequencies = self._load_theme_frequencies()

        # Pre-compute cleaned tags (with exclusions) for all cards (one-time cost, huge speedup).
        # This removes "Historics Matter" and "Legends Matter" from all cards.
        self.cleaned_tags_cache = self._precompute_cleaned_tags()

        # Pre-compute card metadata (EDHREC rank) for fast lookups
        self._card_metadata = self._precompute_card_metadata()

        # Inverted index (tag -> set of card names) - built lazily on first use
        self._tag_to_cards_index = None

        logger.info(
            f"Initialized CardSimilarity with {len(self.cards_df)} cards "
            f"and {len(self.theme_frequencies)} theme frequencies "
            f"(cache: {'enabled' if self.cache.enabled else 'disabled'})"
        )

    def _load_theme_frequencies(self) -> dict[str, int]:
        """
        Load theme frequencies from theme_catalog.csv.

        Returns:
            Dict mapping theme name to card_count (higher = more common)
        """
        catalog_path = Path(__file__).parents[3] / "config" / "themes" / "theme_catalog.csv"

        try:
            # Read the CSV, skipping comment lines
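            # Expected columns (per the zip below): "theme" and "card_count";
            # a row like "Tokens,4215" (the count here is illustrative only).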
            df = pd.read_csv(catalog_path, comment="#")

            # Create a dict mapping theme -> card_count.
            # Higher card_count = more common/frequent theme.
            frequencies = dict(zip(df["theme"], df["card_count"]))

            logger.info(f"Loaded {len(frequencies)} theme frequencies from catalog")
            return frequencies

        except Exception as e:
            logger.warning(f"Failed to load theme frequencies: {e}, using empty dict")
            return {}

    def _precompute_cleaned_tags(self) -> dict[str, set[str]]:
        """
        Pre-compute cleaned tags for all cards.

        Removes overly common tags like "Historics Matter" and "Legends Matter"
        that don't provide meaningful similarity. This is done once during
        initialization to avoid recalculating for every comparison.

        Returns:
            Dict mapping card name -> cleaned tags (full set minus exclusions)
        """
        logger.info("Pre-computing cleaned tags for all cards...")
        excluded_tags = {"Historics Matter", "Legends Matter"}
        cleaned = {}
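
        # Note: iterrows() is relatively slow, but this runs once at
        # startup, so the one-time cost is acceptable.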
        for _, row in self.cards_df.iterrows():
            card_name = row["name"]
            tags = self.parse_theme_tags(row["themeTags"])

            if tags:
                # Remove excluded tags
                cleaned_tags = tags - excluded_tags
                if cleaned_tags:  # Only store if the card has tags after exclusion
                    cleaned[card_name] = cleaned_tags

        logger.info(f"Pre-computed {len(cleaned)} card tag sets")
        return cleaned

    def _precompute_card_metadata(self) -> dict[str, dict]:
        """
        Pre-compute card metadata (EDHREC rank, etc.) for fast lookups.

        Returns:
            Dict mapping card name -> metadata dict
        """
        logger.info("Pre-computing card metadata...")
        metadata = {}

        for _, row in self.cards_df.iterrows():
            card_name = row["name"]
            edhrec_rank = row.get("edhrecRank")
            # Convert to float; use inf for NaN/None so unranked cards sort last
            edhrec_rank = float(edhrec_rank) if pd.notna(edhrec_rank) else float("inf")

            metadata[card_name] = {
                "edhrecRank": edhrec_rank,
            }

        logger.info(f"Pre-computed metadata for {len(metadata)} cards")
        return metadata

    def _build_tag_index(self) -> None:
        """
        Build an inverted index: tag -> set of card names that have this tag.

        This allows fast candidate filtering - instead of checking all 29k cards,
        we only check cards that share at least one tag with the target.

        Performance impact: Reduces 29k comparisons to typically 100-2000 comparisons.
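
        Example shape (card names here are purely illustrative):
            index["Tokens"] -> {"Krenko, Mob Boss", "Rhys the Redeemed", ...}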
        """
        logger.info("Building inverted tag index...")
        index = {}

        for card_name, tags in self.cleaned_tags_cache.items():
            for tag in tags:
                if tag not in index:
                    index[tag] = set()
                index[tag].add(card_name)

        self._tag_to_cards_index = index

        # Log statistics
        avg_cards_per_tag = sum(len(cards) for cards in index.values()) / len(index) if index else 0
        logger.info(
            f"Built tag index: {len(index)} unique tags, "
            f"avg {avg_cards_per_tag:.1f} cards per tag"
        )

    def get_signature_tags(
        self,
        card_tags: set[str],
        top_n: int = 5,
        random_n: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> set[str]:
        """
        Get signature tags for similarity comparison.

        Takes the most frequent (popular) tags PLUS random tags for diversity.
        This balances defining characteristics with discovery of niche synergies.

        Excludes overly common tags like "Historics Matter" and "Legends Matter"
        that appear on most legendary cards and don't provide meaningful similarity.

        Args:
            card_tags: Full set of card theme tags
            top_n: Number of most frequent tags to use (default 5)
            random_n: Number of random tags to add. If None, auto-scales:
                - 6-10 tags: 1 random
                - 11-15 tags: 2 random
                - 16+ tags: 3 random
            seed: Random seed for reproducibility (default: None)

        Returns:
            Set of signature tags (top_n most frequent + random_n random)
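
        Example (illustrative): a card with 12 tags yields the 5 most
        frequent tags plus 2 random ones, i.e. 7 signature tags.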
        """
        # Exclude overly common tags that don't provide meaningful similarity
        excluded_tags = {"Historics Matter", "Legends Matter"}
        card_tags = card_tags - excluded_tags

        if len(card_tags) <= top_n:
            return card_tags  # Use all tags if the card has few of them

        # Auto-scale random_n based on total tag count if not specified
        if random_n is None:
            tag_count = len(card_tags)
            if tag_count >= 16:
                random_n = 3
            elif tag_count >= 11:
                random_n = 2
            elif tag_count >= 6:
                random_n = 1
            else:
                random_n = 0  # Very few tags, no random needed

        # Sort tags by frequency (higher card_count = more common = higher priority)
        sorted_tags = sorted(
            card_tags,
            key=lambda t: -self.theme_frequencies.get(t, 0),  # Negate for descending order
        )

        # Take the top N most frequent tags
        signature = set(sorted_tags[:top_n])

        # Add random tags from the remaining tags
        remaining_tags = card_tags - signature
        if remaining_tags and random_n > 0:
            # Use a local Random instance so seeding doesn't mutate global
            # random state, and sample from a sorted list so results are
            # reproducible across processes (set iteration order is not stable).
            rng = random.Random(seed)

            # Sample min(random_n, len(remaining_tags)) to avoid errors
            sample_size = min(random_n, len(remaining_tags))
            random_tags = set(rng.sample(sorted(remaining_tags), sample_size))

            signature = signature | random_tags

        return signature

    @staticmethod
    def parse_theme_tags(tags: str | list) -> set[str]:
        """
        Parse theme tags from string or list format.

        Args:
            tags: Theme tags as a string representation of a list, or an actual list

        Returns:
            Set of theme tag strings
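
        Example:
            >>> sorted(CardSimilarity.parse_theme_tags("['Tokens', 'Lifegain']"))
            ['Lifegain', 'Tokens']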
        """
        # Check for a list first: pd.isna() on a list returns an elementwise
        # array, which would make a plain truthiness check raise ValueError.
        if isinstance(tags, list):
            return set(tags)

        # None, NaN, empty string, or any other non-string value
        if not isinstance(tags, str) or not tags:
            return set()

        # Handle string representation of a list: "['tag1', 'tag2']"
        try:
            parsed = ast.literal_eval(tags)
            if isinstance(parsed, list):
                return set(parsed)
            return set()
        except (ValueError, SyntaxError):
            # If parsing fails, return an empty set
            logger.warning(f"Failed to parse theme tags: {tags[:100]}")
            return set()

    @staticmethod
    def calculate_similarity(tags_a: set[str], tags_b: set[str]) -> float:
        """
        Calculate the Jaccard similarity between two sets of theme tags.

        Jaccard index = intersection / union

        Args:
            tags_a: First set of theme tags
            tags_b: Second set of theme tags

        Returns:
            Similarity score from 0.0 (no overlap) to 1.0 (identical)
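
        Example:
            >>> CardSimilarity.calculate_similarity(
            ...     {"Tokens", "Sacrifice"}, {"Tokens", "Aristocrats"}
            ... )
            0.3333333333333333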
        """
        if not tags_a or not tags_b:
            return 0.0

        intersection = len(tags_a & tags_b)
        union = len(tags_a | tags_b)

        if union == 0:
            return 0.0

        return intersection / union

    def get_card_tags(self, card_name: str) -> Optional[set[str]]:
        """
        Get theme tags for a specific card.

        Args:
            card_name: Name of the card

        Returns:
            Set of theme tags, or None if the card is not found
        """
        card_row = self.cards_df[self.cards_df["name"] == card_name]

        if card_row.empty:
            return None

        tags = card_row.iloc[0]["themeTags"]
        return self.parse_theme_tags(tags)

    def find_similar(
        self,
        card_name: str,
        threshold: float = 0.8,
        limit: int = 10,
        min_results: int = 3,
        adaptive: bool = True,
        use_cache: bool = True,
    ) -> list[dict]:
        """
        Find cards with similar theme tags.

        Uses adaptive threshold scaling to ensure a minimum number of results.
        Tries 80% → 60% → 50% thresholds until min_results is met (skips 70%
        for performance).

        Checks the cache first for pre-computed results, then falls back to
        real-time calculation.

        Note: the reported "similarity" is a containment score (overlap with
        the target's signature tags divided by the signature size), not a
        full Jaccard index over all tags.

        Args:
            card_name: Name of the target card
            threshold: Starting similarity threshold (0.0-1.0), default 0.8 (80%)
            limit: Maximum number of results, default 10
            min_results: Minimum desired results for adaptive scaling, default 3
            adaptive: Enable adaptive threshold scaling, default True
            use_cache: Check the cache before calculating, default True

        Returns:
            List of dicts with keys: name, similarity, themeTags, edhrecRank, threshold_used
            Sorted by similarity descending, then by EDHREC rank ascending (more popular first)
            Returns an empty list if the card is not found or has no tags
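
        Example (illustrative; results depend on the loaded card pool):
            >>> sim = CardSimilarity()                         # doctest: +SKIP
            >>> sim.find_similar("Krenko, Mob Boss", limit=5)  # doctest: +SKIP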
        """
        # Check the cache first
        if use_cache and self.cache.enabled:
            cached_results = self.cache.get_similar(card_name, limit=limit, randomize=True)
            if cached_results is not None:
                logger.info(f"Cache HIT for '{card_name}' ({len(cached_results)} results, randomized)")
                return cached_results
            else:
                logger.info(f"Cache MISS for '{card_name}', calculating...")

        # Get the target card's tags
        target_tags = self.get_card_tags(card_name)

        if target_tags is None:
            logger.warning(f"Card not found: {card_name}")
            return []

        if not target_tags:
            logger.info(f"Card has no theme tags: {card_name}")
            return []

        # Get signature tags for the TARGET card only (top 5 most frequent + 1-3 random).
        # This focuses the search on the target's defining characteristics
        # with some diversity from random tags.

        # Use a stable hash of the card name as the seed for reproducible
        # randomness per card (the builtin hash() is salted per process, so
        # it would not be stable across restarts).
        card_seed = zlib.crc32(card_name.encode("utf-8")) % (2**31)
        target_signature = self.get_signature_tags(
            target_tags,
            top_n=5,
            seed=card_seed,
        )

        logger.debug(
            f"Target '{card_name}': {len(target_tags)} tags → "
            f"{len(target_signature)} signature tags"
        )

        # Try adaptive thresholds if enabled
        thresholds_to_try = [threshold]
        if adaptive:
            # Build the list of thresholds to try: 80% → 60% → 50% (skip 70% for performance)
            thresholds_to_try = []
            if threshold >= 0.8:
                thresholds_to_try.append(0.8)
            if threshold >= 0.6:
                thresholds_to_try.append(0.6)
            if threshold >= 0.5:
                thresholds_to_try.append(0.5)

            # Fall back to the caller's threshold if it is below 0.5
            if not thresholds_to_try:
                thresholds_to_try = [threshold]

            # Remove duplicates and sort descending
            thresholds_to_try = sorted(set(thresholds_to_try), reverse=True)

        # Build the inverted index on first use (lazily)
        if self._tag_to_cards_index is None:
            self._build_tag_index()

        # Use the inverted index for fast candidate filtering: instead of
        # checking all 29k cards, only check cards that share at least one
        # signature tag. The candidate set depends only on the signature,
        # so it is computed once, outside the threshold loop.
        candidate_cards = set()
        for tag in target_signature:
            if tag in self._tag_to_cards_index:
                candidate_cards.update(self._tag_to_cards_index[tag])

        # Remove the target card itself
        candidate_cards.discard(card_name)

        results = []
        threshold_used = threshold

        for current_threshold in thresholds_to_try:
            results = []
            threshold_used = current_threshold

            if not candidate_cards:
                break  # No candidates at all; lowering the threshold won't help

            # Pre-filter candidates by a minimum overlap requirement
            min_overlap = int(len(target_signature) * current_threshold)
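            # Why the floor is safe: containment = overlap / |signature| >= threshold
            # iff overlap >= |signature| * threshold, so int() (floor) gives a
            # slightly loose lower bound; the exact check happens below.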

            for candidate_name in candidate_cards:
                candidate_tags = self.cleaned_tags_cache.get(candidate_name)

                if not candidate_tags:
                    continue

                # Fast overlap check using set intersection
                overlap = target_signature & candidate_tags
                overlap_count = len(overlap)

                # Quick filter: skip if the overlap is too small
                if overlap_count < min_overlap:
                    continue

                # Calculate the exact containment score
                containment_score = overlap_count / len(target_signature)

                if containment_score >= current_threshold:
                    # Get the EDHREC rank efficiently from the pre-computed metadata
                    edhrec_rank = self._card_metadata.get(candidate_name, {}).get("edhrecRank", float("inf"))

                    results.append({
                        "name": candidate_name,
                        "similarity": containment_score,
                        "themeTags": list(candidate_tags),
                        "edhrecRank": edhrec_rank,
                    })

            # Sort by similarity descending, then by EDHREC rank ascending (lower is better).
            # Unranked cards (inf) appear last.
            results.sort(key=lambda x: (-x["similarity"], x["edhrecRank"]))

            # Check whether we have enough results
            if len(results) >= min_results or not adaptive:
                break

            # Log that we're trying a lower threshold
            logger.info(
                f"Found {len(results)} results at {current_threshold:.0%} "
                f"for '{card_name}', trying a lower threshold..."
            )

        # Add threshold_used to the results
        for result in results:
            result["threshold_used"] = threshold_used

        logger.info(
            f"Found {len(results)} similar cards for '{card_name}' "
            f"at {threshold_used:.0%} threshold"
        )

        final_results = results[:limit]

        # Cache the results for future lookups
        if use_cache and self.cache.enabled and final_results:
            self.cache.set_similar(card_name, final_results)
            logger.debug(f"Cached {len(final_results)} results for '{card_name}'")

        return final_results