"""
|
|
Card similarity service using Jaccard index on theme tags.
|
|
|
|
Provides similarity scoring between cards based on theme tag overlap.
|
|
Used for "Similar Cards" feature in card browser.
|
|
|
|
Supports persistent caching for improved performance (2-6s → <500ms).
|
|
|
|
Uses "signature tags" approach: compares top 5 most frequent tags instead
|
|
of all tags, significantly improving performance and quality.
|
|
"""

import ast
import logging
import random
from pathlib import Path
from typing import Optional

import pandas as pd

from code.web.services.similarity_cache import SimilarityCache, get_cache

logger = logging.getLogger(__name__)


class CardSimilarity:
    """Calculate card similarity using theme tag overlap (Jaccard index) with caching."""

    def __init__(self, cards_df: Optional[pd.DataFrame] = None, cache: Optional[SimilarityCache] = None):
        """
        Initialize similarity calculator.

        Args:
            cards_df: DataFrame with card data. If None, loads from all_cards.parquet
            cache: SimilarityCache instance. If None, uses global singleton
        """
        if cards_df is None:
            # Load from default location
            parquet_path = Path(__file__).parents[3] / "card_files" / "all_cards.parquet"
            logger.info(f"Loading cards from {parquet_path}")
            self.cards_df = pd.read_parquet(parquet_path)
        else:
            self.cards_df = cards_df

        # Initialize cache
        self.cache = cache if cache is not None else get_cache()

        # Load theme frequencies from catalog
        self.theme_frequencies = self._load_theme_frequencies()

        # Pre-compute cleaned tags (with exclusions) for all cards (one-time cost, huge speedup).
        # This removes "Historics Matter" and "Legends Matter" from all cards.
        self.cleaned_tags_cache = self._precompute_cleaned_tags()

        # Pre-compute card metadata (EDHREC rank) for fast lookups
        self._card_metadata = self._precompute_card_metadata()

        # Inverted index (tag -> set of card names) - built lazily on first use
        self._tag_to_cards_index = None

        logger.info(
            f"Initialized CardSimilarity with {len(self.cards_df)} cards "
            f"and {len(self.theme_frequencies)} theme frequencies "
            f"(cache: {'enabled' if self.cache.enabled else 'disabled'})"
        )
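
    # A minimal construction sketch (hypothetical card data; the column names
    # "name" and "themeTags" match what the methods below read):
    #
    #     df = pd.DataFrame({
    #         "name": ["Krenko, Mob Boss"],
    #         "themeTags": ["['Goblin Kindred', 'Tokens Matter']"],
    #     })
    #     sim = CardSimilarity(cards_df=df)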

    def _load_theme_frequencies(self) -> dict[str, int]:
        """
        Load theme frequencies from theme_catalog.csv.

        Returns:
            Dict mapping theme name to card_count (higher = more common)
        """
        catalog_path = Path(__file__).parents[3] / "config" / "themes" / "theme_catalog.csv"

        try:
            # Read CSV, skipping comment lines
            df = pd.read_csv(catalog_path, comment="#")

            # Create dict mapping theme -> card_count.
            # Higher card_count = more common/frequent theme.
            frequencies = dict(zip(df["theme"], df["card_count"]))

            logger.info(f"Loaded {len(frequencies)} theme frequencies from catalog")
            return frequencies

        except Exception as e:
            logger.warning(f"Failed to load theme frequencies: {e}, using empty dict")
            return {}
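
    # Assumed catalog shape (illustrative rows; the real values live in
    # config/themes/theme_catalog.csv):
    #
    #     # comment lines are skipped
    #     theme,card_count
    #     Tokens Matter,4200
    #     Aristocrats,1800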

    def _precompute_cleaned_tags(self) -> dict[str, set[str]]:
        """
        Pre-compute cleaned tags for all cards.

        Removes overly common tags like "Historics Matter" and "Legends Matter"
        that don't provide meaningful similarity. This is done once during
        initialization to avoid recalculating for every comparison.

        Returns:
            Dict mapping card name -> cleaned tags (full set minus exclusions)
        """
        logger.info("Pre-computing cleaned tags for all cards...")
        excluded_tags = {"Historics Matter", "Legends Matter"}
        cleaned = {}

        for _, row in self.cards_df.iterrows():
            card_name = row["name"]
            tags = self.parse_theme_tags(row["themeTags"])

            if tags:
                # Remove excluded tags
                cleaned_tags = tags - excluded_tags
                if cleaned_tags:  # Only store if card has tags after exclusion
                    cleaned[card_name] = cleaned_tags

        logger.info(f"Pre-computed {len(cleaned)} card tag sets")
        return cleaned
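
    # For example, a card tagged {"Legends Matter", "Goblin Kindred"} is stored
    # as {"Goblin Kindred"}, while a card tagged only {"Historics Matter",
    # "Legends Matter"} is dropped entirely (empty after exclusion).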

    def _precompute_card_metadata(self) -> dict[str, dict]:
        """
        Pre-compute card metadata (EDHREC rank, etc.) for fast lookups.

        Returns:
            Dict mapping card name -> metadata dict
        """
        logger.info("Pre-computing card metadata...")
        metadata = {}

        for _, row in self.cards_df.iterrows():
            card_name = row["name"]
            edhrec_rank = row.get("edhrecRank")
            # Convert to float, use inf for NaN/None
            edhrec_rank = float(edhrec_rank) if pd.notna(edhrec_rank) else float("inf")

            metadata[card_name] = {
                "edhrecRank": edhrec_rank,
            }

        logger.info(f"Pre-computed metadata for {len(metadata)} cards")
        return metadata

    def _build_tag_index(self) -> None:
        """
        Build inverted index: tag -> set of card names that have this tag.

        This allows fast candidate filtering - instead of checking all 29k cards,
        we only check cards that share at least one tag with the target.

        Performance impact: reduces 29k comparisons to typically 100-2000 comparisons.
        """
        logger.info("Building inverted tag index...")
        index = {}

        for card_name, tags in self.cleaned_tags_cache.items():
            for tag in tags:
                if tag not in index:
                    index[tag] = set()
                index[tag].add(card_name)

        self._tag_to_cards_index = index

        # Log statistics
        avg_cards_per_tag = sum(len(cards) for cards in index.values()) / len(index) if index else 0
        logger.info(
            f"Built tag index: {len(index)} unique tags, "
            f"avg {avg_cards_per_tag:.1f} cards per tag"
        )
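
    # Sketch of the resulting index shape (hypothetical entries):
    #
    #     {
    #         "Tokens Matter": {"Krenko, Mob Boss", "Rhys the Redeemed", ...},
    #         "Lifegain": {"Oloro, Ageless Ascetic", ...},
    #     }
    #
    # The same structure could be built with collections.defaultdict(set);
    # the explicit membership check above is kept for clarity.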

    def get_signature_tags(
        self,
        card_tags: set[str],
        top_n: int = 5,
        random_n: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> set[str]:
        """
        Get signature tags for similarity comparison.

        Takes the most frequent (popular) tags PLUS random tags for diversity.
        This balances defining characteristics with discovery of niche synergies.

        Excludes overly common tags like "Historics Matter" and "Legends Matter"
        that appear on most legendary cards and don't provide meaningful similarity.

        Args:
            card_tags: Full set of card theme tags
            top_n: Number of most frequent tags to use (default 5)
            random_n: Number of random tags to add. If None, auto-scales:
                - 6-10 tags: 1 random
                - 11-15 tags: 2 random
                - 16+ tags: 3 random
            seed: Random seed for reproducibility (default: None)

        Returns:
            Set of signature tags (top_n most frequent + random_n random)
        """
        # Exclude overly common tags that don't provide meaningful similarity
        excluded_tags = {"Historics Matter", "Legends Matter"}
        card_tags = card_tags - excluded_tags

        if len(card_tags) <= top_n:
            return card_tags  # Use all if card has few tags

        # Auto-scale random_n based on total tag count if not specified
        if random_n is None:
            tag_count = len(card_tags)
            if tag_count >= 16:
                random_n = 3
            elif tag_count >= 11:
                random_n = 2
            elif tag_count >= 6:
                random_n = 1
            else:
                random_n = 0  # Very few tags, no random needed

        # Sort tags by frequency (higher card_count = more common = higher priority)
        sorted_tags = sorted(
            card_tags,
            key=lambda t: -self.theme_frequencies.get(t, 0),  # Negate for descending order
        )

        # Take top N most frequent tags
        signature = set(sorted_tags[:top_n])

        # Add random tags from remaining tags
        remaining_tags = card_tags - signature
        if remaining_tags and random_n > 0:
            if seed is not None:
                random.seed(seed)

            # Sample min(random_n, len(remaining_tags)) to avoid errors
            sample_size = min(random_n, len(remaining_tags))
            random_tags = set(random.sample(list(remaining_tags), sample_size))

            signature = signature | random_tags

        return signature
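
    # Worked example of the auto-scaling: a card with 12 cleaned tags gets
    # random_n=2, so the signature is its 5 most frequent tags plus 2 sampled
    # at random from the remaining 7, giving a 7-tag signature.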

    @staticmethod
    def parse_theme_tags(tags: str | list) -> set[str]:
        """
        Parse theme tags from string or list format.

        Args:
            tags: Theme tags as string representation of list or actual list

        Returns:
            Set of theme tag strings
        """
        # Check the list case first: pd.isna() on a list returns an array,
        # which cannot be used as a truth value
        if isinstance(tags, list):
            return set(tags)

        if pd.isna(tags) or not tags:
            return set()

        if isinstance(tags, str):
            # Handle string representation of list: "['tag1', 'tag2']"
            try:
                parsed = ast.literal_eval(tags)
                if isinstance(parsed, list):
                    return set(parsed)
                return set()
            except (ValueError, SyntaxError):
                # If parsing fails, return empty set
                logger.warning(f"Failed to parse theme tags: {tags[:100]}")
                return set()

        return set()
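
    # Illustrative parses (sorted for deterministic display):
    #
    #     >>> sorted(CardSimilarity.parse_theme_tags("['Tokens Matter', 'Sacrifice']"))
    #     ['Sacrifice', 'Tokens Matter']
    #     >>> CardSimilarity.parse_theme_tags(["Lifegain"])
    #     {'Lifegain'}
    #     >>> CardSimilarity.parse_theme_tags("not a list")
    #     set()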

    @staticmethod
    def calculate_similarity(tags_a: set[str], tags_b: set[str]) -> float:
        """
        Calculate Jaccard similarity between two sets of theme tags.

        Jaccard index = intersection / union

        Args:
            tags_a: First set of theme tags
            tags_b: Second set of theme tags

        Returns:
            Similarity score from 0.0 (no overlap) to 1.0 (identical)
        """
        if not tags_a or not tags_b:
            return 0.0

        intersection = len(tags_a & tags_b)
        union = len(tags_a | tags_b)

        if union == 0:
            return 0.0

        return intersection / union
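
    # Quick sanity checks (pure set arithmetic, no card data needed):
    #
    #     >>> CardSimilarity.calculate_similarity({"Tokens Matter"}, {"Tokens Matter"})
    #     1.0
    #     >>> CardSimilarity.calculate_similarity({"Tokens Matter"}, {"Lifegain"})
    #     0.0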

    def get_card_tags(self, card_name: str) -> Optional[set[str]]:
        """
        Get theme tags for a specific card.

        Args:
            card_name: Name of the card

        Returns:
            Set of theme tags, or None if card not found
        """
        card_row = self.cards_df[self.cards_df["name"] == card_name]

        if card_row.empty:
            return None

        tags = card_row.iloc[0]["themeTags"]
        return self.parse_theme_tags(tags)

    def find_similar(
        self,
        card_name: str,
        threshold: float = 0.8,
        limit: int = 10,
        min_results: int = 3,
        adaptive: bool = True,
        use_cache: bool = True,
    ) -> list[dict]:
        """
        Find cards with similar theme tags.

        Uses adaptive threshold scaling to ensure a minimum number of results:
        tries 80% → 60% → 50% thresholds until min_results is met (70% is
        skipped for performance).

        Checks the cache first for pre-computed results, falling back to
        real-time calculation.

        Args:
            card_name: Name of the target card
            threshold: Starting similarity threshold (0.0-1.0), default 0.8 (80%)
            limit: Maximum number of results, default 10
            min_results: Minimum desired results for adaptive scaling, default 3
            adaptive: Enable adaptive threshold scaling, default True
            use_cache: Check cache first before calculating, default True

        Returns:
            List of dicts with keys: name, similarity, themeTags, edhrecRank, threshold_used.
            Sorted by similarity descending, then by EDHREC rank ascending (more popular first).
            Returns an empty list if the card is not found or has no tags.
        """
        # Check cache first
        if use_cache and self.cache.enabled:
            cached_results = self.cache.get_similar(card_name, limit=limit, randomize=True)
            if cached_results is not None:
                logger.info(f"Cache HIT for '{card_name}' ({len(cached_results)} results, randomized)")
                return cached_results
            else:
                logger.info(f"Cache MISS for '{card_name}', calculating...")

        # Get target card tags
        target_tags = self.get_card_tags(card_name)

        if target_tags is None:
            logger.warning(f"Card not found: {card_name}")
            return []

        if not target_tags:
            logger.info(f"Card has no theme tags: {card_name}")
            return []

        # Get signature tags for the TARGET card only (top 5 most frequent + 1-3 random).
        # This focuses the search on the target's defining characteristics
        # with some diversity from random tags.

        # Use card name hash as seed for reproducible randomness per card
        # (str hashes are salted per process unless PYTHONHASHSEED is set,
        # so reproducibility holds within a single process)
        card_seed = hash(card_name) % (2**31)
        target_signature = self.get_signature_tags(
            target_tags,
            top_n=5,
            seed=card_seed,
        )

        logger.debug(
            f"Target '{card_name}': {len(target_tags)} tags → "
            f"{len(target_signature)} signature tags"
        )

        # Try adaptive thresholds if enabled
        thresholds_to_try = [threshold]
        if adaptive:
            # Build list of thresholds to try: 80% → 60% → 50% (skip 70% for performance)
            thresholds_to_try = []
            if threshold >= 0.8:
                thresholds_to_try.append(0.8)
            if threshold >= 0.6:
                thresholds_to_try.append(0.6)
            if threshold >= 0.5:
                thresholds_to_try.append(0.5)

            # Remove duplicates and sort descending
            thresholds_to_try = sorted(set(thresholds_to_try), reverse=True)

        results = []
        threshold_used = threshold

        for current_threshold in thresholds_to_try:
            # Track the threshold actually used so it is reported correctly
            # even if the loop exhausts without reaching min_results
            threshold_used = current_threshold

            # Use inverted index for fast candidate filtering: instead of checking
            # all 29k cards, only check cards that share at least one signature tag
            results = []

            # Build inverted index on first use (lazily)
            if self._tag_to_cards_index is None:
                self._build_tag_index()

            # Get candidate cards that share at least one signature tag.
            # This drastically reduces the number of cards we need to check.
            candidate_cards = set()
            for tag in target_signature:
                if tag in self._tag_to_cards_index:
                    candidate_cards.update(self._tag_to_cards_index[tag])

            # Remove the target card itself
            candidate_cards.discard(card_name)

            if not candidate_cards:
                continue  # No candidates at all, try lower threshold

            # Now calculate scores only for candidates.
            # Pre-filter candidates by checking a minimum overlap requirement.
            min_overlap = int(len(target_signature) * current_threshold)

            for candidate_name in candidate_cards:
                candidate_tags = self.cleaned_tags_cache.get(candidate_name)

                if not candidate_tags:
                    continue

                # Fast overlap check using set intersection
                overlap = target_signature & candidate_tags
                overlap_count = len(overlap)

                # Quick filter: skip if overlap too small
                if overlap_count < min_overlap:
                    continue

                # Calculate exact containment score (overlap / target signature size)
                containment_score = overlap_count / len(target_signature)

                if containment_score >= current_threshold:
                    # Get EDHREC rank efficiently from pre-computed card metadata
                    edhrec_rank = self._card_metadata.get(candidate_name, {}).get("edhrecRank", float("inf"))

                    results.append({
                        "name": candidate_name,
                        "similarity": containment_score,
                        "themeTags": list(candidate_tags),
                        "edhrecRank": edhrec_rank,
                    })

            # Sort by similarity descending, then by EDHREC rank ascending (lower is better).
            # Unranked cards (inf) will appear last.
            results.sort(key=lambda x: (-x["similarity"], x["edhrecRank"]))

            # Check if we have enough results
            if len(results) >= min_results or not adaptive:
                break

            # Log that we're trying a lower threshold
            logger.info(
                f"Found {len(results)} results at {current_threshold:.0%} "
                f"for '{card_name}', trying lower threshold..."
            )

        # Add threshold_used to results
        for result in results:
            result["threshold_used"] = threshold_used

        logger.info(
            f"Found {len(results)} similar cards for '{card_name}' "
            f"at {threshold_used:.0%} threshold"
        )

        final_results = results[:limit]

        # Cache the results for future lookups
        if use_cache and self.cache.enabled and final_results:
            self.cache.set_similar(card_name, final_results)
            logger.debug(f"Cached {len(final_results)} results for '{card_name}'")

        return final_results
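

# A minimal usage sketch (assumes the default card_files/all_cards.parquet and
# config/themes/theme_catalog.csv exist on disk; the card name and limits below
# are illustrative, not part of the service itself):
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    similarity = CardSimilarity()  # loads card data from the default parquet path
    matches = similarity.find_similar("Krenko, Mob Boss", threshold=0.8, limit=5)

    for match in matches:
        print(
            f"{match['name']}: {match['similarity']:.0%} similar "
            f"(threshold used: {match['threshold_used']:.0%})"
        )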