mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-12-16 23:50:12 +01:00
Add card browser with similar cards and performance optimizations
This commit is contained in:
parent
a8dc1835eb
commit
c2960c808e
25 changed files with 4841 additions and 1392 deletions
code/web/services/card_similarity.py (new file, 483 lines)
@@ -0,0 +1,483 @@
"""
Card similarity service using Jaccard index on theme tags.

Provides similarity scoring between cards based on theme tag overlap.
Used for "Similar Cards" feature in card browser.

Supports persistent caching for improved performance (2-6s → <500ms).

Uses "signature tags" approach: compares top 5 most frequent tags instead
of all tags, significantly improving performance and quality.
"""

import ast
import logging
import random
from pathlib import Path
from typing import Optional

import pandas as pd

from code.web.services.similarity_cache import SimilarityCache, get_cache

logger = logging.getLogger(__name__)


class CardSimilarity:
    """Calculate card similarity using theme tag overlap (Jaccard index) with caching."""

    def __init__(self, cards_df: Optional[pd.DataFrame] = None, cache: Optional[SimilarityCache] = None):
        """
        Initialize similarity calculator.

        Args:
            cards_df: DataFrame with card data. If None, loads from all_cards.parquet
            cache: SimilarityCache instance. If None, uses global singleton
        """
        if cards_df is None:
            # Load from default location
            parquet_path = Path(__file__).parents[3] / "card_files" / "all_cards.parquet"
            logger.info(f"Loading cards from {parquet_path}")
            self.cards_df = pd.read_parquet(parquet_path)
        else:
            self.cards_df = cards_df

        # Initialize cache
        self.cache = cache if cache is not None else get_cache()

        # Load theme frequencies from catalog
        self.theme_frequencies = self._load_theme_frequencies()

        # Pre-compute cleaned tags (with exclusions) for all cards (one-time cost, huge speedup)
        # This removes "Historics Matter" and "Legends Matter" from all cards
        self.cleaned_tags_cache = self._precompute_cleaned_tags()

        # Pre-compute card metadata (EDHREC rank) for fast lookups
        self._card_metadata = self._precompute_card_metadata()

        # Inverted index (tag -> set of card names) - built lazily on first use
        self._tag_to_cards_index = None

        logger.info(
            f"Initialized CardSimilarity with {len(self.cards_df)} cards "
            f"and {len(self.theme_frequencies)} theme frequencies "
            f"(cache: {'enabled' if self.cache.enabled else 'disabled'})"
        )

    def _load_theme_frequencies(self) -> dict[str, int]:
        """
        Load theme frequencies from theme_catalog.csv.

        Returns:
            Dict mapping theme name to card_count (higher = more common)
        """
        catalog_path = Path(__file__).parents[3] / "config" / "themes" / "theme_catalog.csv"

        try:
            # Read CSV, skipping comment line
            df = pd.read_csv(catalog_path, comment="#")

            # Create dict mapping theme -> card_count
            # Higher card_count = more common/frequent theme
            frequencies = dict(zip(df["theme"], df["card_count"]))

            logger.info(f"Loaded {len(frequencies)} theme frequencies from catalog")
            return frequencies

        except Exception as e:
            logger.warning(f"Failed to load theme frequencies: {e}, using empty dict")
            return {}

    def _precompute_cleaned_tags(self) -> dict[str, set[str]]:
        """
        Pre-compute cleaned tags for all cards.

        Removes overly common tags like "Historics Matter" and "Legends Matter"
        that don't provide meaningful similarity. This is done once during
        initialization to avoid recalculating for every comparison.

        Returns:
            Dict mapping card name -> cleaned tags (full set minus exclusions)
        """
        logger.info("Pre-computing cleaned tags for all cards...")
        excluded_tags = {"Historics Matter", "Legends Matter"}
        cleaned = {}

        for _, row in self.cards_df.iterrows():
            card_name = row["name"]
            tags = self.parse_theme_tags(row["themeTags"])

            if tags:
                # Remove excluded tags
                cleaned_tags = tags - excluded_tags
                if cleaned_tags:  # Only store if card has tags after exclusion
                    cleaned[card_name] = cleaned_tags

        logger.info(f"Pre-computed {len(cleaned)} card tag sets")
        return cleaned

    def _precompute_card_metadata(self) -> dict[str, dict]:
        """
        Pre-compute card metadata (EDHREC rank, etc.) for fast lookups.

        Returns:
            Dict mapping card name -> metadata dict
        """
        logger.info("Pre-computing card metadata...")
        metadata = {}

        for _, row in self.cards_df.iterrows():
            card_name = row["name"]
            edhrec_rank = row.get("edhrecRank")
            # Convert to float, use inf for NaN/None
            edhrec_rank = float(edhrec_rank) if pd.notna(edhrec_rank) else float('inf')

            metadata[card_name] = {
                "edhrecRank": edhrec_rank,
            }

        logger.info(f"Pre-computed metadata for {len(metadata)} cards")
        return metadata

    def _build_tag_index(self) -> None:
        """
        Build inverted index: tag -> set of card names that have this tag.

        This allows fast candidate filtering - instead of checking all 29k cards,
        we only check cards that share at least one tag with the target.

        Performance impact: Reduces 29k comparisons to typically 100-2000 comparisons.
        """
        logger.info("Building inverted tag index...")
        index = {}

        for card_name, tags in self.cleaned_tags_cache.items():
            for tag in tags:
                if tag not in index:
                    index[tag] = set()
                index[tag].add(card_name)

        self._tag_to_cards_index = index

        # Log statistics
        avg_cards_per_tag = sum(len(cards) for cards in index.values()) / len(index) if index else 0
        logger.info(
            f"Built tag index: {len(index)} unique tags, "
            f"avg {avg_cards_per_tag:.1f} cards per tag"
        )

    def get_signature_tags(
        self,
        card_tags: set[str],
        top_n: int = 5,
        random_n: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> set[str]:
        """
        Get signature tags for similarity comparison.

        Takes the most frequent (popular) tags PLUS random tags for diversity.
        This balances defining characteristics with discovery of niche synergies.

        Excludes overly common tags like "Historics Matter" and "Legends Matter"
        that appear on most legendary cards and don't provide meaningful similarity.

        Args:
            card_tags: Full set of card theme tags
            top_n: Number of most frequent tags to use (default 5)
            random_n: Number of random tags to add. If None, auto-scales:
                - 6-10 tags: 1 random
                - 11-15 tags: 2 random
                - 16+ tags: 3 random
            seed: Random seed for reproducibility (default: None)

        Returns:
            Set of signature tags (top_n most frequent + random_n random)
        """
        # Exclude overly common tags that don't provide meaningful similarity
        excluded_tags = {"Historics Matter", "Legends Matter"}
        card_tags = card_tags - excluded_tags

        if len(card_tags) <= top_n:
            return card_tags  # Use all if card has few tags

        # Auto-scale random_n based on total tag count if not specified
        if random_n is None:
            tag_count = len(card_tags)
            if tag_count >= 16:
                random_n = 3
            elif tag_count >= 11:
                random_n = 2
            elif tag_count >= 6:
                random_n = 1
            else:
                random_n = 0  # Very few tags, no random needed

        # Sort tags by frequency (higher card_count = more common = higher priority)
        sorted_tags = sorted(
            card_tags,
            key=lambda t: -self.theme_frequencies.get(t, 0),  # Negate for descending order
        )

        # Take top N most frequent tags
        signature = set(sorted_tags[:top_n])

        # Add random tags from remaining tags
        remaining_tags = card_tags - signature
        if remaining_tags and random_n > 0:
            if seed is not None:
                random.seed(seed)

            # Sample min(random_n, len(remaining_tags)) to avoid errors
            sample_size = min(random_n, len(remaining_tags))
            random_tags = set(random.sample(list(remaining_tags), sample_size))

            signature = signature | random_tags

        return signature

    @staticmethod
    def parse_theme_tags(tags: str | list) -> set[str]:
        """
        Parse theme tags from string or list format.

        Args:
            tags: Theme tags as string representation of list or actual list

        Returns:
            Set of theme tag strings
        """
        # Check for an actual list first: pd.isna() on a list returns an array,
        # which cannot be used directly in a boolean test.
        if isinstance(tags, list):
            return set(tags)

        if pd.isna(tags) or not tags:
            return set()

        if isinstance(tags, str):
            # Handle string representation of list: "['tag1', 'tag2']"
            try:
                parsed = ast.literal_eval(tags)
                if isinstance(parsed, list):
                    return set(parsed)
                return set()
            except (ValueError, SyntaxError):
                # If parsing fails, return empty set
                logger.warning(f"Failed to parse theme tags: {tags[:100]}")
                return set()

        return set()

    @staticmethod
    def calculate_similarity(tags_a: set[str], tags_b: set[str]) -> float:
        """
        Calculate Jaccard similarity between two sets of theme tags.

        Jaccard index = intersection / union

        Args:
            tags_a: First set of theme tags
            tags_b: Second set of theme tags

        Returns:
            Similarity score from 0.0 (no overlap) to 1.0 (identical)
        """
        if not tags_a or not tags_b:
            return 0.0

        intersection = len(tags_a & tags_b)
        union = len(tags_a | tags_b)

        if union == 0:
            return 0.0

        return intersection / union

    def get_card_tags(self, card_name: str) -> Optional[set[str]]:
        """
        Get theme tags for a specific card.

        Args:
            card_name: Name of the card

        Returns:
            Set of theme tags, or None if card not found
        """
        card_row = self.cards_df[self.cards_df["name"] == card_name]

        if card_row.empty:
            return None

        tags = card_row.iloc[0]["themeTags"]
        return self.parse_theme_tags(tags)

    def find_similar(
        self,
        card_name: str,
        threshold: float = 0.8,
        limit: int = 10,
        min_results: int = 3,
        adaptive: bool = True,
        use_cache: bool = True,
    ) -> list[dict]:
        """
        Find cards with similar theme tags.

        Uses adaptive threshold scaling to ensure a minimum number of results.
        Tries 80% → 60% → 50% thresholds until min_results is met (skips 70% for performance).

        Checks cache first for pre-computed results, falls back to real-time calculation.

        Args:
            card_name: Name of the target card
            threshold: Starting similarity threshold (0.0-1.0), default 0.8 (80%)
            limit: Maximum number of results, default 10
            min_results: Minimum desired results for adaptive scaling, default 3
            adaptive: Enable adaptive threshold scaling, default True
            use_cache: Check cache first before calculating, default True

        Returns:
            List of dicts with keys: name, similarity, themeTags, edhrecRank, threshold_used
            Sorted by similarity descending, then by EDHREC rank ascending (more popular first)
            Returns empty list if card not found or has no tags
        """
        # Check cache first
        if use_cache and self.cache.enabled:
            cached_results = self.cache.get_similar(card_name, limit=limit, randomize=True)
            if cached_results is not None:
                logger.info(f"Cache HIT for '{card_name}' ({len(cached_results)} results, randomized)")
                return cached_results
            else:
                logger.info(f"Cache MISS for '{card_name}', calculating...")

        # Get target card tags
        target_tags = self.get_card_tags(card_name)

        if target_tags is None:
            logger.warning(f"Card not found: {card_name}")
            return []

        if not target_tags:
            logger.info(f"Card has no theme tags: {card_name}")
            return []

        # Get signature tags for TARGET card only (top 5 most frequent + 1-3 random)
        # This focuses the search on the target's defining characteristics
        # with some diversity from random tags

        # Use card name hash as seed for reproducible randomness per card
        card_seed = hash(card_name) % (2**31)
        target_signature = self.get_signature_tags(
            target_tags,
            top_n=5,
            seed=card_seed
        )

        logger.debug(
            f"Target '{card_name}': {len(target_tags)} tags → "
            f"{len(target_signature)} signature tags"
        )

        # Try adaptive thresholds if enabled
        thresholds_to_try = [threshold]
        if adaptive:
            # Build list of thresholds to try: 80% → 60% → 50% (skip 70% for performance)
            thresholds_to_try = []
            if threshold >= 0.8:
                thresholds_to_try.append(0.8)
            if threshold >= 0.6:
                thresholds_to_try.append(0.6)
            if threshold >= 0.5:
                thresholds_to_try.append(0.5)

            # Remove duplicates and sort descending
            thresholds_to_try = sorted(set(thresholds_to_try), reverse=True)

        results = []
        threshold_used = threshold

        for current_threshold in thresholds_to_try:
            # Use inverted index for fast candidate filtering
            # Instead of checking all 29k cards, only check cards that share at least one signature tag
            results = []

            # Build inverted index on first use (lazily)
            if self._tag_to_cards_index is None:
                self._build_tag_index()

            # Get candidate cards that share at least one signature tag
            # This drastically reduces the number of cards we need to check
            candidate_cards = set()
            for tag in target_signature:
                if tag in self._tag_to_cards_index:
                    candidate_cards.update(self._tag_to_cards_index[tag])

            # Remove the target card itself
            candidate_cards.discard(card_name)

            if not candidate_cards:
                continue  # No candidates at all, try lower threshold

            # Now calculate scores only for candidates (vectorized where possible)
            # Pre-filter candidates by checking if they meet minimum overlap requirement
            min_overlap = int(len(target_signature) * current_threshold)

            for candidate_name in candidate_cards:
                candidate_tags = self.cleaned_tags_cache.get(candidate_name)

                if not candidate_tags:
                    continue

                # Fast overlap check using set intersection
                overlap = target_signature & candidate_tags
                overlap_count = len(overlap)

                # Quick filter: skip if overlap too small
                if overlap_count < min_overlap:
                    continue

                # Calculate exact containment score
                containment_score = overlap_count / len(target_signature)

                if containment_score >= current_threshold:
                    # Get EDHREC rank efficiently from card metadata
                    edhrec_rank = self._card_metadata.get(candidate_name, {}).get('edhrecRank', float('inf'))

                    results.append({
                        "name": candidate_name,
                        "similarity": containment_score,
                        "themeTags": list(candidate_tags),
                        "edhrecRank": edhrec_rank,
                    })

            # Sort by similarity descending, then by EDHREC rank ascending (lower is better)
            # Unranked cards (inf) will appear last
            results.sort(key=lambda x: (-x["similarity"], x["edhrecRank"]))

            # Check if we have enough results
            if len(results) >= min_results or not adaptive:
                threshold_used = current_threshold
                break

            # Log that we're trying a lower threshold
            logger.info(
                f"Found {len(results)} results at {current_threshold:.0%} "
                f"for '{card_name}', trying lower threshold..."
            )

        # Add threshold_used to results
        for result in results:
            result["threshold_used"] = threshold_used

        logger.info(
            f"Found {len(results)} similar cards for '{card_name}' "
            f"at {threshold_used:.0%} threshold"
        )

        final_results = results[:limit]

        # Cache the results for future lookups
        if use_cache and self.cache.enabled and final_results:
            self.cache.set_similar(card_name, final_results)
            logger.debug(f"Cached {len(final_results)} results for '{card_name}'")

        return final_results
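
To make the scoring above concrete, here is a small, self-contained sketch of the approach the file describes: keep the most frequent tags as a "signature", then score each candidate by containment (shared tags divided by signature size), with Jaccard shown for comparison. The theme names and frequency numbers are invented for illustration and are not taken from the real theme catalog; the signature is trimmed to three tags to keep the arithmetic visible.

# Toy illustration of signature tags + containment scoring (made-up data).
theme_frequencies = {"Tokens Matter": 4200, "Lifegain": 3000, "Sacrifice Matters": 2100, "Aristocrats": 900, "Clones": 150}

target_tags = {"Tokens Matter", "Sacrifice Matters", "Aristocrats", "Lifegain", "Clones", "Historics Matter"}
excluded = {"Historics Matter", "Legends Matter"}

# Signature: most frequent tags after exclusions (the service uses top 5 plus a few random ones).
cleaned = target_tags - excluded
signature = set(sorted(cleaned, key=lambda t: -theme_frequencies.get(t, 0))[:3])
# -> {"Tokens Matter", "Lifegain", "Sacrifice Matters"}

candidate_tags = {"Tokens Matter", "Sacrifice Matters", "Treasure"}
overlap = signature & candidate_tags
containment = len(overlap) / len(signature)            # 2 / 3 ≈ 0.67: passes a 60% threshold, fails 80%
jaccard = len(overlap) / len(signature | candidate_tags)  # 2 / 4 = 0.5
print(containment, jaccard)

Note that find_similar() ranks candidates by this containment score (overlap over signature size), while calculate_similarity() implements the symmetric Jaccard index; the example shows why the two values differ for the same pair of tag sets.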
code/web/services/similarity_cache.py (new file, 386 lines)
@@ -0,0 +1,386 @@
"""
Similarity cache manager for card similarity calculations.

Provides persistent caching of pre-computed card similarity scores to improve
card detail page load times from 2-6s down to <500ms.

Cache format: Parquet file with columnar structure:
- card_name: str (source card)
- similar_name: str (similar card name)
- similarity: float (similarity score)
- edhrecRank: float (EDHREC rank of similar card)
- rank: int (ranking position, 0-19 for top 20)

Metadata stored in separate JSON sidecar file.

Benefits vs JSON:
- 5-10x faster load times
- 50-70% smaller file size
- Better compression for large datasets
- Consistent with other card data storage
"""

import json
import logging
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# Default cache settings
CACHE_VERSION = "2.0"  # Bumped for Parquet format
DEFAULT_CACHE_PATH = Path(__file__).parents[3] / "card_files" / "similarity_cache.parquet"
DEFAULT_METADATA_PATH = Path(__file__).parents[3] / "card_files" / "similarity_cache_metadata.json"


class SimilarityCache:
    """Manages persistent cache for card similarity calculations using Parquet."""

    def __init__(self, cache_path: Optional[Path] = None, enabled: bool = True):
        """
        Initialize similarity cache manager.

        Args:
            cache_path: Path to cache file. If None, uses DEFAULT_CACHE_PATH
            enabled: Whether cache is enabled (can be disabled via env var)
        """
        self.cache_path = cache_path or DEFAULT_CACHE_PATH
        self.metadata_path = self.cache_path.with_name(
            self.cache_path.stem + "_metadata.json"
        )
        self.enabled = enabled and os.getenv("SIMILARITY_CACHE_ENABLED", "1") == "1"
        self._cache_df: Optional[pd.DataFrame] = None
        self._metadata: Optional[dict] = None

        # Ensure cache directory exists
        self.cache_path.parent.mkdir(parents=True, exist_ok=True)

        if self.enabled:
            logger.info(f"SimilarityCache initialized at {self.cache_path}")
        else:
            logger.info("SimilarityCache disabled")

    def load_cache(self) -> pd.DataFrame:
        """
        Load cache from disk.

        Returns:
            DataFrame with columns: card_name, similar_name, similarity, edhrecRank, rank
            Returns empty DataFrame if file doesn't exist or loading fails
        """
        if not self.enabled:
            return self._empty_cache_df()

        if self._cache_df is not None:
            return self._cache_df

        if not self.cache_path.exists():
            logger.info("Cache file not found, returning empty cache")
            self._cache_df = self._empty_cache_df()
            return self._cache_df

        try:
            # Load Parquet file
            self._cache_df = pq.read_table(self.cache_path).to_pandas()

            # Load metadata
            if self.metadata_path.exists():
                with open(self.metadata_path, "r", encoding="utf-8") as f:
                    self._metadata = json.load(f)
            else:
                self._metadata = self._empty_metadata()

            # Validate cache structure
            if not self._validate_cache(self._cache_df):
                logger.warning("Cache validation failed, returning empty cache")
                self._cache_df = self._empty_cache_df()
                return self._cache_df

            total_cards = len(self._cache_df["card_name"].unique()) if len(self._cache_df) > 0 else 0
            logger.info(
                f"Loaded similarity cache v{self._metadata.get('version', 'unknown')} with {total_cards:,} cards ({len(self._cache_df):,} entries)"
            )

            return self._cache_df

        except Exception as e:
            logger.error(f"Failed to load cache: {e}")
            self._cache_df = self._empty_cache_df()
            return self._cache_df

    def save_cache(self, cache_df: pd.DataFrame, metadata: Optional[dict] = None) -> bool:
        """
        Save cache to disk.

        Args:
            cache_df: DataFrame with similarity data
            metadata: Optional metadata dict. If None, uses current metadata with updates.

        Returns:
            True if save successful, False otherwise
        """
        if not self.enabled:
            logger.debug("Cache disabled, skipping save")
            return False

        try:
            # Ensure directory exists
            self.cache_path.parent.mkdir(parents=True, exist_ok=True)

            # Update metadata
            if metadata is None:
                metadata = self._metadata or self._empty_metadata()

            total_cards = len(cache_df["card_name"].unique()) if len(cache_df) > 0 else 0
            metadata["total_cards"] = total_cards
            metadata["last_updated"] = datetime.now().isoformat()
            metadata["total_entries"] = len(cache_df)

            # Write Parquet file (with compression)
            temp_cache = self.cache_path.with_suffix(".tmp")
            pq.write_table(
                pa.table(cache_df),
                temp_cache,
                compression="snappy",
                version="2.6",
            )
            temp_cache.replace(self.cache_path)

            # Write metadata file
            temp_meta = self.metadata_path.with_suffix(".tmp")
            with open(temp_meta, "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)
            temp_meta.replace(self.metadata_path)

            self._cache_df = cache_df
            self._metadata = metadata

            logger.info(f"Saved similarity cache with {total_cards:,} cards ({len(cache_df):,} entries)")

            return True

        except Exception as e:
            logger.error(f"Failed to save cache: {e}")
            return False

    def get_similar(self, card_name: str, limit: int = 5, randomize: bool = True) -> Optional[list[dict]]:
        """
        Get cached similar cards for a given card.

        Args:
            card_name: Name of the card to look up
            limit: Maximum number of results to return
            randomize: If True, randomly sample from cached results; if False, return top by rank

        Returns:
            List of similar cards with similarity scores, or None if not in cache
        """
        if not self.enabled:
            return None

        cache_df = self.load_cache()

        if len(cache_df) == 0:
            return None

        # Filter to this card
        card_data = cache_df[cache_df["card_name"] == card_name]

        if len(card_data) == 0:
            return None

        # Randomly sample if requested and we have more results than limit
        if randomize and len(card_data) > limit:
            card_data = card_data.sample(n=limit, random_state=None)
        else:
            # Sort by rank and take top N
            card_data = card_data.sort_values("rank").head(limit)

        # Convert to list of dicts
        results = []
        for _, row in card_data.iterrows():
            results.append({
                "name": row["similar_name"],
                "similarity": row["similarity"],
                "edhrecRank": row["edhrecRank"],
            })

        return results

    def set_similar(self, card_name: str, similar_cards: list[dict]) -> bool:
        """
        Cache similar cards for a given card.

        Args:
            card_name: Name of the card
            similar_cards: List of similar cards with similarity scores

        Returns:
            True if successful, False otherwise
        """
        if not self.enabled:
            return False

        cache_df = self.load_cache()

        # Remove existing entries for this card
        cache_df = cache_df[cache_df["card_name"] != card_name]

        # Add new entries
        new_rows = []
        for rank, card in enumerate(similar_cards):
            new_rows.append({
                "card_name": card_name,
                "similar_name": card["name"],
                "similarity": card["similarity"],
                "edhrecRank": card.get("edhrecRank", float("inf")),
                "rank": rank,
            })

        if new_rows:
            new_df = pd.DataFrame(new_rows)
            cache_df = pd.concat([cache_df, new_df], ignore_index=True)

        return self.save_cache(cache_df)

    def invalidate(self, card_name: Optional[str] = None) -> bool:
        """
        Invalidate cache entries.

        Args:
            card_name: If provided, invalidate only this card. If None, clear entire cache.

        Returns:
            True if successful, False otherwise
        """
        if not self.enabled:
            return False

        if card_name is None:
            # Clear entire cache
            logger.info("Clearing entire similarity cache")
            self._cache_df = self._empty_cache_df()
            self._metadata = self._empty_metadata()
            return self.save_cache(self._cache_df, self._metadata)

        # Clear specific card
        cache_df = self.load_cache()

        initial_len = len(cache_df)
        cache_df = cache_df[cache_df["card_name"] != card_name]

        if len(cache_df) < initial_len:
            logger.info(f"Invalidated cache for card: {card_name}")
            return self.save_cache(cache_df)

        return False

    def get_stats(self) -> dict:
        """
        Get cache statistics.

        Returns:
            Dictionary with cache stats (version, total_cards, build_date, file_size, etc.)
        """
        if not self.enabled:
            return {"enabled": False}

        cache_df = self.load_cache()
        metadata = self._metadata or self._empty_metadata()

        stats = {
            "enabled": True,
            "version": metadata.get("version", "unknown"),
            "total_cards": len(cache_df["card_name"].unique()) if len(cache_df) > 0 else 0,
            "total_entries": len(cache_df),
            "build_date": metadata.get("build_date"),
            "last_updated": metadata.get("last_updated"),
            "file_exists": self.cache_path.exists(),
            "file_path": str(self.cache_path),
            "format": "parquet",
        }

        if self.cache_path.exists():
            stats["file_size_mb"] = round(
                self.cache_path.stat().st_size / (1024 * 1024), 2
            )

        return stats

    @staticmethod
    def _empty_cache_df() -> pd.DataFrame:
        """
        Create empty cache DataFrame.

        Returns:
            Empty DataFrame with correct schema
        """
        return pd.DataFrame(columns=["card_name", "similar_name", "similarity", "edhrecRank", "rank"])

    @staticmethod
    def _empty_metadata() -> dict:
        """
        Create empty metadata structure.

        Returns:
            Empty metadata dictionary
        """
        return {
            "version": CACHE_VERSION,
            "total_cards": 0,
            "total_entries": 0,
            "build_date": None,
            "last_updated": None,
            "threshold": 0.6,
            "min_results": 3,
        }

    @staticmethod
    def _validate_cache(cache_df: pd.DataFrame) -> bool:
        """
        Validate cache DataFrame structure.

        Args:
            cache_df: DataFrame to validate

        Returns:
            True if valid, False otherwise
        """
        if not isinstance(cache_df, pd.DataFrame):
            return False

        # Check required columns
        required_cols = {"card_name", "similar_name", "similarity", "edhrecRank", "rank"}
        if not required_cols.issubset(cache_df.columns):
            logger.warning(f"Cache missing required columns. Expected: {required_cols}, Got: {set(cache_df.columns)}")
            return False

        return True


# Singleton instance for global access
_cache_instance: Optional[SimilarityCache] = None


def get_cache() -> SimilarityCache:
    """
    Get singleton cache instance.

    Returns:
        Global SimilarityCache instance
    """
    global _cache_instance

    if _cache_instance is None:
        # Check environment variables for custom path
        cache_path_str = os.getenv("SIMILARITY_CACHE_PATH")
        cache_path = Path(cache_path_str) if cache_path_str else None

        _cache_instance = SimilarityCache(cache_path=cache_path)

    return _cache_instance
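
The cache class above lends itself to a simple round trip. The following is a minimal sketch of how the two main methods could be exercised in isolation; the card names are placeholders and the throwaway temp path is only there so the example does not touch the real cache file under card_files/.

from pathlib import Path
import tempfile

from code.web.services.similarity_cache import SimilarityCache

# Point the cache at a temporary file so the example is self-contained.
cache = SimilarityCache(cache_path=Path(tempfile.mkdtemp()) / "similarity_cache.parquet")

# Store a couple of hypothetical similar-card entries for one card.
cache.set_similar("Example Commander", [
    {"name": "Placeholder Card A", "similarity": 0.8, "edhrecRank": 1234.0},
    {"name": "Placeholder Card B", "similarity": 0.6, "edhrecRank": 5678.0},
])

# Read them back; with randomize=False the entries come back in rank order.
print(cache.get_similar("Example Commander", limit=2, randomize=False))
print(cache.get_stats())

In normal operation the services use the get_cache() singleton (optionally redirected via SIMILARITY_CACHE_PATH, or disabled via SIMILARITY_CACHE_ENABLED=0) rather than constructing SimilarityCache directly.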
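Since find_similar() writes fresh results back through set_similar(), the persistent cache can in principle be warmed offline before serving traffic. The loop below is a hypothetical warm-up pass, not something this commit ships: the idea of iterating every card name and the limit of 20 (matching the "top 20" noted in the cache docstring) are assumptions for illustration.

# Hypothetical cache warm-up script (illustrative only, not part of this commit).
import logging

from code.web.services.card_similarity import CardSimilarity

logging.basicConfig(level=logging.INFO)

similarity = CardSimilarity()  # loads all_cards.parquet and builds the tag caches

for i, card_name in enumerate(similarity.cards_df["name"].drop_duplicates()):
    # find_similar() checks the cache first and stores new results via set_similar()
    similarity.find_similar(card_name, threshold=0.8, limit=20, use_cache=True)
    if i % 500 == 0:
        logging.info("Warmed %d cards", i)

Note that set_similar() rewrites the whole Parquet file on every call, so a real batch build would more likely accumulate rows and call save_cache() once at the end; the loop above is only meant to show how the two services connect.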