feat: consolidate card data into optimized format for faster queries and reduced file sizes

This commit is contained in:
matt 2025-10-15 11:04:49 -07:00
parent 5753bb19f8
commit f70ffca23e
24 changed files with 2903 additions and 135 deletions

View file

@ -0,0 +1,6 @@
"""Services package for MTG Python Deckbuilder."""
from code.services.all_cards_loader import AllCardsLoader
from code.services.card_query_builder import CardQueryBuilder
# Names exported by `from <this package> import *`.
__all__ = ["AllCardsLoader", "CardQueryBuilder"]

View file

@ -0,0 +1,289 @@
"""
All Cards Loader
Provides efficient loading and querying of the consolidated all_cards.parquet file.
Features in-memory caching with TTL and automatic reload on file changes.
Usage:
loader = AllCardsLoader()
# Single card lookup
card = loader.get_by_name("Sol Ring")
# Batch lookup
cards = loader.get_by_names(["Sol Ring", "Lightning Bolt", "Counterspell"])
# Filter by color identity
blue_cards = loader.filter_by_color_identity(["U"])
# Filter by themes
token_cards = loader.filter_by_themes(["tokens"], mode="any")
# Simple text search
results = loader.search("create token", limit=100)
"""
from __future__ import annotations
import os
import time
from typing import Optional
import pandas as pd
from code.logging_util import get_logger
from code.settings import CARD_FILES_DIRECTORY
# Module-level logger obtained from the project's logging helper; used by
# AllCardsLoader for load timing, cache events, and missing-column warnings.
logger = get_logger(__name__)
class AllCardsLoader:
    """Load, cache, and query the consolidated all_cards.parquet file.

    The DataFrame is read lazily on the first call to ``load()`` and kept in
    memory until the TTL expires, the file's modification time changes, or the
    cache is cleared explicitly. All query helpers go through ``load()``, so
    they transparently pick up a refreshed file.

    Text-based filters (``filter_by_themes``, ``filter_by_type``, ``search``)
    treat their input as a literal, case-insensitive substring, never as a
    regular expression, so values such as ``"+1/+1 Counters"`` or ``"{T}"``
    are safe to pass.
    """

    def __init__(self, file_path: Optional[str] = None, cache_ttl: int = 300) -> None:
        """
        Initialize AllCardsLoader.

        Args:
            file_path: Path to all_cards.parquet (defaults to
                CARD_FILES_DIRECTORY/all_cards.parquet)
            cache_ttl: Time-to-live for cache in seconds (default: 300 = 5 minutes)
        """
        self.file_path = file_path or os.path.join(CARD_FILES_DIRECTORY, "all_cards.parquet")
        self.cache_ttl = cache_ttl
        # Cached frame plus the bookkeeping needed to decide when it is stale.
        self._df: Optional[pd.DataFrame] = None
        self._last_load_time: float = 0
        self._file_mtime: float = 0

    @staticmethod
    def _contains_literal(column: pd.Series, needle: str) -> pd.Series:
        """Return a mask of rows whose text contains ``needle`` (case-insensitive, literal).

        ``regex=False`` is essential: user-supplied themes and queries routinely
        contain regex metacharacters ("+1/+1 Counters", "(", "{C}") which would
        otherwise raise ``re.error`` or silently match the wrong rows.
        Missing values never match.
        """
        return column.str.contains(needle, case=False, na=False, regex=False)

    def load(self, force_reload: bool = False) -> pd.DataFrame:
        """
        Load all_cards.parquet with caching.

        Returns the cached DataFrame if all of the following hold:
            - Cache exists
            - Cache is not expired (within TTL)
            - File hasn't been modified since last load
            - force_reload is False

        Args:
            force_reload: Force reload from disk even if cached

        Returns:
            DataFrame containing all cards

        Raises:
            FileNotFoundError: If all_cards.parquet doesn't exist
        """
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"All cards file not found: {self.file_path}")

        current_time = time.time()
        file_mtime = os.path.getmtime(self.file_path)
        cache_valid = (
            self._df is not None
            and not force_reload
            and (current_time - self._last_load_time) < self.cache_ttl
            and file_mtime == self._file_mtime
        )
        if cache_valid:
            return self._df  # type: ignore[return-value]

        logger.info(f"Loading all_cards from {self.file_path}...")
        start_time = time.time()
        self._df = pd.read_parquet(self.file_path, engine="pyarrow")
        elapsed = time.time() - start_time

        # Record the timestamps captured *before* the read so that a file
        # rewritten while we were reading is detected on the next call.
        self._last_load_time = current_time
        self._file_mtime = file_mtime
        logger.info(
            f"Loaded {len(self._df)} cards with {len(self._df.columns)} columns in {elapsed:.3f}s"
        )
        return self._df

    def get_by_name(self, name: str) -> Optional[pd.Series]:
        """
        Get a single card by exact name match.

        Args:
            name: Card name to search for

        Returns:
            Series for the first matching row, or None if there is no match
            or the data has no 'name' column
        """
        df = self.load()
        if "name" not in df.columns:
            logger.warning("'name' column not found in all_cards")
            return None
        matches = df[df["name"] == name]
        return None if matches.empty else matches.iloc[0]

    def get_by_names(self, names: list[str]) -> pd.DataFrame:
        """
        Get multiple cards by exact name matches (batch lookup).

        Args:
            names: List of card names to search for

        Returns:
            DataFrame containing matching cards (may be empty)
        """
        df = self.load()
        if "name" not in df.columns:
            logger.warning("'name' column not found in all_cards")
            return pd.DataFrame()
        return df[df["name"].isin(names)]

    def filter_by_color_identity(self, colors: list[str]) -> pd.DataFrame:
        """
        Filter cards by color identity.

        Matching is an exact comparison against the stored 'colorIdentity'
        value: a card is returned when its value equals one of ``colors``.
        No subset/superset logic is applied, so passing ["W", "U"] returns
        rows whose value is exactly "W" or exactly "U".
        If "Colorless"/"colorless" is present in ``colors``, only colorless
        cards are returned and the other entries are ignored.

        Args:
            colors: List of color codes (e.g., ["W", "U"], ["Colorless"])

        Returns:
            DataFrame containing cards matching the color identity
        """
        df = self.load()
        if "colorIdentity" not in df.columns:
            logger.warning("'colorIdentity' column not found in all_cards")
            return pd.DataFrame()

        colorless_labels = ["Colorless", "colorless"]
        if set(colors) & set(colorless_labels):
            return df[df["colorIdentity"].isin(colorless_labels)]
        if len(colors) == 1:
            return df[df["colorIdentity"] == colors[0]]
        return df[df["colorIdentity"].isin(colors)]

    def filter_by_themes(self, themes: list[str], mode: str = "any") -> pd.DataFrame:
        """
        Filter cards by theme tags.

        Each theme is matched as a literal, case-insensitive substring of the
        'themeTags' value (assumes that column holds text - TODO confirm
        against the parquet schema).

        Args:
            themes: List of theme tags to search for
            mode: "all" (must have all themes); any other value behaves as
                "any" (at least one theme)

        Returns:
            DataFrame containing cards matching the theme criteria. With an
            empty ``themes`` list, "all" returns every card and "any" returns none.
        """
        df = self.load()
        if "themeTags" not in df.columns:
            logger.warning("'themeTags' column not found in all_cards")
            return pd.DataFrame()

        require_all = mode == "all"
        # Identity element: True for AND-accumulation, False for OR-accumulation.
        mask = pd.Series(require_all, index=df.index, dtype=bool)
        for theme in themes:
            hits = self._contains_literal(df["themeTags"], theme)
            mask = (mask & hits) if require_all else (mask | hits)
        return df[mask]

    def search(self, query: str, limit: int = 100) -> pd.DataFrame:
        """
        Simple text search across card name, type, and oracle text.

        The query is a literal, case-insensitive substring; columns that are
        absent from the data are skipped.

        Args:
            query: Search query string
            limit: Maximum number of results to return

        Returns:
            DataFrame containing matching cards (up to limit)
        """
        df = self.load()
        mask = pd.Series(False, index=df.index, dtype=bool)
        for column in ("name", "type", "text"):
            if column in df.columns:
                mask |= self._contains_literal(df[column], query)
        results = df[mask]
        return results.head(limit) if len(results) > limit else results

    def filter_by_type(self, type_query: str) -> pd.DataFrame:
        """
        Filter cards by type line (literal, case-insensitive partial match).

        Args:
            type_query: Type string to search for (e.g., "Creature", "Instant", "Artifact")

        Returns:
            DataFrame containing cards matching the type
        """
        df = self.load()
        if "type" not in df.columns:
            logger.warning("'type' column not found in all_cards")
            return pd.DataFrame()
        return df[self._contains_literal(df["type"], type_query)]

    def get_stats(self) -> dict:
        """
        Get statistics about the loaded card data.

        Calls ``load()`` first, so the file is read if it is not cached yet.

        Returns:
            Dictionary with card count, column count, file path, file size in
            MB, whether a frame is cached, and the cache age in seconds
        """
        df = self.load()
        if os.path.exists(self.file_path):
            file_size_mb = round(os.path.getsize(self.file_path) / (1024 * 1024), 2)
        else:
            file_size_mb = 0
        cache_age = (
            int(time.time() - self._last_load_time) if self._last_load_time > 0 else None
        )
        return {
            "total_cards": len(df),
            "columns": len(df.columns),
            "file_path": self.file_path,
            "file_size_mb": file_size_mb,
            "cached": self._df is not None,
            "cache_age_seconds": cache_age,
        }

    def clear_cache(self) -> None:
        """Clear the cached DataFrame, forcing next load to read from disk."""
        self._df = None
        self._last_load_time = 0
        logger.info("Cache cleared")

View file

@ -0,0 +1,207 @@
"""
Card Query Builder
Provides a fluent API for building complex card queries against the consolidated all_cards.parquet.
Usage:
from code.services.card_query_builder import CardQueryBuilder
# Simple query
builder = CardQueryBuilder()
cards = builder.colors(["W", "U"]).execute()
# Complex query
cards = (CardQueryBuilder()
.colors(["G"])
.themes(["tokens"], mode="any")
.types("Creature")
.limit(20)
.execute())
# Get specific cards
cards = CardQueryBuilder().names(["Sol Ring", "Lightning Bolt"]).execute()
"""
from __future__ import annotations
from typing import Optional
import pandas as pd
from code.services.all_cards_loader import AllCardsLoader
class CardQueryBuilder:
    """Fluent API for building card queries.

    Each filter method stores its criterion and returns the builder so calls
    can be chained; nothing touches the data until ``execute()`` (or
    ``count()`` / ``first()``) is called. All configured filters are combined
    with AND semantics.
    """

    def __init__(self, loader: Optional[AllCardsLoader] = None) -> None:
        """
        Initialize CardQueryBuilder.

        Args:
            loader: AllCardsLoader instance (creates default if None)
        """
        self._loader = loader if loader is not None else AllCardsLoader()
        self._color_filter: Optional[list[str]] = None
        self._theme_filter: Optional[list[str]] = None
        self._theme_mode: str = "any"
        self._type_filter: Optional[str] = None
        self._name_filter: Optional[list[str]] = None
        self._search_query: Optional[str] = None
        self._limit: Optional[int] = None

    def colors(self, colors: list[str]) -> CardQueryBuilder:
        """
        Filter by color identity.

        Args:
            colors: List of color codes (e.g., ["W", "U"])

        Returns:
            Self for chaining
        """
        self._color_filter = colors
        return self

    def themes(self, themes: list[str], mode: str = "any") -> CardQueryBuilder:
        """
        Filter by theme tags.

        Args:
            themes: List of theme tags
            mode: "any" (at least one) or "all" (must have all)

        Returns:
            Self for chaining
        """
        self._theme_filter = themes
        self._theme_mode = mode
        return self

    def types(self, type_query: str) -> CardQueryBuilder:
        """
        Filter by type line (partial match).

        Args:
            type_query: Type string to search for

        Returns:
            Self for chaining
        """
        self._type_filter = type_query
        return self

    def names(self, names: list[str]) -> CardQueryBuilder:
        """
        Filter by specific card names (batch lookup).

        Args:
            names: List of card names

        Returns:
            Self for chaining
        """
        self._name_filter = names
        return self

    def search(self, query: str) -> CardQueryBuilder:
        """
        Add text search across name, type, and oracle text.

        Args:
            query: Search query string

        Returns:
            Self for chaining
        """
        self._search_query = query
        return self

    def limit(self, limit: int) -> CardQueryBuilder:
        """
        Limit number of results.

        Args:
            limit: Maximum number of results; 0 yields an empty result

        Returns:
            Self for chaining
        """
        self._limit = limit
        return self

    def execute(self) -> pd.DataFrame:
        """
        Execute the query and return results.

        Empty/unset filters (None, empty list, empty string) are skipped.
        Each active filter is evaluated by the loader over the full card set
        and intersected with the running result by index label.

        Returns:
            DataFrame containing matching cards
        """
        # Start with all cards or specific names
        if self._name_filter:
            df = self._loader.get_by_names(self._name_filter)
        else:
            df = self._loader.load()

        if self._color_filter:
            color_results = self._loader.filter_by_color_identity(self._color_filter)
            df = df[df.index.isin(color_results.index)]

        if self._theme_filter:
            theme_results = self._loader.filter_by_themes(
                self._theme_filter, mode=self._theme_mode
            )
            df = df[df.index.isin(theme_results.index)]

        if self._type_filter:
            type_results = self._loader.filter_by_type(self._type_filter)
            df = df[df.index.isin(type_results.index)]

        if self._search_query:
            # Large limit so the loader's own cap does not truncate matches
            # before the builder-level limit is applied below.
            search_results = self._loader.search(self._search_query, limit=999999)
            df = df[df.index.isin(search_results.index)]

        # Compare against None explicitly: a limit of 0 is a valid request for
        # zero rows and must not be treated as "no limit".
        if self._limit is not None and len(df) > self._limit:
            df = df.head(self._limit)
        return df

    def count(self) -> int:
        """
        Count results (runs the full query and measures its length).

        Returns:
            Number of matching cards
        """
        return len(self.execute())

    def first(self) -> Optional[pd.Series]:
        """
        Get first result only.

        Returns:
            First matching card as Series, or None if no results
        """
        results = self.execute()
        if results.empty:
            return None
        return results.iloc[0]

    def reset(self) -> CardQueryBuilder:
        """
        Reset all filters (the loader is kept).

        Returns:
            Self for chaining
        """
        self._color_filter = None
        self._theme_filter = None
        self._theme_mode = "any"
        self._type_filter = None
        self._name_filter = None
        self._search_query = None
        self._limit = None
        return self

View file

@ -0,0 +1,281 @@
"""
Legacy Loader Adapter
Provides backward-compatible wrapper functions around AllCardsLoader for smooth migration.
Existing code can continue using old file-loading patterns while benefiting from
the new consolidated Parquet backend.
This adapter will be maintained through v3.0.x; its functions already emit deprecation warnings and will be removed in v3.1+.
Usage:
# Old code (still works):
from code.services.legacy_loader_adapter import load_cards_by_type
creatures = load_cards_by_type("Creature")
# New code (preferred):
from code.services.all_cards_loader import AllCardsLoader
loader = AllCardsLoader()
creatures = loader.filter_by_type("Creature")
"""
from __future__ import annotations
import warnings
from typing import Optional
import pandas as pd
from code.logging_util import get_logger
from code.services.all_cards_loader import AllCardsLoader
from code.settings import USE_ALL_CARDS_FILE
# Module-level logger obtained from the project's logging helper; used for
# deprecation notices and "feature disabled" warnings below.
logger = get_logger(__name__)
# Single AllCardsLoader shared by every legacy wrapper so they all reuse one
# in-memory cache. Created lazily by _get_loader() and dropped again by
# clear_card_cache().
_shared_loader: Optional[AllCardsLoader] = None
def _get_loader() -> AllCardsLoader:
    """Return the module-wide AllCardsLoader, building it on first use."""
    global _shared_loader
    if _shared_loader is not None:
        return _shared_loader
    # First call (or first call after clear_card_cache()): create the instance.
    _shared_loader = AllCardsLoader()
    return _shared_loader
def _deprecation_warning(func_name: str, replacement: str) -> None:
    """
    Emit a DeprecationWarning and a log entry for a legacy function.

    Args:
        func_name: Display name of the deprecated function (e.g. "search_cards()")
        replacement: Display name of the API callers should migrate to
    """
    warning_text = (
        f"{func_name} is deprecated and will be removed in v3.1+. "
        f"Use {replacement} instead."
    )
    # stacklevel=3 attributes the warning to the code that called the legacy
    # wrapper: 1 = this helper, 2 = the wrapper, 3 = the wrapper's caller.
    warnings.warn(warning_text, DeprecationWarning, stacklevel=3)
    log_text = f"DEPRECATION: {func_name} called. Migrate to {replacement} before v3.1+"
    logger.warning(log_text)
def load_all_cards(use_cache: bool = True) -> pd.DataFrame:
    """
    Return every card from the consolidated Parquet file (legacy wrapper).

    Args:
        use_cache: When False, the shared loader is told to re-read the file
            from disk instead of serving its cached frame (default: True)

    Returns:
        DataFrame containing all cards, or an empty DataFrame when
        USE_ALL_CARDS_FILE is disabled

    Deprecated:
        Use AllCardsLoader().load() instead.
    """
    _deprecation_warning("load_all_cards()", "AllCardsLoader().load()")
    if USE_ALL_CARDS_FILE:
        bypass_cache = not use_cache
        return _get_loader().load(force_reload=bypass_cache)
    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
    return pd.DataFrame()
def load_cards_by_name(name: str) -> Optional[pd.Series]:
    """
    Look up one card by its exact name (legacy wrapper).

    Args:
        name: Card name to search for

    Returns:
        Series containing card data; None if the card is not found or
        USE_ALL_CARDS_FILE is disabled

    Deprecated:
        Use AllCardsLoader().get_by_name() instead.
    """
    _deprecation_warning("load_cards_by_name()", "AllCardsLoader().get_by_name()")
    if USE_ALL_CARDS_FILE:
        return _get_loader().get_by_name(name)
    logger.warning("USE_ALL_CARDS_FILE is disabled, returning None")
    return None
def load_cards_by_names(names: list[str]) -> pd.DataFrame:
    """
    Look up several cards by exact name in one call (legacy wrapper).

    Args:
        names: List of card names to search for

    Returns:
        DataFrame containing matching cards, or an empty DataFrame when
        USE_ALL_CARDS_FILE is disabled

    Deprecated:
        Use AllCardsLoader().get_by_names() instead.
    """
    _deprecation_warning("load_cards_by_names()", "AllCardsLoader().get_by_names()")
    if USE_ALL_CARDS_FILE:
        return _get_loader().get_by_names(names)
    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
    return pd.DataFrame()
def load_cards_by_type(type_str: str) -> pd.DataFrame:
    """
    Return cards whose type line partially matches ``type_str`` (legacy wrapper).

    Args:
        type_str: Type string to search for (e.g., "Creature", "Instant")

    Returns:
        DataFrame containing cards matching the type, or an empty DataFrame
        when USE_ALL_CARDS_FILE is disabled

    Deprecated:
        Use AllCardsLoader().filter_by_type() instead.
    """
    _deprecation_warning("load_cards_by_type()", "AllCardsLoader().filter_by_type()")
    if USE_ALL_CARDS_FILE:
        return _get_loader().filter_by_type(type_str)
    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
    return pd.DataFrame()
def load_cards_with_tag(tag: str) -> pd.DataFrame:
    """
    Return cards carrying a single theme tag (legacy wrapper).

    Args:
        tag: Theme tag to search for

    Returns:
        DataFrame containing cards with the tag, or an empty DataFrame when
        USE_ALL_CARDS_FILE is disabled

    Deprecated:
        Use AllCardsLoader().filter_by_themes() instead.
    """
    _deprecation_warning("load_cards_with_tag()", "AllCardsLoader().filter_by_themes()")
    if USE_ALL_CARDS_FILE:
        single_tag = [tag]
        return _get_loader().filter_by_themes(single_tag, mode="any")
    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
    return pd.DataFrame()
def load_cards_with_tags(tags: list[str], require_all: bool = False) -> pd.DataFrame:
    """
    Return cards matching a set of theme tags (legacy wrapper).

    Args:
        tags: List of theme tags to search for
        require_all: If True, a card must have every tag; if False, one
            matching tag is enough

    Returns:
        DataFrame containing cards matching the tag criteria, or an empty
        DataFrame when USE_ALL_CARDS_FILE is disabled

    Deprecated:
        Use AllCardsLoader().filter_by_themes() instead.
    """
    _deprecation_warning(
        "load_cards_with_tags()", "AllCardsLoader().filter_by_themes()"
    )
    if USE_ALL_CARDS_FILE:
        # Translate the legacy boolean flag into the loader's mode string.
        match_mode = "all" if require_all else "any"
        return _get_loader().filter_by_themes(tags, mode=match_mode)
    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
    return pd.DataFrame()
def load_cards_by_color_identity(colors: list[str]) -> pd.DataFrame:
    """
    Return cards filtered by color identity (legacy wrapper).

    Args:
        colors: List of color codes (e.g., ["W", "U"])

    Returns:
        DataFrame containing cards matching the color identity, or an empty
        DataFrame when USE_ALL_CARDS_FILE is disabled

    Deprecated:
        Use AllCardsLoader().filter_by_color_identity() instead.
    """
    _deprecation_warning(
        "load_cards_by_color_identity()", "AllCardsLoader().filter_by_color_identity()"
    )
    if USE_ALL_CARDS_FILE:
        return _get_loader().filter_by_color_identity(colors)
    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
    return pd.DataFrame()
def search_cards(query: str, limit: int = 100) -> pd.DataFrame:
    """
    Run a text search over the card data (legacy wrapper).

    Args:
        query: Search query string
        limit: Maximum number of results

    Returns:
        DataFrame containing matching cards, or an empty DataFrame when
        USE_ALL_CARDS_FILE is disabled

    Deprecated:
        Use AllCardsLoader().search() instead.
    """
    _deprecation_warning("search_cards()", "AllCardsLoader().search()")
    if USE_ALL_CARDS_FILE:
        return _get_loader().search(query, limit=limit)
    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
    return pd.DataFrame()
def clear_card_cache() -> None:
    """
    Drop the shared loader and its cached data (legacy wrapper).

    The next legacy call builds a fresh AllCardsLoader and reads from disk.
    Does nothing beyond the deprecation notice if no loader was created yet.

    Deprecated:
        Use AllCardsLoader().clear_cache() instead.
    """
    global _shared_loader
    _deprecation_warning("clear_card_cache()", "AllCardsLoader().clear_cache()")
    current = _shared_loader
    if current is None:
        return
    current.clear_cache()
    _shared_loader = None