feat: consolidate card data into optimized format for faster queries and reduced file sizes

2026-03-17 10:46:30 +01:00 · 2025-10-15 11:04:49 -07:00 · 2025-10-15 11:04:49 -07:00 · f70ffca23e
commit f70ffca23e
parent 5753bb19f8
24 changed files with 2903 additions and 135 deletions
--- a/code/services/init.py
+++ b/code/services/init.py
@ -0,0 +1,6 @@
+"""Services package for MTG Python Deckbuilder."""
+
+from code.services.all_cards_loader import AllCardsLoader
+from code.services.card_query_builder import CardQueryBuilder
+
+# Names exported by a star-import of this package.
+__all__ = ["AllCardsLoader", "CardQueryBuilder"]
--- a/code/services/all_cards_loader.py
+++ b/code/services/all_cards_loader.py
@ -0,0 +1,289 @@
+"""
+All Cards Loader
+
+Provides efficient loading and querying of the consolidated all_cards.parquet file.
+Features in-memory caching with TTL and automatic reload on file changes.
+
+Usage:
+    loader = AllCardsLoader()
+    
+    # Single card lookup
+    card = loader.get_by_name("Sol Ring")
+    
+    # Batch lookup
+    cards = loader.get_by_names(["Sol Ring", "Lightning Bolt", "Counterspell"])
+    
+    # Filter by color identity
+    blue_cards = loader.filter_by_color_identity(["U"])
+    
+    # Filter by themes
+    token_cards = loader.filter_by_themes(["tokens"], mode="any")
+    
+    # Simple text search
+    results = loader.search("create token", limit=100)
+"""
+
+from __future__ import annotations
+
+import os
+import time
+from typing import Optional
+
+import pandas as pd
+
+from code.logging_util import get_logger
+from code.settings import CARD_FILES_DIRECTORY
+
+# Module-level logger, named after this module via __name__.
+logger = get_logger(__name__)
+
+
+class AllCardsLoader:
+    """Load, cache, and query the consolidated all_cards.parquet file.
+
+    The DataFrame is read once and kept in memory. ``load()`` re-reads it from
+    disk when the cache is older than ``cache_ttl`` seconds, when the file's
+    modification time differs from the one recorded at the last load, or when
+    a reload is forced.
+
+    The text filters (``filter_by_themes``, ``filter_by_type``, ``search``)
+    perform case-insensitive *literal* substring matching, so values that
+    contain regex metacharacters (e.g. the theme "+1/+1 Counters") are safe.
+    """
+
+    def __init__(self, file_path: Optional[str] = None, cache_ttl: int = 300) -> None:
+        """
+        Initialize AllCardsLoader.
+
+        Args:
+            file_path: Path to all_cards.parquet (defaults to
+                CARD_FILES_DIRECTORY/all_cards.parquet)
+            cache_ttl: Time-to-live for cache in seconds (default: 300 = 5 minutes)
+        """
+        self.file_path = file_path or os.path.join(CARD_FILES_DIRECTORY, "all_cards.parquet")
+        self.cache_ttl = cache_ttl
+        # Cached DataFrame; None until the first successful load().
+        self._df: Optional[pd.DataFrame] = None
+        # time.time() value captured at the start of the last load (0 = never loaded).
+        self._last_load_time: float = 0
+        # File modification time observed at the last load (0 = never loaded).
+        self._file_mtime: float = 0
+
+    @staticmethod
+    def _contains(column: pd.Series, needle: str) -> pd.Series:
+        """
+        Build a boolean mask of rows whose text contains ``needle``.
+
+        Matching is case-insensitive and literal (``regex=False``), so user
+        input is never compiled as a regular expression. Missing values
+        never match.
+
+        Args:
+            column: Text column to search
+            needle: Substring to look for
+
+        Returns:
+            Boolean Series aligned with ``column``
+        """
+        return column.str.contains(needle, case=False, na=False, regex=False)
+
+    def load(self, force_reload: bool = False) -> pd.DataFrame:
+        """
+        Load all_cards.parquet with caching.
+
+        Returns the cached DataFrame if:
+        - Cache exists
+        - Cache is not expired (within TTL)
+        - File hasn't been modified since last load
+        - force_reload is False
+
+        The returned object is the cached DataFrame itself, not a copy, so
+        callers should not modify it in place.
+
+        Args:
+            force_reload: Force reload from disk even if cached
+
+        Returns:
+            DataFrame containing all cards
+
+        Raises:
+            FileNotFoundError: If all_cards.parquet doesn't exist
+        """
+        if not os.path.exists(self.file_path):
+            raise FileNotFoundError(f"All cards file not found: {self.file_path}")
+
+        now = time.time()
+        file_mtime = os.path.getmtime(self.file_path)
+
+        cached = self._df
+        if (
+            cached is not None
+            and not force_reload
+            and (now - self._last_load_time) < self.cache_ttl
+            and file_mtime == self._file_mtime
+        ):
+            return cached
+
+        # Cache miss: read from disk and record when/what we loaded.
+        logger.info(f"Loading all_cards from {self.file_path}...")
+        started = time.time()
+        frame = pd.read_parquet(self.file_path, engine="pyarrow")
+        elapsed = time.time() - started
+
+        self._df = frame
+        self._last_load_time = now
+        self._file_mtime = file_mtime
+
+        logger.info(
+            f"Loaded {len(frame)} cards with {len(frame.columns)} columns in {elapsed:.3f}s"
+        )
+
+        return frame
+
+    def get_by_name(self, name: str) -> Optional[pd.Series]:
+        """
+        Get a single card by exact name match.
+
+        If several rows share the name, the first one is returned.
+
+        Args:
+            name: Card name to search for
+
+        Returns:
+            Series containing card data, or None if not found
+        """
+        df = self.load()
+        if "name" not in df.columns:
+            logger.warning("'name' column not found in all_cards")
+            return None
+
+        matches = df[df["name"] == name]
+        if matches.empty:
+            return None
+        return matches.iloc[0]
+
+    def get_by_names(self, names: list[str]) -> pd.DataFrame:
+        """
+        Get multiple cards by exact name matches (batch lookup).
+
+        Args:
+            names: List of card names to search for
+
+        Returns:
+            DataFrame containing matching cards (may be empty)
+        """
+        df = self.load()
+        if "name" not in df.columns:
+            logger.warning("'name' column not found in all_cards")
+            return pd.DataFrame()
+
+        return df[df["name"].isin(names)]
+
+    def filter_by_color_identity(self, colors: list[str]) -> pd.DataFrame:
+        """
+        Filter cards by color identity.
+
+        Each requested code is compared with the whole ``colorIdentity``
+        value of a row. A request for several codes therefore returns rows
+        whose value equals any ONE of them; it does not match a combined
+        identity and does no subset/superset matching. If "Colorless" or
+        "colorless" is among the requested codes, only colorless rows are
+        returned and the other codes are ignored.
+
+        Args:
+            colors: List of color codes (e.g., ["W", "U"], ["Colorless"], ["G", "R", "U"])
+
+        Returns:
+            DataFrame containing cards matching the color identity
+        """
+        df = self.load()
+        if "colorIdentity" not in df.columns:
+            logger.warning("'colorIdentity' column not found in all_cards")
+            return pd.DataFrame()
+
+        identity = df["colorIdentity"]
+        requested = set(colors)
+
+        # Colorless takes precedence over any other requested code.
+        if "Colorless" in requested or "colorless" in requested:
+            return df[identity.isin(["Colorless", "colorless"])]
+
+        if len(colors) == 1:
+            return df[identity == colors[0]]
+        return df[identity.isin(colors)]
+
+    def filter_by_themes(self, themes: list[str], mode: str = "any") -> pd.DataFrame:
+        """
+        Filter cards by theme tags.
+
+        Each theme is matched as a case-insensitive literal substring of the
+        ``themeTags`` value.
+
+        Args:
+            themes: List of theme tags to search for
+            mode: "all" (must have all themes); any other value behaves as
+                "any" (at least one theme)
+
+        Returns:
+            DataFrame containing cards matching the theme criteria. With an
+            empty ``themes`` list, "all" returns every card and "any"
+            returns none.
+        """
+        df = self.load()
+        if "themeTags" not in df.columns:
+            logger.warning("'themeTags' column not found in all_cards")
+            return pd.DataFrame()
+
+        tags = df["themeTags"]
+        require_all = mode == "all"
+
+        # Start from the identity element of the combining operator.
+        mask = pd.Series(require_all, index=df.index, dtype=bool)
+        for theme in themes:
+            hit = self._contains(tags, theme)
+            if require_all:
+                mask &= hit
+            else:
+                mask |= hit
+        return df[mask]
+
+    def search(self, query: str, limit: int = 100) -> pd.DataFrame:
+        """
+        Simple text search across card name, type, and oracle text.
+
+        The query is matched as a case-insensitive literal substring of the
+        "name", "type", and "text" columns (whichever of them exist).
+
+        Args:
+            query: Search query string
+            limit: Maximum number of results to return
+
+        Returns:
+            DataFrame containing matching cards (up to limit)
+        """
+        df = self.load()
+
+        mask = pd.Series(False, index=df.index, dtype=bool)
+        for column in ("name", "type", "text"):
+            if column in df.columns:
+                mask |= self._contains(df[column], query)
+
+        results = df[mask]
+        if len(results) > limit:
+            return results.head(limit)
+        return results
+
+    def filter_by_type(self, type_query: str) -> pd.DataFrame:
+        """
+        Filter cards by type line (case-insensitive literal partial match).
+
+        Args:
+            type_query: Type string to search for (e.g., "Creature", "Instant", "Artifact")
+
+        Returns:
+            DataFrame containing cards matching the type
+        """
+        df = self.load()
+        if "type" not in df.columns:
+            logger.warning("'type' column not found in all_cards")
+            return pd.DataFrame()
+
+        return df[self._contains(df["type"], type_query)]
+
+    def get_stats(self) -> dict:
+        """
+        Get statistics about the loaded card data.
+
+        Calls load() first, so the data is read from disk if it is not
+        cached yet.
+
+        Returns:
+            Dictionary with total card count, column count, file path,
+            file size in MB, whether data is cached, and cache age in seconds
+        """
+        df = self.load()
+
+        if os.path.exists(self.file_path):
+            size_mb = round(os.path.getsize(self.file_path) / (1024 * 1024), 2)
+        else:
+            size_mb = 0
+
+        if self._last_load_time > 0:
+            cache_age = int(time.time() - self._last_load_time)
+        else:
+            cache_age = None
+
+        return {
+            "total_cards": len(df),
+            "columns": len(df.columns),
+            "file_path": self.file_path,
+            "file_size_mb": size_mb,
+            "cached": self._df is not None,
+            "cache_age_seconds": cache_age,
+        }
+
+    def clear_cache(self) -> None:
+        """Clear the cached DataFrame, forcing next load to read from disk."""
+        self._df = None
+        self._last_load_time = 0
+        self._file_mtime = 0
+        logger.info("Cache cleared")
--- a/code/services/card_query_builder.py
+++ b/code/services/card_query_builder.py
@ -0,0 +1,207 @@
+"""
+Card Query Builder
+
+Provides a fluent API for building complex card queries against the consolidated all_cards.parquet.
+
+Usage:
+    from code.services.card_query_builder import CardQueryBuilder
+    
+    # Simple query
+    builder = CardQueryBuilder()
+    cards = builder.colors(["W", "U"]).execute()
+    
+    # Complex query
+    cards = (CardQueryBuilder()
+        .colors(["G"])
+        .themes(["tokens"], mode="any")
+        .types("Creature")
+        .limit(20)
+        .execute())
+    
+    # Get specific cards
+    cards = CardQueryBuilder().names(["Sol Ring", "Lightning Bolt"]).execute()
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+import pandas as pd
+
+from code.services.all_cards_loader import AllCardsLoader
+
+
+class CardQueryBuilder:
+    """Fluent API for building card queries.
+
+    Each filter method stores its argument and returns the builder so calls
+    can be chained; nothing is evaluated until ``execute()`` (or ``count()``
+    / ``first()``) runs. Calling a filter method again replaces the earlier
+    value. All configured filters are combined with AND.
+
+    A filter whose stored value is empty (``[]`` or ``""``) is treated as
+    "not set" and skipped.
+    """
+
+    # Passed to the loader's search() so that its own result cap does not
+    # truncate matches before the builder's limit is applied.
+    _UNBOUNDED_SEARCH_LIMIT = 999999
+
+    def __init__(self, loader: Optional[AllCardsLoader] = None) -> None:
+        """
+        Initialize CardQueryBuilder.
+
+        Args:
+            loader: AllCardsLoader instance (creates default if None)
+        """
+        self._loader = loader if loader is not None else AllCardsLoader()
+        self._color_filter: Optional[list[str]] = None
+        self._theme_filter: Optional[list[str]] = None
+        self._theme_mode: str = "any"
+        self._type_filter: Optional[str] = None
+        self._name_filter: Optional[list[str]] = None
+        self._search_query: Optional[str] = None
+        self._limit: Optional[int] = None
+
+    def colors(self, colors: list[str]) -> CardQueryBuilder:
+        """
+        Filter by color identity.
+
+        Args:
+            colors: List of color codes (e.g., ["W", "U"])
+
+        Returns:
+            Self for chaining
+        """
+        self._color_filter = colors
+        return self
+
+    def themes(self, themes: list[str], mode: str = "any") -> CardQueryBuilder:
+        """
+        Filter by theme tags.
+
+        Args:
+            themes: List of theme tags
+            mode: "any" (at least one) or "all" (must have all)
+
+        Returns:
+            Self for chaining
+        """
+        self._theme_filter = themes
+        self._theme_mode = mode
+        return self
+
+    def types(self, type_query: str) -> CardQueryBuilder:
+        """
+        Filter by type line (partial match).
+
+        Args:
+            type_query: Type string to search for
+
+        Returns:
+            Self for chaining
+        """
+        self._type_filter = type_query
+        return self
+
+    def names(self, names: list[str]) -> CardQueryBuilder:
+        """
+        Filter by specific card names (batch lookup).
+
+        Args:
+            names: List of card names
+
+        Returns:
+            Self for chaining
+        """
+        self._name_filter = names
+        return self
+
+    def search(self, query: str) -> CardQueryBuilder:
+        """
+        Add text search across name, type, and oracle text.
+
+        Args:
+            query: Search query string
+
+        Returns:
+            Self for chaining
+        """
+        self._search_query = query
+        return self
+
+    def limit(self, limit: int) -> CardQueryBuilder:
+        """
+        Limit number of results.
+
+        Args:
+            limit: Maximum number of results; 0 yields an empty result
+
+        Returns:
+            Self for chaining
+        """
+        self._limit = limit
+        return self
+
+    def execute(self) -> pd.DataFrame:
+        """
+        Execute the query and return results.
+
+        Each loader filter runs against the full card set; its result is
+        then intersected with the running result by row index.
+
+        Returns:
+            DataFrame containing matching cards
+        """
+        loader = self._loader
+
+        # Start with all cards or specific names
+        if self._name_filter:
+            df = loader.get_by_names(self._name_filter)
+        else:
+            df = loader.load()
+
+        if self._color_filter:
+            by_color = loader.filter_by_color_identity(self._color_filter)
+            df = df[df.index.isin(by_color.index)]
+
+        if self._theme_filter:
+            by_theme = loader.filter_by_themes(self._theme_filter, mode=self._theme_mode)
+            df = df[df.index.isin(by_theme.index)]
+
+        if self._type_filter:
+            by_type = loader.filter_by_type(self._type_filter)
+            df = df[df.index.isin(by_type.index)]
+
+        if self._search_query:
+            by_text = loader.search(self._search_query, limit=self._UNBOUNDED_SEARCH_LIMIT)
+            df = df[df.index.isin(by_text.index)]
+
+        # Compare against None rather than truthiness so that limit(0)
+        # returns no rows instead of being ignored.
+        if self._limit is not None:
+            df = df.head(self._limit)
+
+        return df
+
+    def count(self) -> int:
+        """
+        Count matching cards.
+
+        Runs execute() and returns the length of its result, so a
+        configured limit caps the count.
+
+        Returns:
+            Number of matching cards
+        """
+        return len(self.execute())
+
+    def first(self) -> Optional[pd.Series]:
+        """
+        Get first result only.
+
+        Returns:
+            First matching card as Series, or None if no results
+        """
+        results = self.execute()
+        if results.empty:
+            return None
+        return results.iloc[0]
+
+    def reset(self) -> CardQueryBuilder:
+        """
+        Reset all filters and the limit; the loader is kept.
+
+        Returns:
+            Self for chaining
+        """
+        self._color_filter = None
+        self._theme_filter = None
+        self._theme_mode = "any"
+        self._type_filter = None
+        self._name_filter = None
+        self._search_query = None
+        self._limit = None
+        return self
--- a/code/services/legacy_loader_adapter.py
+++ b/code/services/legacy_loader_adapter.py
@ -0,0 +1,281 @@
+"""
+Legacy Loader Adapter
+
+Provides backward-compatible wrapper functions around AllCardsLoader for smooth migration.
+Existing code can continue using old file-loading patterns while benefiting from
+the new consolidated Parquet backend.
+
+This adapter is deprecated: it will be maintained through v3.0.x and removed in v3.1+.
+
+Usage:
+    # Old code (still works):
+    from code.services.legacy_loader_adapter import load_cards_by_type
+    creatures = load_cards_by_type("Creature")
+    
+    # New code (preferred):
+    from code.services.all_cards_loader import AllCardsLoader
+    loader = AllCardsLoader()
+    creatures = loader.filter_by_type("Creature")
+"""
+
+from __future__ import annotations
+
+import warnings
+from typing import Optional
+
+import pandas as pd
+
+from code.logging_util import get_logger
+from code.services.all_cards_loader import AllCardsLoader
+from code.settings import USE_ALL_CARDS_FILE
+
+# Module-level logger, named after this module via __name__.
+logger = get_logger(__name__)
+
+# Single AllCardsLoader reused by the legacy wrappers in this module so they
+# share one in-memory cache. Created lazily by _get_loader() and reset to
+# None by clear_card_cache().
+_shared_loader: Optional[AllCardsLoader] = None
+
+
+def _get_loader() -> AllCardsLoader:
+    """Return the module-wide AllCardsLoader, creating it on first use."""
+    global _shared_loader
+    loader = _shared_loader
+    if loader is None:
+        # First call (or first call after the shared instance was dropped).
+        loader = _shared_loader = AllCardsLoader()
+    return loader
+
+
+def _deprecation_warning(func_name: str, replacement: str) -> None:
+    """
+    Emit a DeprecationWarning and a log warning for a legacy function.
+
+    Args:
+        func_name: Display name of the deprecated function (e.g. "load_all_cards()")
+        replacement: Display name of the API callers should migrate to
+    """
+    message = (
+        f"{func_name} is deprecated and will be removed in v3.1+. "
+        f"Use {replacement} instead."
+    )
+    # stacklevel=3: 1 is this helper, 2 is the legacy function that called
+    # it, 3 is that function's caller - the code that needs to migrate.
+    warnings.warn(message, DeprecationWarning, stacklevel=3)
+    logger.warning(
+        f"DEPRECATION: {func_name} called. Migrate to {replacement} before v3.1+"
+    )
+
+
+def load_all_cards(use_cache: bool = True) -> pd.DataFrame:
+    """
+    Return every card from the consolidated Parquet file.
+
+    Backward-compatibility wrapper; emits a deprecation warning on each call.
+
+    Args:
+        use_cache: When False, the shared loader is forced to re-read the
+            file from disk (default: True)
+
+    Returns:
+        DataFrame containing all cards, or an empty DataFrame when
+        USE_ALL_CARDS_FILE is disabled
+
+    Deprecated:
+        Use AllCardsLoader().load() instead.
+    """
+    _deprecation_warning("load_all_cards()", "AllCardsLoader().load()")
+
+    if USE_ALL_CARDS_FILE:
+        force = not use_cache
+        return _get_loader().load(force_reload=force)
+
+    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
+    return pd.DataFrame()
+
+
+def load_cards_by_name(name: str) -> Optional[pd.Series]:
+    """
+    Look up one card by its exact name.
+
+    Backward-compatibility wrapper; emits a deprecation warning on each call.
+
+    Args:
+        name: Card name to search for
+
+    Returns:
+        Series containing card data, or None if the card is not found or
+        USE_ALL_CARDS_FILE is disabled
+
+    Deprecated:
+        Use AllCardsLoader().get_by_name() instead.
+    """
+    _deprecation_warning("load_cards_by_name()", "AllCardsLoader().get_by_name()")
+
+    if USE_ALL_CARDS_FILE:
+        return _get_loader().get_by_name(name)
+
+    logger.warning("USE_ALL_CARDS_FILE is disabled, returning None")
+    return None
+
+
+def load_cards_by_names(names: list[str]) -> pd.DataFrame:
+    """
+    Look up several cards by their exact names.
+
+    Backward-compatibility wrapper; emits a deprecation warning on each call.
+
+    Args:
+        names: List of card names to search for
+
+    Returns:
+        DataFrame containing matching cards, or an empty DataFrame when
+        USE_ALL_CARDS_FILE is disabled
+
+    Deprecated:
+        Use AllCardsLoader().get_by_names() instead.
+    """
+    _deprecation_warning("load_cards_by_names()", "AllCardsLoader().get_by_names()")
+
+    if USE_ALL_CARDS_FILE:
+        return _get_loader().get_by_names(names)
+
+    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
+    return pd.DataFrame()
+
+
+def load_cards_by_type(type_str: str) -> pd.DataFrame:
+    """
+    Return cards whose type line partially matches the given string.
+
+    Backward-compatibility wrapper; emits a deprecation warning on each call.
+
+    Args:
+        type_str: Type string to search for (e.g., "Creature", "Instant")
+
+    Returns:
+        DataFrame containing cards matching the type, or an empty DataFrame
+        when USE_ALL_CARDS_FILE is disabled
+
+    Deprecated:
+        Use AllCardsLoader().filter_by_type() instead.
+    """
+    _deprecation_warning("load_cards_by_type()", "AllCardsLoader().filter_by_type()")
+
+    if USE_ALL_CARDS_FILE:
+        return _get_loader().filter_by_type(type_str)
+
+    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
+    return pd.DataFrame()
+
+
+def load_cards_with_tag(tag: str) -> pd.DataFrame:
+    """
+    Return cards that carry a single theme tag.
+
+    Backward-compatibility wrapper; emits a deprecation warning on each call.
+
+    Args:
+        tag: Theme tag to search for
+
+    Returns:
+        DataFrame containing cards with the tag, or an empty DataFrame when
+        USE_ALL_CARDS_FILE is disabled
+
+    Deprecated:
+        Use AllCardsLoader().filter_by_themes() instead.
+    """
+    _deprecation_warning("load_cards_with_tag()", "AllCardsLoader().filter_by_themes()")
+
+    if USE_ALL_CARDS_FILE:
+        single_tag = [tag]
+        return _get_loader().filter_by_themes(single_tag, mode="any")
+
+    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
+    return pd.DataFrame()
+
+
+def load_cards_with_tags(tags: list[str], require_all: bool = False) -> pd.DataFrame:
+    """
+    Return cards that carry the given theme tags.
+
+    Backward-compatibility wrapper; emits a deprecation warning on each call.
+
+    Args:
+        tags: List of theme tags to search for
+        require_all: If True, card must have all tags; if False, at least one tag
+
+    Returns:
+        DataFrame containing cards matching the tag criteria, or an empty
+        DataFrame when USE_ALL_CARDS_FILE is disabled
+
+    Deprecated:
+        Use AllCardsLoader().filter_by_themes() instead.
+    """
+    _deprecation_warning(
+        "load_cards_with_tags()", "AllCardsLoader().filter_by_themes()"
+    )
+
+    if USE_ALL_CARDS_FILE:
+        # Translate the legacy boolean flag into the loader's mode string.
+        if require_all:
+            mode = "all"
+        else:
+            mode = "any"
+        return _get_loader().filter_by_themes(tags, mode=mode)
+
+    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
+    return pd.DataFrame()
+
+
+def load_cards_by_color_identity(colors: list[str]) -> pd.DataFrame:
+    """
+    Return cards filtered by color identity.
+
+    Backward-compatibility wrapper; emits a deprecation warning on each call.
+
+    Args:
+        colors: List of color codes (e.g., ["W", "U"])
+
+    Returns:
+        DataFrame containing cards matching the color identity, or an empty
+        DataFrame when USE_ALL_CARDS_FILE is disabled
+
+    Deprecated:
+        Use AllCardsLoader().filter_by_color_identity() instead.
+    """
+    _deprecation_warning(
+        "load_cards_by_color_identity()", "AllCardsLoader().filter_by_color_identity()"
+    )
+
+    if USE_ALL_CARDS_FILE:
+        return _get_loader().filter_by_color_identity(colors)
+
+    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
+    return pd.DataFrame()
+
+
+def search_cards(query: str, limit: int = 100) -> pd.DataFrame:
+    """
+    Run a text search over the card data.
+
+    Backward-compatibility wrapper; emits a deprecation warning on each call.
+
+    Args:
+        query: Search query string
+        limit: Maximum number of results
+
+    Returns:
+        DataFrame containing matching cards, or an empty DataFrame when
+        USE_ALL_CARDS_FILE is disabled
+
+    Deprecated:
+        Use AllCardsLoader().search() instead.
+    """
+    _deprecation_warning("search_cards()", "AllCardsLoader().search()")
+
+    if USE_ALL_CARDS_FILE:
+        return _get_loader().search(query, limit=limit)
+
+    logger.warning("USE_ALL_CARDS_FILE is disabled, returning empty DataFrame")
+    return pd.DataFrame()
+
+
+def clear_card_cache() -> None:
+    """
+    Drop the cached card data so the next load reads from disk.
+
+    Clears the shared loader's cache and then discards the shared loader
+    itself; does nothing beyond the deprecation warning if no shared loader
+    has been created yet.
+
+    Backward-compatibility wrapper; emits a deprecation warning on each call.
+
+    Deprecated:
+        Use AllCardsLoader().clear_cache() instead.
+    """
+    global _shared_loader
+
+    _deprecation_warning("clear_card_cache()", "AllCardsLoader().clear_cache()")
+
+    loader = _shared_loader
+    if loader is None:
+        return
+
+    loader.clear_cache()
+    _shared_loader = None