mtg_python_deckbuilder/code/services/all_cards_loader.py

"""
All Cards Loader

Provides efficient loading and querying of the consolidated all_cards.parquet file.
Features in-memory caching with TTL and automatic reload on file changes.

Usage:
    loader = AllCardsLoader()
    
    # Single card lookup
    card = loader.get_by_name("Sol Ring")
    
    # Batch lookup
    cards = loader.get_by_names(["Sol Ring", "Lightning Bolt", "Counterspell"])
    
    # Filter by color identity
    blue_cards = loader.filter_by_color_identity(["U"])
    
    # Filter by themes
    token_cards = loader.filter_by_themes(["tokens"], mode="any")
    
    # Simple text search
    results = loader.search("create token", limit=100)
"""

from __future__ import annotations

import os
import time
from typing import Optional

import pandas as pd

from code.logging_util import get_logger
from code.settings import CARD_FILES_DIRECTORY

# Initialize logger
logger = get_logger(__name__)


class AllCardsLoader:
    """Loads and caches the consolidated all_cards.parquet file with query methods."""

    def __init__(self, file_path: Optional[str] = None, cache_ttl: int = 300) -> None:
        """
        Initialize AllCardsLoader.

        Args:
            file_path: Path to all_cards.parquet (defaults to card_files/all_cards.parquet)
            cache_ttl: Time-to-live for cache in seconds (default: 300 = 5 minutes)
        """
        self.file_path = file_path or os.path.join(CARD_FILES_DIRECTORY, "all_cards.parquet")
        self.cache_ttl = cache_ttl
        self._df: Optional[pd.DataFrame] = None
        self._last_load_time: float = 0
        self._file_mtime: float = 0

    def load(self, force_reload: bool = False) -> pd.DataFrame:
        """
        Load all_cards.parquet with caching.

        Returns cached DataFrame if:
        - Cache exists
        - Cache is not expired (within TTL)
        - File hasn't been modified since last load
        - force_reload is False

        Args:
            force_reload: Force reload from disk even if cached

        Returns:
            DataFrame containing all cards

        Raises:
            FileNotFoundError: If all_cards.parquet doesn't exist
        """
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"All cards file not found: {self.file_path}")

        # Check if we need to reload
        current_time = time.time()
        file_mtime = os.path.getmtime(self.file_path)

        cache_valid = (
            self._df is not None
            and not force_reload
            and (current_time - self._last_load_time) < self.cache_ttl
            and file_mtime == self._file_mtime
        )

        if cache_valid:
            return self._df  # type: ignore

        # Load from disk
        logger.info(f"Loading all_cards from {self.file_path}...")
        start_time = time.time()
        self._df = pd.read_parquet(self.file_path, engine="pyarrow")
        elapsed = time.time() - start_time

        self._last_load_time = current_time
        self._file_mtime = file_mtime

        logger.info(
            f"Loaded {len(self._df)} cards with {len(self._df.columns)} columns in {elapsed:.3f}s"
        )

        return self._df

    def get_by_name(self, name: str) -> Optional[pd.Series]:
        """
        Get a single card by exact name match.

        Args:
            name: Card name to search for

        Returns:
            Series containing card data, or None if not found
        """
        df = self.load()
        if "name" not in df.columns:
            logger.warning("'name' column not found in all_cards")
            return None

        # Use .loc[] for faster exact match lookup
        try:
            matches = df.loc[df["name"] == name]
            if matches.empty:
                return None
            return matches.iloc[0]
        except (KeyError, IndexError):
            return None

    def get_by_names(self, names: list[str]) -> pd.DataFrame:
        """
        Get multiple cards by exact name matches (batch lookup).

        Args:
            names: List of card names to search for

        Returns:
            DataFrame containing matching cards (may be empty)
        """
        df = self.load()
        if "name" not in df.columns:
            logger.warning("'name' column not found in all_cards")
            return pd.DataFrame()

        return df[df["name"].isin(names)]

    def filter_by_color_identity(self, colors: list[str]) -> pd.DataFrame:
        """
        Filter cards by color identity.

        Args:
            colors: List of color codes (e.g., ["W", "U"], ["Colorless"], ["G", "R", "U"])

        Returns:
            DataFrame containing cards matching the color identity
        """
        df = self.load()
        if "colorIdentity" not in df.columns:
            logger.warning("'colorIdentity' column not found in all_cards")
            return pd.DataFrame()

        # Convert colors list to a set for comparison
        color_set = set(colors)

        # Handle special case for colorless
        if "Colorless" in color_set or "colorless" in color_set:
            return df[df["colorIdentity"].isin(["Colorless", "colorless"])]

        # For multi-color searches, match any card that contains those colors
        # This is a simple exact match - could be enhanced for subset/superset matching
        if len(colors) == 1:
            # Single color - exact match
            return df[df["colorIdentity"] == colors[0]]
        else:
            # Multi-color - match any of the provided colors (could be refined)
            return df[df["colorIdentity"].isin(colors)]

    def filter_by_themes(self, themes: list[str], mode: str = "any") -> pd.DataFrame:
        """
        Filter cards by theme tags.

        Args:
            themes: List of theme tags to search for
            mode: "any" (at least one theme) or "all" (must have all themes)

        Returns:
            DataFrame containing cards matching the theme criteria
        """
        df = self.load()
        if "themeTags" not in df.columns:
            logger.warning("'themeTags' column not found in all_cards")
            return pd.DataFrame()

        if mode == "all":
            # Card must have all specified themes
            mask = pd.Series([True] * len(df), index=df.index)
            for theme in themes:
                mask &= df["themeTags"].str.contains(theme, case=False, na=False)
            return df[mask]
        else:
            # Card must have at least one of the specified themes (default)
            mask = pd.Series([False] * len(df), index=df.index)
            for theme in themes:
                mask |= df["themeTags"].str.contains(theme, case=False, na=False)
            return df[mask]

    def search(self, query: str, limit: int = 100) -> pd.DataFrame:
        """
        Simple text search across card name, type, and oracle text.

        Args:
            query: Search query string
            limit: Maximum number of results to return

        Returns:
            DataFrame containing matching cards (up to limit)
        """
        df = self.load()

        # Search across multiple columns
        mask = pd.Series([False] * len(df), index=df.index)

        if "name" in df.columns:
            mask |= df["name"].str.contains(query, case=False, na=False)

        if "type" in df.columns:
            mask |= df["type"].str.contains(query, case=False, na=False)

        if "text" in df.columns:
            mask |= df["text"].str.contains(query, case=False, na=False)

        results = df[mask]

        if len(results) > limit:
            return results.head(limit)

        return results

    def filter_by_type(self, type_query: str) -> pd.DataFrame:
        """
        Filter cards by type line (supports partial matching).

        Args:
            type_query: Type string to search for (e.g., "Creature", "Instant", "Artifact")

        Returns:
            DataFrame containing cards matching the type
        """
        df = self.load()
        if "type" not in df.columns:
            logger.warning("'type' column not found in all_cards")
            return pd.DataFrame()

        return df[df["type"].str.contains(type_query, case=False, na=False)]

    def get_stats(self) -> dict:
        """
        Get statistics about the loaded card data.

        Returns:
            Dictionary with card count, column count, file size, and load time
        """
        df = self.load()

        stats = {
            "total_cards": len(df),
            "columns": len(df.columns),
            "file_path": self.file_path,
            "file_size_mb": (
                round(os.path.getsize(self.file_path) / (1024 * 1024), 2)
                if os.path.exists(self.file_path)
                else 0
            ),
            "cached": self._df is not None,
            "cache_age_seconds": int(time.time() - self._last_load_time)
            if self._last_load_time > 0
            else None,
        }

        return stats

    def clear_cache(self) -> None:
        """Clear the cached DataFrame, forcing next load to read from disk."""
        self._df = None
        self._last_load_time = 0
        logger.info("Cache cleared")
feat: consolidate card data into optimized format for faster queries and reduced file sizes 2025-10-15 11:04:49 -07:00			`"""`
			`All Cards Loader`

			`Provides efficient loading and querying of the consolidated all_cards.parquet file.`
			`Features in-memory caching with TTL and automatic reload on file changes.`

			`Usage:`
			`loader = AllCardsLoader()`

			`# Single card lookup`
			`card = loader.get_by_name("Sol Ring")`

			`# Batch lookup`
			`cards = loader.get_by_names(["Sol Ring", "Lightning Bolt", "Counterspell"])`

			`# Filter by color identity`
			`blue_cards = loader.filter_by_color_identity(["U"])`

			`# Filter by themes`
			`token_cards = loader.filter_by_themes(["tokens"], mode="any")`

			`# Simple text search`
			`results = loader.search("create token", limit=100)`
			`"""`

			`from __future__ import annotations`

			`import os`
			`import time`
			`from typing import Optional`

			`import pandas as pd`

			`from code.logging_util import get_logger`
			`from code.settings import CARD_FILES_DIRECTORY`

			`# Initialize logger`
			`logger = get_logger(__name__)`


			`class AllCardsLoader:`
			`"""Loads and caches the consolidated all_cards.parquet file with query methods."""`

			`def __init__(self, file_path: Optional[str] = None, cache_ttl: int = 300) -> None:`
			`"""`
			`Initialize AllCardsLoader.`

			`Args:`
			`file_path: Path to all_cards.parquet (defaults to card_files/all_cards.parquet)`
			`cache_ttl: Time-to-live for cache in seconds (default: 300 = 5 minutes)`
			`"""`
			`self.file_path = file_path or os.path.join(CARD_FILES_DIRECTORY, "all_cards.parquet")`
			`self.cache_ttl = cache_ttl`
			`self._df: Optional[pd.DataFrame] = None`
			`self._last_load_time: float = 0`
			`self._file_mtime: float = 0`

			`def load(self, force_reload: bool = False) -> pd.DataFrame:`
			`"""`
			`Load all_cards.parquet with caching.`

			`Returns cached DataFrame if:`
			`- Cache exists`
			`- Cache is not expired (within TTL)`
			`- File hasn't been modified since last load`
			`- force_reload is False`

			`Args:`
			`force_reload: Force reload from disk even if cached`

			`Returns:`
			`DataFrame containing all cards`

			`Raises:`
			`FileNotFoundError: If all_cards.parquet doesn't exist`
			`"""`
			`if not os.path.exists(self.file_path):`
			`raise FileNotFoundError(f"All cards file not found: {self.file_path}")`

			`# Check if we need to reload`
			`current_time = time.time()`
			`file_mtime = os.path.getmtime(self.file_path)`

			`cache_valid = (`
			`self._df is not None`
			`and not force_reload`
			`and (current_time - self._last_load_time) < self.cache_ttl`
			`and file_mtime == self._file_mtime`
			`)`

			`if cache_valid:`
			`return self._df # type: ignore`

			`# Load from disk`
			`logger.info(f"Loading all_cards from {self.file_path}...")`
			`start_time = time.time()`
			`self._df = pd.read_parquet(self.file_path, engine="pyarrow")`
			`elapsed = time.time() - start_time`

			`self._last_load_time = current_time`
			`self._file_mtime = file_mtime`

			`logger.info(`
			`f"Loaded {len(self._df)} cards with {len(self._df.columns)} columns in {elapsed:.3f}s"`
			`)`

			`return self._df`

			`def get_by_name(self, name: str) -> Optional[pd.Series]:`
			`"""`
			`Get a single card by exact name match.`

			`Args:`
			`name: Card name to search for`

			`Returns:`
			`Series containing card data, or None if not found`
			`"""`
			`df = self.load()`
			`if "name" not in df.columns:`
			`logger.warning("'name' column not found in all_cards")`
			`return None`

			`# Use .loc[] for faster exact match lookup`
			`try:`
			`matches = df.loc[df["name"] == name]`
			`if matches.empty:`
			`return None`
			`return matches.iloc[0]`
			`except (KeyError, IndexError):`
			`return None`

			`def get_by_names(self, names: list[str]) -> pd.DataFrame:`
			`"""`
			`Get multiple cards by exact name matches (batch lookup).`

			`Args:`
			`names: List of card names to search for`

			`Returns:`
			`DataFrame containing matching cards (may be empty)`
			`"""`
			`df = self.load()`
			`if "name" not in df.columns:`
			`logger.warning("'name' column not found in all_cards")`
			`return pd.DataFrame()`

			`return df[df["name"].isin(names)]`

			`def filter_by_color_identity(self, colors: list[str]) -> pd.DataFrame:`
			`"""`
			`Filter cards by color identity.`

			`Args:`
			`colors: List of color codes (e.g., ["W", "U"], ["Colorless"], ["G", "R", "U"])`

			`Returns:`
			`DataFrame containing cards matching the color identity`
			`"""`
			`df = self.load()`
			`if "colorIdentity" not in df.columns:`
			`logger.warning("'colorIdentity' column not found in all_cards")`
			`return pd.DataFrame()`

			`# Convert colors list to a set for comparison`
			`color_set = set(colors)`

			`# Handle special case for colorless`
			`if "Colorless" in color_set or "colorless" in color_set:`
			`return df[df["colorIdentity"].isin(["Colorless", "colorless"])]`

			`# For multi-color searches, match any card that contains those colors`
			`# This is a simple exact match - could be enhanced for subset/superset matching`
			`if len(colors) == 1:`
			`# Single color - exact match`
			`return df[df["colorIdentity"] == colors[0]]`
			`else:`
			`# Multi-color - match any of the provided colors (could be refined)`
			`return df[df["colorIdentity"].isin(colors)]`

			`def filter_by_themes(self, themes: list[str], mode: str = "any") -> pd.DataFrame:`
			`"""`
			`Filter cards by theme tags.`

			`Args:`
			`themes: List of theme tags to search for`
			`mode: "any" (at least one theme) or "all" (must have all themes)`

			`Returns:`
			`DataFrame containing cards matching the theme criteria`
			`"""`
			`df = self.load()`
			`if "themeTags" not in df.columns:`
			`logger.warning("'themeTags' column not found in all_cards")`
			`return pd.DataFrame()`

			`if mode == "all":`
			`# Card must have all specified themes`
			`mask = pd.Series([True] * len(df), index=df.index)`
			`for theme in themes:`
			`mask &= df["themeTags"].str.contains(theme, case=False, na=False)`
			`return df[mask]`
			`else:`
			`# Card must have at least one of the specified themes (default)`
			`mask = pd.Series([False] * len(df), index=df.index)`
			`for theme in themes:`
			`mask \|= df["themeTags"].str.contains(theme, case=False, na=False)`
			`return df[mask]`

			`def search(self, query: str, limit: int = 100) -> pd.DataFrame:`
			`"""`
			`Simple text search across card name, type, and oracle text.`

			`Args:`
			`query: Search query string`
			`limit: Maximum number of results to return`

			`Returns:`
			`DataFrame containing matching cards (up to limit)`
			`"""`
			`df = self.load()`

			`# Search across multiple columns`
			`mask = pd.Series([False] * len(df), index=df.index)`

			`if "name" in df.columns:`
			`mask \|= df["name"].str.contains(query, case=False, na=False)`

			`if "type" in df.columns:`
			`mask \|= df["type"].str.contains(query, case=False, na=False)`

			`if "text" in df.columns:`
			`mask \|= df["text"].str.contains(query, case=False, na=False)`

			`results = df[mask]`

			`if len(results) > limit:`
			`return results.head(limit)`

			`return results`

			`def filter_by_type(self, type_query: str) -> pd.DataFrame:`
			`"""`
			`Filter cards by type line (supports partial matching).`

			`Args:`
			`type_query: Type string to search for (e.g., "Creature", "Instant", "Artifact")`

			`Returns:`
			`DataFrame containing cards matching the type`
			`"""`
			`df = self.load()`
			`if "type" not in df.columns:`
			`logger.warning("'type' column not found in all_cards")`
			`return pd.DataFrame()`

			`return df[df["type"].str.contains(type_query, case=False, na=False)]`

			`def get_stats(self) -> dict:`
			`"""`
			`Get statistics about the loaded card data.`

			`Returns:`
			`Dictionary with card count, column count, file size, and load time`
			`"""`
			`df = self.load()`

			`stats = {`
			`"total_cards": len(df),`
			`"columns": len(df.columns),`
			`"file_path": self.file_path,`
			`"file_size_mb": (`
			`round(os.path.getsize(self.file_path) / (1024 * 1024), 2)`
			`if os.path.exists(self.file_path)`
			`else 0`
			`),`
			`"cached": self._df is not None,`
			`"cache_age_seconds": int(time.time() - self._last_load_time)`
			`if self._last_load_time > 0`
			`else None,`
			`}`

			`return stats`

			`def clear_cache(self) -> None:`
			`"""Clear the cached DataFrame, forcing next load to read from disk."""`
			`self._df = None`
			`self._last_load_time = 0`
			`logger.info("Cache cleared")`