Mirror of https://github.com/mwisnowski/mtg_python_deckbuilder.git (synced 2025-12-17 08:00:13 +01:00)
feat: migrate to unified Parquet format with instant GitHub setup and 4x faster tagging
parent e9e949aae3 · commit 8435312c8f
58 changed files with 11921 additions and 3961 deletions
code/file_setup/data_loader.py (338 lines, new file)
@@ -0,0 +1,338 @@
"""Data loader abstraction for CSV and Parquet formats.
|
||||
|
||||
This module provides a unified interface for reading and writing card data
|
||||
in both CSV and Parquet formats. It handles format detection, conversion,
|
||||
and schema validation.
|
||||
|
||||
Introduced in v3.0.0 as part of the Parquet migration.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from logging_util import get_logger
|
||||
from path_util import card_files_processed_dir
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
# Required columns for deck building
|
||||
REQUIRED_COLUMNS = [
|
||||
"name",
|
||||
"colorIdentity",
|
||||
"type", # MTGJSON uses 'type' not 'types'
|
||||
"keywords",
|
||||
"manaValue",
|
||||
"text",
|
||||
"power",
|
||||
"toughness",
|
||||
]
|
||||
|
||||
|
||||
def validate_schema(df: pd.DataFrame, required: Optional[List[str]] = None) -> None:
    """Validate that DataFrame contains required columns.

    Args:
        df: DataFrame to validate
        required: List of required columns (uses REQUIRED_COLUMNS if None)

    Raises:
        ValueError: If required columns are missing
    """
    required = required or REQUIRED_COLUMNS
    missing = [col for col in required if col not in df.columns]

    if missing:
        raise ValueError(
            f"Schema validation failed: missing required columns {missing}. "
            f"Available columns: {list(df.columns)}"
        )

    logger.debug(f"✓ Schema validation passed ({len(required)} required columns present)")

class DataLoader:
    """Unified data loading interface supporting CSV and Parquet formats.

    This class provides transparent access to card data regardless of the
    underlying storage format. It automatically detects the format based on
    file extensions and provides conversion utilities.

    Examples:
        >>> loader = DataLoader()
        >>> df = loader.read_cards("card_files/processed/all_cards.parquet")
        >>> loader.write_cards(df, "output.parquet")
        >>> loader.convert("input.csv", "output.parquet")
    """

    def __init__(self, format: str = "auto"):
        """Initialize the data loader.

        Args:
            format: Format preference - "csv", "parquet", or "auto" (default: auto)
                "auto" detects format from file extension
        """
        self.format = format.lower()
        if self.format not in ("csv", "parquet", "auto"):
            raise ValueError(f"Unsupported format: {format}. Use 'csv', 'parquet', or 'auto'.")

    def read_cards(
        self,
        path: str,
        columns: Optional[List[str]] = None,
        format: Optional[str] = None
    ) -> pd.DataFrame:
        """Load card data from a file.

        Args:
            path: File path (e.g., "card_files/processed/all_cards.parquet")
            columns: Optional list of columns to load (Parquet optimization)
            format: Override format detection (uses self.format if None)

        Returns:
            DataFrame with card data

        Raises:
            FileNotFoundError: If the file doesn't exist
            ValueError: If format is unsupported
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Card data file not found: {path}")

        detected_format = format or self._detect_format(path)

        logger.debug(f"Loading card data from {path} (format: {detected_format})")

        if detected_format == "csv":
            return self._read_csv(path, columns)
        elif detected_format == "parquet":
            return self._read_parquet(path, columns)
        else:
            raise ValueError(f"Unsupported format: {detected_format}")

    def write_cards(
        self,
        df: pd.DataFrame,
        path: str,
        format: Optional[str] = None,
        index: bool = False
    ) -> None:
        """Save card data to a file.

        Args:
            df: DataFrame to save
            path: Output file path
            format: Force format (overrides auto-detection)
            index: Whether to write DataFrame index (default: False)

        Raises:
            ValueError: If format is unsupported
        """
        detected_format = format or self._detect_format(path)

        # Ensure output directory exists
        os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)

        logger.debug(f"Writing card data to {path} (format: {detected_format}, rows: {len(df)})")

        if detected_format == "csv":
            self._write_csv(df, path, index)
        elif detected_format == "parquet":
            self._write_parquet(df, path, index)
        else:
            raise ValueError(f"Unsupported format: {detected_format}")

    def convert(
        self,
        src_path: str,
        dst_path: str,
        columns: Optional[List[str]] = None
    ) -> None:
        """Convert between CSV and Parquet formats.

        Args:
            src_path: Source file path
            dst_path: Destination file path
            columns: Optional list of columns to include (all if None)

        Examples:
            >>> loader.convert("cards.csv", "cards.parquet")
            >>> loader.convert("cards.parquet", "cards.csv", columns=["name", "type"])
        """
        logger.info(f"Converting {src_path} → {dst_path}")
        df = self.read_cards(src_path, columns=columns)
        self.write_cards(df, dst_path)
        logger.info(f"✓ Converted {len(df)} cards")

    def _read_csv(self, path: str, columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Read CSV file."""
        try:
            return pd.read_csv(path, usecols=columns, low_memory=False)
        except Exception as e:
            logger.error(f"Failed to read CSV from {path}: {e}")
            raise

    def _read_parquet(self, path: str, columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Read Parquet file."""
        try:
            return pd.read_parquet(path, columns=columns)
        except Exception as e:
            logger.error(f"Failed to read Parquet from {path}: {e}")
            raise

    def _write_csv(self, df: pd.DataFrame, path: str, index: bool) -> None:
        """Write CSV file."""
        try:
            df.to_csv(path, index=index)
        except Exception as e:
            logger.error(f"Failed to write CSV to {path}: {e}")
            raise

    def _write_parquet(self, df: pd.DataFrame, path: str, index: bool) -> None:
        """Write Parquet file with Snappy compression."""
        try:
            df.to_parquet(path, index=index, compression="snappy", engine="pyarrow")
        except Exception as e:
            logger.error(f"Failed to write Parquet to {path}: {e}")
            raise

    def _detect_format(self, path: str) -> str:
        """Detect file format from extension.

        Args:
            path: File path to analyze

        Returns:
            Format string: "csv" or "parquet"

        Raises:
            ValueError: If format cannot be determined
        """
        if self.format != "auto":
            return self.format

        # Check file extension
        if path.endswith(".csv"):
            return "csv"
        elif path.endswith(".parquet"):
            return "parquet"

        # Try to infer from existing files (no extension provided)
        if os.path.exists(f"{path}.parquet"):
            return "parquet"
        elif os.path.exists(f"{path}.csv"):
            return "csv"

        raise ValueError(
            f"Cannot determine format for '{path}'. "
            "Use .csv or .parquet extension, or specify format explicitly."
        )

    def write_batch_parquet(
        self,
        df: pd.DataFrame,
        batch_id: int,
        tag: str = "",
        batches_dir: Optional[str] = None
    ) -> str:
        """Write a batch Parquet file (used during tagging).

        Args:
            df: DataFrame to save as a batch
            batch_id: Unique batch identifier (e.g., 0, 1, 2...)
            tag: Optional tag to include in filename (e.g., "white", "commander")
            batches_dir: Directory for batch files (defaults to card_files/processed/batches)

        Returns:
            Path to the written batch file

        Example:
            >>> loader.write_batch_parquet(white_df, batch_id=0, tag="white")
            'card_files/processed/batches/batch_0_white.parquet'
        """
        if batches_dir is None:
            batches_dir = os.path.join(card_files_processed_dir(), "batches")

        os.makedirs(batches_dir, exist_ok=True)

        # Build filename: batch_{id}_{tag}.parquet or batch_{id}.parquet
        filename = f"batch_{batch_id}_{tag}.parquet" if tag else f"batch_{batch_id}.parquet"
        path = os.path.join(batches_dir, filename)

        logger.debug(f"Writing batch {batch_id} ({tag or 'no tag'}): {len(df)} cards → {path}")
        self.write_cards(df, path, format="parquet")

        return path

    def merge_batches(
        self,
        output_path: Optional[str] = None,
        batches_dir: Optional[str] = None,
        cleanup: bool = True
    ) -> pd.DataFrame:
        """Merge all batch Parquet files into a single output file.

        Args:
            output_path: Path for merged output (defaults to card_files/processed/all_cards.parquet)
            batches_dir: Directory containing batch files (defaults to card_files/processed/batches)
            cleanup: Whether to delete batch files after merging (default: True)

        Returns:
            Merged DataFrame

        Raises:
            FileNotFoundError: If no batch files found

        Example:
            >>> loader.merge_batches()  # Merges all batches → all_cards.parquet
        """
        if batches_dir is None:
            batches_dir = os.path.join(card_files_processed_dir(), "batches")

        if output_path is None:
            from code.path_util import get_processed_cards_path
            output_path = get_processed_cards_path()

        # Find all batch files
        batch_files = sorted(Path(batches_dir).glob("batch_*.parquet"))

        if not batch_files:
            raise FileNotFoundError(f"No batch files found in {batches_dir}")

        logger.info(f"Merging {len(batch_files)} batch files from {batches_dir}")

        # Read and concatenate all batches
        dfs = []
        for batch_file in batch_files:
            logger.debug(f"Reading batch: {batch_file.name}")
            df = self.read_cards(str(batch_file), format="parquet")
            dfs.append(df)

        # Merge all batches
        merged_df = pd.concat(dfs, ignore_index=True)
        logger.info(f"Merged {len(merged_df)} total cards from {len(dfs)} batches")

        # Write merged output
        self.write_cards(merged_df, output_path, format="parquet")
        logger.info(f"✓ Wrote merged data to {output_path}")

        # Cleanup batch files if requested
        if cleanup:
            logger.debug(f"Cleaning up {len(batch_files)} batch files")
            for batch_file in batch_files:
                batch_file.unlink()

            # Remove batches directory if empty
            try:
                Path(batches_dir).rmdir()
                logger.debug(f"Removed empty batches directory: {batches_dir}")
            except OSError:
                pass  # Directory not empty, keep it

        return merged_df
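
For orientation, a minimal usage sketch of the API added above. The import path, the raw CSV path, and the two-way batch split are illustrative assumptions, not taken from the rest of this commit; only the DataLoader/validate_schema calls and the card_files/processed paths come from the module itself.

# Illustrative sketch; adjust the import to the project's actual package layout.
from data_loader import DataLoader, validate_schema

loader = DataLoader()  # "auto": format is inferred from file extensions

# One-off conversion of a legacy CSV export to Parquet (source path assumed).
loader.convert("card_files/raw/cards.csv", "card_files/processed/all_cards.parquet")

# Column-pruned read: with Parquet, only the requested columns are loaded.
df = loader.read_cards(
    "card_files/processed/all_cards.parquet",
    columns=["name", "colorIdentity", "manaValue"],
)
validate_schema(df, required=["name", "colorIdentity", "manaValue"])

# Batch workflow used during tagging: write per-batch files, then merge them.
for batch_id, chunk in enumerate((df.iloc[:1000], df.iloc[1000:])):  # arbitrary split
    loader.write_batch_parquet(chunk, batch_id=batch_id, tag="example")
merged = loader.merge_batches()  # writes all_cards.parquet, deletes the batch files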