""" Card Data Aggregator Consolidates individual card CSV files into a single Parquet file for improved performance in card browsing, theme cataloging, and searches. Key Features: - Merges all card CSVs into all_cards.parquet (50-70% size reduction, 2-5x faster) - Excludes master files (cards.csv, commander_cards.csv) from aggregation - Deduplication logic (keeps most recent when card appears in multiple files) - Incremental updates (only re-process changed files) - Version rotation (maintains 2-3 historical versions for rollback) - Validation (ensures no data loss) Usage: aggregator = CardAggregator() stats = aggregator.aggregate_all('csv_files', 'card_files/all_cards.parquet') """ from __future__ import annotations import glob import json import os from datetime import datetime from typing import Optional import pandas as pd from code.logging_util import get_logger # Initialize logger logger = get_logger(__name__) class CardAggregator: """Aggregates individual card CSV files into a consolidated Parquet file.""" # Files to exclude from aggregation (master files used for other purposes) EXCLUDED_FILES = {"cards.csv", "commander_cards.csv", "background_cards.csv"} def __init__(self, output_dir: Optional[str] = None) -> None: """ Initialize CardAggregator. Args: output_dir: Directory for output files (defaults to CARD_FILES_DIR env var or 'card_files/') """ self.output_dir = output_dir or os.getenv("CARD_FILES_DIR", "card_files") self.ensure_output_dir() def ensure_output_dir(self) -> None: """Create output directory if it doesn't exist.""" os.makedirs(self.output_dir, exist_ok=True) logger.info(f"Card aggregator output directory: {self.output_dir}") def get_card_csvs(self, source_dir: str) -> list[str]: """ Get all card CSV files to aggregate, excluding master files. Args: source_dir: Directory containing card CSV files Returns: List of file paths to aggregate """ all_csvs = glob.glob(os.path.join(source_dir, "*.csv")) # Filter out excluded files and temporary files filtered = [ f for f in all_csvs if os.path.basename(f) not in self.EXCLUDED_FILES and not os.path.basename(f).startswith(".") and not os.path.basename(f).startswith("_temp") ] logger.info( f"Found {len(all_csvs)} CSV files, {len(filtered)} to aggregate " f"(excluded {len(all_csvs) - len(filtered)})" ) return filtered def deduplicate_cards(self, df: pd.DataFrame) -> pd.DataFrame: """ Remove duplicate card entries, keeping the most recent version. Uses the 'name' column as the unique identifier. When duplicates exist, keeps the last occurrence (assumes files are processed in order of modification time). Args: df: DataFrame with potential duplicates Returns: DataFrame with duplicates removed """ if "name" not in df.columns: logger.warning("Cannot deduplicate: 'name' column not found") return df original_count = len(df) df_deduped = df.drop_duplicates(subset=["name"], keep="last") removed_count = original_count - len(df_deduped) if removed_count > 0: logger.info(f"Removed {removed_count} duplicate cards (kept most recent)") return df_deduped def aggregate_all(self, source_dir: str, output_path: str) -> dict: """ Perform full aggregation of all card CSV files into a single Parquet file. 

    def aggregate_all(self, source_dir: str, output_path: str) -> dict:
        """
        Perform full aggregation of all card CSV files into a single Parquet file.

        Args:
            source_dir: Directory containing individual card CSV files
            output_path: Path for output Parquet file

        Returns:
            Dictionary with aggregation statistics:
                - files_processed: Number of CSV files aggregated
                - total_cards: Total cards in output (after deduplication)
                - duplicates_removed: Number of duplicate cards removed
                - file_size_mb: Size of output Parquet file in MB
                - elapsed_seconds: Time taken for aggregation

        Raises:
            FileNotFoundError: If source_dir doesn't exist
            ValueError: If no CSV files found to aggregate
        """
        start_time = datetime.now()

        if not os.path.exists(source_dir):
            raise FileNotFoundError(f"Source directory not found: {source_dir}")

        # Get CSV files to aggregate
        csv_files = self.get_card_csvs(source_dir)
        if not csv_files:
            raise ValueError(f"No CSV files found to aggregate in {source_dir}")

        logger.info(f"Starting aggregation of {len(csv_files)} files...")

        # Sort by modification time (oldest first, so newest are kept in deduplication)
        csv_files_sorted = sorted(csv_files, key=lambda f: os.path.getmtime(f))

        # Read and concatenate all CSV files
        dfs = []
        for csv_file in csv_files_sorted:
            try:
                # Skip comment lines (lines starting with #) in CSV files
                df = pd.read_csv(csv_file, low_memory=False, comment='#')
                if not df.empty:
                    dfs.append(df)
            except Exception as e:
                logger.warning(f"Failed to read {os.path.basename(csv_file)}: {e}")
                continue

        if not dfs:
            raise ValueError("No valid CSV files could be read")

        # Concatenate all DataFrames
        logger.info(f"Concatenating {len(dfs)} DataFrames...")
        combined_df = pd.concat(dfs, ignore_index=True)
        original_count = len(combined_df)

        # Deduplicate cards
        combined_df = self.deduplicate_cards(combined_df)
        duplicates_removed = original_count - len(combined_df)

        # Convert object columns with mixed types to strings for Parquet compatibility
        # Common columns that may have mixed types: power, toughness, keywords
        for col in ["power", "toughness", "keywords"]:
            if col in combined_df.columns:
                combined_df[col] = combined_df[col].astype(str)

        # Rotate existing versions before writing new file
        self.rotate_versions(output_path, keep_versions=3)

        # Write to Parquet
        logger.info(f"Writing {len(combined_df)} cards to {output_path}...")
        combined_df.to_parquet(output_path, engine="pyarrow", compression="snappy", index=False)

        # Calculate stats
        elapsed = (datetime.now() - start_time).total_seconds()
        file_size_mb = os.path.getsize(output_path) / (1024 * 1024)

        stats = {
            "files_processed": len(csv_files),
            "total_cards": len(combined_df),
            "duplicates_removed": duplicates_removed,
            "file_size_mb": round(file_size_mb, 2),
            "elapsed_seconds": round(elapsed, 2),
            "timestamp": datetime.now().isoformat(),
        }

        logger.info(
            f"Aggregation complete: {stats['total_cards']} cards "
            f"({stats['file_size_mb']} MB) in {stats['elapsed_seconds']}s"
        )

        # Save metadata
        self._save_metadata(source_dir, output_path, stats)

        return stats

    def detect_changes(self, source_dir: str, metadata_path: str) -> list[str]:
        """
        Detect which CSV files have changed since the last aggregation.

        Args:
            source_dir: Directory containing card CSV files
            metadata_path: Path to metadata JSON file from previous run

        Returns:
            List of file paths that have been added or modified
        """
        if not os.path.exists(metadata_path):
            logger.info("No previous metadata found, all files considered changed")
            return self.get_card_csvs(source_dir)

        try:
            with open(metadata_path, "r", encoding="utf-8") as f:
                metadata = json.load(f)
            # _save_metadata() stores the run timestamp under "last_aggregation"
            last_run = datetime.fromisoformat(metadata["last_aggregation"])
        except (json.JSONDecodeError, ValueError, KeyError) as e:
            logger.warning(f"Invalid metadata file: {e}, treating all files as changed")
            return self.get_card_csvs(source_dir)

        # Find files modified after last aggregation
        csv_files = self.get_card_csvs(source_dir)
        changed_files = [
            f for f in csv_files
            if datetime.fromtimestamp(os.path.getmtime(f)) > last_run
        ]

        logger.info(f"Detected {len(changed_files)} changed files since last aggregation")
        return changed_files

    def incremental_update(self, changed_files: list[str], output_path: str) -> dict:
        """
        Perform incremental update by replacing only changed cards.

        Note: This is a simplified implementation. For production use, consider:
        - Loading existing Parquet, removing old versions of changed cards, adding new
        - Currently performs full re-aggregation (simpler, safer for MVP)

        Args:
            changed_files: List of CSV files that have changed
            output_path: Path to existing Parquet file to update

        Returns:
            Dictionary with update statistics
        """
        # For MVP, we perform a full aggregation instead of a true incremental update.
        # A true incremental update would require:
        #   1. Load existing Parquet
        #   2. Identify cards from changed files
        #   3. Remove old versions of those cards
        #   4. Add new versions
        # This is more complex and error-prone, so it is deferred to a future iteration
        # (see incremental_update_sketch below for an illustrative outline).
        logger.info("Incremental update not yet implemented, performing full aggregation")

        source_dir = os.path.dirname(changed_files[0]) if changed_files else "csv_files"
        return self.aggregate_all(source_dir, output_path)

    def validate_output(self, output_path: str, source_dir: str) -> tuple[bool, list[str]]:
        """
        Validate the aggregated output file.

        Checks:
        - File exists and is readable
        - Contains expected columns ('name' at minimum)
        - Has a reasonable number of cards (>0; warns below 100)

        Args:
            output_path: Path to Parquet file to validate
            source_dir: Original source directory (currently unused; reserved for
                future sampling checks against the source CSVs)

        Returns:
            Tuple of (is_valid, list_of_errors)
        """
        errors = []

        # Check file exists
        if not os.path.exists(output_path):
            errors.append(f"Output file not found: {output_path}")
            return False, errors

        try:
            # Load Parquet file
            df = pd.read_parquet(output_path, engine="pyarrow")

            # Check not empty
            if df.empty:
                errors.append("Output file is empty")

            # Check has 'name' column at minimum
            if "name" not in df.columns:
                errors.append("Output file missing 'name' column")

            # Check for reasonable card count (at least 100 cards expected in any real dataset)
            if len(df) < 100:
                logger.warning(f"Output has only {len(df)} cards (expected more)")

            if not errors:
                logger.info(f"Validation passed: {len(df)} cards with {len(df.columns)} columns")

        except Exception as e:
            errors.append(f"Failed to read/validate output file: {e}")

        return len(errors) == 0, errors
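
    # Hedged sketch (not part of the original module): an illustrative outline of
    # the deferred "true" incremental update described in incremental_update()
    # above. The method name is hypothetical and nothing calls it; it assumes the
    # existing Parquet file and the changed CSVs both carry a 'name' column.
    def incremental_update_sketch(self, changed_files: list[str], output_path: str) -> pd.DataFrame:
        """Illustrative outline only; not wired into incremental_update()."""
        # 1. Load existing Parquet
        existing = pd.read_parquet(output_path, engine="pyarrow")

        # 2. Identify cards from changed files (same comment-skipping as aggregate_all)
        changed = pd.concat(
            [pd.read_csv(f, low_memory=False, comment="#") for f in changed_files],
            ignore_index=True,
        )

        # 3. Remove old versions of those cards
        existing = existing[~existing["name"].isin(changed["name"])]

        # 4. Add new versions, then deduplicate, keeping the newest rows
        updated = pd.concat([existing, changed], ignore_index=True)
        return self.deduplicate_cards(updated)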

    def rotate_versions(self, output_path: str, keep_versions: int = 3) -> None:
        """
        Rotate historical versions of the output file.

        Keeps the last N versions as backups (e.g., all_cards_v1.parquet,
        all_cards_v2.parquet).

        Args:
            output_path: Path to current output file
            keep_versions: Number of historical versions to keep (default: 3)
        """
        if not os.path.exists(output_path):
            return  # Nothing to rotate

        # Parse output path
        base_dir = os.path.dirname(output_path)
        filename = os.path.basename(output_path)
        name, ext = os.path.splitext(filename)

        # Rotate existing versions (oldest deleted, then v2 -> v3, v1 -> v2, current -> v1)
        for version in range(keep_versions, 0, -1):
            old_path = os.path.join(base_dir, f"{name}_v{version}{ext}")
            new_path = os.path.join(base_dir, f"{name}_v{version + 1}{ext}")

            if os.path.exists(old_path):
                if version + 1 > keep_versions:
                    # Delete oldest version
                    os.remove(old_path)
                    logger.info(f"Deleted old version: {os.path.basename(old_path)}")
                else:
                    # Rename to next version
                    os.rename(old_path, new_path)
                    logger.info(
                        f"Rotated {os.path.basename(old_path)} -> {os.path.basename(new_path)}"
                    )

        # Move current file to v1
        v1_path = os.path.join(base_dir, f"{name}_v1{ext}")
        if os.path.exists(output_path):
            os.rename(output_path, v1_path)
            logger.info(f"Rotated current file to {os.path.basename(v1_path)}")

    def _save_metadata(self, source_dir: str, output_path: str, stats: dict) -> None:
        """Save aggregation metadata for incremental updates."""
        metadata_path = os.path.join(self.output_dir, ".aggregate_metadata.json")

        metadata = {
            "source_dir": source_dir,
            "output_path": output_path,
            "last_aggregation": stats["timestamp"],
            "stats": stats,
        }

        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        logger.info(f"Saved aggregation metadata to {metadata_path}")
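

# Minimal usage sketch expanding on the module docstring's Usage section.
# The paths below ("csv_files", "card_files/all_cards.parquet") are the
# illustrative defaults from that docstring, not required locations.
if __name__ == "__main__":
    aggregator = CardAggregator()
    stats = aggregator.aggregate_all("csv_files", "card_files/all_cards.parquet")

    ok, problems = aggregator.validate_output("card_files/all_cards.parquet", "csv_files")
    if not ok:
        logger.error(f"Validation failed: {problems}")
    else:
        logger.info(f"Aggregated {stats['total_cards']} cards in {stats['elapsed_seconds']}s")

    # On later runs, detect_changes() shows whether re-aggregation is needed;
    # _save_metadata() writes .aggregate_metadata.json into output_dir.
    metadata_path = os.path.join(aggregator.output_dir, ".aggregate_metadata.json")
    changed = aggregator.detect_changes("csv_files", metadata_path)
    logger.info(f"{len(changed)} file(s) changed since the last aggregation")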