"""
|
|
Card Data Aggregator
|
|
|
|
Consolidates individual card CSV files into a single Parquet file for improved
|
|
performance in card browsing, theme cataloging, and searches.
|
|
|
|
Key Features:
|
|
- Merges all card CSVs into all_cards.parquet (50-70% size reduction, 2-5x faster)
|
|
- Excludes master files (cards.csv, commander_cards.csv) from aggregation
|
|
- Deduplication logic (keeps most recent when card appears in multiple files)
|
|
- Incremental updates (only re-process changed files)
|
|
- Version rotation (maintains 2-3 historical versions for rollback)
|
|
- Validation (ensures no data loss)
|
|
|
|
Usage:
|
|
aggregator = CardAggregator()
|
|
stats = aggregator.aggregate_all('csv_files', 'card_files/all_cards.parquet')
|
|
"""

from __future__ import annotations

import glob
import json
import os
from datetime import datetime
from typing import Optional

import pandas as pd

from code.logging_util import get_logger

# Initialize logger
logger = get_logger(__name__)


class CardAggregator:
    """Aggregates individual card CSV files into a consolidated Parquet file."""

    # Files to exclude from aggregation (master files used for other purposes)
    EXCLUDED_FILES = {"cards.csv", "commander_cards.csv", "background_cards.csv"}

    def __init__(self, output_dir: Optional[str] = None) -> None:
        """
        Initialize CardAggregator.

        Args:
            output_dir: Directory for output files (defaults to CARD_FILES_DIR env var or 'card_files/')
        """
        self.output_dir = output_dir or os.getenv("CARD_FILES_DIR", "card_files")
        self.ensure_output_dir()
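
    # A minimal illustration (hypothetical paths) of the two ways to choose the
    # output directory handled above:
    #
    #     agg = CardAggregator()                         # CARD_FILES_DIR or 'card_files'
    #     agg = CardAggregator(output_dir="/tmp/cards")  # explicit override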

    def ensure_output_dir(self) -> None:
        """Create output directory if it doesn't exist."""
        os.makedirs(self.output_dir, exist_ok=True)
        logger.info(f"Card aggregator output directory: {self.output_dir}")

    def get_card_csvs(self, source_dir: str) -> list[str]:
        """
        Get all card CSV files to aggregate, excluding master files.

        Args:
            source_dir: Directory containing card CSV files

        Returns:
            List of file paths to aggregate
        """
        all_csvs = glob.glob(os.path.join(source_dir, "*.csv"))

        # Filter out excluded files and temporary files
        filtered = [
            f
            for f in all_csvs
            if os.path.basename(f) not in self.EXCLUDED_FILES
            and not os.path.basename(f).startswith(".")
            and not os.path.basename(f).startswith("_temp")
        ]

        logger.info(
            f"Found {len(all_csvs)} CSV files, {len(filtered)} to aggregate "
            f"(excluded {len(all_csvs) - len(filtered)})"
        )

        return filtered
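
    # Illustration (hypothetical filenames) of the filtering above:
    #
    #     csv_files/azorius_cards.csv    -> aggregated
    #     csv_files/cards.csv            -> skipped (master file)
    #     csv_files/commander_cards.csv  -> skipped (master file)
    #     csv_files/_temp_download.csv   -> skipped (temporary file)
    #     csv_files/.in_progress.csv     -> skipped (hidden file)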

    def deduplicate_cards(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove duplicate card entries, keeping the most recent version.

        Uses the 'name' column as the unique identifier. When duplicates exist,
        keeps the last occurrence (assumes files are processed in order of modification time).

        Args:
            df: DataFrame with potential duplicates

        Returns:
            DataFrame with duplicates removed
        """
        if "name" not in df.columns:
            logger.warning("Cannot deduplicate: 'name' column not found")
            return df

        original_count = len(df)
        df_deduped = df.drop_duplicates(subset=["name"], keep="last")
        removed_count = original_count - len(df_deduped)

        if removed_count > 0:
            logger.info(f"Removed {removed_count} duplicate cards (kept most recent)")

        return df_deduped
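
    # Tiny sketch (made-up rows) of the keep="last" behavior above. Because
    # aggregate_all() concatenates files oldest-first, the later row wins:
    #
    #     df = pd.DataFrame({"name": ["Sol Ring", "Sol Ring"], "text": ["old", "new"]})
    #     CardAggregator().deduplicate_cards(df)
    #     # -> one row remains, the one with text == "new"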

    def aggregate_all(self, source_dir: str, output_path: str) -> dict:
        """
        Perform full aggregation of all card CSV files into a single Parquet file.

        Args:
            source_dir: Directory containing individual card CSV files
            output_path: Path for output Parquet file

        Returns:
            Dictionary with aggregation statistics:
            - files_processed: Number of CSV files aggregated
            - total_cards: Total cards in output (after deduplication)
            - duplicates_removed: Number of duplicate cards removed
            - file_size_mb: Size of output Parquet file in MB
            - elapsed_seconds: Time taken for aggregation

        Raises:
            FileNotFoundError: If source_dir doesn't exist
            ValueError: If no CSV files found to aggregate
        """
        start_time = datetime.now()

        if not os.path.exists(source_dir):
            raise FileNotFoundError(f"Source directory not found: {source_dir}")

        # Get CSV files to aggregate
        csv_files = self.get_card_csvs(source_dir)
        if not csv_files:
            raise ValueError(f"No CSV files found to aggregate in {source_dir}")

        logger.info(f"Starting aggregation of {len(csv_files)} files...")

        # Sort by modification time (oldest first, so newest are kept in deduplication)
        csv_files_sorted = sorted(csv_files, key=lambda f: os.path.getmtime(f))

        # Read and concatenate all CSV files
        dfs = []
        for csv_file in csv_files_sorted:
            try:
                # Skip comment lines (lines starting with #) in CSV files
                df = pd.read_csv(csv_file, low_memory=False, comment="#")
                if not df.empty:
                    dfs.append(df)
            except Exception as e:
                logger.warning(f"Failed to read {os.path.basename(csv_file)}: {e}")
                continue

        if not dfs:
            raise ValueError("No valid CSV files could be read")

        # Concatenate all DataFrames
        logger.info(f"Concatenating {len(dfs)} DataFrames...")
        combined_df = pd.concat(dfs, ignore_index=True)
        original_count = len(combined_df)

        # Deduplicate cards
        combined_df = self.deduplicate_cards(combined_df)
        duplicates_removed = original_count - len(combined_df)

        # Convert object columns with mixed types to strings for Parquet compatibility
        # Common columns that may have mixed types: power, toughness, keywords
        for col in ["power", "toughness", "keywords"]:
            if col in combined_df.columns:
                combined_df[col] = combined_df[col].astype(str)

        # Rotate existing versions before writing new file
        self.rotate_versions(output_path, keep_versions=3)

        # Write to Parquet
        logger.info(f"Writing {len(combined_df)} cards to {output_path}...")
        combined_df.to_parquet(output_path, engine="pyarrow", compression="snappy", index=False)

        # Calculate stats
        elapsed = (datetime.now() - start_time).total_seconds()
        file_size_mb = os.path.getsize(output_path) / (1024 * 1024)

        stats = {
            "files_processed": len(csv_files),
            "total_cards": len(combined_df),
            "duplicates_removed": duplicates_removed,
            "file_size_mb": round(file_size_mb, 2),
            "elapsed_seconds": round(elapsed, 2),
            "timestamp": datetime.now().isoformat(),
        }

        logger.info(
            f"Aggregation complete: {stats['total_cards']} cards "
            f"({stats['file_size_mb']} MB) in {stats['elapsed_seconds']}s"
        )

        # Save metadata
        self._save_metadata(source_dir, output_path, stats)

        return stats
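
    # End-to-end sketch (paths are illustrative) of a full aggregation followed by
    # a validation pass using validate_output() below:
    #
    #     agg = CardAggregator("card_files")
    #     stats = agg.aggregate_all("csv_files", "card_files/all_cards.parquet")
    #     ok, problems = agg.validate_output("card_files/all_cards.parquet", "csv_files")
    #     if not ok:
    #         logger.error(f"Aggregation validation failed: {problems}")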

    def detect_changes(self, source_dir: str, metadata_path: str) -> list[str]:
        """
        Detect which CSV files have changed since the last aggregation.

        Args:
            source_dir: Directory containing card CSV files
            metadata_path: Path to metadata JSON file from previous run

        Returns:
            List of file paths that have been added or modified
        """
        if not os.path.exists(metadata_path):
            logger.info("No previous metadata found, all files considered changed")
            return self.get_card_csvs(source_dir)

        try:
            with open(metadata_path, "r", encoding="utf-8") as f:
                metadata = json.load(f)
            # _save_metadata() stores the run time under "last_aggregation"
            last_run = datetime.fromisoformat(metadata.get("last_aggregation", ""))
        except (json.JSONDecodeError, ValueError, KeyError) as e:
            logger.warning(f"Invalid metadata file: {e}, treating all files as changed")
            return self.get_card_csvs(source_dir)

        # Find files modified after last aggregation
        csv_files = self.get_card_csvs(source_dir)
        changed_files = [
            f for f in csv_files if datetime.fromtimestamp(os.path.getmtime(f)) > last_run
        ]

        logger.info(f"Detected {len(changed_files)} changed files since last aggregation")
        return changed_files
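
    # Sketch of a change-driven refresh (paths illustrative; the metadata filename
    # matches what _save_metadata() writes):
    #
    #     agg = CardAggregator("card_files")
    #     meta = os.path.join(agg.output_dir, ".aggregate_metadata.json")
    #     changed = agg.detect_changes("csv_files", meta)
    #     if changed:
    #         agg.incremental_update(changed, "card_files/all_cards.parquet")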

    def incremental_update(self, changed_files: list[str], output_path: str) -> dict:
        """
        Perform incremental update by replacing only changed cards.

        Note: This is a simplified implementation. For production use, consider
        loading the existing Parquet, removing old versions of the changed cards,
        and appending the new versions. Currently this performs a full
        re-aggregation (simpler and safer for the MVP).

        Args:
            changed_files: List of CSV files that have changed
            output_path: Path to existing Parquet file to update

        Returns:
            Dictionary with update statistics
        """
        # For the MVP, we perform a full aggregation instead of a true incremental update.
        # A true incremental update would require:
        # 1. Load existing Parquet
        # 2. Identify cards from changed files
        # 3. Remove old versions of those cards
        # 4. Add new versions
        # This is more complex and error-prone, so it is deferred to a future iteration.
        # (A sketch of that approach follows this method.)

        logger.info("Incremental update not yet implemented, performing full aggregation")
        source_dir = os.path.dirname(changed_files[0]) if changed_files else "csv_files"
        return self.aggregate_all(source_dir, output_path)
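
    # One possible shape for the true incremental update described above (a hedged
    # sketch only; not what this method currently does):
    #
    #     existing = pd.read_parquet(output_path, engine="pyarrow")
    #     fresh = pd.concat(
    #         [pd.read_csv(f, low_memory=False, comment="#") for f in changed_files],
    #         ignore_index=True,
    #     )
    #     kept = existing[~existing["name"].isin(set(fresh["name"]))]
    #     updated = pd.concat([kept, fresh], ignore_index=True)
    #     updated.to_parquet(output_path, engine="pyarrow", compression="snappy", index=False)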

    def validate_output(self, output_path: str, source_dir: str) -> tuple[bool, list[str]]:
        """
        Validate the aggregated output file.

        Checks:
        - File exists and is readable
        - Contains expected columns
        - Has a reasonable number of cards (>0)
        - Random sampling against source data (planned; not yet implemented)

        Args:
            output_path: Path to Parquet file to validate
            source_dir: Original source directory for comparison (reserved for the sampling check)

        Returns:
            Tuple of (is_valid, list_of_errors)
        """
        errors = []

        # Check file exists
        if not os.path.exists(output_path):
            errors.append(f"Output file not found: {output_path}")
            return False, errors

        try:
            # Load Parquet file
            df = pd.read_parquet(output_path, engine="pyarrow")

            # Check not empty
            if df.empty:
                errors.append("Output file is empty")

            # Check has 'name' column at minimum
            if "name" not in df.columns:
                errors.append("Output file missing 'name' column")

            # Check for reasonable card count (at least 100 cards expected in any real dataset)
            if len(df) < 100:
                logger.warning(f"Output has only {len(df)} cards (expected more)")

            if not errors:
                logger.info(f"Validation passed: {len(df)} cards with {len(df.columns)} columns")

        except Exception as e:
            errors.append(f"Failed to read/validate output file: {e}")

        return len(errors) == 0, errors

    def rotate_versions(self, output_path: str, keep_versions: int = 3) -> None:
        """
        Rotate historical versions of the output file.

        Keeps the last N versions as backups (e.g., all_cards_v1.parquet, all_cards_v2.parquet).

        Args:
            output_path: Path to current output file
            keep_versions: Number of historical versions to keep (default: 3)
        """
        if not os.path.exists(output_path):
            return  # Nothing to rotate

        # Parse output path
        base_dir = os.path.dirname(output_path)
        filename = os.path.basename(output_path)
        name, ext = os.path.splitext(filename)

        # Rotate existing versions (drop oldest, then v2 -> v3, v1 -> v2, current -> v1)
        for version in range(keep_versions, 0, -1):
            old_path = os.path.join(base_dir, f"{name}_v{version}{ext}")
            new_path = os.path.join(base_dir, f"{name}_v{version + 1}{ext}")

            if os.path.exists(old_path):
                if version + 1 > keep_versions:
                    # Delete oldest version
                    os.remove(old_path)
                    logger.info(f"Deleted old version: {os.path.basename(old_path)}")
                else:
                    # Rename to next version
                    os.rename(old_path, new_path)
                    logger.info(
                        f"Rotated {os.path.basename(old_path)} -> {os.path.basename(new_path)}"
                    )

        # Move current file to v1
        v1_path = os.path.join(base_dir, f"{name}_v1{ext}")
        if os.path.exists(output_path):
            os.rename(output_path, v1_path)
            logger.info(f"Rotated current file to {os.path.basename(v1_path)}")

    def _save_metadata(self, source_dir: str, output_path: str, stats: dict) -> None:
        """Save aggregation metadata for incremental updates."""
        metadata_path = os.path.join(self.output_dir, ".aggregate_metadata.json")

        metadata = {
            "source_dir": source_dir,
            "output_path": output_path,
            "last_aggregation": stats["timestamp"],
            "stats": stats,
        }

        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        logger.info(f"Saved aggregation metadata to {metadata_path}")