# mtg_python_deckbuilder/code/scripts/aggregate_cards.py
#!/usr/bin/env python3
"""
Aggregate Cards CLI Script
Command-line interface for consolidating individual card CSV files into a single
Parquet file. Useful for manual aggregation runs, testing, and recovery.
Usage:
python code/scripts/aggregate_cards.py
python code/scripts/aggregate_cards.py --source csv_files --output card_files/all_cards.parquet
python code/scripts/aggregate_cards.py --validate-only
python code/scripts/aggregate_cards.py --incremental
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Add project root to sys.path so the "code.*" package imports below resolve
# when this script is executed directly (e.g. `python code/scripts/aggregate_cards.py`)
# rather than as an installed module. Must happen before those imports.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from code.file_setup.card_aggregator import CardAggregator
from code.logging_util import get_logger
from code.settings import CSV_DIRECTORY, CARD_FILES_DIRECTORY
# Module-level logger, named after this module per project convention.
logger = get_logger(__name__)
def _report_validation(aggregator: CardAggregator, output_path: str, source: str) -> int:
    """Validate *output_path* against *source*, log the outcome, and return an exit code.

    Returns 0 when validation passes, 1 when it fails (with one log line per error).
    """
    is_valid, errors = aggregator.validate_output(output_path, source)
    if is_valid:
        logger.info("✓ Validation passed")
        return 0
    logger.error("✗ Validation failed:")
    for error in errors:
        logger.error(" - %s", error)
    return 1


def main() -> int:
    """Main entry point for aggregate_cards CLI.

    Parses command-line arguments, then runs one of three modes:
    validate-only, incremental aggregation, or full aggregation.

    Returns:
        Process exit code: 0 on success, 1 on validation failure or error.
    """
    parser = argparse.ArgumentParser(
        description="Aggregate individual card CSV files into consolidated Parquet file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--source",
        "-s",
        default=CSV_DIRECTORY,
        help=f"Source directory containing card CSV files (default: {CSV_DIRECTORY})",
    )
    parser.add_argument(
        "--output",
        "-o",
        default=None,
        help="Output Parquet file path (default: card_files/all_cards.parquet)",
    )
    parser.add_argument(
        "--output-dir",
        default=CARD_FILES_DIRECTORY,
        help=f"Output directory for Parquet files (default: {CARD_FILES_DIRECTORY})",
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate existing output file, don't aggregate",
    )
    parser.add_argument(
        "--incremental",
        "-i",
        action="store_true",
        help="Perform incremental update (only changed files)",
    )
    # NOTE(review): --keep-versions is accepted but never forwarded to the
    # aggregator anywhere in this script — wire it through to CardAggregator
    # (or remove the flag) once the API supports version retention.
    parser.add_argument(
        "--keep-versions",
        type=int,
        default=3,
        help="Number of historical versions to keep (default: 3)",
    )
    args = parser.parse_args()

    # Initialize aggregator
    aggregator = CardAggregator(output_dir=args.output_dir)

    # Explicit --output wins; otherwise default to <output_dir>/all_cards.parquet.
    output_path = args.output or f"{args.output_dir}/all_cards.parquet"

    try:
        if args.validate_only:
            # Validation only mode: check the existing output, no aggregation.
            logger.info("Validating %s...", output_path)
            return _report_validation(aggregator, output_path, args.source)

        if args.incremental:
            # Incremental update mode: only re-process files changed since
            # the state recorded in the metadata sidecar file.
            logger.info("Starting incremental aggregation...")
            metadata_path = f"{args.output_dir}/.aggregate_metadata.json"
            changed_files = aggregator.detect_changes(args.source, metadata_path)
            if not changed_files:
                logger.info("No changes detected, skipping aggregation")
                return 0
            stats = aggregator.incremental_update(changed_files, output_path)
        else:
            # Full aggregation mode: rebuild from every source CSV.
            logger.info("Starting full aggregation...")
            stats = aggregator.aggregate_all(args.source, output_path)

        # Print summary
        print("\n" + "=" * 60)
        print("AGGREGATION SUMMARY")
        print("=" * 60)
        print(f"Files processed: {stats['files_processed']}")
        print(f"Total cards: {stats['total_cards']:,}")
        print(f"Duplicates removed: {stats['duplicates_removed']:,}")
        print(f"File size: {stats['file_size_mb']:.2f} MB")
        print(f"Time elapsed: {stats['elapsed_seconds']:.2f} seconds")
        print(f"Output: {output_path}")
        print("=" * 60)

        # Always validate the freshly written output before reporting success.
        logger.info("\nValidating output...")
        return _report_validation(aggregator, output_path, args.source)
    except (FileNotFoundError, ValueError) as e:
        # Expected failure modes (missing source dir, malformed data).
        logger.error("Error: %s", e)
        return 1
    except Exception as e:
        # Top-level CLI boundary: log with traceback rather than crash.
        logger.error("Unexpected error: %s", e)
        import traceback
        traceback.print_exc()
        return 1
if __name__ == "__main__":
    # Propagate main()'s integer result as the process exit status.
    sys.exit(main())