mtg_python_deckbuilder/code/scripts/strip_parquet_themes.py

253 lines
7.8 KiB
Python

#!/usr/bin/env python3
"""
Strip low-card themes from parquet file themeTags columns.
This script identifies and removes themes below the THEME_MIN_CARDS threshold
from the themeTags column in parquet files. It's part of Milestone 4 (M4) of
the Theme Stripping roadmap (R21).
Usage:
# Dry run to see what would be stripped
python code/scripts/strip_parquet_themes.py --dry-run
# Strip from single parquet file
python code/scripts/strip_parquet_themes.py --file card_files/processed/all_cards.parquet
# Strip from all parquet files in directory
python code/scripts/strip_parquet_themes.py --all
# Specify custom threshold
python code/scripts/strip_parquet_themes.py --threshold 10 --all
Environment Variables:
THEME_MIN_CARDS: Minimum card threshold (default: 5)
Outputs:
- Modified parquet file(s) with stripped themeTags
- Timestamped backup (.parquet.bak) unless --no-backup is given
- Updated logs/stripped_themes.yml log
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
# Add project root to path
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
from code import settings as code_settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_parquet_themes,
create_stripped_themes_log
)
def find_parquet_files(directory: Path) -> list[Path]:
    """Return the ``*.parquet`` files directly inside *directory*, sorted by path.

    The search is not recursive: only entries matching ``*.parquet`` in
    *directory* itself are returned.
    """
    matches = list(directory.glob("*.parquet"))
    matches.sort()
    return matches
def update_stripped_themes_log(
    theme_counts: dict,
    themes_to_strip: set[str],
    min_cards: int
) -> None:
    """Write the parquet stripping results to ``logs/stripped_themes.yml``.

    Args:
        theme_counts: Theme card counts, forwarded unchanged to the log writer.
        themes_to_strip: Themes selected for removal.
        min_cards: Threshold the themes were compared against.
    """
    destination = ROOT.joinpath("logs", "stripped_themes.yml")

    # The log is tagged with a fixed source label so readers can tell that
    # these entries came from the parquet stripping pass.
    log_fields = {
        "output_path": destination,
        "theme_counts": theme_counts,
        "themes_stripped": themes_to_strip,
        "min_threshold": min_cards,
        "sources": ["parquet files"],
    }
    create_stripped_themes_log(**log_fields)

    print(f"\nUpdated stripped themes log: {destination}")
def main() -> int:
    """Run the theme-stripping command-line workflow.

    Parses the command line, counts theme usage across the selected parquet
    files, identifies the themes below the threshold and, unless ``--dry-run``
    is given, strips them from each file and refreshes the stripped-themes log.

    Returns:
        Process exit code: 0 on success (including a dry run, or when no
        theme falls below the threshold); 1 when no input file could be
        resolved, when analysis failed, or when any file reported an error.
    """
    args = _build_parser().parse_args()

    # Compare against None rather than truthiness so that an explicit
    # ``--threshold 0`` is honoured instead of silently falling back to the
    # configured default.
    if args.threshold is None:
        min_cards = code_settings.THEME_MIN_CARDS
    else:
        min_cards = args.threshold

    parquet_files = _resolve_parquet_files(args)
    if not parquet_files:
        # The helper has already printed the reason.
        return 1

    print("Theme Stripping Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Files to process: {len(parquet_files)}")
    print(f" Backup enabled: {not args.no_backup}")
    print(f" Dry run: {args.dry_run}")
    print()

    # Get theme card counts from parquet files
    print("Analyzing theme card counts...")
    try:
        theme_counts = get_theme_card_counts(parquet_files)
        print(f"Found {len(theme_counts)} unique themes across files")
    except Exception as e:
        print(f"Error analyzing theme counts: {e}")
        return 1

    # Identify themes to strip
    print("Identifying themes to strip...")
    try:
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    except Exception as e:
        print(f"Error identifying themes to strip: {e}")
        return 1

    if not themes_to_strip:
        print("No themes found below threshold. Nothing to strip.")
        return 0

    print(f"Found {len(themes_to_strip)} themes to strip")
    if args.verbose:
        sample = sorted(themes_to_strip)[:10]
        print(f"Sample themes: {', '.join(sample)}")
        if len(themes_to_strip) > 10:
            print(f" ... and {len(themes_to_strip) - 10} more")
    print()

    # Dry run mode: report the plan and stop before touching any file.
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        for parquet_file in parquet_files:
            print(f"Would process: {parquet_file}")
        print(f"\nWould strip {len(themes_to_strip)} themes from themeTags column")
        return 0

    # Process each parquet file, accumulating totals across all of them.
    total_results = {
        "files_processed": 0,
        "cards_processed": 0,
        "tags_removed": 0,
        "errors": []
    }
    for parquet_file in parquet_files:
        print(f"Processing: {parquet_file.name}")
        try:
            results = strip_parquet_themes(
                parquet_path=parquet_file,
                themes_to_strip=themes_to_strip,
                backup=not args.no_backup
            )
            total_results["files_processed"] += 1
            total_results["cards_processed"] += results["cards_processed"]
            total_results["tags_removed"] += results["tags_removed"]
            total_results["errors"].extend(results["errors"])
            if args.verbose:
                print(f" Cards: {results['cards_processed']}")
                print(f" Tags removed: {results['tags_removed']}")
                if results["backup_created"]:
                    print(f" Backup: {results['backup_created']}")
        except Exception as e:
            # One bad file must not abort the batch: record it and move on.
            error_msg = f"Error processing {parquet_file}: {e}"
            print(f" {error_msg}")
            total_results["errors"].append(error_msg)
            continue
        print()

    # Update stripped themes log (best effort: a log failure is only a warning)
    try:
        update_stripped_themes_log(theme_counts, themes_to_strip, min_cards)
    except Exception as e:
        print(f"Warning: Failed to update stripped themes log: {e}")

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Files processed: {total_results['files_processed']}")
    print(f"Cards processed: {total_results['cards_processed']}")
    print(f"Tags removed: {total_results['tags_removed']}")
    print(f"Themes stripped: {len(themes_to_strip)}")
    if total_results["errors"]:
        print(f"\nErrors encountered: {len(total_results['errors'])}")
        for error in total_results["errors"]:
            print(f" - {error}")
    else:
        print("\nStripping completed successfully!")
    return 0 if not total_results["errors"] else 1


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser describing this script's command-line flags."""
    parser = argparse.ArgumentParser(
        description="Strip low-card themes from parquet themeTags columns",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--file',
        type=Path,
        help='Specific parquet file to process'
    )
    parser.add_argument(
        '--all',
        action='store_true',
        help='Process all parquet files in card_files/processed/'
    )
    parser.add_argument(
        '--threshold',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )
    return parser


def _resolve_parquet_files(args: argparse.Namespace) -> list[Path]:
    """Return the parquet files selected by the flags, or [] after printing why none are usable."""
    # --file takes precedence over --all when both are supplied.
    if args.file:
        if not args.file.exists():
            print(f"Error: File not found: {args.file}")
            return []
        return [args.file]

    if args.all:
        processed_dir = ROOT / "card_files" / "processed"
        parquet_files = sorted(processed_dir.glob("*.parquet"))
        if not parquet_files:
            print(f"No parquet files found in {processed_dir}")
        return parquet_files

    # Default: process all_cards.parquet
    default_file = ROOT / "card_files" / "processed" / "all_cards.parquet"
    if not default_file.exists():
        print(f"Error: Default file not found: {default_file}")
        print("Use --file or --all to specify files to process")
        return []
    return [default_file]
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())