mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2026-03-24 14:06:31 +01:00
254 lines
7.8 KiB
Python
254 lines
7.8 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Strip low-card themes from parquet file themeTags columns.
|
||
|
|
|
||
|
|
This script identifies and removes themes below the THEME_MIN_CARDS threshold
|
||
|
|
from the themeTags column in parquet files. It's part of Milestone 4 (M4) of
|
||
|
|
the Theme Stripping roadmap (R21).
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
# Dry run to see what would be stripped
|
||
|
|
python code/scripts/strip_parquet_themes.py --dry-run
|
||
|
|
|
||
|
|
# Strip from single parquet file
|
||
|
|
python code/scripts/strip_parquet_themes.py --file card_files/processed/all_cards.parquet
|
||
|
|
|
||
|
|
# Strip from all parquet files in directory
|
||
|
|
python code/scripts/strip_parquet_themes.py --all
|
||
|
|
|
||
|
|
# Specify custom threshold
|
||
|
|
python code/scripts/strip_parquet_themes.py --threshold 10 --all
|
||
|
|
|
||
|
|
Environment Variables:
|
||
|
|
THEME_MIN_CARDS: Minimum card threshold (default: 5)
|
||
|
|
|
||
|
|
Outputs:
|
||
|
|
- Modified parquet file(s) with stripped themeTags
|
||
|
|
- Timestamped backup (.parquet.bak), created by default; pass --no-backup to skip
|
||
|
|
- Updated logs/stripped_themes.yml log
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime
|
||
|
|
|
||
|
|
# Add project root to path
|
||
|
|
ROOT = Path(__file__).resolve().parent.parent.parent
|
||
|
|
sys.path.insert(0, str(ROOT))
|
||
|
|
|
||
|
|
from code import settings as code_settings
|
||
|
|
from code.tagging.theme_stripper import (
|
||
|
|
get_theme_card_counts,
|
||
|
|
identify_themes_to_strip,
|
||
|
|
strip_parquet_themes,
|
||
|
|
create_stripped_themes_log
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
def find_parquet_files(directory: Path) -> list[Path]:
    """Return the ``*.parquet`` entries directly inside *directory*, sorted.

    Only the top level of *directory* is searched (no recursion).
    """
    matches = list(directory.glob("*.parquet"))
    matches.sort()
    return matches
|
||
|
|
|
||
|
|
|
||
|
|
def update_stripped_themes_log(
    theme_counts: dict,
    themes_to_strip: set[str],
    min_cards: int
) -> None:
    """Write the parquet stripping results to ``logs/stripped_themes.yml``.

    Delegates to ``create_stripped_themes_log`` with the source recorded as
    "parquet files", then prints the path of the log that was written.

    Args:
        theme_counts: Theme counts passed through as ``theme_counts``.
        themes_to_strip: Themes passed through as ``themes_stripped``.
        min_cards: Threshold passed through as ``min_threshold``.
    """
    destination = ROOT / "logs" / "stripped_themes.yml"

    # Tag the log entry so readers can tell it came from the parquet pass.
    log_kwargs = {
        "output_path": destination,
        "theme_counts": theme_counts,
        "themes_stripped": themes_to_strip,
        "min_threshold": min_cards,
        "sources": ["parquet files"],
    }
    create_stripped_themes_log(**log_kwargs)

    print(f"\nUpdated stripped themes log: {destination}")
|
||
|
|
|
||
|
|
|
||
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the theme stripping script."""
    parser = argparse.ArgumentParser(
        description="Strip low-card themes from parquet themeTags columns",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument(
        '--file',
        type=Path,
        help='Specific parquet file to process'
    )

    parser.add_argument(
        '--all',
        action='store_true',
        help='Process all parquet files in card_files/processed/'
    )

    parser.add_argument(
        '--threshold',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )

    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )

    return parser


def _select_parquet_files(args: argparse.Namespace) -> list[Path]:
    """Resolve which parquet files to process from the parsed arguments.

    Precedence: ``--file`` first, then ``--all``, otherwise the default
    ``card_files/processed/all_cards.parquet``. On failure an explanatory
    message is printed and an empty list is returned.
    """
    if args.file:
        if not args.file.exists():
            print(f"Error: File not found: {args.file}")
            return []
        return [args.file]

    if args.all:
        processed_dir = ROOT / "card_files" / "processed"
        parquet_files = find_parquet_files(processed_dir)
        if not parquet_files:
            print(f"No parquet files found in {processed_dir}")
        return parquet_files

    # Default: process all_cards.parquet
    default_file = ROOT / "card_files" / "processed" / "all_cards.parquet"
    if not default_file.exists():
        print(f"Error: Default file not found: {default_file}")
        print("Use --file or --all to specify files to process")
        return []
    return [default_file]


def main() -> int:
    """Run the parquet theme stripping command-line workflow.

    Steps: parse arguments, pick the files to process, count cards per theme,
    identify themes under the threshold, then either report (``--dry-run``)
    or strip them from each file and update the stripped-themes log.

    Returns:
        Process exit status: 0 when stripping succeeded, when there was
        nothing to strip, or after a dry run; 1 when input files could not be
        resolved, analysis failed, or any file reported an error.
    """
    args = _build_parser().parse_args()

    # Determine threshold. Compare against None (not truthiness) so that an
    # explicit `--threshold 0` is honored instead of silently falling back to
    # the configured default.
    if args.threshold is not None:
        min_cards = args.threshold
    else:
        min_cards = code_settings.THEME_MIN_CARDS

    # Determine which files to process
    parquet_files = _select_parquet_files(args)
    if not parquet_files:
        return 1

    print("Theme Stripping Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Files to process: {len(parquet_files)}")
    print(f" Backup enabled: {not args.no_backup}")
    print(f" Dry run: {args.dry_run}")
    print()

    # Get theme card counts from parquet files
    print("Analyzing theme card counts...")
    try:
        theme_counts = get_theme_card_counts(parquet_files)
        print(f"Found {len(theme_counts)} unique themes across files")
    except Exception as e:
        # Top-level CLI boundary: report and exit non-zero instead of a traceback.
        print(f"Error analyzing theme counts: {e}")
        return 1

    # Identify themes to strip
    print("Identifying themes to strip...")
    try:
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    except Exception as e:
        print(f"Error identifying themes to strip: {e}")
        return 1

    if not themes_to_strip:
        print("No themes found below threshold. Nothing to strip.")
        return 0

    print(f"Found {len(themes_to_strip)} themes to strip")

    if args.verbose:
        sample = sorted(themes_to_strip)[:10]
        print(f"Sample themes: {', '.join(sample)}")
        if len(themes_to_strip) > 10:
            print(f" ... and {len(themes_to_strip) - 10} more")

    print()

    # Dry run mode
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        for parquet_file in parquet_files:
            print(f"Would process: {parquet_file}")
        print(f"\nWould strip {len(themes_to_strip)} themes from themeTags column")
        return 0

    # Process each parquet file
    total_results = {
        "files_processed": 0,
        "cards_processed": 0,
        "tags_removed": 0,
        "errors": []
    }

    for parquet_file in parquet_files:
        print(f"Processing: {parquet_file.name}")

        try:
            results = strip_parquet_themes(
                parquet_path=parquet_file,
                themes_to_strip=themes_to_strip,
                backup=not args.no_backup
            )

            total_results["files_processed"] += 1
            total_results["cards_processed"] += results["cards_processed"]
            total_results["tags_removed"] += results["tags_removed"]
            total_results["errors"].extend(results["errors"])

            if args.verbose:
                print(f" Cards: {results['cards_processed']}")
                print(f" Tags removed: {results['tags_removed']}")
                if results["backup_created"]:
                    print(f" Backup: {results['backup_created']}")

        except Exception as e:
            # One failing file must not abort the batch; record it and move on.
            error_msg = f"Error processing {parquet_file}: {e}"
            print(f" {error_msg}")
            total_results["errors"].append(error_msg)
            continue

        print()

    # Update stripped themes log (best effort: a logging failure is only a warning)
    try:
        update_stripped_themes_log(theme_counts, themes_to_strip, min_cards)
    except Exception as e:
        print(f"Warning: Failed to update stripped themes log: {e}")

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Files processed: {total_results['files_processed']}")
    print(f"Cards processed: {total_results['cards_processed']}")
    print(f"Tags removed: {total_results['tags_removed']}")
    print(f"Themes stripped: {len(themes_to_strip)}")

    if total_results["errors"]:
        print(f"\nErrors encountered: {len(total_results['errors'])}")
        for error in total_results["errors"]:
            print(f" - {error}")
    else:
        print("\nStripping completed successfully!")

    return 0 if not total_results["errors"] else 1
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|