mtg_python_deckbuilder/code/scripts/strip_catalog_themes.py

166 lines
5.5 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Strip Theme Catalog Script
Removes themes with insufficient card counts from the theme catalog YAML files.
Creates backups and logs all stripped themes for reference.
Usage:
python -m code.scripts.strip_catalog_themes [--min-cards N] [--no-backup] [--dry-run]
Options:
--min-cards N Override THEME_MIN_CARDS setting (default: from environment/settings)
--no-backup Skip creating backup files
--dry-run Show what would be stripped without making changes
Example:
python -m code.scripts.strip_catalog_themes
python -m code.scripts.strip_catalog_themes --min-cards 3 --dry-run
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Add project root to path for imports
PROJECT_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(PROJECT_ROOT))
from code import settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_catalog_themes,
create_stripped_themes_log,
get_theme_distribution
)
def main():
parser = argparse.ArgumentParser(
description="Strip themes with insufficient card counts from catalog YAML files"
)
parser.add_argument(
"--min-cards",
type=int,
default=settings.THEME_MIN_CARDS,
help=f"Minimum cards required to keep a theme (default: {settings.THEME_MIN_CARDS})"
)
parser.add_argument(
"--no-backup",
action="store_true",
help="Skip creating backup files before modification"
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Show what would be stripped without making changes"
)
args = parser.parse_args()
# Paths
processed_dir = Path(settings.CARD_FILES_PROCESSED_DIR)
catalog_dir = PROJECT_ROOT / 'config' / 'themes' / 'catalog'
log_dir = PROJECT_ROOT / 'logs'
stripped_log_path = log_dir / 'stripped_themes.yml'
print(f"Stripping themes from catalog (min_cards={args.min_cards})")
print(f"Catalog directory: {catalog_dir}")
print(f"Dry run: {args.dry_run}")
print()
# Step 1: Get theme card counts from parquet files
print("Step 1: Analyzing theme card counts from parquet files...")
parquet_files = sorted(processed_dir.glob("*.parquet"))
if not parquet_files:
print(f"Error: No parquet files found in {processed_dir}")
return 1
print(f"Found {len(parquet_files)} parquet files")
theme_counts = get_theme_card_counts(parquet_files)
print(f"Found {len(theme_counts)} unique themes")
print()
# Step 2: Get distribution
distribution = get_theme_distribution(theme_counts)
print("Theme distribution:")
print(f" 1 card: {distribution['1_card']:4d} themes")
print(f" 2 cards: {distribution['2_cards']:4d} themes")
print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
print(f" 10+ cards: {distribution['10_plus']:4d} themes")
print(f" Total: {distribution['total']:4d} themes")
print()
# Step 3: Identify themes to strip
themes_to_strip = identify_themes_to_strip(theme_counts, args.min_cards)
themes_to_keep = set(theme_counts.keys()) - themes_to_strip
print(f"Themes to strip: {len(themes_to_strip)} ({len(themes_to_strip)/len(theme_counts)*100:.1f}%)")
print(f"Themes to keep: {len(themes_to_keep)} ({len(themes_to_keep)/len(theme_counts)*100:.1f}%)")
print()
# Show sample of themes to strip
if themes_to_strip:
print("Sample themes to strip (first 10):")
sample = sorted(themes_to_strip)[:10]
for theme_id in sample:
count = len(theme_counts[theme_id])
cards_sample = sorted(theme_counts[theme_id])[:3]
cards_str = ", ".join(cards_sample)
if count > 3:
cards_str += f", ... ({count} total)"
print(f" - {theme_id} ({count} cards): {cards_str}")
print()
if args.dry_run:
print("DRY RUN: No changes made")
return 0
# Step 4: Strip themes from catalog
print("Step 4: Stripping themes from catalog YAML files...")
results = strip_catalog_themes(
catalog_dir=catalog_dir,
themes_to_strip=themes_to_strip,
backup=not args.no_backup
)
print(f" Stripped: {results['stripped_count']} themes")
print(f" Files deleted: {len(results['files_deleted'])}")
print(f" Backups created: {len(results['backups_created'])}")
if results['errors']:
print(f" Errors: {len(results['errors'])}")
for error in results['errors'][:5]: # Show first 5 errors
print(f" - {error}")
print()
# Step 5: Create stripped themes log
print("Step 5: Creating stripped themes log...")
create_stripped_themes_log(
output_path=stripped_log_path,
theme_counts=theme_counts,
themes_stripped=themes_to_strip,
min_threshold=args.min_cards,
sources=["catalog YAML"]
)
print(f" Log written to {stripped_log_path}")
print()
print("✅ Catalog stripping complete!")
print()
print(f"Summary:")
print(f" Total themes analyzed: {len(theme_counts)}")
print(f" Themes stripped: {len(themes_to_strip)}")
print(f" Themes remaining: {len(themes_to_keep)}")
print(f" Catalog files deleted: {len(results['files_deleted'])}")
return 0
if __name__ == "__main__":
sys.exit(main())