mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2026-03-24 14:06:31 +01:00
380 lines
13 KiB
Python
380 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Standalone theme stripping orchestration script.
|
|
|
|
This script coordinates the complete theme stripping pipeline:
|
|
1. Analyze parquet files to identify low-card themes
|
|
2. Strip from catalog YAML files (optional)
|
|
3. Strip from parquet themeTags columns (optional)
|
|
4. Rebuild theme_list.json from stripped parquet data
|
|
5. Generate stripped_themes.yml log
|
|
|
|
Part of Milestone 5 (M5) - Integration & Testing for Theme Stripping (R21).
|
|
|
|
Usage:
|
|
# Dry run to preview changes
|
|
python code/scripts/strip_themes.py --dry-run
|
|
|
|
# Strip everything with default threshold (5 cards)
|
|
python code/scripts/strip_themes.py
|
|
|
|
# Strip only catalog YAML files
|
|
python code/scripts/strip_themes.py --sources catalog
|
|
|
|
# Strip only parquet files
|
|
python code/scripts/strip_themes.py --sources parquet
|
|
|
|
# Custom threshold
|
|
python code/scripts/strip_themes.py --min-cards 10
|
|
|
|
# Skip backups (not recommended)
|
|
python code/scripts/strip_themes.py --no-backup
|
|
|
|
Environment Variables:
|
|
THEME_MIN_CARDS: Minimum card threshold (default: 5)
|
|
|
|
Outputs:
|
|
- Modified catalog/*.yml files (if --sources includes catalog)
|
|
- Modified parquet files (if --sources includes parquet)
|
|
- Regenerated config/themes/theme_list.json
|
|
- Updated logs/stripped_themes.yml log
|
|
- Timestamped backups (created by default; skipped with --no-backup)
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Set, Dict
|
|
|
|
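# Add the project root (three directories above this file) to sys.path so the
# `code` package imports below resolve when this file is run as a script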
|
|
ROOT = Path(__file__).resolve().parent.parent.parent
|
|
sys.path.insert(0, str(ROOT))
|
|
|
|
from code import settings as code_settings
|
|
from code.tagging.theme_stripper import (
|
|
get_theme_card_counts,
|
|
identify_themes_to_strip,
|
|
strip_catalog_themes,
|
|
strip_parquet_themes,
|
|
create_stripped_themes_log
|
|
)
|
|
|
|
|
|
def _print_dry_run_preview(sources: Set[str], parquet_files: list, themes_to_strip) -> None:
    """Print what a real run would touch, without modifying any file.

    Args:
        sources: Selected sources ('catalog' and/or 'parquet').
        parquet_files: Parquet paths found during analysis.
        themes_to_strip: Themes identified as below the threshold.
    """
    print("DRY RUN MODE - No files will be modified")
    print()
    if 'catalog' in sources:
        print("Would strip from catalog YAML files:")
        catalog_dir = ROOT / "config" / "themes" / "catalog"
        yaml_files = sorted(catalog_dir.glob("*.yml"))
        # Only a short sample is listed to keep the preview readable.
        for yaml_file in yaml_files[:5]:
            print(f" - {yaml_file.name}")
        if len(yaml_files) > 5:
            print(f" ... and {len(yaml_files) - 5} more")

    if 'parquet' in sources:
        print("\nWould strip from parquet files:")
        for pf in parquet_files[:3]:
            print(f" - {pf.name}")
        if len(parquet_files) > 3:
            print(f" ... and {len(parquet_files) - 3} more")

    print(f"\nWould strip {len(themes_to_strip)} themes total")
    # A real run only rebuilds theme_list.json when parquet is stripped, so the
    # preview must not promise a rebuild for catalog-only runs.
    if 'parquet' in sources:
        print("Would regenerate theme_list.json")
    print("Would update stripped_themes.yml log")


def _print_summary(results: Dict, sources: Set[str], elapsed: float) -> None:
    """Print the end-of-run summary, including any recorded errors."""
    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Themes analyzed: {results['themes_analyzed']}")
    print(f"Themes stripped: {results['themes_to_strip']}")
    if 'catalog' in sources:
        print(f"Catalog files modified: {results['catalog_stripped']}")
    if 'parquet' in sources:
        print(f"Parquet tags removed: {results['parquet_tags_removed']}")
    print(f"JSON regenerated: {'Yes' if results['json_regenerated'] else 'No'}")
    print(f"Time elapsed: {elapsed:.2f}s")

    if results["errors"]:
        print(f"\n⚠️ Errors encountered: {len(results['errors'])}")
        for error in results["errors"]:
            print(f" - {error}")
    else:
        print("\n✅ Theme stripping completed successfully!")


def strip_all_sources(
    min_cards: int,
    sources: Set[str],
    backup: bool,
    dry_run: bool,
    verbose: bool
) -> Dict:
    """
    Execute complete theme stripping pipeline.

    The pipeline analyzes the parquet files under ``card_files/processed``,
    then (depending on ``sources``) strips low-card themes from the catalog
    YAML files and/or the parquet files, rebuilds ``theme_list.json`` when
    parquet was stripped, and finally writes ``logs/stripped_themes.yml``.
    Progress is reported on stdout. Failures inside a step are caught,
    printed, and appended to ``results["errors"]`` so later steps still run;
    only a failed analysis aborts the pipeline.

    Args:
        min_cards: Minimum card count threshold
        sources: Set of sources to strip ('catalog', 'parquet', or both)
        backup: Whether to create backups before modification
        dry_run: Preview changes without modifying files
        verbose: Show detailed output

    Returns:
        Dictionary with stripping results and statistics. Keys:
        ``themes_analyzed``, ``themes_to_strip``, ``catalog_stripped``,
        ``parquet_tags_removed``, ``json_regenerated`` and ``errors``.
        The function returns early (before any file is modified) when no
        parquet files exist, when analysis fails, when no theme is below the
        threshold, or when ``dry_run`` is set.
    """
    start_time = time.time()
    results = {
        "themes_analyzed": 0,
        "themes_to_strip": 0,
        "catalog_stripped": 0,
        "parquet_tags_removed": 0,
        "json_regenerated": False,
        "errors": []
    }

    # Steps are numbered as they actually run, so skipping a source never
    # leaves a gap or a wrong number in the output.
    step = 0

    def announce(title: str) -> None:
        nonlocal step
        step += 1
        print(f"Step {step}: {title}")

    print("=" * 70)
    print("THEME STRIPPING PIPELINE")
    print("=" * 70)
    print("Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Sources: {', '.join(sorted(sources))}")
    print(f" Backup enabled: {backup}")
    print(f" Dry run: {dry_run}")
    print()

    # Step 1: Analyze parquet files
    announce("Analyzing theme card counts...")
    try:
        parquet_dir = ROOT / "card_files" / "processed"
        parquet_files = sorted(parquet_dir.glob("*.parquet"))

        if not parquet_files:
            # Report the problem on stdout as well; otherwise the caller only
            # sees a non-zero exit code with no explanation.
            error_msg = "No parquet files found in card_files/processed/"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)
            return results

        theme_counts = get_theme_card_counts(parquet_files)
        results["themes_analyzed"] = len(theme_counts)
        print(f" Found {len(theme_counts)} unique themes")

        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
        results["themes_to_strip"] = len(themes_to_strip)
        print(f" Identified {len(themes_to_strip)} themes below threshold")

        if verbose and themes_to_strip:
            sample = sorted(themes_to_strip)[:5]
            print(f" Sample themes: {', '.join(sample)}")
            if len(themes_to_strip) > 5:
                print(f" ... and {len(themes_to_strip) - 5} more")

        if not themes_to_strip:
            print("\n✅ No themes below threshold. Nothing to strip.")
            return results

    except Exception as e:
        # Nothing downstream can run without the analysis, so abort here.
        error_msg = f"Analysis failed: {e}"
        print(f" ❌ {error_msg}")
        results["errors"].append(error_msg)
        return results

    print()

    # Dry run mode
    if dry_run:
        _print_dry_run_preview(sources, parquet_files, themes_to_strip)
        return results

    # Strip from catalog (if requested)
    # NOTE: Catalog YAML must be stripped BEFORE building theme_list.json,
    # otherwise build_theme_catalog.py will read un-stripped themes from YAML
    if 'catalog' in sources:
        announce("Stripping from catalog YAML files...")
        try:
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            catalog_results = strip_catalog_themes(
                catalog_dir=catalog_dir,
                themes_to_strip=themes_to_strip,
                backup=backup
            )

            results["catalog_stripped"] = catalog_results["files_modified"]

            if verbose:
                print(f" Files modified: {catalog_results['files_modified']}")
                print(f" Themes removed: {catalog_results['themes_removed']}")
                if catalog_results["backups_created"]:
                    print(f" Backups created: {len(catalog_results['backups_created'])}")
            else:
                print(f" ✓ Stripped {catalog_results['themes_removed']} themes from {catalog_results['files_modified']} files")

            results["errors"].extend(catalog_results["errors"])

        except Exception as e:
            error_msg = f"Catalog stripping failed: {e}"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Strip from parquet (if requested)
    if 'parquet' in sources:
        announce("Stripping from parquet files...")
        try:
            for parquet_file in parquet_files:
                if verbose:
                    print(f" Processing: {parquet_file.name}")

                parquet_results = strip_parquet_themes(
                    parquet_path=parquet_file,
                    themes_to_strip=themes_to_strip,
                    backup=backup
                )

                results["parquet_tags_removed"] += parquet_results["tags_removed"]
                results["errors"].extend(parquet_results["errors"])

                if verbose and parquet_results["tags_removed"] > 0:
                    print(f" Removed {parquet_results['tags_removed']} tag occurrences")

            if not verbose:
                print(f" ✓ Removed {results['parquet_tags_removed']} tag occurrences from {len(parquet_files)} file(s)")

        except Exception as e:
            error_msg = f"Parquet stripping failed: {e}"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Rebuild theme_list.json (if parquet was stripped)
    # NOTE: This reads from both parquet AND catalog YAML, so both must be stripped first
    if 'parquet' in sources:
        announce("Rebuilding theme_list.json...")
        try:
            # Imported lazily so an import failure is recorded as a pipeline
            # error instead of breaking the whole script at load time.
            from code.scripts.build_theme_catalog import main as build_main

            import io
            import contextlib

            # Suppress verbose build output unless --verbose flag
            if verbose:
                output_guard = contextlib.nullcontext()
            else:
                output_guard = contextlib.redirect_stdout(io.StringIO())

            # NOTE(review): if build_main parses sys.argv or calls sys.exit(),
            # the resulting SystemExit is not an Exception subclass and will
            # bypass the handler below - confirm against build_theme_catalog.
            with output_guard:
                build_main()

            results["json_regenerated"] = True
            print(" ✓ theme_list.json regenerated")

        except Exception as e:
            error_msg = f"JSON regeneration failed: {e}"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Update stripped themes log
    announce("Updating stripped_themes.yml log...")
    try:
        log_path = ROOT / "logs" / "stripped_themes.yml"
        source_labels = []
        if 'catalog' in sources:
            source_labels.append("catalog YAML")
        if 'parquet' in sources:
            source_labels.append("parquet files")

        create_stripped_themes_log(
            output_path=log_path,
            theme_counts=theme_counts,
            themes_stripped=themes_to_strip,
            min_threshold=min_cards,
            sources=source_labels or None
        )
        print(f" ✓ Log updated: {log_path}")

    except Exception as e:
        error_msg = f"Log update failed: {e}"
        print(f" ❌ {error_msg}")
        results["errors"].append(error_msg)

    # Final summary
    _print_summary(results, sources, time.time() - start_time)

    return results
|
|
|
|
|
|
def main(argv=None):
    """Parse command-line options and run the theme stripping pipeline.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read the process command line, which is
            the behaviour when the script is run directly.

    Returns:
        Exit code for the process: 0 when the pipeline recorded no errors,
        1 when ``--sources`` is invalid or any pipeline step reported an error.
    """
    valid_sources = {'catalog', 'parquet'}

    parser = argparse.ArgumentParser(
        description="Orchestrate complete theme stripping pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument(
        '--min-cards',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )

    parser.add_argument(
        '--sources',
        type=str,
        help='Comma-separated list of sources to strip: catalog, parquet, all (default: all)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )

    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )

    args = parser.parse_args(argv)

    # Determine threshold. Compare against None rather than relying on
    # truthiness so an explicit "--min-cards 0" is not silently replaced by
    # the configured default.
    if args.min_cards is not None:
        min_cards = args.min_cards
    else:
        min_cards = code_settings.THEME_MIN_CARDS

    # Determine sources
    if args.sources:
        requested = {token.strip() for token in args.sources.lower().split(',')}
        # Tolerate stray commas such as "catalog," or "catalog,,parquet".
        requested.discard('')
        # "all" may appear alone or alongside explicit names.
        if 'all' in requested:
            requested = (requested - {'all'}) | valid_sources

        invalid = requested - valid_sources
        if invalid or not requested:
            # Sorted so the message is stable from run to run.
            print(f"Error: Invalid sources: {', '.join(sorted(invalid))}")
            print(f"Valid sources: {', '.join(sorted(valid_sources))}, all")
            return 1
        sources = requested
    else:
        sources = set(valid_sources)  # Default: all sources

    # Execute pipeline
    results = strip_all_sources(
        min_cards=min_cards,
        sources=sources,
        backup=not args.no_backup,
        dry_run=args.dry_run,
        verbose=args.verbose
    )

    # Return exit code
    return 1 if results["errors"] else 0
|
|
|
|
|
|
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|