mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2026-03-24 22:16:31 +01:00
feat: implement theme stripping system with THEME_MIN_CARDS config
This commit is contained in:
parent
1ebc2fcb3c
commit
86ece36012
20 changed files with 6604 additions and 1364 deletions
380
code/scripts/strip_themes.py
Normal file
380
code/scripts/strip_themes.py
Normal file
|
|
@ -0,0 +1,380 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Standalone theme stripping orchestration script.
|
||||
|
||||
This script coordinates the complete theme stripping pipeline:
|
||||
1. Analyze parquet files to identify low-card themes
|
||||
2. Strip from catalog YAML files (optional)
|
||||
3. Strip from parquet themeTags columns (optional)
|
||||
4. Rebuild theme_list.json from stripped parquet data
|
||||
5. Generate stripped_themes.yml log
|
||||
|
||||
Part of Milestone 5 (M5) - Integration & Testing for Theme Stripping (R21).
|
||||
|
||||
Usage:
|
||||
# Dry run to preview changes
|
||||
python code/scripts/strip_themes.py --dry-run
|
||||
|
||||
# Strip everything with default threshold (5 cards)
|
||||
python code/scripts/strip_themes.py
|
||||
|
||||
# Strip only catalog YAML files
|
||||
python code/scripts/strip_themes.py --sources catalog
|
||||
|
||||
# Strip only parquet files
|
||||
python code/scripts/strip_themes.py --sources parquet
|
||||
|
||||
# Custom threshold
|
||||
python code/scripts/strip_themes.py --min-cards 10
|
||||
|
||||
# Skip backups (not recommended)
|
||||
python code/scripts/strip_themes.py --no-backup
|
||||
|
||||
Environment Variables:
|
||||
THEME_MIN_CARDS: Minimum card threshold (default: 5)
|
||||
|
||||
Outputs:
|
||||
- Modified catalog/*.yml files (if --sources includes catalog)
|
||||
- Modified parquet files (if --sources includes parquet)
|
||||
- Regenerated config/themes/theme_list.json
|
||||
- Updated logs/stripped_themes.yml log
|
||||
- Timestamped backups (created by default; skipped with --no-backup)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Set, Dict
|
||||
|
||||
# Add project root to path
|
||||
ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from code import settings as code_settings
|
||||
from code.tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
strip_catalog_themes,
|
||||
strip_parquet_themes,
|
||||
create_stripped_themes_log
|
||||
)
|
||||
|
||||
|
||||
def strip_all_sources(
    min_cards: int,
    sources: Set[str],
    backup: bool,
    dry_run: bool,
    verbose: bool
) -> Dict:
    """
    Execute complete theme stripping pipeline.

    The pipeline analyzes the parquet files under ``card_files/processed``,
    identifies themes below ``min_cards``, strips them from the requested
    sources, rebuilds ``theme_list.json`` (only when parquet is one of the
    sources) and finally updates ``logs/stripped_themes.yml``.

    Failures in the strip/rebuild/log steps are recorded in the returned
    ``errors`` list and the pipeline continues; a failure during analysis
    (or the absence of parquet files) ends the run early.

    Args:
        min_cards: Minimum card count threshold
        sources: Set of sources to strip ('catalog', 'parquet', or both)
        backup: Whether to create backups before modification
        dry_run: Preview changes without modifying files
        verbose: Show detailed output

    Returns:
        Dictionary with stripping results and statistics. Keys:
        ``themes_analyzed``, ``themes_to_strip``, ``catalog_stripped``,
        ``parquet_tags_removed``, ``json_regenerated`` and ``errors``
        (list of error message strings; empty on success).
    """
    start_time = time.time()
    results = {
        "themes_analyzed": 0,
        "themes_to_strip": 0,
        "catalog_stripped": 0,
        "parquet_tags_removed": 0,
        "json_regenerated": False,
        "errors": []
    }

    strip_catalog = 'catalog' in sources
    strip_parquet = 'parquet' in sources

    print("="*70)
    print("THEME STRIPPING PIPELINE")
    print("="*70)
    print("Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Sources: {', '.join(sorted(sources))}")
    print(f" Backup enabled: {backup}")
    print(f" Dry run: {dry_run}")
    print()

    # Step 1: Analyze parquet files
    print("Step 1: Analyzing theme card counts...")
    try:
        parquet_dir = ROOT / "card_files" / "processed"
        parquet_files = sorted(parquet_dir.glob("*.parquet"))

        if not parquet_files:
            error_msg = "No parquet files found in card_files/processed/"
            # Report the problem to the user as well; otherwise the script
            # exits non-zero without any explanation.
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)
            return results

        theme_counts = get_theme_card_counts(parquet_files)
        results["themes_analyzed"] = len(theme_counts)
        print(f" Found {len(theme_counts)} unique themes")

        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
        results["themes_to_strip"] = len(themes_to_strip)
        print(f" Identified {len(themes_to_strip)} themes below threshold")

        if verbose and themes_to_strip:
            sample = sorted(themes_to_strip)[:5]
            print(f" Sample themes: {', '.join(sample)}")
            if len(themes_to_strip) > 5:
                print(f" ... and {len(themes_to_strip) - 5} more")

        if not themes_to_strip:
            print("\n✅ No themes below threshold. Nothing to strip.")
            return results

    except Exception as e:
        # Without analysis results nothing else can run, so stop here.
        error_msg = f"Analysis failed: {e}"
        print(f" ❌ {error_msg}")
        results["errors"].append(error_msg)
        return results

    print()

    # Dry run mode: describe what would happen, touch nothing.
    if dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        if strip_catalog:
            print("Would strip from catalog YAML files:")
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            yaml_files = sorted(catalog_dir.glob("*.yml"))
            for yaml_file in yaml_files[:5]:
                print(f" - {yaml_file.name}")
            if len(yaml_files) > 5:
                print(f" ... and {len(yaml_files) - 5} more")

        if strip_parquet:
            print("\nWould strip from parquet files:")
            for pf in parquet_files[:3]:
                print(f" - {pf.name}")
            if len(parquet_files) > 3:
                print(f" ... and {len(parquet_files) - 3} more")

        print(f"\nWould strip {len(themes_to_strip)} themes total")
        print("Would regenerate theme_list.json")
        print("Would update stripped_themes.yml log")
        return results

    # Running step number for the progress output; step 1 was the analysis.
    step_num = 1

    # Strip from catalog (if requested)
    # NOTE: Catalog YAML must be stripped BEFORE building theme_list.json,
    # otherwise build_theme_catalog.py will read un-stripped themes from YAML
    if strip_catalog:
        step_num += 1
        print(f"Step {step_num}: Stripping from catalog YAML files...")
        try:
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            catalog_results = strip_catalog_themes(
                catalog_dir=catalog_dir,
                themes_to_strip=themes_to_strip,
                backup=backup
            )

            results["catalog_stripped"] = catalog_results["files_modified"]

            if verbose:
                print(f" Files modified: {catalog_results['files_modified']}")
                print(f" Themes removed: {catalog_results['themes_removed']}")
                if catalog_results["backups_created"]:
                    print(f" Backups created: {len(catalog_results['backups_created'])}")
            else:
                print(f" ✓ Stripped {catalog_results['themes_removed']} themes from {catalog_results['files_modified']} files")

            results["errors"].extend(catalog_results["errors"])

        except Exception as e:
            error_msg = f"Catalog stripping failed: {e}"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Strip from parquet (if requested)
    if strip_parquet:
        step_num += 1
        print(f"Step {step_num}: Stripping from parquet files...")
        try:
            for parquet_file in parquet_files:
                if verbose:
                    print(f" Processing: {parquet_file.name}")

                parquet_results = strip_parquet_themes(
                    parquet_path=parquet_file,
                    themes_to_strip=themes_to_strip,
                    backup=backup
                )

                results["parquet_tags_removed"] += parquet_results["tags_removed"]
                results["errors"].extend(parquet_results["errors"])

                if verbose and parquet_results["tags_removed"] > 0:
                    print(f" Removed {parquet_results['tags_removed']} tag occurrences")

            if not verbose:
                print(f" ✓ Removed {results['parquet_tags_removed']} tag occurrences from {len(parquet_files)} file(s)")

        except Exception as e:
            error_msg = f"Parquet stripping failed: {e}"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Rebuild theme_list.json (if parquet was stripped)
    # NOTE: This reads from both parquet AND catalog YAML, so both must be stripped first
    if strip_parquet:
        step_num += 1
        print(f"Step {step_num}: Rebuilding theme_list.json...")
        try:
            # Import build script lazily so it is only loaded when needed
            from code.scripts.build_theme_catalog import main as build_main

            # Suppress verbose build output unless --verbose flag
            import io
            import contextlib

            # NOTE(review): build_main() is called without arguments; if it
            # parses sys.argv itself it would see this script's flags, and a
            # SystemExit raised there is not caught below -- confirm.
            if not verbose:
                with contextlib.redirect_stdout(io.StringIO()):
                    build_main()
            else:
                build_main()

            results["json_regenerated"] = True
            print(" ✓ theme_list.json regenerated")

        except Exception as e:
            error_msg = f"JSON regeneration failed: {e}"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Final step: update stripped themes log
    step_num += 1
    print(f"Step {step_num}: Updating stripped_themes.yml log...")
    try:
        log_path = ROOT / "logs" / "stripped_themes.yml"
        source_labels = []
        if strip_catalog:
            source_labels.append("catalog YAML")
        if strip_parquet:
            source_labels.append("parquet files")

        create_stripped_themes_log(
            output_path=log_path,
            theme_counts=theme_counts,
            themes_stripped=themes_to_strip,
            min_threshold=min_cards,
            sources=source_labels if source_labels else None
        )
        print(f" ✓ Log updated: {log_path}")

    except Exception as e:
        error_msg = f"Log update failed: {e}"
        print(f" ❌ {error_msg}")
        results["errors"].append(error_msg)

    # Final summary
    elapsed = time.time() - start_time
    print()
    print("="*70)
    print("SUMMARY")
    print("="*70)
    print(f"Themes analyzed: {results['themes_analyzed']}")
    print(f"Themes stripped: {results['themes_to_strip']}")
    if strip_catalog:
        print(f"Catalog files modified: {results['catalog_stripped']}")
    if strip_parquet:
        print(f"Parquet tags removed: {results['parquet_tags_removed']}")
    print(f"JSON regenerated: {'Yes' if results['json_regenerated'] else 'No'}")
    print(f"Time elapsed: {elapsed:.2f}s")

    if results["errors"]:
        print(f"\n⚠️ Errors encountered: {len(results['errors'])}")
        for error in results["errors"]:
            print(f" - {error}")
    else:
        print("\n✅ Theme stripping completed successfully!")

    return results
|
||||
|
||||
|
||||
def main(argv=None):
    """
    Command-line entry point for the theme stripping pipeline.

    Parses the command-line flags, resolves the card threshold and the set
    of sources to strip, then delegates to ``strip_all_sources``.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]`` (the default when ``None``).

    Returns:
        Process exit code: 0 when the pipeline reported no errors, 1 when
        the ``--sources`` value is invalid or any step recorded an error.
    """
    parser = argparse.ArgumentParser(
        description="Orchestrate complete theme stripping pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument(
        '--min-cards',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )

    parser.add_argument(
        '--sources',
        type=str,
        help='Comma-separated list of sources to strip: catalog, parquet, all (default: all)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )

    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )

    args = parser.parse_args(argv)

    # Determine threshold. Compare against None rather than truthiness so an
    # explicit "--min-cards 0" is honoured instead of silently falling back
    # to the configured default.
    if args.min_cards is not None:
        min_cards = args.min_cards
    else:
        min_cards = code_settings.THEME_MIN_CARDS

    # Determine sources
    valid_sources = {'catalog', 'parquet'}
    if args.sources:
        # Ignore empty tokens produced by stray commas/whitespace.
        requested = {
            token.strip()
            for token in args.sources.lower().split(',')
            if token.strip()
        }

        # 'all' is shorthand for every valid source, alone or within a list.
        if 'all' in requested:
            requested = (requested - {'all'}) | valid_sources

        invalid = requested - valid_sources
        if invalid or not requested:
            if invalid:
                print(f"Error: Invalid sources: {', '.join(sorted(invalid))}")
            else:
                print("Error: No sources specified")
            print(f"Valid sources: {', '.join(sorted(valid_sources))}, all")
            return 1

        sources = requested
    else:
        sources = set(valid_sources)  # Default: all sources

    # Execute pipeline
    results = strip_all_sources(
        min_cards=min_cards,
        sources=sources,
        backup=not args.no_backup,
        dry_run=args.dry_run,
        verbose=args.verbose
    )

    # Return exit code
    return 0 if not results["errors"] else 1
|
||||
|
||||
|
||||
# Script entry point: exit the process with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue