mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2026-03-24 14:06:31 +01:00
feat: implement theme stripping system with THEME_MIN_CARDS config
This commit is contained in:
parent
1ebc2fcb3c
commit
86ece36012
20 changed files with 6604 additions and 1364 deletions
253
code/scripts/strip_parquet_themes.py
Normal file
253
code/scripts/strip_parquet_themes.py
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Strip low-card themes from parquet file themeTags columns.
|
||||
|
||||
This script identifies and removes themes below the THEME_MIN_CARDS threshold
|
||||
from the themeTags column in parquet files. It's part of Milestone 4 (M4) of
|
||||
the Theme Stripping roadmap (R21).
|
||||
|
||||
Usage:
|
||||
# Dry run to see what would be stripped
|
||||
python code/scripts/strip_parquet_themes.py --dry-run
|
||||
|
||||
# Strip from single parquet file
|
||||
python code/scripts/strip_parquet_themes.py --file card_files/processed/all_cards.parquet
|
||||
|
||||
# Strip from all parquet files in directory
|
||||
python code/scripts/strip_parquet_themes.py --all
|
||||
|
||||
# Specify custom threshold
|
||||
python code/scripts/strip_parquet_themes.py --threshold 10 --all
|
||||
|
||||
Environment Variables:
|
||||
THEME_MIN_CARDS: Minimum card threshold (default: 5)
|
||||
|
||||
Outputs:
|
||||
- Modified parquet file(s) with stripped themeTags
|
||||
- Timestamped backup (.parquet.bak), created by default; pass --no-backup to skip it
|
||||
- Updated logs/stripped_themes.yml log
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Add the project root (three directory levels above this script) to sys.path
# so the `code.*` project imports below resolve when this file is run directly
# as a script rather than as part of an installed package.
# NOTE(review): the standard library also ships a module named `code`; the
# root is presumably inserted at index 0 (not appended) so the project package
# is found first - confirm before changing the insertion position.
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from code import settings as code_settings
|
||||
from code.tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
strip_parquet_themes,
|
||||
create_stripped_themes_log
|
||||
)
|
||||
|
||||
|
||||
def find_parquet_files(directory: Path) -> list[Path]:
    """Return the ``*.parquet`` files directly inside *directory*, sorted.

    The search is not recursive: only entries of ``directory`` itself are
    matched, never files in subdirectories.
    """
    matches = list(directory.glob("*.parquet"))
    matches.sort()
    return matches
|
||||
|
||||
|
||||
def update_stripped_themes_log(
    theme_counts: dict,
    themes_to_strip: set[str],
    min_cards: int
) -> None:
    """Record the parquet stripping results in ``logs/stripped_themes.yml``.

    Args:
        theme_counts: Per-theme card counts, forwarded unchanged to the log
            writer.
        themes_to_strip: Names of the themes that were selected for removal.
        min_cards: Threshold that was used to select those themes.
    """
    destination = ROOT.joinpath("logs", "stripped_themes.yml")

    # The log writer takes a list of sources; this script only ever reports
    # parquet files as the origin of the stripped themes.
    log_sources = ["parquet files"]
    create_stripped_themes_log(
        output_path=destination,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=min_cards,
        sources=log_sources
    )

    print(f"\nUpdated stripped themes log: {destination}")
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the parquet theme-stripping script."""
    parser = argparse.ArgumentParser(
        description="Strip low-card themes from parquet themeTags columns",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument(
        '--file',
        type=Path,
        help='Specific parquet file to process'
    )

    parser.add_argument(
        '--all',
        action='store_true',
        help='Process all parquet files in card_files/processed/'
    )

    parser.add_argument(
        '--threshold',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )

    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )

    return parser


def main() -> int:
    """Run the theme-stripping command-line workflow.

    Steps: parse arguments, choose the parquet files to process (``--file``
    takes precedence over ``--all``; with neither, the default
    ``card_files/processed/all_cards.parquet`` is used), count cards per
    theme, select the themes below the threshold, then either report what
    would happen (``--dry-run``) or strip each file and update the stripped
    themes log.

    Returns:
        Process exit code: ``0`` on success (including a dry run or when no
        theme is below the threshold), ``1`` when an input file is missing,
        analysis fails, or any file reported an error while being processed.
    """
    args = _build_parser().parse_args()

    # Determine threshold. Compare against None rather than relying on
    # truthiness: an explicit "--threshold 0" must be honoured instead of
    # silently falling back to the configured default.
    if args.threshold is not None:
        min_cards = args.threshold
    else:
        min_cards = code_settings.THEME_MIN_CARDS

    # Determine which files to process
    if args.file:
        if not args.file.exists():
            print(f"Error: File not found: {args.file}")
            return 1
        parquet_files = [args.file]
    elif args.all:
        processed_dir = ROOT / "card_files" / "processed"
        parquet_files = find_parquet_files(processed_dir)
        if not parquet_files:
            print(f"No parquet files found in {processed_dir}")
            return 1
    else:
        # Default: process all_cards.parquet
        default_file = ROOT / "card_files" / "processed" / "all_cards.parquet"
        if not default_file.exists():
            print(f"Error: Default file not found: {default_file}")
            print("Use --file or --all to specify files to process")
            return 1
        parquet_files = [default_file]

    print("Theme Stripping Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Files to process: {len(parquet_files)}")
    print(f" Backup enabled: {not args.no_backup}")
    print(f" Dry run: {args.dry_run}")
    print()

    # Get theme card counts from parquet files. The broad except is
    # deliberate: this is the CLI boundary, so any failure is reported to the
    # user and turned into a non-zero exit code instead of a traceback.
    print("Analyzing theme card counts...")
    try:
        theme_counts = get_theme_card_counts(parquet_files)
        print(f"Found {len(theme_counts)} unique themes across files")
    except Exception as e:
        print(f"Error analyzing theme counts: {e}")
        return 1

    # Identify themes to strip
    print("Identifying themes to strip...")
    try:
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    except Exception as e:
        print(f"Error identifying themes to strip: {e}")
        return 1

    if not themes_to_strip:
        print("No themes found below threshold. Nothing to strip.")
        return 0

    print(f"Found {len(themes_to_strip)} themes to strip")

    if args.verbose:
        # Show at most ten names, alphabetically, to keep the output short.
        sample = sorted(themes_to_strip)[:10]
        print(f"Sample themes: {', '.join(sample)}")
        if len(themes_to_strip) > 10:
            print(f" ... and {len(themes_to_strip) - 10} more")

    print()

    # Dry run mode: report the plan and stop before anything is modified.
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        for parquet_file in parquet_files:
            print(f"Would process: {parquet_file}")
        print(f"\nWould strip {len(themes_to_strip)} themes from themeTags column")
        return 0

    # Process each parquet file, accumulating totals across files.
    total_results = {
        "files_processed": 0,
        "cards_processed": 0,
        "tags_removed": 0,
        "errors": []
    }

    for parquet_file in parquet_files:
        print(f"Processing: {parquet_file.name}")

        try:
            results = strip_parquet_themes(
                parquet_path=parquet_file,
                themes_to_strip=themes_to_strip,
                backup=not args.no_backup
            )

            total_results["files_processed"] += 1
            total_results["cards_processed"] += results["cards_processed"]
            total_results["tags_removed"] += results["tags_removed"]
            total_results["errors"].extend(results["errors"])

            if args.verbose:
                print(f" Cards: {results['cards_processed']}")
                print(f" Tags removed: {results['tags_removed']}")
                if results["backup_created"]:
                    print(f" Backup: {results['backup_created']}")

        except Exception as e:
            # One failing file must not abort the batch: record the error and
            # move on to the next file.
            error_msg = f"Error processing {parquet_file}: {e}"
            print(f" {error_msg}")
            total_results["errors"].append(error_msg)
            continue

        print()

    # Update stripped themes log. A logging failure is only a warning: the
    # parquet files have already been rewritten at this point.
    try:
        update_stripped_themes_log(theme_counts, themes_to_strip, min_cards)
    except Exception as e:
        print(f"Warning: Failed to update stripped themes log: {e}")

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Files processed: {total_results['files_processed']}")
    print(f"Cards processed: {total_results['cards_processed']}")
    print(f"Tags removed: {total_results['tags_removed']}")
    print(f"Themes stripped: {len(themes_to_strip)}")

    if total_results["errors"]:
        print(f"\nErrors encountered: {len(total_results['errors'])}")
        for error in total_results["errors"]:
            print(f" - {error}")
    else:
        print("\nStripping completed successfully!")

    return 1 if total_results["errors"] else 0
|
||||
|
||||
|
||||
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue