mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2026-03-25 06:26:31 +01:00
208 lines
7.5 KiB
Python
208 lines
7.5 KiB
Python
|
|
"""
|
||
|
|
Theme Distribution Analysis Script
|
||
|
|
|
||
|
|
Analyzes theme distribution across the card catalog and generates reports
|
||
|
|
showing which themes would be stripped based on minimum card thresholds.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python -m code.scripts.analyze_theme_distribution [--min-cards N] [--output FILE]
|
||
|
|
|
||
|
|
Arguments:
|
||
|
|
--min-cards N Minimum card threshold (default: from THEME_MIN_CARDS setting)
|
||
|
|
--output FILE Output file path (default: logs/theme_stripping_analysis.txt)
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime
|
||
|
|
from typing import Dict, Set
|
||
|
|
|
||
|
|
# Add project root to path
|
||
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||
|
|
|
||
|
|
from code.settings import THEME_MIN_CARDS, CARD_FILES_PROCESSED_DIR
|
||
|
|
from code.tagging.theme_stripper import (
|
||
|
|
get_theme_card_counts,
|
||
|
|
identify_themes_to_strip,
|
||
|
|
get_theme_distribution,
|
||
|
|
get_themes_by_count
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
def analyze_theme_distribution(min_cards: int | None = None, output_path: str | None = None) -> None:
    """
    Analyze theme distribution and generate a report.

    Globs ``*.parquet`` files from ``CARD_FILES_PROCESSED_DIR``, builds the
    theme -> card count mapping with ``get_theme_card_counts``, writes a
    plain-text report to ``output_path`` and prints a short summary to stdout.

    Args:
        min_cards: Minimum card threshold (defaults to THEME_MIN_CARDS setting)
        output_path: Path to output file (defaults to logs/theme_stripping_analysis.txt)

    Raises:
        SystemExit: With status 1 when the processed directory does not exist,
            contains no parquet files, or yields no themes.
    """
    if min_cards is None:
        min_cards = THEME_MIN_CARDS
    if output_path is None:
        output_path = "logs/theme_stripping_analysis.txt"

    print(f"Analyzing theme distribution (min_cards={min_cards})...")

    # Find all parquet files
    processed_dir = Path(CARD_FILES_PROCESSED_DIR)
    if not processed_dir.exists():
        print(f"Error: Processed cards directory not found: {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)

    parquet_files = list(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)

    print(f"Found {len(parquet_files)} parquet files to analyze")

    # Build theme counts
    print("Building theme -> card count mapping...")
    theme_counts = get_theme_card_counts(parquet_files)
    if not theme_counts:
        print("Error: No themes found in parquet files")
        sys.exit(1)

    print(f"Found {len(theme_counts)} unique themes")

    themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    distribution = get_theme_distribution(theme_counts)
    below_threshold = get_themes_by_count(theme_counts, min_cards)

    total_themes = distribution['total']
    strip_count = len(themes_to_strip)
    # Computed once and reused by the report and the console summary. The
    # zero guard keeps a degenerate distribution from raising
    # ZeroDivisionError half-way through writing the report.
    strip_pct = strip_count / total_themes * 100 if total_themes else 0.0

    report_lines = _build_theme_report_lines(
        min_cards=min_cards,
        processed_dir=processed_dir,
        parquet_count=len(parquet_files),
        distribution=distribution,
        strip_count=strip_count,
        strip_pct=strip_pct,
        below_threshold=below_threshold,
    )

    # Generate report
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        # One batched write instead of dozens of tiny ones.
        f.writelines(report_lines)

    print(f"\nReport generated: {output_file}")
    print("\nSummary:")
    print(f" Total themes: {total_themes}")
    print(f" Themes to strip: {strip_count} ({strip_pct:.1f}%)")
    print(f" Themes to keep: {total_themes - strip_count}")

    # Print distribution
    print("\nDistribution:")
    for row in _format_distribution_rows(distribution):
        print(row)


def _format_distribution_rows(distribution) -> list[str]:
    """Return one display row (no trailing newline) per card-count bucket."""
    buckets = (
        ("1 card", '1_card'),
        ("2 cards", '2_cards'),
        ("3-4 cards", '3_4_cards'),
        ("5-9 cards", '5_9_cards'),
        ("10+ cards", '10_plus'),
    )
    return [f" {label}: {distribution[key]:4d} themes" for label, key in buckets]


def _build_theme_report_lines(
    min_cards,
    processed_dir,
    parquet_count,
    distribution,
    strip_count,
    strip_pct,
    below_threshold,
) -> list[str]:
    """Render the full text report as a list of newline-terminated strings."""
    heavy_rule = "=" * 80 + "\n"
    light_rule = "-" * 80 + "\n"
    total_themes = distribution['total']

    lines = [
        # Header
        heavy_rule,
        "THEME DISTRIBUTION ANALYSIS REPORT\n",
        heavy_rule,
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"Minimum Card Threshold: {min_cards}\n",
        f"Source: {processed_dir}\n",
        f"Parquet Files Analyzed: {parquet_count}\n",
        heavy_rule,
        "\n",
        # Summary statistics
        "SUMMARY STATISTICS\n",
        light_rule,
        f"Total Themes: {total_themes}\n",
        f"Themes to Strip (< {min_cards} cards): {strip_count}\n",
        f"Themes to Keep (>= {min_cards} cards): {total_themes - strip_count}\n",
        f"Percentage to Strip: {strip_pct:.1f}%\n",
        "\n",
        # Distribution by card count
        "DISTRIBUTION BY CARD COUNT\n",
        light_rule,
    ]
    lines.extend(row + "\n" for row in _format_distribution_rows(distribution))
    lines.append(f" Total: {total_themes:4d} themes\n")
    lines.append("\n")

    # Themes below threshold
    if below_threshold:
        lines.append(f"THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
        lines.append(heavy_rule)
        lines.append(f"Total: {len(below_threshold)} themes\n\n")
        for theme_id, count, card_list in below_threshold:
            lines.append(f"Theme: {theme_id}\n")
            lines.append(f"Card Count: {count}\n")
            lines.append("Cards:\n")
            lines.extend(f" - {card}\n" for card in card_list)
            lines.append("\n")  # blank line between themes
    else:
        lines.append(f"NO THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
        lines.append(heavy_rule)
        lines.append("All themes meet the minimum card requirement.\n\n")

    # Recommendations
    lines.append("RECOMMENDATIONS\n")
    lines.append(heavy_rule)
    if strip_count > 0:
        lines.append(f"• {strip_count} themes should be stripped\n")
        lines.append(f"• This represents {strip_pct:.1f}% of the catalog\n")
        lines.append("• Run theme stripping to remove these low-viability themes\n")
        lines.append("• Consider adjusting THEME_MIN_CARDS if this seems too aggressive\n")
    else:
        lines.append(f"• No themes below threshold (all themes have >= {min_cards} cards)\n")
        lines.append("• Consider lowering THEME_MIN_CARDS if you want to strip more themes\n")
    lines.append("\n")

    # Footer
    lines.extend((heavy_rule, "END OF REPORT\n", heavy_rule))
    return lines
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Parse command-line options and run the theme distribution analysis."""
    cli = argparse.ArgumentParser(
        description="Analyze theme distribution and identify themes below minimum card threshold"
    )
    # Both options default to None so that analyze_theme_distribution
    # applies its own fallback values.
    cli.add_argument(
        '--min-cards',
        type=int,
        default=None,
        help=f'Minimum card threshold (default: {THEME_MIN_CARDS} from THEME_MIN_CARDS setting)',
    )
    cli.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output file path (default: logs/theme_stripping_analysis.txt)',
    )
    options = cli.parse_args()

    try:
        analyze_theme_distribution(min_cards=options.min_cards, output_path=options.output)
    except KeyboardInterrupt:
        print("\nAnalysis cancelled by user")
        sys.exit(1)
    except Exception as exc:
        # Top-level boundary: report the failure with a traceback and exit non-zero.
        import traceback

        print(f"\nError during analysis: {exc}")
        traceback.print_exc()
        sys.exit(1)
|
||
|
|
|
||
|
|
|
||
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|