mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2026-03-25 06:26:31 +01:00
feat: implement theme stripping system with THEME_MIN_CARDS config
This commit is contained in:
parent
1ebc2fcb3c
commit
86ece36012
20 changed files with 6604 additions and 1364 deletions
207
code/scripts/analyze_theme_distribution.py
Normal file
207
code/scripts/analyze_theme_distribution.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
"""
|
||||
Theme Distribution Analysis Script
|
||||
|
||||
Analyzes theme distribution across the card catalog and generates reports
|
||||
showing which themes would be stripped based on minimum card thresholds.
|
||||
|
||||
Usage:
|
||||
python -m code.scripts.analyze_theme_distribution [--min-cards N] [--output FILE]
|
||||
|
||||
Arguments:
|
||||
--min-cards N Minimum card threshold (default: from THEME_MIN_CARDS setting)
|
||||
--output FILE Output file path (default: logs/theme_stripping_analysis.txt)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, Set
|
||||
|
||||
# Add project root to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from code.settings import THEME_MIN_CARDS, CARD_FILES_PROCESSED_DIR
|
||||
from code.tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
get_theme_distribution,
|
||||
get_themes_by_count
|
||||
)
|
||||
|
||||
|
||||
def analyze_theme_distribution(min_cards: int | None = None, output_path: str | None = None) -> None:
    """
    Analyze theme distribution and generate a text report.

    Builds a theme -> card-count mapping from the processed parquet files,
    identifies themes below the minimum card threshold, writes a
    human-readable report to *output_path*, and prints a console summary.

    Args:
        min_cards: Minimum card threshold (defaults to THEME_MIN_CARDS setting)
        output_path: Path to output file (defaults to logs/theme_stripping_analysis.txt)

    Raises:
        SystemExit: Exit code 1 when the processed directory, parquet files,
            or theme data are missing.
    """
    if min_cards is None:
        min_cards = THEME_MIN_CARDS

    if output_path is None:
        output_path = "logs/theme_stripping_analysis.txt"

    print(f"Analyzing theme distribution (min_cards={min_cards})...")

    # Find all parquet files produced by initial setup
    processed_dir = Path(CARD_FILES_PROCESSED_DIR)
    if not processed_dir.exists():
        print(f"Error: Processed cards directory not found: {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)

    parquet_files = list(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)

    print(f"Found {len(parquet_files)} parquet files to analyze")

    # Build theme counts
    print("Building theme -> card count mapping...")
    theme_counts = get_theme_card_counts(parquet_files)

    if not theme_counts:
        print("Error: No themes found in parquet files")
        sys.exit(1)

    print(f"Found {len(theme_counts)} unique themes")

    # Identify themes to strip, the overall distribution, and the per-theme
    # detail rows for everything below the threshold
    themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    distribution = get_theme_distribution(theme_counts)
    below_threshold = get_themes_by_count(theme_counts, min_cards)

    # Percentage of the catalog below threshold; distribution['total'] is
    # non-zero here because theme_counts is non-empty.
    strip_pct = len(themes_to_strip) / distribution['total'] * 100

    # Generate report
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        # Header
        f.write("=" * 80 + "\n")
        f.write("THEME DISTRIBUTION ANALYSIS REPORT\n")
        f.write("=" * 80 + "\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Minimum Card Threshold: {min_cards}\n")
        f.write(f"Source: {processed_dir}\n")
        f.write(f"Parquet Files Analyzed: {len(parquet_files)}\n")
        f.write("=" * 80 + "\n\n")

        # Summary statistics
        f.write("SUMMARY STATISTICS\n")
        f.write("-" * 80 + "\n")
        f.write(f"Total Themes: {distribution['total']}\n")
        f.write(f"Themes to Strip (< {min_cards} cards): {len(themes_to_strip)}\n")
        f.write(f"Themes to Keep (>= {min_cards} cards): {distribution['total'] - len(themes_to_strip)}\n")
        f.write(f"Percentage to Strip: {strip_pct:.1f}%\n")
        f.write("\n")

        # Distribution by card count
        f.write("DISTRIBUTION BY CARD COUNT\n")
        f.write("-" * 80 + "\n")
        f.write(f" 1 card: {distribution['1_card']:4d} themes\n")
        f.write(f" 2 cards: {distribution['2_cards']:4d} themes\n")
        f.write(f" 3-4 cards: {distribution['3_4_cards']:4d} themes\n")
        f.write(f" 5-9 cards: {distribution['5_9_cards']:4d} themes\n")
        f.write(f" 10+ cards: {distribution['10_plus']:4d} themes\n")
        f.write(f" Total: {distribution['total']:4d} themes\n")
        f.write("\n")

        # Per-theme detail for everything below the threshold
        if below_threshold:
            f.write(f"THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write(f"Total: {len(below_threshold)} themes\n\n")

            for theme_id, count, card_list in below_threshold:
                f.write(f"Theme: {theme_id}\n")
                f.write(f"Card Count: {count}\n")
                f.write("Cards:\n")  # was a placeholder-less f-string
                for card in card_list:
                    f.write(f" - {card}\n")
                f.write("\n")
        else:
            f.write(f"NO THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write("All themes meet the minimum card requirement.\n\n")

        # Recommendations
        f.write("RECOMMENDATIONS\n")
        f.write("=" * 80 + "\n")
        if themes_to_strip:
            f.write(f"• {len(themes_to_strip)} themes should be stripped\n")
            f.write(f"• This represents {strip_pct:.1f}% of the catalog\n")
            f.write("• Run theme stripping to remove these low-viability themes\n")
            f.write("• Consider adjusting THEME_MIN_CARDS if this seems too aggressive\n")
        else:
            f.write(f"• No themes below threshold (all themes have >= {min_cards} cards)\n")
            f.write("• Consider lowering THEME_MIN_CARDS if you want to strip more themes\n")
        f.write("\n")

        # Footer
        f.write("=" * 80 + "\n")
        f.write("END OF REPORT\n")
        f.write("=" * 80 + "\n")

    print(f"\nReport generated: {output_file}")
    print("\nSummary:")
    print(f" Total themes: {distribution['total']}")
    print(f" Themes to strip: {len(themes_to_strip)} ({strip_pct:.1f}%)")
    print(f" Themes to keep: {distribution['total'] - len(themes_to_strip)}")

    # Print distribution
    print("\nDistribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
|
||||
|
||||
|
||||
def main():
    """CLI entry point."""
    arg_parser = argparse.ArgumentParser(
        description="Analyze theme distribution and identify themes below minimum card threshold"
    )
    # Both options default to None so analyze_theme_distribution() can fall
    # back to its own defaults (settings value / standard log path).
    arg_parser.add_argument(
        '--min-cards',
        type=int,
        default=None,
        help=f'Minimum card threshold (default: {THEME_MIN_CARDS} from THEME_MIN_CARDS setting)',
    )
    arg_parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output file path (default: logs/theme_stripping_analysis.txt)',
    )
    cli_args = arg_parser.parse_args()

    try:
        analyze_theme_distribution(min_cards=cli_args.min_cards, output_path=cli_args.output)
    except KeyboardInterrupt:
        # User hit Ctrl-C; report and exit with a failure status.
        print("\nAnalysis cancelled by user")
        sys.exit(1)
    except Exception as e:
        # Surface the failure with a full traceback, then exit non-zero.
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
|
||||
# Allow direct execution, e.g. `python -m code.scripts.analyze_theme_distribution`.
if __name__ == '__main__':
    main()
|
||||
|
|
@ -34,6 +34,14 @@ try: # Optional
|
|||
except Exception: # pragma: no cover
|
||||
yaml = None
|
||||
|
||||
# Import settings for THEME_MIN_CARDS threshold
|
||||
# Import at module level to avoid stdlib 'code' conflict when running as script
|
||||
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
|
||||
if ROOT not in sys.path:
|
||||
sys.path.insert(0, ROOT)
|
||||
|
||||
from code import settings as code_settings
|
||||
|
||||
try:
|
||||
# Support running as `python code/scripts/build_theme_catalog.py` when 'code' already on path
|
||||
from scripts.extract_themes import (
|
||||
|
|
@ -166,17 +174,29 @@ def load_catalog_yaml(verbose: bool) -> Dict[str, ThemeYAML]:
|
|||
|
||||
|
||||
def regenerate_analytics(verbose: bool):
|
||||
"""
|
||||
Regenerate theme analytics from parquet data, constants, and tagger source.
|
||||
|
||||
Now reads from parquet files instead of CSV. Applies THEME_MIN_CARDS filtering
|
||||
to exclude themes with too few cards.
|
||||
|
||||
Args:
|
||||
verbose: Whether to print detailed progress
|
||||
|
||||
Returns:
|
||||
Tuple of (theme_tags, selected_synergies, taxonomy)
|
||||
"""
|
||||
theme_tags: Set[str] = set()
|
||||
theme_tags |= collect_theme_tags_from_constants()
|
||||
theme_tags |= collect_theme_tags_from_tagger_source()
|
||||
try:
|
||||
csv_rows = gather_theme_tag_rows()
|
||||
for row_tags in csv_rows:
|
||||
for t in row_tags:
|
||||
if isinstance(t, str) and t:
|
||||
theme_tags.add(t)
|
||||
except Exception:
|
||||
csv_rows = []
|
||||
|
||||
# M3: Read from parquet (no longer silent fail)
|
||||
# Fail loudly if parquet read fails - this is a critical error
|
||||
parquet_rows = gather_theme_tag_rows()
|
||||
for row_tags in parquet_rows:
|
||||
for t in row_tags:
|
||||
if isinstance(t, str) and t:
|
||||
theme_tags.add(t)
|
||||
|
||||
whitelist = load_whitelist_config()
|
||||
normalization_map: Dict[str, str] = whitelist.get('normalization', {}) if isinstance(whitelist.get('normalization'), dict) else {}
|
||||
|
|
@ -190,10 +210,8 @@ def regenerate_analytics(verbose: bool):
|
|||
blacklist = {"Draw Triggers"}
|
||||
theme_tags = {t for t in theme_tags if t and t not in blacklist and t not in exclusions}
|
||||
|
||||
try:
|
||||
frequencies = tally_tag_frequencies_by_base_color()
|
||||
except Exception:
|
||||
frequencies = {}
|
||||
# M3: Read frequencies from parquet (fail loudly)
|
||||
frequencies = tally_tag_frequencies_by_base_color()
|
||||
|
||||
if frequencies:
|
||||
def total_count(t: str) -> int:
|
||||
|
|
@ -204,19 +222,40 @@ def regenerate_analytics(verbose: bool):
|
|||
except Exception:
|
||||
pass
|
||||
return s
|
||||
|
||||
kept: Set[str] = set()
|
||||
|
||||
# M3: Apply THEME_MIN_CARDS filtering
|
||||
min_cards = getattr(code_settings, 'THEME_MIN_CARDS', 5)
|
||||
if verbose:
|
||||
print(f"Applying THEME_MIN_CARDS filter (threshold: {min_cards} cards)")
|
||||
|
||||
themes_before_filter = len(theme_tags)
|
||||
|
||||
for t in list(theme_tags):
|
||||
if should_keep_theme(t, total_count(t), whitelist, protected_prefixes, protected_suffixes, min_overrides):
|
||||
kept.add(t)
|
||||
count = total_count(t)
|
||||
# Check both should_keep_theme (whitelist logic) AND THEME_MIN_CARDS threshold
|
||||
if should_keep_theme(t, count, whitelist, protected_prefixes, protected_suffixes, min_overrides):
|
||||
# Additional check: must meet minimum card threshold
|
||||
if count >= min_cards:
|
||||
kept.add(t)
|
||||
elif verbose:
|
||||
print(f" Filtered out '{t}' ({count} cards < {min_cards} threshold)")
|
||||
|
||||
# Always include whitelist themes (override threshold)
|
||||
for extra in whitelist.get('always_include', []) or []:
|
||||
kept.add(str(extra))
|
||||
|
||||
theme_tags = kept
|
||||
|
||||
if verbose:
|
||||
themes_after_filter = len(theme_tags)
|
||||
filtered_count = themes_before_filter - themes_after_filter
|
||||
print(f"Filtered {filtered_count} themes below threshold ({themes_after_filter} remain)")
|
||||
|
||||
try:
|
||||
rows = csv_rows if csv_rows else gather_theme_tag_rows()
|
||||
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
|
||||
except Exception:
|
||||
co_map, tag_counts, total_rows = {}, Counter(), 0
|
||||
# M3: Compute co-occurrence from parquet data (fail loudly)
|
||||
rows = parquet_rows if parquet_rows else gather_theme_tag_rows()
|
||||
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
|
||||
|
||||
return dict(theme_tags=theme_tags, frequencies=frequencies, co_map=co_map, tag_counts=tag_counts, total_rows=total_rows, whitelist=whitelist)
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from collections import Counter
|
|||
from typing import Dict, List, Set, Any
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import itertools
|
||||
import math
|
||||
try:
|
||||
|
|
@ -20,6 +21,7 @@ if ROOT not in sys.path:
|
|||
|
||||
from code.settings import CSV_DIRECTORY
|
||||
from code.tagging import tag_constants
|
||||
from code.path_util import get_processed_cards_path
|
||||
|
||||
BASE_COLORS = {
|
||||
'white': 'W',
|
||||
|
|
@ -88,83 +90,113 @@ def collect_theme_tags_from_tagger_source() -> Set[str]:
|
|||
|
||||
|
||||
def tally_tag_frequencies_by_base_color() -> Dict[str, Dict[str, int]]:
|
||||
"""
|
||||
Tally theme tag frequencies by base color from parquet files.
|
||||
|
||||
Note: This function now reads from card_files/processed/all_cards.parquet
|
||||
instead of per-color CSV files. The CSV files no longer exist after the
|
||||
parquet migration.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping color names to Counter of tag frequencies
|
||||
"""
|
||||
result: Dict[str, Dict[str, int]] = {c: Counter() for c in BASE_COLORS.keys()}
|
||||
# Iterate over per-color CSVs; if not present, skip
|
||||
for color in BASE_COLORS.keys():
|
||||
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
|
||||
if not os.path.exists(path):
|
||||
|
||||
# Load from all_cards.parquet
|
||||
parquet_path = get_processed_cards_path()
|
||||
if not os.path.exists(parquet_path):
|
||||
print(f"Warning: Parquet file not found: {parquet_path}")
|
||||
return {k: dict(v) for k, v in result.items()}
|
||||
|
||||
try:
|
||||
df = pd.read_parquet(parquet_path, columns=['themeTags', 'colorIdentity'], engine='pyarrow')
|
||||
except Exception as e:
|
||||
print(f"Error reading parquet file: {e}")
|
||||
return {k: dict(v) for k, v in result.items()}
|
||||
|
||||
if 'themeTags' not in df.columns:
|
||||
print("Warning: themeTags column not found in parquet file")
|
||||
return {k: dict(v) for k, v in result.items()}
|
||||
|
||||
# Iterate rows and tally tags by base color
|
||||
for _, row in df.iterrows():
|
||||
# Parquet stores themeTags as numpy array
|
||||
tags = row.get('themeTags')
|
||||
if not isinstance(tags, (list, np.ndarray)):
|
||||
continue
|
||||
try:
|
||||
df = pd.read_csv(path, converters={'themeTags': pd.eval, 'colorIdentity': pd.eval})
|
||||
except Exception:
|
||||
df = pd.read_csv(path)
|
||||
if 'themeTags' in df.columns:
|
||||
try:
|
||||
df['themeTags'] = df['themeTags'].apply(pd.eval)
|
||||
except Exception:
|
||||
df['themeTags'] = df['themeTags'].apply(lambda x: [])
|
||||
if 'colorIdentity' in df.columns:
|
||||
try:
|
||||
df['colorIdentity'] = df['colorIdentity'].apply(pd.eval)
|
||||
except Exception:
|
||||
pass
|
||||
if 'themeTags' not in df.columns:
|
||||
if isinstance(tags, np.ndarray):
|
||||
tags = tags.tolist()
|
||||
|
||||
# Get color identity (stored as string like "W", "UB", "WUG", etc.)
|
||||
ci = row.get('colorIdentity')
|
||||
if isinstance(ci, np.ndarray):
|
||||
ci = ci.tolist()
|
||||
|
||||
# Convert colorIdentity to set of letters
|
||||
if isinstance(ci, str):
|
||||
letters = set(ci) # "WUG" -> {'W', 'U', 'G'}
|
||||
elif isinstance(ci, list):
|
||||
letters = set(ci) # ['W', 'U', 'G'] -> {'W', 'U', 'G'}
|
||||
else:
|
||||
letters = set()
|
||||
|
||||
# Determine base colors from color identity
|
||||
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
|
||||
if not bases:
|
||||
# Colorless cards don't contribute to any specific color
|
||||
continue
|
||||
# Derive base colors from colorIdentity if available, else assume single color file
|
||||
def rows_base_colors(row):
|
||||
ids = row.get('colorIdentity') if isinstance(row, dict) else row
|
||||
if isinstance(ids, list):
|
||||
letters = set(ids)
|
||||
else:
|
||||
letters = set()
|
||||
derived = set()
|
||||
for name, letter in BASE_COLORS.items():
|
||||
if letter in letters:
|
||||
derived.add(name)
|
||||
if not derived:
|
||||
derived.add(color)
|
||||
return derived
|
||||
# Iterate rows
|
||||
for _, row in df.iterrows():
|
||||
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
|
||||
# Compute base colors contribution
|
||||
ci = row['colorIdentity'] if 'colorIdentity' in row else None
|
||||
letters = set(ci) if isinstance(ci, list) else set()
|
||||
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
|
||||
if not bases:
|
||||
bases = {color}
|
||||
for bc in bases:
|
||||
for t in tags:
|
||||
result[bc][t] += 1
|
||||
|
||||
# Tally tags for each base color this card belongs to
|
||||
for base_color in bases:
|
||||
for tag in tags:
|
||||
if isinstance(tag, str) and tag:
|
||||
result[base_color][tag] += 1
|
||||
|
||||
# Convert Counters to plain dicts
|
||||
return {k: dict(v) for k, v in result.items()}
|
||||
|
||||
|
||||
def gather_theme_tag_rows() -> List[List[str]]:
|
||||
"""Collect per-card themeTags lists across all base color CSVs.
|
||||
"""
|
||||
Collect per-card themeTags lists from parquet file.
|
||||
|
||||
Note: This function now reads from card_files/processed/all_cards.parquet
|
||||
instead of per-color CSV files. The CSV files no longer exist after the
|
||||
parquet migration.
|
||||
|
||||
Returns a list of themeTags arrays, one per card row where themeTags is present.
|
||||
Returns:
|
||||
List of themeTags arrays, one per card row where themeTags is present.
|
||||
"""
|
||||
rows: List[List[str]] = []
|
||||
for color in BASE_COLORS.keys():
|
||||
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
|
||||
if not os.path.exists(path):
|
||||
continue
|
||||
try:
|
||||
df = pd.read_csv(path, converters={'themeTags': pd.eval})
|
||||
except Exception:
|
||||
df = pd.read_csv(path)
|
||||
if 'themeTags' in df.columns:
|
||||
try:
|
||||
df['themeTags'] = df['themeTags'].apply(pd.eval)
|
||||
except Exception:
|
||||
df['themeTags'] = df['themeTags'].apply(lambda x: [])
|
||||
if 'themeTags' not in df.columns:
|
||||
continue
|
||||
for _, row in df.iterrows():
|
||||
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
|
||||
if tags:
|
||||
rows.append(tags)
|
||||
|
||||
# Load from all_cards.parquet
|
||||
parquet_path = get_processed_cards_path()
|
||||
if not os.path.exists(parquet_path):
|
||||
print(f"Warning: Parquet file not found: {parquet_path}")
|
||||
return rows
|
||||
|
||||
try:
|
||||
df = pd.read_parquet(parquet_path, columns=['themeTags'], engine='pyarrow')
|
||||
except Exception as e:
|
||||
print(f"Error reading parquet file: {e}")
|
||||
return rows
|
||||
|
||||
if 'themeTags' not in df.columns:
|
||||
print("Warning: themeTags column not found in parquet file")
|
||||
return rows
|
||||
|
||||
# Collect theme tags from each card
|
||||
for _, row in df.iterrows():
|
||||
# Parquet stores themeTags as numpy array
|
||||
tags = row.get('themeTags')
|
||||
if isinstance(tags, np.ndarray):
|
||||
tags = tags.tolist()
|
||||
if isinstance(tags, list) and tags:
|
||||
# Convert to list of strings (filter out non-strings)
|
||||
tag_list = [str(t) for t in tags if isinstance(t, str) and t]
|
||||
if tag_list:
|
||||
rows.append(tag_list)
|
||||
|
||||
return rows
|
||||
|
||||
|
||||
|
|
|
|||
165
code/scripts/strip_catalog_themes.py
Normal file
165
code/scripts/strip_catalog_themes.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Strip Theme Catalog Script
|
||||
|
||||
Removes themes with insufficient card counts from the theme catalog YAML files.
|
||||
Creates backups and logs all stripped themes for reference.
|
||||
|
||||
Usage:
|
||||
python -m code.scripts.strip_catalog_themes [--min-cards N] [--no-backup] [--dry-run]
|
||||
|
||||
Options:
|
||||
--min-cards N Override THEME_MIN_CARDS setting (default: from environment/settings)
|
||||
--no-backup Skip creating backup files
|
||||
--dry-run Show what would be stripped without making changes
|
||||
|
||||
Example:
|
||||
python -m code.scripts.strip_catalog_themes
|
||||
python -m code.scripts.strip_catalog_themes --min-cards 3 --dry-run
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add project root to path for imports
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from code import settings
|
||||
from code.tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
strip_catalog_themes,
|
||||
create_stripped_themes_log,
|
||||
get_theme_distribution
|
||||
)
|
||||
|
||||
|
||||
def main():
    """
    CLI entry point: strip low-card themes from the catalog YAML files.

    Pipeline: analyze parquet card counts -> print distribution -> identify
    themes below the threshold -> (unless --dry-run) delete their catalog
    files and write logs/stripped_themes.yml.

    Returns:
        int: 0 on success, 1 when input data is missing.
    """
    parser = argparse.ArgumentParser(
        description="Strip themes with insufficient card counts from catalog YAML files"
    )
    parser.add_argument(
        "--min-cards",
        type=int,
        default=settings.THEME_MIN_CARDS,
        help=f"Minimum cards required to keep a theme (default: {settings.THEME_MIN_CARDS})"
    )
    parser.add_argument(
        "--no-backup",
        action="store_true",
        help="Skip creating backup files before modification"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be stripped without making changes"
    )

    args = parser.parse_args()

    # Paths
    processed_dir = Path(settings.CARD_FILES_PROCESSED_DIR)
    catalog_dir = PROJECT_ROOT / 'config' / 'themes' / 'catalog'
    log_dir = PROJECT_ROOT / 'logs'
    stripped_log_path = log_dir / 'stripped_themes.yml'

    print(f"Stripping themes from catalog (min_cards={args.min_cards})")
    print(f"Catalog directory: {catalog_dir}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Step 1: Get theme card counts from parquet files
    print("Step 1: Analyzing theme card counts from parquet files...")
    parquet_files = sorted(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        return 1

    print(f"Found {len(parquet_files)} parquet files")
    theme_counts = get_theme_card_counts(parquet_files)
    print(f"Found {len(theme_counts)} unique themes")
    print()

    # Guard against empty theme data: the percentage math below divides by
    # len(theme_counts), which would raise ZeroDivisionError otherwise.
    if not theme_counts:
        print("Error: No themes found in parquet files")
        return 1

    # Step 2: Get distribution
    distribution = get_theme_distribution(theme_counts)
    print("Theme distribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
    print(f" Total: {distribution['total']:4d} themes")
    print()

    # Step 3: Identify themes to strip
    themes_to_strip = identify_themes_to_strip(theme_counts, args.min_cards)
    themes_to_keep = set(theme_counts.keys()) - themes_to_strip

    print(f"Themes to strip: {len(themes_to_strip)} ({len(themes_to_strip)/len(theme_counts)*100:.1f}%)")
    print(f"Themes to keep: {len(themes_to_keep)} ({len(themes_to_keep)/len(theme_counts)*100:.1f}%)")
    print()

    # Show sample of themes to strip
    if themes_to_strip:
        print("Sample themes to strip (first 10):")
        for theme_id in sorted(themes_to_strip)[:10]:
            count = len(theme_counts[theme_id])
            cards_sample = sorted(theme_counts[theme_id])[:3]
            cards_str = ", ".join(cards_sample)
            if count > 3:
                cards_str += f", ... ({count} total)"
            print(f" - {theme_id} ({count} cards): {cards_str}")
        print()

    if args.dry_run:
        print("DRY RUN: No changes made")
        return 0

    # Step 4: Strip themes from catalog
    print("Step 4: Stripping themes from catalog YAML files...")
    results = strip_catalog_themes(
        catalog_dir=catalog_dir,
        themes_to_strip=themes_to_strip,
        backup=not args.no_backup
    )

    print(f" Stripped: {results['stripped_count']} themes")
    print(f" Files deleted: {len(results['files_deleted'])}")
    print(f" Backups created: {len(results['backups_created'])}")

    if results['errors']:
        print(f" Errors: {len(results['errors'])}")
        for error in results['errors'][:5]:  # Show first 5 errors
            print(f" - {error}")
    print()

    # Step 5: Create stripped themes log
    print("Step 5: Creating stripped themes log...")
    create_stripped_themes_log(
        output_path=stripped_log_path,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=args.min_cards,
        sources=["catalog YAML"]
    )
    print(f" Log written to {stripped_log_path}")
    print()

    print("✅ Catalog stripping complete!")
    print()
    print("Summary:")  # was a placeholder-less f-string
    print(f" Total themes analyzed: {len(theme_counts)}")
    print(f" Themes stripped: {len(themes_to_strip)}")
    print(f" Themes remaining: {len(themes_to_keep)}")
    print(f" Catalog files deleted: {len(results['files_deleted'])}")

    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
253
code/scripts/strip_parquet_themes.py
Normal file
253
code/scripts/strip_parquet_themes.py
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Strip low-card themes from parquet file themeTags columns.
|
||||
|
||||
This script identifies and removes themes below the THEME_MIN_CARDS threshold
|
||||
from the themeTags column in parquet files. It's part of Milestone 4 (M4) of
|
||||
the Theme Stripping roadmap (R21).
|
||||
|
||||
Usage:
|
||||
# Dry run to see what would be stripped
|
||||
python code/scripts/strip_parquet_themes.py --dry-run
|
||||
|
||||
# Strip from single parquet file
|
||||
python code/scripts/strip_parquet_themes.py --file card_files/processed/all_cards.parquet
|
||||
|
||||
# Strip from all parquet files in directory
|
||||
python code/scripts/strip_parquet_themes.py --all
|
||||
|
||||
# Specify custom threshold
|
||||
python code/scripts/strip_parquet_themes.py --threshold 10 --all
|
||||
|
||||
Environment Variables:
|
||||
THEME_MIN_CARDS: Minimum card threshold (default: 5)
|
||||
|
||||
Outputs:
|
||||
- Modified parquet file(s) with stripped themeTags
|
||||
- Timestamped backup (.parquet.bak) if --backup enabled
|
||||
- Updated logs/stripped_themes.yml log
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Add project root to path
|
||||
ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from code import settings as code_settings
|
||||
from code.tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
strip_parquet_themes,
|
||||
create_stripped_themes_log
|
||||
)
|
||||
|
||||
|
||||
def find_parquet_files(directory: Path) -> list[Path]:
    """Return every ``*.parquet`` file in *directory*, sorted by path."""
    matches = directory.glob("*.parquet")
    return sorted(matches)
|
||||
|
||||
|
||||
def update_stripped_themes_log(
    theme_counts: dict,
    themes_to_strip: set[str],
    min_cards: int
) -> None:
    """Record parquet-stripping results in logs/stripped_themes.yml."""
    target = ROOT / "logs" / "stripped_themes.yml"

    # Delegate log writing; tag the entry so readers can tell this run
    # stripped the parquet files rather than the catalog YAML.
    create_stripped_themes_log(
        output_path=target,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=min_cards,
        sources=["parquet files"],
    )

    print(f"\nUpdated stripped themes log: {target}")
|
||||
|
||||
|
||||
def main():
    """
    CLI entry point: strip low-card themes from parquet themeTags columns.

    Selects the target parquet file(s) from --file / --all / the default
    all_cards.parquet, computes theme card counts, strips themes below the
    threshold (unless --dry-run), and updates logs/stripped_themes.yml.

    Returns:
        int: 0 on success, 1 when input is missing or any error occurred.
    """
    parser = argparse.ArgumentParser(
        description="Strip low-card themes from parquet themeTags columns",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument(
        '--file',
        type=Path,
        help='Specific parquet file to process'
    )
    parser.add_argument(
        '--all',
        action='store_true',
        help='Process all parquet files in card_files/processed/'
    )
    parser.add_argument(
        '--threshold',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )

    args = parser.parse_args()

    # Determine threshold. Compare against None explicitly so an explicit
    # `--threshold 0` is honored instead of falling back to the default
    # (the old truthiness test silently discarded 0).
    min_cards = args.threshold if args.threshold is not None else code_settings.THEME_MIN_CARDS

    # Determine which files to process
    if args.file:
        if not args.file.exists():
            print(f"Error: File not found: {args.file}")
            return 1
        parquet_files = [args.file]
    elif args.all:
        processed_dir = ROOT / "card_files" / "processed"
        parquet_files = find_parquet_files(processed_dir)
        if not parquet_files:
            print(f"No parquet files found in {processed_dir}")
            return 1
    else:
        # Default: process all_cards.parquet
        default_file = ROOT / "card_files" / "processed" / "all_cards.parquet"
        if not default_file.exists():
            print(f"Error: Default file not found: {default_file}")
            print("Use --file or --all to specify files to process")
            return 1
        parquet_files = [default_file]

    print("Theme Stripping Configuration:")  # was a placeholder-less f-string
    print(f" Minimum cards: {min_cards}")
    print(f" Files to process: {len(parquet_files)}")
    print(f" Backup enabled: {not args.no_backup}")
    print(f" Dry run: {args.dry_run}")
    print()

    # Get theme card counts from parquet files
    print("Analyzing theme card counts...")
    try:
        theme_counts = get_theme_card_counts(parquet_files)
        print(f"Found {len(theme_counts)} unique themes across files")
    except Exception as e:
        print(f"Error analyzing theme counts: {e}")
        return 1

    # Identify themes to strip
    print("Identifying themes to strip...")
    try:
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    except Exception as e:
        print(f"Error identifying themes to strip: {e}")
        return 1

    if not themes_to_strip:
        print("No themes found below threshold. Nothing to strip.")
        return 0

    print(f"Found {len(themes_to_strip)} themes to strip")

    if args.verbose:
        # sorted() accepts any iterable; no intermediate list() needed.
        sample = sorted(themes_to_strip)[:10]
        print(f"Sample themes: {', '.join(sample)}")
        if len(themes_to_strip) > 10:
            print(f" ... and {len(themes_to_strip) - 10} more")

    print()

    # Dry run mode
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        for parquet_file in parquet_files:
            print(f"Would process: {parquet_file}")
        print(f"\nWould strip {len(themes_to_strip)} themes from themeTags column")
        return 0

    # Process each parquet file, accumulating totals across files.
    total_results = {
        "files_processed": 0,
        "cards_processed": 0,
        "tags_removed": 0,
        "errors": []
    }

    for parquet_file in parquet_files:
        print(f"Processing: {parquet_file.name}")

        try:
            results = strip_parquet_themes(
                parquet_path=parquet_file,
                themes_to_strip=themes_to_strip,
                backup=not args.no_backup
            )

            total_results["files_processed"] += 1
            total_results["cards_processed"] += results["cards_processed"]
            total_results["tags_removed"] += results["tags_removed"]
            total_results["errors"].extend(results["errors"])

            if args.verbose:
                print(f" Cards: {results['cards_processed']}")
                print(f" Tags removed: {results['tags_removed']}")
                if results["backup_created"]:
                    print(f" Backup: {results['backup_created']}")

        except Exception as e:
            # Record the failure and move on to the remaining files.
            error_msg = f"Error processing {parquet_file}: {e}"
            print(f" {error_msg}")
            total_results["errors"].append(error_msg)
            continue

        print()

    # Update stripped themes log (best-effort: stripping already succeeded).
    try:
        update_stripped_themes_log(theme_counts, themes_to_strip, min_cards)
    except Exception as e:
        print(f"Warning: Failed to update stripped themes log: {e}")

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Files processed: {total_results['files_processed']}")
    print(f"Cards processed: {total_results['cards_processed']}")
    print(f"Tags removed: {total_results['tags_removed']}")
    print(f"Themes stripped: {len(themes_to_strip)}")

    if total_results["errors"]:
        print(f"\nErrors encountered: {len(total_results['errors'])}")
        for error in total_results["errors"]:
            print(f" - {error}")
    else:
        print("\nStripping completed successfully!")

    return 0 if not total_results["errors"] else 1
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
380
code/scripts/strip_themes.py
Normal file
380
code/scripts/strip_themes.py
Normal file
|
|
@ -0,0 +1,380 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Standalone theme stripping orchestration script.
|
||||
|
||||
This script coordinates the complete theme stripping pipeline:
|
||||
1. Analyze parquet files to identify low-card themes
|
||||
2. Strip from catalog YAML files (optional)
|
||||
3. Strip from parquet themeTags columns (optional)
|
||||
4. Rebuild theme_list.json from stripped parquet data
|
||||
5. Generate stripped_themes.yml log
|
||||
|
||||
Part of Milestone 5 (M5) - Integration & Testing for Theme Stripping (R21).
|
||||
|
||||
Usage:
|
||||
# Dry run to preview changes
|
||||
python code/scripts/strip_themes.py --dry-run
|
||||
|
||||
# Strip everything with default threshold (5 cards)
|
||||
python code/scripts/strip_themes.py
|
||||
|
||||
# Strip only catalog YAML files
|
||||
python code/scripts/strip_themes.py --sources catalog
|
||||
|
||||
# Strip only parquet files
|
||||
python code/scripts/strip_themes.py --sources parquet
|
||||
|
||||
# Custom threshold
|
||||
python code/scripts/strip_themes.py --min-cards 10
|
||||
|
||||
# Skip backups (not recommended)
|
||||
python code/scripts/strip_themes.py --no-backup
|
||||
|
||||
Environment Variables:
|
||||
THEME_MIN_CARDS: Minimum card threshold (default: 5)
|
||||
|
||||
Outputs:
|
||||
- Modified catalog/*.yml files (if --sources includes catalog)
|
||||
- Modified parquet files (if --sources includes parquet)
|
||||
- Regenerated config/themes/theme_list.json
|
||||
- Updated logs/stripped_themes.yml log
|
||||
- Timestamped backups (if --backup enabled)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Set, Dict
|
||||
|
||||
# Add project root to path
|
||||
ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from code import settings as code_settings
|
||||
from code.tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
strip_catalog_themes,
|
||||
strip_parquet_themes,
|
||||
create_stripped_themes_log
|
||||
)
|
||||
|
||||
|
||||
def strip_all_sources(
    min_cards: int,
    sources: Set[str],
    backup: bool,
    dry_run: bool,
    verbose: bool
) -> Dict:
    """
    Execute the complete theme stripping pipeline.

    Stages (each gated by *sources* / *dry_run*):
      1. Analyze parquet files to find themes below the card threshold
      2. Strip matching themes from catalog YAML files
      3. Strip matching tags from parquet themeTags columns
      4. Rebuild theme_list.json from the stripped data
      5. Update the stripped_themes.yml audit log

    Args:
        min_cards: Minimum card count threshold
        sources: Set of sources to strip ('catalog', 'parquet', or both)
        backup: Whether to create backups before modification
        dry_run: Preview changes without modifying files
        verbose: Show detailed output

    Returns:
        Dictionary with stripping results and statistics:
            - "themes_analyzed": number of unique themes found
            - "themes_to_strip": number of themes below threshold
            - "catalog_stripped": number of catalog files changed/deleted
            - "parquet_tags_removed": total tag occurrences removed
            - "json_regenerated": whether theme_list.json was rebuilt
            - "errors": list of error messages (empty on full success)
    """
    start_time = time.time()
    results = {
        "themes_analyzed": 0,
        "themes_to_strip": 0,
        "catalog_stripped": 0,
        "parquet_tags_removed": 0,
        "json_regenerated": False,
        "errors": []
    }

    print("="*70)
    print("THEME STRIPPING PIPELINE")
    print("="*70)
    # f-string removed: the literal had no placeholders
    print("Configuration:")
    print(f"  Minimum cards: {min_cards}")
    print(f"  Sources: {', '.join(sorted(sources))}")
    print(f"  Backup enabled: {backup}")
    print(f"  Dry run: {dry_run}")
    print()

    # Step 1: Analyze parquet files
    print("Step 1: Analyzing theme card counts...")
    try:
        parquet_dir = ROOT / "card_files" / "processed"
        parquet_files = sorted(parquet_dir.glob("*.parquet"))

        if not parquet_files:
            results["errors"].append("No parquet files found in card_files/processed/")
            return results

        theme_counts = get_theme_card_counts(parquet_files)
        results["themes_analyzed"] = len(theme_counts)
        print(f"  Found {len(theme_counts)} unique themes")

        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
        results["themes_to_strip"] = len(themes_to_strip)
        print(f"  Identified {len(themes_to_strip)} themes below threshold")

        if verbose and themes_to_strip:
            sample = sorted(themes_to_strip)[:5]
            print(f"  Sample themes: {', '.join(sample)}")
            if len(themes_to_strip) > 5:
                print(f"  ... and {len(themes_to_strip) - 5} more")

        if not themes_to_strip:
            print("\n✅ No themes below threshold. Nothing to strip.")
            return results

    except Exception as e:
        error_msg = f"Analysis failed: {e}"
        print(f"  ❌ {error_msg}")
        results["errors"].append(error_msg)
        return results

    print()

    # Dry run mode: report what would happen without touching any file
    if dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        if 'catalog' in sources:
            print("Would strip from catalog YAML files:")
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            yaml_files = sorted(catalog_dir.glob("*.yml"))
            for yaml_file in yaml_files[:5]:
                print(f"  - {yaml_file.name}")
            if len(yaml_files) > 5:
                print(f"  ... and {len(yaml_files) - 5} more")

        if 'parquet' in sources:
            print("\nWould strip from parquet files:")
            for pf in parquet_files[:3]:
                print(f"  - {pf.name}")
            if len(parquet_files) > 3:
                print(f"  ... and {len(parquet_files) - 3} more")

        print(f"\nWould strip {len(themes_to_strip)} themes total")
        print("Would regenerate theme_list.json")
        print("Would update stripped_themes.yml log")
        return results

    # Step 2: Strip from catalog (if requested)
    # NOTE: Catalog YAML must be stripped BEFORE building theme_list.json,
    # otherwise build_theme_catalog.py will read un-stripped themes from YAML
    if 'catalog' in sources:
        print("Step 2: Stripping from catalog YAML files...")
        try:
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            catalog_results = strip_catalog_themes(
                catalog_dir=catalog_dir,
                themes_to_strip=themes_to_strip,
                backup=backup
            )

            # BUG FIX: strip_catalog_themes returns "stripped_count" (not
            # "themes_removed"), and "files_modified"/"files_deleted" are
            # lists — the previous code raised KeyError here and stored a
            # list in the integer counter.
            themes_removed = catalog_results["stripped_count"]
            files_changed = (
                len(catalog_results["files_modified"])
                + len(catalog_results["files_deleted"])
            )
            results["catalog_stripped"] = files_changed

            if verbose:
                print(f"  Files modified: {files_changed}")
                print(f"  Themes removed: {themes_removed}")
                if catalog_results["backups_created"]:
                    print(f"  Backups created: {len(catalog_results['backups_created'])}")
            else:
                print(f"  ✓ Stripped {themes_removed} themes from {files_changed} files")

            results["errors"].extend(catalog_results["errors"])

        except Exception as e:
            error_msg = f"Catalog stripping failed: {e}"
            print(f"  ❌ {error_msg}")
            results["errors"].append(error_msg)

    print()

    # Step 3: Strip from parquet (if requested)
    if 'parquet' in sources:
        step_num = 3 if 'catalog' in sources else 2
        print(f"Step {step_num}: Stripping from parquet files...")
        try:
            for parquet_file in parquet_files:
                if verbose:
                    print(f"  Processing: {parquet_file.name}")

                parquet_results = strip_parquet_themes(
                    parquet_path=parquet_file,
                    themes_to_strip=themes_to_strip,
                    backup=backup
                )

                results["parquet_tags_removed"] += parquet_results["tags_removed"]
                results["errors"].extend(parquet_results["errors"])

                if verbose and parquet_results["tags_removed"] > 0:
                    print(f"    Removed {parquet_results['tags_removed']} tag occurrences")

            if not verbose:
                print(f"  ✓ Removed {results['parquet_tags_removed']} tag occurrences from {len(parquet_files)} file(s)")

        except Exception as e:
            error_msg = f"Parquet stripping failed: {e}"
            print(f"  ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Step 4: Rebuild theme_list.json (if parquet was stripped)
    # NOTE: This reads from both parquet AND catalog YAML, so both must be stripped first
    if 'parquet' in sources:
        step_num = 4 if 'catalog' in sources else 3
        print(f"Step {step_num}: Rebuilding theme_list.json...")
        try:
            # Import build script lazily so a missing build module only
            # breaks this optional step, not the whole pipeline
            from code.scripts.build_theme_catalog import main as build_main

            # Suppress verbose build output unless --verbose flag
            import io
            import contextlib

            if not verbose:
                with contextlib.redirect_stdout(io.StringIO()):
                    build_main()
            else:
                build_main()

            results["json_regenerated"] = True
            print("  ✓ theme_list.json regenerated")

        except Exception as e:
            error_msg = f"JSON regeneration failed: {e}"
            print(f"  ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Step 5: Update stripped themes log
    final_step = 5 if ('catalog' in sources and 'parquet' in sources) else (3 if 'catalog' in sources else 4)
    print(f"Step {final_step}: Updating stripped_themes.yml log...")
    try:
        log_path = ROOT / "logs" / "stripped_themes.yml"
        source_labels = []
        if 'catalog' in sources:
            source_labels.append("catalog YAML")
        if 'parquet' in sources:
            source_labels.append("parquet files")

        create_stripped_themes_log(
            output_path=log_path,
            theme_counts=theme_counts,
            themes_stripped=themes_to_strip,
            min_threshold=min_cards,
            sources=source_labels if source_labels else None
        )
        print(f"  ✓ Log updated: {log_path}")

    except Exception as e:
        error_msg = f"Log update failed: {e}"
        print(f"  ❌ {error_msg}")
        results["errors"].append(error_msg)

    # Final summary
    elapsed = time.time() - start_time
    print()
    print("="*70)
    print("SUMMARY")
    print("="*70)
    print(f"Themes analyzed: {results['themes_analyzed']}")
    print(f"Themes stripped: {results['themes_to_strip']}")
    if 'catalog' in sources:
        print(f"Catalog files modified: {results['catalog_stripped']}")
    if 'parquet' in sources:
        print(f"Parquet tags removed: {results['parquet_tags_removed']}")
    print(f"JSON regenerated: {'Yes' if results['json_regenerated'] else 'No'}")
    print(f"Time elapsed: {elapsed:.2f}s")

    if results["errors"]:
        print(f"\n⚠️  Errors encountered: {len(results['errors'])}")
        for error in results["errors"]:
            print(f"  - {error}")
    else:
        print("\n✅ Theme stripping completed successfully!")

    return results
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and run the stripping pipeline.

    Returns:
        Process exit code: 0 on success, 1 on invalid arguments or when
        the pipeline reported any errors.
    """
    parser = argparse.ArgumentParser(
        description="Orchestrate complete theme stripping pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument(
        '--min-cards',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )

    parser.add_argument(
        '--sources',
        type=str,
        help='Comma-separated list of sources to strip: catalog, parquet, all (default: all)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )

    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )

    args = parser.parse_args()

    # Determine threshold.
    # BUG FIX: compare against None rather than truthiness — settings
    # documents 0 as a valid THEME_MIN_CARDS value ("only strip orphaned
    # themes"), but '--min-cards 0' is falsy and previously fell back to
    # the configured default.
    min_cards = args.min_cards if args.min_cards is not None else code_settings.THEME_MIN_CARDS

    # Determine sources
    if args.sources:
        source_input = args.sources.lower()
        if source_input == 'all':
            sources = {'catalog', 'parquet'}
        else:
            sources = {s.strip() for s in source_input.split(',')}
            valid_sources = {'catalog', 'parquet'}
            invalid = sources - valid_sources
            if invalid:
                print(f"Error: Invalid sources: {', '.join(invalid)}")
                print(f"Valid sources: {', '.join(valid_sources)}, all")
                return 1
    else:
        sources = {'catalog', 'parquet'}  # Default: all sources

    # Execute pipeline
    results = strip_all_sources(
        min_cards=min_cards,
        sources=sources,
        backup=not args.no_backup,
        dry_run=args.dry_run,
        verbose=args.verbose
    )

    # Exit code mirrors whether any stage recorded an error
    return 0 if not results["errors"] else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
|
@ -156,4 +156,14 @@ SIMILARITY_CACHE_MAX_AGE_DAYS = int(os.getenv('SIMILARITY_CACHE_MAX_AGE_DAYS', '
|
|||
SIMILARITY_CACHE_DOWNLOAD = os.getenv('SIMILARITY_CACHE_DOWNLOAD', '1').lower() not in ('0', 'false', 'off', 'disabled')
|
||||
|
||||
# Batch build feature flag (Build X and Compare)
|
||||
ENABLE_BATCH_BUILD = os.getenv('ENABLE_BATCH_BUILD', '1').lower() not in ('0', 'false', 'off', 'disabled')
|
||||
ENABLE_BATCH_BUILD = os.getenv('ENABLE_BATCH_BUILD', '1').lower() not in ('0', 'false', 'off', 'disabled')
|
||||
|
||||
# ----------------------------------------------------------------------------------
|
||||
# THEME CATALOG SETTINGS
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
# Minimum number of cards required for a theme to be kept in the system
|
||||
# Themes with fewer cards will be stripped during setup/tagging
|
||||
# Set to 1 to keep all themes with at least one card
|
||||
# Set to 0 to only strip orphaned themes (themes with zero cards)
|
||||
THEME_MIN_CARDS = max(0, int(os.getenv('THEME_MIN_CARDS', '5')))
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from pathlib import Path
|
|||
from typing import DefaultDict, Dict, List, Set
|
||||
|
||||
# Third-party imports
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
|
|
@ -151,7 +152,8 @@ def apply_combo_tags(
|
|||
# Calculate updated counts
|
||||
updated_counts: Dict[str, int] = {}
|
||||
if before_hash != after_hash:
|
||||
updated_counts["total"] = int((df["comboTags"].apply(bool)).sum())
|
||||
# Use len() > 0 to handle arrays properly (avoid ambiguous truth value)
|
||||
updated_counts["total"] = int((df["comboTags"].apply(lambda x: len(x) > 0 if isinstance(x, (list, np.ndarray)) else bool(x))).sum())
|
||||
else:
|
||||
updated_counts["total"] = 0
|
||||
|
||||
|
|
|
|||
|
|
@ -6897,6 +6897,103 @@ def run_tagging(parallel: bool = False, max_workers: int | None = None):
|
|||
logger.info(f"✓ Wrote tagging completion flag to {flag_path}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to write tagging completion flag: {e}")
|
||||
|
||||
# R21: Theme stripping after tagging (if THEME_MIN_CARDS > 1)
|
||||
try:
|
||||
from settings import THEME_MIN_CARDS
|
||||
|
||||
if THEME_MIN_CARDS > 1:
|
||||
logger.info("=" * 80)
|
||||
logger.info(f"Starting theme stripping (THEME_MIN_CARDS={THEME_MIN_CARDS})")
|
||||
logger.info("=" * 80)
|
||||
|
||||
strip_start = pd.Timestamp.now()
|
||||
|
||||
# Import theme stripping functions
|
||||
from tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
strip_parquet_themes,
|
||||
strip_catalog_themes,
|
||||
create_stripped_themes_log
|
||||
)
|
||||
|
||||
# Define project root (tagger.py is in code/tagging/, so go up 2 levels)
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
||||
|
||||
# Step 1: Analyze themes
|
||||
parquet_dir = Path("card_files/processed")
|
||||
parquet_files = sorted(parquet_dir.glob("*.parquet"))
|
||||
|
||||
logger.info(f"Analyzing {len(parquet_files)} parquet files...")
|
||||
theme_counts = get_theme_card_counts(parquet_files)
|
||||
themes_to_strip = identify_themes_to_strip(theme_counts, THEME_MIN_CARDS)
|
||||
|
||||
logger.info(f"Found {len(theme_counts)} themes, {len(themes_to_strip)} below threshold")
|
||||
|
||||
if themes_to_strip:
|
||||
# Step 2: Strip from catalog YAML (MUST happen before building JSON)
|
||||
logger.info("Stripping themes from catalog YAML files...")
|
||||
catalog_dir = PROJECT_ROOT / "config" / "themes" / "catalog"
|
||||
|
||||
if catalog_dir.exists():
|
||||
catalog_results = strip_catalog_themes(
|
||||
catalog_dir=catalog_dir,
|
||||
themes_to_strip=themes_to_strip,
|
||||
backup=True
|
||||
)
|
||||
logger.info(f"✓ Modified {len(catalog_results['files_modified'])} catalog files, stripped {catalog_results['stripped_count']} themes")
|
||||
else:
|
||||
logger.info("Catalog directory doesn't exist yet, skipping YAML stripping")
|
||||
|
||||
# Step 3: Strip from parquet files
|
||||
logger.info("Stripping themes from parquet files...")
|
||||
total_tags_removed = 0
|
||||
for parquet_file in parquet_files:
|
||||
results = strip_parquet_themes(
|
||||
parquet_path=parquet_file,
|
||||
themes_to_strip=themes_to_strip,
|
||||
backup=True
|
||||
)
|
||||
total_tags_removed += results["tags_removed"]
|
||||
|
||||
logger.info(f"✓ Removed {total_tags_removed} theme tag occurrences")
|
||||
|
||||
# Step 4: Rebuild theme_list.json from stripped data
|
||||
logger.info("Rebuilding theme_list.json from stripped parquet and catalog...")
|
||||
try:
|
||||
from scripts.build_theme_catalog import main as build_catalog
|
||||
build_catalog()
|
||||
logger.info("✓ theme_list.json regenerated from stripped sources")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to rebuild theme_list.json: {e}")
|
||||
|
||||
# Step 5: Update stripped themes log
|
||||
logger.info("Updating stripped_themes.yml log...")
|
||||
log_path = PROJECT_ROOT / "logs" / "stripped_themes.yml"
|
||||
create_stripped_themes_log(
|
||||
output_path=log_path,
|
||||
theme_counts=theme_counts,
|
||||
themes_stripped=themes_to_strip,
|
||||
min_threshold=THEME_MIN_CARDS,
|
||||
sources=["parquet files", "catalog YAML"]
|
||||
)
|
||||
logger.info(f"✓ Log updated: {log_path}")
|
||||
|
||||
strip_duration = (pd.Timestamp.now() - strip_start).total_seconds()
|
||||
logger.info("=" * 80)
|
||||
logger.info(f"✓ Theme stripping complete in {strip_duration:.2f}s")
|
||||
logger.info(f" Themes stripped: {len(themes_to_strip)}")
|
||||
logger.info(f" Tags removed: {total_tags_removed}")
|
||||
logger.info("=" * 80)
|
||||
else:
|
||||
logger.info("No themes below threshold, skipping stripping")
|
||||
else:
|
||||
logger.info(f"Theme stripping disabled (THEME_MIN_CARDS={THEME_MIN_CARDS})")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Theme stripping failed: {e}")
|
||||
logger.warning("Continuing without theme stripping")
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
621
code/tagging/theme_stripper.py
Normal file
621
code/tagging/theme_stripper.py
Normal file
|
|
@ -0,0 +1,621 @@
|
|||
"""
|
||||
Theme Stripping Module
|
||||
|
||||
Provides threshold logic and utilities for identifying and stripping themes
|
||||
with insufficient card counts from the theme catalog and card data.
|
||||
|
||||
This module supports M1-M4 of the Theme Stripping roadmap:
|
||||
- M1: Threshold logic and theme count analysis
|
||||
- M2: Theme catalog YAML stripping
|
||||
- M3: theme_list.json stripping
|
||||
- M4: Parquet file theme_tags stripping
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Set, List, Tuple, Any, Optional
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
yaml = None # type: ignore
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------
|
||||
# M1: Threshold Logic & Analysis
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
def _coerce_theme_tags(raw: Any) -> List[str]:
    """Normalize one themeTags cell (ndarray / delimited str / list) to a clean list.

    Empty/whitespace-only entries are dropped. Unknown types yield [].
    """
    # Order matters: an ndarray is neither str nor list, check it first
    if isinstance(raw, np.ndarray):
        return [str(t).strip() for t in raw if str(t).strip()]
    if isinstance(raw, str):
        # Delimited-string fallback: prefer '|', then ','
        if '|' in raw:
            return [t.strip() for t in raw.split('|') if t.strip()]
        if ',' in raw:
            return [t.strip() for t in raw.split(',') if t.strip()]
        return [raw.strip()] if raw.strip() else []
    if isinstance(raw, list):
        return [str(t).strip() for t in raw if str(t).strip()]
    return []


def get_theme_card_counts(parquet_paths: List[Path]) -> Dict[str, Set[str]]:
    """
    Build a mapping of theme -> set of card names from parquet files.

    Args:
        parquet_paths: List of paths to parquet files to analyze

    Returns:
        Dictionary mapping normalized theme ID (lowercase, spaces replaced
        with underscores) to the set of card names carrying that theme.

    Example:
        {"lifegain": {"Ajani's Pridemate", "Soul Warden", ...}, ...}
    """
    theme_to_cards: Dict[str, Set[str]] = {}

    for parquet_path in parquet_paths:
        try:
            df = pd.read_parquet(parquet_path)

            # Iterate the two columns directly instead of DataFrame.iterrows():
            # iterrows materializes a Series per row and is far slower.
            names = df['name'] if 'name' in df.columns else [''] * len(df)
            tag_cells = df['themeTags'] if 'themeTags' in df.columns else [[]] * len(df)

            for card_name, raw_tags in zip(names, tag_cells):
                for theme in _coerce_theme_tags(raw_tags):
                    # Normalize theme ID so counts line up with catalog IDs
                    theme_id = theme.lower().replace(' ', '_')
                    theme_to_cards.setdefault(theme_id, set()).add(card_name)

        except Exception as e:
            # Best-effort: one unreadable file should not abort the analysis
            print(f"Warning: Failed to process {parquet_path}: {e}")
            continue

    return theme_to_cards
|
||||
|
||||
|
||||
def identify_themes_to_strip(
    theme_counts: Dict[str, Set[str]],
    min_cards: int
) -> Set[str]:
    """
    Select the themes whose card count falls under the keep threshold.

    Args:
        theme_counts: Dictionary mapping theme ID to set of card names
        min_cards: Minimum number of cards required to keep a theme

    Returns:
        Set of theme IDs that should be stripped

    Example:
        >>> counts = {"daybound": {"Card1", "Card2"}, "lifegain": {"Card1", "Card2", "Card3", "Card4", "Card5"}}
        >>> identify_themes_to_strip(counts, 5)
        {'daybound'}
    """
    # A theme survives only if it appears on at least min_cards cards.
    return {
        theme_id
        for theme_id, cards in theme_counts.items()
        if len(cards) < min_cards
    }
|
||||
|
||||
|
||||
def should_strip_theme(theme: str, card_count: int, min_cards: int) -> bool:
    """
    Decide whether one theme falls below the keep threshold.

    Args:
        theme: Theme ID (unused in the decision; kept for call-site clarity)
        card_count: Number of cards with this theme
        min_cards: Minimum threshold

    Returns:
        True if theme should be stripped, False otherwise
    """
    meets_threshold = card_count >= min_cards
    return not meets_threshold
|
||||
|
||||
|
||||
def get_theme_distribution(theme_counts: Dict[str, Set[str]]) -> Dict[str, int]:
    """
    Summarize how many themes fall into each card-count bucket.

    Args:
        theme_counts: Dictionary mapping theme ID to set of card names

    Returns:
        Dictionary with distribution statistics:
            - "1_card": themes with exactly 1 card
            - "2_cards": themes with exactly 2 cards
            - "3_4_cards": themes with 3-4 cards
            - "5_9_cards": themes with 5-9 cards
            - "10_plus": all remaining themes (10+ cards, and the
              degenerate 0-card case, which falls through to here)
            - "total": total number of themes
    """
    def _bucket(count: int) -> str:
        # Mirrors the original cascade exactly, including the fallthrough
        # of count == 0 into "10_plus".
        if count == 1:
            return "1_card"
        if count == 2:
            return "2_cards"
        if 3 <= count <= 4:
            return "3_4_cards"
        if 5 <= count <= 9:
            return "5_9_cards"
        return "10_plus"

    distribution = {
        "1_card": 0,
        "2_cards": 0,
        "3_4_cards": 0,
        "5_9_cards": 0,
        "10_plus": 0,
        "total": 0,
    }

    for cards in theme_counts.values():
        distribution["total"] += 1
        distribution[_bucket(len(cards))] += 1

    return distribution
|
||||
|
||||
|
||||
def get_themes_by_count(
    theme_counts: Dict[str, Set[str]],
    below_threshold: int
) -> List[Tuple[str, int, List[str]]]:
    """
    List themes below a threshold together with their counts and cards.

    Args:
        theme_counts: Dictionary mapping theme ID to set of card names
        below_threshold: Threshold for listing themes

    Returns:
        List of tuples (theme_id, card_count, card_list) sorted by count
        (ascending), ties broken alphabetically by theme ID. Card lists
        are sorted for deterministic output.

    Example:
        [("miracle", 4, ["Temporal Mastery", "Terminus", "Entreat the Angels", "Bonfire"]), ...]
    """
    entries = [
        (theme_id, len(cards), sorted(cards))
        for theme_id, cards in theme_counts.items()
        if len(cards) < below_threshold
    ]
    return sorted(entries, key=lambda entry: (entry[1], entry[0]))
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------
|
||||
# M2: Theme Catalog Stripping
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
def backup_catalog_file(file_path: Path) -> Path:
    """
    Create a timestamped backup of a catalog YAML file.

    Args:
        file_path: Path to the file to backup

    Returns:
        Path to the backup file created

    Raises:
        FileNotFoundError: If *file_path* does not exist.

    Example:
        daybound.yml -> daybound_20260319_143025.yml.bak
    """
    if not file_path.exists():
        raise FileNotFoundError(f"Cannot backup non-existent file: {file_path}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Preserve the source file's real extension instead of hard-coding
    # ".yml" — for .yml inputs this produces the same name as before.
    backup_path = file_path.parent / f"{file_path.stem}_{timestamp}{file_path.suffix}.bak"

    # Byte-for-byte copy: avoids UnicodeDecodeError on files that are not
    # valid UTF-8 and guarantees the backup matches the original exactly.
    backup_path.write_bytes(file_path.read_bytes())

    return backup_path
|
||||
|
||||
|
||||
def remove_theme_from_catalog(yaml_data: Dict[str, Any], theme_id: str) -> bool:
    """
    Remove a theme entry from loaded catalog YAML data.

    Args:
        yaml_data: Loaded YAML data (dict for single-theme files, or a
            list of theme dicts for potential multi-theme structures)
        theme_id: Theme ID to remove (exact match on the 'id' field)

    Returns:
        True if the theme was found (and, for lists, removed in place);
        False otherwise.

    Note:
        For single-theme files the whole document *is* the theme, so the
        caller is responsible for deleting the file — this function only
        reports the match.
    """
    # Single-theme file: the entire document is one theme dict.
    if isinstance(yaml_data, dict):
        return yaml_data.get('id') == theme_id

    # Multi-theme structure (future-proofing): drop the first matching
    # entry from the list in place.
    if isinstance(yaml_data, list):
        for index, entry in enumerate(yaml_data):
            if isinstance(entry, dict) and entry.get('id') == theme_id:
                del yaml_data[index]
                return True

    return False
|
||||
|
||||
|
||||
def strip_catalog_themes(
    catalog_dir: Path,
    themes_to_strip: Set[str],
    backup: bool = True
) -> Dict[str, Any]:
    """
    Strip low-card themes from YAML catalog files.

    Under the current one-theme-per-file layout, a matching file is
    backed up (optionally) and then deleted outright.

    Args:
        catalog_dir: Directory containing theme catalog YAML files
        themes_to_strip: Set of theme IDs to remove (exact match against
            each file's 'id' field)
        backup: Whether to create timestamped backups before modification

    Returns:
        Dictionary with stripping results:
            - "stripped_count": Number of themes stripped
            - "files_modified": List of file paths modified
              (NOTE(review): never populated in this implementation —
              matching single-theme files are deleted, not edited, so
              this list stays empty; callers should not rely on it)
            - "files_deleted": List of file paths deleted (single-theme files)
            - "backups_created": List of backup file paths
            - "errors": List of error messages

    Raises:
        RuntimeError: If PyYAML is not installed.
        FileNotFoundError: If catalog_dir does not exist.

    Example:
        results = strip_catalog_themes(
            Path("config/themes/catalog"),
            {"daybound", "miracle"},
            backup=True
        )
        # Results: {"stripped_count": 2, "files_modified": [...], ...}
    """
    if yaml is None:
        raise RuntimeError("PyYAML not installed - cannot strip catalog themes")

    if not catalog_dir.exists():
        raise FileNotFoundError(f"Catalog directory does not exist: {catalog_dir}")

    # Accumulators returned to the caller; errors are collected rather
    # than raised so one bad file cannot abort the whole pass.
    results = {
        "stripped_count": 0,
        "files_modified": [],
        "files_deleted": [],
        "backups_created": [],
        "errors": []
    }

    # Find all YAML files in catalog directory (non-recursive, .yml only)
    yaml_files = sorted(catalog_dir.glob("*.yml"))

    for yaml_file in yaml_files:
        try:
            # Load YAML content
            content = yaml_file.read_text(encoding='utf-8')
            data = yaml.safe_load(content)

            if not isinstance(data, dict):
                continue  # Skip non-dict files (lists, scalars, empty docs)

            # Exact match on the file's 'id' field.
            # NOTE(review): the analyzer normalizes IDs to
            # lowercase_with_underscores — presumably catalog 'id' values
            # follow the same convention; confirm, or normalize here too.
            theme_id = data.get('id')
            if not theme_id or theme_id not in themes_to_strip:
                continue  # Skip if theme not in strip list

            # Create backup before modification
            if backup:
                try:
                    backup_path = backup_catalog_file(yaml_file)
                    results["backups_created"].append(str(backup_path))
                except Exception as e:
                    results["errors"].append(f"Backup failed for {yaml_file.name}: {e}")
                    # Continue anyway - modification is important

            # For single-theme files, delete the file entirely
            # (Current catalog structure: one theme per file)
            yaml_file.unlink()
            results["stripped_count"] += 1
            results["files_deleted"].append(str(yaml_file))

        except yaml.YAMLError as e:
            # Malformed YAML: record and move on to the next file
            results["errors"].append(f"YAML parse error in {yaml_file.name}: {e}")
        except Exception as e:
            # Catch-all for I/O and unexpected failures on this file
            results["errors"].append(f"Error processing {yaml_file.name}: {e}")

    return results
|
||||
|
||||
|
||||
def create_stripped_themes_log(
    output_path: Path,
    theme_counts: Dict[str, Set[str]],
    themes_stripped: Set[str],
    min_threshold: int,
    sources: Optional[List[str]] = None
) -> None:
    """
    Write a YAML report describing every theme that was stripped.

    Args:
        output_path: Path where stripped_themes.yml will be written
        theme_counts: Dictionary mapping theme ID to set of card names
        themes_stripped: Set of theme IDs that were stripped
        min_threshold: The minimum card threshold used for stripping
        sources: Optional list of sources themes were stripped from

    Raises:
        RuntimeError: If PyYAML is not installed.

    Output structure:
        metadata:
            last_updated: "2026-03-19T12:30:00"
            min_card_threshold: 5
            total_stripped: 42
        stripped_themes:
            - theme_id: "daybound"
              display_name: "Daybound"
              card_count: 3
              cards: [...]
              reason: "Below minimum card threshold (3 < 5)"
              stripped_from: [...]
    """
    if yaml is None:
        raise RuntimeError("PyYAML not installed - cannot create stripped themes log")

    # Default provenance list used when the caller does not supply one.
    default_sources = ["catalog YAML", "theme_list.json", "parquet files"]

    entries: List[Dict[str, Any]] = []
    for theme_id in sorted(themes_stripped):
        # Only report themes we actually have card data for.
        if theme_id not in theme_counts:
            continue

        cards = sorted(theme_counts[theme_id])
        count = len(cards)

        entries.append({
            'theme_id': theme_id,
            # Human-readable name: underscores to spaces, title-cased.
            'display_name': theme_id.replace('_', ' ').title(),
            'card_count': count,
            'cards': cards,
            'reason': f"Below minimum card threshold ({count} < {min_threshold})",
            'stripped_from': sources if sources else default_sources
        })

    # Smallest themes first; ties broken alphabetically by ID.
    entries.sort(key=lambda entry: (entry['card_count'], entry['theme_id']))

    log_data = {
        'metadata': {
            'last_updated': datetime.now().isoformat(),
            'min_card_threshold': min_threshold,
            'total_stripped': len(entries)
        },
        'stripped_themes': entries
    }

    # Ensure the destination directory exists, then emit YAML in insertion
    # order (sort_keys=False keeps metadata before stripped_themes).
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(log_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, indent=2)

    print(f"Stripped themes log written to {output_path}")
|
||||
|
||||
# ----------------------------------------------------------------------------------
|
||||
# M4: Parquet File Stripping
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
def backup_parquet_file(file_path: Path) -> Path:
    """
    Copy a parquet file to a timestamped ``.parquet.bak`` sibling.

    Args:
        file_path: Path to the parquet file to backup

    Returns:
        Path to the backup file created

    Raises:
        FileNotFoundError: If file_path does not exist.

    Example:
        all_cards.parquet -> all_cards_20260319_143025.parquet.bak
    """
    import shutil

    if not file_path.exists():
        raise FileNotFoundError(f"Cannot backup non-existent file: {file_path}")

    # Timestamp makes repeated backups of the same file non-colliding
    # (down to one-second resolution).
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = file_path.with_name(f"{file_path.stem}_{stamp}.parquet.bak")

    # copy2 preserves file metadata (mtime etc.) alongside the contents.
    shutil.copy2(file_path, backup_path)
    return backup_path
|
||||
|
||||
|
||||
def filter_theme_tags(theme_tags: Any, themes_to_strip: Set[str]) -> List[str]:
    """
    Return theme_tags with every theme in themes_to_strip removed.

    Args:
        theme_tags: Can be numpy array, list, or string
        themes_to_strip: Set of theme IDs to remove (case-insensitive matching)

    Returns:
        Filtered list of theme tags

    Note:
        String inputs may be pipe- or comma-delimited; matching against
        themes_to_strip is case-insensitive for robustness.
    """
    # Step 1: normalize the input into a plain list of tag values.
    if isinstance(theme_tags, str):
        # Pick the delimiter: pipe takes precedence over comma.
        delimiter = '|' if '|' in theme_tags else (',' if ',' in theme_tags else None)
        if delimiter is None:
            raw_tags = [theme_tags] if theme_tags else []
        else:
            raw_tags = [piece.strip() for piece in theme_tags.split(delimiter) if piece.strip()]
    elif isinstance(theme_tags, np.ndarray):
        raw_tags = theme_tags.tolist()
    elif isinstance(theme_tags, list):
        raw_tags = theme_tags
    else:
        # Unknown type (None, scalar, ...) carries no tags.
        raw_tags = []

    # Step 2: drop any tag whose lowercase form is in the strip set.
    strip_lowered = {name.lower() for name in themes_to_strip}
    return [tag for tag in raw_tags if str(tag).lower() not in strip_lowered]
|
||||
|
||||
|
||||
def update_parquet_theme_tags(df: pd.DataFrame, themes_to_strip: Set[str]) -> pd.DataFrame:
    """
    Strip the given themes from every row of df's themeTags column.

    Args:
        df: DataFrame with themeTags column
        themes_to_strip: Set of theme IDs to remove

    Returns:
        The same DataFrame, modified in place (returned for chaining).

    Note:
        If the themeTags column is absent, prints a warning and returns
        df unchanged.
    """
    if 'themeTags' not in df.columns:
        print("Warning: themeTags column not found in dataframe")
        return df

    # Rebuild the column row by row through the shared filter helper;
    # positional assignment keeps row order intact.
    df['themeTags'] = [filter_theme_tags(row_tags, themes_to_strip) for row_tags in df['themeTags']]
    return df
|
||||
|
||||
|
||||
def strip_parquet_themes(
    parquet_path: Path,
    themes_to_strip: Set[str],
    backup: bool = True
) -> Dict[str, Any]:
    """
    Strip low-card themes from parquet file's themeTags column.

    Args:
        parquet_path: Path to parquet file
        themes_to_strip: Set of theme IDs to remove
        backup: Whether to create timestamped backup before modification

    Returns:
        Dictionary with stripping results:
            - "cards_processed": Total number of cards
            - "cards_modified": Number of cards that lost at least one tag
            - "tags_removed": Total number of tag removals
            - "backup_created": Backup file path (if backup=True)
            - "errors": List of error messages

    Raises:
        FileNotFoundError: If parquet_path does not exist.

    Example:
        results = strip_parquet_themes(
            Path("card_files/processed/all_cards.parquet"),
            {"fateseal", "gravestorm"},
            backup=True
        )
    """
    if not parquet_path.exists():
        raise FileNotFoundError(f"Parquet file does not exist: {parquet_path}")

    results: Dict[str, Any] = {
        "cards_processed": 0,
        "cards_modified": 0,
        "tags_removed": 0,
        "backup_created": None,
        "errors": []
    }

    try:
        # Load parquet
        df = pd.read_parquet(parquet_path, engine='pyarrow')
        results["cards_processed"] = len(df)

        # Create backup before modification
        if backup:
            try:
                backup_path = backup_parquet_file(parquet_path)
                results["backup_created"] = str(backup_path)
                print(f"Created backup: {backup_path}")
            except Exception as e:
                results["errors"].append(f"Backup failed: {e}")
                # Continue anyway - modification is important

        if 'themeTags' in df.columns:
            # Normalize each row through filter_theme_tags with an empty
            # strip set to get exact per-row tag counts regardless of
            # storage format (list, numpy array, or delimited string).
            # The previous version counted only list/ndarray rows, so
            # string-formatted rows skewed the totals.
            counts_before = [len(filter_theme_tags(tags, set())) for tags in df['themeTags']]

            # Apply the actual filtering (mutates df in place).
            update_parquet_theme_tags(df, themes_to_strip)

            # After filtering, every row holds a plain list.
            counts_after = [len(tags) for tags in df['themeTags']]

            removed_per_card = [before - after for before, after in zip(counts_before, counts_after)]
            results["tags_removed"] = sum(removed_per_card)
            # Exact count of cards that changed. (Previously this field was
            # set to tags_removed, over-counting cards that lost more than
            # one tag.)
            results["cards_modified"] = sum(1 for removed in removed_per_card if removed > 0)

            if results["tags_removed"] > 0:
                print(f"Stripped {results['tags_removed']} tag occurrences from {results['cards_processed']} cards")
        else:
            results["errors"].append("themeTags column not found in parquet file")
            return results

        # Write modified parquet back
        df.to_parquet(parquet_path, engine='pyarrow', index=False)
        print(f"Updated {parquet_path}")

    except Exception as e:
        results["errors"].append(f"Error processing parquet: {e}")

    return results
|
||||
Loading…
Add table
Add a link
Reference in a new issue