feat: implement theme stripping system with THEME_MIN_CARDS config (#55)
Some checks are pending
CI / build (push) Waiting to run

* feat: implement theme stripping system with THEME_MIN_CARDS config

* fix: call build_catalog directly to avoid argparse conflicts in CI
This commit is contained in:
mwisnowski 2026-03-19 15:27:17 -07:00 committed by GitHub
parent 1ebc2fcb3c
commit 03e2846882
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 6613 additions and 1364 deletions

View file

@ -0,0 +1,207 @@
"""
Theme Distribution Analysis Script
Analyzes theme distribution across the card catalog and generates reports
showing which themes would be stripped based on minimum card thresholds.
Usage:
python -m code.scripts.analyze_theme_distribution [--min-cards N] [--output FILE]
Arguments:
--min-cards N Minimum card threshold (default: from THEME_MIN_CARDS setting)
--output FILE Output file path (default: logs/theme_stripping_analysis.txt)
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, Set
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from code.settings import THEME_MIN_CARDS, CARD_FILES_PROCESSED_DIR
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
get_theme_distribution,
get_themes_by_count
)
def analyze_theme_distribution(min_cards: int | None = None, output_path: str | None = None) -> None:
    """
    Analyze theme distribution and generate a report.

    Builds a theme -> card-count mapping from the processed parquet files,
    identifies themes below the minimum card threshold, and writes a text
    report plus a console summary.

    Args:
        min_cards: Minimum card threshold (defaults to THEME_MIN_CARDS setting)
        output_path: Path to output file (defaults to logs/theme_stripping_analysis.txt)
    """
    if min_cards is None:
        min_cards = THEME_MIN_CARDS
    if output_path is None:
        output_path = "logs/theme_stripping_analysis.txt"

    print(f"Analyzing theme distribution (min_cards={min_cards})...")

    # Find all parquet files; without them there is nothing to analyze.
    processed_dir = Path(CARD_FILES_PROCESSED_DIR)
    if not processed_dir.exists():
        print(f"Error: Processed cards directory not found: {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)
    parquet_files = list(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)
    print(f"Found {len(parquet_files)} parquet files to analyze")

    # Build theme counts
    print("Building theme -> card count mapping...")
    theme_counts = get_theme_card_counts(parquet_files)
    if not theme_counts:
        print("Error: No themes found in parquet files")
        sys.exit(1)
    print(f"Found {len(theme_counts)} unique themes")

    # Derive the three views of the data used by the report.
    themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    distribution = get_theme_distribution(theme_counts)
    below_threshold = get_themes_by_count(theme_counts, min_cards)

    # Generate report
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        # Header
        f.write("=" * 80 + "\n")
        f.write("THEME DISTRIBUTION ANALYSIS REPORT\n")
        f.write("=" * 80 + "\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Minimum Card Threshold: {min_cards}\n")
        f.write(f"Source: {processed_dir}\n")
        f.write(f"Parquet Files Analyzed: {len(parquet_files)}\n")
        f.write("=" * 80 + "\n\n")
        # Summary statistics. theme_counts is non-empty (checked above), so
        # distribution['total'] >= 1 and the percentage division is safe.
        f.write("SUMMARY STATISTICS\n")
        f.write("-" * 80 + "\n")
        f.write(f"Total Themes: {distribution['total']}\n")
        f.write(f"Themes to Strip (< {min_cards} cards): {len(themes_to_strip)}\n")
        f.write(f"Themes to Keep (>= {min_cards} cards): {distribution['total'] - len(themes_to_strip)}\n")
        f.write(f"Percentage to Strip: {len(themes_to_strip) / distribution['total'] * 100:.1f}%\n")
        f.write("\n")
        # Distribution by card count
        f.write("DISTRIBUTION BY CARD COUNT\n")
        f.write("-" * 80 + "\n")
        f.write(f" 1 card: {distribution['1_card']:4d} themes\n")
        f.write(f" 2 cards: {distribution['2_cards']:4d} themes\n")
        f.write(f" 3-4 cards: {distribution['3_4_cards']:4d} themes\n")
        f.write(f" 5-9 cards: {distribution['5_9_cards']:4d} themes\n")
        f.write(f" 10+ cards: {distribution['10_plus']:4d} themes\n")
        f.write(f" Total: {distribution['total']:4d} themes\n")
        f.write("\n")
        # Themes below threshold, listed with their full card lists
        if below_threshold:
            f.write(f"THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write(f"Total: {len(below_threshold)} themes\n\n")
            for theme_id, count, card_list in below_threshold:
                f.write(f"Theme: {theme_id}\n")
                f.write(f"Card Count: {count}\n")
                f.write("Cards:\n")  # plain string: no placeholders needed
                for card in card_list:
                    f.write(f" - {card}\n")
                f.write("\n")
        else:
            f.write(f"NO THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write("All themes meet the minimum card requirement.\n\n")
        # Recommendations
        f.write("RECOMMENDATIONS\n")
        f.write("=" * 80 + "\n")
        if len(themes_to_strip) > 0:
            # Bullet added for consistency with the sibling recommendation lines.
            f.write(f"• {len(themes_to_strip)} themes should be stripped\n")
            f.write(f"• This represents {len(themes_to_strip) / distribution['total'] * 100:.1f}% of the catalog\n")
            f.write("• Run theme stripping to remove these low-viability themes\n")
            f.write("• Consider adjusting THEME_MIN_CARDS if this seems too aggressive\n")
        else:
            f.write(f"• No themes below threshold (all themes have >= {min_cards} cards)\n")
            f.write("• Consider lowering THEME_MIN_CARDS if you want to strip more themes\n")
        f.write("\n")
        # Footer
        f.write("=" * 80 + "\n")
        f.write("END OF REPORT\n")
        f.write("=" * 80 + "\n")

    print(f"\nReport generated: {output_file}")
    print("\nSummary:")
    print(f" Total themes: {distribution['total']}")
    print(f" Themes to strip: {len(themes_to_strip)} ({len(themes_to_strip) / distribution['total'] * 100:.1f}%)")
    print(f" Themes to keep: {distribution['total'] - len(themes_to_strip)}")
    # Print distribution
    print("\nDistribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
def main():
    """CLI entry point: parse arguments and run the distribution analysis."""
    arg_parser = argparse.ArgumentParser(
        description="Analyze theme distribution and identify themes below minimum card threshold"
    )
    arg_parser.add_argument(
        '--min-cards',
        type=int,
        default=None,
        help=f'Minimum card threshold (default: {THEME_MIN_CARDS} from THEME_MIN_CARDS setting)'
    )
    arg_parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output file path (default: logs/theme_stripping_analysis.txt)'
    )
    opts = arg_parser.parse_args()
    try:
        # None values let analyze_theme_distribution fall back to its defaults.
        analyze_theme_distribution(min_cards=opts.min_cards, output_path=opts.output)
    except KeyboardInterrupt:
        print("\nAnalysis cancelled by user")
        sys.exit(1)
    except Exception as e:
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()

View file

@ -34,6 +34,14 @@ try: # Optional
except Exception: # pragma: no cover
yaml = None
# Import settings for THEME_MIN_CARDS threshold
# Import at module level to avoid stdlib 'code' conflict when running as script
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
if ROOT not in sys.path:
sys.path.insert(0, ROOT)
from code import settings as code_settings
try:
# Support running as `python code/scripts/build_theme_catalog.py` when 'code' already on path
from scripts.extract_themes import (
@ -166,17 +174,29 @@ def load_catalog_yaml(verbose: bool) -> Dict[str, ThemeYAML]:
def regenerate_analytics(verbose: bool):
"""
Regenerate theme analytics from parquet data, constants, and tagger source.
Now reads from parquet files instead of CSV. Applies THEME_MIN_CARDS filtering
to exclude themes with too few cards.
Args:
verbose: Whether to print detailed progress
Returns:
Tuple of (theme_tags, selected_synergies, taxonomy)
"""
theme_tags: Set[str] = set()
theme_tags |= collect_theme_tags_from_constants()
theme_tags |= collect_theme_tags_from_tagger_source()
try:
csv_rows = gather_theme_tag_rows()
for row_tags in csv_rows:
for t in row_tags:
if isinstance(t, str) and t:
theme_tags.add(t)
except Exception:
csv_rows = []
# M3: Read from parquet (no longer silent fail)
# Fail loudly if parquet read fails - this is a critical error
parquet_rows = gather_theme_tag_rows()
for row_tags in parquet_rows:
for t in row_tags:
if isinstance(t, str) and t:
theme_tags.add(t)
whitelist = load_whitelist_config()
normalization_map: Dict[str, str] = whitelist.get('normalization', {}) if isinstance(whitelist.get('normalization'), dict) else {}
@ -190,10 +210,8 @@ def regenerate_analytics(verbose: bool):
blacklist = {"Draw Triggers"}
theme_tags = {t for t in theme_tags if t and t not in blacklist and t not in exclusions}
try:
frequencies = tally_tag_frequencies_by_base_color()
except Exception:
frequencies = {}
# M3: Read frequencies from parquet (fail loudly)
frequencies = tally_tag_frequencies_by_base_color()
if frequencies:
def total_count(t: str) -> int:
@ -204,19 +222,40 @@ def regenerate_analytics(verbose: bool):
except Exception:
pass
return s
kept: Set[str] = set()
# M3: Apply THEME_MIN_CARDS filtering
min_cards = getattr(code_settings, 'THEME_MIN_CARDS', 5)
if verbose:
print(f"Applying THEME_MIN_CARDS filter (threshold: {min_cards} cards)")
themes_before_filter = len(theme_tags)
for t in list(theme_tags):
if should_keep_theme(t, total_count(t), whitelist, protected_prefixes, protected_suffixes, min_overrides):
kept.add(t)
count = total_count(t)
# Check both should_keep_theme (whitelist logic) AND THEME_MIN_CARDS threshold
if should_keep_theme(t, count, whitelist, protected_prefixes, protected_suffixes, min_overrides):
# Additional check: must meet minimum card threshold
if count >= min_cards:
kept.add(t)
elif verbose:
print(f" Filtered out '{t}' ({count} cards < {min_cards} threshold)")
# Always include whitelist themes (override threshold)
for extra in whitelist.get('always_include', []) or []:
kept.add(str(extra))
theme_tags = kept
if verbose:
themes_after_filter = len(theme_tags)
filtered_count = themes_before_filter - themes_after_filter
print(f"Filtered {filtered_count} themes below threshold ({themes_after_filter} remain)")
try:
rows = csv_rows if csv_rows else gather_theme_tag_rows()
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
except Exception:
co_map, tag_counts, total_rows = {}, Counter(), 0
# M3: Compute co-occurrence from parquet data (fail loudly)
rows = parquet_rows if parquet_rows else gather_theme_tag_rows()
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
return dict(theme_tags=theme_tags, frequencies=frequencies, co_map=co_map, tag_counts=tag_counts, total_rows=total_rows, whitelist=whitelist)

View file

@ -6,6 +6,7 @@ from collections import Counter
from typing import Dict, List, Set, Any
import pandas as pd
import numpy as np
import itertools
import math
try:
@ -20,6 +21,7 @@ if ROOT not in sys.path:
from code.settings import CSV_DIRECTORY
from code.tagging import tag_constants
from code.path_util import get_processed_cards_path
BASE_COLORS = {
'white': 'W',
@ -88,83 +90,113 @@ def collect_theme_tags_from_tagger_source() -> Set[str]:
def tally_tag_frequencies_by_base_color() -> Dict[str, Dict[str, int]]:
"""
Tally theme tag frequencies by base color from parquet files.
Note: This function now reads from card_files/processed/all_cards.parquet
instead of per-color CSV files. The CSV files no longer exist after the
parquet migration.
Returns:
Dictionary mapping color names to Counter of tag frequencies
"""
result: Dict[str, Dict[str, int]] = {c: Counter() for c in BASE_COLORS.keys()}
# Iterate over per-color CSVs; if not present, skip
for color in BASE_COLORS.keys():
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
if not os.path.exists(path):
# Load from all_cards.parquet
parquet_path = get_processed_cards_path()
if not os.path.exists(parquet_path):
print(f"Warning: Parquet file not found: {parquet_path}")
return {k: dict(v) for k, v in result.items()}
try:
df = pd.read_parquet(parquet_path, columns=['themeTags', 'colorIdentity'], engine='pyarrow')
except Exception as e:
print(f"Error reading parquet file: {e}")
return {k: dict(v) for k, v in result.items()}
if 'themeTags' not in df.columns:
print("Warning: themeTags column not found in parquet file")
return {k: dict(v) for k, v in result.items()}
# Iterate rows and tally tags by base color
for _, row in df.iterrows():
# Parquet stores themeTags as numpy array
tags = row.get('themeTags')
if not isinstance(tags, (list, np.ndarray)):
continue
try:
df = pd.read_csv(path, converters={'themeTags': pd.eval, 'colorIdentity': pd.eval})
except Exception:
df = pd.read_csv(path)
if 'themeTags' in df.columns:
try:
df['themeTags'] = df['themeTags'].apply(pd.eval)
except Exception:
df['themeTags'] = df['themeTags'].apply(lambda x: [])
if 'colorIdentity' in df.columns:
try:
df['colorIdentity'] = df['colorIdentity'].apply(pd.eval)
except Exception:
pass
if 'themeTags' not in df.columns:
if isinstance(tags, np.ndarray):
tags = tags.tolist()
# Get color identity (stored as string like "W", "UB", "WUG", etc.)
ci = row.get('colorIdentity')
if isinstance(ci, np.ndarray):
ci = ci.tolist()
# Convert colorIdentity to set of letters
if isinstance(ci, str):
letters = set(ci) # "WUG" -> {'W', 'U', 'G'}
elif isinstance(ci, list):
letters = set(ci) # ['W', 'U', 'G'] -> {'W', 'U', 'G'}
else:
letters = set()
# Determine base colors from color identity
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
if not bases:
# Colorless cards don't contribute to any specific color
continue
# Derive base colors from colorIdentity if available, else assume single color file
def rows_base_colors(row):
ids = row.get('colorIdentity') if isinstance(row, dict) else row
if isinstance(ids, list):
letters = set(ids)
else:
letters = set()
derived = set()
for name, letter in BASE_COLORS.items():
if letter in letters:
derived.add(name)
if not derived:
derived.add(color)
return derived
# Iterate rows
for _, row in df.iterrows():
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
# Compute base colors contribution
ci = row['colorIdentity'] if 'colorIdentity' in row else None
letters = set(ci) if isinstance(ci, list) else set()
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
if not bases:
bases = {color}
for bc in bases:
for t in tags:
result[bc][t] += 1
# Tally tags for each base color this card belongs to
for base_color in bases:
for tag in tags:
if isinstance(tag, str) and tag:
result[base_color][tag] += 1
# Convert Counters to plain dicts
return {k: dict(v) for k, v in result.items()}
def gather_theme_tag_rows() -> List[List[str]]:
"""Collect per-card themeTags lists across all base color CSVs.
"""
Collect per-card themeTags lists from parquet file.
Note: This function now reads from card_files/processed/all_cards.parquet
instead of per-color CSV files. The CSV files no longer exist after the
parquet migration.
Returns a list of themeTags arrays, one per card row where themeTags is present.
Returns:
List of themeTags arrays, one per card row where themeTags is present.
"""
rows: List[List[str]] = []
for color in BASE_COLORS.keys():
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
if not os.path.exists(path):
continue
try:
df = pd.read_csv(path, converters={'themeTags': pd.eval})
except Exception:
df = pd.read_csv(path)
if 'themeTags' in df.columns:
try:
df['themeTags'] = df['themeTags'].apply(pd.eval)
except Exception:
df['themeTags'] = df['themeTags'].apply(lambda x: [])
if 'themeTags' not in df.columns:
continue
for _, row in df.iterrows():
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
if tags:
rows.append(tags)
# Load from all_cards.parquet
parquet_path = get_processed_cards_path()
if not os.path.exists(parquet_path):
print(f"Warning: Parquet file not found: {parquet_path}")
return rows
try:
df = pd.read_parquet(parquet_path, columns=['themeTags'], engine='pyarrow')
except Exception as e:
print(f"Error reading parquet file: {e}")
return rows
if 'themeTags' not in df.columns:
print("Warning: themeTags column not found in parquet file")
return rows
# Collect theme tags from each card
for _, row in df.iterrows():
# Parquet stores themeTags as numpy array
tags = row.get('themeTags')
if isinstance(tags, np.ndarray):
tags = tags.tolist()
if isinstance(tags, list) and tags:
# Convert to list of strings (filter out non-strings)
tag_list = [str(t) for t in tags if isinstance(t, str) and t]
if tag_list:
rows.append(tag_list)
return rows

View file

@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Strip Theme Catalog Script
Removes themes with insufficient card counts from the theme catalog YAML files.
Creates backups and logs all stripped themes for reference.
Usage:
python -m code.scripts.strip_catalog_themes [--min-cards N] [--no-backup] [--dry-run]
Options:
--min-cards N Override THEME_MIN_CARDS setting (default: from environment/settings)
--no-backup Skip creating backup files
--dry-run Show what would be stripped without making changes
Example:
python -m code.scripts.strip_catalog_themes
python -m code.scripts.strip_catalog_themes --min-cards 3 --dry-run
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Add project root to path for imports
PROJECT_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(PROJECT_ROOT))
from code import settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_catalog_themes,
create_stripped_themes_log,
get_theme_distribution
)
def main():
    """CLI entry point: strip low-card themes from the catalog YAML files.

    Reads theme card counts from the processed parquet files, reports the
    distribution, and (unless --dry-run) removes below-threshold themes
    from the catalog and writes a stripped-themes log.

    Returns:
        Process exit code (0 on success, 1 on error).
    """
    parser = argparse.ArgumentParser(
        description="Strip themes with insufficient card counts from catalog YAML files"
    )
    parser.add_argument(
        "--min-cards",
        type=int,
        default=settings.THEME_MIN_CARDS,
        help=f"Minimum cards required to keep a theme (default: {settings.THEME_MIN_CARDS})"
    )
    parser.add_argument(
        "--no-backup",
        action="store_true",
        help="Skip creating backup files before modification"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be stripped without making changes"
    )
    args = parser.parse_args()

    # Paths
    processed_dir = Path(settings.CARD_FILES_PROCESSED_DIR)
    catalog_dir = PROJECT_ROOT / 'config' / 'themes' / 'catalog'
    log_dir = PROJECT_ROOT / 'logs'
    stripped_log_path = log_dir / 'stripped_themes.yml'

    print(f"Stripping themes from catalog (min_cards={args.min_cards})")
    print(f"Catalog directory: {catalog_dir}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Step 1: Get theme card counts from parquet files
    print("Step 1: Analyzing theme card counts from parquet files...")
    parquet_files = sorted(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        return 1
    print(f"Found {len(parquet_files)} parquet files")
    theme_counts = get_theme_card_counts(parquet_files)
    print(f"Found {len(theme_counts)} unique themes")
    # Guard against an empty mapping: the percentage math in Step 3 divides
    # by len(theme_counts) and would raise ZeroDivisionError otherwise.
    if not theme_counts:
        print("Error: No themes found in parquet files")
        return 1
    print()

    # Step 2: Get distribution
    distribution = get_theme_distribution(theme_counts)
    print("Theme distribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
    print(f" Total: {distribution['total']:4d} themes")
    print()

    # Step 3: Identify themes to strip
    themes_to_strip = identify_themes_to_strip(theme_counts, args.min_cards)
    themes_to_keep = set(theme_counts.keys()) - themes_to_strip
    print(f"Themes to strip: {len(themes_to_strip)} ({len(themes_to_strip)/len(theme_counts)*100:.1f}%)")
    print(f"Themes to keep: {len(themes_to_keep)} ({len(themes_to_keep)/len(theme_counts)*100:.1f}%)")
    print()

    # Show sample of themes to strip
    if themes_to_strip:
        print("Sample themes to strip (first 10):")
        sample = sorted(themes_to_strip)[:10]
        for theme_id in sample:
            # theme_counts values appear to be collections of card names —
            # len() gives the count, sorted()[:3] a preview.
            count = len(theme_counts[theme_id])
            cards_sample = sorted(theme_counts[theme_id])[:3]
            cards_str = ", ".join(cards_sample)
            if count > 3:
                cards_str += f", ... ({count} total)"
            print(f" - {theme_id} ({count} cards): {cards_str}")
        print()

    if args.dry_run:
        print("DRY RUN: No changes made")
        return 0

    # Step 4: Strip themes from catalog
    print("Step 4: Stripping themes from catalog YAML files...")
    results = strip_catalog_themes(
        catalog_dir=catalog_dir,
        themes_to_strip=themes_to_strip,
        backup=not args.no_backup
    )
    print(f" Stripped: {results['stripped_count']} themes")
    print(f" Files deleted: {len(results['files_deleted'])}")
    print(f" Backups created: {len(results['backups_created'])}")
    if results['errors']:
        print(f" Errors: {len(results['errors'])}")
        for error in results['errors'][:5]:  # Show first 5 errors
            print(f" - {error}")
    print()

    # Step 5: Create stripped themes log
    print("Step 5: Creating stripped themes log...")
    create_stripped_themes_log(
        output_path=stripped_log_path,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=args.min_cards,
        sources=["catalog YAML"]
    )
    print(f" Log written to {stripped_log_path}")
    print()

    print("✅ Catalog stripping complete!")
    print()
    print("Summary:")
    print(f" Total themes analyzed: {len(theme_counts)}")
    print(f" Themes stripped: {len(themes_to_strip)}")
    print(f" Themes remaining: {len(themes_to_keep)}")
    print(f" Catalog files deleted: {len(results['files_deleted'])}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,253 @@
#!/usr/bin/env python3
"""
Strip low-card themes from parquet file themeTags columns.
This script identifies and removes themes below the THEME_MIN_CARDS threshold
from the themeTags column in parquet files. It's part of Milestone 4 (M4) of
the Theme Stripping roadmap (R21).
Usage:
# Dry run to see what would be stripped
python code/scripts/strip_parquet_themes.py --dry-run
# Strip from single parquet file
python code/scripts/strip_parquet_themes.py --file card_files/processed/all_cards.parquet
# Strip from all parquet files in directory
python code/scripts/strip_parquet_themes.py --all
# Specify custom threshold
python code/scripts/strip_parquet_themes.py --threshold 10 --all
Environment Variables:
THEME_MIN_CARDS: Minimum card threshold (default: 5)
Outputs:
- Modified parquet file(s) with stripped themeTags
- Timestamped backup (.parquet.bak) if --backup enabled
- Updated logs/stripped_themes.yml log
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
# Add project root to path
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
from code import settings as code_settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_parquet_themes,
create_stripped_themes_log
)
def find_parquet_files(directory: Path) -> list[Path]:
    """Return every *.parquet file in *directory*, in sorted path order."""
    matches = list(directory.glob("*.parquet"))
    matches.sort()
    return matches
def update_stripped_themes_log(
    theme_counts: dict,
    themes_to_strip: set[str],
    min_cards: int
) -> None:
    """Write logs/stripped_themes.yml reflecting a parquet stripping run."""
    destination = ROOT / "logs" / "stripped_themes.yml"
    # The log records which themes were removed, the threshold used, and
    # that parquet files were the source of this run.
    create_stripped_themes_log(
        output_path=destination,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=min_cards,
        sources=["parquet files"]
    )
    print(f"\nUpdated stripped themes log: {destination}")
def main():
    """CLI entry point: strip low-card themes from parquet themeTags columns.

    Selects the parquet file(s) to process from --file/--all (defaulting to
    all_cards.parquet), identifies themes below the threshold, strips them
    from each file, and updates the stripped-themes log.

    Returns:
        Process exit code (0 on success, 1 on error).
    """
    parser = argparse.ArgumentParser(
        description="Strip low-card themes from parquet themeTags columns",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--file',
        type=Path,
        help='Specific parquet file to process'
    )
    parser.add_argument(
        '--all',
        action='store_true',
        help='Process all parquet files in card_files/processed/'
    )
    parser.add_argument(
        '--threshold',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )
    args = parser.parse_args()

    # Determine threshold. Compare against None (not truthiness) so an
    # explicit `--threshold 0` is honored instead of silently replaced by
    # the default setting.
    min_cards = args.threshold if args.threshold is not None else code_settings.THEME_MIN_CARDS

    # Determine which files to process
    if args.file:
        if not args.file.exists():
            print(f"Error: File not found: {args.file}")
            return 1
        parquet_files = [args.file]
    elif args.all:
        processed_dir = ROOT / "card_files" / "processed"
        parquet_files = find_parquet_files(processed_dir)
        if not parquet_files:
            print(f"No parquet files found in {processed_dir}")
            return 1
    else:
        # Default: process all_cards.parquet
        default_file = ROOT / "card_files" / "processed" / "all_cards.parquet"
        if not default_file.exists():
            print(f"Error: Default file not found: {default_file}")
            print("Use --file or --all to specify files to process")
            return 1
        parquet_files = [default_file]

    print("Theme Stripping Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Files to process: {len(parquet_files)}")
    print(f" Backup enabled: {not args.no_backup}")
    print(f" Dry run: {args.dry_run}")
    print()

    # Get theme card counts from parquet files
    print("Analyzing theme card counts...")
    try:
        theme_counts = get_theme_card_counts(parquet_files)
        print(f"Found {len(theme_counts)} unique themes across files")
    except Exception as e:
        print(f"Error analyzing theme counts: {e}")
        return 1

    # Identify themes to strip
    print("Identifying themes to strip...")
    try:
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    except Exception as e:
        print(f"Error identifying themes to strip: {e}")
        return 1
    if not themes_to_strip:
        print("No themes found below threshold. Nothing to strip.")
        return 0
    print(f"Found {len(themes_to_strip)} themes to strip")
    if args.verbose:
        sample = sorted(themes_to_strip)[:10]
        print(f"Sample themes: {', '.join(sample)}")
        if len(themes_to_strip) > 10:
            print(f" ... and {len(themes_to_strip) - 10} more")
    print()

    # Dry run mode: report the plan without touching any files.
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        for parquet_file in parquet_files:
            print(f"Would process: {parquet_file}")
        print(f"\nWould strip {len(themes_to_strip)} themes from themeTags column")
        return 0

    # Process each parquet file, accumulating totals across files.
    total_results = {
        "files_processed": 0,
        "cards_processed": 0,
        "tags_removed": 0,
        "errors": []
    }
    for parquet_file in parquet_files:
        print(f"Processing: {parquet_file.name}")
        try:
            results = strip_parquet_themes(
                parquet_path=parquet_file,
                themes_to_strip=themes_to_strip,
                backup=not args.no_backup
            )
            total_results["files_processed"] += 1
            total_results["cards_processed"] += results["cards_processed"]
            total_results["tags_removed"] += results["tags_removed"]
            total_results["errors"].extend(results["errors"])
            if args.verbose:
                print(f" Cards: {results['cards_processed']}")
                print(f" Tags removed: {results['tags_removed']}")
                if results["backup_created"]:
                    print(f" Backup: {results['backup_created']}")
        except Exception as e:
            # Record the failure but keep processing the remaining files.
            error_msg = f"Error processing {parquet_file}: {e}"
            print(f" {error_msg}")
            total_results["errors"].append(error_msg)
            continue
    print()

    # Update stripped themes log (best-effort: a logging failure should not
    # mask a successful stripping run).
    try:
        update_stripped_themes_log(theme_counts, themes_to_strip, min_cards)
    except Exception as e:
        print(f"Warning: Failed to update stripped themes log: {e}")

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Files processed: {total_results['files_processed']}")
    print(f"Cards processed: {total_results['cards_processed']}")
    print(f"Tags removed: {total_results['tags_removed']}")
    print(f"Themes stripped: {len(themes_to_strip)}")
    if total_results["errors"]:
        print(f"\nErrors encountered: {len(total_results['errors'])}")
        for error in total_results["errors"]:
            print(f" - {error}")
    else:
        print("\nStripping completed successfully!")
    return 0 if not total_results["errors"] else 1


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,380 @@
#!/usr/bin/env python3
"""
Standalone theme stripping orchestration script.
This script coordinates the complete theme stripping pipeline:
1. Analyze parquet files to identify low-card themes
2. Strip from catalog YAML files (optional)
3. Strip from parquet themeTags columns (optional)
4. Rebuild theme_list.json from stripped parquet data
5. Generate stripped_themes.yml log
Part of Milestone 5 (M5) - Integration & Testing for Theme Stripping (R21).
Usage:
# Dry run to preview changes
python code/scripts/strip_themes.py --dry-run
# Strip everything with default threshold (5 cards)
python code/scripts/strip_themes.py
# Strip only catalog YAML files
python code/scripts/strip_themes.py --sources catalog
# Strip only parquet files
python code/scripts/strip_themes.py --sources parquet
# Custom threshold
python code/scripts/strip_themes.py --min-cards 10
# Skip backups (not recommended)
python code/scripts/strip_themes.py --no-backup
Environment Variables:
THEME_MIN_CARDS: Minimum card threshold (default: 5)
Outputs:
- Modified catalog/*.yml files (if --sources includes catalog)
- Modified parquet files (if --sources includes parquet)
- Regenerated config/themes/theme_list.json
- Updated logs/stripped_themes.yml log
- Timestamped backups (if --backup enabled)
"""
import argparse
import sys
import time
from pathlib import Path
from datetime import datetime
from typing import Set, Dict
# Add project root to path
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
from code import settings as code_settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_catalog_themes,
strip_parquet_themes,
create_stripped_themes_log
)
def strip_all_sources(
    min_cards: int,
    sources: Set[str],
    backup: bool,
    dry_run: bool,
    verbose: bool
) -> Dict:
    """
    Execute complete theme stripping pipeline.

    Pipeline order matters: catalog YAML must be stripped BEFORE rebuilding
    theme_list.json, because the rebuild reads from both parquet data and
    the catalog YAML files.

    Args:
        min_cards: Minimum card count threshold; themes with fewer cards are stripped
        sources: Set of sources to strip ('catalog', 'parquet', or both)
        backup: Whether to create timestamped backups before modification
        dry_run: Preview changes without modifying files
        verbose: Show detailed per-file output

    Returns:
        Dictionary with stripping results and statistics:
        themes_analyzed, themes_to_strip, catalog_stripped (file count),
        parquet_tags_removed, json_regenerated, errors (list of messages)
    """
    start_time = time.time()
    results = {
        "themes_analyzed": 0,
        "themes_to_strip": 0,
        "catalog_stripped": 0,
        "parquet_tags_removed": 0,
        "json_regenerated": False,
        "errors": []
    }
    print("="*70)
    print("THEME STRIPPING PIPELINE")
    print("="*70)
    print("Configuration:")
    print(f"  Minimum cards: {min_cards}")
    print(f"  Sources: {', '.join(sorted(sources))}")
    print(f"  Backup enabled: {backup}")
    print(f"  Dry run: {dry_run}")
    print()
    # Step 1: Analyze parquet files to build the theme -> card-set mapping
    print("Step 1: Analyzing theme card counts...")
    try:
        parquet_dir = ROOT / "card_files" / "processed"
        parquet_files = sorted(parquet_dir.glob("*.parquet"))
        if not parquet_files:
            results["errors"].append("No parquet files found in card_files/processed/")
            return results
        theme_counts = get_theme_card_counts(parquet_files)
        results["themes_analyzed"] = len(theme_counts)
        print(f"  Found {len(theme_counts)} unique themes")
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
        results["themes_to_strip"] = len(themes_to_strip)
        print(f"  Identified {len(themes_to_strip)} themes below threshold")
        if verbose and themes_to_strip:
            sample = sorted(themes_to_strip)[:5]
            print(f"  Sample themes: {', '.join(sample)}")
            if len(themes_to_strip) > 5:
                print(f"  ... and {len(themes_to_strip) - 5} more")
        if not themes_to_strip:
            print("\n✅ No themes below threshold. Nothing to strip.")
            return results
    except Exception as e:
        error_msg = f"Analysis failed: {e}"
        print(error_msg)
        results["errors"].append(error_msg)
        return results
    print()
    # Dry run mode: report what would happen, touch nothing
    if dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        if 'catalog' in sources:
            print("Would strip from catalog YAML files:")
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            yaml_files = sorted(catalog_dir.glob("*.yml"))
            for yaml_file in yaml_files[:5]:
                print(f"  - {yaml_file.name}")
            if len(yaml_files) > 5:
                print(f"  ... and {len(yaml_files) - 5} more")
        if 'parquet' in sources:
            print("\nWould strip from parquet files:")
            for pf in parquet_files[:3]:
                print(f"  - {pf.name}")
            if len(parquet_files) > 3:
                print(f"  ... and {len(parquet_files) - 3} more")
        print(f"\nWould strip {len(themes_to_strip)} themes total")
        print("Would regenerate theme_list.json")
        print("Would update stripped_themes.yml log")
        return results
    # Step 2: Strip from catalog (if requested)
    # NOTE: Catalog YAML must be stripped BEFORE building theme_list.json,
    # otherwise build_theme_catalog.py will read un-stripped themes from YAML
    if 'catalog' in sources:
        print("Step 2: Stripping from catalog YAML files...")
        try:
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            catalog_results = strip_catalog_themes(
                catalog_dir=catalog_dir,
                themes_to_strip=themes_to_strip,
                backup=backup
            )
            # strip_catalog_themes returns "stripped_count", "files_modified",
            # "files_deleted", "backups_created" and "errors". There is no
            # "themes_removed" key (the old code raised KeyError here), and
            # single-theme files are deleted rather than modified, so the
            # file count must include both lists.
            files_touched = len(catalog_results["files_modified"]) + len(catalog_results["files_deleted"])
            results["catalog_stripped"] = files_touched
            if verbose:
                print(f"  Files modified/deleted: {files_touched}")
                print(f"  Themes removed: {catalog_results['stripped_count']}")
                if catalog_results["backups_created"]:
                    print(f"  Backups created: {len(catalog_results['backups_created'])}")
            else:
                print(f"  ✓ Stripped {catalog_results['stripped_count']} themes from {files_touched} files")
            results["errors"].extend(catalog_results["errors"])
        except Exception as e:
            error_msg = f"Catalog stripping failed: {e}"
            print(error_msg)
            results["errors"].append(error_msg)
        print()
    # Step 3: Strip from parquet (if requested)
    if 'parquet' in sources:
        step_num = 3 if 'catalog' in sources else 2
        print(f"Step {step_num}: Stripping from parquet files...")
        try:
            for parquet_file in parquet_files:
                if verbose:
                    print(f"  Processing: {parquet_file.name}")
                parquet_results = strip_parquet_themes(
                    parquet_path=parquet_file,
                    themes_to_strip=themes_to_strip,
                    backup=backup
                )
                results["parquet_tags_removed"] += parquet_results["tags_removed"]
                results["errors"].extend(parquet_results["errors"])
                if verbose and parquet_results["tags_removed"] > 0:
                    print(f"    Removed {parquet_results['tags_removed']} tag occurrences")
            if not verbose:
                print(f"  ✓ Removed {results['parquet_tags_removed']} tag occurrences from {len(parquet_files)} file(s)")
        except Exception as e:
            error_msg = f"Parquet stripping failed: {e}"
            print(error_msg)
            results["errors"].append(error_msg)
        print()
    # Step 4: Rebuild theme_list.json (if parquet was stripped)
    # NOTE: This reads from both parquet AND catalog YAML, so both must be stripped first
    if 'parquet' in sources:
        step_num = 4 if 'catalog' in sources else 3
        print(f"Step {step_num}: Rebuilding theme_list.json...")
        try:
            # Import build script lazily to avoid import cost when unused
            from code.scripts.build_theme_catalog import main as build_main
            # Suppress verbose build output unless --verbose flag
            import io
            import contextlib
            if not verbose:
                with contextlib.redirect_stdout(io.StringIO()):
                    build_main()
            else:
                build_main()
            results["json_regenerated"] = True
            print("  ✓ theme_list.json regenerated")
        except Exception as e:
            error_msg = f"JSON regeneration failed: {e}"
            print(error_msg)
            results["errors"].append(error_msg)
        print()
    # Step 5: Update stripped themes log (step number depends on which sources ran)
    final_step = 5 if ('catalog' in sources and 'parquet' in sources) else (3 if 'catalog' in sources else 4)
    print(f"Step {final_step}: Updating stripped_themes.yml log...")
    try:
        log_path = ROOT / "logs" / "stripped_themes.yml"
        source_labels = []
        if 'catalog' in sources:
            source_labels.append("catalog YAML")
        if 'parquet' in sources:
            source_labels.append("parquet files")
        create_stripped_themes_log(
            output_path=log_path,
            theme_counts=theme_counts,
            themes_stripped=themes_to_strip,
            min_threshold=min_cards,
            sources=source_labels if source_labels else None
        )
        print(f"  ✓ Log updated: {log_path}")
    except Exception as e:
        error_msg = f"Log update failed: {e}"
        print(error_msg)
        results["errors"].append(error_msg)
    # Final summary
    elapsed = time.time() - start_time
    print()
    print("="*70)
    print("SUMMARY")
    print("="*70)
    print(f"Themes analyzed:        {results['themes_analyzed']}")
    print(f"Themes stripped:        {results['themes_to_strip']}")
    if 'catalog' in sources:
        print(f"Catalog files modified: {results['catalog_stripped']}")
    if 'parquet' in sources:
        print(f"Parquet tags removed:   {results['parquet_tags_removed']}")
        print(f"JSON regenerated:       {'Yes' if results['json_regenerated'] else 'No'}")
    print(f"Time elapsed:           {elapsed:.2f}s")
    if results["errors"]:
        print(f"\n⚠️  Errors encountered: {len(results['errors'])}")
        for error in results["errors"]:
            print(f"  - {error}")
    else:
        print("\n✅ Theme stripping completed successfully!")
    return results
def main():
    """CLI entry point: parse arguments, run the stripping pipeline, return exit code (0/1)."""
    parser = argparse.ArgumentParser(
        description="Orchestrate complete theme stripping pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--min-cards',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )
    parser.add_argument(
        '--sources',
        type=str,
        help='Comma-separated list of sources to strip: catalog, parquet, all (default: all)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )
    args = parser.parse_args()
    # Determine threshold.
    # Explicit None check: the old truthiness test treated a legitimate
    # `--min-cards 0` (strip only zero-card themes) as "not provided" and
    # silently fell back to the configured default.
    min_cards = args.min_cards if args.min_cards is not None else code_settings.THEME_MIN_CARDS
    # Determine sources
    if args.sources:
        source_input = args.sources.lower()
        if source_input == 'all':
            sources = {'catalog', 'parquet'}
        else:
            sources = set(s.strip() for s in source_input.split(','))
            valid_sources = {'catalog', 'parquet'}
            invalid = sources - valid_sources
            if invalid:
                print(f"Error: Invalid sources: {', '.join(invalid)}")
                print(f"Valid sources: {', '.join(valid_sources)}, all")
                return 1
    else:
        sources = {'catalog', 'parquet'}  # Default: all sources
    # Execute pipeline
    results = strip_all_sources(
        min_cards=min_cards,
        sources=sources,
        backup=not args.no_backup,
        dry_run=args.dry_run,
        verbose=args.verbose
    )
    # Return exit code: non-zero if any step reported errors
    return 0 if not results["errors"] else 1
# Script entry point: propagate main()'s status code (0 = success, 1 = errors) to the shell.
if __name__ == "__main__":
    sys.exit(main())

View file

@ -156,4 +156,14 @@ SIMILARITY_CACHE_MAX_AGE_DAYS = int(os.getenv('SIMILARITY_CACHE_MAX_AGE_DAYS', '
SIMILARITY_CACHE_DOWNLOAD = os.getenv('SIMILARITY_CACHE_DOWNLOAD', '1').lower() not in ('0', 'false', 'off', 'disabled')

# Batch build feature flag (Build X and Compare)
# (the assignment appeared twice; the redundant duplicate was removed)
ENABLE_BATCH_BUILD = os.getenv('ENABLE_BATCH_BUILD', '1').lower() not in ('0', 'false', 'off', 'disabled')

# ----------------------------------------------------------------------------------
# THEME CATALOG SETTINGS
# ----------------------------------------------------------------------------------
# Minimum number of cards required for a theme to be kept in the system
# Themes with fewer cards will be stripped during setup/tagging
# Set to 1 to keep all themes with at least one card
# Set to 0 to only strip orphaned themes (themes with zero cards)
# max(0, ...) clamps negative env values to 0 (never a negative threshold)
THEME_MIN_CARDS = max(0, int(os.getenv('THEME_MIN_CARDS', '5')))

View file

@ -9,6 +9,7 @@ from pathlib import Path
from typing import DefaultDict, Dict, List, Set
# Third-party imports
import numpy as np
import pandas as pd
@ -151,7 +152,8 @@ def apply_combo_tags(
# Calculate updated counts
updated_counts: Dict[str, int] = {}
if before_hash != after_hash:
updated_counts["total"] = int((df["comboTags"].apply(bool)).sum())
# Use len() > 0 to handle arrays properly (avoid ambiguous truth value)
updated_counts["total"] = int((df["comboTags"].apply(lambda x: len(x) > 0 if isinstance(x, (list, np.ndarray)) else bool(x))).sum())
else:
updated_counts["total"] = 0

View file

@ -6897,6 +6897,112 @@ def run_tagging(parallel: bool = False, max_workers: int | None = None):
logger.info(f"✓ Wrote tagging completion flag to {flag_path}")
except Exception as e:
logger.warning(f"Failed to write tagging completion flag: {e}")
# R21: Theme stripping after tagging (if THEME_MIN_CARDS > 1)
try:
from settings import THEME_MIN_CARDS
if THEME_MIN_CARDS > 1:
logger.info("=" * 80)
logger.info(f"Starting theme stripping (THEME_MIN_CARDS={THEME_MIN_CARDS})")
logger.info("=" * 80)
strip_start = pd.Timestamp.now()
# Import theme stripping functions
from tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_parquet_themes,
strip_catalog_themes,
create_stripped_themes_log
)
# Define project root (tagger.py is in code/tagging/, so go up 2 levels)
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# Step 1: Analyze themes
parquet_dir = Path("card_files/processed")
parquet_files = sorted(parquet_dir.glob("*.parquet"))
logger.info(f"Analyzing {len(parquet_files)} parquet files...")
theme_counts = get_theme_card_counts(parquet_files)
themes_to_strip = identify_themes_to_strip(theme_counts, THEME_MIN_CARDS)
logger.info(f"Found {len(theme_counts)} themes, {len(themes_to_strip)} below threshold")
if themes_to_strip:
# Step 2: Strip from catalog YAML (MUST happen before building JSON)
logger.info("Stripping themes from catalog YAML files...")
catalog_dir = PROJECT_ROOT / "config" / "themes" / "catalog"
if catalog_dir.exists():
catalog_results = strip_catalog_themes(
catalog_dir=catalog_dir,
themes_to_strip=themes_to_strip,
backup=True
)
logger.info(f"✓ Modified {len(catalog_results['files_modified'])} catalog files, stripped {catalog_results['stripped_count']} themes")
else:
logger.info("Catalog directory doesn't exist yet, skipping YAML stripping")
# Step 3: Strip from parquet files
logger.info("Stripping themes from parquet files...")
total_tags_removed = 0
for parquet_file in parquet_files:
results = strip_parquet_themes(
parquet_path=parquet_file,
themes_to_strip=themes_to_strip,
backup=True
)
total_tags_removed += results["tags_removed"]
logger.info(f"✓ Removed {total_tags_removed} theme tag occurrences")
# Step 4: Rebuild theme_list.json from stripped data
logger.info("Rebuilding theme_list.json from stripped parquet and catalog...")
try:
from scripts.build_theme_catalog import build_catalog
import json
from pathlib import Path
# Call build_catalog directly to avoid argparse issues
data = build_catalog(limit=0, verbose=False)
output_path = PROJECT_ROOT / "config" / "themes" / "theme_list.json"
with open(output_path, 'w', encoding='utf-8') as f:
json.dump({k: v for k, v in data.items() if k != 'yaml_catalog'}, f, indent=2, ensure_ascii=False)
logger.info("✓ theme_list.json regenerated from stripped sources")
except Exception as e:
logger.warning(f"Failed to rebuild theme_list.json: {e}")
# Step 5: Update stripped themes log
logger.info("Updating stripped_themes.yml log...")
log_path = PROJECT_ROOT / "logs" / "stripped_themes.yml"
create_stripped_themes_log(
output_path=log_path,
theme_counts=theme_counts,
themes_stripped=themes_to_strip,
min_threshold=THEME_MIN_CARDS,
sources=["parquet files", "catalog YAML"]
)
logger.info(f"✓ Log updated: {log_path}")
strip_duration = (pd.Timestamp.now() - strip_start).total_seconds()
logger.info("=" * 80)
logger.info(f"✓ Theme stripping complete in {strip_duration:.2f}s")
logger.info(f" Themes stripped: {len(themes_to_strip)}")
logger.info(f" Tags removed: {total_tags_removed}")
logger.info("=" * 80)
else:
logger.info("No themes below threshold, skipping stripping")
else:
logger.info(f"Theme stripping disabled (THEME_MIN_CARDS={THEME_MIN_CARDS})")
except Exception as e:
logger.error(f"Theme stripping failed: {e}")
logger.warning("Continuing without theme stripping")

View file

@ -0,0 +1,621 @@
"""
Theme Stripping Module
Provides threshold logic and utilities for identifying and stripping themes
with insufficient card counts from the theme catalog and card data.
This module supports M1-M4 of the Theme Stripping roadmap:
- M1: Threshold logic and theme count analysis
- M2: Theme catalog YAML stripping
- M3: theme_list.json stripping
- M4: Parquet file theme_tags stripping
"""
from __future__ import annotations
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, Set, List, Tuple, Any, Optional
import pandas as pd
import numpy as np
try:
import yaml
except ImportError:
yaml = None # type: ignore
# ----------------------------------------------------------------------------------
# M1: Threshold Logic & Analysis
# ----------------------------------------------------------------------------------
def get_theme_card_counts(parquet_paths: List[Path]) -> Dict[str, Set[str]]:
    """
    Build a mapping of theme -> set of card names from parquet files.

    Args:
        parquet_paths: List of paths to parquet files to analyze

    Returns:
        Dictionary mapping normalized theme ID (lowercase, underscores) to the
        set of card names carrying that theme.

    Example:
        {"lifegain": {"Ajani's Pridemate", "Soul Warden", ...}, ...}
    """
    def _coerce_tags(raw) -> List[str]:
        # Normalize the themeTags cell into a plain list of non-empty strings.
        if isinstance(raw, np.ndarray):
            return [str(t).strip() for t in raw if str(t).strip()]
        if isinstance(raw, str):
            # Strings may be pipe- or comma-separated; pipe takes precedence.
            if '|' in raw:
                return [t.strip() for t in raw.split('|') if t.strip()]
            if ',' in raw:
                return [t.strip() for t in raw.split(',') if t.strip()]
            return [raw.strip()] if raw.strip() else []
        if isinstance(raw, list):
            return [str(t).strip() for t in raw if str(t).strip()]
        return []

    mapping: Dict[str, Set[str]] = {}
    for path in parquet_paths:
        try:
            frame = pd.read_parquet(path)
            for _, row in frame.iterrows():
                card_name = row.get('name', '')
                for tag in _coerce_tags(row.get('themeTags', [])):
                    # Normalize theme ID (lowercase, spaces -> underscores)
                    theme_id = tag.lower().replace(' ', '_')
                    mapping.setdefault(theme_id, set()).add(card_name)
        except Exception as e:
            # Best-effort: a bad file is reported and skipped, not fatal.
            print(f"Warning: Failed to process {path}: {e}")
            continue
    return mapping
def identify_themes_to_strip(
    theme_counts: Dict[str, Set[str]],
    min_cards: int
) -> Set[str]:
    """
    Identify themes that should be stripped based on card count threshold.

    Args:
        theme_counts: Dictionary mapping theme ID to set of card names
        min_cards: Minimum number of cards required to keep a theme

    Returns:
        Set of theme IDs whose card count is strictly below *min_cards*.

    Example:
        >>> counts = {"daybound": {"Card1", "Card2"}, "lifegain": {"Card1", "Card2", "Card3", "Card4", "Card5"}}
        >>> identify_themes_to_strip(counts, 5)
        {'daybound'}
    """
    return {
        theme_id
        for theme_id, cards in theme_counts.items()
        if len(cards) < min_cards
    }
def should_strip_theme(theme: str, card_count: int, min_cards: int) -> bool:
    """
    Determine if a specific theme should be stripped based on threshold.

    Args:
        theme: Theme ID (unused in the decision; kept for call-site clarity)
        card_count: Number of cards with this theme
        min_cards: Minimum threshold

    Returns:
        True when *card_count* is strictly below *min_cards*.
    """
    below_threshold = card_count < min_cards
    return below_threshold
def get_theme_distribution(theme_counts: Dict[str, Set[str]]) -> Dict[str, int]:
    """
    Get distribution of themes by card count buckets.

    Args:
        theme_counts: Dictionary mapping theme ID to set of card names

    Returns:
        Dictionary with distribution statistics:
        - "1_card": themes with exactly 1 card
        - "2_cards": themes with exactly 2 cards
        - "3_4_cards": themes with 3-4 cards
        - "5_9_cards": themes with 5-9 cards
        - "10_plus": everything else (10+; a count of 0 also lands here)
        - "total": total number of themes
    """
    distribution = dict.fromkeys(
        ("1_card", "2_cards", "3_4_cards", "5_9_cards", "10_plus", "total"), 0
    )
    for cards in theme_counts.values():
        n = len(cards)
        distribution["total"] += 1
        if n == 1:
            bucket = "1_card"
        elif n == 2:
            bucket = "2_cards"
        elif 3 <= n <= 4:
            bucket = "3_4_cards"
        elif 5 <= n <= 9:
            bucket = "5_9_cards"
        else:
            bucket = "10_plus"
        distribution[bucket] += 1
    return distribution
def get_themes_by_count(
    theme_counts: Dict[str, Set[str]],
    below_threshold: int
) -> List[Tuple[str, int, List[str]]]:
    """
    Get list of themes below threshold with their counts and card lists.

    Args:
        theme_counts: Dictionary mapping theme ID to set of card names
        below_threshold: Themes with strictly fewer cards than this are listed

    Returns:
        List of (theme_id, card_count, sorted_card_list) tuples, ordered by
        count ascending, then theme ID alphabetically.

    Example:
        [("miracle", 4, ["Temporal Mastery", "Terminus", "Entreat the Angels", "Bonfire"]), ...]
    """
    rows = [
        (theme_id, len(cards), sorted(cards))
        for theme_id, cards in theme_counts.items()
        if len(cards) < below_threshold
    ]
    return sorted(rows, key=lambda row: (row[1], row[0]))
# ----------------------------------------------------------------------------------
# M2: Theme Catalog Stripping
# ----------------------------------------------------------------------------------
def backup_catalog_file(file_path: Path) -> Path:
    """
    Create a timestamped backup of a catalog YAML file.

    Args:
        file_path: Path to the YAML file to backup

    Returns:
        Path to the backup file created

    Raises:
        FileNotFoundError: if *file_path* does not exist

    Example:
        daybound.yml -> daybound_20260319_143025.yml.bak
    """
    if not file_path.exists():
        raise FileNotFoundError(f"Cannot backup non-existent file: {file_path}")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = file_path.stem  # filename without extension
    backup_path = file_path.parent / f"{stem}_{timestamp}.yml.bak"
    # Copy with shutil.copy2 for consistency with backup_parquet_file:
    # it is binary-safe and preserves file metadata (e.g. mtime), unlike
    # the previous read_text/write_text round-trip.
    import shutil
    shutil.copy2(file_path, backup_path)
    return backup_path
def remove_theme_from_catalog(yaml_data: Dict[str, Any], theme_id: str) -> bool:
    """
    Remove a theme entry from catalog YAML data.

    Args:
        yaml_data: Loaded YAML data (dict for single-theme files, list for
            hypothetical multi-theme files)
        theme_id: Theme ID to remove (must match exactly)

    Returns:
        True if theme was found (and, for list data, removed in place),
        False otherwise.

    Note:
        For single-theme files (a dict whose 'id' matches) the dict itself
        cannot be emptied here — the caller is responsible for deleting the
        file. List data is mutated in place.
    """
    if isinstance(yaml_data, dict):
        # Single-theme file: report a match; the caller deletes the file.
        return yaml_data.get('id') == theme_id
    if isinstance(yaml_data, list):
        # Multi-theme structure (future-proofing): drop the matching entry.
        for index, entry in enumerate(yaml_data):
            if isinstance(entry, dict) and entry.get('id') == theme_id:
                del yaml_data[index]
                return True
    return False
def strip_catalog_themes(
    catalog_dir: Path,
    themes_to_strip: Set[str],
    backup: bool = True
) -> Dict[str, Any]:
    """
    Strip low-card themes from YAML catalog files.

    Current catalog layout is one theme per file, so a stripped theme means
    its file is deleted (after an optional timestamped backup).

    Args:
        catalog_dir: Directory containing theme catalog YAML files
        themes_to_strip: Set of theme IDs to remove
        backup: Whether to create timestamped backups before modification

    Returns:
        Dictionary with stripping results:
        - "stripped_count": Number of themes stripped
        - "files_modified": Files rewritten in place (empty for one-theme-per-file layout)
        - "files_deleted": Files removed entirely
        - "backups_created": Backup file paths
        - "errors": Error messages

    Raises:
        RuntimeError: if PyYAML is not installed
        FileNotFoundError: if *catalog_dir* does not exist
    """
    if yaml is None:
        raise RuntimeError("PyYAML not installed - cannot strip catalog themes")
    if not catalog_dir.exists():
        raise FileNotFoundError(f"Catalog directory does not exist: {catalog_dir}")
    results: Dict[str, Any] = {
        "stripped_count": 0,
        "files_modified": [],
        "files_deleted": [],
        "backups_created": [],
        "errors": [],
    }
    for candidate in sorted(catalog_dir.glob("*.yml")):
        try:
            parsed = yaml.safe_load(candidate.read_text(encoding='utf-8'))
            if not isinstance(parsed, dict):
                continue  # Skip non-dict files
            theme_id = parsed.get('id')
            if not theme_id or theme_id not in themes_to_strip:
                continue  # Theme is being kept
            if backup:
                try:
                    results["backups_created"].append(str(backup_catalog_file(candidate)))
                except Exception as exc:
                    # Backup failure is recorded but does not abort the strip.
                    results["errors"].append(f"Backup failed for {candidate.name}: {exc}")
            # Single-theme file: remove it entirely.
            candidate.unlink()
            results["stripped_count"] += 1
            results["files_deleted"].append(str(candidate))
        except yaml.YAMLError as exc:
            results["errors"].append(f"YAML parse error in {candidate.name}: {exc}")
        except Exception as exc:
            results["errors"].append(f"Error processing {candidate.name}: {exc}")
    return results
def create_stripped_themes_log(
    output_path: Path,
    theme_counts: Dict[str, Set[str]],
    themes_stripped: Set[str],
    min_threshold: int,
    sources: Optional[List[str]] = None
) -> None:
    """
    Create a YAML log of stripped themes with metadata.

    Args:
        output_path: Path where stripped_themes.yml will be written
        theme_counts: Dictionary mapping theme ID to set of card names
        themes_stripped: Set of theme IDs that were stripped
        min_threshold: The minimum card threshold used for stripping
        sources: Optional list of sources themes were stripped from

    Raises:
        RuntimeError: if PyYAML is not installed

    Creates a YAML file with structure:
        metadata:
          last_updated: "2026-03-19T12:30:00"
          min_card_threshold: 5
          total_stripped: 42
        stripped_themes:
          - theme_id: "daybound"
            display_name: "Daybound"
            card_count: 3
            cards: [...]
            reason: "Below minimum card threshold (3 < 5)"
            stripped_from: [...]
    """
    if yaml is None:
        raise RuntimeError("PyYAML not installed - cannot create stripped themes log")
    entries: List[Dict[str, Any]] = []
    for theme_id in sorted(themes_stripped):
        cards = theme_counts.get(theme_id)
        if cards is None:
            continue  # No count data for this theme; nothing to report
        count = len(cards)
        entries.append({
            'theme_id': theme_id,
            # Human-readable name: underscores to spaces, title case
            'display_name': theme_id.replace('_', ' ').title(),
            'card_count': count,
            'cards': sorted(cards),
            'reason': f"Below minimum card threshold ({count} < {min_threshold})",
            # Default provenance when the caller does not specify sources
            'stripped_from': sources if sources else ["catalog YAML", "theme_list.json", "parquet files"],
        })
    # Order: fewest cards first, ties broken alphabetically
    entries.sort(key=lambda entry: (entry['card_count'], entry['theme_id']))
    log_data = {
        'metadata': {
            'last_updated': datetime.now().isoformat(),
            'min_card_threshold': min_threshold,
            'total_stripped': len(entries),
        },
        'stripped_themes': entries,
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(log_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, indent=2)
    print(f"Stripped themes log written to {output_path}")
# ----------------------------------------------------------------------------------
# M4: Parquet File Stripping
# ----------------------------------------------------------------------------------
def backup_parquet_file(file_path: Path) -> Path:
    """
    Create a timestamped backup of a parquet file.

    Args:
        file_path: Path to the parquet file to backup

    Returns:
        Path to the backup file created

    Raises:
        FileNotFoundError: if *file_path* does not exist

    Example:
        all_cards.parquet -> all_cards_20260319_143025.parquet.bak
    """
    import shutil
    if not file_path.exists():
        raise FileNotFoundError(f"Cannot backup non-existent file: {file_path}")
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Sibling file: same directory, stem plus timestamp, .parquet.bak suffix
    target = file_path.with_name(f"{file_path.stem}_{stamp}.parquet.bak")
    shutil.copy2(file_path, target)
    return target
def filter_theme_tags(theme_tags: Any, themes_to_strip: Set[str]) -> List[str]:
    """
    Remove specific themes from a themeTags value (handles multiple formats).

    Args:
        theme_tags: Can be numpy array, list, or string
        themes_to_strip: Set of theme IDs to remove (case-insensitive matching)

    Returns:
        Filtered list of theme tags

    Note:
        Matching is case-insensitive for robustness; pipe-separated strings
        take precedence over comma-separated ones.
    """
    if isinstance(theme_tags, np.ndarray):
        candidates = theme_tags.tolist()
    elif isinstance(theme_tags, list):
        candidates = theme_tags
    elif isinstance(theme_tags, str):
        # Pipe checked before comma, matching the parsing used elsewhere.
        for sep in ('|', ','):
            if sep in theme_tags:
                candidates = [part.strip() for part in theme_tags.split(sep) if part.strip()]
                break
        else:
            candidates = [theme_tags] if theme_tags else []
    else:
        candidates = []
    blocked = {name.lower() for name in themes_to_strip}
    return [tag for tag in candidates if str(tag).lower() not in blocked]
def update_parquet_theme_tags(df: pd.DataFrame, themes_to_strip: Set[str]) -> pd.DataFrame:
    """
    Process entire dataframe to remove stripped themes from themeTags column.

    Args:
        df: DataFrame with themeTags column
        themes_to_strip: Set of theme IDs to remove

    Returns:
        The same DataFrame, mutated in place (returned for call-chaining).

    Note:
        Emits a warning and returns the frame untouched when the
        themeTags column is absent.
    """
    if 'themeTags' not in df.columns:
        print("Warning: themeTags column not found in dataframe")
        return df

    def _strip(tags):
        # Delegate per-cell filtering (handles ndarray/list/str formats).
        return filter_theme_tags(tags, themes_to_strip)

    df['themeTags'] = df['themeTags'].apply(_strip)
    return df
def strip_parquet_themes(
    parquet_path: Path,
    themes_to_strip: Set[str],
    backup: bool = True
) -> Dict[str, Any]:
    """
    Strip low-card themes from parquet file's themeTags column.

    The file is modified in place: it is read, filtered, and written back
    (only when the themeTags column is present).

    Args:
        parquet_path: Path to parquet file
        themes_to_strip: Set of theme IDs to remove
        backup: Whether to create timestamped backup before modification

    Returns:
        Dictionary with stripping results:
        - "cards_processed": Total number of cards
        - "cards_modified": Number of cards with tags removed (approximation; see below)
        - "tags_removed": Total number of tag removals
        - "backup_created": Backup file path (if backup=True), else None
        - "errors": List of error messages

    Raises:
        FileNotFoundError: if *parquet_path* does not exist

    Example:
        results = strip_parquet_themes(
            Path("card_files/processed/all_cards.parquet"),
            {"fateseal", "gravestorm"},
            backup=True
        )
    """
    if not parquet_path.exists():
        raise FileNotFoundError(f"Parquet file does not exist: {parquet_path}")
    results = {
        "cards_processed": 0,
        "cards_modified": 0,
        "tags_removed": 0,
        "backup_created": None,
        "errors": []
    }
    try:
        # Load parquet (pyarrow engine for consistency with the write below)
        df = pd.read_parquet(parquet_path, engine='pyarrow')
        results["cards_processed"] = len(df)
        # Create backup before modification
        if backup:
            try:
                backup_path = backup_parquet_file(parquet_path)
                results["backup_created"] = str(backup_path)
                print(f"Created backup: {backup_path}")
            except Exception as e:
                results["errors"].append(f"Backup failed: {e}")
                # Continue anyway - modification is important
        # Track modifications
        if 'themeTags' in df.columns:
            # Count tags before stripping (cells may be list or ndarray here)
            tags_before = sum(
                len(tags) if isinstance(tags, (list, np.ndarray)) else 0
                for tags in df['themeTags']
            )
            # Apply filtering (mutates df in place; cells become plain lists)
            update_parquet_theme_tags(df, themes_to_strip)
            # Count tags after stripping (list-only check is sufficient since
            # update_parquet_theme_tags normalizes every cell to a list)
            tags_after = sum(
                len(tags) if isinstance(tags, list) else 0
                for tags in df['themeTags']
            )
            results["tags_removed"] = tags_before - tags_after
            # Count cards with modifications (cards that had at least one tag removed)
            # NOTE(review): this equates cards_modified with tags_removed, which
            # overcounts when a card loses multiple tags — an approximation.
            if results["tags_removed"] > 0:
                results["cards_modified"] = results["tags_removed"]  # Conservative estimate
            print(f"Stripped {results['tags_removed']} tag occurrences from {results['cards_processed']} cards")
        else:
            results["errors"].append("themeTags column not found in parquet file")
            # Early return: nothing to write back when the column is missing
            return results
        # Write modified parquet back
        df.to_parquet(parquet_path, engine='pyarrow', index=False)
        print(f"Updated {parquet_path}")
    except Exception as e:
        results["errors"].append(f"Error processing parquet: {e}")
    return results