feat: implement theme stripping system with THEME_MIN_CARDS config (#55)
Some checks are pending
CI / build (push) Waiting to run

* feat: implement theme stripping system with THEME_MIN_CARDS config

* fix: call build_catalog directly to avoid argparse conflicts in CI
This commit is contained in:
mwisnowski 2026-03-19 15:27:17 -07:00 committed by GitHub
parent 1ebc2fcb3c
commit 03e2846882
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 6613 additions and 1364 deletions

View file

@ -0,0 +1,207 @@
"""
Theme Distribution Analysis Script
Analyzes theme distribution across the card catalog and generates reports
showing which themes would be stripped based on minimum card thresholds.
Usage:
python -m code.scripts.analyze_theme_distribution [--min-cards N] [--output FILE]
Arguments:
--min-cards N Minimum card threshold (default: from THEME_MIN_CARDS setting)
--output FILE Output file path (default: logs/theme_stripping_analysis.txt)
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, Set
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from code.settings import THEME_MIN_CARDS, CARD_FILES_PROCESSED_DIR
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
get_theme_distribution,
get_themes_by_count
)
def analyze_theme_distribution(min_cards: int | None = None, output_path: str | None = None) -> None:
    """
    Analyze theme distribution and generate a text report.

    Builds a theme -> card-count mapping from the processed parquet files,
    identifies themes below the minimum card threshold, and writes both a
    detailed report file and a console summary.

    Args:
        min_cards: Minimum card threshold (defaults to THEME_MIN_CARDS setting)
        output_path: Path to output file (defaults to logs/theme_stripping_analysis.txt)

    Raises:
        SystemExit: If the processed directory is missing, no parquet files
            are found, or no themes are present in the parquet data.
    """
    # Fall back to project defaults when the caller supplies nothing.
    if min_cards is None:
        min_cards = THEME_MIN_CARDS
    if output_path is None:
        output_path = "logs/theme_stripping_analysis.txt"
    print(f"Analyzing theme distribution (min_cards={min_cards})...")
    # Find all parquet files
    processed_dir = Path(CARD_FILES_PROCESSED_DIR)
    if not processed_dir.exists():
        print(f"Error: Processed cards directory not found: {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)
    parquet_files = list(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)
    print(f"Found {len(parquet_files)} parquet files to analyze")
    # Build theme counts
    print("Building theme -> card count mapping...")
    theme_counts = get_theme_card_counts(parquet_files)
    if not theme_counts:
        print("Error: No themes found in parquet files")
        sys.exit(1)
    print(f"Found {len(theme_counts)} unique themes")
    # Identify themes to strip
    themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    # Get distribution
    distribution = get_theme_distribution(theme_counts)
    # Get themes below threshold
    below_threshold = get_themes_by_count(theme_counts, min_cards)
    # Percentage of the catalog that falls below the threshold; distribution
    # total is > 0 here because theme_counts was verified non-empty above.
    strip_pct = len(themes_to_strip) / distribution['total'] * 100
    # Generate report
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        # Header
        f.write("=" * 80 + "\n")
        f.write("THEME DISTRIBUTION ANALYSIS REPORT\n")
        f.write("=" * 80 + "\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Minimum Card Threshold: {min_cards}\n")
        f.write(f"Source: {processed_dir}\n")
        f.write(f"Parquet Files Analyzed: {len(parquet_files)}\n")
        f.write("=" * 80 + "\n\n")
        # Summary statistics
        f.write("SUMMARY STATISTICS\n")
        f.write("-" * 80 + "\n")
        f.write(f"Total Themes: {distribution['total']}\n")
        f.write(f"Themes to Strip (< {min_cards} cards): {len(themes_to_strip)}\n")
        f.write(f"Themes to Keep (>= {min_cards} cards): {distribution['total'] - len(themes_to_strip)}\n")
        f.write(f"Percentage to Strip: {strip_pct:.1f}%\n")
        f.write("\n")
        # Distribution by card count
        f.write("DISTRIBUTION BY CARD COUNT\n")
        f.write("-" * 80 + "\n")
        f.write(f" 1 card: {distribution['1_card']:4d} themes\n")
        f.write(f" 2 cards: {distribution['2_cards']:4d} themes\n")
        f.write(f" 3-4 cards: {distribution['3_4_cards']:4d} themes\n")
        f.write(f" 5-9 cards: {distribution['5_9_cards']:4d} themes\n")
        f.write(f" 10+ cards: {distribution['10_plus']:4d} themes\n")
        f.write(f" Total: {distribution['total']:4d} themes\n")
        f.write("\n")
        # Themes below threshold: list each theme with its full card list.
        if below_threshold:
            f.write(f"THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write(f"Total: {len(below_threshold)} themes\n\n")
            for theme_id, count, card_list in below_threshold:
                f.write(f"Theme: {theme_id}\n")
                f.write(f"Card Count: {count}\n")
                f.write("Cards:\n")
                for card in card_list:
                    f.write(f" - {card}\n")
                f.write("\n")
        else:
            f.write(f"NO THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write("All themes meet the minimum card requirement.\n\n")
        # Recommendations
        f.write("RECOMMENDATIONS\n")
        f.write("=" * 80 + "\n")
        if len(themes_to_strip) > 0:
            f.write(f"{len(themes_to_strip)} themes should be stripped\n")
            f.write(f"• This represents {strip_pct:.1f}% of the catalog\n")
            f.write("• Run theme stripping to remove these low-viability themes\n")
            f.write("• Consider adjusting THEME_MIN_CARDS if this seems too aggressive\n")
        else:
            f.write(f"• No themes below threshold (all themes have >= {min_cards} cards)\n")
            f.write("• Consider lowering THEME_MIN_CARDS if you want to strip more themes\n")
        f.write("\n")
        # Footer
        f.write("=" * 80 + "\n")
        f.write("END OF REPORT\n")
        f.write("=" * 80 + "\n")
    print(f"\nReport generated: {output_file}")
    print("\nSummary:")
    print(f" Total themes: {distribution['total']}")
    print(f" Themes to strip: {len(themes_to_strip)} ({strip_pct:.1f}%)")
    print(f" Themes to keep: {distribution['total'] - len(themes_to_strip)}")
    # Print distribution
    print("\nDistribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
def main():
    """CLI entry point."""
    arg_parser = argparse.ArgumentParser(
        description="Analyze theme distribution and identify themes below minimum card threshold"
    )
    arg_parser.add_argument(
        '--min-cards',
        type=int,
        default=None,
        help=f'Minimum card threshold (default: {THEME_MIN_CARDS} from THEME_MIN_CARDS setting)'
    )
    arg_parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output file path (default: logs/theme_stripping_analysis.txt)'
    )
    opts = arg_parser.parse_args()
    try:
        # Delegate the actual work; None values fall back to project defaults.
        analyze_theme_distribution(min_cards=opts.min_cards, output_path=opts.output)
    except KeyboardInterrupt:
        print("\nAnalysis cancelled by user")
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report the failure with a traceback and
        # exit non-zero rather than letting the interpreter dump it raw.
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()

View file

@ -34,6 +34,14 @@ try: # Optional
except Exception: # pragma: no cover
yaml = None
# Import settings for THEME_MIN_CARDS threshold
# Import at module level to avoid stdlib 'code' conflict when running as script
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
if ROOT not in sys.path:
sys.path.insert(0, ROOT)
from code import settings as code_settings
try:
# Support running as `python code/scripts/build_theme_catalog.py` when 'code' already on path
from scripts.extract_themes import (
@ -166,17 +174,29 @@ def load_catalog_yaml(verbose: bool) -> Dict[str, ThemeYAML]:
def regenerate_analytics(verbose: bool):
"""
Regenerate theme analytics from parquet data, constants, and tagger source.
Now reads from parquet files instead of CSV. Applies THEME_MIN_CARDS filtering
to exclude themes with too few cards.
Args:
verbose: Whether to print detailed progress
Returns:
Tuple of (theme_tags, selected_synergies, taxonomy)
"""
theme_tags: Set[str] = set()
theme_tags |= collect_theme_tags_from_constants()
theme_tags |= collect_theme_tags_from_tagger_source()
try:
csv_rows = gather_theme_tag_rows()
for row_tags in csv_rows:
for t in row_tags:
if isinstance(t, str) and t:
theme_tags.add(t)
except Exception:
csv_rows = []
# M3: Read from parquet (no longer silent fail)
# Fail loudly if parquet read fails - this is a critical error
parquet_rows = gather_theme_tag_rows()
for row_tags in parquet_rows:
for t in row_tags:
if isinstance(t, str) and t:
theme_tags.add(t)
whitelist = load_whitelist_config()
normalization_map: Dict[str, str] = whitelist.get('normalization', {}) if isinstance(whitelist.get('normalization'), dict) else {}
@ -190,10 +210,8 @@ def regenerate_analytics(verbose: bool):
blacklist = {"Draw Triggers"}
theme_tags = {t for t in theme_tags if t and t not in blacklist and t not in exclusions}
try:
frequencies = tally_tag_frequencies_by_base_color()
except Exception:
frequencies = {}
# M3: Read frequencies from parquet (fail loudly)
frequencies = tally_tag_frequencies_by_base_color()
if frequencies:
def total_count(t: str) -> int:
@ -204,19 +222,40 @@ def regenerate_analytics(verbose: bool):
except Exception:
pass
return s
kept: Set[str] = set()
# M3: Apply THEME_MIN_CARDS filtering
min_cards = getattr(code_settings, 'THEME_MIN_CARDS', 5)
if verbose:
print(f"Applying THEME_MIN_CARDS filter (threshold: {min_cards} cards)")
themes_before_filter = len(theme_tags)
for t in list(theme_tags):
if should_keep_theme(t, total_count(t), whitelist, protected_prefixes, protected_suffixes, min_overrides):
kept.add(t)
count = total_count(t)
# Check both should_keep_theme (whitelist logic) AND THEME_MIN_CARDS threshold
if should_keep_theme(t, count, whitelist, protected_prefixes, protected_suffixes, min_overrides):
# Additional check: must meet minimum card threshold
if count >= min_cards:
kept.add(t)
elif verbose:
print(f" Filtered out '{t}' ({count} cards < {min_cards} threshold)")
# Always include whitelist themes (override threshold)
for extra in whitelist.get('always_include', []) or []:
kept.add(str(extra))
theme_tags = kept
if verbose:
themes_after_filter = len(theme_tags)
filtered_count = themes_before_filter - themes_after_filter
print(f"Filtered {filtered_count} themes below threshold ({themes_after_filter} remain)")
try:
rows = csv_rows if csv_rows else gather_theme_tag_rows()
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
except Exception:
co_map, tag_counts, total_rows = {}, Counter(), 0
# M3: Compute co-occurrence from parquet data (fail loudly)
rows = parquet_rows if parquet_rows else gather_theme_tag_rows()
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
return dict(theme_tags=theme_tags, frequencies=frequencies, co_map=co_map, tag_counts=tag_counts, total_rows=total_rows, whitelist=whitelist)

View file

@ -6,6 +6,7 @@ from collections import Counter
from typing import Dict, List, Set, Any
import pandas as pd
import numpy as np
import itertools
import math
try:
@ -20,6 +21,7 @@ if ROOT not in sys.path:
from code.settings import CSV_DIRECTORY
from code.tagging import tag_constants
from code.path_util import get_processed_cards_path
BASE_COLORS = {
'white': 'W',
@ -88,83 +90,113 @@ def collect_theme_tags_from_tagger_source() -> Set[str]:
def tally_tag_frequencies_by_base_color() -> Dict[str, Dict[str, int]]:
"""
Tally theme tag frequencies by base color from parquet files.
Note: This function now reads from card_files/processed/all_cards.parquet
instead of per-color CSV files. The CSV files no longer exist after the
parquet migration.
Returns:
Dictionary mapping color names to Counter of tag frequencies
"""
result: Dict[str, Dict[str, int]] = {c: Counter() for c in BASE_COLORS.keys()}
# Iterate over per-color CSVs; if not present, skip
for color in BASE_COLORS.keys():
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
if not os.path.exists(path):
# Load from all_cards.parquet
parquet_path = get_processed_cards_path()
if not os.path.exists(parquet_path):
print(f"Warning: Parquet file not found: {parquet_path}")
return {k: dict(v) for k, v in result.items()}
try:
df = pd.read_parquet(parquet_path, columns=['themeTags', 'colorIdentity'], engine='pyarrow')
except Exception as e:
print(f"Error reading parquet file: {e}")
return {k: dict(v) for k, v in result.items()}
if 'themeTags' not in df.columns:
print("Warning: themeTags column not found in parquet file")
return {k: dict(v) for k, v in result.items()}
# Iterate rows and tally tags by base color
for _, row in df.iterrows():
# Parquet stores themeTags as numpy array
tags = row.get('themeTags')
if not isinstance(tags, (list, np.ndarray)):
continue
try:
df = pd.read_csv(path, converters={'themeTags': pd.eval, 'colorIdentity': pd.eval})
except Exception:
df = pd.read_csv(path)
if 'themeTags' in df.columns:
try:
df['themeTags'] = df['themeTags'].apply(pd.eval)
except Exception:
df['themeTags'] = df['themeTags'].apply(lambda x: [])
if 'colorIdentity' in df.columns:
try:
df['colorIdentity'] = df['colorIdentity'].apply(pd.eval)
except Exception:
pass
if 'themeTags' not in df.columns:
if isinstance(tags, np.ndarray):
tags = tags.tolist()
# Get color identity (stored as string like "W", "UB", "WUG", etc.)
ci = row.get('colorIdentity')
if isinstance(ci, np.ndarray):
ci = ci.tolist()
# Convert colorIdentity to set of letters
if isinstance(ci, str):
letters = set(ci) # "WUG" -> {'W', 'U', 'G'}
elif isinstance(ci, list):
letters = set(ci) # ['W', 'U', 'G'] -> {'W', 'U', 'G'}
else:
letters = set()
# Determine base colors from color identity
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
if not bases:
# Colorless cards don't contribute to any specific color
continue
# Derive base colors from colorIdentity if available, else assume single color file
def rows_base_colors(row):
ids = row.get('colorIdentity') if isinstance(row, dict) else row
if isinstance(ids, list):
letters = set(ids)
else:
letters = set()
derived = set()
for name, letter in BASE_COLORS.items():
if letter in letters:
derived.add(name)
if not derived:
derived.add(color)
return derived
# Iterate rows
for _, row in df.iterrows():
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
# Compute base colors contribution
ci = row['colorIdentity'] if 'colorIdentity' in row else None
letters = set(ci) if isinstance(ci, list) else set()
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
if not bases:
bases = {color}
for bc in bases:
for t in tags:
result[bc][t] += 1
# Tally tags for each base color this card belongs to
for base_color in bases:
for tag in tags:
if isinstance(tag, str) and tag:
result[base_color][tag] += 1
# Convert Counters to plain dicts
return {k: dict(v) for k, v in result.items()}
def gather_theme_tag_rows() -> List[List[str]]:
"""Collect per-card themeTags lists across all base color CSVs.
"""
Collect per-card themeTags lists from parquet file.
Note: This function now reads from card_files/processed/all_cards.parquet
instead of per-color CSV files. The CSV files no longer exist after the
parquet migration.
Returns a list of themeTags arrays, one per card row where themeTags is present.
Returns:
List of themeTags arrays, one per card row where themeTags is present.
"""
rows: List[List[str]] = []
for color in BASE_COLORS.keys():
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
if not os.path.exists(path):
continue
try:
df = pd.read_csv(path, converters={'themeTags': pd.eval})
except Exception:
df = pd.read_csv(path)
if 'themeTags' in df.columns:
try:
df['themeTags'] = df['themeTags'].apply(pd.eval)
except Exception:
df['themeTags'] = df['themeTags'].apply(lambda x: [])
if 'themeTags' not in df.columns:
continue
for _, row in df.iterrows():
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
if tags:
rows.append(tags)
# Load from all_cards.parquet
parquet_path = get_processed_cards_path()
if not os.path.exists(parquet_path):
print(f"Warning: Parquet file not found: {parquet_path}")
return rows
try:
df = pd.read_parquet(parquet_path, columns=['themeTags'], engine='pyarrow')
except Exception as e:
print(f"Error reading parquet file: {e}")
return rows
if 'themeTags' not in df.columns:
print("Warning: themeTags column not found in parquet file")
return rows
# Collect theme tags from each card
for _, row in df.iterrows():
# Parquet stores themeTags as numpy array
tags = row.get('themeTags')
if isinstance(tags, np.ndarray):
tags = tags.tolist()
if isinstance(tags, list) and tags:
# Convert to list of strings (filter out non-strings)
tag_list = [str(t) for t in tags if isinstance(t, str) and t]
if tag_list:
rows.append(tag_list)
return rows

View file

@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Strip Theme Catalog Script
Removes themes with insufficient card counts from the theme catalog YAML files.
Creates backups and logs all stripped themes for reference.
Usage:
python -m code.scripts.strip_catalog_themes [--min-cards N] [--no-backup] [--dry-run]
Options:
--min-cards N Override THEME_MIN_CARDS setting (default: from environment/settings)
--no-backup Skip creating backup files
--dry-run Show what would be stripped without making changes
Example:
python -m code.scripts.strip_catalog_themes
python -m code.scripts.strip_catalog_themes --min-cards 3 --dry-run
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Add project root to path for imports
PROJECT_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(PROJECT_ROOT))
from code import settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_catalog_themes,
create_stripped_themes_log,
get_theme_distribution
)
def main() -> int:
    """CLI entry point for stripping low-card themes from catalog YAML files.

    Analyzes theme card counts from the processed parquet files, reports the
    distribution, and (unless --dry-run) removes catalog entries for themes
    below the minimum card threshold, writing a log of what was stripped.

    Returns:
        0 on success (including dry runs), 1 on error.
    """
    parser = argparse.ArgumentParser(
        description="Strip themes with insufficient card counts from catalog YAML files"
    )
    parser.add_argument(
        "--min-cards",
        type=int,
        default=settings.THEME_MIN_CARDS,
        help=f"Minimum cards required to keep a theme (default: {settings.THEME_MIN_CARDS})"
    )
    parser.add_argument(
        "--no-backup",
        action="store_true",
        help="Skip creating backup files before modification"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be stripped without making changes"
    )
    args = parser.parse_args()
    # Paths
    processed_dir = Path(settings.CARD_FILES_PROCESSED_DIR)
    catalog_dir = PROJECT_ROOT / 'config' / 'themes' / 'catalog'
    log_dir = PROJECT_ROOT / 'logs'
    stripped_log_path = log_dir / 'stripped_themes.yml'
    print(f"Stripping themes from catalog (min_cards={args.min_cards})")
    print(f"Catalog directory: {catalog_dir}")
    print(f"Dry run: {args.dry_run}")
    print()
    # Step 1: Get theme card counts from parquet files
    print("Step 1: Analyzing theme card counts from parquet files...")
    parquet_files = sorted(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        return 1
    print(f"Found {len(parquet_files)} parquet files")
    theme_counts = get_theme_card_counts(parquet_files)
    # Guard against an empty mapping: the percentage math below divides by
    # len(theme_counts), which would raise ZeroDivisionError otherwise.
    if not theme_counts:
        print("Error: No themes found in parquet files")
        return 1
    print(f"Found {len(theme_counts)} unique themes")
    print()
    # Step 2: Get distribution
    distribution = get_theme_distribution(theme_counts)
    print("Theme distribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
    print(f" Total: {distribution['total']:4d} themes")
    print()
    # Step 3: Identify themes to strip
    themes_to_strip = identify_themes_to_strip(theme_counts, args.min_cards)
    themes_to_keep = set(theme_counts.keys()) - themes_to_strip
    print(f"Themes to strip: {len(themes_to_strip)} ({len(themes_to_strip)/len(theme_counts)*100:.1f}%)")
    print(f"Themes to keep: {len(themes_to_keep)} ({len(themes_to_keep)/len(theme_counts)*100:.1f}%)")
    print()
    # Show sample of themes to strip
    if themes_to_strip:
        print("Sample themes to strip (first 10):")
        sample = sorted(themes_to_strip)[:10]
        for theme_id in sample:
            count = len(theme_counts[theme_id])
            cards_sample = sorted(theme_counts[theme_id])[:3]
            cards_str = ", ".join(cards_sample)
            if count > 3:
                cards_str += f", ... ({count} total)"
            print(f" - {theme_id} ({count} cards): {cards_str}")
        print()
    if args.dry_run:
        print("DRY RUN: No changes made")
        return 0
    # Step 4: Strip themes from catalog
    print("Step 4: Stripping themes from catalog YAML files...")
    results = strip_catalog_themes(
        catalog_dir=catalog_dir,
        themes_to_strip=themes_to_strip,
        backup=not args.no_backup
    )
    print(f" Stripped: {results['stripped_count']} themes")
    print(f" Files deleted: {len(results['files_deleted'])}")
    print(f" Backups created: {len(results['backups_created'])}")
    if results['errors']:
        print(f" Errors: {len(results['errors'])}")
        for error in results['errors'][:5]:  # Show first 5 errors
            print(f" - {error}")
    print()
    # Step 5: Create stripped themes log
    print("Step 5: Creating stripped themes log...")
    create_stripped_themes_log(
        output_path=stripped_log_path,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=args.min_cards,
        sources=["catalog YAML"]
    )
    print(f" Log written to {stripped_log_path}")
    print()
    print("✅ Catalog stripping complete!")
    print()
    print("Summary:")
    print(f" Total themes analyzed: {len(theme_counts)}")
    print(f" Themes stripped: {len(themes_to_strip)}")
    print(f" Themes remaining: {len(themes_to_keep)}")
    print(f" Catalog files deleted: {len(results['files_deleted'])}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,253 @@
#!/usr/bin/env python3
"""
Strip low-card themes from parquet file themeTags columns.
This script identifies and removes themes below the THEME_MIN_CARDS threshold
from the themeTags column in parquet files. It's part of Milestone 4 (M4) of
the Theme Stripping roadmap (R21).
Usage:
# Dry run to see what would be stripped
python code/scripts/strip_parquet_themes.py --dry-run
# Strip from single parquet file
python code/scripts/strip_parquet_themes.py --file card_files/processed/all_cards.parquet
# Strip from all parquet files in directory
python code/scripts/strip_parquet_themes.py --all
# Specify custom threshold
python code/scripts/strip_parquet_themes.py --threshold 10 --all
Environment Variables:
THEME_MIN_CARDS: Minimum card threshold (default: 5)
Outputs:
- Modified parquet file(s) with stripped themeTags
- Timestamped backup (.parquet.bak) if --backup enabled
- Updated logs/stripped_themes.yml log
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
# Add project root to path
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
from code import settings as code_settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_parquet_themes,
create_stripped_themes_log
)
def find_parquet_files(directory: Path) -> list[Path]:
    """Return every *.parquet file in *directory*, sorted by path."""
    matches = [entry for entry in directory.glob("*.parquet")]
    matches.sort()
    return matches
def update_stripped_themes_log(
    theme_counts: dict,
    themes_to_strip: set[str],
    min_cards: int
) -> None:
    """Write the parquet stripping results to logs/stripped_themes.yml."""
    destination = ROOT / "logs" / "stripped_themes.yml"
    # Delegate log creation, tagging the entries as coming from parquet files.
    create_stripped_themes_log(
        output_path=destination,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=min_cards,
        sources=["parquet files"],
    )
    print(f"\nUpdated stripped themes log: {destination}")
def main() -> int:
    """CLI entry point for stripping low-card themes from parquet files.

    Determines which parquet files to process, identifies themes below the
    minimum card threshold, and strips them from the themeTags columns
    (unless --dry-run). A stripped-themes log is written afterwards.

    Returns:
        0 on success (including dry runs and the nothing-to-strip case),
        1 if any error occurred.
    """
    parser = argparse.ArgumentParser(
        description="Strip low-card themes from parquet themeTags columns",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--file',
        type=Path,
        help='Specific parquet file to process'
    )
    parser.add_argument(
        '--all',
        action='store_true',
        help='Process all parquet files in card_files/processed/'
    )
    parser.add_argument(
        '--threshold',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )
    args = parser.parse_args()
    # Determine threshold. Compare against None so an explicit
    # `--threshold 0` is honored instead of silently falling back to the
    # default (0 is falsy and would fail a plain truthiness check).
    min_cards = args.threshold if args.threshold is not None else code_settings.THEME_MIN_CARDS
    # Determine which files to process (--file wins over --all when both given)
    if args.file:
        if not args.file.exists():
            print(f"Error: File not found: {args.file}")
            return 1
        parquet_files = [args.file]
    elif args.all:
        processed_dir = ROOT / "card_files" / "processed"
        parquet_files = find_parquet_files(processed_dir)
        if not parquet_files:
            print(f"No parquet files found in {processed_dir}")
            return 1
    else:
        # Default: process all_cards.parquet
        default_file = ROOT / "card_files" / "processed" / "all_cards.parquet"
        if not default_file.exists():
            print(f"Error: Default file not found: {default_file}")
            print("Use --file or --all to specify files to process")
            return 1
        parquet_files = [default_file]
    print("Theme Stripping Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Files to process: {len(parquet_files)}")
    print(f" Backup enabled: {not args.no_backup}")
    print(f" Dry run: {args.dry_run}")
    print()
    # Get theme card counts from parquet files
    print("Analyzing theme card counts...")
    try:
        theme_counts = get_theme_card_counts(parquet_files)
        print(f"Found {len(theme_counts)} unique themes across files")
    except Exception as e:
        print(f"Error analyzing theme counts: {e}")
        return 1
    # Identify themes to strip
    print("Identifying themes to strip...")
    try:
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    except Exception as e:
        print(f"Error identifying themes to strip: {e}")
        return 1
    if not themes_to_strip:
        print("No themes found below threshold. Nothing to strip.")
        return 0
    print(f"Found {len(themes_to_strip)} themes to strip")
    if args.verbose:
        sample = sorted(list(themes_to_strip))[:10]
        print(f"Sample themes: {', '.join(sample)}")
        if len(themes_to_strip) > 10:
            print(f" ... and {len(themes_to_strip) - 10} more")
    print()
    # Dry run mode: report what would happen and exit without touching files.
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        for parquet_file in parquet_files:
            print(f"Would process: {parquet_file}")
        print(f"\nWould strip {len(themes_to_strip)} themes from themeTags column")
        return 0
    # Process each parquet file, accumulating totals across files.
    total_results = {
        "files_processed": 0,
        "cards_processed": 0,
        "tags_removed": 0,
        "errors": []
    }
    for parquet_file in parquet_files:
        print(f"Processing: {parquet_file.name}")
        try:
            results = strip_parquet_themes(
                parquet_path=parquet_file,
                themes_to_strip=themes_to_strip,
                backup=not args.no_backup
            )
            total_results["files_processed"] += 1
            total_results["cards_processed"] += results["cards_processed"]
            total_results["tags_removed"] += results["tags_removed"]
            total_results["errors"].extend(results["errors"])
            if args.verbose:
                print(f" Cards: {results['cards_processed']}")
                print(f" Tags removed: {results['tags_removed']}")
                if results["backup_created"]:
                    print(f" Backup: {results['backup_created']}")
        except Exception as e:
            # A failure on one file should not abort the remaining files.
            error_msg = f"Error processing {parquet_file}: {e}"
            print(f" {error_msg}")
            total_results["errors"].append(error_msg)
            continue
    print()
    # Update stripped themes log (best-effort; a log failure is not fatal).
    try:
        update_stripped_themes_log(theme_counts, themes_to_strip, min_cards)
    except Exception as e:
        print(f"Warning: Failed to update stripped themes log: {e}")
    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Files processed: {total_results['files_processed']}")
    print(f"Cards processed: {total_results['cards_processed']}")
    print(f"Tags removed: {total_results['tags_removed']}")
    print(f"Themes stripped: {len(themes_to_strip)}")
    if total_results["errors"]:
        print(f"\nErrors encountered: {len(total_results['errors'])}")
        for error in total_results["errors"]:
            print(f" - {error}")
    else:
        print("\nStripping completed successfully!")
    return 0 if not total_results["errors"] else 1


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,380 @@
#!/usr/bin/env python3
"""
Standalone theme stripping orchestration script.
This script coordinates the complete theme stripping pipeline:
1. Analyze parquet files to identify low-card themes
2. Strip from catalog YAML files (optional)
3. Strip from parquet themeTags columns (optional)
4. Rebuild theme_list.json from stripped parquet data
5. Generate stripped_themes.yml log
Part of Milestone 5 (M5) - Integration & Testing for Theme Stripping (R21).
Usage:
# Dry run to preview changes
python code/scripts/strip_themes.py --dry-run
# Strip everything with default threshold (5 cards)
python code/scripts/strip_themes.py
# Strip only catalog YAML files
python code/scripts/strip_themes.py --sources catalog
# Strip only parquet files
python code/scripts/strip_themes.py --sources parquet
# Custom threshold
python code/scripts/strip_themes.py --min-cards 10
# Skip backups (not recommended)
python code/scripts/strip_themes.py --no-backup
Environment Variables:
THEME_MIN_CARDS: Minimum card threshold (default: 5)
Outputs:
- Modified catalog/*.yml files (if --sources includes catalog)
- Modified parquet files (if --sources includes parquet)
- Regenerated config/themes/theme_list.json
- Updated logs/stripped_themes.yml log
- Timestamped backups (if --backup enabled)
"""
import argparse
import sys
import time
from pathlib import Path
from datetime import datetime
from typing import Set, Dict
# Add project root to path
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
from code import settings as code_settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_catalog_themes,
strip_parquet_themes,
create_stripped_themes_log
)
def strip_all_sources(
    min_cards: int,
    sources: Set[str],
    backup: bool,
    dry_run: bool,
    verbose: bool
) -> Dict:
    """
    Execute the complete theme stripping pipeline.

    Order matters: catalog YAML must be stripped before theme_list.json is
    rebuilt, because the rebuild reads themes from both the parquet files
    and the catalog YAML.

    Args:
        min_cards: Minimum card count; themes seen on fewer cards are stripped.
        sources: Which sources to strip: 'catalog', 'parquet', or both.
        backup: Create backups before modifying files.
        dry_run: Preview changes without modifying any files.
        verbose: Show per-file detail instead of one-line summaries.

    Returns:
        Dict of statistics ("themes_analyzed", "themes_to_strip",
        "catalog_stripped", "parquet_tags_removed", "json_regenerated",
        "errors"). "errors" accumulates failure messages so the caller can
        derive an exit code without exceptions escaping this function.
    """
    start_time = time.time()
    results = {
        "themes_analyzed": 0,
        "themes_to_strip": 0,
        "catalog_stripped": 0,
        "parquet_tags_removed": 0,
        "json_regenerated": False,
        "errors": []
    }
    print("="*70)
    print("THEME STRIPPING PIPELINE")
    print("="*70)
    # Fixed: was print(f"Configuration:") — an f-string with no placeholders.
    print("Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Sources: {', '.join(sorted(sources))}")
    print(f" Backup enabled: {backup}")
    print(f" Dry run: {dry_run}")
    print()
    # Step 1: Analyze parquet files to count how many cards carry each theme.
    # Any failure here aborts the run — later steps need theme_counts.
    print("Step 1: Analyzing theme card counts...")
    try:
        parquet_dir = ROOT / "card_files" / "processed"
        parquet_files = sorted(parquet_dir.glob("*.parquet"))
        if not parquet_files:
            results["errors"].append("No parquet files found in card_files/processed/")
            return results
        theme_counts = get_theme_card_counts(parquet_files)
        results["themes_analyzed"] = len(theme_counts)
        print(f" Found {len(theme_counts)} unique themes")
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
        results["themes_to_strip"] = len(themes_to_strip)
        print(f" Identified {len(themes_to_strip)} themes below threshold")
        if verbose and themes_to_strip:
            sample = sorted(list(themes_to_strip))[:5]
            print(f" Sample themes: {', '.join(sample)}")
            if len(themes_to_strip) > 5:
                print(f" ... and {len(themes_to_strip) - 5} more")
        if not themes_to_strip:
            print("\n✅ No themes below threshold. Nothing to strip.")
            return results
    except Exception as e:
        error_msg = f"Analysis failed: {e}"
        print(error_msg)  # was print(f"{error_msg}") — redundant f-string
        results["errors"].append(error_msg)
        return results
    print()
    # Dry run mode: preview which files would be touched, then bail out
    # before any mutation happens.
    if dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        if 'catalog' in sources:
            print("Would strip from catalog YAML files:")
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            yaml_files = sorted(catalog_dir.glob("*.yml"))
            for yaml_file in yaml_files[:5]:
                print(f" - {yaml_file.name}")
            if len(yaml_files) > 5:
                print(f" ... and {len(yaml_files) - 5} more")
        if 'parquet' in sources:
            print("\nWould strip from parquet files:")
            for pf in parquet_files[:3]:
                print(f" - {pf.name}")
            if len(parquet_files) > 3:
                print(f" ... and {len(parquet_files) - 3} more")
        print(f"\nWould strip {len(themes_to_strip)} themes total")
        print("Would regenerate theme_list.json")
        print("Would update stripped_themes.yml log")
        return results
    # Step 2: Strip from catalog (if requested)
    # NOTE: Catalog YAML must be stripped BEFORE building theme_list.json,
    # otherwise build_theme_catalog.py will read un-stripped themes from YAML
    if 'catalog' in sources:
        print("Step 2: Stripping from catalog YAML files...")
        try:
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            catalog_results = strip_catalog_themes(
                catalog_dir=catalog_dir,
                themes_to_strip=themes_to_strip,
                backup=backup
            )
            results["catalog_stripped"] = catalog_results["files_modified"]
            if verbose:
                print(f" Files modified: {catalog_results['files_modified']}")
                print(f" Themes removed: {catalog_results['themes_removed']}")
                if catalog_results["backups_created"]:
                    print(f" Backups created: {len(catalog_results['backups_created'])}")
            else:
                print(f" ✓ Stripped {catalog_results['themes_removed']} themes from {catalog_results['files_modified']} files")
            # Per-file errors are non-fatal: collect them, keep going.
            results["errors"].extend(catalog_results["errors"])
        except Exception as e:
            error_msg = f"Catalog stripping failed: {e}"
            print(error_msg)  # was print(f"{error_msg}")
            results["errors"].append(error_msg)
    print()
    # Step 3: Strip from parquet (if requested). Step number shifts down
    # when the catalog step was skipped so console output stays sequential.
    if 'parquet' in sources:
        step_num = 3 if 'catalog' in sources else 2
        print(f"Step {step_num}: Stripping from parquet files...")
        try:
            for parquet_file in parquet_files:
                if verbose:
                    print(f" Processing: {parquet_file.name}")
                parquet_results = strip_parquet_themes(
                    parquet_path=parquet_file,
                    themes_to_strip=themes_to_strip,
                    backup=backup
                )
                results["parquet_tags_removed"] += parquet_results["tags_removed"]
                results["errors"].extend(parquet_results["errors"])
                if verbose and parquet_results["tags_removed"] > 0:
                    print(f" Removed {parquet_results['tags_removed']} tag occurrences")
            if not verbose:
                print(f" ✓ Removed {results['parquet_tags_removed']} tag occurrences from {len(parquet_files)} file(s)")
        except Exception as e:
            error_msg = f"Parquet stripping failed: {e}"
            print(error_msg)  # was print(f"{error_msg}")
            results["errors"].append(error_msg)
    print()
    # Step 4: Rebuild theme_list.json (if parquet was stripped)
    # NOTE: This reads from both parquet AND catalog YAML, so both must be stripped first
    if 'parquet' in sources:
        step_num = 4 if 'catalog' in sources else 3
        print(f"Step {step_num}: Rebuilding theme_list.json...")
        try:
            # Import build script lazily so its import cost is only paid
            # when a rebuild is actually needed.
            from code.scripts.build_theme_catalog import main as build_main
            # Suppress verbose build output unless --verbose flag
            import io
            import contextlib
            if not verbose:
                with contextlib.redirect_stdout(io.StringIO()):
                    build_main()
            else:
                build_main()
            results["json_regenerated"] = True
            print(" ✓ theme_list.json regenerated")
        except Exception as e:
            error_msg = f"JSON regeneration failed: {e}"
            print(error_msg)  # was print(f"{error_msg}")
            results["errors"].append(error_msg)
    print()
    # Step 5: Update stripped themes log. The final step number depends on
    # which of the optional steps above actually ran.
    final_step = 5 if ('catalog' in sources and 'parquet' in sources) else (3 if 'catalog' in sources else 4)
    print(f"Step {final_step}: Updating stripped_themes.yml log...")
    try:
        log_path = ROOT / "logs" / "stripped_themes.yml"
        source_labels = []
        if 'catalog' in sources:
            source_labels.append("catalog YAML")
        if 'parquet' in sources:
            source_labels.append("parquet files")
        create_stripped_themes_log(
            output_path=log_path,
            theme_counts=theme_counts,
            themes_stripped=themes_to_strip,
            min_threshold=min_cards,
            sources=source_labels if source_labels else None
        )
        print(f" ✓ Log updated: {log_path}")
    except Exception as e:
        error_msg = f"Log update failed: {e}"
        print(error_msg)  # was print(f"{error_msg}")
        results["errors"].append(error_msg)
    # Final summary
    elapsed = time.time() - start_time
    print()
    print("="*70)
    print("SUMMARY")
    print("="*70)
    print(f"Themes analyzed: {results['themes_analyzed']}")
    print(f"Themes stripped: {results['themes_to_strip']}")
    if 'catalog' in sources:
        print(f"Catalog files modified: {results['catalog_stripped']}")
    if 'parquet' in sources:
        print(f"Parquet tags removed: {results['parquet_tags_removed']}")
    print(f"JSON regenerated: {'Yes' if results['json_regenerated'] else 'No'}")
    print(f"Time elapsed: {elapsed:.2f}s")
    if results["errors"]:
        print(f"\n⚠️ Errors encountered: {len(results['errors'])}")
        for error in results["errors"]:
            print(f" - {error}")
    else:
        print("\n✅ Theme stripping completed successfully!")
    return results
def main(argv=None):
    """
    CLI entry point for the theme stripping pipeline.

    Args:
        argv: Optional list of argument strings, for testing or embedding.
            When None, argparse falls back to sys.argv[1:] (unchanged
            default behavior for existing callers).

    Returns:
        Process exit code: 0 on success, 1 on invalid arguments or if the
        pipeline recorded any errors.
    """
    parser = argparse.ArgumentParser(
        description="Orchestrate complete theme stripping pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--min-cards',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )
    parser.add_argument(
        '--sources',
        type=str,
        help='Comma-separated list of sources to strip: catalog, parquet, all (default: all)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )
    args = parser.parse_args(argv)
    # Determine threshold. Compare against None (not truthiness) so an
    # explicit `--min-cards 0` is honored instead of being silently
    # replaced by the configured default.
    min_cards = args.min_cards if args.min_cards is not None else code_settings.THEME_MIN_CARDS
    # Determine sources
    if args.sources:
        source_input = args.sources.lower()
        if source_input == 'all':
            sources = {'catalog', 'parquet'}
        else:
            sources = set(s.strip() for s in source_input.split(','))
            valid_sources = {'catalog', 'parquet'}
            invalid = sources - valid_sources
            if invalid:
                # Sorted so the error output is deterministic (set order is not).
                print(f"Error: Invalid sources: {', '.join(sorted(invalid))}")
                print(f"Valid sources: {', '.join(sorted(valid_sources))}, all")
                return 1
    else:
        sources = {'catalog', 'parquet'}  # Default: all sources
    # Execute pipeline
    results = strip_all_sources(
        min_cards=min_cards,
        sources=sources,
        backup=not args.no_backup,
        dry_run=args.dry_run,
        verbose=args.verbose
    )
    # Exit code mirrors whether any step recorded an error
    return 0 if not results["errors"] else 1
# Script entry point: main() returns 0/1, which becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())