feat: implement theme stripping system with THEME_MIN_CARDS config (#55)
Some checks are pending
CI / build (push) Waiting to run

* feat: implement theme stripping system with THEME_MIN_CARDS config

* fix: call build_catalog directly to avoid argparse conflicts in CI
This commit is contained in:
mwisnowski 2026-03-19 15:27:17 -07:00 committed by GitHub
parent 1ebc2fcb3c
commit 03e2846882
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 6613 additions and 1364 deletions

View file

@ -0,0 +1,207 @@
"""
Theme Distribution Analysis Script
Analyzes theme distribution across the card catalog and generates reports
showing which themes would be stripped based on minimum card thresholds.
Usage:
python -m code.scripts.analyze_theme_distribution [--min-cards N] [--output FILE]
Arguments:
--min-cards N Minimum card threshold (default: from THEME_MIN_CARDS setting)
--output FILE Output file path (default: logs/theme_stripping_analysis.txt)
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, Set
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from code.settings import THEME_MIN_CARDS, CARD_FILES_PROCESSED_DIR
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
get_theme_distribution,
get_themes_by_count
)
def analyze_theme_distribution(min_cards: int | None = None, output_path: str | None = None) -> None:
    """
    Analyze theme distribution and generate a text report.

    Builds a theme -> card-count mapping from the processed parquet files,
    identifies themes below the minimum card threshold, and writes both a
    detailed report file and a console summary.

    Args:
        min_cards: Minimum card threshold (defaults to THEME_MIN_CARDS setting)
        output_path: Path to output file (defaults to logs/theme_stripping_analysis.txt)

    Raises:
        SystemExit: If the processed directory is missing, no parquet files
            are found, or no themes are present in the parquet data.
    """
    # Fall back to project defaults when the caller supplies nothing.
    if min_cards is None:
        min_cards = THEME_MIN_CARDS
    if output_path is None:
        output_path = "logs/theme_stripping_analysis.txt"
    print(f"Analyzing theme distribution (min_cards={min_cards})...")
    # Find all parquet files
    processed_dir = Path(CARD_FILES_PROCESSED_DIR)
    if not processed_dir.exists():
        print(f"Error: Processed cards directory not found: {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)
    parquet_files = list(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)
    print(f"Found {len(parquet_files)} parquet files to analyze")
    # Build theme counts
    print("Building theme -> card count mapping...")
    theme_counts = get_theme_card_counts(parquet_files)
    if not theme_counts:
        print("Error: No themes found in parquet files")
        sys.exit(1)
    print(f"Found {len(theme_counts)} unique themes")
    # Identify themes to strip
    themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    # Get distribution
    distribution = get_theme_distribution(theme_counts)
    # Get themes below threshold
    below_threshold = get_themes_by_count(theme_counts, min_cards)
    # Percentage of the catalog that falls below the threshold; distribution
    # total is > 0 here because theme_counts was verified non-empty above.
    strip_pct = len(themes_to_strip) / distribution['total'] * 100
    # Generate report
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        # Header
        f.write("=" * 80 + "\n")
        f.write("THEME DISTRIBUTION ANALYSIS REPORT\n")
        f.write("=" * 80 + "\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Minimum Card Threshold: {min_cards}\n")
        f.write(f"Source: {processed_dir}\n")
        f.write(f"Parquet Files Analyzed: {len(parquet_files)}\n")
        f.write("=" * 80 + "\n\n")
        # Summary statistics
        f.write("SUMMARY STATISTICS\n")
        f.write("-" * 80 + "\n")
        f.write(f"Total Themes: {distribution['total']}\n")
        f.write(f"Themes to Strip (< {min_cards} cards): {len(themes_to_strip)}\n")
        f.write(f"Themes to Keep (>= {min_cards} cards): {distribution['total'] - len(themes_to_strip)}\n")
        f.write(f"Percentage to Strip: {strip_pct:.1f}%\n")
        f.write("\n")
        # Distribution by card count
        f.write("DISTRIBUTION BY CARD COUNT\n")
        f.write("-" * 80 + "\n")
        f.write(f" 1 card: {distribution['1_card']:4d} themes\n")
        f.write(f" 2 cards: {distribution['2_cards']:4d} themes\n")
        f.write(f" 3-4 cards: {distribution['3_4_cards']:4d} themes\n")
        f.write(f" 5-9 cards: {distribution['5_9_cards']:4d} themes\n")
        f.write(f" 10+ cards: {distribution['10_plus']:4d} themes\n")
        f.write(f" Total: {distribution['total']:4d} themes\n")
        f.write("\n")
        # Themes below threshold: list each theme with its full card list.
        if below_threshold:
            f.write(f"THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write(f"Total: {len(below_threshold)} themes\n\n")
            for theme_id, count, card_list in below_threshold:
                f.write(f"Theme: {theme_id}\n")
                f.write(f"Card Count: {count}\n")
                f.write("Cards:\n")
                for card in card_list:
                    f.write(f" - {card}\n")
                f.write("\n")
        else:
            f.write(f"NO THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write("All themes meet the minimum card requirement.\n\n")
        # Recommendations
        f.write("RECOMMENDATIONS\n")
        f.write("=" * 80 + "\n")
        if len(themes_to_strip) > 0:
            f.write(f"{len(themes_to_strip)} themes should be stripped\n")
            f.write(f"• This represents {strip_pct:.1f}% of the catalog\n")
            f.write("• Run theme stripping to remove these low-viability themes\n")
            f.write("• Consider adjusting THEME_MIN_CARDS if this seems too aggressive\n")
        else:
            f.write(f"• No themes below threshold (all themes have >= {min_cards} cards)\n")
            f.write("• Consider lowering THEME_MIN_CARDS if you want to strip more themes\n")
        f.write("\n")
        # Footer
        f.write("=" * 80 + "\n")
        f.write("END OF REPORT\n")
        f.write("=" * 80 + "\n")
    print(f"\nReport generated: {output_file}")
    print("\nSummary:")
    print(f" Total themes: {distribution['total']}")
    print(f" Themes to strip: {len(themes_to_strip)} ({strip_pct:.1f}%)")
    print(f" Themes to keep: {distribution['total'] - len(themes_to_strip)}")
    # Print distribution
    print("\nDistribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
def main():
    """CLI entry point."""
    arg_parser = argparse.ArgumentParser(
        description="Analyze theme distribution and identify themes below minimum card threshold"
    )
    arg_parser.add_argument(
        '--min-cards',
        type=int,
        default=None,
        help=f'Minimum card threshold (default: {THEME_MIN_CARDS} from THEME_MIN_CARDS setting)'
    )
    arg_parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output file path (default: logs/theme_stripping_analysis.txt)'
    )
    opts = arg_parser.parse_args()
    try:
        # Delegate the actual work; None values fall back to project defaults.
        analyze_theme_distribution(min_cards=opts.min_cards, output_path=opts.output)
    except KeyboardInterrupt:
        print("\nAnalysis cancelled by user")
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report the failure with a traceback and
        # exit non-zero rather than letting the interpreter dump it raw.
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()

View file

@ -34,6 +34,14 @@ try: # Optional
except Exception: # pragma: no cover
yaml = None
# Import settings for THEME_MIN_CARDS threshold
# Import at module level to avoid stdlib 'code' conflict when running as script
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
if ROOT not in sys.path:
sys.path.insert(0, ROOT)
from code import settings as code_settings
try:
# Support running as `python code/scripts/build_theme_catalog.py` when 'code' already on path
from scripts.extract_themes import (
@ -166,17 +174,29 @@ def load_catalog_yaml(verbose: bool) -> Dict[str, ThemeYAML]:
def regenerate_analytics(verbose: bool):
"""
Regenerate theme analytics from parquet data, constants, and tagger source.
Now reads from parquet files instead of CSV. Applies THEME_MIN_CARDS filtering
to exclude themes with too few cards.
Args:
verbose: Whether to print detailed progress
Returns:
Tuple of (theme_tags, selected_synergies, taxonomy)
"""
theme_tags: Set[str] = set()
theme_tags |= collect_theme_tags_from_constants()
theme_tags |= collect_theme_tags_from_tagger_source()
try:
csv_rows = gather_theme_tag_rows()
for row_tags in csv_rows:
for t in row_tags:
if isinstance(t, str) and t:
theme_tags.add(t)
except Exception:
csv_rows = []
# M3: Read from parquet (no longer silent fail)
# Fail loudly if parquet read fails - this is a critical error
parquet_rows = gather_theme_tag_rows()
for row_tags in parquet_rows:
for t in row_tags:
if isinstance(t, str) and t:
theme_tags.add(t)
whitelist = load_whitelist_config()
normalization_map: Dict[str, str] = whitelist.get('normalization', {}) if isinstance(whitelist.get('normalization'), dict) else {}
@ -190,10 +210,8 @@ def regenerate_analytics(verbose: bool):
blacklist = {"Draw Triggers"}
theme_tags = {t for t in theme_tags if t and t not in blacklist and t not in exclusions}
try:
frequencies = tally_tag_frequencies_by_base_color()
except Exception:
frequencies = {}
# M3: Read frequencies from parquet (fail loudly)
frequencies = tally_tag_frequencies_by_base_color()
if frequencies:
def total_count(t: str) -> int:
@ -204,19 +222,40 @@ def regenerate_analytics(verbose: bool):
except Exception:
pass
return s
kept: Set[str] = set()
# M3: Apply THEME_MIN_CARDS filtering
min_cards = getattr(code_settings, 'THEME_MIN_CARDS', 5)
if verbose:
print(f"Applying THEME_MIN_CARDS filter (threshold: {min_cards} cards)")
themes_before_filter = len(theme_tags)
for t in list(theme_tags):
if should_keep_theme(t, total_count(t), whitelist, protected_prefixes, protected_suffixes, min_overrides):
kept.add(t)
count = total_count(t)
# Check both should_keep_theme (whitelist logic) AND THEME_MIN_CARDS threshold
if should_keep_theme(t, count, whitelist, protected_prefixes, protected_suffixes, min_overrides):
# Additional check: must meet minimum card threshold
if count >= min_cards:
kept.add(t)
elif verbose:
print(f" Filtered out '{t}' ({count} cards < {min_cards} threshold)")
# Always include whitelist themes (override threshold)
for extra in whitelist.get('always_include', []) or []:
kept.add(str(extra))
theme_tags = kept
if verbose:
themes_after_filter = len(theme_tags)
filtered_count = themes_before_filter - themes_after_filter
print(f"Filtered {filtered_count} themes below threshold ({themes_after_filter} remain)")
try:
rows = csv_rows if csv_rows else gather_theme_tag_rows()
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
except Exception:
co_map, tag_counts, total_rows = {}, Counter(), 0
# M3: Compute co-occurrence from parquet data (fail loudly)
rows = parquet_rows if parquet_rows else gather_theme_tag_rows()
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
return dict(theme_tags=theme_tags, frequencies=frequencies, co_map=co_map, tag_counts=tag_counts, total_rows=total_rows, whitelist=whitelist)

View file

@ -6,6 +6,7 @@ from collections import Counter
from typing import Dict, List, Set, Any
import pandas as pd
import numpy as np
import itertools
import math
try:
@ -20,6 +21,7 @@ if ROOT not in sys.path:
from code.settings import CSV_DIRECTORY
from code.tagging import tag_constants
from code.path_util import get_processed_cards_path
BASE_COLORS = {
'white': 'W',
@ -88,83 +90,113 @@ def collect_theme_tags_from_tagger_source() -> Set[str]:
def tally_tag_frequencies_by_base_color() -> Dict[str, Dict[str, int]]:
"""
Tally theme tag frequencies by base color from parquet files.
Note: This function now reads from card_files/processed/all_cards.parquet
instead of per-color CSV files. The CSV files no longer exist after the
parquet migration.
Returns:
Dictionary mapping color names to Counter of tag frequencies
"""
result: Dict[str, Dict[str, int]] = {c: Counter() for c in BASE_COLORS.keys()}
# Iterate over per-color CSVs; if not present, skip
for color in BASE_COLORS.keys():
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
if not os.path.exists(path):
# Load from all_cards.parquet
parquet_path = get_processed_cards_path()
if not os.path.exists(parquet_path):
print(f"Warning: Parquet file not found: {parquet_path}")
return {k: dict(v) for k, v in result.items()}
try:
df = pd.read_parquet(parquet_path, columns=['themeTags', 'colorIdentity'], engine='pyarrow')
except Exception as e:
print(f"Error reading parquet file: {e}")
return {k: dict(v) for k, v in result.items()}
if 'themeTags' not in df.columns:
print("Warning: themeTags column not found in parquet file")
return {k: dict(v) for k, v in result.items()}
# Iterate rows and tally tags by base color
for _, row in df.iterrows():
# Parquet stores themeTags as numpy array
tags = row.get('themeTags')
if not isinstance(tags, (list, np.ndarray)):
continue
try:
df = pd.read_csv(path, converters={'themeTags': pd.eval, 'colorIdentity': pd.eval})
except Exception:
df = pd.read_csv(path)
if 'themeTags' in df.columns:
try:
df['themeTags'] = df['themeTags'].apply(pd.eval)
except Exception:
df['themeTags'] = df['themeTags'].apply(lambda x: [])
if 'colorIdentity' in df.columns:
try:
df['colorIdentity'] = df['colorIdentity'].apply(pd.eval)
except Exception:
pass
if 'themeTags' not in df.columns:
if isinstance(tags, np.ndarray):
tags = tags.tolist()
# Get color identity (stored as string like "W", "UB", "WUG", etc.)
ci = row.get('colorIdentity')
if isinstance(ci, np.ndarray):
ci = ci.tolist()
# Convert colorIdentity to set of letters
if isinstance(ci, str):
letters = set(ci) # "WUG" -> {'W', 'U', 'G'}
elif isinstance(ci, list):
letters = set(ci) # ['W', 'U', 'G'] -> {'W', 'U', 'G'}
else:
letters = set()
# Determine base colors from color identity
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
if not bases:
# Colorless cards don't contribute to any specific color
continue
# Derive base colors from colorIdentity if available, else assume single color file
def rows_base_colors(row):
ids = row.get('colorIdentity') if isinstance(row, dict) else row
if isinstance(ids, list):
letters = set(ids)
else:
letters = set()
derived = set()
for name, letter in BASE_COLORS.items():
if letter in letters:
derived.add(name)
if not derived:
derived.add(color)
return derived
# Iterate rows
for _, row in df.iterrows():
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
# Compute base colors contribution
ci = row['colorIdentity'] if 'colorIdentity' in row else None
letters = set(ci) if isinstance(ci, list) else set()
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
if not bases:
bases = {color}
for bc in bases:
for t in tags:
result[bc][t] += 1
# Tally tags for each base color this card belongs to
for base_color in bases:
for tag in tags:
if isinstance(tag, str) and tag:
result[base_color][tag] += 1
# Convert Counters to plain dicts
return {k: dict(v) for k, v in result.items()}
def gather_theme_tag_rows() -> List[List[str]]:
"""Collect per-card themeTags lists across all base color CSVs.
"""
Collect per-card themeTags lists from parquet file.
Note: This function now reads from card_files/processed/all_cards.parquet
instead of per-color CSV files. The CSV files no longer exist after the
parquet migration.
Returns a list of themeTags arrays, one per card row where themeTags is present.
Returns:
List of themeTags arrays, one per card row where themeTags is present.
"""
rows: List[List[str]] = []
for color in BASE_COLORS.keys():
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
if not os.path.exists(path):
continue
try:
df = pd.read_csv(path, converters={'themeTags': pd.eval})
except Exception:
df = pd.read_csv(path)
if 'themeTags' in df.columns:
try:
df['themeTags'] = df['themeTags'].apply(pd.eval)
except Exception:
df['themeTags'] = df['themeTags'].apply(lambda x: [])
if 'themeTags' not in df.columns:
continue
for _, row in df.iterrows():
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
if tags:
rows.append(tags)
# Load from all_cards.parquet
parquet_path = get_processed_cards_path()
if not os.path.exists(parquet_path):
print(f"Warning: Parquet file not found: {parquet_path}")
return rows
try:
df = pd.read_parquet(parquet_path, columns=['themeTags'], engine='pyarrow')
except Exception as e:
print(f"Error reading parquet file: {e}")
return rows
if 'themeTags' not in df.columns:
print("Warning: themeTags column not found in parquet file")
return rows
# Collect theme tags from each card
for _, row in df.iterrows():
# Parquet stores themeTags as numpy array
tags = row.get('themeTags')
if isinstance(tags, np.ndarray):
tags = tags.tolist()
if isinstance(tags, list) and tags:
# Convert to list of strings (filter out non-strings)
tag_list = [str(t) for t in tags if isinstance(t, str) and t]
if tag_list:
rows.append(tag_list)
return rows

View file

@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Strip Theme Catalog Script
Removes themes with insufficient card counts from the theme catalog YAML files.
Creates backups and logs all stripped themes for reference.
Usage:
python -m code.scripts.strip_catalog_themes [--min-cards N] [--no-backup] [--dry-run]
Options:
--min-cards N Override THEME_MIN_CARDS setting (default: from environment/settings)
--no-backup Skip creating backup files
--dry-run Show what would be stripped without making changes
Example:
python -m code.scripts.strip_catalog_themes
python -m code.scripts.strip_catalog_themes --min-cards 3 --dry-run
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Add project root to path for imports
PROJECT_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(PROJECT_ROOT))
from code import settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_catalog_themes,
create_stripped_themes_log,
get_theme_distribution
)
def main() -> int:
    """CLI entry point for stripping low-card themes from catalog YAML files.

    Analyzes theme card counts from the processed parquet files, reports the
    distribution, and (unless --dry-run) removes catalog entries for themes
    below the minimum card threshold, writing a log of what was stripped.

    Returns:
        0 on success (including dry runs), 1 on error.
    """
    parser = argparse.ArgumentParser(
        description="Strip themes with insufficient card counts from catalog YAML files"
    )
    parser.add_argument(
        "--min-cards",
        type=int,
        default=settings.THEME_MIN_CARDS,
        help=f"Minimum cards required to keep a theme (default: {settings.THEME_MIN_CARDS})"
    )
    parser.add_argument(
        "--no-backup",
        action="store_true",
        help="Skip creating backup files before modification"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be stripped without making changes"
    )
    args = parser.parse_args()
    # Paths
    processed_dir = Path(settings.CARD_FILES_PROCESSED_DIR)
    catalog_dir = PROJECT_ROOT / 'config' / 'themes' / 'catalog'
    log_dir = PROJECT_ROOT / 'logs'
    stripped_log_path = log_dir / 'stripped_themes.yml'
    print(f"Stripping themes from catalog (min_cards={args.min_cards})")
    print(f"Catalog directory: {catalog_dir}")
    print(f"Dry run: {args.dry_run}")
    print()
    # Step 1: Get theme card counts from parquet files
    print("Step 1: Analyzing theme card counts from parquet files...")
    parquet_files = sorted(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        return 1
    print(f"Found {len(parquet_files)} parquet files")
    theme_counts = get_theme_card_counts(parquet_files)
    # Guard against an empty mapping: the percentage math below divides by
    # len(theme_counts), which would raise ZeroDivisionError otherwise.
    if not theme_counts:
        print("Error: No themes found in parquet files")
        return 1
    print(f"Found {len(theme_counts)} unique themes")
    print()
    # Step 2: Get distribution
    distribution = get_theme_distribution(theme_counts)
    print("Theme distribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
    print(f" Total: {distribution['total']:4d} themes")
    print()
    # Step 3: Identify themes to strip
    themes_to_strip = identify_themes_to_strip(theme_counts, args.min_cards)
    themes_to_keep = set(theme_counts.keys()) - themes_to_strip
    print(f"Themes to strip: {len(themes_to_strip)} ({len(themes_to_strip)/len(theme_counts)*100:.1f}%)")
    print(f"Themes to keep: {len(themes_to_keep)} ({len(themes_to_keep)/len(theme_counts)*100:.1f}%)")
    print()
    # Show sample of themes to strip
    if themes_to_strip:
        print("Sample themes to strip (first 10):")
        sample = sorted(themes_to_strip)[:10]
        for theme_id in sample:
            count = len(theme_counts[theme_id])
            cards_sample = sorted(theme_counts[theme_id])[:3]
            cards_str = ", ".join(cards_sample)
            if count > 3:
                cards_str += f", ... ({count} total)"
            print(f" - {theme_id} ({count} cards): {cards_str}")
        print()
    if args.dry_run:
        print("DRY RUN: No changes made")
        return 0
    # Step 4: Strip themes from catalog
    print("Step 4: Stripping themes from catalog YAML files...")
    results = strip_catalog_themes(
        catalog_dir=catalog_dir,
        themes_to_strip=themes_to_strip,
        backup=not args.no_backup
    )
    print(f" Stripped: {results['stripped_count']} themes")
    print(f" Files deleted: {len(results['files_deleted'])}")
    print(f" Backups created: {len(results['backups_created'])}")
    if results['errors']:
        print(f" Errors: {len(results['errors'])}")
        for error in results['errors'][:5]:  # Show first 5 errors
            print(f" - {error}")
    print()
    # Step 5: Create stripped themes log
    print("Step 5: Creating stripped themes log...")
    create_stripped_themes_log(
        output_path=stripped_log_path,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=args.min_cards,
        sources=["catalog YAML"]
    )
    print(f" Log written to {stripped_log_path}")
    print()
    print("✅ Catalog stripping complete!")
    print()
    print("Summary:")
    print(f" Total themes analyzed: {len(theme_counts)}")
    print(f" Themes stripped: {len(themes_to_strip)}")
    print(f" Themes remaining: {len(themes_to_keep)}")
    print(f" Catalog files deleted: {len(results['files_deleted'])}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,253 @@
#!/usr/bin/env python3
"""
Strip low-card themes from parquet file themeTags columns.
This script identifies and removes themes below the THEME_MIN_CARDS threshold
from the themeTags column in parquet files. It's part of Milestone 4 (M4) of
the Theme Stripping roadmap (R21).
Usage:
# Dry run to see what would be stripped
python code/scripts/strip_parquet_themes.py --dry-run
# Strip from single parquet file
python code/scripts/strip_parquet_themes.py --file card_files/processed/all_cards.parquet
# Strip from all parquet files in directory
python code/scripts/strip_parquet_themes.py --all
# Specify custom threshold
python code/scripts/strip_parquet_themes.py --threshold 10 --all
Environment Variables:
THEME_MIN_CARDS: Minimum card threshold (default: 5)
Outputs:
- Modified parquet file(s) with stripped themeTags
- Timestamped backup (.parquet.bak) if --backup enabled
- Updated logs/stripped_themes.yml log
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
# Add project root to path
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
from code import settings as code_settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_parquet_themes,
create_stripped_themes_log
)
def find_parquet_files(directory: Path) -> list[Path]:
    """Return every *.parquet file in *directory*, sorted by path."""
    matches = [entry for entry in directory.glob("*.parquet")]
    matches.sort()
    return matches
def update_stripped_themes_log(
    theme_counts: dict,
    themes_to_strip: set[str],
    min_cards: int
) -> None:
    """Write the parquet stripping results to logs/stripped_themes.yml."""
    destination = ROOT / "logs" / "stripped_themes.yml"
    # Delegate log creation, tagging the entries as coming from parquet files.
    create_stripped_themes_log(
        output_path=destination,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=min_cards,
        sources=["parquet files"],
    )
    print(f"\nUpdated stripped themes log: {destination}")
def main() -> int:
    """CLI entry point for stripping low-card themes from parquet files.

    Determines which parquet files to process, identifies themes below the
    minimum card threshold, and strips them from the themeTags columns
    (unless --dry-run). A stripped-themes log is written afterwards.

    Returns:
        0 on success (including dry runs and the nothing-to-strip case),
        1 if any error occurred.
    """
    parser = argparse.ArgumentParser(
        description="Strip low-card themes from parquet themeTags columns",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--file',
        type=Path,
        help='Specific parquet file to process'
    )
    parser.add_argument(
        '--all',
        action='store_true',
        help='Process all parquet files in card_files/processed/'
    )
    parser.add_argument(
        '--threshold',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )
    args = parser.parse_args()
    # Determine threshold. Compare against None so an explicit
    # `--threshold 0` is honored instead of silently falling back to the
    # default (0 is falsy and would fail a plain truthiness check).
    min_cards = args.threshold if args.threshold is not None else code_settings.THEME_MIN_CARDS
    # Determine which files to process (--file wins over --all when both given)
    if args.file:
        if not args.file.exists():
            print(f"Error: File not found: {args.file}")
            return 1
        parquet_files = [args.file]
    elif args.all:
        processed_dir = ROOT / "card_files" / "processed"
        parquet_files = find_parquet_files(processed_dir)
        if not parquet_files:
            print(f"No parquet files found in {processed_dir}")
            return 1
    else:
        # Default: process all_cards.parquet
        default_file = ROOT / "card_files" / "processed" / "all_cards.parquet"
        if not default_file.exists():
            print(f"Error: Default file not found: {default_file}")
            print("Use --file or --all to specify files to process")
            return 1
        parquet_files = [default_file]
    print("Theme Stripping Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Files to process: {len(parquet_files)}")
    print(f" Backup enabled: {not args.no_backup}")
    print(f" Dry run: {args.dry_run}")
    print()
    # Get theme card counts from parquet files
    print("Analyzing theme card counts...")
    try:
        theme_counts = get_theme_card_counts(parquet_files)
        print(f"Found {len(theme_counts)} unique themes across files")
    except Exception as e:
        print(f"Error analyzing theme counts: {e}")
        return 1
    # Identify themes to strip
    print("Identifying themes to strip...")
    try:
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    except Exception as e:
        print(f"Error identifying themes to strip: {e}")
        return 1
    if not themes_to_strip:
        print("No themes found below threshold. Nothing to strip.")
        return 0
    print(f"Found {len(themes_to_strip)} themes to strip")
    if args.verbose:
        sample = sorted(list(themes_to_strip))[:10]
        print(f"Sample themes: {', '.join(sample)}")
        if len(themes_to_strip) > 10:
            print(f" ... and {len(themes_to_strip) - 10} more")
    print()
    # Dry run mode: report what would happen and exit without touching files.
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        for parquet_file in parquet_files:
            print(f"Would process: {parquet_file}")
        print(f"\nWould strip {len(themes_to_strip)} themes from themeTags column")
        return 0
    # Process each parquet file, accumulating totals across files.
    total_results = {
        "files_processed": 0,
        "cards_processed": 0,
        "tags_removed": 0,
        "errors": []
    }
    for parquet_file in parquet_files:
        print(f"Processing: {parquet_file.name}")
        try:
            results = strip_parquet_themes(
                parquet_path=parquet_file,
                themes_to_strip=themes_to_strip,
                backup=not args.no_backup
            )
            total_results["files_processed"] += 1
            total_results["cards_processed"] += results["cards_processed"]
            total_results["tags_removed"] += results["tags_removed"]
            total_results["errors"].extend(results["errors"])
            if args.verbose:
                print(f" Cards: {results['cards_processed']}")
                print(f" Tags removed: {results['tags_removed']}")
                if results["backup_created"]:
                    print(f" Backup: {results['backup_created']}")
        except Exception as e:
            # A failure on one file should not abort the remaining files.
            error_msg = f"Error processing {parquet_file}: {e}"
            print(f" {error_msg}")
            total_results["errors"].append(error_msg)
            continue
    print()
    # Update stripped themes log (best-effort; a log failure is not fatal).
    try:
        update_stripped_themes_log(theme_counts, themes_to_strip, min_cards)
    except Exception as e:
        print(f"Warning: Failed to update stripped themes log: {e}")
    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Files processed: {total_results['files_processed']}")
    print(f"Cards processed: {total_results['cards_processed']}")
    print(f"Tags removed: {total_results['tags_removed']}")
    print(f"Themes stripped: {len(themes_to_strip)}")
    if total_results["errors"]:
        print(f"\nErrors encountered: {len(total_results['errors'])}")
        for error in total_results["errors"]:
            print(f" - {error}")
    else:
        print("\nStripping completed successfully!")
    return 0 if not total_results["errors"] else 1


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,380 @@
#!/usr/bin/env python3
"""
Standalone theme stripping orchestration script.
This script coordinates the complete theme stripping pipeline:
1. Analyze parquet files to identify low-card themes
2. Strip from catalog YAML files (optional)
3. Strip from parquet themeTags columns (optional)
4. Rebuild theme_list.json from stripped parquet data
5. Generate stripped_themes.yml log
Part of Milestone 5 (M5) - Integration & Testing for Theme Stripping (R21).
Usage:
# Dry run to preview changes
python code/scripts/strip_themes.py --dry-run
# Strip everything with default threshold (5 cards)
python code/scripts/strip_themes.py
# Strip only catalog YAML files
python code/scripts/strip_themes.py --sources catalog
# Strip only parquet files
python code/scripts/strip_themes.py --sources parquet
# Custom threshold
python code/scripts/strip_themes.py --min-cards 10
# Skip backups (not recommended)
python code/scripts/strip_themes.py --no-backup
Environment Variables:
THEME_MIN_CARDS: Minimum card threshold (default: 5)
Outputs:
- Modified catalog/*.yml files (if --sources includes catalog)
- Modified parquet files (if --sources includes parquet)
- Regenerated config/themes/theme_list.json
- Updated logs/stripped_themes.yml log
- Timestamped backups (if --backup enabled)
"""
import argparse
import sys
import time
from pathlib import Path
from datetime import datetime
from typing import Set, Dict
# Add project root to path
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
from code import settings as code_settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_catalog_themes,
strip_parquet_themes,
create_stripped_themes_log
)
def strip_all_sources(
    min_cards: int,
    sources: Set[str],
    backup: bool,
    dry_run: bool,
    verbose: bool
) -> Dict:
    """
    Execute the complete theme stripping pipeline.

    Order matters: catalog YAML must be stripped before theme_list.json is
    rebuilt, because the rebuild reads themes from both the parquet files
    and the catalog YAML.

    Args:
        min_cards: Minimum card count; themes seen on fewer cards are stripped.
        sources: Which sources to strip: 'catalog', 'parquet', or both.
        backup: Create backups before modifying files.
        dry_run: Preview changes without modifying any files.
        verbose: Show per-file detail instead of one-line summaries.

    Returns:
        Dict of statistics ("themes_analyzed", "themes_to_strip",
        "catalog_stripped", "parquet_tags_removed", "json_regenerated",
        "errors"). "errors" accumulates failure messages so the caller can
        derive an exit code without exceptions escaping this function.
    """
    start_time = time.time()
    results = {
        "themes_analyzed": 0,
        "themes_to_strip": 0,
        "catalog_stripped": 0,
        "parquet_tags_removed": 0,
        "json_regenerated": False,
        "errors": []
    }
    print("="*70)
    print("THEME STRIPPING PIPELINE")
    print("="*70)
    # Fixed: was print(f"Configuration:") — an f-string with no placeholders.
    print("Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Sources: {', '.join(sorted(sources))}")
    print(f" Backup enabled: {backup}")
    print(f" Dry run: {dry_run}")
    print()
    # Step 1: Analyze parquet files to count how many cards carry each theme.
    # Any failure here aborts the run — later steps need theme_counts.
    print("Step 1: Analyzing theme card counts...")
    try:
        parquet_dir = ROOT / "card_files" / "processed"
        parquet_files = sorted(parquet_dir.glob("*.parquet"))
        if not parquet_files:
            results["errors"].append("No parquet files found in card_files/processed/")
            return results
        theme_counts = get_theme_card_counts(parquet_files)
        results["themes_analyzed"] = len(theme_counts)
        print(f" Found {len(theme_counts)} unique themes")
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
        results["themes_to_strip"] = len(themes_to_strip)
        print(f" Identified {len(themes_to_strip)} themes below threshold")
        if verbose and themes_to_strip:
            sample = sorted(list(themes_to_strip))[:5]
            print(f" Sample themes: {', '.join(sample)}")
            if len(themes_to_strip) > 5:
                print(f" ... and {len(themes_to_strip) - 5} more")
        if not themes_to_strip:
            print("\n✅ No themes below threshold. Nothing to strip.")
            return results
    except Exception as e:
        error_msg = f"Analysis failed: {e}"
        print(error_msg)  # was print(f"{error_msg}") — redundant f-string
        results["errors"].append(error_msg)
        return results
    print()
    # Dry run mode: preview which files would be touched, then bail out
    # before any mutation happens.
    if dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        if 'catalog' in sources:
            print("Would strip from catalog YAML files:")
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            yaml_files = sorted(catalog_dir.glob("*.yml"))
            for yaml_file in yaml_files[:5]:
                print(f" - {yaml_file.name}")
            if len(yaml_files) > 5:
                print(f" ... and {len(yaml_files) - 5} more")
        if 'parquet' in sources:
            print("\nWould strip from parquet files:")
            for pf in parquet_files[:3]:
                print(f" - {pf.name}")
            if len(parquet_files) > 3:
                print(f" ... and {len(parquet_files) - 3} more")
        print(f"\nWould strip {len(themes_to_strip)} themes total")
        print("Would regenerate theme_list.json")
        print("Would update stripped_themes.yml log")
        return results
    # Step 2: Strip from catalog (if requested)
    # NOTE: Catalog YAML must be stripped BEFORE building theme_list.json,
    # otherwise build_theme_catalog.py will read un-stripped themes from YAML
    if 'catalog' in sources:
        print("Step 2: Stripping from catalog YAML files...")
        try:
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            catalog_results = strip_catalog_themes(
                catalog_dir=catalog_dir,
                themes_to_strip=themes_to_strip,
                backup=backup
            )
            results["catalog_stripped"] = catalog_results["files_modified"]
            if verbose:
                print(f" Files modified: {catalog_results['files_modified']}")
                print(f" Themes removed: {catalog_results['themes_removed']}")
                if catalog_results["backups_created"]:
                    print(f" Backups created: {len(catalog_results['backups_created'])}")
            else:
                print(f" ✓ Stripped {catalog_results['themes_removed']} themes from {catalog_results['files_modified']} files")
            # Per-file errors are non-fatal: collect them, keep going.
            results["errors"].extend(catalog_results["errors"])
        except Exception as e:
            error_msg = f"Catalog stripping failed: {e}"
            print(error_msg)  # was print(f"{error_msg}")
            results["errors"].append(error_msg)
    print()
    # Step 3: Strip from parquet (if requested). Step number shifts down
    # when the catalog step was skipped so console output stays sequential.
    if 'parquet' in sources:
        step_num = 3 if 'catalog' in sources else 2
        print(f"Step {step_num}: Stripping from parquet files...")
        try:
            for parquet_file in parquet_files:
                if verbose:
                    print(f" Processing: {parquet_file.name}")
                parquet_results = strip_parquet_themes(
                    parquet_path=parquet_file,
                    themes_to_strip=themes_to_strip,
                    backup=backup
                )
                results["parquet_tags_removed"] += parquet_results["tags_removed"]
                results["errors"].extend(parquet_results["errors"])
                if verbose and parquet_results["tags_removed"] > 0:
                    print(f" Removed {parquet_results['tags_removed']} tag occurrences")
            if not verbose:
                print(f" ✓ Removed {results['parquet_tags_removed']} tag occurrences from {len(parquet_files)} file(s)")
        except Exception as e:
            error_msg = f"Parquet stripping failed: {e}"
            print(error_msg)  # was print(f"{error_msg}")
            results["errors"].append(error_msg)
    print()
    # Step 4: Rebuild theme_list.json (if parquet was stripped)
    # NOTE: This reads from both parquet AND catalog YAML, so both must be stripped first
    if 'parquet' in sources:
        step_num = 4 if 'catalog' in sources else 3
        print(f"Step {step_num}: Rebuilding theme_list.json...")
        try:
            # Import build script lazily so its import cost is only paid
            # when a rebuild is actually needed.
            from code.scripts.build_theme_catalog import main as build_main
            # Suppress verbose build output unless --verbose flag
            import io
            import contextlib
            if not verbose:
                with contextlib.redirect_stdout(io.StringIO()):
                    build_main()
            else:
                build_main()
            results["json_regenerated"] = True
            print(" ✓ theme_list.json regenerated")
        except Exception as e:
            error_msg = f"JSON regeneration failed: {e}"
            print(error_msg)  # was print(f"{error_msg}")
            results["errors"].append(error_msg)
    print()
    # Step 5: Update stripped themes log. The final step number depends on
    # which of the optional steps above actually ran.
    final_step = 5 if ('catalog' in sources and 'parquet' in sources) else (3 if 'catalog' in sources else 4)
    print(f"Step {final_step}: Updating stripped_themes.yml log...")
    try:
        log_path = ROOT / "logs" / "stripped_themes.yml"
        source_labels = []
        if 'catalog' in sources:
            source_labels.append("catalog YAML")
        if 'parquet' in sources:
            source_labels.append("parquet files")
        create_stripped_themes_log(
            output_path=log_path,
            theme_counts=theme_counts,
            themes_stripped=themes_to_strip,
            min_threshold=min_cards,
            sources=source_labels if source_labels else None
        )
        print(f" ✓ Log updated: {log_path}")
    except Exception as e:
        error_msg = f"Log update failed: {e}"
        print(error_msg)  # was print(f"{error_msg}")
        results["errors"].append(error_msg)
    # Final summary
    elapsed = time.time() - start_time
    print()
    print("="*70)
    print("SUMMARY")
    print("="*70)
    print(f"Themes analyzed: {results['themes_analyzed']}")
    print(f"Themes stripped: {results['themes_to_strip']}")
    if 'catalog' in sources:
        print(f"Catalog files modified: {results['catalog_stripped']}")
    if 'parquet' in sources:
        print(f"Parquet tags removed: {results['parquet_tags_removed']}")
    print(f"JSON regenerated: {'Yes' if results['json_regenerated'] else 'No'}")
    print(f"Time elapsed: {elapsed:.2f}s")
    if results["errors"]:
        print(f"\n⚠️ Errors encountered: {len(results['errors'])}")
        for error in results["errors"]:
            print(f" - {error}")
    else:
        print("\n✅ Theme stripping completed successfully!")
    return results
def main(argv=None):
    """
    CLI entry point for the theme stripping pipeline.

    Args:
        argv: Optional list of argument strings, for testing or embedding.
            When None, argparse falls back to sys.argv[1:] (unchanged
            default behavior for existing callers).

    Returns:
        Process exit code: 0 on success, 1 on invalid arguments or if the
        pipeline recorded any errors.
    """
    parser = argparse.ArgumentParser(
        description="Orchestrate complete theme stripping pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--min-cards',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )
    parser.add_argument(
        '--sources',
        type=str,
        help='Comma-separated list of sources to strip: catalog, parquet, all (default: all)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )
    args = parser.parse_args(argv)
    # Determine threshold. Compare against None (not truthiness) so an
    # explicit `--min-cards 0` is honored instead of being silently
    # replaced by the configured default.
    min_cards = args.min_cards if args.min_cards is not None else code_settings.THEME_MIN_CARDS
    # Determine sources
    if args.sources:
        source_input = args.sources.lower()
        if source_input == 'all':
            sources = {'catalog', 'parquet'}
        else:
            sources = set(s.strip() for s in source_input.split(','))
            valid_sources = {'catalog', 'parquet'}
            invalid = sources - valid_sources
            if invalid:
                # Sorted so the error output is deterministic (set order is not).
                print(f"Error: Invalid sources: {', '.join(sorted(invalid))}")
                print(f"Valid sources: {', '.join(sorted(valid_sources))}, all")
                return 1
    else:
        sources = {'catalog', 'parquet'}  # Default: all sources
    # Execute pipeline
    results = strip_all_sources(
        min_cards=min_cards,
        sources=sources,
        backup=not args.no_backup,
        dry_run=args.dry_run,
        verbose=args.verbose
    )
    # Exit code mirrors whether any step recorded an error
    return 0 if not results["errors"] else 1
# Script entry point: main() returns 0/1, which becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())