feat: implement theme stripping system with THEME_MIN_CARDS config (#55)
Some checks are pending
CI / build (push) Waiting to run

* feat: implement theme stripping system with THEME_MIN_CARDS config

* fix: call build_catalog directly to avoid argparse conflicts in CI
This commit is contained in:
mwisnowski 2026-03-19 15:27:17 -07:00 committed by GitHub
parent 1ebc2fcb3c
commit 03e2846882
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 6613 additions and 1364 deletions

View file

@ -0,0 +1,207 @@
"""
Theme Distribution Analysis Script
Analyzes theme distribution across the card catalog and generates reports
showing which themes would be stripped based on minimum card thresholds.
Usage:
python -m code.scripts.analyze_theme_distribution [--min-cards N] [--output FILE]
Arguments:
--min-cards N Minimum card threshold (default: from THEME_MIN_CARDS setting)
--output FILE Output file path (default: logs/theme_stripping_analysis.txt)
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, Set
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from code.settings import THEME_MIN_CARDS, CARD_FILES_PROCESSED_DIR
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
get_theme_distribution,
get_themes_by_count
)
def analyze_theme_distribution(min_cards: int | None = None, output_path: str | None = None) -> None:
    """
    Analyze theme distribution and generate a report.

    Builds a theme -> card-count mapping from the processed parquet files,
    identifies themes below the minimum card threshold, and writes a text
    report plus a console summary.

    Args:
        min_cards: Minimum card threshold (defaults to THEME_MIN_CARDS setting)
        output_path: Path to output file (defaults to logs/theme_stripping_analysis.txt)
    """
    if min_cards is None:
        min_cards = THEME_MIN_CARDS
    if output_path is None:
        output_path = "logs/theme_stripping_analysis.txt"

    print(f"Analyzing theme distribution (min_cards={min_cards})...")

    # Find all parquet files; without them there is nothing to analyze.
    processed_dir = Path(CARD_FILES_PROCESSED_DIR)
    if not processed_dir.exists():
        print(f"Error: Processed cards directory not found: {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)
    parquet_files = list(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)
    print(f"Found {len(parquet_files)} parquet files to analyze")

    # Build theme counts
    print("Building theme -> card count mapping...")
    theme_counts = get_theme_card_counts(parquet_files)
    if not theme_counts:
        print("Error: No themes found in parquet files")
        sys.exit(1)
    print(f"Found {len(theme_counts)} unique themes")

    # Derive the three views of the data used by the report.
    themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    distribution = get_theme_distribution(theme_counts)
    below_threshold = get_themes_by_count(theme_counts, min_cards)

    # Generate report
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        # Header
        f.write("=" * 80 + "\n")
        f.write("THEME DISTRIBUTION ANALYSIS REPORT\n")
        f.write("=" * 80 + "\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Minimum Card Threshold: {min_cards}\n")
        f.write(f"Source: {processed_dir}\n")
        f.write(f"Parquet Files Analyzed: {len(parquet_files)}\n")
        f.write("=" * 80 + "\n\n")
        # Summary statistics. theme_counts is non-empty (checked above), so
        # distribution['total'] >= 1 and the percentage division is safe.
        f.write("SUMMARY STATISTICS\n")
        f.write("-" * 80 + "\n")
        f.write(f"Total Themes: {distribution['total']}\n")
        f.write(f"Themes to Strip (< {min_cards} cards): {len(themes_to_strip)}\n")
        f.write(f"Themes to Keep (>= {min_cards} cards): {distribution['total'] - len(themes_to_strip)}\n")
        f.write(f"Percentage to Strip: {len(themes_to_strip) / distribution['total'] * 100:.1f}%\n")
        f.write("\n")
        # Distribution by card count
        f.write("DISTRIBUTION BY CARD COUNT\n")
        f.write("-" * 80 + "\n")
        f.write(f" 1 card: {distribution['1_card']:4d} themes\n")
        f.write(f" 2 cards: {distribution['2_cards']:4d} themes\n")
        f.write(f" 3-4 cards: {distribution['3_4_cards']:4d} themes\n")
        f.write(f" 5-9 cards: {distribution['5_9_cards']:4d} themes\n")
        f.write(f" 10+ cards: {distribution['10_plus']:4d} themes\n")
        f.write(f" Total: {distribution['total']:4d} themes\n")
        f.write("\n")
        # Themes below threshold, listed with their full card lists
        if below_threshold:
            f.write(f"THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write(f"Total: {len(below_threshold)} themes\n\n")
            for theme_id, count, card_list in below_threshold:
                f.write(f"Theme: {theme_id}\n")
                f.write(f"Card Count: {count}\n")
                f.write("Cards:\n")  # plain string: no placeholders needed
                for card in card_list:
                    f.write(f" - {card}\n")
                f.write("\n")
        else:
            f.write(f"NO THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write("All themes meet the minimum card requirement.\n\n")
        # Recommendations
        f.write("RECOMMENDATIONS\n")
        f.write("=" * 80 + "\n")
        if len(themes_to_strip) > 0:
            # Bullet added for consistency with the sibling recommendation lines.
            f.write(f"• {len(themes_to_strip)} themes should be stripped\n")
            f.write(f"• This represents {len(themes_to_strip) / distribution['total'] * 100:.1f}% of the catalog\n")
            f.write("• Run theme stripping to remove these low-viability themes\n")
            f.write("• Consider adjusting THEME_MIN_CARDS if this seems too aggressive\n")
        else:
            f.write(f"• No themes below threshold (all themes have >= {min_cards} cards)\n")
            f.write("• Consider lowering THEME_MIN_CARDS if you want to strip more themes\n")
        f.write("\n")
        # Footer
        f.write("=" * 80 + "\n")
        f.write("END OF REPORT\n")
        f.write("=" * 80 + "\n")

    print(f"\nReport generated: {output_file}")
    print("\nSummary:")
    print(f" Total themes: {distribution['total']}")
    print(f" Themes to strip: {len(themes_to_strip)} ({len(themes_to_strip) / distribution['total'] * 100:.1f}%)")
    print(f" Themes to keep: {distribution['total'] - len(themes_to_strip)}")
    # Print distribution
    print("\nDistribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
def main():
    """CLI entry point: parse arguments and run the distribution analysis."""
    arg_parser = argparse.ArgumentParser(
        description="Analyze theme distribution and identify themes below minimum card threshold"
    )
    arg_parser.add_argument(
        '--min-cards',
        type=int,
        default=None,
        help=f'Minimum card threshold (default: {THEME_MIN_CARDS} from THEME_MIN_CARDS setting)'
    )
    arg_parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output file path (default: logs/theme_stripping_analysis.txt)'
    )
    opts = arg_parser.parse_args()
    try:
        # None values let analyze_theme_distribution fall back to its defaults.
        analyze_theme_distribution(min_cards=opts.min_cards, output_path=opts.output)
    except KeyboardInterrupt:
        print("\nAnalysis cancelled by user")
        sys.exit(1)
    except Exception as e:
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()

View file

@ -34,6 +34,14 @@ try: # Optional
except Exception: # pragma: no cover
yaml = None
# Import settings for THEME_MIN_CARDS threshold
# Import at module level to avoid stdlib 'code' conflict when running as script
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
if ROOT not in sys.path:
sys.path.insert(0, ROOT)
from code import settings as code_settings
try:
# Support running as `python code/scripts/build_theme_catalog.py` when 'code' already on path
from scripts.extract_themes import (
@ -166,17 +174,29 @@ def load_catalog_yaml(verbose: bool) -> Dict[str, ThemeYAML]:
def regenerate_analytics(verbose: bool):
"""
Regenerate theme analytics from parquet data, constants, and tagger source.
Now reads from parquet files instead of CSV. Applies THEME_MIN_CARDS filtering
to exclude themes with too few cards.
Args:
verbose: Whether to print detailed progress
Returns:
Tuple of (theme_tags, selected_synergies, taxonomy)
"""
theme_tags: Set[str] = set()
theme_tags |= collect_theme_tags_from_constants()
theme_tags |= collect_theme_tags_from_tagger_source()
try:
csv_rows = gather_theme_tag_rows()
for row_tags in csv_rows:
for t in row_tags:
if isinstance(t, str) and t:
theme_tags.add(t)
except Exception:
csv_rows = []
# M3: Read from parquet (no longer silent fail)
# Fail loudly if parquet read fails - this is a critical error
parquet_rows = gather_theme_tag_rows()
for row_tags in parquet_rows:
for t in row_tags:
if isinstance(t, str) and t:
theme_tags.add(t)
whitelist = load_whitelist_config()
normalization_map: Dict[str, str] = whitelist.get('normalization', {}) if isinstance(whitelist.get('normalization'), dict) else {}
@ -190,10 +210,8 @@ def regenerate_analytics(verbose: bool):
blacklist = {"Draw Triggers"}
theme_tags = {t for t in theme_tags if t and t not in blacklist and t not in exclusions}
try:
frequencies = tally_tag_frequencies_by_base_color()
except Exception:
frequencies = {}
# M3: Read frequencies from parquet (fail loudly)
frequencies = tally_tag_frequencies_by_base_color()
if frequencies:
def total_count(t: str) -> int:
@ -204,19 +222,40 @@ def regenerate_analytics(verbose: bool):
except Exception:
pass
return s
kept: Set[str] = set()
# M3: Apply THEME_MIN_CARDS filtering
min_cards = getattr(code_settings, 'THEME_MIN_CARDS', 5)
if verbose:
print(f"Applying THEME_MIN_CARDS filter (threshold: {min_cards} cards)")
themes_before_filter = len(theme_tags)
for t in list(theme_tags):
if should_keep_theme(t, total_count(t), whitelist, protected_prefixes, protected_suffixes, min_overrides):
kept.add(t)
count = total_count(t)
# Check both should_keep_theme (whitelist logic) AND THEME_MIN_CARDS threshold
if should_keep_theme(t, count, whitelist, protected_prefixes, protected_suffixes, min_overrides):
# Additional check: must meet minimum card threshold
if count >= min_cards:
kept.add(t)
elif verbose:
print(f" Filtered out '{t}' ({count} cards < {min_cards} threshold)")
# Always include whitelist themes (override threshold)
for extra in whitelist.get('always_include', []) or []:
kept.add(str(extra))
theme_tags = kept
if verbose:
themes_after_filter = len(theme_tags)
filtered_count = themes_before_filter - themes_after_filter
print(f"Filtered {filtered_count} themes below threshold ({themes_after_filter} remain)")
try:
rows = csv_rows if csv_rows else gather_theme_tag_rows()
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
except Exception:
co_map, tag_counts, total_rows = {}, Counter(), 0
# M3: Compute co-occurrence from parquet data (fail loudly)
rows = parquet_rows if parquet_rows else gather_theme_tag_rows()
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
return dict(theme_tags=theme_tags, frequencies=frequencies, co_map=co_map, tag_counts=tag_counts, total_rows=total_rows, whitelist=whitelist)

View file

@ -6,6 +6,7 @@ from collections import Counter
from typing import Dict, List, Set, Any
import pandas as pd
import numpy as np
import itertools
import math
try:
@ -20,6 +21,7 @@ if ROOT not in sys.path:
from code.settings import CSV_DIRECTORY
from code.tagging import tag_constants
from code.path_util import get_processed_cards_path
BASE_COLORS = {
'white': 'W',
@ -88,83 +90,113 @@ def collect_theme_tags_from_tagger_source() -> Set[str]:
def tally_tag_frequencies_by_base_color() -> Dict[str, Dict[str, int]]:
"""
Tally theme tag frequencies by base color from parquet files.
Note: This function now reads from card_files/processed/all_cards.parquet
instead of per-color CSV files. The CSV files no longer exist after the
parquet migration.
Returns:
Dictionary mapping color names to Counter of tag frequencies
"""
result: Dict[str, Dict[str, int]] = {c: Counter() for c in BASE_COLORS.keys()}
# Iterate over per-color CSVs; if not present, skip
for color in BASE_COLORS.keys():
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
if not os.path.exists(path):
# Load from all_cards.parquet
parquet_path = get_processed_cards_path()
if not os.path.exists(parquet_path):
print(f"Warning: Parquet file not found: {parquet_path}")
return {k: dict(v) for k, v in result.items()}
try:
df = pd.read_parquet(parquet_path, columns=['themeTags', 'colorIdentity'], engine='pyarrow')
except Exception as e:
print(f"Error reading parquet file: {e}")
return {k: dict(v) for k, v in result.items()}
if 'themeTags' not in df.columns:
print("Warning: themeTags column not found in parquet file")
return {k: dict(v) for k, v in result.items()}
# Iterate rows and tally tags by base color
for _, row in df.iterrows():
# Parquet stores themeTags as numpy array
tags = row.get('themeTags')
if not isinstance(tags, (list, np.ndarray)):
continue
try:
df = pd.read_csv(path, converters={'themeTags': pd.eval, 'colorIdentity': pd.eval})
except Exception:
df = pd.read_csv(path)
if 'themeTags' in df.columns:
try:
df['themeTags'] = df['themeTags'].apply(pd.eval)
except Exception:
df['themeTags'] = df['themeTags'].apply(lambda x: [])
if 'colorIdentity' in df.columns:
try:
df['colorIdentity'] = df['colorIdentity'].apply(pd.eval)
except Exception:
pass
if 'themeTags' not in df.columns:
if isinstance(tags, np.ndarray):
tags = tags.tolist()
# Get color identity (stored as string like "W", "UB", "WUG", etc.)
ci = row.get('colorIdentity')
if isinstance(ci, np.ndarray):
ci = ci.tolist()
# Convert colorIdentity to set of letters
if isinstance(ci, str):
letters = set(ci) # "WUG" -> {'W', 'U', 'G'}
elif isinstance(ci, list):
letters = set(ci) # ['W', 'U', 'G'] -> {'W', 'U', 'G'}
else:
letters = set()
# Determine base colors from color identity
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
if not bases:
# Colorless cards don't contribute to any specific color
continue
# Derive base colors from colorIdentity if available, else assume single color file
def rows_base_colors(row):
ids = row.get('colorIdentity') if isinstance(row, dict) else row
if isinstance(ids, list):
letters = set(ids)
else:
letters = set()
derived = set()
for name, letter in BASE_COLORS.items():
if letter in letters:
derived.add(name)
if not derived:
derived.add(color)
return derived
# Iterate rows
for _, row in df.iterrows():
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
# Compute base colors contribution
ci = row['colorIdentity'] if 'colorIdentity' in row else None
letters = set(ci) if isinstance(ci, list) else set()
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
if not bases:
bases = {color}
for bc in bases:
for t in tags:
result[bc][t] += 1
# Tally tags for each base color this card belongs to
for base_color in bases:
for tag in tags:
if isinstance(tag, str) and tag:
result[base_color][tag] += 1
# Convert Counters to plain dicts
return {k: dict(v) for k, v in result.items()}
def gather_theme_tag_rows() -> List[List[str]]:
"""Collect per-card themeTags lists across all base color CSVs.
"""
Collect per-card themeTags lists from parquet file.
Note: This function now reads from card_files/processed/all_cards.parquet
instead of per-color CSV files. The CSV files no longer exist after the
parquet migration.
Returns a list of themeTags arrays, one per card row where themeTags is present.
Returns:
List of themeTags arrays, one per card row where themeTags is present.
"""
rows: List[List[str]] = []
for color in BASE_COLORS.keys():
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
if not os.path.exists(path):
continue
try:
df = pd.read_csv(path, converters={'themeTags': pd.eval})
except Exception:
df = pd.read_csv(path)
if 'themeTags' in df.columns:
try:
df['themeTags'] = df['themeTags'].apply(pd.eval)
except Exception:
df['themeTags'] = df['themeTags'].apply(lambda x: [])
if 'themeTags' not in df.columns:
continue
for _, row in df.iterrows():
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
if tags:
rows.append(tags)
# Load from all_cards.parquet
parquet_path = get_processed_cards_path()
if not os.path.exists(parquet_path):
print(f"Warning: Parquet file not found: {parquet_path}")
return rows
try:
df = pd.read_parquet(parquet_path, columns=['themeTags'], engine='pyarrow')
except Exception as e:
print(f"Error reading parquet file: {e}")
return rows
if 'themeTags' not in df.columns:
print("Warning: themeTags column not found in parquet file")
return rows
# Collect theme tags from each card
for _, row in df.iterrows():
# Parquet stores themeTags as numpy array
tags = row.get('themeTags')
if isinstance(tags, np.ndarray):
tags = tags.tolist()
if isinstance(tags, list) and tags:
# Convert to list of strings (filter out non-strings)
tag_list = [str(t) for t in tags if isinstance(t, str) and t]
if tag_list:
rows.append(tag_list)
return rows

View file

@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Strip Theme Catalog Script
Removes themes with insufficient card counts from the theme catalog YAML files.
Creates backups and logs all stripped themes for reference.
Usage:
python -m code.scripts.strip_catalog_themes [--min-cards N] [--no-backup] [--dry-run]
Options:
--min-cards N Override THEME_MIN_CARDS setting (default: from environment/settings)
--no-backup Skip creating backup files
--dry-run Show what would be stripped without making changes
Example:
python -m code.scripts.strip_catalog_themes
python -m code.scripts.strip_catalog_themes --min-cards 3 --dry-run
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Add project root to path for imports
PROJECT_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(PROJECT_ROOT))
from code import settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_catalog_themes,
create_stripped_themes_log,
get_theme_distribution
)
def main():
    """CLI entry point: strip low-card themes from the catalog YAML files.

    Reads theme card counts from the processed parquet files, reports the
    distribution, and (unless --dry-run) removes below-threshold themes
    from the catalog and writes a stripped-themes log.

    Returns:
        Process exit code (0 on success, 1 on error).
    """
    parser = argparse.ArgumentParser(
        description="Strip themes with insufficient card counts from catalog YAML files"
    )
    parser.add_argument(
        "--min-cards",
        type=int,
        default=settings.THEME_MIN_CARDS,
        help=f"Minimum cards required to keep a theme (default: {settings.THEME_MIN_CARDS})"
    )
    parser.add_argument(
        "--no-backup",
        action="store_true",
        help="Skip creating backup files before modification"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be stripped without making changes"
    )
    args = parser.parse_args()

    # Paths
    processed_dir = Path(settings.CARD_FILES_PROCESSED_DIR)
    catalog_dir = PROJECT_ROOT / 'config' / 'themes' / 'catalog'
    log_dir = PROJECT_ROOT / 'logs'
    stripped_log_path = log_dir / 'stripped_themes.yml'

    print(f"Stripping themes from catalog (min_cards={args.min_cards})")
    print(f"Catalog directory: {catalog_dir}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Step 1: Get theme card counts from parquet files
    print("Step 1: Analyzing theme card counts from parquet files...")
    parquet_files = sorted(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        return 1
    print(f"Found {len(parquet_files)} parquet files")
    theme_counts = get_theme_card_counts(parquet_files)
    print(f"Found {len(theme_counts)} unique themes")
    # Guard against an empty mapping: the percentage math in Step 3 divides
    # by len(theme_counts) and would raise ZeroDivisionError otherwise.
    if not theme_counts:
        print("Error: No themes found in parquet files")
        return 1
    print()

    # Step 2: Get distribution
    distribution = get_theme_distribution(theme_counts)
    print("Theme distribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
    print(f" Total: {distribution['total']:4d} themes")
    print()

    # Step 3: Identify themes to strip
    themes_to_strip = identify_themes_to_strip(theme_counts, args.min_cards)
    themes_to_keep = set(theme_counts.keys()) - themes_to_strip
    print(f"Themes to strip: {len(themes_to_strip)} ({len(themes_to_strip)/len(theme_counts)*100:.1f}%)")
    print(f"Themes to keep: {len(themes_to_keep)} ({len(themes_to_keep)/len(theme_counts)*100:.1f}%)")
    print()

    # Show sample of themes to strip
    if themes_to_strip:
        print("Sample themes to strip (first 10):")
        sample = sorted(themes_to_strip)[:10]
        for theme_id in sample:
            # theme_counts values appear to be collections of card names —
            # len() gives the count, sorted()[:3] a preview.
            count = len(theme_counts[theme_id])
            cards_sample = sorted(theme_counts[theme_id])[:3]
            cards_str = ", ".join(cards_sample)
            if count > 3:
                cards_str += f", ... ({count} total)"
            print(f" - {theme_id} ({count} cards): {cards_str}")
        print()

    if args.dry_run:
        print("DRY RUN: No changes made")
        return 0

    # Step 4: Strip themes from catalog
    print("Step 4: Stripping themes from catalog YAML files...")
    results = strip_catalog_themes(
        catalog_dir=catalog_dir,
        themes_to_strip=themes_to_strip,
        backup=not args.no_backup
    )
    print(f" Stripped: {results['stripped_count']} themes")
    print(f" Files deleted: {len(results['files_deleted'])}")
    print(f" Backups created: {len(results['backups_created'])}")
    if results['errors']:
        print(f" Errors: {len(results['errors'])}")
        for error in results['errors'][:5]:  # Show first 5 errors
            print(f" - {error}")
    print()

    # Step 5: Create stripped themes log
    print("Step 5: Creating stripped themes log...")
    create_stripped_themes_log(
        output_path=stripped_log_path,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=args.min_cards,
        sources=["catalog YAML"]
    )
    print(f" Log written to {stripped_log_path}")
    print()

    print("✅ Catalog stripping complete!")
    print()
    print("Summary:")
    print(f" Total themes analyzed: {len(theme_counts)}")
    print(f" Themes stripped: {len(themes_to_strip)}")
    print(f" Themes remaining: {len(themes_to_keep)}")
    print(f" Catalog files deleted: {len(results['files_deleted'])}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,253 @@
#!/usr/bin/env python3
"""
Strip low-card themes from parquet file themeTags columns.
This script identifies and removes themes below the THEME_MIN_CARDS threshold
from the themeTags column in parquet files. It's part of Milestone 4 (M4) of
the Theme Stripping roadmap (R21).
Usage:
# Dry run to see what would be stripped
python code/scripts/strip_parquet_themes.py --dry-run
# Strip from single parquet file
python code/scripts/strip_parquet_themes.py --file card_files/processed/all_cards.parquet
# Strip from all parquet files in directory
python code/scripts/strip_parquet_themes.py --all
# Specify custom threshold
python code/scripts/strip_parquet_themes.py --threshold 10 --all
Environment Variables:
THEME_MIN_CARDS: Minimum card threshold (default: 5)
Outputs:
- Modified parquet file(s) with stripped themeTags
- Timestamped backup (.parquet.bak) if --backup enabled
- Updated logs/stripped_themes.yml log
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
# Add project root to path
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
from code import settings as code_settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_parquet_themes,
create_stripped_themes_log
)
def find_parquet_files(directory: Path) -> list[Path]:
    """Return every *.parquet file in *directory*, in sorted path order."""
    matches = list(directory.glob("*.parquet"))
    matches.sort()
    return matches
def update_stripped_themes_log(
    theme_counts: dict,
    themes_to_strip: set[str],
    min_cards: int
) -> None:
    """Write logs/stripped_themes.yml reflecting a parquet stripping run."""
    destination = ROOT / "logs" / "stripped_themes.yml"
    # The log records which themes were removed, the threshold used, and
    # that parquet files were the source of this run.
    create_stripped_themes_log(
        output_path=destination,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=min_cards,
        sources=["parquet files"]
    )
    print(f"\nUpdated stripped themes log: {destination}")
def main():
    """CLI entry point: strip low-card themes from parquet themeTags columns.

    Selects the parquet file(s) to process from --file/--all (defaulting to
    all_cards.parquet), identifies themes below the threshold, strips them
    from each file, and updates the stripped-themes log.

    Returns:
        Process exit code (0 on success, 1 on error).
    """
    parser = argparse.ArgumentParser(
        description="Strip low-card themes from parquet themeTags columns",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--file',
        type=Path,
        help='Specific parquet file to process'
    )
    parser.add_argument(
        '--all',
        action='store_true',
        help='Process all parquet files in card_files/processed/'
    )
    parser.add_argument(
        '--threshold',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )
    args = parser.parse_args()

    # Determine threshold. Compare against None (not truthiness) so an
    # explicit `--threshold 0` is honored instead of silently replaced by
    # the default setting.
    min_cards = args.threshold if args.threshold is not None else code_settings.THEME_MIN_CARDS

    # Determine which files to process
    if args.file:
        if not args.file.exists():
            print(f"Error: File not found: {args.file}")
            return 1
        parquet_files = [args.file]
    elif args.all:
        processed_dir = ROOT / "card_files" / "processed"
        parquet_files = find_parquet_files(processed_dir)
        if not parquet_files:
            print(f"No parquet files found in {processed_dir}")
            return 1
    else:
        # Default: process all_cards.parquet
        default_file = ROOT / "card_files" / "processed" / "all_cards.parquet"
        if not default_file.exists():
            print(f"Error: Default file not found: {default_file}")
            print("Use --file or --all to specify files to process")
            return 1
        parquet_files = [default_file]

    print("Theme Stripping Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Files to process: {len(parquet_files)}")
    print(f" Backup enabled: {not args.no_backup}")
    print(f" Dry run: {args.dry_run}")
    print()

    # Get theme card counts from parquet files
    print("Analyzing theme card counts...")
    try:
        theme_counts = get_theme_card_counts(parquet_files)
        print(f"Found {len(theme_counts)} unique themes across files")
    except Exception as e:
        print(f"Error analyzing theme counts: {e}")
        return 1

    # Identify themes to strip
    print("Identifying themes to strip...")
    try:
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    except Exception as e:
        print(f"Error identifying themes to strip: {e}")
        return 1
    if not themes_to_strip:
        print("No themes found below threshold. Nothing to strip.")
        return 0
    print(f"Found {len(themes_to_strip)} themes to strip")
    if args.verbose:
        sample = sorted(themes_to_strip)[:10]
        print(f"Sample themes: {', '.join(sample)}")
        if len(themes_to_strip) > 10:
            print(f" ... and {len(themes_to_strip) - 10} more")
    print()

    # Dry run mode: report the plan without touching any files.
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        for parquet_file in parquet_files:
            print(f"Would process: {parquet_file}")
        print(f"\nWould strip {len(themes_to_strip)} themes from themeTags column")
        return 0

    # Process each parquet file, accumulating totals across files.
    total_results = {
        "files_processed": 0,
        "cards_processed": 0,
        "tags_removed": 0,
        "errors": []
    }
    for parquet_file in parquet_files:
        print(f"Processing: {parquet_file.name}")
        try:
            results = strip_parquet_themes(
                parquet_path=parquet_file,
                themes_to_strip=themes_to_strip,
                backup=not args.no_backup
            )
            total_results["files_processed"] += 1
            total_results["cards_processed"] += results["cards_processed"]
            total_results["tags_removed"] += results["tags_removed"]
            total_results["errors"].extend(results["errors"])
            if args.verbose:
                print(f" Cards: {results['cards_processed']}")
                print(f" Tags removed: {results['tags_removed']}")
                if results["backup_created"]:
                    print(f" Backup: {results['backup_created']}")
        except Exception as e:
            # Record the failure but keep processing the remaining files.
            error_msg = f"Error processing {parquet_file}: {e}"
            print(f" {error_msg}")
            total_results["errors"].append(error_msg)
            continue
    print()

    # Update stripped themes log (best-effort: a logging failure should not
    # mask a successful stripping run).
    try:
        update_stripped_themes_log(theme_counts, themes_to_strip, min_cards)
    except Exception as e:
        print(f"Warning: Failed to update stripped themes log: {e}")

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Files processed: {total_results['files_processed']}")
    print(f"Cards processed: {total_results['cards_processed']}")
    print(f"Tags removed: {total_results['tags_removed']}")
    print(f"Themes stripped: {len(themes_to_strip)}")
    if total_results["errors"]:
        print(f"\nErrors encountered: {len(total_results['errors'])}")
        for error in total_results["errors"]:
            print(f" - {error}")
    else:
        print("\nStripping completed successfully!")
    return 0 if not total_results["errors"] else 1


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,380 @@
#!/usr/bin/env python3
"""
Standalone theme stripping orchestration script.
This script coordinates the complete theme stripping pipeline:
1. Analyze parquet files to identify low-card themes
2. Strip from catalog YAML files (optional)
3. Strip from parquet themeTags columns (optional)
4. Rebuild theme_list.json from stripped parquet data
5. Generate stripped_themes.yml log
Part of Milestone 5 (M5) - Integration & Testing for Theme Stripping (R21).
Usage:
# Dry run to preview changes
python code/scripts/strip_themes.py --dry-run
# Strip everything with default threshold (5 cards)
python code/scripts/strip_themes.py
# Strip only catalog YAML files
python code/scripts/strip_themes.py --sources catalog
# Strip only parquet files
python code/scripts/strip_themes.py --sources parquet
# Custom threshold
python code/scripts/strip_themes.py --min-cards 10
# Skip backups (not recommended)
python code/scripts/strip_themes.py --no-backup
Environment Variables:
THEME_MIN_CARDS: Minimum card threshold (default: 5)
Outputs:
- Modified catalog/*.yml files (if --sources includes catalog)
- Modified parquet files (if --sources includes parquet)
- Regenerated config/themes/theme_list.json
- Updated logs/stripped_themes.yml log
- Timestamped backups (if --backup enabled)
"""
import argparse
import sys
import time
from pathlib import Path
from datetime import datetime
from typing import Set, Dict
# Add project root to path
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
from code import settings as code_settings
from code.tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_catalog_themes,
strip_parquet_themes,
create_stripped_themes_log
)
def strip_all_sources(
    min_cards: int,
    sources: Set[str],
    backup: bool,
    dry_run: bool,
    verbose: bool
) -> Dict:
    """
    Execute complete theme stripping pipeline.

    Pipeline order matters: catalog YAML must be stripped BEFORE rebuilding
    theme_list.json, because the rebuild reads from both parquet data and
    the catalog YAML files.

    Args:
        min_cards: Minimum card count threshold; themes with fewer cards are stripped
        sources: Set of sources to strip ('catalog', 'parquet', or both)
        backup: Whether to create timestamped backups before modification
        dry_run: Preview changes without modifying files
        verbose: Show detailed per-file output

    Returns:
        Dictionary with stripping results and statistics:
        themes_analyzed, themes_to_strip, catalog_stripped (file count),
        parquet_tags_removed, json_regenerated, errors (list of messages)
    """
    start_time = time.time()
    results = {
        "themes_analyzed": 0,
        "themes_to_strip": 0,
        "catalog_stripped": 0,
        "parquet_tags_removed": 0,
        "json_regenerated": False,
        "errors": []
    }
    print("="*70)
    print("THEME STRIPPING PIPELINE")
    print("="*70)
    print("Configuration:")
    print(f"  Minimum cards: {min_cards}")
    print(f"  Sources: {', '.join(sorted(sources))}")
    print(f"  Backup enabled: {backup}")
    print(f"  Dry run: {dry_run}")
    print()
    # Step 1: Analyze parquet files to build the theme -> card-set mapping
    print("Step 1: Analyzing theme card counts...")
    try:
        parquet_dir = ROOT / "card_files" / "processed"
        parquet_files = sorted(parquet_dir.glob("*.parquet"))
        if not parquet_files:
            results["errors"].append("No parquet files found in card_files/processed/")
            return results
        theme_counts = get_theme_card_counts(parquet_files)
        results["themes_analyzed"] = len(theme_counts)
        print(f"  Found {len(theme_counts)} unique themes")
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
        results["themes_to_strip"] = len(themes_to_strip)
        print(f"  Identified {len(themes_to_strip)} themes below threshold")
        if verbose and themes_to_strip:
            sample = sorted(themes_to_strip)[:5]
            print(f"  Sample themes: {', '.join(sample)}")
            if len(themes_to_strip) > 5:
                print(f"  ... and {len(themes_to_strip) - 5} more")
        if not themes_to_strip:
            print("\n✅ No themes below threshold. Nothing to strip.")
            return results
    except Exception as e:
        error_msg = f"Analysis failed: {e}"
        print(error_msg)
        results["errors"].append(error_msg)
        return results
    print()
    # Dry run mode: report what would happen, touch nothing
    if dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        if 'catalog' in sources:
            print("Would strip from catalog YAML files:")
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            yaml_files = sorted(catalog_dir.glob("*.yml"))
            for yaml_file in yaml_files[:5]:
                print(f"  - {yaml_file.name}")
            if len(yaml_files) > 5:
                print(f"  ... and {len(yaml_files) - 5} more")
        if 'parquet' in sources:
            print("\nWould strip from parquet files:")
            for pf in parquet_files[:3]:
                print(f"  - {pf.name}")
            if len(parquet_files) > 3:
                print(f"  ... and {len(parquet_files) - 3} more")
        print(f"\nWould strip {len(themes_to_strip)} themes total")
        print("Would regenerate theme_list.json")
        print("Would update stripped_themes.yml log")
        return results
    # Step 2: Strip from catalog (if requested)
    # NOTE: Catalog YAML must be stripped BEFORE building theme_list.json,
    # otherwise build_theme_catalog.py will read un-stripped themes from YAML
    if 'catalog' in sources:
        print("Step 2: Stripping from catalog YAML files...")
        try:
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            catalog_results = strip_catalog_themes(
                catalog_dir=catalog_dir,
                themes_to_strip=themes_to_strip,
                backup=backup
            )
            # strip_catalog_themes returns "stripped_count", "files_modified",
            # "files_deleted", "backups_created" and "errors". There is no
            # "themes_removed" key (the old code raised KeyError here), and
            # single-theme files are deleted rather than modified, so the
            # file count must include both lists.
            files_touched = len(catalog_results["files_modified"]) + len(catalog_results["files_deleted"])
            results["catalog_stripped"] = files_touched
            if verbose:
                print(f"  Files modified/deleted: {files_touched}")
                print(f"  Themes removed: {catalog_results['stripped_count']}")
                if catalog_results["backups_created"]:
                    print(f"  Backups created: {len(catalog_results['backups_created'])}")
            else:
                print(f"  ✓ Stripped {catalog_results['stripped_count']} themes from {files_touched} files")
            results["errors"].extend(catalog_results["errors"])
        except Exception as e:
            error_msg = f"Catalog stripping failed: {e}"
            print(error_msg)
            results["errors"].append(error_msg)
        print()
    # Step 3: Strip from parquet (if requested)
    if 'parquet' in sources:
        step_num = 3 if 'catalog' in sources else 2
        print(f"Step {step_num}: Stripping from parquet files...")
        try:
            for parquet_file in parquet_files:
                if verbose:
                    print(f"  Processing: {parquet_file.name}")
                parquet_results = strip_parquet_themes(
                    parquet_path=parquet_file,
                    themes_to_strip=themes_to_strip,
                    backup=backup
                )
                results["parquet_tags_removed"] += parquet_results["tags_removed"]
                results["errors"].extend(parquet_results["errors"])
                if verbose and parquet_results["tags_removed"] > 0:
                    print(f"    Removed {parquet_results['tags_removed']} tag occurrences")
            if not verbose:
                print(f"  ✓ Removed {results['parquet_tags_removed']} tag occurrences from {len(parquet_files)} file(s)")
        except Exception as e:
            error_msg = f"Parquet stripping failed: {e}"
            print(error_msg)
            results["errors"].append(error_msg)
        print()
    # Step 4: Rebuild theme_list.json (if parquet was stripped)
    # NOTE: This reads from both parquet AND catalog YAML, so both must be stripped first
    if 'parquet' in sources:
        step_num = 4 if 'catalog' in sources else 3
        print(f"Step {step_num}: Rebuilding theme_list.json...")
        try:
            # Import build script lazily to avoid import cost when unused
            from code.scripts.build_theme_catalog import main as build_main
            # Suppress verbose build output unless --verbose flag
            import io
            import contextlib
            if not verbose:
                with contextlib.redirect_stdout(io.StringIO()):
                    build_main()
            else:
                build_main()
            results["json_regenerated"] = True
            print("  ✓ theme_list.json regenerated")
        except Exception as e:
            error_msg = f"JSON regeneration failed: {e}"
            print(error_msg)
            results["errors"].append(error_msg)
        print()
    # Step 5: Update stripped themes log (step number depends on which sources ran)
    final_step = 5 if ('catalog' in sources and 'parquet' in sources) else (3 if 'catalog' in sources else 4)
    print(f"Step {final_step}: Updating stripped_themes.yml log...")
    try:
        log_path = ROOT / "logs" / "stripped_themes.yml"
        source_labels = []
        if 'catalog' in sources:
            source_labels.append("catalog YAML")
        if 'parquet' in sources:
            source_labels.append("parquet files")
        create_stripped_themes_log(
            output_path=log_path,
            theme_counts=theme_counts,
            themes_stripped=themes_to_strip,
            min_threshold=min_cards,
            sources=source_labels if source_labels else None
        )
        print(f"  ✓ Log updated: {log_path}")
    except Exception as e:
        error_msg = f"Log update failed: {e}"
        print(error_msg)
        results["errors"].append(error_msg)
    # Final summary
    elapsed = time.time() - start_time
    print()
    print("="*70)
    print("SUMMARY")
    print("="*70)
    print(f"Themes analyzed:        {results['themes_analyzed']}")
    print(f"Themes stripped:        {results['themes_to_strip']}")
    if 'catalog' in sources:
        print(f"Catalog files modified: {results['catalog_stripped']}")
    if 'parquet' in sources:
        print(f"Parquet tags removed:   {results['parquet_tags_removed']}")
        print(f"JSON regenerated:       {'Yes' if results['json_regenerated'] else 'No'}")
    print(f"Time elapsed:           {elapsed:.2f}s")
    if results["errors"]:
        print(f"\n⚠️  Errors encountered: {len(results['errors'])}")
        for error in results["errors"]:
            print(f"  - {error}")
    else:
        print("\n✅ Theme stripping completed successfully!")
    return results
def main():
    """CLI entry point: parse arguments, run the stripping pipeline, return exit code (0/1)."""
    parser = argparse.ArgumentParser(
        description="Orchestrate complete theme stripping pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--min-cards',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )
    parser.add_argument(
        '--sources',
        type=str,
        help='Comma-separated list of sources to strip: catalog, parquet, all (default: all)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )
    args = parser.parse_args()
    # Determine threshold.
    # Explicit None check: the old truthiness test treated a legitimate
    # `--min-cards 0` (strip only zero-card themes) as "not provided" and
    # silently fell back to the configured default.
    min_cards = args.min_cards if args.min_cards is not None else code_settings.THEME_MIN_CARDS
    # Determine sources
    if args.sources:
        source_input = args.sources.lower()
        if source_input == 'all':
            sources = {'catalog', 'parquet'}
        else:
            sources = set(s.strip() for s in source_input.split(','))
            valid_sources = {'catalog', 'parquet'}
            invalid = sources - valid_sources
            if invalid:
                print(f"Error: Invalid sources: {', '.join(invalid)}")
                print(f"Valid sources: {', '.join(valid_sources)}, all")
                return 1
    else:
        sources = {'catalog', 'parquet'}  # Default: all sources
    # Execute pipeline
    results = strip_all_sources(
        min_cards=min_cards,
        sources=sources,
        backup=not args.no_backup,
        dry_run=args.dry_run,
        verbose=args.verbose
    )
    # Return exit code: non-zero if any step reported errors
    return 0 if not results["errors"] else 1
# Script entry point: propagate main()'s status code (0 = success, 1 = errors) to the shell.
if __name__ == "__main__":
    sys.exit(main())

View file

@ -156,4 +156,14 @@ SIMILARITY_CACHE_MAX_AGE_DAYS = int(os.getenv('SIMILARITY_CACHE_MAX_AGE_DAYS', '
SIMILARITY_CACHE_DOWNLOAD = os.getenv('SIMILARITY_CACHE_DOWNLOAD', '1').lower() not in ('0', 'false', 'off', 'disabled')

# Batch build feature flag (Build X and Compare)
# (the assignment appeared twice; the redundant duplicate was removed)
ENABLE_BATCH_BUILD = os.getenv('ENABLE_BATCH_BUILD', '1').lower() not in ('0', 'false', 'off', 'disabled')

# ----------------------------------------------------------------------------------
# THEME CATALOG SETTINGS
# ----------------------------------------------------------------------------------
# Minimum number of cards required for a theme to be kept in the system
# Themes with fewer cards will be stripped during setup/tagging
# Set to 1 to keep all themes with at least one card
# Set to 0 to only strip orphaned themes (themes with zero cards)
# max(0, ...) clamps negative env values to 0 (never a negative threshold)
THEME_MIN_CARDS = max(0, int(os.getenv('THEME_MIN_CARDS', '5')))

View file

@ -9,6 +9,7 @@ from pathlib import Path
from typing import DefaultDict, Dict, List, Set
# Third-party imports
import numpy as np
import pandas as pd
@ -151,7 +152,8 @@ def apply_combo_tags(
# Calculate updated counts
updated_counts: Dict[str, int] = {}
if before_hash != after_hash:
updated_counts["total"] = int((df["comboTags"].apply(bool)).sum())
# Use len() > 0 to handle arrays properly (avoid ambiguous truth value)
updated_counts["total"] = int((df["comboTags"].apply(lambda x: len(x) > 0 if isinstance(x, (list, np.ndarray)) else bool(x))).sum())
else:
updated_counts["total"] = 0

View file

@ -6897,6 +6897,112 @@ def run_tagging(parallel: bool = False, max_workers: int | None = None):
logger.info(f"✓ Wrote tagging completion flag to {flag_path}")
except Exception as e:
logger.warning(f"Failed to write tagging completion flag: {e}")
# R21: Theme stripping after tagging (if THEME_MIN_CARDS > 1)
try:
from settings import THEME_MIN_CARDS
if THEME_MIN_CARDS > 1:
logger.info("=" * 80)
logger.info(f"Starting theme stripping (THEME_MIN_CARDS={THEME_MIN_CARDS})")
logger.info("=" * 80)
strip_start = pd.Timestamp.now()
# Import theme stripping functions
from tagging.theme_stripper import (
get_theme_card_counts,
identify_themes_to_strip,
strip_parquet_themes,
strip_catalog_themes,
create_stripped_themes_log
)
# Define project root (tagger.py is in code/tagging/, so go up 2 levels)
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# Step 1: Analyze themes
parquet_dir = Path("card_files/processed")
parquet_files = sorted(parquet_dir.glob("*.parquet"))
logger.info(f"Analyzing {len(parquet_files)} parquet files...")
theme_counts = get_theme_card_counts(parquet_files)
themes_to_strip = identify_themes_to_strip(theme_counts, THEME_MIN_CARDS)
logger.info(f"Found {len(theme_counts)} themes, {len(themes_to_strip)} below threshold")
if themes_to_strip:
# Step 2: Strip from catalog YAML (MUST happen before building JSON)
logger.info("Stripping themes from catalog YAML files...")
catalog_dir = PROJECT_ROOT / "config" / "themes" / "catalog"
if catalog_dir.exists():
catalog_results = strip_catalog_themes(
catalog_dir=catalog_dir,
themes_to_strip=themes_to_strip,
backup=True
)
logger.info(f"✓ Modified {len(catalog_results['files_modified'])} catalog files, stripped {catalog_results['stripped_count']} themes")
else:
logger.info("Catalog directory doesn't exist yet, skipping YAML stripping")
# Step 3: Strip from parquet files
logger.info("Stripping themes from parquet files...")
total_tags_removed = 0
for parquet_file in parquet_files:
results = strip_parquet_themes(
parquet_path=parquet_file,
themes_to_strip=themes_to_strip,
backup=True
)
total_tags_removed += results["tags_removed"]
logger.info(f"✓ Removed {total_tags_removed} theme tag occurrences")
# Step 4: Rebuild theme_list.json from stripped data
logger.info("Rebuilding theme_list.json from stripped parquet and catalog...")
try:
from scripts.build_theme_catalog import build_catalog
import json
from pathlib import Path
# Call build_catalog directly to avoid argparse issues
data = build_catalog(limit=0, verbose=False)
output_path = PROJECT_ROOT / "config" / "themes" / "theme_list.json"
with open(output_path, 'w', encoding='utf-8') as f:
json.dump({k: v for k, v in data.items() if k != 'yaml_catalog'}, f, indent=2, ensure_ascii=False)
logger.info("✓ theme_list.json regenerated from stripped sources")
except Exception as e:
logger.warning(f"Failed to rebuild theme_list.json: {e}")
# Step 5: Update stripped themes log
logger.info("Updating stripped_themes.yml log...")
log_path = PROJECT_ROOT / "logs" / "stripped_themes.yml"
create_stripped_themes_log(
output_path=log_path,
theme_counts=theme_counts,
themes_stripped=themes_to_strip,
min_threshold=THEME_MIN_CARDS,
sources=["parquet files", "catalog YAML"]
)
logger.info(f"✓ Log updated: {log_path}")
strip_duration = (pd.Timestamp.now() - strip_start).total_seconds()
logger.info("=" * 80)
logger.info(f"✓ Theme stripping complete in {strip_duration:.2f}s")
logger.info(f" Themes stripped: {len(themes_to_strip)}")
logger.info(f" Tags removed: {total_tags_removed}")
logger.info("=" * 80)
else:
logger.info("No themes below threshold, skipping stripping")
else:
logger.info(f"Theme stripping disabled (THEME_MIN_CARDS={THEME_MIN_CARDS})")
except Exception as e:
logger.error(f"Theme stripping failed: {e}")
logger.warning("Continuing without theme stripping")

View file

@ -0,0 +1,621 @@
"""
Theme Stripping Module
Provides threshold logic and utilities for identifying and stripping themes
with insufficient card counts from the theme catalog and card data.
This module supports M1-M4 of the Theme Stripping roadmap:
- M1: Threshold logic and theme count analysis
- M2: Theme catalog YAML stripping
- M3: theme_list.json stripping
- M4: Parquet file theme_tags stripping
"""
from __future__ import annotations
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, Set, List, Tuple, Any, Optional
import pandas as pd
import numpy as np
try:
import yaml
except ImportError:
yaml = None # type: ignore
# ----------------------------------------------------------------------------------
# M1: Threshold Logic & Analysis
# ----------------------------------------------------------------------------------
def get_theme_card_counts(parquet_paths: List[Path]) -> Dict[str, Set[str]]:
    """
    Build a mapping of theme -> set of card names from parquet files.

    Args:
        parquet_paths: List of paths to parquet files to analyze

    Returns:
        Dictionary mapping normalized theme ID (lowercase, underscores) to the
        set of card names carrying that theme.

    Example:
        {"lifegain": {"Ajani's Pridemate", "Soul Warden", ...}, ...}
    """
    def _coerce_tags(raw) -> List[str]:
        # Normalize the themeTags cell into a plain list of non-empty strings.
        if isinstance(raw, np.ndarray):
            return [str(t).strip() for t in raw if str(t).strip()]
        if isinstance(raw, str):
            # Strings may be pipe- or comma-separated; pipe takes precedence.
            if '|' in raw:
                return [t.strip() for t in raw.split('|') if t.strip()]
            if ',' in raw:
                return [t.strip() for t in raw.split(',') if t.strip()]
            return [raw.strip()] if raw.strip() else []
        if isinstance(raw, list):
            return [str(t).strip() for t in raw if str(t).strip()]
        return []

    mapping: Dict[str, Set[str]] = {}
    for path in parquet_paths:
        try:
            frame = pd.read_parquet(path)
            for _, row in frame.iterrows():
                card_name = row.get('name', '')
                for tag in _coerce_tags(row.get('themeTags', [])):
                    # Normalize theme ID (lowercase, spaces -> underscores)
                    theme_id = tag.lower().replace(' ', '_')
                    mapping.setdefault(theme_id, set()).add(card_name)
        except Exception as e:
            # Best-effort: a bad file is reported and skipped, not fatal.
            print(f"Warning: Failed to process {path}: {e}")
            continue
    return mapping
def identify_themes_to_strip(
    theme_counts: Dict[str, Set[str]],
    min_cards: int
) -> Set[str]:
    """
    Identify themes that should be stripped based on card count threshold.

    Args:
        theme_counts: Dictionary mapping theme ID to set of card names
        min_cards: Minimum number of cards required to keep a theme

    Returns:
        Set of theme IDs whose card count is strictly below *min_cards*.

    Example:
        >>> counts = {"daybound": {"Card1", "Card2"}, "lifegain": {"Card1", "Card2", "Card3", "Card4", "Card5"}}
        >>> identify_themes_to_strip(counts, 5)
        {'daybound'}
    """
    return {
        theme_id
        for theme_id, cards in theme_counts.items()
        if len(cards) < min_cards
    }
def should_strip_theme(theme: str, card_count: int, min_cards: int) -> bool:
    """
    Determine if a specific theme should be stripped based on threshold.

    Args:
        theme: Theme ID (unused in the decision; kept for call-site clarity)
        card_count: Number of cards with this theme
        min_cards: Minimum threshold

    Returns:
        True when *card_count* is strictly below *min_cards*.
    """
    below_threshold = card_count < min_cards
    return below_threshold
def get_theme_distribution(theme_counts: Dict[str, Set[str]]) -> Dict[str, int]:
    """
    Get distribution of themes by card count buckets.

    Args:
        theme_counts: Dictionary mapping theme ID to set of card names

    Returns:
        Dictionary with distribution statistics:
        - "1_card": themes with exactly 1 card
        - "2_cards": themes with exactly 2 cards
        - "3_4_cards": themes with 3-4 cards
        - "5_9_cards": themes with 5-9 cards
        - "10_plus": everything else (10+; a count of 0 also lands here)
        - "total": total number of themes
    """
    distribution = dict.fromkeys(
        ("1_card", "2_cards", "3_4_cards", "5_9_cards", "10_plus", "total"), 0
    )
    for cards in theme_counts.values():
        n = len(cards)
        distribution["total"] += 1
        if n == 1:
            bucket = "1_card"
        elif n == 2:
            bucket = "2_cards"
        elif 3 <= n <= 4:
            bucket = "3_4_cards"
        elif 5 <= n <= 9:
            bucket = "5_9_cards"
        else:
            bucket = "10_plus"
        distribution[bucket] += 1
    return distribution
def get_themes_by_count(
    theme_counts: Dict[str, Set[str]],
    below_threshold: int
) -> List[Tuple[str, int, List[str]]]:
    """
    Get list of themes below threshold with their counts and card lists.

    Args:
        theme_counts: Dictionary mapping theme ID to set of card names
        below_threshold: Themes with strictly fewer cards than this are listed

    Returns:
        List of (theme_id, card_count, sorted_card_list) tuples, ordered by
        count ascending, then theme ID alphabetically.

    Example:
        [("miracle", 4, ["Temporal Mastery", "Terminus", "Entreat the Angels", "Bonfire"]), ...]
    """
    rows = [
        (theme_id, len(cards), sorted(cards))
        for theme_id, cards in theme_counts.items()
        if len(cards) < below_threshold
    ]
    return sorted(rows, key=lambda row: (row[1], row[0]))
# ----------------------------------------------------------------------------------
# M2: Theme Catalog Stripping
# ----------------------------------------------------------------------------------
def backup_catalog_file(file_path: Path) -> Path:
    """
    Create a timestamped backup of a catalog YAML file.

    Args:
        file_path: Path to the YAML file to backup

    Returns:
        Path to the backup file created

    Raises:
        FileNotFoundError: if *file_path* does not exist

    Example:
        daybound.yml -> daybound_20260319_143025.yml.bak
    """
    if not file_path.exists():
        raise FileNotFoundError(f"Cannot backup non-existent file: {file_path}")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = file_path.stem  # filename without extension
    backup_path = file_path.parent / f"{stem}_{timestamp}.yml.bak"
    # Copy with shutil.copy2 for consistency with backup_parquet_file:
    # it is binary-safe and preserves file metadata (e.g. mtime), unlike
    # the previous read_text/write_text round-trip.
    import shutil
    shutil.copy2(file_path, backup_path)
    return backup_path
def remove_theme_from_catalog(yaml_data: Dict[str, Any], theme_id: str) -> bool:
    """
    Remove a theme entry from catalog YAML data.

    Args:
        yaml_data: Loaded YAML data (dict for single-theme files, list for
            hypothetical multi-theme files)
        theme_id: Theme ID to remove (must match exactly)

    Returns:
        True if theme was found (and, for list data, removed in place),
        False otherwise.

    Note:
        For single-theme files (a dict whose 'id' matches) the dict itself
        cannot be emptied here — the caller is responsible for deleting the
        file. List data is mutated in place.
    """
    if isinstance(yaml_data, dict):
        # Single-theme file: report a match; the caller deletes the file.
        return yaml_data.get('id') == theme_id
    if isinstance(yaml_data, list):
        # Multi-theme structure (future-proofing): drop the matching entry.
        for index, entry in enumerate(yaml_data):
            if isinstance(entry, dict) and entry.get('id') == theme_id:
                del yaml_data[index]
                return True
    return False
def strip_catalog_themes(
    catalog_dir: Path,
    themes_to_strip: Set[str],
    backup: bool = True
) -> Dict[str, Any]:
    """
    Strip low-card themes from YAML catalog files.

    Current catalog layout is one theme per file, so a stripped theme means
    its file is deleted (after an optional timestamped backup).

    Args:
        catalog_dir: Directory containing theme catalog YAML files
        themes_to_strip: Set of theme IDs to remove
        backup: Whether to create timestamped backups before modification

    Returns:
        Dictionary with stripping results:
        - "stripped_count": Number of themes stripped
        - "files_modified": Files rewritten in place (empty for one-theme-per-file layout)
        - "files_deleted": Files removed entirely
        - "backups_created": Backup file paths
        - "errors": Error messages

    Raises:
        RuntimeError: if PyYAML is not installed
        FileNotFoundError: if *catalog_dir* does not exist
    """
    if yaml is None:
        raise RuntimeError("PyYAML not installed - cannot strip catalog themes")
    if not catalog_dir.exists():
        raise FileNotFoundError(f"Catalog directory does not exist: {catalog_dir}")
    results: Dict[str, Any] = {
        "stripped_count": 0,
        "files_modified": [],
        "files_deleted": [],
        "backups_created": [],
        "errors": [],
    }
    for candidate in sorted(catalog_dir.glob("*.yml")):
        try:
            parsed = yaml.safe_load(candidate.read_text(encoding='utf-8'))
            if not isinstance(parsed, dict):
                continue  # Skip non-dict files
            theme_id = parsed.get('id')
            if not theme_id or theme_id not in themes_to_strip:
                continue  # Theme is being kept
            if backup:
                try:
                    results["backups_created"].append(str(backup_catalog_file(candidate)))
                except Exception as exc:
                    # Backup failure is recorded but does not abort the strip.
                    results["errors"].append(f"Backup failed for {candidate.name}: {exc}")
            # Single-theme file: remove it entirely.
            candidate.unlink()
            results["stripped_count"] += 1
            results["files_deleted"].append(str(candidate))
        except yaml.YAMLError as exc:
            results["errors"].append(f"YAML parse error in {candidate.name}: {exc}")
        except Exception as exc:
            results["errors"].append(f"Error processing {candidate.name}: {exc}")
    return results
def create_stripped_themes_log(
    output_path: Path,
    theme_counts: Dict[str, Set[str]],
    themes_stripped: Set[str],
    min_threshold: int,
    sources: Optional[List[str]] = None
) -> None:
    """
    Create a YAML log of stripped themes with metadata.

    Args:
        output_path: Path where stripped_themes.yml will be written
        theme_counts: Dictionary mapping theme ID to set of card names
        themes_stripped: Set of theme IDs that were stripped
        min_threshold: The minimum card threshold used for stripping
        sources: Optional list of sources themes were stripped from

    Raises:
        RuntimeError: if PyYAML is not installed

    Creates a YAML file with structure:
        metadata:
          last_updated: "2026-03-19T12:30:00"
          min_card_threshold: 5
          total_stripped: 42
        stripped_themes:
          - theme_id: "daybound"
            display_name: "Daybound"
            card_count: 3
            cards: [...]
            reason: "Below minimum card threshold (3 < 5)"
            stripped_from: [...]
    """
    if yaml is None:
        raise RuntimeError("PyYAML not installed - cannot create stripped themes log")
    entries: List[Dict[str, Any]] = []
    for theme_id in sorted(themes_stripped):
        cards = theme_counts.get(theme_id)
        if cards is None:
            continue  # No count data for this theme; nothing to report
        count = len(cards)
        entries.append({
            'theme_id': theme_id,
            # Human-readable name: underscores to spaces, title case
            'display_name': theme_id.replace('_', ' ').title(),
            'card_count': count,
            'cards': sorted(cards),
            'reason': f"Below minimum card threshold ({count} < {min_threshold})",
            # Default provenance when the caller does not specify sources
            'stripped_from': sources if sources else ["catalog YAML", "theme_list.json", "parquet files"],
        })
    # Order: fewest cards first, ties broken alphabetically
    entries.sort(key=lambda entry: (entry['card_count'], entry['theme_id']))
    log_data = {
        'metadata': {
            'last_updated': datetime.now().isoformat(),
            'min_card_threshold': min_threshold,
            'total_stripped': len(entries),
        },
        'stripped_themes': entries,
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(log_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, indent=2)
    print(f"Stripped themes log written to {output_path}")
# ----------------------------------------------------------------------------------
# M4: Parquet File Stripping
# ----------------------------------------------------------------------------------
def backup_parquet_file(file_path: Path) -> Path:
    """
    Create a timestamped backup of a parquet file.

    Args:
        file_path: Path to the parquet file to backup

    Returns:
        Path to the backup file created

    Raises:
        FileNotFoundError: if *file_path* does not exist

    Example:
        all_cards.parquet -> all_cards_20260319_143025.parquet.bak
    """
    import shutil
    if not file_path.exists():
        raise FileNotFoundError(f"Cannot backup non-existent file: {file_path}")
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Sibling file: same directory, stem plus timestamp, .parquet.bak suffix
    target = file_path.with_name(f"{file_path.stem}_{stamp}.parquet.bak")
    shutil.copy2(file_path, target)
    return target
def filter_theme_tags(theme_tags: Any, themes_to_strip: Set[str]) -> List[str]:
    """
    Remove specific themes from a themeTags value (handles multiple formats).

    Args:
        theme_tags: Can be numpy array, list, or string
        themes_to_strip: Set of theme IDs to remove (case-insensitive matching)

    Returns:
        Filtered list of theme tags

    Note:
        Matching is case-insensitive for robustness; pipe-separated strings
        take precedence over comma-separated ones.
    """
    if isinstance(theme_tags, np.ndarray):
        candidates = theme_tags.tolist()
    elif isinstance(theme_tags, list):
        candidates = theme_tags
    elif isinstance(theme_tags, str):
        # Pipe checked before comma, matching the parsing used elsewhere.
        for sep in ('|', ','):
            if sep in theme_tags:
                candidates = [part.strip() for part in theme_tags.split(sep) if part.strip()]
                break
        else:
            candidates = [theme_tags] if theme_tags else []
    else:
        candidates = []
    blocked = {name.lower() for name in themes_to_strip}
    return [tag for tag in candidates if str(tag).lower() not in blocked]
def update_parquet_theme_tags(df: pd.DataFrame, themes_to_strip: Set[str]) -> pd.DataFrame:
    """
    Process entire dataframe to remove stripped themes from themeTags column.

    Args:
        df: DataFrame with themeTags column
        themes_to_strip: Set of theme IDs to remove

    Returns:
        The same DataFrame, mutated in place (returned for call-chaining).

    Note:
        Emits a warning and returns the frame untouched when the
        themeTags column is absent.
    """
    if 'themeTags' not in df.columns:
        print("Warning: themeTags column not found in dataframe")
        return df

    def _strip(tags):
        # Delegate per-cell filtering (handles ndarray/list/str formats).
        return filter_theme_tags(tags, themes_to_strip)

    df['themeTags'] = df['themeTags'].apply(_strip)
    return df
def strip_parquet_themes(
    parquet_path: Path,
    themes_to_strip: Set[str],
    backup: bool = True
) -> Dict[str, Any]:
    """
    Strip low-card themes from parquet file's themeTags column.

    The file is modified in place: it is read, filtered, and written back
    (only when the themeTags column is present).

    Args:
        parquet_path: Path to parquet file
        themes_to_strip: Set of theme IDs to remove
        backup: Whether to create timestamped backup before modification

    Returns:
        Dictionary with stripping results:
        - "cards_processed": Total number of cards
        - "cards_modified": Number of cards with tags removed (approximation; see below)
        - "tags_removed": Total number of tag removals
        - "backup_created": Backup file path (if backup=True), else None
        - "errors": List of error messages

    Raises:
        FileNotFoundError: if *parquet_path* does not exist

    Example:
        results = strip_parquet_themes(
            Path("card_files/processed/all_cards.parquet"),
            {"fateseal", "gravestorm"},
            backup=True
        )
    """
    if not parquet_path.exists():
        raise FileNotFoundError(f"Parquet file does not exist: {parquet_path}")
    results = {
        "cards_processed": 0,
        "cards_modified": 0,
        "tags_removed": 0,
        "backup_created": None,
        "errors": []
    }
    try:
        # Load parquet (pyarrow engine for consistency with the write below)
        df = pd.read_parquet(parquet_path, engine='pyarrow')
        results["cards_processed"] = len(df)
        # Create backup before modification
        if backup:
            try:
                backup_path = backup_parquet_file(parquet_path)
                results["backup_created"] = str(backup_path)
                print(f"Created backup: {backup_path}")
            except Exception as e:
                results["errors"].append(f"Backup failed: {e}")
                # Continue anyway - modification is important
        # Track modifications
        if 'themeTags' in df.columns:
            # Count tags before stripping (cells may be list or ndarray here)
            tags_before = sum(
                len(tags) if isinstance(tags, (list, np.ndarray)) else 0
                for tags in df['themeTags']
            )
            # Apply filtering (mutates df in place; cells become plain lists)
            update_parquet_theme_tags(df, themes_to_strip)
            # Count tags after stripping (list-only check is sufficient since
            # update_parquet_theme_tags normalizes every cell to a list)
            tags_after = sum(
                len(tags) if isinstance(tags, list) else 0
                for tags in df['themeTags']
            )
            results["tags_removed"] = tags_before - tags_after
            # Count cards with modifications (cards that had at least one tag removed)
            # NOTE(review): this equates cards_modified with tags_removed, which
            # overcounts when a card loses multiple tags — an approximation.
            if results["tags_removed"] > 0:
                results["cards_modified"] = results["tags_removed"]  # Conservative estimate
            print(f"Stripped {results['tags_removed']} tag occurrences from {results['cards_processed']} cards")
        else:
            results["errors"].append("themeTags column not found in parquet file")
            # Early return: nothing to write back when the column is missing
            return results
        # Write modified parquet back
        df.to_parquet(parquet_path, engine='pyarrow', index=False)
        print(f"Updated {parquet_path}")
    except Exception as e:
        results["errors"].append(f"Error processing parquet: {e}")
    return results