#!/usr/bin/env python3 """ Standalone theme stripping orchestration script. This script coordinates the complete theme stripping pipeline: 1. Analyze parquet files to identify low-card themes 2. Strip from catalog YAML files (optional) 3. Strip from parquet themeTags columns (optional) 4. Rebuild theme_list.json from stripped parquet data 5. Generate stripped_themes.yml log Part of Milestone 5 (M5) - Integration & Testing for Theme Stripping (R21). Usage: # Dry run to preview changes python code/scripts/strip_themes.py --dry-run # Strip everything with default threshold (5 cards) python code/scripts/strip_themes.py # Strip only catalog YAML files python code/scripts/strip_themes.py --sources catalog # Strip only parquet files python code/scripts/strip_themes.py --sources parquet # Custom threshold python code/scripts/strip_themes.py --min-cards 10 # Skip backups (not recommended) python code/scripts/strip_themes.py --no-backup Environment Variables: THEME_MIN_CARDS: Minimum card threshold (default: 5) Outputs: - Modified catalog/*.yml files (if --sources includes catalog) - Modified parquet files (if --sources includes parquet) - Regenerated config/themes/theme_list.json - Updated logs/stripped_themes.yml log - Timestamped backups (if --backup enabled) """ import argparse import sys import time from pathlib import Path from datetime import datetime from typing import Set, Dict # Add project root to path ROOT = Path(__file__).resolve().parent.parent.parent sys.path.insert(0, str(ROOT)) from code import settings as code_settings from code.tagging.theme_stripper import ( get_theme_card_counts, identify_themes_to_strip, strip_catalog_themes, strip_parquet_themes, create_stripped_themes_log ) def strip_all_sources( min_cards: int, sources: Set[str], backup: bool, dry_run: bool, verbose: bool ) -> Dict: """ Execute complete theme stripping pipeline. Args: min_cards: Minimum card count threshold sources: Set of sources to strip ('catalog', 'parquet', or both) backup: Whether to create backups before modification dry_run: Preview changes without modifying files verbose: Show detailed output Returns: Dictionary with stripping results and statistics """ start_time = time.time() results = { "themes_analyzed": 0, "themes_to_strip": 0, "catalog_stripped": 0, "parquet_tags_removed": 0, "json_regenerated": False, "errors": [] } print("="*70) print("THEME STRIPPING PIPELINE") print("="*70) print(f"Configuration:") print(f" Minimum cards: {min_cards}") print(f" Sources: {', '.join(sorted(sources))}") print(f" Backup enabled: {backup}") print(f" Dry run: {dry_run}") print() # Step 1: Analyze parquet files print("Step 1: Analyzing theme card counts...") try: parquet_dir = ROOT / "card_files" / "processed" parquet_files = sorted(parquet_dir.glob("*.parquet")) if not parquet_files: results["errors"].append("No parquet files found in card_files/processed/") return results theme_counts = get_theme_card_counts(parquet_files) results["themes_analyzed"] = len(theme_counts) print(f" Found {len(theme_counts)} unique themes") themes_to_strip = identify_themes_to_strip(theme_counts, min_cards) results["themes_to_strip"] = len(themes_to_strip) print(f" Identified {len(themes_to_strip)} themes below threshold") if verbose and themes_to_strip: sample = sorted(list(themes_to_strip))[:5] print(f" Sample themes: {', '.join(sample)}") if len(themes_to_strip) > 5: print(f" ... and {len(themes_to_strip) - 5} more") if not themes_to_strip: print("\n✅ No themes below threshold. Nothing to strip.") return results except Exception as e: error_msg = f"Analysis failed: {e}" print(f" ❌ {error_msg}") results["errors"].append(error_msg) return results print() # Dry run mode if dry_run: print("DRY RUN MODE - No files will be modified") print() if 'catalog' in sources: print("Would strip from catalog YAML files:") catalog_dir = ROOT / "config" / "themes" / "catalog" yaml_files = sorted(catalog_dir.glob("*.yml")) for yaml_file in yaml_files[:5]: print(f" - {yaml_file.name}") if len(yaml_files) > 5: print(f" ... and {len(yaml_files) - 5} more") if 'parquet' in sources: print("\nWould strip from parquet files:") for pf in parquet_files[:3]: print(f" - {pf.name}") if len(parquet_files) > 3: print(f" ... and {len(parquet_files) - 3} more") print(f"\nWould strip {len(themes_to_strip)} themes total") print("Would regenerate theme_list.json") print("Would update stripped_themes.yml log") return results # Step 2: Strip from catalog (if requested) # NOTE: Catalog YAML must be stripped BEFORE building theme_list.json, # otherwise build_theme_catalog.py will read un-stripped themes from YAML if 'catalog' in sources: print("Step 2: Stripping from catalog YAML files...") try: catalog_dir = ROOT / "config" / "themes" / "catalog" catalog_results = strip_catalog_themes( catalog_dir=catalog_dir, themes_to_strip=themes_to_strip, backup=backup ) results["catalog_stripped"] = catalog_results["files_modified"] if verbose: print(f" Files modified: {catalog_results['files_modified']}") print(f" Themes removed: {catalog_results['themes_removed']}") if catalog_results["backups_created"]: print(f" Backups created: {len(catalog_results['backups_created'])}") else: print(f" ✓ Stripped {catalog_results['themes_removed']} themes from {catalog_results['files_modified']} files") results["errors"].extend(catalog_results["errors"]) except Exception as e: error_msg = f"Catalog stripping failed: {e}" print(f" ❌ {error_msg}") results["errors"].append(error_msg) print() # Step 3: Strip from parquet (if requested) if 'parquet' in sources: step_num = 3 if 'catalog' in sources else 2 print(f"Step {step_num}: Stripping from parquet files...") try: for parquet_file in parquet_files: if verbose: print(f" Processing: {parquet_file.name}") parquet_results = strip_parquet_themes( parquet_path=parquet_file, themes_to_strip=themes_to_strip, backup=backup ) results["parquet_tags_removed"] += parquet_results["tags_removed"] results["errors"].extend(parquet_results["errors"]) if verbose and parquet_results["tags_removed"] > 0: print(f" Removed {parquet_results['tags_removed']} tag occurrences") if not verbose: print(f" ✓ Removed {results['parquet_tags_removed']} tag occurrences from {len(parquet_files)} file(s)") except Exception as e: error_msg = f"Parquet stripping failed: {e}" print(f" ❌ {error_msg}") results["errors"].append(error_msg) print() # Step 4: Rebuild theme_list.json (if parquet was stripped) # NOTE: This reads from both parquet AND catalog YAML, so both must be stripped first if 'parquet' in sources: step_num = 4 if 'catalog' in sources else 3 print(f"Step {step_num}: Rebuilding theme_list.json...") try: # Import build script from code.scripts.build_theme_catalog import main as build_main # Suppress verbose build output unless --verbose flag import io import contextlib if not verbose: with contextlib.redirect_stdout(io.StringIO()): build_main() else: build_main() results["json_regenerated"] = True print(" ✓ theme_list.json regenerated") except Exception as e: error_msg = f"JSON regeneration failed: {e}" print(f" ❌ {error_msg}") results["errors"].append(error_msg) print() # Step 5: Update stripped themes log final_step = 5 if ('catalog' in sources and 'parquet' in sources) else (3 if 'catalog' in sources else 4) print(f"Step {final_step}: Updating stripped_themes.yml log...") try: log_path = ROOT / "logs" / "stripped_themes.yml" source_labels = [] if 'catalog' in sources: source_labels.append("catalog YAML") if 'parquet' in sources: source_labels.append("parquet files") create_stripped_themes_log( output_path=log_path, theme_counts=theme_counts, themes_stripped=themes_to_strip, min_threshold=min_cards, sources=source_labels if source_labels else None ) print(f" ✓ Log updated: {log_path}") except Exception as e: error_msg = f"Log update failed: {e}" print(f" ❌ {error_msg}") results["errors"].append(error_msg) # Final summary elapsed = time.time() - start_time print() print("="*70) print("SUMMARY") print("="*70) print(f"Themes analyzed: {results['themes_analyzed']}") print(f"Themes stripped: {results['themes_to_strip']}") if 'catalog' in sources: print(f"Catalog files modified: {results['catalog_stripped']}") if 'parquet' in sources: print(f"Parquet tags removed: {results['parquet_tags_removed']}") print(f"JSON regenerated: {'Yes' if results['json_regenerated'] else 'No'}") print(f"Time elapsed: {elapsed:.2f}s") if results["errors"]: print(f"\n⚠️ Errors encountered: {len(results['errors'])}") for error in results["errors"]: print(f" - {error}") else: print("\n✅ Theme stripping completed successfully!") return results def main(): parser = argparse.ArgumentParser( description="Orchestrate complete theme stripping pipeline", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__ ) parser.add_argument( '--min-cards', type=int, help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})' ) parser.add_argument( '--sources', type=str, help='Comma-separated list of sources to strip: catalog, parquet, all (default: all)' ) parser.add_argument( '--dry-run', action='store_true', help='Show what would be stripped without making changes' ) parser.add_argument( '--no-backup', action='store_true', help='Skip creating backup files before modification' ) parser.add_argument( '--verbose', action='store_true', help='Show detailed stripping information' ) args = parser.parse_args() # Determine threshold min_cards = args.min_cards if args.min_cards else code_settings.THEME_MIN_CARDS # Determine sources if args.sources: source_input = args.sources.lower() if source_input == 'all': sources = {'catalog', 'parquet'} else: sources = set(s.strip() for s in source_input.split(',')) valid_sources = {'catalog', 'parquet'} invalid = sources - valid_sources if invalid: print(f"Error: Invalid sources: {', '.join(invalid)}") print(f"Valid sources: {', '.join(valid_sources)}, all") return 1 else: sources = {'catalog', 'parquet'} # Default: all sources # Execute pipeline results = strip_all_sources( min_cards=min_cards, sources=sources, backup=not args.no_backup, dry_run=args.dry_run, verbose=args.verbose ) # Return exit code return 0 if not results["errors"] else 1 if __name__ == "__main__": sys.exit(main())