mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2026-03-24 22:16:31 +01:00
feat: implement theme stripping system with THEME_MIN_CARDS config (#55)
Some checks are pending
CI / build (push) Waiting to run
Some checks are pending
CI / build (push) Waiting to run
* feat: implement theme stripping system with THEME_MIN_CARDS config * fix: call build_catalog directly to avoid argparse conflicts in CI
This commit is contained in:
parent
1ebc2fcb3c
commit
03e2846882
20 changed files with 6613 additions and 1364 deletions
207
code/scripts/analyze_theme_distribution.py
Normal file
207
code/scripts/analyze_theme_distribution.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
"""
|
||||
Theme Distribution Analysis Script
|
||||
|
||||
Analyzes theme distribution across the card catalog and generates reports
|
||||
showing which themes would be stripped based on minimum card thresholds.
|
||||
|
||||
Usage:
|
||||
python -m code.scripts.analyze_theme_distribution [--min-cards N] [--output FILE]
|
||||
|
||||
Arguments:
|
||||
--min-cards N Minimum card threshold (default: from THEME_MIN_CARDS setting)
|
||||
--output FILE Output file path (default: logs/theme_stripping_analysis.txt)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, Set
|
||||
|
||||
# Add project root to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from code.settings import THEME_MIN_CARDS, CARD_FILES_PROCESSED_DIR
|
||||
from code.tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
get_theme_distribution,
|
||||
get_themes_by_count
|
||||
)
|
||||
|
||||
|
||||
def _strip_percentage(stripped: int, total: int) -> float:
    """Return ``stripped`` as a percentage of ``total``, or 0.0 when ``total`` is 0."""
    return stripped / total * 100 if total else 0.0


def analyze_theme_distribution(min_cards: int | None = None, output_path: str | None = None) -> None:
    """
    Analyze theme distribution and generate report.

    Collects every ``*.parquet`` file in ``CARD_FILES_PROCESSED_DIR``, builds
    the theme -> cards mapping, writes a plain-text report to ``output_path``
    and prints a short summary to stdout. The process exits with status 1 when
    the processed directory is missing, contains no parquet files, or yields
    no themes.

    Args:
        min_cards: Minimum card threshold (defaults to THEME_MIN_CARDS setting)
        output_path: Path to output file (defaults to logs/theme_stripping_analysis.txt)
    """
    if min_cards is None:
        min_cards = THEME_MIN_CARDS

    if output_path is None:
        output_path = "logs/theme_stripping_analysis.txt"

    print(f"Analyzing theme distribution (min_cards={min_cards})...")

    # Find all parquet files
    processed_dir = Path(CARD_FILES_PROCESSED_DIR)
    if not processed_dir.exists():
        print(f"Error: Processed cards directory not found: {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)

    parquet_files = list(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        print("Please run initial setup first to generate parquet files.")
        sys.exit(1)

    print(f"Found {len(parquet_files)} parquet files to analyze")

    # Build theme counts
    print("Building theme -> card count mapping...")
    theme_counts = get_theme_card_counts(parquet_files)

    if not theme_counts:
        print("Error: No themes found in parquet files")
        sys.exit(1)

    print(f"Found {len(theme_counts)} unique themes")

    # Identify themes to strip, the bucketed distribution, and the detailed
    # below-threshold listing.
    themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    distribution = get_theme_distribution(theme_counts)
    below_threshold = get_themes_by_count(theme_counts, min_cards)

    # Derived figures are computed once so the report and the console summary
    # cannot drift apart; the percentage is guarded against a zero total.
    total = distribution['total']
    strip_count = len(themes_to_strip)
    keep_count = total - strip_count
    strip_pct = _strip_percentage(strip_count, total)

    # Bucket lines shared by the report and the console output.
    bucket_lines = [
        f" 1 card: {distribution['1_card']:4d} themes",
        f" 2 cards: {distribution['2_cards']:4d} themes",
        f" 3-4 cards: {distribution['3_4_cards']:4d} themes",
        f" 5-9 cards: {distribution['5_9_cards']:4d} themes",
        f" 10+ cards: {distribution['10_plus']:4d} themes",
    ]

    # Generate report
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        # Header
        f.write("=" * 80 + "\n")
        f.write("THEME DISTRIBUTION ANALYSIS REPORT\n")
        f.write("=" * 80 + "\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Minimum Card Threshold: {min_cards}\n")
        f.write(f"Source: {processed_dir}\n")
        f.write(f"Parquet Files Analyzed: {len(parquet_files)}\n")
        f.write("=" * 80 + "\n\n")

        # Summary statistics
        f.write("SUMMARY STATISTICS\n")
        f.write("-" * 80 + "\n")
        f.write(f"Total Themes: {total}\n")
        f.write(f"Themes to Strip (< {min_cards} cards): {strip_count}\n")
        f.write(f"Themes to Keep (>= {min_cards} cards): {keep_count}\n")
        f.write(f"Percentage to Strip: {strip_pct:.1f}%\n")
        f.write("\n")

        # Distribution by card count
        f.write("DISTRIBUTION BY CARD COUNT\n")
        f.write("-" * 80 + "\n")
        for line in bucket_lines:
            f.write(line + "\n")
        f.write(f" Total: {total:4d} themes\n")
        f.write("\n")

        # Themes below threshold
        if below_threshold:
            f.write(f"THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write(f"Total: {len(below_threshold)} themes\n\n")

            for theme_id, count, card_list in below_threshold:
                f.write(f"Theme: {theme_id}\n")
                f.write(f"Card Count: {count}\n")
                f.write("Cards:\n")
                for card in card_list:
                    f.write(f" - {card}\n")
                f.write("\n")
        else:
            f.write(f"NO THEMES BELOW THRESHOLD (< {min_cards} cards)\n")
            f.write("=" * 80 + "\n")
            f.write("All themes meet the minimum card requirement.\n\n")

        # Recommendations
        f.write("RECOMMENDATIONS\n")
        f.write("=" * 80 + "\n")
        if strip_count > 0:
            f.write(f"• {strip_count} themes should be stripped\n")
            f.write(f"• This represents {strip_pct:.1f}% of the catalog\n")
            f.write("• Run theme stripping to remove these low-viability themes\n")
            f.write("• Consider adjusting THEME_MIN_CARDS if this seems too aggressive\n")
        else:
            f.write(f"• No themes below threshold (all themes have >= {min_cards} cards)\n")
            f.write("• Consider lowering THEME_MIN_CARDS if you want to strip more themes\n")
        f.write("\n")

        # Footer
        f.write("=" * 80 + "\n")
        f.write("END OF REPORT\n")
        f.write("=" * 80 + "\n")

    print(f"\nReport generated: {output_file}")
    print("\nSummary:")
    print(f" Total themes: {total}")
    print(f" Themes to strip: {strip_count} ({strip_pct:.1f}%)")
    print(f" Themes to keep: {keep_count}")

    # Print distribution
    print("\nDistribution:")
    for line in bucket_lines:
        print(line)
|
||||
|
||||
|
||||
def main():
    """Parse the command line and run the theme distribution analysis.

    Both options default to ``None`` so that ``analyze_theme_distribution``
    applies its own defaults. Ctrl-C and any other exception raised by the
    analysis terminate the process with exit status 1.
    """
    cli = argparse.ArgumentParser(
        description="Analyze theme distribution and identify themes below minimum card threshold"
    )
    min_cards_help = f'Minimum card threshold (default: {THEME_MIN_CARDS} from THEME_MIN_CARDS setting)'
    cli.add_argument('--min-cards', type=int, default=None, help=min_cards_help)
    cli.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output file path (default: logs/theme_stripping_analysis.txt)'
    )

    options = cli.parse_args()

    try:
        analyze_theme_distribution(min_cards=options.min_cards, output_path=options.output)
    except KeyboardInterrupt:
        print("\nAnalysis cancelled by user")
        sys.exit(1)
    except Exception as exc:
        # Top-level boundary: report the failure with a traceback, then exit non-zero.
        print(f"\nError during analysis: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -34,6 +34,14 @@ try: # Optional
|
|||
except Exception: # pragma: no cover
|
||||
yaml = None
|
||||
|
||||
# Import settings for THEME_MIN_CARDS threshold
|
||||
# Import at module level to avoid stdlib 'code' conflict when running as script
|
||||
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
|
||||
if ROOT not in sys.path:
|
||||
sys.path.insert(0, ROOT)
|
||||
|
||||
from code import settings as code_settings
|
||||
|
||||
try:
|
||||
# Support running as `python code/scripts/build_theme_catalog.py` when 'code' already on path
|
||||
from scripts.extract_themes import (
|
||||
|
|
@ -166,17 +174,29 @@ def load_catalog_yaml(verbose: bool) -> Dict[str, ThemeYAML]:
|
|||
|
||||
|
||||
def regenerate_analytics(verbose: bool):
|
||||
"""
|
||||
Regenerate theme analytics from parquet data, constants, and tagger source.
|
||||
|
||||
Now reads from parquet files instead of CSV. Applies THEME_MIN_CARDS filtering
|
||||
to exclude themes with too few cards.
|
||||
|
||||
Args:
|
||||
verbose: Whether to print detailed progress
|
||||
|
||||
Returns:
|
||||
Tuple of (theme_tags, selected_synergies, taxonomy)
|
||||
"""
|
||||
theme_tags: Set[str] = set()
|
||||
theme_tags |= collect_theme_tags_from_constants()
|
||||
theme_tags |= collect_theme_tags_from_tagger_source()
|
||||
try:
|
||||
csv_rows = gather_theme_tag_rows()
|
||||
for row_tags in csv_rows:
|
||||
for t in row_tags:
|
||||
if isinstance(t, str) and t:
|
||||
theme_tags.add(t)
|
||||
except Exception:
|
||||
csv_rows = []
|
||||
|
||||
# M3: Read from parquet (no longer silent fail)
|
||||
# Fail loudly if parquet read fails - this is a critical error
|
||||
parquet_rows = gather_theme_tag_rows()
|
||||
for row_tags in parquet_rows:
|
||||
for t in row_tags:
|
||||
if isinstance(t, str) and t:
|
||||
theme_tags.add(t)
|
||||
|
||||
whitelist = load_whitelist_config()
|
||||
normalization_map: Dict[str, str] = whitelist.get('normalization', {}) if isinstance(whitelist.get('normalization'), dict) else {}
|
||||
|
|
@ -190,10 +210,8 @@ def regenerate_analytics(verbose: bool):
|
|||
blacklist = {"Draw Triggers"}
|
||||
theme_tags = {t for t in theme_tags if t and t not in blacklist and t not in exclusions}
|
||||
|
||||
try:
|
||||
frequencies = tally_tag_frequencies_by_base_color()
|
||||
except Exception:
|
||||
frequencies = {}
|
||||
# M3: Read frequencies from parquet (fail loudly)
|
||||
frequencies = tally_tag_frequencies_by_base_color()
|
||||
|
||||
if frequencies:
|
||||
def total_count(t: str) -> int:
|
||||
|
|
@ -204,19 +222,40 @@ def regenerate_analytics(verbose: bool):
|
|||
except Exception:
|
||||
pass
|
||||
return s
|
||||
|
||||
kept: Set[str] = set()
|
||||
|
||||
# M3: Apply THEME_MIN_CARDS filtering
|
||||
min_cards = getattr(code_settings, 'THEME_MIN_CARDS', 5)
|
||||
if verbose:
|
||||
print(f"Applying THEME_MIN_CARDS filter (threshold: {min_cards} cards)")
|
||||
|
||||
themes_before_filter = len(theme_tags)
|
||||
|
||||
for t in list(theme_tags):
|
||||
if should_keep_theme(t, total_count(t), whitelist, protected_prefixes, protected_suffixes, min_overrides):
|
||||
kept.add(t)
|
||||
count = total_count(t)
|
||||
# Check both should_keep_theme (whitelist logic) AND THEME_MIN_CARDS threshold
|
||||
if should_keep_theme(t, count, whitelist, protected_prefixes, protected_suffixes, min_overrides):
|
||||
# Additional check: must meet minimum card threshold
|
||||
if count >= min_cards:
|
||||
kept.add(t)
|
||||
elif verbose:
|
||||
print(f" Filtered out '{t}' ({count} cards < {min_cards} threshold)")
|
||||
|
||||
# Always include whitelist themes (override threshold)
|
||||
for extra in whitelist.get('always_include', []) or []:
|
||||
kept.add(str(extra))
|
||||
|
||||
theme_tags = kept
|
||||
|
||||
if verbose:
|
||||
themes_after_filter = len(theme_tags)
|
||||
filtered_count = themes_before_filter - themes_after_filter
|
||||
print(f"Filtered {filtered_count} themes below threshold ({themes_after_filter} remain)")
|
||||
|
||||
try:
|
||||
rows = csv_rows if csv_rows else gather_theme_tag_rows()
|
||||
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
|
||||
except Exception:
|
||||
co_map, tag_counts, total_rows = {}, Counter(), 0
|
||||
# M3: Compute co-occurrence from parquet data (fail loudly)
|
||||
rows = parquet_rows if parquet_rows else gather_theme_tag_rows()
|
||||
co_map, tag_counts, total_rows = compute_cooccurrence(rows)
|
||||
|
||||
return dict(theme_tags=theme_tags, frequencies=frequencies, co_map=co_map, tag_counts=tag_counts, total_rows=total_rows, whitelist=whitelist)
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from collections import Counter
|
|||
from typing import Dict, List, Set, Any
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import itertools
|
||||
import math
|
||||
try:
|
||||
|
|
@ -20,6 +21,7 @@ if ROOT not in sys.path:
|
|||
|
||||
from code.settings import CSV_DIRECTORY
|
||||
from code.tagging import tag_constants
|
||||
from code.path_util import get_processed_cards_path
|
||||
|
||||
BASE_COLORS = {
|
||||
'white': 'W',
|
||||
|
|
@ -88,83 +90,113 @@ def collect_theme_tags_from_tagger_source() -> Set[str]:
|
|||
|
||||
|
||||
def tally_tag_frequencies_by_base_color() -> Dict[str, Dict[str, int]]:
|
||||
"""
|
||||
Tally theme tag frequencies by base color from parquet files.
|
||||
|
||||
Note: This function now reads from card_files/processed/all_cards.parquet
|
||||
instead of per-color CSV files. The CSV files no longer exist after the
|
||||
parquet migration.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping color names to Counter of tag frequencies
|
||||
"""
|
||||
result: Dict[str, Dict[str, int]] = {c: Counter() for c in BASE_COLORS.keys()}
|
||||
# Iterate over per-color CSVs; if not present, skip
|
||||
for color in BASE_COLORS.keys():
|
||||
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
|
||||
if not os.path.exists(path):
|
||||
|
||||
# Load from all_cards.parquet
|
||||
parquet_path = get_processed_cards_path()
|
||||
if not os.path.exists(parquet_path):
|
||||
print(f"Warning: Parquet file not found: {parquet_path}")
|
||||
return {k: dict(v) for k, v in result.items()}
|
||||
|
||||
try:
|
||||
df = pd.read_parquet(parquet_path, columns=['themeTags', 'colorIdentity'], engine='pyarrow')
|
||||
except Exception as e:
|
||||
print(f"Error reading parquet file: {e}")
|
||||
return {k: dict(v) for k, v in result.items()}
|
||||
|
||||
if 'themeTags' not in df.columns:
|
||||
print("Warning: themeTags column not found in parquet file")
|
||||
return {k: dict(v) for k, v in result.items()}
|
||||
|
||||
# Iterate rows and tally tags by base color
|
||||
for _, row in df.iterrows():
|
||||
# Parquet stores themeTags as numpy array
|
||||
tags = row.get('themeTags')
|
||||
if not isinstance(tags, (list, np.ndarray)):
|
||||
continue
|
||||
try:
|
||||
df = pd.read_csv(path, converters={'themeTags': pd.eval, 'colorIdentity': pd.eval})
|
||||
except Exception:
|
||||
df = pd.read_csv(path)
|
||||
if 'themeTags' in df.columns:
|
||||
try:
|
||||
df['themeTags'] = df['themeTags'].apply(pd.eval)
|
||||
except Exception:
|
||||
df['themeTags'] = df['themeTags'].apply(lambda x: [])
|
||||
if 'colorIdentity' in df.columns:
|
||||
try:
|
||||
df['colorIdentity'] = df['colorIdentity'].apply(pd.eval)
|
||||
except Exception:
|
||||
pass
|
||||
if 'themeTags' not in df.columns:
|
||||
if isinstance(tags, np.ndarray):
|
||||
tags = tags.tolist()
|
||||
|
||||
# Get color identity (stored as string like "W", "UB", "WUG", etc.)
|
||||
ci = row.get('colorIdentity')
|
||||
if isinstance(ci, np.ndarray):
|
||||
ci = ci.tolist()
|
||||
|
||||
# Convert colorIdentity to set of letters
|
||||
if isinstance(ci, str):
|
||||
letters = set(ci) # "WUG" -> {'W', 'U', 'G'}
|
||||
elif isinstance(ci, list):
|
||||
letters = set(ci) # ['W', 'U', 'G'] -> {'W', 'U', 'G'}
|
||||
else:
|
||||
letters = set()
|
||||
|
||||
# Determine base colors from color identity
|
||||
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
|
||||
if not bases:
|
||||
# Colorless cards don't contribute to any specific color
|
||||
continue
|
||||
# Derive base colors from colorIdentity if available, else assume single color file
|
||||
def rows_base_colors(row):
|
||||
ids = row.get('colorIdentity') if isinstance(row, dict) else row
|
||||
if isinstance(ids, list):
|
||||
letters = set(ids)
|
||||
else:
|
||||
letters = set()
|
||||
derived = set()
|
||||
for name, letter in BASE_COLORS.items():
|
||||
if letter in letters:
|
||||
derived.add(name)
|
||||
if not derived:
|
||||
derived.add(color)
|
||||
return derived
|
||||
# Iterate rows
|
||||
for _, row in df.iterrows():
|
||||
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
|
||||
# Compute base colors contribution
|
||||
ci = row['colorIdentity'] if 'colorIdentity' in row else None
|
||||
letters = set(ci) if isinstance(ci, list) else set()
|
||||
bases = {name for name, letter in BASE_COLORS.items() if letter in letters}
|
||||
if not bases:
|
||||
bases = {color}
|
||||
for bc in bases:
|
||||
for t in tags:
|
||||
result[bc][t] += 1
|
||||
|
||||
# Tally tags for each base color this card belongs to
|
||||
for base_color in bases:
|
||||
for tag in tags:
|
||||
if isinstance(tag, str) and tag:
|
||||
result[base_color][tag] += 1
|
||||
|
||||
# Convert Counters to plain dicts
|
||||
return {k: dict(v) for k, v in result.items()}
|
||||
|
||||
|
||||
def gather_theme_tag_rows() -> List[List[str]]:
|
||||
"""Collect per-card themeTags lists across all base color CSVs.
|
||||
"""
|
||||
Collect per-card themeTags lists from parquet file.
|
||||
|
||||
Note: This function now reads from card_files/processed/all_cards.parquet
|
||||
instead of per-color CSV files. The CSV files no longer exist after the
|
||||
parquet migration.
|
||||
|
||||
Returns a list of themeTags arrays, one per card row where themeTags is present.
|
||||
Returns:
|
||||
List of themeTags arrays, one per card row where themeTags is present.
|
||||
"""
|
||||
rows: List[List[str]] = []
|
||||
for color in BASE_COLORS.keys():
|
||||
path = os.path.join(CSV_DIRECTORY, f"{color}_cards.csv")
|
||||
if not os.path.exists(path):
|
||||
continue
|
||||
try:
|
||||
df = pd.read_csv(path, converters={'themeTags': pd.eval})
|
||||
except Exception:
|
||||
df = pd.read_csv(path)
|
||||
if 'themeTags' in df.columns:
|
||||
try:
|
||||
df['themeTags'] = df['themeTags'].apply(pd.eval)
|
||||
except Exception:
|
||||
df['themeTags'] = df['themeTags'].apply(lambda x: [])
|
||||
if 'themeTags' not in df.columns:
|
||||
continue
|
||||
for _, row in df.iterrows():
|
||||
tags = list(row['themeTags']) if hasattr(row.get('themeTags'), '__len__') and not isinstance(row.get('themeTags'), str) else []
|
||||
if tags:
|
||||
rows.append(tags)
|
||||
|
||||
# Load from all_cards.parquet
|
||||
parquet_path = get_processed_cards_path()
|
||||
if not os.path.exists(parquet_path):
|
||||
print(f"Warning: Parquet file not found: {parquet_path}")
|
||||
return rows
|
||||
|
||||
try:
|
||||
df = pd.read_parquet(parquet_path, columns=['themeTags'], engine='pyarrow')
|
||||
except Exception as e:
|
||||
print(f"Error reading parquet file: {e}")
|
||||
return rows
|
||||
|
||||
if 'themeTags' not in df.columns:
|
||||
print("Warning: themeTags column not found in parquet file")
|
||||
return rows
|
||||
|
||||
# Collect theme tags from each card
|
||||
for _, row in df.iterrows():
|
||||
# Parquet stores themeTags as numpy array
|
||||
tags = row.get('themeTags')
|
||||
if isinstance(tags, np.ndarray):
|
||||
tags = tags.tolist()
|
||||
if isinstance(tags, list) and tags:
|
||||
# Convert to list of strings (filter out non-strings)
|
||||
tag_list = [str(t) for t in tags if isinstance(t, str) and t]
|
||||
if tag_list:
|
||||
rows.append(tag_list)
|
||||
|
||||
return rows
|
||||
|
||||
|
||||
|
|
|
|||
165
code/scripts/strip_catalog_themes.py
Normal file
165
code/scripts/strip_catalog_themes.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Strip Theme Catalog Script
|
||||
|
||||
Removes themes with insufficient card counts from the theme catalog YAML files.
|
||||
Creates backups and logs all stripped themes for reference.
|
||||
|
||||
Usage:
|
||||
python -m code.scripts.strip_catalog_themes [--min-cards N] [--no-backup] [--dry-run]
|
||||
|
||||
Options:
|
||||
--min-cards N Override THEME_MIN_CARDS setting (default: from environment/settings)
|
||||
--no-backup Skip creating backup files
|
||||
--dry-run Show what would be stripped without making changes
|
||||
|
||||
Example:
|
||||
python -m code.scripts.strip_catalog_themes
|
||||
python -m code.scripts.strip_catalog_themes --min-cards 3 --dry-run
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add project root to path for imports
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from code import settings
|
||||
from code.tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
strip_catalog_themes,
|
||||
create_stripped_themes_log,
|
||||
get_theme_distribution
|
||||
)
|
||||
|
||||
|
||||
def main():
    """Strip low-card themes from the catalog YAML files.

    Reads theme card counts from the parquet files in
    ``settings.CARD_FILES_PROCESSED_DIR``, prints the distribution and a sample
    of the themes that fall below ``--min-cards``, and, unless ``--dry-run`` is
    given, strips those themes from the catalog directory and writes
    ``logs/stripped_themes.yml``.

    Returns:
        Process exit code: 0 on success (including dry runs), 1 when no
        parquet files or no themes are found.
    """
    parser = argparse.ArgumentParser(
        description="Strip themes with insufficient card counts from catalog YAML files"
    )
    parser.add_argument(
        "--min-cards",
        type=int,
        default=settings.THEME_MIN_CARDS,
        help=f"Minimum cards required to keep a theme (default: {settings.THEME_MIN_CARDS})"
    )
    parser.add_argument(
        "--no-backup",
        action="store_true",
        help="Skip creating backup files before modification"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be stripped without making changes"
    )

    args = parser.parse_args()

    # Paths
    processed_dir = Path(settings.CARD_FILES_PROCESSED_DIR)
    catalog_dir = PROJECT_ROOT / 'config' / 'themes' / 'catalog'
    log_dir = PROJECT_ROOT / 'logs'
    stripped_log_path = log_dir / 'stripped_themes.yml'

    print(f"Stripping themes from catalog (min_cards={args.min_cards})")
    print(f"Catalog directory: {catalog_dir}")
    print(f"Dry run: {args.dry_run}")
    print()

    # Step 1: Get theme card counts from parquet files
    print("Step 1: Analyzing theme card counts from parquet files...")
    parquet_files = sorted(processed_dir.glob("*.parquet"))
    if not parquet_files:
        print(f"Error: No parquet files found in {processed_dir}")
        return 1

    print(f"Found {len(parquet_files)} parquet files")
    theme_counts = get_theme_card_counts(parquet_files)
    if not theme_counts:
        # Without this guard the percentage calculations below would divide
        # by zero when the parquet files carry no themes at all.
        print("Error: No themes found in parquet files")
        return 1

    total_themes = len(theme_counts)
    print(f"Found {total_themes} unique themes")
    print()

    # Step 2: Get distribution
    distribution = get_theme_distribution(theme_counts)
    print("Theme distribution:")
    print(f" 1 card: {distribution['1_card']:4d} themes")
    print(f" 2 cards: {distribution['2_cards']:4d} themes")
    print(f" 3-4 cards: {distribution['3_4_cards']:4d} themes")
    print(f" 5-9 cards: {distribution['5_9_cards']:4d} themes")
    print(f" 10+ cards: {distribution['10_plus']:4d} themes")
    print(f" Total: {distribution['total']:4d} themes")
    print()

    # Step 3: Identify themes to strip
    themes_to_strip = identify_themes_to_strip(theme_counts, args.min_cards)
    themes_to_keep = set(theme_counts.keys()) - themes_to_strip

    print(f"Themes to strip: {len(themes_to_strip)} ({len(themes_to_strip)/total_themes*100:.1f}%)")
    print(f"Themes to keep: {len(themes_to_keep)} ({len(themes_to_keep)/total_themes*100:.1f}%)")
    print()

    # Show sample of themes to strip
    if themes_to_strip:
        print("Sample themes to strip (first 10):")
        for theme_id in sorted(themes_to_strip)[:10]:
            cards = theme_counts[theme_id]
            count = len(cards)
            cards_str = ", ".join(sorted(cards)[:3])
            if count > 3:
                cards_str += f", ... ({count} total)"
            print(f" - {theme_id} ({count} cards): {cards_str}")
        print()

    if args.dry_run:
        print("DRY RUN: No changes made")
        return 0

    # Step 4: Strip themes from catalog
    print("Step 4: Stripping themes from catalog YAML files...")
    results = strip_catalog_themes(
        catalog_dir=catalog_dir,
        themes_to_strip=themes_to_strip,
        backup=not args.no_backup
    )

    print(f" Stripped: {results['stripped_count']} themes")
    print(f" Files deleted: {len(results['files_deleted'])}")
    print(f" Backups created: {len(results['backups_created'])}")

    if results['errors']:
        print(f" Errors: {len(results['errors'])}")
        for error in results['errors'][:5]:  # Show first 5 errors
            print(f" - {error}")
    print()

    # Step 5: Create stripped themes log
    print("Step 5: Creating stripped themes log...")
    create_stripped_themes_log(
        output_path=stripped_log_path,
        theme_counts=theme_counts,
        themes_stripped=themes_to_strip,
        min_threshold=args.min_cards,
        sources=["catalog YAML"]
    )
    print(f" Log written to {stripped_log_path}")
    print()

    print("✅ Catalog stripping complete!")
    print()
    print("Summary:")
    print(f" Total themes analyzed: {total_themes}")
    print(f" Themes stripped: {len(themes_to_strip)}")
    print(f" Themes remaining: {len(themes_to_keep)}")
    print(f" Catalog files deleted: {len(results['files_deleted'])}")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
253
code/scripts/strip_parquet_themes.py
Normal file
253
code/scripts/strip_parquet_themes.py
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Strip low-card themes from parquet file themeTags columns.
|
||||
|
||||
This script identifies and removes themes below the THEME_MIN_CARDS threshold
|
||||
from the themeTags column in parquet files. It's part of Milestone 4 (M4) of
|
||||
the Theme Stripping roadmap (R21).
|
||||
|
||||
Usage:
|
||||
# Dry run to see what would be stripped
|
||||
python code/scripts/strip_parquet_themes.py --dry-run
|
||||
|
||||
# Strip from single parquet file
|
||||
python code/scripts/strip_parquet_themes.py --file card_files/processed/all_cards.parquet
|
||||
|
||||
# Strip from all parquet files in directory
|
||||
python code/scripts/strip_parquet_themes.py --all
|
||||
|
||||
# Specify custom threshold
|
||||
python code/scripts/strip_parquet_themes.py --threshold 10 --all
|
||||
|
||||
Environment Variables:
|
||||
THEME_MIN_CARDS: Minimum card threshold (default: 5)
|
||||
|
||||
Outputs:
|
||||
- Modified parquet file(s) with stripped themeTags
|
||||
- Timestamped backup (.parquet.bak) unless --no-backup is given
|
||||
- Updated logs/stripped_themes.yml log
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Add project root to path
|
||||
ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from code import settings as code_settings
|
||||
from code.tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
strip_parquet_themes,
|
||||
create_stripped_themes_log
|
||||
)
|
||||
|
||||
|
||||
def find_parquet_files(directory: "Path | str") -> list[Path]:
    """Return the ``*.parquet`` entries directly inside ``directory``, sorted by path.

    Args:
        directory: Directory to search; a ``Path`` or any value accepted by
            ``Path()`` (such as a ``str``). Subdirectories are not searched.

    Returns:
        Sorted list of matching paths (empty when nothing matches).
    """
    # Path() is a no-op for Path inputs and lets callers pass plain strings.
    return sorted(Path(directory).glob("*.parquet"))
|
||||
|
||||
|
||||
def update_stripped_themes_log(
    theme_counts: dict,
    themes_to_strip: set[str],
    min_cards: int
) -> None:
    """Write logs/stripped_themes.yml for a parquet stripping run.

    Args:
        theme_counts: Theme -> cards mapping, forwarded to the log writer.
        themes_to_strip: Themes that were (or would be) stripped.
        min_cards: Threshold that was applied.
    """
    destination = ROOT.joinpath("logs", "stripped_themes.yml")

    # The log is tagged with "parquet files" as its source.
    log_arguments = {
        "output_path": destination,
        "theme_counts": theme_counts,
        "themes_stripped": themes_to_strip,
        "min_threshold": min_cards,
        "sources": ["parquet files"],
    }
    create_stripped_themes_log(**log_arguments)

    print(f"\nUpdated stripped themes log: {destination}")
|
||||
|
||||
|
||||
def main():
    """Run the parquet theme-stripping command-line interface.

    Workflow:
        1. Parse the CLI flags and resolve the card-count threshold.
        2. Select the parquet files to work on (``--file``, ``--all``, or the
           default ``card_files/processed/all_cards.parquet``).
        3. Count themes across the selected files and identify the ones
           below the threshold.
        4. Unless ``--dry-run`` is set, pass each file to
           ``strip_parquet_themes`` and refresh ``logs/stripped_themes.yml``.

    Returns:
        int: ``0`` on success (including dry runs and "nothing to strip"),
        ``1`` when an input file is missing, the analysis fails, or any
        error was collected while processing files.
    """
    parser = argparse.ArgumentParser(
        description="Strip low-card themes from parquet themeTags columns",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument(
        '--file',
        type=Path,
        help='Specific parquet file to process'
    )

    parser.add_argument(
        '--all',
        action='store_true',
        help='Process all parquet files in card_files/processed/'
    )

    parser.add_argument(
        '--threshold',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )

    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )

    args = parser.parse_args()

    # Compare against None rather than relying on truthiness: an explicit
    # "--threshold 0" must be honoured instead of silently falling back to
    # the configured default.
    if args.threshold is not None:
        min_cards = args.threshold
    else:
        min_cards = code_settings.THEME_MIN_CARDS

    # Determine which files to process
    if args.file is not None:
        if not args.file.exists():
            print(f"Error: File not found: {args.file}")
            return 1
        parquet_files = [args.file]
    elif args.all:
        processed_dir = ROOT / "card_files" / "processed"
        parquet_files = find_parquet_files(processed_dir)
        if not parquet_files:
            print(f"No parquet files found in {processed_dir}")
            return 1
    else:
        # Default: process all_cards.parquet
        default_file = ROOT / "card_files" / "processed" / "all_cards.parquet"
        if not default_file.exists():
            print(f"Error: Default file not found: {default_file}")
            print("Use --file or --all to specify files to process")
            return 1
        parquet_files = [default_file]

    print("Theme Stripping Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Files to process: {len(parquet_files)}")
    print(f" Backup enabled: {not args.no_backup}")
    print(f" Dry run: {args.dry_run}")
    print()

    # Theme counts are computed from the selected files only.
    print("Analyzing theme card counts...")
    try:
        theme_counts = get_theme_card_counts(parquet_files)
        print(f"Found {len(theme_counts)} unique themes across files")
    except Exception as e:
        print(f"Error analyzing theme counts: {e}")
        return 1

    # Identify themes to strip
    print("Identifying themes to strip...")
    try:
        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
    except Exception as e:
        print(f"Error identifying themes to strip: {e}")
        return 1

    if not themes_to_strip:
        print("No themes found below threshold. Nothing to strip.")
        return 0

    print(f"Found {len(themes_to_strip)} themes to strip")

    if args.verbose:
        sample = sorted(themes_to_strip)[:10]
        print(f"Sample themes: {', '.join(sample)}")
        if len(themes_to_strip) > 10:
            print(f" ... and {len(themes_to_strip) - 10} more")

    print()

    # Dry run mode: report the plan and stop before touching any file.
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        for parquet_file in parquet_files:
            print(f"Would process: {parquet_file}")
        print(f"\nWould strip {len(themes_to_strip)} themes from themeTags column")
        return 0

    # Running totals across all processed files.
    total_results = {
        "files_processed": 0,
        "cards_processed": 0,
        "tags_removed": 0,
        "errors": []
    }

    for parquet_file in parquet_files:
        print(f"Processing: {parquet_file.name}")

        try:
            results = strip_parquet_themes(
                parquet_path=parquet_file,
                themes_to_strip=themes_to_strip,
                backup=not args.no_backup
            )

            total_results["files_processed"] += 1
            total_results["cards_processed"] += results["cards_processed"]
            total_results["tags_removed"] += results["tags_removed"]
            total_results["errors"].extend(results["errors"])

            if args.verbose:
                print(f" Cards: {results['cards_processed']}")
                print(f" Tags removed: {results['tags_removed']}")
                if results["backup_created"]:
                    print(f" Backup: {results['backup_created']}")

        except Exception as e:
            # A failure on one file is recorded and must not stop the
            # remaining files from being processed.
            error_msg = f"Error processing {parquet_file}: {e}"
            print(f" {error_msg}")
            total_results["errors"].append(error_msg)
            continue

        print()

    # The log is best-effort: a failure here is only a warning and does not
    # affect the exit status.
    try:
        update_stripped_themes_log(theme_counts, themes_to_strip, min_cards)
    except Exception as e:
        print(f"Warning: Failed to update stripped themes log: {e}")

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Files processed: {total_results['files_processed']}")
    print(f"Cards processed: {total_results['cards_processed']}")
    print(f"Tags removed: {total_results['tags_removed']}")
    print(f"Themes stripped: {len(themes_to_strip)}")

    if total_results["errors"]:
        print(f"\nErrors encountered: {len(total_results['errors'])}")
        for error in total_results["errors"]:
            print(f" - {error}")
    else:
        print("\nStripping completed successfully!")

    return 0 if not total_results["errors"] else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
380
code/scripts/strip_themes.py
Normal file
380
code/scripts/strip_themes.py
Normal file
|
|
@ -0,0 +1,380 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Standalone theme stripping orchestration script.
|
||||
|
||||
This script coordinates the complete theme stripping pipeline:
|
||||
1. Analyze parquet files to identify low-card themes
|
||||
2. Strip from catalog YAML files (optional)
|
||||
3. Strip from parquet themeTags columns (optional)
|
||||
4. Rebuild theme_list.json from stripped parquet data
|
||||
5. Generate stripped_themes.yml log
|
||||
|
||||
Part of Milestone 5 (M5) - Integration & Testing for Theme Stripping (R21).
|
||||
|
||||
Usage:
|
||||
# Dry run to preview changes
|
||||
python code/scripts/strip_themes.py --dry-run
|
||||
|
||||
# Strip everything with default threshold (5 cards)
|
||||
python code/scripts/strip_themes.py
|
||||
|
||||
# Strip only catalog YAML files
|
||||
python code/scripts/strip_themes.py --sources catalog
|
||||
|
||||
# Strip only parquet files
|
||||
python code/scripts/strip_themes.py --sources parquet
|
||||
|
||||
# Custom threshold
|
||||
python code/scripts/strip_themes.py --min-cards 10
|
||||
|
||||
# Skip backups (not recommended)
|
||||
python code/scripts/strip_themes.py --no-backup
|
||||
|
||||
Environment Variables:
|
||||
THEME_MIN_CARDS: Minimum card threshold (default: 5)
|
||||
|
||||
Outputs:
|
||||
- Modified catalog/*.yml files (if --sources includes catalog)
|
||||
- Modified parquet files (if --sources includes parquet)
|
||||
- Regenerated config/themes/theme_list.json (if --sources includes parquet)
|
||||
- Updated logs/stripped_themes.yml log
|
||||
- Timestamped backups (unless --no-backup is given)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Set, Dict
|
||||
|
||||
# Add project root to path
|
||||
ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from code import settings as code_settings
|
||||
from code.tagging.theme_stripper import (
|
||||
get_theme_card_counts,
|
||||
identify_themes_to_strip,
|
||||
strip_catalog_themes,
|
||||
strip_parquet_themes,
|
||||
create_stripped_themes_log
|
||||
)
|
||||
|
||||
|
||||
def strip_all_sources(
    min_cards: int,
    sources: Set[str],
    backup: bool,
    dry_run: bool,
    verbose: bool
) -> Dict:
    """
    Execute complete theme stripping pipeline.

    The pipeline analyzes the parquet files in ``card_files/processed`` to
    find themes below ``min_cards``, then strips them from the requested
    sources, rebuilds ``theme_list.json`` (only when parquet is a source),
    and writes ``logs/stripped_themes.yml``. Progress is printed to stdout.

    Args:
        min_cards: Minimum card count threshold
        sources: Set of sources to strip ('catalog', 'parquet', or both)
        backup: Whether to create backups before modification
        dry_run: Preview changes without modifying files
        verbose: Show detailed output

    Returns:
        Dictionary with stripping results and statistics. Keys:
        ``themes_analyzed``, ``themes_to_strip``, ``catalog_stripped``
        (number of catalog files modified), ``parquet_tags_removed``,
        ``json_regenerated`` and ``errors`` (list of messages). The function
        returns early, without the final summary, when no parquet files are
        found, analysis fails, nothing is below the threshold, or
        ``dry_run`` is set.
    """
    start_time = time.time()
    results = {
        "themes_analyzed": 0,
        "themes_to_strip": 0,
        "catalog_stripped": 0,
        "parquet_tags_removed": 0,
        "json_regenerated": False,
        "errors": []
    }

    print("="*70)
    print("THEME STRIPPING PIPELINE")
    print("="*70)
    print("Configuration:")
    print(f" Minimum cards: {min_cards}")
    print(f" Sources: {', '.join(sorted(sources))}")
    print(f" Backup enabled: {backup}")
    print(f" Dry run: {dry_run}")
    print()

    # Running step counter, so the printed step numbers stay consecutive
    # whichever sources are selected.
    step = 1

    # Step 1: Analyze parquet files
    print(f"Step {step}: Analyzing theme card counts...")
    try:
        parquet_dir = ROOT / "card_files" / "processed"
        parquet_files = sorted(parquet_dir.glob("*.parquet"))

        if not parquet_files:
            # Print the error like every other failure path does, so the
            # caller is not left with a silent non-zero exit.
            error_msg = "No parquet files found in card_files/processed/"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)
            return results

        theme_counts = get_theme_card_counts(parquet_files)
        results["themes_analyzed"] = len(theme_counts)
        print(f" Found {len(theme_counts)} unique themes")

        themes_to_strip = identify_themes_to_strip(theme_counts, min_cards)
        results["themes_to_strip"] = len(themes_to_strip)
        print(f" Identified {len(themes_to_strip)} themes below threshold")

        if verbose and themes_to_strip:
            sample = sorted(themes_to_strip)[:5]
            print(f" Sample themes: {', '.join(sample)}")
            if len(themes_to_strip) > 5:
                print(f" ... and {len(themes_to_strip) - 5} more")

        if not themes_to_strip:
            print("\n✅ No themes below threshold. Nothing to strip.")
            return results

    except Exception as e:
        error_msg = f"Analysis failed: {e}"
        print(f" ❌ {error_msg}")
        results["errors"].append(error_msg)
        return results

    print()

    # Dry run mode: describe what would happen and stop.
    if dry_run:
        print("DRY RUN MODE - No files will be modified")
        print()
        if 'catalog' in sources:
            print("Would strip from catalog YAML files:")
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            yaml_files = sorted(catalog_dir.glob("*.yml"))
            for yaml_file in yaml_files[:5]:
                print(f" - {yaml_file.name}")
            if len(yaml_files) > 5:
                print(f" ... and {len(yaml_files) - 5} more")

        if 'parquet' in sources:
            print("\nWould strip from parquet files:")
            for pf in parquet_files[:3]:
                print(f" - {pf.name}")
            if len(parquet_files) > 3:
                print(f" ... and {len(parquet_files) - 3} more")

        print(f"\nWould strip {len(themes_to_strip)} themes total")
        print("Would regenerate theme_list.json")
        print("Would update stripped_themes.yml log")
        return results

    # Strip from catalog (if requested)
    # NOTE: Catalog YAML must be stripped BEFORE building theme_list.json,
    # otherwise build_theme_catalog.py will read un-stripped themes from YAML
    if 'catalog' in sources:
        step += 1
        print(f"Step {step}: Stripping from catalog YAML files...")
        try:
            catalog_dir = ROOT / "config" / "themes" / "catalog"
            catalog_results = strip_catalog_themes(
                catalog_dir=catalog_dir,
                themes_to_strip=themes_to_strip,
                backup=backup
            )

            results["catalog_stripped"] = catalog_results["files_modified"]

            if verbose:
                print(f" Files modified: {catalog_results['files_modified']}")
                print(f" Themes removed: {catalog_results['themes_removed']}")
                if catalog_results["backups_created"]:
                    print(f" Backups created: {len(catalog_results['backups_created'])}")
            else:
                print(f" ✓ Stripped {catalog_results['themes_removed']} themes from {catalog_results['files_modified']} files")

            results["errors"].extend(catalog_results["errors"])

        except Exception as e:
            error_msg = f"Catalog stripping failed: {e}"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Strip from parquet (if requested)
    if 'parquet' in sources:
        step += 1
        print(f"Step {step}: Stripping from parquet files...")
        try:
            for parquet_file in parquet_files:
                if verbose:
                    print(f" Processing: {parquet_file.name}")

                parquet_results = strip_parquet_themes(
                    parquet_path=parquet_file,
                    themes_to_strip=themes_to_strip,
                    backup=backup
                )

                results["parquet_tags_removed"] += parquet_results["tags_removed"]
                results["errors"].extend(parquet_results["errors"])

                if verbose and parquet_results["tags_removed"] > 0:
                    print(f" Removed {parquet_results['tags_removed']} tag occurrences")

            if not verbose:
                print(f" ✓ Removed {results['parquet_tags_removed']} tag occurrences from {len(parquet_files)} file(s)")

        except Exception as e:
            error_msg = f"Parquet stripping failed: {e}"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Rebuild theme_list.json (if parquet was stripped)
    # NOTE: This reads from both parquet AND catalog YAML, so both must be stripped first
    if 'parquet' in sources:
        step += 1
        print(f"Step {step}: Rebuilding theme_list.json...")
        try:
            # Import build script
            from code.scripts.build_theme_catalog import main as build_main

            # Suppress verbose build output unless --verbose flag
            import io
            import contextlib

            if verbose:
                stdout_guard = contextlib.nullcontext()
            else:
                stdout_guard = contextlib.redirect_stdout(io.StringIO())

            try:
                with stdout_guard:
                    build_main()
            except SystemExit as exit_request:
                # build_main is another script's entry point; if it exits
                # (NOTE(review): e.g. if it parses sys.argv itself -- confirm
                # against build_theme_catalog), SystemExit would bypass the
                # "except Exception" below and abort the pipeline before the
                # log and summary are written. A zero/None status counts as
                # success; anything else is reported as a failure.
                if exit_request.code not in (0, None):
                    raise RuntimeError(
                        f"build script exited with status {exit_request.code}"
                    ) from exit_request

            results["json_regenerated"] = True
            print(" ✓ theme_list.json regenerated")

        except Exception as e:
            error_msg = f"JSON regeneration failed: {e}"
            print(f" ❌ {error_msg}")
            results["errors"].append(error_msg)

        print()

    # Update stripped themes log
    step += 1
    print(f"Step {step}: Updating stripped_themes.yml log...")
    try:
        log_path = ROOT / "logs" / "stripped_themes.yml"
        source_labels = []
        if 'catalog' in sources:
            source_labels.append("catalog YAML")
        if 'parquet' in sources:
            source_labels.append("parquet files")

        create_stripped_themes_log(
            output_path=log_path,
            theme_counts=theme_counts,
            themes_stripped=themes_to_strip,
            min_threshold=min_cards,
            sources=source_labels if source_labels else None
        )
        print(f" ✓ Log updated: {log_path}")

    except Exception as e:
        error_msg = f"Log update failed: {e}"
        print(f" ❌ {error_msg}")
        results["errors"].append(error_msg)

    # Final summary
    elapsed = time.time() - start_time
    print()
    print("="*70)
    print("SUMMARY")
    print("="*70)
    print(f"Themes analyzed: {results['themes_analyzed']}")
    print(f"Themes stripped: {results['themes_to_strip']}")
    if 'catalog' in sources:
        print(f"Catalog files modified: {results['catalog_stripped']}")
    if 'parquet' in sources:
        print(f"Parquet tags removed: {results['parquet_tags_removed']}")
        print(f"JSON regenerated: {'Yes' if results['json_regenerated'] else 'No'}")
    print(f"Time elapsed: {elapsed:.2f}s")

    if results["errors"]:
        print(f"\n⚠️ Errors encountered: {len(results['errors'])}")
        for error in results["errors"]:
            print(f" - {error}")
    else:
        print("\n✅ Theme stripping completed successfully!")

    return results
|
||||
|
||||
|
||||
def main():
    """Parse CLI flags and run the theme stripping pipeline.

    Resolves the card-count threshold and the set of sources from the
    command line, then delegates to ``strip_all_sources``.

    Returns:
        int: ``0`` when the pipeline reported no errors, ``1`` when
        ``--sources`` contains an unknown value or the pipeline collected
        at least one error.
    """
    parser = argparse.ArgumentParser(
        description="Orchestrate complete theme stripping pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument(
        '--min-cards',
        type=int,
        help=f'Minimum card count threshold (default: {code_settings.THEME_MIN_CARDS})'
    )

    parser.add_argument(
        '--sources',
        type=str,
        help='Comma-separated list of sources to strip: catalog, parquet, all (default: all)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be stripped without making changes'
    )

    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Skip creating backup files before modification'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed stripping information'
    )

    args = parser.parse_args()

    # Compare against None rather than relying on truthiness: an explicit
    # "--min-cards 0" must be honoured instead of silently falling back to
    # the configured default.
    if args.min_cards is not None:
        min_cards = args.min_cards
    else:
        min_cards = code_settings.THEME_MIN_CARDS

    # Determine sources
    valid_sources = {'catalog', 'parquet'}
    if args.sources:
        source_input = args.sources.lower()
        if source_input == 'all':
            sources = set(valid_sources)
        else:
            sources = {s.strip() for s in source_input.split(',')}
            invalid = sources - valid_sources
            if invalid:
                # Sort so the message is deterministic (sets are unordered).
                print(f"Error: Invalid sources: {', '.join(sorted(invalid))}")
                print(f"Valid sources: {', '.join(sorted(valid_sources))}, all")
                return 1
    else:
        sources = set(valid_sources)  # Default: all sources

    # Execute pipeline
    results = strip_all_sources(
        min_cards=min_cards,
        sources=sources,
        backup=not args.no_backup,
        dry_run=args.dry_run,
        verbose=args.verbose
    )

    # Return exit code
    return 0 if not results["errors"] else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue