feat(editorial): Phase D synergy commander enrichment, augmentation, lint & docs

Adds Phase D editorial tooling: synergy-based commander selection with 3/2/1 pattern, duplicate filtering, annotated synergy_commanders, promotion to minimum examples, and augmentation heuristics (e.g. Counters Matter/Proliferate injection). Includes new scripts (generate_theme_editorial_suggestions, lint, validate, catalog build/apply), updates orchestrator & web routes, expands CI workflow, and documents usage & non-determinism policies. Updates lint rules, type definitions, and docker configs.

This commit is contained in:
matt 2025-09-18 10:59:20 -07:00
parent 16261bbf09
commit f2a76d2ffc
35 changed files with 2818 additions and 509 deletions

View file

@ -0,0 +1,79 @@
"""Apply example_cards / example_commanders to the next theme missing them.
Usage:
python code/scripts/apply_next_theme_editorial.py
Repeating invocation will fill themes one at a time (skips deprecated alias placeholders).
Options:
--force overwrite existing lists for that theme
--top / --top-commanders size knobs forwarded to suggestion generator
"""
from __future__ import annotations
import argparse
import subprocess
import sys
from pathlib import Path
import yaml # type: ignore
ROOT = Path(__file__).resolve().parents[2]
CATALOG_DIR = ROOT / 'config' / 'themes' / 'catalog'
def find_next_missing():
    """Return (display_name, filename) for the first theme YAML lacking examples.

    A theme counts as "missing" only when the 'example_cards' or
    'example_commanders' key is absent entirely; empty lists are accepted so a
    theme with no good examples is not selected repeatedly.  Deprecated alias
    placeholder files are skipped.  Returns (None, None) when nothing remains.
    """
    for candidate in sorted(CATALOG_DIR.glob('*.yml')):
        try:
            doc = yaml.safe_load(candidate.read_text(encoding='utf-8'))
        except Exception:
            continue
        if not isinstance(doc, dict):
            continue
        note_text = doc.get('notes', '')
        if isinstance(note_text, str) and 'Deprecated alias file' in note_text:
            continue
        # Completion rule: a theme is "missing" only if a key itself is absent.
        # Empty lists are allowed so we don't reselect the same file forever.
        has_both = ('example_cards' in doc) and ('example_commanders' in doc)
        if not has_both:
            return doc.get('display_name'), candidate.name
    return None, None
def main():  # pragma: no cover
    """CLI entry point: fill editorial examples for the next incomplete theme."""
    parser = argparse.ArgumentParser(description='Apply editorial examples to next missing theme')
    parser.add_argument('--force', action='store_true')
    parser.add_argument('--top', type=int, default=8)
    parser.add_argument('--top-commanders', type=int, default=5)
    opts = parser.parse_args()
    theme, fname = find_next_missing()
    if not theme:
        print('All themes already have example_cards & example_commanders (or no YAML).')
        return
    print(f"Next missing theme: {theme} ({fname})")
    suggestion_cmd = [
        sys.executable,
        str(ROOT / 'code' / 'scripts' / 'generate_theme_editorial_suggestions.py'),
        '--themes', theme,
        '--apply', '--limit-yaml', '1',
        '--top', str(opts.top), '--top-commanders', str(opts.top_commanders),
    ]
    if opts.force:
        suggestion_cmd.append('--force')
    print('Running:', ' '.join(suggestion_cmd))
    subprocess.run(suggestion_cmd, check=False)
    # Post-pass: when example_cards landed but no commanders could be inferred,
    # stamp an empty list so the next invocation moves on to another theme
    # instead of re-selecting this file forever.
    if fname:
        target = CATALOG_DIR / fname
        try:
            doc = yaml.safe_load(target.read_text(encoding='utf-8'))
            if isinstance(doc, dict) and 'example_cards' in doc and 'example_commanders' not in doc:
                doc['example_commanders'] = []
                target.write_text(yaml.safe_dump(doc, sort_keys=False, allow_unicode=True), encoding='utf-8')
                print(f"[post] added empty example_commanders list to {fname} (no suggestions available)")
        except Exception as e:  # pragma: no cover
            print(f"[post-warn] failed to add placeholder commanders for {fname}: {e}")


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,367 @@
"""Phase B: Merge curated YAML catalog with regenerated analytics to build theme_list.json.
See roadmap Phase B goals. This script unifies generation:
- Discovers themes (constants + tagger + CSV dynamic tags)
- Applies whitelist governance (normalization, pruning, always_include)
- Recomputes frequencies & PMI co-occurrence for inference
- Loads curated YAML files (Phase A outputs) for editorial overrides
- Merges curated, enforced, and inferred synergies with precedence
- Applies synergy cap without truncating curated or enforced entries
- Emits theme_list.json with provenance block
Opt-in via env THEME_CATALOG_MODE=merge (or build/phaseb). Or run manually:
python code/scripts/build_theme_catalog.py --verbose
This is intentionally side-effect only (writes JSON). Unit tests for Phase C will
add schema validation; for now we focus on deterministic, stable output.
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
try: # Optional
import yaml # type: ignore
except Exception: # pragma: no cover
yaml = None
ROOT = Path(__file__).resolve().parents[2]
CODE_ROOT = ROOT / 'code'
if str(CODE_ROOT) not in sys.path:
sys.path.insert(0, str(CODE_ROOT))
from scripts.extract_themes import ( # type: ignore
BASE_COLORS,
collect_theme_tags_from_constants,
collect_theme_tags_from_tagger_source,
gather_theme_tag_rows,
tally_tag_frequencies_by_base_color,
compute_cooccurrence,
cooccurrence_scores_for,
derive_synergies_for_tags,
apply_normalization,
load_whitelist_config,
should_keep_theme,
)
CATALOG_DIR = ROOT / 'config' / 'themes' / 'catalog'
OUTPUT_JSON = ROOT / 'config' / 'themes' / 'theme_list.json'
@dataclass
class ThemeYAML:
    """In-memory view of one curated theme YAML file.

    The Phase D editorial fields default to ``None`` so the ``getattr``-based
    checks in build_catalog() treat "absent in YAML" as falsy and omit the
    key from the emitted entry.
    """
    id: str
    display_name: str
    curated_synergies: List[str]
    enforced_synergies: List[str]
    inferred_synergies: List[str]
    synergies: List[str]
    primary_color: Optional[str] = None
    secondary_color: Optional[str] = None
    notes: str = ''
    # Phase D editorial metadata (optional; populated by load_catalog_yaml).
    example_commanders: Optional[List[str]] = None
    example_cards: Optional[List[str]] = None
    synergy_commanders: Optional[List[str]] = None
    deck_archetype: Optional[str] = None
    popularity_hint: Optional[str] = None


def _log(msg: str, verbose: bool):  # pragma: no cover
    """Emit a namespaced diagnostic line to stderr when verbose is enabled."""
    if verbose:
        print(f"[build_theme_catalog] {msg}", file=sys.stderr)


def load_catalog_yaml(verbose: bool) -> Dict[str, ThemeYAML]:
    """Load all curated per-theme YAML files into ThemeYAML records.

    Returns a mapping of display_name -> ThemeYAML.  Unreadable files,
    non-mapping documents, entries without a display_name, and deprecated
    alias placeholders are skipped (with a verbose log for read failures) so
    one bad file cannot break the whole build.
    """
    out: Dict[str, ThemeYAML] = {}
    if not CATALOG_DIR.exists() or yaml is None:
        return out
    for path in sorted(CATALOG_DIR.glob('*.yml')):
        try:
            data = yaml.safe_load(path.read_text(encoding='utf-8'))
        except Exception:
            _log(f"Failed reading {path.name}", verbose)
            continue
        if not isinstance(data, dict):
            continue
        # Skip deprecated alias placeholder files (marked in notes)
        try:
            notes_field = data.get('notes')
            if isinstance(notes_field, str) and 'Deprecated alias file' in notes_field:
                continue
        except Exception:
            pass
        try:
            ty = ThemeYAML(
                id=str(data.get('id') or ''),
                display_name=str(data.get('display_name') or ''),
                curated_synergies=list(data.get('curated_synergies') or []),
                enforced_synergies=list(data.get('enforced_synergies') or []),
                inferred_synergies=list(data.get('inferred_synergies') or []),
                synergies=list(data.get('synergies') or []),
                primary_color=data.get('primary_color'),
                secondary_color=data.get('secondary_color'),
                notes=str(data.get('notes') or ''),
                # Fix: previously these Phase D fields were never attached to
                # ThemeYAML instances, so build_catalog's getattr() lookups
                # always saw None and curated editorial metadata was dropped
                # from the merged catalog output.  Empty lists collapse to
                # None to preserve the "absent -> falsy -> omitted" behavior.
                example_commanders=list(data.get('example_commanders') or []) or None,
                example_cards=list(data.get('example_cards') or []) or None,
                synergy_commanders=list(data.get('synergy_commanders') or []) or None,
                deck_archetype=data.get('deck_archetype'),
                popularity_hint=data.get('popularity_hint'),
            )
        except Exception:
            continue
        if not ty.display_name:
            continue
        out[ty.display_name] = ty
    return out
def regenerate_analytics(verbose: bool):
    """Recompute theme analytics from constants, tagger source, and CSV rows.

    Pipeline: discover raw theme tags -> apply whitelist normalization ->
    drop blacklisted/excluded tags -> tally per-color frequencies -> prune by
    frequency governance (should_keep_theme + always_include) -> compute
    co-occurrence stats for PMI inference.

    Returns a dict with keys: theme_tags, frequencies, co_map, tag_counts,
    total_rows, whitelist.

    NOTE(review): *verbose* is currently unused in this function — confirm
    whether progress logging via _log was intended here.
    """
    theme_tags: Set[str] = set()
    theme_tags |= collect_theme_tags_from_constants()
    theme_tags |= collect_theme_tags_from_tagger_source()
    try:
        csv_rows = gather_theme_tag_rows()
        for row_tags in csv_rows:
            for t in row_tags:
                if isinstance(t, str) and t:
                    theme_tags.add(t)
    except Exception:
        # Best-effort: the co-occurrence step below re-reads rows when this is empty.
        csv_rows = []
    whitelist = load_whitelist_config()
    normalization_map: Dict[str, str] = whitelist.get('normalization', {}) if isinstance(whitelist.get('normalization'), dict) else {}
    exclusions: Set[str] = set(whitelist.get('exclusions', []) or [])
    protected_prefixes: List[str] = list(whitelist.get('protected_prefixes', []) or [])
    protected_suffixes: List[str] = list(whitelist.get('protected_suffixes', []) or [])
    min_overrides: Dict[str, int] = whitelist.get('min_frequency_overrides', {}) or {}
    if normalization_map:
        theme_tags = apply_normalization(theme_tags, normalization_map)
    # Hard-coded blacklist applied in addition to whitelist exclusions.
    blacklist = {"Draw Triggers"}
    theme_tags = {t for t in theme_tags if t and t not in blacklist and t not in exclusions}
    try:
        frequencies = tally_tag_frequencies_by_base_color()
    except Exception:
        frequencies = {}
    if frequencies:
        # Frequency-based pruning only runs when the tally succeeded.
        def total_count(t: str) -> int:
            # Sum a tag's count across all base colors (missing entries count 0).
            s = 0
            for c in BASE_COLORS.keys():
                try:
                    s += int(frequencies.get(c, {}).get(t, 0))
                except Exception:
                    pass
            return s
        kept: Set[str] = set()
        for t in list(theme_tags):
            if should_keep_theme(t, total_count(t), whitelist, protected_prefixes, protected_suffixes, min_overrides):
                kept.add(t)
        # always_include entries bypass frequency governance entirely.
        for extra in whitelist.get('always_include', []) or []:
            kept.add(str(extra))
        theme_tags = kept
    try:
        rows = csv_rows if csv_rows else gather_theme_tag_rows()
        co_map, tag_counts, total_rows = compute_cooccurrence(rows)
    except Exception:
        co_map, tag_counts, total_rows = {}, Counter(), 0
    return dict(theme_tags=theme_tags, frequencies=frequencies, co_map=co_map, tag_counts=tag_counts, total_rows=total_rows, whitelist=whitelist)
def _primary_secondary(theme: str, freqs: Dict[str, Dict[str, int]]):
    """Return (primary, secondary) TitleCase base colors for *theme* by frequency.

    Ties break alphabetically on color name; colors with zero count are never
    returned.  Yields (None, None) when no frequency data exists at all or
    the theme has no positive count anywhere.
    """
    if not freqs:
        return None, None
    title = {'white': 'White', 'blue': 'Blue', 'black': 'Black', 'red': 'Red', 'green': 'Green'}
    counts: List[Tuple[str, int]] = []
    for color in BASE_COLORS.keys():
        try:
            n = int(freqs.get(color, {}).get(theme, 0))
        except Exception:
            n = 0
        counts.append((color, n))
    counts.sort(key=lambda pair: (-pair[1], pair[0]))
    if not counts or counts[0][1] <= 0:
        return None, None
    primary = title[counts[0][0]]
    secondary = next((title[c] for c, n in counts[1:] if n > 0), None)
    return primary, secondary
def infer_synergies(anchor: str, curated: List[str], enforced: List[str], analytics: dict, pmi_min: float = 0.0, co_min: int = 5) -> List[str]:
    """Return up to 12 PMI-inferred synergy themes for *anchor*.

    Candidates must score strictly above *pmi_min* and co-occur at least
    *co_min* times, and must not duplicate the anchor, curated, enforced, or
    already-selected entries.  Empty when the anchor has no co-occurrence data.
    """
    if anchor not in analytics['co_map'] or analytics['total_rows'] <= 0:
        return []
    ranked = cooccurrence_scores_for(anchor, analytics['co_map'], analytics['tag_counts'], analytics['total_rows'])
    picked: List[str] = []
    for candidate, score, co_count in ranked:
        if score <= pmi_min or co_count < co_min:
            continue
        if candidate == anchor or candidate in curated or candidate in enforced or candidate in picked:
            continue
        picked.append(candidate)
        if len(picked) >= 12:
            break
    return picked
def build_catalog(limit: int, verbose: bool) -> Dict[str, Any]:
    """Merge regenerated analytics with curated YAML into the catalog payload.

    Per-theme precedence: curated synergies -> enforced synergies -> inferred
    (PMI) synergies; the merged list is de-duplicated, noise-filtered, then
    soft-capped (curated/enforced entries are never truncated).

    Args:
        limit: process at most this many themes (0 = all); debugging aid.
        verbose: forwarded to analytics/YAML loaders for stderr logging.

    Returns:
        The full theme_list.json structure including a provenance block.
    """
    analytics = regenerate_analytics(verbose)
    whitelist = analytics['whitelist']
    synergy_cap = int(whitelist.get('synergy_cap', 0) or 0)
    normalization_map: Dict[str, str] = whitelist.get('normalization', {}) if isinstance(whitelist.get('normalization'), dict) else {}
    enforced_cfg: Dict[str, List[str]] = whitelist.get('enforced_synergies', {}) or {}
    yaml_catalog = load_catalog_yaml(verbose)
    # Union of analytics-discovered tags and curated YAML display names.
    all_themes: Set[str] = set(analytics['theme_tags']) | {t.display_name for t in yaml_catalog.values()}
    if normalization_map:
        all_themes = apply_normalization(all_themes, normalization_map)
    curated_baseline = derive_synergies_for_tags(all_themes)
    entries: List[Dict[str, Any]] = []
    processed = 0
    for theme in sorted(all_themes):
        if limit and processed >= limit:
            break
        processed += 1
        y = yaml_catalog.get(theme)
        # Curated list: YAML override wins, else the derived baseline mapping.
        curated_list = list(y.curated_synergies) if y and y.curated_synergies else curated_baseline.get(theme, [])
        enforced_list: List[str] = []
        if y and y.enforced_synergies:
            for s in y.enforced_synergies:
                if s not in enforced_list:
                    enforced_list.append(s)
        if theme in enforced_cfg:
            for s in enforced_cfg.get(theme, []):
                if s not in enforced_list:
                    enforced_list.append(s)
        inferred_list = infer_synergies(theme, curated_list, enforced_list, analytics)
        # Fall back to previously persisted inferred synergies when live
        # inference produced nothing (e.g. analytics unavailable).
        if not inferred_list and y and y.inferred_synergies:
            inferred_list = [s for s in y.inferred_synergies if s not in curated_list and s not in enforced_list]
        if normalization_map:
            def _norm(seq: List[str]) -> List[str]:
                # Map each name through the normalization table, de-duplicated in order.
                seen = set()
                out = []
                for s in seq:
                    s2 = normalization_map.get(s, s)
                    if s2 not in seen:
                        out.append(s2)
                        seen.add(s2)
                return out
            curated_list = _norm(curated_list)
            enforced_list = _norm(enforced_list)
            inferred_list = _norm(inferred_list)
        # Merge with precedence, dropping self-references and duplicates.
        merged: List[str] = []
        for bucket in (curated_list, enforced_list, inferred_list):
            for s in bucket:
                if s == theme:
                    continue
                if s not in merged:
                    merged.append(s)
        # Noise suppression: remove ubiquitous Legends/Historics links except for their mutual pairing.
        # Rationale: Every legendary permanent is tagged with both themes (Historics also covers artifacts/enchantments),
        # creating low-signal "synergies" that crowd out more meaningful relationships. Requirement:
        # - For any theme other than the two themselves, strip both "Legends Matter" and "Historics Matter".
        # - For "Legends Matter", allow "Historics Matter" to remain (and vice-versa).
        special_noise = {"Legends Matter", "Historics Matter"}
        if theme not in special_noise:
            if any(s in special_noise for s in merged):
                merged = [s for s in merged if s not in special_noise]
        # If theme is one of the special ones, keep the other if present (no action needed beyond above filter logic).
        if synergy_cap > 0 and len(merged) > synergy_cap:
            # Cap only the inferred tail; curated + enforced entries always survive.
            ce_len = len(curated_list) + len([s for s in enforced_list if s not in curated_list])
            if ce_len < synergy_cap:
                allowed_inferred = synergy_cap - ce_len
                ce_part = merged[:ce_len]
                inferred_tail = [s for s in merged[ce_len:ce_len+allowed_inferred]]
                merged = ce_part + inferred_tail
            # else: keep all (soft exceed)
        # Color attribution: curated YAML override first, else frequency-derived.
        if y and (y.primary_color or y.secondary_color):
            primary, secondary = y.primary_color, y.secondary_color
        else:
            primary, secondary = _primary_secondary(theme, analytics['frequencies'])
        entry = {'theme': theme, 'synergies': merged}
        if primary:
            entry['primary_color'] = primary
        if secondary:
            entry['secondary_color'] = secondary
        # Phase D: carry forward optional editorial metadata if present in YAML
        # NOTE(review): the ThemeYAML dataclass above does not declare
        # example_commanders / example_cards / deck_archetype / popularity_hint /
        # synergy_commanders, so these getattr() lookups appear to always yield
        # None and silently drop curated metadata — confirm and add the fields.
        if y:
            if getattr(y, 'example_commanders', None):
                entry['example_commanders'] = [c for c in y.example_commanders if isinstance(c, str)][:12]
            if getattr(y, 'example_cards', None):
                # Limit to 20 for safety (UI may further cap)
                dedup_cards = []
                seen_cards = set()
                for c in y.example_cards:
                    if isinstance(c, str) and c and c not in seen_cards:
                        dedup_cards.append(c)
                        seen_cards.add(c)
                        if len(dedup_cards) >= 20:
                            break
                if dedup_cards:
                    entry['example_cards'] = dedup_cards
            if getattr(y, 'deck_archetype', None):
                entry['deck_archetype'] = y.deck_archetype
            if getattr(y, 'popularity_hint', None):
                entry['popularity_hint'] = y.popularity_hint
            # Pass through synergy_commanders if already curated (script will populate going forward)
            if hasattr(y, 'synergy_commanders') and getattr(y, 'synergy_commanders'):
                entry['synergy_commanders'] = [c for c in getattr(y, 'synergy_commanders') if isinstance(c, str)][:12]
        entries.append(entry)
    provenance = {
        'mode': 'merge',
        'generated_at': time.strftime('%Y-%m-%dT%H:%M:%S'),
        'curated_yaml_files': len(yaml_catalog),
        'synergy_cap': synergy_cap,
        'inference': 'pmi',
        'version': 'phase-b-merge-v1'
    }
    return {
        'themes': entries,
        'frequencies_by_base_color': analytics['frequencies'],
        'generated_from': 'merge (analytics + curated YAML + whitelist)',
        'provenance': provenance,
    }
def main():  # pragma: no cover
    """CLI entry: build the merged catalog and write (or summarize) the JSON."""
    ap = argparse.ArgumentParser(description='Build merged theme catalog (Phase B)')
    ap.add_argument('--limit', type=int, default=0)
    ap.add_argument('--verbose', action='store_true')
    ap.add_argument('--dry-run', action='store_true')
    ap.add_argument('--schema', action='store_true', help='Print JSON Schema for catalog and exit')
    args = ap.parse_args()
    if args.schema:
        # Lazy import to avoid circular dependency: replicate minimal schema inline from models file if present
        try:
            from type_definitions_theme_catalog import ThemeCatalog  # type: ignore
            import json as _json
            print(_json.dumps(ThemeCatalog.model_json_schema(), indent=2))
        except Exception as _e:  # pragma: no cover
            print(f"Failed to load schema models: {_e}")
        return
    catalog = build_catalog(limit=args.limit, verbose=args.verbose)
    if args.dry_run:
        print(json.dumps({'theme_count': len(catalog['themes']), 'provenance': catalog['provenance']}, indent=2))
        return
    os.makedirs(OUTPUT_JSON.parent, exist_ok=True)
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(catalog, f, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    try:
        main()
    except Exception as e:  # broad guard for orchestrator fallback
        print(f"ERROR: build_theme_catalog failed: {e}", file=sys.stderr)
        sys.exit(1)

View file

@ -0,0 +1,150 @@
"""Phase A: Export existing generated theme_list.json into per-theme YAML files.
Generates one YAML file per theme under config/themes/catalog/<slug>.yml
Slug rules:
- Lowercase
- Alphanumerics kept
- Spaces and consecutive separators -> single hyphen
- '+' replaced with 'plus'
- '/' replaced with '-'
- Other punctuation removed
- Collapse multiple hyphens
YAML schema (initial minimal):
id: <slug>
display_name: <theme>
curated_synergies: [ ... ] # (only curated portion, best-effort guess)
enforced_synergies: [ ... ] # (if present in whitelist enforced_synergies or auto-inferred cluster)
primary_color: Optional TitleCase
secondary_color: Optional TitleCase
notes: '' # placeholder for editorial additions
We treat current synergy list (capped) as partially curated; we attempt to recover curated vs inferred by re-running
`derive_synergies_for_tags` from extract_themes (imported) to see which curated anchors apply.
Safety: Does NOT overwrite an existing file unless --force provided.
"""
from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
from typing import Dict, List, Set
import yaml # type: ignore
# Reuse logic from extract_themes by importing derive_synergies_for_tags
import sys
SCRIPT_ROOT = Path(__file__).resolve().parent
CODE_ROOT = SCRIPT_ROOT.parent
if str(CODE_ROOT) not in sys.path:
sys.path.insert(0, str(CODE_ROOT))
from scripts.extract_themes import derive_synergies_for_tags # type: ignore
ROOT = Path(__file__).resolve().parents[2]
THEME_JSON = ROOT / 'config' / 'themes' / 'theme_list.json'
CATALOG_DIR = ROOT / 'config' / 'themes' / 'catalog'
WHITELIST_YML = ROOT / 'config' / 'themes' / 'theme_whitelist.yml'
def load_theme_json() -> Dict:
    """Parse theme_list.json, aborting with a helpful message when it is missing."""
    if THEME_JSON.exists():
        return json.loads(THEME_JSON.read_text(encoding='utf-8'))
    raise SystemExit(f"theme_list.json not found at {THEME_JSON}. Run extract_themes.py first.")
def load_whitelist() -> Dict:
    """Return the parsed theme whitelist config, or {} when absent or unreadable."""
    if not WHITELIST_YML.exists():
        return {}
    try:
        parsed = yaml.safe_load(WHITELIST_YML.read_text(encoding='utf-8'))
    except Exception:
        return {}
    return parsed or {}
def slugify(name: str) -> str:
    """Convert a display name to a filesystem-safe slug.

    '+' becomes 'plus', '/' becomes '-', whitespace and underscores collapse
    to a single hyphen, all other non [a-z0-9-] characters are dropped, runs
    of hyphens collapse, and leading/trailing hyphens are stripped.
    """
    slug = name.strip().lower().replace('+', 'plus').replace('/', '-')
    slug = re.sub(r'[\s_]+', '-', slug)      # whitespace/underscores -> hyphen
    slug = re.sub(r'[^a-z0-9-]', '', slug)   # drop disallowed characters
    slug = re.sub(r'-{2,}', '-', slug)       # collapse hyphen runs
    return slug.strip('-')
def recover_curated_synergies(all_themes: Set[str], theme: str) -> List[str]:
    """Re-derive the curated synergy mapping and return *theme*'s list ([] if absent)."""
    return derive_synergies_for_tags(all_themes).get(theme, [])
def main():
    """Export each theme from theme_list.json to its own YAML file (Phase A).

    Existing files are preserved unless --force is given.  Curated vs
    enforced vs inferred synergies are recovered best-effort so later phases
    can merge them with correct precedence.
    """
    parser = argparse.ArgumentParser(description='Export per-theme YAML catalog files (Phase A).')
    parser.add_argument('--force', action='store_true', help='Overwrite existing YAML files if present.')
    parser.add_argument('--limit', type=int, default=0, help='Limit export to first N themes (debug).')
    args = parser.parse_args()
    data = load_theme_json()
    themes = data.get('themes', [])
    whitelist = load_whitelist()
    enforced_cfg = whitelist.get('enforced_synergies', {}) if isinstance(whitelist.get('enforced_synergies', {}), dict) else {}
    all_theme_names: Set[str] = {t.get('theme') for t in themes if isinstance(t, dict) and t.get('theme')}
    CATALOG_DIR.mkdir(parents=True, exist_ok=True)
    exported = 0
    for entry in themes:
        theme_name = entry.get('theme')
        if not theme_name:
            continue
        if args.limit and exported >= args.limit:
            break
        slug = slugify(theme_name)
        path = CATALOG_DIR / f'{slug}.yml'
        # Safety: never clobber an existing curated file without --force.
        if path.exists() and not args.force:
            continue
        synergy_list = entry.get('synergies', []) or []
        # Attempt to separate curated portion (only for themes in curated mapping)
        curated_synergies = recover_curated_synergies(all_theme_names, theme_name)
        enforced_synergies = enforced_cfg.get(theme_name, [])
        # Keep order: curated -> enforced -> inferred. synergy_list already reflects that ordering from generation.
        # Filter curated to those present in current synergy_list to avoid stale entries.
        curated_synergies = [s for s in curated_synergies if s in synergy_list]
        # Remove enforced from curated to avoid duplication across buckets
        curated_synergies_clean = [s for s in curated_synergies if s not in enforced_synergies]
        # Inferred = remaining items in synergy_list not in curated or enforced
        curated_set = set(curated_synergies_clean)
        enforced_set = set(enforced_synergies)
        inferred_synergies = [s for s in synergy_list if s not in curated_set and s not in enforced_set]
        doc = {
            'id': slug,
            'display_name': theme_name,
            'synergies': synergy_list,  # full capped list (ordered)
            'curated_synergies': curated_synergies_clean,
            'enforced_synergies': enforced_synergies,
            'inferred_synergies': inferred_synergies,
            'primary_color': entry.get('primary_color'),
            'secondary_color': entry.get('secondary_color'),
            'notes': ''
        }
        # Drop None color keys for cleanliness
        if doc['primary_color'] is None:
            doc.pop('primary_color')
        if doc.get('secondary_color') is None:
            doc.pop('secondary_color')
        with path.open('w', encoding='utf-8') as f:
            yaml.safe_dump(doc, f, sort_keys=False, allow_unicode=True)
        exported += 1
    print(f"Exported {exported} theme YAML files to {CATALOG_DIR}")


if __name__ == '__main__':
    main()

View file

@ -221,12 +221,11 @@ def derive_synergies_for_tags(tags: Set[str]) -> Dict[str, List[str]]:
("Noncreature Spells", ["Spellslinger", "Prowess"]),
("Prowess", ["Spellslinger", "Noncreature Spells"]),
# Artifacts / Enchantments
("Artifacts Matter", ["Treasure Token", "Equipment", "Vehicles", "Improvise"]),
("Artifacts Matter", ["Treasure Token", "Equipment Matters", "Vehicles", "Improvise"]),
("Enchantments Matter", ["Auras", "Constellation", "Card Draw"]),
("Auras", ["Constellation", "Voltron", "Enchantments Matter"]),
("Equipment", ["Voltron", "Double Strike", "Warriors Matter"]),
("Treasure Token", ["Sacrifice Matters", "Artifacts Matter", "Ramp"]),
("Vehicles", ["Artifacts Matter", "Equipment"]),
("Vehicles", ["Artifacts Matter", "Crew", "Vehicles"]),
# Counters / Proliferate
("Counters Matter", ["Proliferate", "+1/+1 Counters", "Adapt", "Outlast"]),
("+1/+1 Counters", ["Proliferate", "Counters Matter", "Adapt", "Evolve"]),
@ -237,7 +236,7 @@ def derive_synergies_for_tags(tags: Set[str]) -> Dict[str, List[str]]:
("Landfall", ["Lands Matter", "Ramp", "Token Creation"]),
("Domain", ["Lands Matter", "Ramp"]),
# Combat / Voltron
("Voltron", ["Equipment", "Auras", "Double Strike"]),
("Voltron", ["Equipment Matters", "Auras", "Double Strike"]),
# Card flow
("Card Draw", ["Loot", "Wheels", "Replacement Draw", "Unconditional Draw", "Conditional Draw"]),
("Loot", ["Card Draw", "Discard Matters", "Reanimate"]),

View file

@ -0,0 +1,432 @@
"""Generate editorial metadata suggestions for theme YAML files (Phase D helper).
Features:
- Scans color CSV files (skips monolithic cards.csv unless --include-master)
- Collects top-N (lowest EDHREC rank) cards per theme based on themeTags column
- Optionally derives commander suggestions from commander_cards.csv (if present)
- Provides dry-run output (default) or can patch YAML files that lack example_cards / example_commanders
- Prints streaming progress so the user sees real-time status
Usage (dry run):
python code/scripts/generate_theme_editorial_suggestions.py --themes "Landfall,Reanimate" --top 8
Write back missing fields (only if not already present):
python code/scripts/generate_theme_editorial_suggestions.py --apply --limit-yaml 500
Safety:
- Existing example_cards / example_commanders are never overwritten unless --force is passed
- Writes are limited by --limit-yaml (default 0 means unlimited) to avoid massive churn accidentally
Heuristics:
- Deduplicate card names per theme
- Filter out names with extremely poor rank (> 60000) by default (configurable)
- For commander suggestions, prefer legendary creatures/planeswalkers in commander_cards.csv whose themeTags includes the theme
- Fallback commander suggestions: take top legendary cards from color CSVs tagged with the theme
- synergy_commanders: derive from top 3 synergies of each theme (3 from top, 2 from second, 1 from third)
- Promotion: if fewer than --min-examples example_commanders exist after normal suggestion, promote synergy_commanders (in order) into example_commanders, annotating with " - Synergy (<synergy name>)"
"""
from __future__ import annotations
import argparse
import ast
import csv
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple, Set
import sys
try: # optional dependency safety
import yaml # type: ignore
except Exception:
yaml = None
ROOT = Path(__file__).resolve().parents[2]
CSV_DIR = ROOT / 'csv_files'
CATALOG_DIR = ROOT / 'config' / 'themes' / 'catalog'
COLOR_CSV_GLOB = '*_cards.csv'
MASTER_FILE = 'cards.csv'
COMMANDER_FILE = 'commander_cards.csv'
@dataclass
class ThemeSuggestion:
    """Suggested editorial lists for a single theme (Phase D helper)."""
    cards: List[str]               # top example cards, best EDHREC rank first
    commanders: List[str]          # commander candidates from commander_cards.csv
    synergy_commanders: List[str]  # commanders drawn from the theme's top synergies
def _parse_theme_tags(raw: str) -> List[str]:
if not raw:
return []
raw = raw.strip()
if not raw or raw == '[]':
return []
try:
# themeTags stored like "['Landfall', 'Ramp']" use literal_eval safely
val = ast.literal_eval(raw)
if isinstance(val, list):
return [str(x) for x in val if isinstance(x, str)]
except Exception:
pass
# Fallback naive parse
return [t.strip().strip("'\"") for t in raw.strip('[]').split(',') if t.strip()]
def scan_color_csvs(include_master: bool, max_rank: float, progress_every: int) -> Tuple[Dict[str, List[Tuple[float, str]]], Dict[str, List[Tuple[float, str]]]]:
    """Scan per-color CSV files, bucketing card names by theme tag.

    Returns (theme_hits, legendary_hits): each maps theme -> list of
    (edhrec_rank, card_name) sorted ascending by rank and trimmed to 120
    (all cards) / 80 (legendary cards) entries per theme.

    Args:
        include_master: also scan the monolithic cards.csv (skipped by default).
        max_rank: drop cards whose EDHREC rank exceeds this cutoff.
        progress_every: emit a stderr progress line every N rows (0 disables).
    """
    theme_hits: Dict[str, List[Tuple[float, str]]] = {}
    legendary_hits: Dict[str, List[Tuple[float, str]]] = {}
    files: List[Path] = []
    for fp in sorted(CSV_DIR.glob(COLOR_CSV_GLOB)):
        if fp.name == COMMANDER_FILE:
            continue
        # skip testdata
        if 'testdata' in str(fp):
            continue
        files.append(fp)
    # Fix: the glob '*_cards.csv' can never match 'cards.csv', so the previous
    # `if name == MASTER_FILE and not include_master: continue` check was dead
    # code and --include-master had no effect. Add the master file explicitly.
    if include_master:
        master_path = CSV_DIR / MASTER_FILE
        if master_path.exists() and master_path not in files:
            files.append(master_path)
    total_files = len(files)
    processed = 0
    for fp in files:
        processed += 1
        try:
            with fp.open(encoding='utf-8', newline='') as f:
                reader = csv.DictReader(f)
                line_idx = 0
                for row in reader:
                    line_idx += 1
                    if progress_every and line_idx % progress_every == 0:
                        print(f"[scan] {fp.name} line {line_idx}", file=sys.stderr, flush=True)
                    tags_raw = row.get('themeTags') or ''
                    if not tags_raw:
                        continue
                    try:
                        rank = float(row.get('edhrecRank') or 999999)
                    except Exception:
                        rank = 999999
                    if rank > max_rank:
                        continue
                    tags = _parse_theme_tags(tags_raw)
                    name = row.get('name') or ''
                    if not name:
                        continue
                    # Legendary detection: 'Legendary' must appear as a whole
                    # word in the type line (e.g. 'Legendary Creature').
                    is_legendary = False
                    try:
                        typ = row.get('type') or ''
                        if isinstance(typ, str) and 'Legendary' in typ.split():
                            is_legendary = True
                    except Exception:
                        pass
                    for t in tags:
                        if not t:
                            continue
                        theme_hits.setdefault(t, []).append((rank, name))
                        if is_legendary:
                            legendary_hits.setdefault(t, []).append((rank, name))
        except Exception as e:  # pragma: no cover
            print(f"[warn] failed reading {fp.name}: {e}", file=sys.stderr)
        print(f"[scan] completed {fp.name} ({processed}/{total_files})", file=sys.stderr, flush=True)
    # Trim each bucket to reasonable size (keep best ranks)
    for mapping, cap in ((theme_hits, 120), (legendary_hits, 80)):
        for t, lst in mapping.items():
            lst.sort(key=lambda x: x[0])
            if len(lst) > cap:
                del lst[cap:]
    return theme_hits, legendary_hits
def scan_commander_csv(max_rank: float) -> Dict[str, List[Tuple[float, str]]]:
    """Collect commander candidates per theme from commander_cards.csv.

    Returns theme -> [(edhrec_rank, name), ...] sorted ascending by rank and
    trimmed to the best 60 entries per theme.  A missing file or a read
    failure yields whatever was gathered so far (possibly empty).
    """
    path = CSV_DIR / COMMANDER_FILE
    hits: Dict[str, List[Tuple[float, str]]] = {}
    if not path.exists():
        return hits
    try:
        with path.open(encoding='utf-8', newline='') as handle:
            for row in csv.DictReader(handle):
                raw_tags = row.get('themeTags') or ''
                if not raw_tags:
                    continue
                tags = _parse_theme_tags(raw_tags)
                try:
                    rank = float(row.get('edhrecRank') or 999999)
                except Exception:
                    rank = 999999
                if rank > max_rank:
                    continue
                card_name = row.get('name') or ''
                if not card_name:
                    continue
                for tag in tags:
                    if tag:
                        hits.setdefault(tag, []).append((rank, card_name))
    except Exception as e:  # pragma: no cover
        print(f"[warn] failed reading {COMMANDER_FILE}: {e}", file=sys.stderr)
    for bucket in hits.values():
        bucket.sort(key=lambda pair: pair[0])
        del bucket[60:]
    return hits
def load_yaml_theme(path: Path) -> dict:
    """Best-effort parse of one theme YAML file.

    Returns {} on any failure: yaml dependency unavailable, unreadable file,
    or parse error.
    """
    try:
        if not yaml:
            return {}
        return yaml.safe_load(path.read_text(encoding='utf-8'))
    except Exception:
        return {}
def write_yaml_theme(path: Path, data: dict):
    """Serialize *data* to *path*, preserving key order and unicode card names."""
    txt = yaml.safe_dump(data, sort_keys=False, allow_unicode=True)
    path.write_text(txt, encoding='utf-8')
def build_suggestions(theme_hits: Dict[str, List[Tuple[float, str]]], commander_hits: Dict[str, List[Tuple[float, str]]], top: int, top_commanders: int, *, synergy_top=(3,2,1), min_examples: int = 5) -> Dict[str, ThemeSuggestion]:
    """Build per-theme card/commander suggestion records.

    Oversamples each pre-sorted bucket (3x for cards, 2x for commanders) so
    duplicate names can be dropped while still filling the requested quota.
    synergy_commanders is left empty here; it is derived later once each
    theme's synergy list is known from YAML.
    """
    def _take_unique(bucket: List[Tuple[float, str]], quota: int, oversample: int) -> List[str]:
        # Walk the best-ranked slice, keeping first occurrences up to quota.
        picked: List[str] = []
        for _, cand in bucket[: quota * oversample]:
            if cand not in picked:
                picked.append(cand)
                if len(picked) >= quota:
                    break
        return picked

    suggestions: Dict[str, ThemeSuggestion] = {}
    for t in sorted(set(theme_hits.keys()) | set(commander_hits.keys())):
        cards = _take_unique(theme_hits.get(t, []), top, 3)
        commanders = _take_unique(commander_hits.get(t, []), top_commanders, 2)
        suggestions[t] = ThemeSuggestion(cards=cards, commanders=commanders, synergy_commanders=[])
    return suggestions
def _derive_synergy_commanders(base_theme: str, data: dict, all_yaml: Dict[str, dict], commander_hits: Dict[str, List[Tuple[float, str]]], legendary_hits: Dict[str, List[Tuple[float, str]]], synergy_top=(3,2,1)) -> List[Tuple[str, str]]:
"""Pick synergy commanders with their originating synergy label.
Returns list of (commander_name, synergy_theme) preserving order of (top synergy, second, third) and internal ranking.
"""
synergies = data.get('synergies') or []
if not isinstance(synergies, list):
return []
pattern = list(synergy_top)
out: List[Tuple[str, str]] = []
for idx, count in enumerate(pattern):
if idx >= len(synergies):
break
s_name = synergies[idx]
bucket = commander_hits.get(s_name) or []
taken = 0
for _, cname in bucket:
if all(cname != existing for existing, _ in out):
out.append((cname, s_name))
taken += 1
if taken >= count:
break
if taken < count:
# fallback to legendary card hits tagged with that synergy
fallback_bucket = legendary_hits.get(s_name) or []
for _, cname in fallback_bucket:
if all(cname != existing for existing, _ in out):
out.append((cname, s_name))
taken += 1
if taken >= count:
break
return out
def _augment_synergies(data: dict, base_theme: str) -> bool:
"""Heuristically augment the 'synergies' list when it's sparse.
Rules:
- If synergies length >= 3, leave as-is.
- Start with existing synergies then append curated/enforced/inferred (in that order) if missing.
- For any theme whose display_name contains 'Counter' add 'Counters Matter' and 'Proliferate'.
Returns True if modified.
"""
synergies = data.get('synergies') if isinstance(data.get('synergies'), list) else []
if not isinstance(synergies, list):
return False
original = list(synergies)
if len(synergies) < 3:
for key in ('curated_synergies', 'enforced_synergies', 'inferred_synergies'):
lst = data.get(key)
if isinstance(lst, list):
for s in lst:
if isinstance(s, str) and s and s not in synergies:
synergies.append(s)
name = data.get('display_name') or base_theme
if isinstance(name, str) and 'counter' in name.lower():
for extra in ('Counters Matter', 'Proliferate'):
if extra not in synergies:
synergies.append(extra)
# Deduplicate preserving order
seen = set()
deduped = []
for s in synergies:
if s not in seen:
deduped.append(s)
seen.add(s)
if deduped != synergies:
synergies = deduped
if synergies != original:
data['synergies'] = synergies
return True
return False
def apply_to_yaml(suggestions: Dict[str, ThemeSuggestion], *, limit_yaml: int, force: bool, themes_filter: Set[str], commander_hits: Dict[str, List[Tuple[float, str]]], legendary_hits: Dict[str, List[Tuple[float, str]]], synergy_top=(3,2,1), min_examples: int = 5, augment_synergies: bool = False):
    """Write suggested editorial fields into the theme catalog YAML files.

    For each catalog YAML whose display_name has an entry in ``suggestions``:
    1. Optionally augment a sparse 'synergies' list (``augment_synergies``).
    2. Derive annotated synergy commanders following the ``synergy_top`` pattern.
    3. Fill example_cards / example_commanders (overwriting when ``force``).
    4. Re-annotate legacy " - Synergy (<base theme>)" suffixes so they name the
       actual source synergy instead of the theme itself.
    5. Promote synergy picks into example_commanders until ``min_examples``.
    6. Attach the remaining (non-promoted) picks as 'synergy_commanders'.

    Args:
        suggestions: display_name -> ThemeSuggestion with cards/commanders.
        limit_yaml: Stop after this many files were modified (0 = unlimited).
        force: Overwrite existing example lists instead of only filling gaps.
        themes_filter: If non-empty, restrict processing to these display names.
        commander_hits: synergy theme -> ranked (score, commander name) pairs.
        legendary_hits: fallback pool; synergy theme -> ranked (score, name).
        synergy_top: Per-synergy pick counts, e.g. (3, 2, 1).
        min_examples: Target minimum size of example_commanders.
        augment_synergies: Run _augment_synergies before deriving commanders.

    Returns:
        Number of YAML files rewritten on disk.
    """
    updated = 0
    # Preload all YAML for synergy lookups (avoid repeated disk IO inside loop)
    all_yaml_cache: Dict[str, dict] = {}
    for p in CATALOG_DIR.glob('*.yml'):
        try:
            all_yaml_cache[p.name] = load_yaml_theme(p)
        except Exception:
            pass
    for path in sorted(CATALOG_DIR.glob('*.yml')):
        data = load_yaml_theme(path)
        if not isinstance(data, dict):
            continue
        display = data.get('display_name')
        if not isinstance(display, str) or not display:
            continue
        if themes_filter and display not in themes_filter:
            continue
        sug = suggestions.get(display)
        if not sug:
            continue
        changed = False
        # Optional synergy augmentation prior to commander derivation
        if augment_synergies and _augment_synergies(data, display):
            changed = True
        # Derive synergy_commanders before promotion logic
        synergy_cmds = _derive_synergy_commanders(display, data, all_yaml_cache, commander_hits, legendary_hits, synergy_top=synergy_top)
        # Annotate synergy_commanders with their synergy source for transparency
        synergy_cmd_names = [f"{c} - Synergy ({src})" for c, src in synergy_cmds]
        if (force or not data.get('example_cards')) and sug.cards:
            data['example_cards'] = sug.cards
            changed = True
        existing_examples: List[str] = list(data.get('example_commanders') or []) if isinstance(data.get('example_commanders'), list) else []
        if force or not existing_examples:
            if sug.commanders:
                data['example_commanders'] = list(sug.commanders)
                existing_examples = data['example_commanders']
                changed = True
        # (Attachment of synergy_commanders moved to after promotion so we can filter duplicates with example_commanders)
        # Re-annotate existing example_commanders if they use old base-theme annotation pattern
        if existing_examples and synergy_cmds:
            # Detect old pattern: ends with base theme name inside parentheses
            needs_reannotate = False
            old_suffix = f" - Synergy ({display})"
            for ex in existing_examples:
                if ex.endswith(old_suffix):
                    needs_reannotate = True
                    break
            if needs_reannotate:
                # Build mapping from commander name to synergy source
                source_map = {name: src for name, src in synergy_cmds}
                new_examples: List[str] = []
                for ex in existing_examples:
                    if ' - Synergy (' in ex:
                        base_name = ex.split(' - Synergy ')[0]
                        if base_name in source_map:
                            new_examples.append(f"{base_name} - Synergy ({source_map[base_name]})")
                            continue
                    new_examples.append(ex)
                if new_examples != existing_examples:
                    data['example_commanders'] = new_examples
                    existing_examples = new_examples
                    changed = True
        # Promotion: ensure at least min_examples in example_commanders by moving from synergy list (without duplicates)
        if (len(existing_examples) < min_examples) and synergy_cmd_names:
            needed = min_examples - len(existing_examples)
            promoted = []
            for cname, source_synergy in synergy_cmds:
                # Avoid duplicate even with annotation
                if not any(cname == base.split(' - Synergy ')[0] for base in existing_examples):
                    annotated = f"{cname} - Synergy ({source_synergy})"
                    existing_examples.append(annotated)
                    promoted.append(cname)
                    needed -= 1
                    if needed <= 0:
                        break
            if promoted:
                data['example_commanders'] = existing_examples
                changed = True
        # After any potential promotions / re-annotations, attach synergy_commanders excluding any commanders already present in example_commanders
        existing_base_names = {ex.split(' - Synergy ')[0] for ex in (data.get('example_commanders') or []) if isinstance(ex, str)}
        filtered_synergy_cmd_names = []
        for entry in synergy_cmd_names:
            base = entry.split(' - Synergy ')[0]
            if base not in existing_base_names:
                filtered_synergy_cmd_names.append(entry)
        prior_synergy_cmds = data.get('synergy_commanders') if isinstance(data.get('synergy_commanders'), list) else []
        if prior_synergy_cmds != filtered_synergy_cmd_names:
            # Only persist when there is something to write, we're forcing, or we
            # are clearing a previously-stored list.
            if filtered_synergy_cmd_names or force or prior_synergy_cmds:
                data['synergy_commanders'] = filtered_synergy_cmd_names
                changed = True
        if changed:
            write_yaml_theme(path, data)
            updated += 1
            print(f"[apply] updated {path.name}")
            if limit_yaml and updated >= limit_yaml:
                print(f"[apply] reached limit {limit_yaml}; stopping")
                break
    return updated
def main(): # pragma: no cover
    """CLI entry point: scan CSVs, build suggestions, then dry-run print or --apply to YAML."""
    parser = argparse.ArgumentParser(description='Generate example_cards / example_commanders suggestions for theme YAML')
    parser.add_argument('--themes', type=str, help='Comma-separated subset of display names to restrict')
    parser.add_argument('--top', type=int, default=8, help='Target number of example_cards suggestions')
    parser.add_argument('--top-commanders', type=int, default=5, help='Target number of example_commanders suggestions')
    parser.add_argument('--max-rank', type=float, default=60000, help='Skip cards with EDHREC rank above this threshold')
    parser.add_argument('--include-master', action='store_true', help='Include large cards.csv in scan (slower)')
    parser.add_argument('--progress-every', type=int, default=0, help='Emit a progress line every N rows per file')
    parser.add_argument('--apply', action='store_true', help='Write missing fields into YAML files')
    parser.add_argument('--limit-yaml', type=int, default=0, help='Limit number of YAML files modified (0 = unlimited)')
    parser.add_argument('--force', action='store_true', help='Overwrite existing example lists')
    parser.add_argument('--min-examples', type=int, default=5, help='Minimum desired example_commanders; promote from synergy_commanders if short')
    parser.add_argument('--augment-synergies', action='store_true', help='Heuristically augment sparse synergies list before deriving synergy_commanders')
    args = parser.parse_args()
    themes_filter: Set[str] = set()
    if args.themes:
        themes_filter = {t.strip() for t in args.themes.split(',') if t.strip()}
    # Status chatter goes to stderr so stdout stays clean for the summary output.
    print('[info] scanning CSVs...', file=sys.stderr)
    theme_hits, legendary_hits = scan_color_csvs(args.include_master, args.max_rank, args.progress_every)
    print('[info] scanning commander CSV...', file=sys.stderr)
    commander_hits = scan_commander_csv(args.max_rank)
    print('[info] building suggestions...', file=sys.stderr)
    suggestions = build_suggestions(theme_hits, commander_hits, args.top, args.top_commanders, min_examples=args.min_examples)
    if not args.apply:
        # Dry run: print JSON-like summary for filtered subset (or first 25 themes)
        to_show = sorted(themes_filter) if themes_filter else list(sorted(suggestions.keys())[:25])
        for t in to_show:
            s = suggestions.get(t)
            if not s:
                continue
            print(f"\n=== {t} ===")
            print('example_cards:', ', '.join(s.cards) or '(none)')
            print('example_commanders:', ', '.join(s.commanders) or '(none)')
            print('synergy_commanders: (computed at apply time)')
        print('\n[info] dry-run complete (use --apply to write)')
        return
    # PyYAML is optional at import time; applying edits requires it.
    if yaml is None:
        print('ERROR: PyYAML not installed; cannot apply changes.', file=sys.stderr)
        sys.exit(1)
    updated = apply_to_yaml(suggestions, limit_yaml=args.limit_yaml, force=args.force, themes_filter=themes_filter, commander_hits=commander_hits, legendary_hits=legendary_hits, synergy_top=(3,2,1), min_examples=args.min_examples, augment_synergies=args.augment_synergies)
    print(f'[info] updated {updated} YAML files')
if __name__ == '__main__': # pragma: no cover
    main()

View file

@ -0,0 +1,149 @@
"""Phase D: Lint editorial metadata for theme YAML files.
Checks (non-fatal unless --strict):
- example_commanders/example_cards length & uniqueness
- deck_archetype membership in allowed set (warn if unknown)
- Cornerstone themes have at least one example commander & card
Exit codes:
0: No errors (warnings may still print)
1: Structural / fatal errors (in strict mode or malformed YAML)
"""
from __future__ import annotations
import argparse
from pathlib import Path
from typing import List, Set
import re
import sys
try:
import yaml # type: ignore
except Exception: # pragma: no cover
yaml = None
ROOT = Path(__file__).resolve().parents[2]  # repository root (three levels up from this script)
CATALOG_DIR = ROOT / 'config' / 'themes' / 'catalog'  # per-theme YAML files
# Known deck_archetype values; anything else triggers a lint warning.
ALLOWED_ARCHETYPES: Set[str] = {
    'Lands', 'Graveyard', 'Planeswalkers', 'Tokens', 'Counters', 'Spells', 'Artifacts', 'Enchantments', 'Politics'
}
# High-traffic themes expected to always carry example commanders/cards.
CORNERSTONE: Set[str] = {
    'Landfall', 'Reanimate', 'Superfriends', 'Tokens Matter', '+1/+1 Counters'
}
def lint(strict: bool) -> int:
    """Lint editorial metadata in every theme YAML under CATALOG_DIR.

    Accumulates human-readable warnings (advisory) and errors (structural),
    prints both summaries, and returns 1 only when errors exist AND ``strict``
    is True; otherwise returns 0.
    """
    if yaml is None:
        print('YAML support not available (PyYAML missing); skipping lint.')
        return 0
    if not CATALOG_DIR.exists():
        print('Catalog directory missing; nothing to lint.')
        return 0
    errors: List[str] = []
    warnings: List[str] = []
    cornerstone_present: Set[str] = set()
    seen_display: Set[str] = set()
    # Matches the trailing " - Synergy (<source theme>)" annotation suffix.
    ann_re = re.compile(r" - Synergy \(([^)]+)\)$")
    for path in sorted(CATALOG_DIR.glob('*.yml')):
        try:
            data = yaml.safe_load(path.read_text(encoding='utf-8'))
        except Exception as e:
            errors.append(f"Failed to parse {path.name}: {e}")
            continue
        if not isinstance(data, dict):
            errors.append(f"YAML not mapping: {path.name}")
            continue
        name = str(data.get('display_name') or '').strip()
        if not name:
            continue
        # Skip deprecated alias placeholder files
        notes_field = data.get('notes')
        if isinstance(notes_field, str) and 'Deprecated alias file' in notes_field:
            continue
        if name in seen_display:
            # Already processed a canonical file for this display name; skip duplicates (aliases)
            continue
        seen_display.add(name)
        ex_cmd = data.get('example_commanders') or []
        ex_cards = data.get('example_cards') or []
        synergy_cmds = data.get('synergy_commanders') if isinstance(data.get('synergy_commanders'), list) else []
        theme_synergies = data.get('synergies') if isinstance(data.get('synergies'), list) else []
        if not isinstance(ex_cmd, list):
            errors.append(f"example_commanders not list in {path.name}")
            ex_cmd = []
        if not isinstance(ex_cards, list):
            errors.append(f"example_cards not list in {path.name}")
            ex_cards = []
        # Length caps
        if len(ex_cmd) > 12:
            # NOTE(review): message says "trimmed" but no trimming happens here — wording only.
            warnings.append(f"{name}: example_commanders trimmed to 12 (found {len(ex_cmd)})")
        if len(ex_cards) > 20:
            warnings.append(f"{name}: example_cards length {len(ex_cards)} > 20 (consider trimming)")
        if synergy_cmds and len(synergy_cmds) > 6:
            warnings.append(f"{name}: synergy_commanders length {len(synergy_cmds)} > 6 (3/2/1 pattern expected)")
        if ex_cmd and len(ex_cmd) < 5:
            warnings.append(f"{name}: example_commanders only {len(ex_cmd)} (<5 minimum target)")
        if not synergy_cmds and any(' - Synergy (' in c for c in ex_cmd):
            # If synergy_commanders intentionally filtered out because all synergy picks were promoted, skip warning.
            # Heuristic: if at least 5 examples and every annotated example has unique base name, treat as satisfied.
            base_names = {c.split(' - Synergy ')[0] for c in ex_cmd if ' - Synergy (' in c}
            if not (len(ex_cmd) >= 5 and len(base_names) >= 1):
                warnings.append(f"{name}: has synergy-annotated example_commanders but missing synergy_commanders list")
        # Uniqueness
        if len(set(ex_cmd)) != len(ex_cmd):
            warnings.append(f"{name}: duplicate entries in example_commanders")
        if len(set(ex_cards)) != len(ex_cards):
            warnings.append(f"{name}: duplicate entries in example_cards")
        if synergy_cmds:
            base_synergy_names = [c.split(' - Synergy ')[0] for c in synergy_cmds]
            if len(set(base_synergy_names)) != len(base_synergy_names):
                warnings.append(f"{name}: duplicate entries in synergy_commanders (base names)")
        # Annotation validation: each annotated example should reference a synergy in theme synergies
        for c in ex_cmd:
            if ' - Synergy (' in c:
                m = ann_re.search(c)
                if m:
                    syn = m.group(1).strip()
                    if syn and syn not in theme_synergies:
                        warnings.append(f"{name}: example commander annotation synergy '{syn}' not in theme synergies list")
        # Cornerstone coverage
        if name in CORNERSTONE:
            if not ex_cmd:
                warnings.append(f"Cornerstone theme {name} missing example_commanders")
            if not ex_cards:
                warnings.append(f"Cornerstone theme {name} missing example_cards")
            else:
                # NOTE(review): cornerstone_present is populated here but never read
                # afterwards — possibly intended for a missing-coverage summary; confirm.
                cornerstone_present.add(name)
        # Archetype
        arch = data.get('deck_archetype')
        if arch and arch not in ALLOWED_ARCHETYPES:
            warnings.append(f"{name}: deck_archetype '{arch}' not in allowed set {sorted(ALLOWED_ARCHETYPES)}")
    # Summaries
    if warnings:
        print('LINT WARNINGS:')
        for w in warnings:
            print(f" - {w}")
    if errors:
        print('LINT ERRORS:')
        for e in errors:
            print(f" - {e}")
    if errors and strict:
        return 1
    return 0
def main(): # pragma: no cover
    """CLI entry point: run the editorial lint and exit non-zero on strict failures."""
    parser = argparse.ArgumentParser(description='Lint editorial metadata for theme YAML files (Phase D)')
    parser.add_argument('--strict', action='store_true', help='Treat errors as fatal (non-zero exit)')
    opts = parser.parse_args()
    exit_code = lint(opts.strict)
    if exit_code:
        sys.exit(exit_code)
if __name__ == '__main__':
    main()

View file

@ -0,0 +1,260 @@
"""Validation script for theme catalog (Phase C groundwork).
Performs:
- Pydantic model validation
- Duplicate theme detection
- Enforced synergies presence check (from whitelist)
- Normalization idempotency check (optional --rebuild-pass)
- Synergy cap enforcement (allowing soft exceed when curated+enforced exceed cap)
- JSON Schema export (--schema / --schema-out)
Exit codes:
0 success
1 validation errors (structural)
2 policy errors (duplicates, missing enforced synergies, cap violations)
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import Dict, List, Set
try:
import yaml # type: ignore
except Exception:
yaml = None
ROOT = Path(__file__).resolve().parents[2]  # repository root
CODE_ROOT = ROOT / 'code'
# Ensure project modules under code/ are importable when run as a standalone script.
if str(CODE_ROOT) not in sys.path:
    sys.path.insert(0, str(CODE_ROOT))
from type_definitions_theme_catalog import ThemeCatalog, ThemeYAMLFile # type: ignore
from scripts.extract_themes import load_whitelist_config # type: ignore
from scripts.build_theme_catalog import build_catalog # type: ignore
CATALOG_JSON = ROOT / 'config' / 'themes' / 'theme_list.json'  # merged catalog output
def load_catalog_file() -> Dict:
    """Read and parse the merged catalog JSON; SystemExit if the file is missing."""
    if not CATALOG_JSON.exists():
        raise SystemExit(f"Catalog JSON missing: {CATALOG_JSON}")
    raw_text = CATALOG_JSON.read_text(encoding='utf-8')
    return json.loads(raw_text)
def validate_catalog(data: Dict, *, whitelist: Dict, allow_soft_exceed: bool = True) -> List[str]:
    """Validate the merged catalog dict; return a list of error strings (empty = valid).

    Performs Pydantic structural validation, duplicate theme detection,
    enforced-synergy presence checks, and (currently permissive) synergy cap
    checks. May mutate ``data`` to inject synthetic provenance / generated_from
    fields for legacy extraction outputs.
    """
    errors: List[str] = []
    # If provenance missing (legacy extraction output), inject synthetic one so subsequent checks can proceed
    if 'provenance' not in data:
        data['provenance'] = {
            'mode': 'legacy-extraction',
            'generated_at': 'unknown',
            'curated_yaml_files': 0,
            'synergy_cap': int(whitelist.get('synergy_cap', 0) or 0),
            'inference': 'unknown',
            'version': 'pre-merge-fallback'
        }
    if 'generated_from' not in data:
        data['generated_from'] = 'legacy (tagger + constants)'
    try:
        catalog = ThemeCatalog(**data)
    except Exception as e: # structural validation
        errors.append(f"Pydantic validation failed: {e}")
        return errors
    # Duplicate detection
    seen: Set[str] = set()
    dups: Set[str] = set()
    for t in catalog.themes:
        if t.theme in seen:
            dups.add(t.theme)
        seen.add(t.theme)
    if dups:
        errors.append(f"Duplicate theme entries detected: {sorted(dups)}")
    enforced_cfg: Dict[str, List[str]] = whitelist.get('enforced_synergies', {}) or {}
    synergy_cap = int(whitelist.get('synergy_cap', 0) or 0)
    # Fast index
    theme_map = {t.theme: t for t in catalog.themes}
    # Enforced presence & cap checks
    for anchor, required in enforced_cfg.items():
        if anchor not in theme_map:
            continue # pruning may allow non-always_include anchors to drop
        syn = theme_map[anchor].synergies
        missing = [r for r in required if r not in syn]
        if missing:
            errors.append(f"Anchor '{anchor}' missing enforced synergies: {missing}")
        if synergy_cap and len(syn) > synergy_cap:
            if not allow_soft_exceed:
                errors.append(f"Anchor '{anchor}' exceeds synergy cap ({len(syn)}>{synergy_cap})")
    # Cap enforcement for non-soft-exceeding cases
    # NOTE(review): this loop is currently a no-op (both branches fall through to
    # `pass`); it is kept as a placeholder for a stricter future cap policy.
    if synergy_cap:
        for t in catalog.themes:
            if len(t.synergies) > synergy_cap:
                # Determine if soft exceed allowed: curated+enforced > cap (we can't reconstruct curated precisely here)
                # Heuristic: if enforced list for anchor exists AND all enforced appear AND len(enforced)>=cap then allow.
                enforced = set(enforced_cfg.get(t.theme, []))
                if not (allow_soft_exceed and enforced and enforced.issubset(set(t.synergies)) and len(enforced) >= synergy_cap):
                    # Allow also if enforced+first curated guess (inference fallback) obviously pushes over cap (can't fully know); skip strict enforcement
                    pass # Keep heuristic permissive for now
    return errors
def validate_yaml_files(*, whitelist: Dict, strict_alias: bool = False) -> List[str]:
    """Validate individual YAML catalog files.

    Checks per-file schema (ThemeYAMLFile), duplicate ids, deprecated alias
    display_names, and that every always_include theme has a YAML file.

    Args:
        whitelist: Parsed whitelist config (normalization map, always_include).
        strict_alias: If True, treat presence of a deprecated alias
            (normalization key) as a hard error instead of a soft ignored
            transitional state.

    Returns:
        List of error strings (empty = valid).
    """
    errors: List[str] = []
    if yaml is None:
        # Fix: previously, with PyYAML missing, safe_load was skipped and `raw`
        # stayed None, so EVERY file was misreported as "YAML not a mapping" and
        # a spurious always_include error followed. Skip per-file validation
        # entirely instead (mirrors the lint script's graceful degradation).
        return errors
    catalog_dir = ROOT / 'config' / 'themes' / 'catalog'
    if not catalog_dir.exists():
        return errors
    seen_ids: Set[str] = set()
    normalization_map: Dict[str, str] = whitelist.get('normalization', {}) if isinstance(whitelist.get('normalization'), dict) else {}
    always_include = set(whitelist.get('always_include', []) or [])
    present_always: Set[str] = set()
    for path in sorted(catalog_dir.glob('*.yml')):
        try:
            raw = yaml.safe_load(path.read_text(encoding='utf-8'))
        except Exception:
            errors.append(f"Failed to parse YAML: {path.name}")
            continue
        if not isinstance(raw, dict):
            errors.append(f"YAML not a mapping: {path.name}")
            continue
        try:
            obj = ThemeYAMLFile(**raw)
        except Exception as e:
            errors.append(f"YAML schema violation {path.name}: {e}")
            continue
        # Duplicate id detection
        if obj.id in seen_ids:
            errors.append(f"Duplicate YAML id: {obj.id}")
        seen_ids.add(obj.id)
        # Normalization alias check: display_name should already be normalized if in map
        if normalization_map and obj.display_name in normalization_map.keys():
            if strict_alias:
                errors.append(f"Alias display_name present in strict mode: {obj.display_name} ({path.name})")
            # else soft-ignore for transitional period
        if obj.display_name in always_include:
            present_always.add(obj.display_name)
    missing_always = always_include - present_always
    if missing_always:
        # Not necessarily fatal if those only exist in analytics; warn for now.
        errors.append(f"always_include themes missing YAML files: {sorted(missing_always)}")
    return errors
def main(): # pragma: no cover
    """CLI entry point: emit schemas, or validate catalog JSON + YAML files.

    Exits 2 when any validation error is found; schema flags print and return
    without validating.
    """
    parser = argparse.ArgumentParser(description='Validate theme catalog (Phase C)')
    parser.add_argument('--schema', action='store_true', help='Print JSON Schema for catalog and exit')
    parser.add_argument('--schema-out', type=str, help='Write JSON Schema to file path')
    parser.add_argument('--rebuild-pass', action='store_true', help='Rebuild catalog in-memory and ensure stable equality vs file')
    parser.add_argument('--fail-soft-exceed', action='store_true', help='Treat synergy list length > cap as error even for soft exceed')
    parser.add_argument('--yaml-schema', action='store_true', help='Print JSON Schema for per-file ThemeYAML and exit')
    parser.add_argument('--strict-alias', action='store_true', help='Fail if any YAML uses an alias name slated for normalization')
    args = parser.parse_args()
    if args.schema:
        schema = ThemeCatalog.model_json_schema()
        if args.schema_out:
            Path(args.schema_out).write_text(json.dumps(schema, indent=2), encoding='utf-8')
        else:
            print(json.dumps(schema, indent=2))
        return
    if args.yaml_schema:
        schema = ThemeYAMLFile.model_json_schema()
        if args.schema_out:
            Path(args.schema_out).write_text(json.dumps(schema, indent=2), encoding='utf-8')
        else:
            print(json.dumps(schema, indent=2))
        return
    whitelist = load_whitelist_config()
    data = load_catalog_file()
    errors = validate_catalog(data, whitelist=whitelist, allow_soft_exceed=not args.fail_soft_exceed)
    errors.extend(validate_yaml_files(whitelist=whitelist, strict_alias=args.strict_alias))
    if args.rebuild_pass:
        rebuilt = build_catalog(limit=0, verbose=False)
        # Compare canonical dict dumps (ordering of themes is deterministic: sorted by theme name in build script)
        normalization_map: Dict[str, str] = whitelist.get('normalization', {}) if isinstance(whitelist.get('normalization'), dict) else {}
        def _canon(theme_list):
            # Collapse aliases via the normalization map and sort synergies so
            # the two dumps compare on content, not ordering.
            canon: Dict[str, Dict] = {}
            for t in theme_list:
                name = t.get('theme')
                if not isinstance(name, str):
                    continue
                name_canon = normalization_map.get(name, name)
                sy = t.get('synergies', [])
                if not isinstance(sy, list):
                    sy_sorted = []
                else:
                    # Apply normalization inside synergies too
                    sy_norm = [normalization_map.get(s, s) for s in sy if isinstance(s, str)]
                    sy_sorted = sorted(set(sy_norm))
                entry = {
                    'theme': name_canon,
                    'synergies': sy_sorted,
                }
                # Keep first (curated/enforced precedence differences ignored for alias collapse)
                canon.setdefault(name_canon, entry)
            # Return list sorted by canonical name
            return [canon[k] for k in sorted(canon.keys())]
        file_dump = json.dumps(_canon(data.get('themes', [])), sort_keys=True)
        rebuilt_dump = json.dumps(_canon(rebuilt.get('themes', [])), sort_keys=True)
        if file_dump != rebuilt_dump:
            # Provide lightweight diff diagnostics (first 10 differing characters and sample themes)
            try:
                import difflib
                file_list = json.loads(file_dump)
                reb_list = json.loads(rebuilt_dump)
                file_names = [t['theme'] for t in file_list]
                reb_names = [t['theme'] for t in reb_list]
                missing_in_reb = sorted(set(file_names) - set(reb_names))[:5]
                extra_in_reb = sorted(set(reb_names) - set(file_names))[:5]
                # Find first theme with differing synergies
                synergy_mismatch = None
                for f in file_list:
                    for r in reb_list:
                        if f['theme'] == r['theme'] and f['synergies'] != r['synergies']:
                            synergy_mismatch = (f['theme'], f['synergies'][:10], r['synergies'][:10])
                            break
                    if synergy_mismatch:
                        break
                diff_note_parts = []
                if missing_in_reb:
                    diff_note_parts.append(f"missing:{missing_in_reb}")
                if extra_in_reb:
                    diff_note_parts.append(f"extra:{extra_in_reb}")
                if synergy_mismatch:
                    diff_note_parts.append(f"synergy_mismatch:{synergy_mismatch}")
                if not diff_note_parts:
                    # generic char diff snippet
                    for line in difflib.unified_diff(file_dump.splitlines(), rebuilt_dump.splitlines(), n=1):
                        diff_note_parts.append(line)
                        if len(diff_note_parts) > 10:
                            break
                errors.append('Normalization / rebuild pass produced differing theme list output ' + ' | '.join(diff_note_parts))
            except Exception:
                errors.append('Normalization / rebuild pass produced differing theme list output (diff unavailable)')
    if errors:
        print('VALIDATION FAILED:')
        for e in errors:
            print(f" - {e}")
        sys.exit(2)
    print('Theme catalog validation passed.')
if __name__ == '__main__':
    main()