mirror of
https://github.com/mwisnowski/mtg_python_deckbuilder.git
synced 2025-09-22 04:50:46 +02:00
367 lines
15 KiB
Python
367 lines
15 KiB
Python
"""Phase B: Merge curated YAML catalog with regenerated analytics to build theme_list.json.

See roadmap Phase B goals. This script unifies generation:
 - Discovers themes (constants + tagger + CSV dynamic tags)
 - Applies whitelist governance (normalization, pruning, always_include)
 - Recomputes frequencies & PMI co-occurrence for inference
 - Loads curated YAML files (Phase A outputs) for editorial overrides
 - Merges curated, enforced, and inferred synergies with precedence
 - Applies synergy cap without truncating curated or enforced entries
 - Emits theme_list.json with provenance block

Opt-in via env THEME_CATALOG_MODE=merge (or build/phaseb). Or run manually:
    python code/scripts/build_theme_catalog.py --verbose

This script is intentionally side-effect only (it writes JSON). Unit tests for
Phase C will add schema validation; for now we focus on deterministic, stable output.
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from collections import Counter
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
|
try: # Optional
|
|
import yaml # type: ignore
|
|
except Exception: # pragma: no cover
|
|
yaml = None
|
|
|
|
# Repository root, taken as two directory levels above this file's folder.
ROOT = Path(__file__).resolve().parents[2]
CODE_ROOT = ROOT.joinpath('code')
# Put the code directory on sys.path (once) so the `scripts.*` import below resolves.
if str(CODE_ROOT) not in sys.path:
    sys.path.insert(0, str(CODE_ROOT))
|
|
|
|
from scripts.extract_themes import ( # type: ignore
|
|
BASE_COLORS,
|
|
collect_theme_tags_from_constants,
|
|
collect_theme_tags_from_tagger_source,
|
|
gather_theme_tag_rows,
|
|
tally_tag_frequencies_by_base_color,
|
|
compute_cooccurrence,
|
|
cooccurrence_scores_for,
|
|
derive_synergies_for_tags,
|
|
apply_normalization,
|
|
load_whitelist_config,
|
|
should_keep_theme,
|
|
)
|
|
|
|
# Directory scanned for curated per-theme YAML files.
CATALOG_DIR = ROOT.joinpath('config', 'themes', 'catalog')
# Destination of the merged catalog written by main().
OUTPUT_JSON = ROOT.joinpath('config', 'themes', 'theme_list.json')
|
|
|
|
|
|
@dataclass
class ThemeYAML:
    """In-memory form of one curated theme YAML file.

    The first six fields are required; everything after them is optional and
    defaults to "absent". The trailing editorial fields are the ones
    build_catalog() looks up when it carries metadata forward, declared here
    so that lookup refers to real attributes instead of undeclared ones.
    """

    id: str
    display_name: str
    curated_synergies: List[str]
    enforced_synergies: List[str]
    inferred_synergies: List[str]
    synergies: List[str]
    primary_color: Optional[str] = None
    secondary_color: Optional[str] = None
    notes: str = ''
    # Optional editorial metadata; None means "not provided".
    # New fields are appended after `notes` so positional construction keeps working.
    example_commanders: Optional[List[str]] = None
    example_cards: Optional[List[str]] = None
    deck_archetype: Optional[str] = None
    popularity_hint: Optional[Any] = None
    synergy_commanders: Optional[List[str]] = None
|
|
|
|
|
|
def _log(msg: str, verbose: bool): # pragma: no cover
|
|
if verbose:
|
|
print(f"[build_theme_catalog] {msg}", file=sys.stderr)
|
|
|
|
|
|
def load_catalog_yaml(verbose: bool) -> Dict[str, ThemeYAML]:
    """Load curated theme files from CATALOG_DIR, keyed by display name.

    Returns an empty dict when PyYAML is unavailable or the catalog directory
    does not exist. Files are visited in sorted order; a later file with the
    same display_name replaces an earlier one. A file is skipped when it
    cannot be read or parsed, when its top level is not a mapping, when its
    notes contain 'Deprecated alias file', or when it has no display_name.

    Args:
        verbose: When true, skipped files are reported through _log().

    Returns:
        Mapping of display_name -> ThemeYAML.
    """
    out: Dict[str, ThemeYAML] = {}
    if yaml is None or not CATALOG_DIR.exists():
        return out

    # Optional editorial keys that build_catalog() carries into the output.
    list_keys = ('example_commanders', 'example_cards', 'synergy_commanders')
    scalar_keys = ('deck_archetype', 'popularity_hint')

    for path in sorted(CATALOG_DIR.glob('*.yml')):
        try:
            data = yaml.safe_load(path.read_text(encoding='utf-8'))
        except Exception:
            _log(f"Failed reading {path.name}", verbose)
            continue
        if not isinstance(data, dict):
            continue

        # Skip deprecated alias placeholder files (marked in notes).
        notes_field = data.get('notes')
        if isinstance(notes_field, str) and 'Deprecated alias file' in notes_field:
            continue

        try:
            ty = ThemeYAML(
                id=str(data.get('id') or ''),
                display_name=str(data.get('display_name') or ''),
                curated_synergies=list(data.get('curated_synergies') or []),
                enforced_synergies=list(data.get('enforced_synergies') or []),
                inferred_synergies=list(data.get('inferred_synergies') or []),
                synergies=list(data.get('synergies') or []),
                primary_color=data.get('primary_color'),
                secondary_color=data.get('secondary_color'),
                notes=str(data.get('notes') or ''),
            )
        except Exception as exc:
            # e.g. a synergy field holding a non-iterable scalar.
            _log(f"Skipping {path.name}: {exc}", verbose)
            continue
        if not ty.display_name:
            continue

        # Attach editorial metadata as plain attributes. setattr is used rather
        # than constructor keywords so this works whether or not ThemeYAML
        # declares these fields. List-valued keys are accepted only when they
        # really are lists, so a stray string is not iterated character by
        # character downstream.
        for key in list_keys:
            value = data.get(key)
            if isinstance(value, list) and value:
                setattr(ty, key, value)
        for key in scalar_keys:
            value = data.get(key)
            if value:
                setattr(ty, key, value)

        out[ty.display_name] = ty
    return out
|
|
|
|
|
|
def regenerate_analytics(verbose: bool) -> Dict[str, Any]:
    """Rediscover theme tags and recompute frequency / co-occurrence analytics.

    Each data-gathering stage is best-effort: if it raises, the failure is
    reported through _log() (visible with verbose) and an empty fallback is
    used, so the build can still proceed.

    Args:
        verbose: When true, stage failures are written to stderr.

    Returns:
        Dict with keys 'theme_tags', 'frequencies', 'co_map', 'tag_counts',
        'total_rows' and 'whitelist'.
    """
    # --- Discovery: constants + tagger source + tags found in CSV rows ---
    theme_tags: Set[str] = set()
    theme_tags |= collect_theme_tags_from_constants()
    theme_tags |= collect_theme_tags_from_tagger_source()
    try:
        csv_rows = gather_theme_tag_rows()
        for row_tags in csv_rows:
            for t in row_tags:
                if isinstance(t, str) and t:
                    theme_tags.add(t)
    except Exception as exc:
        _log(f"CSV tag discovery failed: {exc}", verbose)
        csv_rows = []

    # --- Whitelist governance settings ---
    whitelist = load_whitelist_config()
    raw_normalization = whitelist.get('normalization')
    normalization_map: Dict[str, str] = raw_normalization if isinstance(raw_normalization, dict) else {}
    exclusions: Set[str] = set(whitelist.get('exclusions', []) or [])
    protected_prefixes: List[str] = list(whitelist.get('protected_prefixes', []) or [])
    protected_suffixes: List[str] = list(whitelist.get('protected_suffixes', []) or [])
    min_overrides: Dict[str, int] = whitelist.get('min_frequency_overrides', {}) or {}

    if normalization_map:
        theme_tags = apply_normalization(theme_tags, normalization_map)
    blacklist = {"Draw Triggers"}
    theme_tags = {t for t in theme_tags if t and t not in blacklist and t not in exclusions}

    # --- Per-color frequencies ---
    try:
        frequencies = tally_tag_frequencies_by_base_color()
    except Exception as exc:
        _log(f"Frequency tally failed: {exc}", verbose)
        frequencies = {}

    # Pruning only happens when frequency data exists; without it every
    # discovered tag is retained as-is.
    if frequencies:
        def total_count(t: str) -> int:
            s = 0
            for c in BASE_COLORS.keys():
                try:
                    s += int(frequencies.get(c, {}).get(t, 0))
                except Exception:
                    # Unusable count for this color: treat as zero.
                    pass
            return s

        kept: Set[str] = set()
        for t in theme_tags:
            if should_keep_theme(t, total_count(t), whitelist, protected_prefixes, protected_suffixes, min_overrides):
                kept.add(t)
        for extra in whitelist.get('always_include', []) or []:
            kept.add(str(extra))
        theme_tags = kept

    # --- Co-occurrence (reuses the rows gathered above when available) ---
    try:
        rows = csv_rows if csv_rows else gather_theme_tag_rows()
        co_map, tag_counts, total_rows = compute_cooccurrence(rows)
    except Exception as exc:
        _log(f"Co-occurrence computation failed: {exc}", verbose)
        co_map, tag_counts, total_rows = {}, Counter(), 0

    return dict(
        theme_tags=theme_tags,
        frequencies=frequencies,
        co_map=co_map,
        tag_counts=tag_counts,
        total_rows=total_rows,
        whitelist=whitelist,
    )
|
|
|
|
|
|
def _primary_secondary(theme: str, freqs: Dict[str, Dict[str, int]]):
    """Pick the two base colors in which a theme appears most often.

    Colors are ranked by descending count, ties broken by color key. Returns
    (None, None) when freqs is empty or the top count is not positive. The
    second element is None when no other color has a positive count.
    """
    if not freqs:
        return None, None

    def _count_for(color: str) -> int:
        # A count that cannot be read as an int is treated as zero.
        try:
            return int(freqs.get(color, {}).get(theme, 0))
        except Exception:
            return 0

    ranked: List[Tuple[str, int]] = sorted(
        ((color, _count_for(color)) for color in BASE_COLORS.keys()),
        key=lambda pair: (-pair[1], pair[0]),
    )
    if not ranked or ranked[0][1] <= 0:
        return None, None

    labels = {'white': 'White', 'blue': 'Blue', 'black': 'Black', 'red': 'Red', 'green': 'Green'}
    best_color = ranked[0][0]
    primary = labels[best_color]
    # First runner-up with a positive count, if any.
    secondary = next((labels[color] for color, hits in ranked[1:] if hits > 0), None)
    return primary, secondary
|
|
|
|
|
|
def infer_synergies(
    anchor: str,
    curated: List[str],
    enforced: List[str],
    analytics: dict,
    pmi_min: float = 0.0,
    co_min: int = 5,
    max_results: int = 12,
) -> List[str]:
    """Suggest extra synergies for a theme from co-occurrence scores.

    Candidates come from cooccurrence_scores_for() and are taken in the order
    that helper yields them. A candidate is dropped when its score is not
    above pmi_min, its co-occurrence count is below co_min, or it is the
    anchor itself, already curated, already enforced, or already selected.

    Args:
        anchor: Theme to find partners for.
        curated: Curated synergies to exclude from the result.
        enforced: Enforced synergies to exclude from the result.
        analytics: Mapping providing 'co_map', 'tag_counts' and 'total_rows'.
        pmi_min: Exclusive lower bound on the score.
        co_min: Inclusive lower bound on the co-occurrence count.
        max_results: Maximum number of suggestions (default 12, the value that
            used to be hard-coded). Zero or a negative value disables the limit.

    Returns:
        Suggested theme names; empty when the anchor has no co-occurrence
        data or no rows were analysed.
    """
    if anchor not in analytics['co_map'] or analytics['total_rows'] <= 0:
        return []
    scored = cooccurrence_scores_for(anchor, analytics['co_map'], analytics['tag_counts'], analytics['total_rows'])
    out: List[str] = []
    for other, score, co_count in scored:
        if score <= pmi_min or co_count < co_min:
            continue
        if other == anchor or other in curated or other in enforced or other in out:
            continue
        out.append(other)
        if max_results > 0 and len(out) >= max_results:
            break
    return out
|
|
|
|
|
|
# Noise suppression: every legendary permanent is tagged with both of these
# themes (Historics also covers artifacts/enchantments), so linking them to
# other themes crowds out more meaningful relationships. They are stripped
# from every theme except the two themselves, which may reference each other.
_NOISE_THEMES = ("Legends Matter", "Historics Matter")


def _normalize_unique(seq: List[str], normalization_map: Dict[str, str]) -> List[str]:
    """Map each entry through normalization_map and drop repeats, keeping first-seen order."""
    seen = set()
    out = []
    for s in seq:
        s2 = normalization_map.get(s, s)
        if s2 not in seen:
            out.append(s2)
            seen.add(s2)
    return out


def _merge_synergies(theme: str, curated: List[str], enforced: List[str], inferred: List[str], synergy_cap: int) -> List[str]:
    """Combine the three synergy buckets (curated > enforced > inferred) and apply the cap."""
    merged: List[str] = []
    for bucket in (curated, enforced, inferred):
        for s in bucket:
            if s != theme and s not in merged:
                merged.append(s)

    if theme not in _NOISE_THEMES:
        merged = [s for s in merged if s not in _NOISE_THEMES]

    if synergy_cap > 0 and len(merged) > synergy_cap:
        # Count the curated/enforced entries that actually survived into
        # `merged`. They always form its prefix: an inferred entry that also
        # appears in an earlier bucket was already de-duplicated. Counting the
        # raw input lists instead would over-count whenever they contain the
        # theme itself, a repeat, or a suppressed noise theme, and could wrongly
        # skip the cap.
        protected = sum(1 for s in merged if s in curated or s in enforced)
        if protected < synergy_cap:
            merged = merged[:synergy_cap]
        # else: keep all (soft exceed)
    return merged


def _editorial_metadata(y: Any) -> Dict[str, Any]:
    """Collect the optional editorial fields present on a YAML entry (empty dict if none)."""
    meta: Dict[str, Any] = {}
    if not y:
        return meta
    commanders = getattr(y, 'example_commanders', None)
    if commanders:
        meta['example_commanders'] = [c for c in commanders if isinstance(c, str)][:12]
    cards = getattr(y, 'example_cards', None)
    if cards:
        # Limit to 20 for safety (UI may further cap)
        dedup_cards: List[str] = []
        seen_cards = set()
        for c in cards:
            if isinstance(c, str) and c and c not in seen_cards:
                dedup_cards.append(c)
                seen_cards.add(c)
                if len(dedup_cards) >= 20:
                    break
        if dedup_cards:
            meta['example_cards'] = dedup_cards
    archetype = getattr(y, 'deck_archetype', None)
    if archetype:
        meta['deck_archetype'] = archetype
    popularity = getattr(y, 'popularity_hint', None)
    if popularity:
        meta['popularity_hint'] = popularity
    # Pass through synergy_commanders if already curated (script will populate going forward)
    synergy_commanders = getattr(y, 'synergy_commanders', None)
    if synergy_commanders:
        meta['synergy_commanders'] = [c for c in synergy_commanders if isinstance(c, str)][:12]
    return meta


def build_catalog(limit: int, verbose: bool) -> Dict[str, Any]:
    """Build the merged theme catalog structure.

    Combines regenerated analytics, whitelist settings and curated YAML
    entries. Themes are processed in sorted order; each produces an entry
    with its merged synergies, optional primary/secondary colors (YAML values
    win over frequency-derived ones) and any editorial metadata found on the
    YAML entry.

    Args:
        limit: When non-zero, stop after this many themes (in sorted order).
        verbose: Forwarded to the analytics and YAML loading steps.

    Returns:
        Dict with 'themes', 'frequencies_by_base_color', 'generated_from'
        and 'provenance'.
    """
    analytics = regenerate_analytics(verbose)
    whitelist = analytics['whitelist']
    synergy_cap = int(whitelist.get('synergy_cap', 0) or 0)
    raw_normalization = whitelist.get('normalization')
    normalization_map: Dict[str, str] = raw_normalization if isinstance(raw_normalization, dict) else {}
    enforced_cfg: Dict[str, List[str]] = whitelist.get('enforced_synergies', {}) or {}

    yaml_catalog = load_catalog_yaml(verbose)
    all_themes: Set[str] = set(analytics['theme_tags']) | {t.display_name for t in yaml_catalog.values()}
    if normalization_map:
        all_themes = apply_normalization(all_themes, normalization_map)
    curated_baseline = derive_synergies_for_tags(all_themes)

    entries: List[Dict[str, Any]] = []
    for index, theme in enumerate(sorted(all_themes)):
        if limit and index >= limit:
            break
        y = yaml_catalog.get(theme)

        # Curated: the YAML list wins; otherwise fall back to the derived baseline.
        curated_list = list(y.curated_synergies) if y and y.curated_synergies else curated_baseline.get(theme, [])

        # Enforced: YAML entries first, then whitelist config, without repeats.
        enforced_list: List[str] = []
        yaml_enforced = y.enforced_synergies if y and y.enforced_synergies else []
        for s in list(yaml_enforced) + list(enforced_cfg.get(theme) or []):
            if s not in enforced_list:
                enforced_list.append(s)

        # Inferred: fresh analytics first; stored YAML values only as a fallback.
        inferred_list = infer_synergies(theme, curated_list, enforced_list, analytics)
        if not inferred_list and y and y.inferred_synergies:
            inferred_list = [s for s in y.inferred_synergies if s not in curated_list and s not in enforced_list]

        if normalization_map:
            curated_list = _normalize_unique(curated_list, normalization_map)
            enforced_list = _normalize_unique(enforced_list, normalization_map)
            inferred_list = _normalize_unique(inferred_list, normalization_map)

        merged = _merge_synergies(theme, curated_list, enforced_list, inferred_list, synergy_cap)

        if y and (y.primary_color or y.secondary_color):
            primary, secondary = y.primary_color, y.secondary_color
        else:
            primary, secondary = _primary_secondary(theme, analytics['frequencies'])

        entry: Dict[str, Any] = {'theme': theme, 'synergies': merged}
        if primary:
            entry['primary_color'] = primary
        if secondary:
            entry['secondary_color'] = secondary
        # Phase D: carry forward optional editorial metadata if present in YAML
        entry.update(_editorial_metadata(y))
        entries.append(entry)

    provenance = {
        'mode': 'merge',
        'generated_at': time.strftime('%Y-%m-%dT%H:%M:%S'),
        'curated_yaml_files': len(yaml_catalog),
        'synergy_cap': synergy_cap,
        'inference': 'pmi',
        'version': 'phase-b-merge-v1'
    }
    return {
        'themes': entries,
        'frequencies_by_base_color': analytics['frequencies'],
        'generated_from': 'merge (analytics + curated YAML + whitelist)',
        'provenance': provenance,
    }
|
|
|
|
|
|
def main():  # pragma: no cover
    """Command-line entry point.

    Flags:
        --limit N   Forwarded to build_catalog(); 0 (default) means no limit.
        --verbose   Forwarded to build_catalog().
        --dry-run   Print the theme count and provenance instead of writing.
        --schema    Print the catalog JSON Schema and exit.

    Without --dry-run or --schema the catalog is written to OUTPUT_JSON.
    """
    parser = argparse.ArgumentParser(description='Build merged theme catalog (Phase B)')
    parser.add_argument('--limit', type=int, default=0)
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--schema', action='store_true', help='Print JSON Schema for catalog and exit')
    args = parser.parse_args()

    if args.schema:
        # Lazy import to avoid circular dependency: replicate minimal schema inline from models file if present
        try:
            from type_definitions_theme_catalog import ThemeCatalog  # type: ignore
            print(json.dumps(ThemeCatalog.model_json_schema(), indent=2))
        except Exception as _e:  # pragma: no cover
            # stdout carries the schema JSON in this mode; keep diagnostics on stderr.
            print(f"Failed to load schema models: {_e}", file=sys.stderr)
        return

    data = build_catalog(limit=args.limit, verbose=args.verbose)
    if args.dry_run:
        print(json.dumps({'theme_count': len(data['themes']), 'provenance': data['provenance']}, indent=2))
        return

    os.makedirs(OUTPUT_JSON.parent, exist_ok=True)
    # Write to a sibling temp file and swap it in, so an interrupted or failed
    # dump never leaves a truncated theme_list.json behind.
    tmp_path = OUTPUT_JSON.with_name(OUTPUT_JSON.name + '.tmp')
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, OUTPUT_JSON)
    except BaseException:
        if tmp_path.exists():
            tmp_path.unlink()
        raise
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry: any failure is reported on stderr and turned into exit
    # status 1 (broad guard for orchestrator fallback).
    try:
        main()
    except Exception as exc:
        sys.stderr.write(f"ERROR: build_theme_catalog failed: {exc}\n")
        sys.exit(1)
|