# mtg_python_deckbuilder/code/scripts/generate_theme_editorial_suggestions.py

"""Generate editorial metadata suggestions for theme YAML files (Phase D helper).

Features:
 - Scans color CSV files (skips monolithic cards.csv unless --include-master)
 - Collects top-N (lowest EDHREC rank) cards per theme based on themeTags column
 - Optionally derives commander suggestions from commander_cards.csv (if present)
 - Provides dry-run output (default) or can patch YAML files that lack example_cards / example_commanders
 - Prints streaming progress so the user sees real-time status

Usage (dry run):
  python code/scripts/generate_theme_editorial_suggestions.py --themes "Landfall,Reanimate" --top 8

Write back missing fields (only if not already present):
  python code/scripts/generate_theme_editorial_suggestions.py --apply --limit-yaml 500

Safety:
 - Existing example_cards / example_commanders are never overwritten unless --force is passed
 - Writes can be capped with --limit-yaml (default 0 means unlimited) to avoid accidental mass churn

Heuristics:
 - Deduplicate card names per theme
 - Filter out names with extremely poor rank (> 60000) by default (configurable)
 - For commander suggestions, prefer legendary creatures/planeswalkers in commander_cards.csv whose themeTags includes the theme
 - Fallback commander suggestions: take top legendary cards from color CSVs tagged with the theme
 - synergy_commanders: derive from top 3 synergies of each theme (3 from top, 2 from second, 1 from third)
 - Promotion: if fewer than --min-examples example_commanders exist after normal suggestion, promote synergy_commanders (in order) into example_commanders, annotating with " - Synergy (<synergy name>)"
"""
from __future__ import annotations

import argparse
import ast
import csv
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple, Set
import sys

try:  # optional dependency safety
    import yaml  # type: ignore
except Exception:
    yaml = None

# Directory two levels above this script's own directory (parents[2] of the resolved file path).
ROOT = Path(__file__).resolve().parents[2]
# Directory holding the card CSV exports that are scanned for themeTags.
CSV_DIR = ROOT / 'csv_files'
# Directory holding the per-theme '*.yml' files that --apply patches.
CATALOG_DIR = ROOT / 'config' / 'themes' / 'catalog'

# Glob used to enumerate CSVs inside CSV_DIR.
# NOTE: the pattern requires an underscore before "cards.csv", so it does not match MASTER_FILE itself.
COLOR_CSV_GLOB = '*_cards.csv'
# File name of the monolithic card list (only relevant with --include-master).
MASTER_FILE = 'cards.csv'
# File name of the commander list; excluded from the color scan and read by scan_commander_csv.
COMMANDER_FILE = 'commander_cards.csv'


@dataclass
class ThemeSuggestion:
    """Suggested editorial entries for one theme, as produced by ``build_suggestions``."""

    # Deduplicated card names, in the order of the (rank-sorted) theme bucket they came from.
    cards: List[str]
    # Deduplicated commander names taken from the commander CSV bucket for the theme.
    commanders: List[str]
    # Initialised to an empty list by build_suggestions (placeholder).
    synergy_commanders: List[str]


def _parse_theme_tags(raw: str) -> List[str]:
    if not raw:
        return []
    raw = raw.strip()
    if not raw or raw == '[]':
        return []
    try:
        # themeTags stored like "['Landfall', 'Ramp']" – use literal_eval safely
        val = ast.literal_eval(raw)
        if isinstance(val, list):
            return [str(x) for x in val if isinstance(x, str)]
    except Exception:
        pass
    # Fallback naive parse
    return [t.strip().strip("'\"") for t in raw.strip('[]').split(',') if t.strip()]


def scan_color_csvs(include_master: bool, max_rank: float, progress_every: int) -> Tuple[Dict[str, List[Tuple[float, str]]], Dict[str, List[Tuple[float, str]]]]:
    """Scan the card CSVs in ``CSV_DIR`` and bucket cards by theme tag.

    Args:
        include_master: When true, ``MASTER_FILE`` is scanned too (if it exists).
        max_rank: Rows whose ``edhrecRank`` is greater than this are skipped.
            A missing or unparseable rank counts as 999999.
        progress_every: Emit a progress line to stderr every N rows of each
            file; ``0`` disables per-row progress.

    Returns:
        ``(theme_hits, legendary_hits)``.  Both map a theme tag to a list of
        ``(rank, card_name)`` tuples sorted by ascending rank.  ``theme_hits``
        keeps at most 120 entries per tag; ``legendary_hits`` holds only rows
        whose ``type`` contains the word ``Legendary`` and keeps at most 80.

    A file that fails to read is reported on stderr and skipped; rows
    collected from it before the failure are kept.
    """
    theme_hits: Dict[str, List[Tuple[float, str]]] = {}
    legendary_hits: Dict[str, List[Tuple[float, str]]] = {}

    # The glob pattern requires an underscore before "cards.csv", so it can
    # never yield the bare master file; add it explicitly when requested.
    candidates = set(CSV_DIR.glob(COLOR_CSV_GLOB))
    if include_master:
        master_path = CSV_DIR / MASTER_FILE
        if master_path.is_file():
            candidates.add(master_path)

    files: List[Path] = []
    for fp in sorted(candidates):
        if fp.name == MASTER_FILE and not include_master:
            continue
        if fp.name == COMMANDER_FILE:
            # The commander list is read separately by scan_commander_csv.
            continue
        # skip testdata
        if 'testdata' in str(fp):
            continue
        files.append(fp)

    total_files = len(files)
    for processed, fp in enumerate(files, start=1):
        try:
            with fp.open(encoding='utf-8', newline='') as f:
                for line_idx, row in enumerate(csv.DictReader(f), start=1):
                    if progress_every and line_idx % progress_every == 0:
                        print(f"[scan] {fp.name} line {line_idx}", file=sys.stderr, flush=True)
                    tags_raw = row.get('themeTags') or ''
                    if not tags_raw:
                        continue
                    try:
                        rank = float(row.get('edhrecRank') or 999999)
                    except (TypeError, ValueError):
                        rank = 999999
                    if rank > max_rank:
                        continue
                    card_name = row.get('name') or ''
                    if not card_name:
                        continue
                    card_type = row.get('type') or ''
                    # Whole-word match so that only the "Legendary" supertype counts.
                    is_legendary = isinstance(card_type, str) and 'Legendary' in card_type.split()
                    for tag in _parse_theme_tags(tags_raw):
                        if not tag:
                            continue
                        theme_hits.setdefault(tag, []).append((rank, card_name))
                        if is_legendary:
                            legendary_hits.setdefault(tag, []).append((rank, card_name))
        except Exception as e:  # pragma: no cover
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"[warn] failed reading {fp.name}: {e}", file=sys.stderr)
        print(f"[scan] completed {fp.name} ({processed}/{total_files})", file=sys.stderr, flush=True)

    # Trim each bucket to reasonable size (keep best ranks)
    for mapping, cap in ((theme_hits, 120), (legendary_hits, 80)):
        for lst in mapping.values():
            lst.sort(key=lambda x: x[0])
            del lst[cap:]
    return theme_hits, legendary_hits


def scan_commander_csv(max_rank: float) -> Dict[str, List[Tuple[float, str]]]:
    """Bucket the rows of the commander CSV by theme tag.

    Returns a mapping of theme tag to ``(rank, name)`` tuples sorted by
    ascending rank and capped at 60 entries per tag.  The mapping is empty
    when the commander file does not exist.  Rows without ``themeTags``,
    without a ``name``, or with a rank above ``max_rank`` are ignored; a
    missing or unparseable rank counts as 999999.  A read failure is reported
    on stderr and whatever was collected so far is kept.
    """
    buckets: Dict[str, List[Tuple[float, str]]] = {}
    csv_path = CSV_DIR / COMMANDER_FILE
    if not csv_path.exists():
        return buckets

    try:
        with csv_path.open(encoding='utf-8', newline='') as handle:
            for record in csv.DictReader(handle):
                raw_tags = record.get('themeTags') or ''
                if not raw_tags:
                    continue
                parsed_tags = _parse_theme_tags(raw_tags)
                try:
                    rank = float(record.get('edhrecRank') or 999999)
                except Exception:
                    rank = 999999
                if rank > max_rank:
                    continue
                commander = record.get('name') or ''
                if not commander:
                    continue
                for tag in parsed_tags:
                    if tag:
                        buckets.setdefault(tag, []).append((rank, commander))
    except Exception as e:  # pragma: no cover
        print(f"[warn] failed reading {COMMANDER_FILE}: {e}", file=sys.stderr)

    # Best (lowest) ranks first, then keep only the head of each bucket.
    for hits in buckets.values():
        hits.sort(key=lambda pair: pair[0])
        del hits[60:]
    return buckets


def load_yaml_theme(path: Path) -> dict:
    """Parse the YAML file at ``path`` and return the result.

    Returns ``{}`` when the ``yaml`` module is unavailable or when reading or
    parsing raises.  The parsed value is returned as-is, so callers still need
    to check that it is actually a dict.
    """
    try:
        if not yaml:
            return {}
        return yaml.safe_load(path.read_text(encoding='utf-8'))
    except Exception:
        return {}


def write_yaml_theme(path: Path, data: dict):
    """Serialise ``data`` as YAML (key order kept, unicode unescaped) and write it to ``path`` as UTF-8."""
    path.write_text(
        yaml.safe_dump(data, sort_keys=False, allow_unicode=True),
        encoding='utf-8',
    )


def build_suggestions(theme_hits: Dict[str, List[Tuple[float, str]]], commander_hits: Dict[str, List[Tuple[float, str]]], top: int, top_commanders: int, *, synergy_top=(3,2,1), min_examples: int = 5) -> Dict[str, ThemeSuggestion]:
    """Build a ``ThemeSuggestion`` for every theme seen in either hit mapping.

    For each theme (processed in sorted order) the first ``top`` distinct card
    names are taken from the leading ``top * 3`` entries of its card bucket,
    and the first ``top_commanders`` distinct names from the leading
    ``top_commanders * 2`` entries of its commander bucket.
    ``synergy_commanders`` is left empty here.  ``synergy_top`` and
    ``min_examples`` are accepted but not used by this function.
    """

    def _first_distinct(pairs: List[Tuple[float, str]], window: int, limit: int) -> List[str]:
        # Oversample a window of the bucket, keep first-seen names, stop at the limit.
        chosen: List[str] = []
        for _rank, label in pairs[:window]:
            if label not in chosen:
                chosen.append(label)
            if len(chosen) >= limit:
                break
        return chosen

    result: Dict[str, ThemeSuggestion] = {}
    for theme in sorted(set(theme_hits) | set(commander_hits)):
        result[theme] = ThemeSuggestion(
            cards=_first_distinct(theme_hits.get(theme, []), top * 3, top),
            commanders=_first_distinct(commander_hits.get(theme, []), top_commanders * 2, top_commanders),
            synergy_commanders=[],
        )
    return result


def _derive_synergy_commanders(base_theme: str, data: dict, all_yaml: Dict[str, dict], commander_hits: Dict[str, List[Tuple[float, str]]], legendary_hits: Dict[str, List[Tuple[float, str]]], synergy_top=(3,2,1)) -> List[Tuple[str, str]]:
    """Pick synergy commanders with their originating synergy label.
    Returns list of (commander_name, synergy_theme) preserving order of (top synergy, second, third) and internal ranking.
    """
    synergies = data.get('synergies') or []
    if not isinstance(synergies, list):
        return []
    pattern = list(synergy_top)
    out: List[Tuple[str, str]] = []
    for idx, count in enumerate(pattern):
        if idx >= len(synergies):
            break
        s_name = synergies[idx]
        bucket = commander_hits.get(s_name) or []
        taken = 0
        for _, cname in bucket:
            if all(cname != existing for existing, _ in out):
                out.append((cname, s_name))
                taken += 1
                if taken >= count:
                    break
        if taken < count:
            # fallback to legendary card hits tagged with that synergy
            fallback_bucket = legendary_hits.get(s_name) or []
            for _, cname in fallback_bucket:
                if all(cname != existing for existing, _ in out):
                    out.append((cname, s_name))
                    taken += 1
                    if taken >= count:
                        break
    return out


def _augment_synergies(data: dict, base_theme: str) -> bool:
    """Heuristically augment the 'synergies' list when it's sparse.
    Rules:
      - If synergies length >= 3, leave as-is.
      - Start with existing synergies then append curated/enforced/inferred (in that order) if missing.
      - For any theme whose display_name contains 'Counter' add 'Counters Matter' and 'Proliferate'.
    Returns True if modified.
    """
    synergies = data.get('synergies') if isinstance(data.get('synergies'), list) else []
    if not isinstance(synergies, list):
        return False
    original = list(synergies)
    if len(synergies) < 3:
        for key in ('curated_synergies', 'enforced_synergies', 'inferred_synergies'):
            lst = data.get(key)
            if isinstance(lst, list):
                for s in lst:
                    if isinstance(s, str) and s and s not in synergies:
                        synergies.append(s)
    name = data.get('display_name') or base_theme
    if isinstance(name, str) and 'counter' in name.lower():
        for extra in ('Counters Matter', 'Proliferate'):
            if extra not in synergies:
                synergies.append(extra)
    # Deduplicate preserving order
    seen = set()
    deduped = []
    for s in synergies:
        if s not in seen:
            deduped.append(s)
            seen.add(s)
    if deduped != synergies:
        synergies = deduped
    if synergies != original:
        data['synergies'] = synergies
        return True
    return False


def apply_to_yaml(suggestions: Dict[str, ThemeSuggestion], *, limit_yaml: int, force: bool, themes_filter: Set[str], commander_hits: Dict[str, List[Tuple[float, str]]], legendary_hits: Dict[str, List[Tuple[float, str]]], synergy_top=(3,2,1), min_examples: int = 5, augment_synergies: bool = False, treat_placeholders_missing: bool = False):
    """Patch the theme YAML files in ``CATALOG_DIR`` with the given suggestions.

    For every ``*.yml`` file (in sorted order) whose ``display_name`` has a
    suggestion (and passes ``themes_filter`` when that is non-empty):

      1. optionally augment ``synergies`` (``augment_synergies``);
      2. fill ``example_cards`` / ``example_commanders`` when missing, or
         always when ``force`` is set (only if a suggestion list is non-empty);
      3. re-annotate example commanders that still carry the old
         `` - Synergy (<display_name>)`` suffix with their real synergy source;
      4. promote synergy commanders into ``example_commanders`` until
         ``min_examples`` entries exist;
      5. store the remaining synergy commanders under ``synergy_commanders``.

    Args:
        suggestions: Theme display name -> ``ThemeSuggestion``.
        limit_yaml: Stop after this many files were written (0 = unlimited).
        force: Overwrite existing example lists.
        themes_filter: When non-empty, only these display names are processed.
        commander_hits: Theme -> ``(rank, name)`` list for commanders.
        legendary_hits: Theme -> ``(rank, name)`` fallback list.
        synergy_top: Quotas forwarded to ``_derive_synergy_commanders``.
        min_examples: Target minimum length of ``example_commanders``.
        augment_synergies: Run ``_augment_synergies`` before deriving commanders.
        treat_placeholders_missing: Treat a list made up solely of entries
            ending in " Anchor" as if it were empty.

    Returns:
        Number of YAML files written.
    """
    marker = ' - Synergy '

    def _base_name(entry):
        # Commander name without its synergy annotation; None for non-string
        # entries so that malformed YAML items never match and never raise.
        return entry.split(marker)[0] if isinstance(entry, str) else None

    updated = 0
    # Load every YAML exactly once; the same parsed objects serve both the
    # synergy-lookup cache and the per-file processing below.
    catalog_paths = sorted(CATALOG_DIR.glob('*.yml'))
    all_yaml_cache: Dict[str, dict] = {p.name: load_yaml_theme(p) for p in catalog_paths}
    for path in catalog_paths:
        data = all_yaml_cache[path.name]
        if not isinstance(data, dict):
            continue
        display = data.get('display_name')
        if not isinstance(display, str) or not display:
            continue
        if themes_filter and display not in themes_filter:
            continue
        sug = suggestions.get(display)
        if not sug:
            continue
        changed = False
        # Optional synergy augmentation prior to commander derivation
        if augment_synergies and _augment_synergies(data, display):
            changed = True
        # Derive synergy_commanders before promotion logic
        synergy_cmds = _derive_synergy_commanders(display, data, all_yaml_cache, commander_hits, legendary_hits, synergy_top=synergy_top)
        # Annotate synergy_commanders with their synergy source for transparency
        synergy_cmd_names = [f"{c} - Synergy ({src})" for c, src in synergy_cmds]
        if (force or not data.get('example_cards')) and sug.cards:
            data['example_cards'] = sug.cards
            changed = True
        raw_examples = data.get('example_commanders')
        existing_examples: List[str] = list(raw_examples) if isinstance(raw_examples, list) else []
        # Treat an all-placeholder (" Anchor" suffix) list as effectively empty when flag enabled
        if treat_placeholders_missing and existing_examples and all(isinstance(e, str) and e.endswith(' Anchor') for e in existing_examples):
            existing_examples = []
        if (force or not existing_examples) and sug.commanders:
            data['example_commanders'] = list(sug.commanders)
            existing_examples = data['example_commanders']
            changed = True
        # Re-annotate existing example_commanders if they use old base-theme annotation pattern
        if existing_examples and synergy_cmds:
            # Detect old pattern: ends with base theme name inside parentheses
            old_suffix = f" - Synergy ({display})"
            if any(isinstance(ex, str) and ex.endswith(old_suffix) for ex in existing_examples):
                # Build mapping from commander name to synergy source
                source_map = dict(synergy_cmds)
                new_examples: List[str] = []
                for ex in existing_examples:
                    if isinstance(ex, str) and ' - Synergy (' in ex:
                        base_name = _base_name(ex)
                        if base_name in source_map:
                            new_examples.append(f"{base_name} - Synergy ({source_map[base_name]})")
                            continue
                    new_examples.append(ex)
                if new_examples != existing_examples:
                    data['example_commanders'] = new_examples
                    existing_examples = new_examples
                    changed = True
        # Promotion: ensure at least min_examples in example_commanders by moving from synergy list (without duplicates)
        if (len(existing_examples) < min_examples) and synergy_cmd_names:
            needed = min_examples - len(existing_examples)
            promoted = []
            for cname, source_synergy in synergy_cmds:
                # Avoid duplicate even with annotation
                if not any(cname == _base_name(ex) for ex in existing_examples):
                    existing_examples.append(f"{cname} - Synergy ({source_synergy})")
                    promoted.append(cname)
                    needed -= 1
                    if needed <= 0:
                        break
            if promoted:
                data['example_commanders'] = existing_examples
                changed = True
        # After any potential promotions / re-annotations, attach synergy_commanders excluding any commanders already present in example_commanders
        existing_base_names = {_base_name(ex) for ex in (data.get('example_commanders') or []) if isinstance(ex, str)}
        filtered_synergy_cmd_names = [
            entry for entry in synergy_cmd_names
            if _base_name(entry) not in existing_base_names
        ]
        prior_synergy_cmds = data.get('synergy_commanders') if isinstance(data.get('synergy_commanders'), list) else []
        if prior_synergy_cmds != filtered_synergy_cmd_names:
            if filtered_synergy_cmd_names or force or prior_synergy_cmds:
                data['synergy_commanders'] = filtered_synergy_cmd_names
                changed = True

        if changed:
            write_yaml_theme(path, data)
            updated += 1
            print(f"[apply] updated {path.name}")
            if limit_yaml and updated >= limit_yaml:
                print(f"[apply] reached limit {limit_yaml}; stopping")
                break
    return updated


def main():  # pragma: no cover
    """Command-line entry point.

    Parses the CLI flags, scans the CSV files, builds suggestions and then
    either prints a dry-run summary (default) or, with ``--apply``, patches
    the theme YAML files.  Exits with status 1 when ``--apply`` is requested
    but PyYAML could not be imported.
    """
    parser = argparse.ArgumentParser(description='Generate example_cards / example_commanders suggestions for theme YAML')
    parser.add_argument('--themes', type=str, help='Comma-separated subset of display names to restrict')
    parser.add_argument('--top', type=int, default=8, help='Target number of example_cards suggestions')
    parser.add_argument('--top-commanders', type=int, default=5, help='Target number of example_commanders suggestions')
    parser.add_argument('--max-rank', type=float, default=60000, help='Skip cards with EDHREC rank above this threshold')
    parser.add_argument('--include-master', action='store_true', help='Include large cards.csv in scan (slower)')
    parser.add_argument('--progress-every', type=int, default=0, help='Emit a progress line every N rows per file')
    parser.add_argument('--apply', action='store_true', help='Write missing fields into YAML files')
    parser.add_argument('--limit-yaml', type=int, default=0, help='Limit number of YAML files modified (0 = unlimited)')
    parser.add_argument('--force', action='store_true', help='Overwrite existing example lists')
    parser.add_argument('--min-examples', type=int, default=5, help='Minimum desired example_commanders; promote from synergy_commanders if short')
    parser.add_argument('--augment-synergies', action='store_true', help='Heuristically augment sparse synergies list before deriving synergy_commanders')
    parser.add_argument('--treat-placeholders', action='store_true', help='Consider Anchor-only example_commanders lists as missing so they can be replaced')
    args = parser.parse_args()

    # Fail fast: --apply cannot work without PyYAML, so report that before
    # spending time on the CSV scans rather than after them.
    if args.apply and yaml is None:
        print('ERROR: PyYAML not installed; cannot apply changes.', file=sys.stderr)
        sys.exit(1)

    themes_filter: Set[str] = set()
    if args.themes:
        themes_filter = {t.strip() for t in args.themes.split(',') if t.strip()}

    print('[info] scanning CSVs...', file=sys.stderr)
    theme_hits, legendary_hits = scan_color_csvs(args.include_master, args.max_rank, args.progress_every)
    print('[info] scanning commander CSV...', file=sys.stderr)
    commander_hits = scan_commander_csv(args.max_rank)
    print('[info] building suggestions...', file=sys.stderr)
    suggestions = build_suggestions(theme_hits, commander_hits, args.top, args.top_commanders, min_examples=args.min_examples)

    if not args.apply:
        # Dry run: print JSON-like summary for filtered subset (or first 25 themes)
        to_show = sorted(themes_filter) if themes_filter else sorted(suggestions)[:25]
        for t in to_show:
            s = suggestions.get(t)
            if not s:
                continue
            print(f"\n=== {t} ===")
            print('example_cards:', ', '.join(s.cards) or '(none)')
            print('example_commanders:', ', '.join(s.commanders) or '(none)')
            print('synergy_commanders: (computed at apply time)')
        print('\n[info] dry-run complete (use --apply to write)')
        return

    updated = apply_to_yaml(
        suggestions,
        limit_yaml=args.limit_yaml,
        force=args.force,
        themes_filter=themes_filter,
        commander_hits=commander_hits,
        legendary_hits=legendary_hits,
        synergy_top=(3,2,1),
        min_examples=args.min_examples,
        augment_synergies=args.augment_synergies,
        treat_placeholders_missing=args.treat_placeholders,
    )
    print(f'[info] updated {updated} YAML files')


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':  # pragma: no cover
    main()