mtg_python_deckbuilder/code/scripts/generate_background_cards.py

"""Generate `background_cards.csv` from the master card dataset.
This script filters the full `cards.csv` export for cards whose type line contains
"Background" and writes the filtered rows to `background_cards.csv`. The output
maintains the same columns as the source data, ensures deterministic ordering,
and prepends a metadata comment with version and row count.
Usage (default paths derived from CSV_FILES_DIR environment variable)::
python -m code.scripts.generate_background_cards
python -m code.scripts.generate_background_cards --source other/cards.csv --output some/backgrounds.csv
"""
from __future__ import annotations

import argparse
import csv
import datetime as _dt
from pathlib import Path
from typing import Dict, Iterable, List, Sequence

from path_util import csv_dir

BACKGROUND_KEYWORD = "background"
DEFAULT_SOURCE_NAME = "cards.csv"
DEFAULT_OUTPUT_NAME = "background_cards.csv"


def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Generate background cards CSV")
    parser.add_argument(
        "--source",
        type=Path,
        help="Optional override for the source cards.csv file",
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Optional override for the generated background_cards.csv file",
    )
    parser.add_argument(
        "--version",
        type=str,
        help="Optional version string to embed in the output metadata comment",
    )
    return parser.parse_args(argv)


def _resolve_paths(args: argparse.Namespace) -> tuple[Path, Path]:
    base = Path(csv_dir()).resolve()
    source = (args.source or (base / DEFAULT_SOURCE_NAME)).resolve()
    output = (args.output or (base / DEFAULT_OUTPUT_NAME)).resolve()
    return source, output
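
# Note: when --source/--output are omitted, both paths resolve inside csv_dir(),
# which (per the module docstring) is derived from the CSV_FILES_DIR environment
# variable. Illustrative example, assuming CSV_FILES_DIR=/data/csv:
#   source -> /data/csv/cards.csv
#   output -> /data/csv/background_cards.csv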


def _is_background_type(type_line: str | None) -> bool:
    if not type_line:
        return False
    return BACKGROUND_KEYWORD in type_line.lower()


def _parse_theme_tags(raw: str | None) -> list[str]:
    if not raw:
        return []
    text = raw.strip()
    if not text:
        return []
    if text.startswith("[") and text.endswith("]"):
        body = text[1:-1].strip()
        if not body:
            return []
        tokens = [token.strip(" '\"") for token in body.split(",")]
        return [token for token in tokens if token]
    return [part.strip() for part in text.split(";") if part.strip()]
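
# Illustrative examples (hypothetical tag values) of the two themeTags encodings
# handled above -- a Python-style list literal or a semicolon-separated string:
#   _parse_theme_tags("['Lifegain', 'Background']") -> ["Lifegain", "Background"]
#   _parse_theme_tags("Lifegain; Background")       -> ["Lifegain", "Background"]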


def _is_background_row(row: Dict[str, str]) -> bool:
    if _is_background_type(row.get("type")):
        return True
    theme_tags = _parse_theme_tags(row.get("themeTags"))
    return any(BACKGROUND_KEYWORD in tag.lower() for tag in theme_tags)


def _row_priority(row: Dict[str, str]) -> tuple[int, int]:
    """Return priority tuple for duplicate selection.

    Prefer rows that explicitly declare a background type line, then those with
    longer oracle text. Higher tuple values take precedence when comparing
    candidates.
    """
    type_line = row.get("type", "") or ""
    has_type = BACKGROUND_KEYWORD in type_line.lower()
    text_length = len((row.get("text") or "").strip())
    return (1 if has_type else 0, text_length)
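
# Priority tuples compare element-wise, so an explicit Background type line
# always outranks a themeTags-only match regardless of text length.
# Illustrative values:
#   (1, 40) > (0, 120)  -> True  (typed Background beats a longer-text tag match)
#   (1, 120) > (1, 40)  -> True  (ties on type fall back to oracle text length)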


def _gather_background_rows(reader: csv.DictReader) -> list[Dict[str, str]]:
    selected: Dict[str, Dict[str, str]] = {}
    for row in reader:
        if not row:
            continue
        name = (row.get("name") or "").strip()
        if not name:
            continue
        if not _is_background_row(row):
            continue
        current = selected.get(name.lower())
        if current is None:
            selected[name.lower()] = row
            continue
        if _row_priority(row) > _row_priority(current):
            selected[name.lower()] = row
    ordered_names = sorted(selected.keys())
    return [selected[key] for key in ordered_names]
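
# Duplicate card names are collapsed case-insensitively, keeping the candidate
# with the highest _row_priority(), and survivors are returned in sorted
# (lowercased) name order so repeated runs emit rows in a deterministic order.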


def _ensure_all_columns(rows: Iterable[Dict[str, str]], headers: List[str]) -> None:
    for row in rows:
        for header in headers:
            row.setdefault(header, "")


def _write_background_csv(output: Path, headers: List[str], rows: List[Dict[str, str]], version: str, source: Path) -> None:
    output.parent.mkdir(parents=True, exist_ok=True)
    now_utc = _dt.datetime.now(_dt.UTC).replace(microsecond=0)
    metadata = {
        "version": version,
        "count": str(len(rows)),
        "source": source.name,
        "generated": now_utc.isoformat().replace("+00:00", "Z"),
    }
    meta_line = "# " + " ".join(f"{key}={value}" for key, value in metadata.items())
    with output.open("w", encoding="utf-8", newline="") as handle:
        handle.write(meta_line + "\n")
        writer = csv.DictWriter(handle, fieldnames=headers)
        writer.writeheader()
        for row in rows:
            writer.writerow({key: row.get(key, "") for key in headers})
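
# The generated file therefore starts with a single metadata comment followed
# by the usual CSV header, e.g. (illustrative values and column names):
#   # version=20250101 count=80 source=cards.csv generated=2025-01-01T00:00:00Z
#   name,type,text,themeTags,...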


def main(argv: Sequence[str] | None = None) -> None:
    args = _parse_args(argv)
    source, output = _resolve_paths(args)
    if not source.exists():
        raise FileNotFoundError(f"Source cards CSV not found: {source}")
    with source.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        if reader.fieldnames is None:
            raise ValueError("cards.csv is missing header row")
        rows = _gather_background_rows(reader)
    _ensure_all_columns(rows, list(reader.fieldnames))
    version = args.version or _dt.datetime.now(_dt.UTC).strftime("%Y%m%d")
    _write_background_csv(output, list(reader.fieldnames), rows, version, source)
if __name__ == "__main__":
main()
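
# Example (illustrative, not part of this script): a downstream consumer can
# skip the leading metadata comment before handing the rest to csv.DictReader:
#
#     with open("background_cards.csv", encoding="utf-8", newline="") as fh:
#         _metadata = fh.readline()     # "# version=... count=..." comment line
#         reader = csv.DictReader(fh)   # header row follows the comment
#         backgrounds = list(reader)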