mtg_python_deckbuilder/code/scripts/preview_perf_compare.py

"""Compare two preview benchmark JSON result files and emit delta stats.

Usage:
  python -m code.scripts.preview_perf_compare --baseline logs/perf/theme_preview_baseline_all_pass1_20250923.json --candidate logs/perf/new_run.json

Outputs JSON with percentage deltas for p50/p90/p95/avg (positive = regression/slower).
If multi-pass structures are present (combined & passes_results) those are included.
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Any, Dict


def load(path: Path) -> Dict[str, Any]:
    data = json.loads(path.read_text(encoding="utf-8"))
    # Multi-pass result may store stats under combined
    if "combined" in data:
        core = data["combined"].copy()
        # Inject representative fields for uniform comparison
        core["p50_ms"] = core.get("p50_ms") or data.get("p50_ms")
        core["p90_ms"] = core.get("p90_ms") or data.get("p90_ms")
        core["p95_ms"] = core.get("p95_ms") or data.get("p95_ms")
        core["avg_ms"] = core.get("avg_ms") or data.get("avg_ms")
        data["_core_stats"] = core
    else:
        data["_core_stats"] = {
            k: data.get(k) for k in ("p50_ms", "p90_ms", "p95_ms", "avg_ms", "count")
        }
    return data


def pct_delta(new: float, old: float) -> float:
    if old == 0:
        return 0.0
    return round(((new - old) / old) * 100.0, 2)


def compare(baseline: Dict[str, Any], candidate: Dict[str, Any]) -> Dict[str, Any]:
    b = baseline["_core_stats"]
    c = candidate["_core_stats"]
    result = {"baseline_count": b.get("count"), "candidate_count": c.get("count")}
    for k in ("p50_ms", "p90_ms", "p95_ms", "avg_ms"):
        if b.get(k) is not None and c.get(k) is not None:
            result[k] = {
                "baseline": b[k],
                "candidate": c[k],
                "delta_pct": pct_delta(c[k], b[k]),
            }
    # If both have per-pass details include first and last pass p95/p50
    if "passes_results" in baseline and "passes_results" in candidate:
        result["passes"] = {
            "baseline": {
                "cold_p95": baseline.get("cold_pass_p95_ms"),
                "warm_p95": baseline.get("warm_pass_p95_ms"),
                "cold_p50": baseline.get("cold_pass_p50_ms"),
                "warm_p50": baseline.get("warm_pass_p50_ms"),
            },
            "candidate": {
                "cold_p95": candidate.get("cold_pass_p95_ms"),
                "warm_p95": candidate.get("warm_pass_p95_ms"),
                "cold_p50": candidate.get("cold_pass_p50_ms"),
                "warm_p50": candidate.get("warm_pass_p50_ms"),
            },
        }
    return result


def main(argv: list[str]) -> int:
    ap = argparse.ArgumentParser(description="Compare two preview benchmark JSON result files")
    ap.add_argument("--baseline", required=True, type=Path, help="Baseline JSON path")
    ap.add_argument("--candidate", required=True, type=Path, help="Candidate JSON path")
    ap.add_argument("--p95-threshold", type=float, default=None, help="Fail (exit 2) if p95 regression exceeds this percent (positive delta)")
    ap.add_argument("--warm-only", action="store_true", help="When both results have passes, compare warm pass p95/p50 instead of combined/core")
    args = ap.parse_args(argv)
    if not args.baseline.exists():
        raise SystemExit(f"Baseline not found: {args.baseline}")
    if not args.candidate.exists():
        raise SystemExit(f"Candidate not found: {args.candidate}")
    baseline = load(args.baseline)
    candidate = load(args.candidate)
    # If warm-only requested and both have warm pass stats, override _core_stats before compare
    if args.warm_only and "warm_pass_p95_ms" in baseline and "warm_pass_p95_ms" in candidate:
        baseline["_core_stats"] = {
            "p50_ms": baseline.get("warm_pass_p50_ms"),
            "p90_ms": baseline.get("_core_stats", {}).get("p90_ms"),  # p90 not tracked per-pass; retain combined
            "p95_ms": baseline.get("warm_pass_p95_ms"),
            "avg_ms": baseline.get("_core_stats", {}).get("avg_ms"),
            "count": baseline.get("_core_stats", {}).get("count"),
        }
        candidate["_core_stats"] = {
            "p50_ms": candidate.get("warm_pass_p50_ms"),
            "p90_ms": candidate.get("_core_stats", {}).get("p90_ms"),
            "p95_ms": candidate.get("warm_pass_p95_ms"),
            "avg_ms": candidate.get("_core_stats", {}).get("avg_ms"),
            "count": candidate.get("_core_stats", {}).get("count"),
        }
    cmp = compare(baseline, candidate)
    payload = {"event": "preview_perf_compare", **cmp}
    if args.p95_threshold is not None and "p95_ms" in cmp:
        delta = cmp["p95_ms"]["delta_pct"]
        payload["threshold"] = {"p95_threshold": args.p95_threshold, "p95_delta_pct": delta}
        if delta is not None and delta > args.p95_threshold:
            payload["result"] = "fail"
            print(json.dumps(payload, indent=2))  # noqa: T201
            return 2
        payload["result"] = "pass"
    print(json.dumps(payload, indent=2))  # noqa: T201
    return 0


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main(__import__('sys').argv[1:]))