feat(web): Core Refactor Phase A — extract sampling and cache modules; add adaptive TTL + eviction heuristics, Redis PoC, and metrics wiring. Tests added for TTL, eviction, exports, splash-adaptive, card index, and service worker. Docs+roadmap updated.

matt 2025-09-24 13:57:23 -07:00
parent c4a7fc48ea
commit a029d430c5
49 changed files with 3889 additions and 701 deletions

code/scripts/preview_perf_benchmark.py
@@ -0,0 +1,309 @@
"""Ad-hoc performance benchmark for theme preview build latency (Phase A validation).
Runs warm-up plus measured request loops against several theme slugs and prints
aggregate latency stats (p50/p90/p95, cache hit ratio evolution). Intended to
establish or validate that the refactor did not introduce a >5% p95 regression.
Usage (ensure the server is running locally; commonly :8080 under docker compose):
python -m code.scripts.preview_perf_benchmark --themes 8 --loops 40 \
--url http://localhost:8080 --warm 1 --limit 12
Theme slug discovery hierarchy (when --theme not provided):
1. Try /themes/index.json (legacy / planned static index)
2. Fall back to /themes/api/themes (current API) and take the first N ids.
The discovered slugs are sorted deterministically, then truncated to N.
NOTE: This is intentionally minimal (no external deps). For stable comparisons
run with identical parameters pre/post-change and commit the JSON output under
logs/perf/.
"""
from __future__ import annotations
import argparse
import json
import statistics
import time
from typing import Any, Dict, List
import urllib.request
import urllib.error
import sys
from pathlib import Path
def _fetch_json(url: str) -> Dict[str, Any]:
req = urllib.request.Request(url, headers={"Accept": "application/json"})
with urllib.request.urlopen(req, timeout=15) as resp: # nosec B310 local dev
data = resp.read().decode("utf-8", "replace")
return json.loads(data) # type: ignore[return-value]
def select_theme_slugs(base_url: str, count: int) -> List[str]:
"""Discover theme slugs for benchmarking.
Attempts legacy static index first, then falls back to live API listing.
"""
errors: List[str] = []
slugs: List[str] = []
# Attempt 1: legacy /themes/index.json
try:
idx = _fetch_json(f"{base_url.rstrip('/')}/themes/index.json")
entries = idx.get("themes") or []
for it in entries:
if not isinstance(it, dict):
continue
slug = it.get("slug") or it.get("id") or it.get("theme_id")
if isinstance(slug, str):
slugs.append(slug)
except Exception as e: # pragma: no cover - network variability
errors.append(f"index.json failed: {e}")
if not slugs:
# Attempt 2: live API listing
try:
listing = _fetch_json(f"{base_url.rstrip('/')}/themes/api/themes")
items = listing.get("items") or []
for it in items:
if not isinstance(it, dict):
continue
tid = it.get("id") or it.get("slug") or it.get("theme_id")
if isinstance(tid, str):
slugs.append(tid)
except Exception as e: # pragma: no cover - network variability
errors.append(f"api/themes failed: {e}")
slugs = sorted(set(slugs))[:count]
if not slugs:
raise SystemExit("No theme slugs discovered; cannot benchmark (" + "; ".join(errors) + ")")
return slugs
def fetch_all_theme_slugs(base_url: str, page_limit: int = 200) -> List[str]:
"""Fetch all theme slugs via paginated /themes/api/themes endpoint.
    Uses the maximum page size (200) and iterates by offset until there is no next page.
    Returns a deterministically sorted list of unique slugs.
"""
slugs: List[str] = []
offset = 0
seen: set[str] = set()
while True:
try:
url = f"{base_url.rstrip('/')}/themes/api/themes?limit={page_limit}&offset={offset}"
data = _fetch_json(url)
except Exception as e: # pragma: no cover - network variability
raise SystemExit(f"Failed fetching themes page offset={offset}: {e}")
items = data.get("items") or []
for it in items:
if not isinstance(it, dict):
continue
tid = it.get("id") or it.get("slug") or it.get("theme_id")
if isinstance(tid, str) and tid not in seen:
seen.add(tid)
slugs.append(tid)
next_offset = data.get("next_offset")
if not next_offset or next_offset == offset:
break
offset = int(next_offset)
return sorted(slugs)
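# Sketch of the pagination contract assumed above (inferred from this script's own
# field access, not from a documented schema): each page is expected to look like
#   {"items": [{"id": "<slug>", ...}, ...], "next_offset": <int or null>}
# and iteration stops once next_offset is missing, falsy, or no longer advances.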
def percentile(values: List[float], pct: float) -> float:
if not values:
return 0.0
sv = sorted(values)
k = (len(sv) - 1) * pct
f = int(k)
c = min(f + 1, len(sv) - 1)
if f == c:
return sv[f]
d0 = sv[f] * (c - k)
d1 = sv[c] * (k - f)
return d0 + d1
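# Quick worked example of the interpolation above (illustrative only):
# percentile([10, 20, 30, 40], 0.95) -> k = 3 * 0.95 = 2.85, f = 2, c = 3,
# result = 30 * (3 - 2.85) + 40 * (2.85 - 2) = 4.5 + 34.0 = 38.5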
def run_loop(base_url: str, slugs: List[str], loops: int, limit: int, warm: bool, path_template: str) -> Dict[str, Any]:
latencies: List[float] = []
per_slug_counts = {s: 0 for s in slugs}
t_start = time.time()
for i in range(loops):
slug = slugs[i % len(slugs)]
# path_template may contain {slug} and {limit}
try:
rel = path_template.format(slug=slug, limit=limit)
except Exception:
rel = f"/themes/api/theme/{slug}/preview?limit={limit}"
if not rel.startswith('/'):
rel = '/' + rel
url = f"{base_url.rstrip('/')}{rel}"
t0 = time.time()
try:
_fetch_json(url)
except Exception as e:
print(json.dumps({"event": "perf_benchmark_error", "slug": slug, "error": str(e)})) # noqa: T201
continue
ms = (time.time() - t0) * 1000.0
latencies.append(ms)
per_slug_counts[slug] += 1
elapsed = time.time() - t_start
return {
"warm": warm,
"loops": loops,
"slugs": slugs,
"per_slug_requests": per_slug_counts,
"elapsed_s": round(elapsed, 3),
"p50_ms": round(percentile(latencies, 0.50), 2),
"p90_ms": round(percentile(latencies, 0.90), 2),
"p95_ms": round(percentile(latencies, 0.95), 2),
"avg_ms": round(statistics.mean(latencies), 2) if latencies else 0.0,
"count": len(latencies),
"_latencies": latencies, # internal (removed in final result unless explicitly retained)
}
def _stats_from_latencies(latencies: List[float]) -> Dict[str, Any]:
if not latencies:
return {"count": 0, "p50_ms": 0.0, "p90_ms": 0.0, "p95_ms": 0.0, "avg_ms": 0.0}
return {
"count": len(latencies),
"p50_ms": round(percentile(latencies, 0.50), 2),
"p90_ms": round(percentile(latencies, 0.90), 2),
"p95_ms": round(percentile(latencies, 0.95), 2),
"avg_ms": round(statistics.mean(latencies), 2),
}
def main(argv: List[str]) -> int:
ap = argparse.ArgumentParser(description="Theme preview performance benchmark")
ap.add_argument("--url", default="http://localhost:8000", help="Base server URL (default: %(default)s)")
ap.add_argument("--themes", type=int, default=6, help="Number of theme slugs to exercise (default: %(default)s)")
ap.add_argument("--loops", type=int, default=60, help="Total request iterations (default: %(default)s)")
ap.add_argument("--limit", type=int, default=12, help="Preview size (default: %(default)s)")
ap.add_argument("--path-template", default="/themes/api/theme/{slug}/preview?limit={limit}", help="Format string for preview request path (default: %(default)s)")
ap.add_argument("--theme", action="append", dest="explicit_theme", help="Explicit theme slug(s); overrides automatic selection")
ap.add_argument("--warm", type=int, default=1, help="Number of warm-up loops (full cycles over selected slugs) (default: %(default)s)")
ap.add_argument("--output", type=Path, help="Optional JSON output path (committed under logs/perf)")
ap.add_argument("--all", action="store_true", help="Exercise ALL themes (ignores --themes; loops auto-set to passes*total_slugs unless --loops-explicit)")
ap.add_argument("--passes", type=int, default=1, help="When using --all, number of passes over the full theme set (default: %(default)s)")
    # Hidden flag to detect whether the user explicitly set --loops (argparse has no direct support, so we use a sentinel flag).
    # We keep the original --loops for backwards compatibility; with --all we recompute loops unless --loops-explicit was passed.
ap.add_argument("--loops-explicit", action="store_true", help=argparse.SUPPRESS)
ap.add_argument("--extract-warm-baseline", type=Path, help="If multi-pass (--all --passes >1), write a warm-only baseline JSON (final pass stats) to this path")
args = ap.parse_args(argv)
try:
if args.explicit_theme:
slugs = args.explicit_theme
elif args.all:
slugs = fetch_all_theme_slugs(args.url)
else:
slugs = select_theme_slugs(args.url, args.themes)
except SystemExit as e: # pragma: no cover - dependency on live server
print(str(e), file=sys.stderr)
return 2
mode = "all" if args.all else "subset"
total_slugs = len(slugs)
if args.all and not args.loops_explicit:
# Derive loops = passes * total_slugs
args.loops = max(1, args.passes) * total_slugs
print(json.dumps({ # noqa: T201
"event": "preview_perf_start",
"mode": mode,
"total_slugs": total_slugs,
"planned_loops": args.loops,
"passes": args.passes if args.all else None,
}))
# Execution paths:
# 1. Standard subset or single-pass all: warm cycles -> single measured run
# 2. Multi-pass all mode (--all --passes >1): iterate passes capturing per-pass stats (no separate warm loops)
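    # Example invocation for this path (assumes a local server on :8080 as in the module
    # docstring; the baseline output path is only an illustration):
    #   python -m code.scripts.preview_perf_benchmark --url http://localhost:8080 \
    #       --all --passes 2 --extract-warm-baseline logs/perf/theme_preview_warm_baseline.json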
if args.all and args.passes > 1:
pass_results: List[Dict[str, Any]] = []
combined_latencies: List[float] = []
t0_all = time.time()
for p in range(1, args.passes + 1):
r = run_loop(args.url, slugs, len(slugs), args.limit, warm=(p == 1), path_template=args.path_template)
lat = r.pop("_latencies", [])
combined_latencies.extend(lat)
pass_result = {
"pass": p,
"warm": r["warm"],
"elapsed_s": r["elapsed_s"],
"p50_ms": r["p50_ms"],
"p90_ms": r["p90_ms"],
"p95_ms": r["p95_ms"],
"avg_ms": r["avg_ms"],
"count": r["count"],
}
pass_results.append(pass_result)
total_elapsed = round(time.time() - t0_all, 3)
aggregate = _stats_from_latencies(combined_latencies)
result = {
"mode": mode,
"total_slugs": total_slugs,
"passes": args.passes,
"slugs": slugs,
"combined": {
**aggregate,
"elapsed_s": total_elapsed,
},
"passes_results": pass_results,
"cold_pass_p95_ms": pass_results[0]["p95_ms"],
"warm_pass_p95_ms": pass_results[-1]["p95_ms"],
"cold_pass_p50_ms": pass_results[0]["p50_ms"],
"warm_pass_p50_ms": pass_results[-1]["p50_ms"],
}
print(json.dumps({"event": "preview_perf_result", **result}, indent=2)) # noqa: T201
# Optional warm baseline extraction (final pass only; represents warmed steady-state)
if args.extract_warm_baseline:
try:
wb = pass_results[-1]
warm_obj = {
"event": "preview_perf_warm_baseline",
"mode": mode,
"total_slugs": total_slugs,
"warm_baseline": True,
"source_pass": wb["pass"],
"p50_ms": wb["p50_ms"],
"p90_ms": wb["p90_ms"],
"p95_ms": wb["p95_ms"],
"avg_ms": wb["avg_ms"],
"count": wb["count"],
"slugs": slugs,
}
args.extract_warm_baseline.parent.mkdir(parents=True, exist_ok=True)
args.extract_warm_baseline.write_text(json.dumps(warm_obj, indent=2, sort_keys=True), encoding="utf-8")
print(json.dumps({ # noqa: T201
"event": "preview_perf_warm_baseline_written",
"path": str(args.extract_warm_baseline),
"p95_ms": wb["p95_ms"],
}))
except Exception as e: # pragma: no cover
print(json.dumps({"event": "preview_perf_warm_baseline_error", "error": str(e)})) # noqa: T201
else:
# Warm-up loops first (if requested)
for w in range(args.warm):
run_loop(args.url, slugs, len(slugs), args.limit, warm=True, path_template=args.path_template)
result = run_loop(args.url, slugs, args.loops, args.limit, warm=False, path_template=args.path_template)
result.pop("_latencies", None)
result["slugs"] = slugs
result["mode"] = mode
result["total_slugs"] = total_slugs
if args.all:
result["passes"] = args.passes
print(json.dumps({"event": "preview_perf_result", **result}, indent=2)) # noqa: T201
if args.output:
try:
args.output.parent.mkdir(parents=True, exist_ok=True)
# Ensure we write the final result object (multi-pass already prepared above)
args.output.write_text(json.dumps(result, indent=2, sort_keys=True), encoding="utf-8")
except Exception as e: # pragma: no cover
print(f"ERROR: failed writing output file: {e}", file=sys.stderr)
return 3
return 0
if __name__ == "__main__": # pragma: no cover
raise SystemExit(main(sys.argv[1:]))

code/scripts/preview_perf_ci_check.py
@@ -0,0 +1,75 @@
"""CI helper: run a warm-pass benchmark candidate (single pass over all themes)
then compare against the committed warm baseline with threshold enforcement.
Intended usage (example):
python -m code.scripts.preview_perf_ci_check --url http://localhost:8080 \
--baseline logs/perf/theme_preview_warm_baseline.json --p95-threshold 5
Exit codes:
0 success (within threshold)
2 regression (p95 delta > threshold)
3 setup / usage error
Notes:
- Uses --all --passes 1 to create a fresh candidate snapshot that approximates
a warmed steady-state (server should have background refresh / typical load).
- For a multi-pass run with warm-only comparison, pass --multi-pass (two passes; the second represents warmed steady-state).
"""
from __future__ import annotations
import argparse
import json
import subprocess
import sys
from pathlib import Path
def run(cmd: list[str]) -> subprocess.CompletedProcess:
return subprocess.run(cmd, capture_output=True, text=True, check=False)
def main(argv: list[str]) -> int:
ap = argparse.ArgumentParser(description="Preview performance CI regression gate")
ap.add_argument("--url", default="http://localhost:8080", help="Base URL of running web service")
ap.add_argument("--baseline", type=Path, required=True, help="Path to committed warm baseline JSON")
ap.add_argument("--p95-threshold", type=float, default=5.0, help="Max allowed p95 regression percent (default: %(default)s)")
ap.add_argument("--candidate-output", type=Path, default=Path("logs/perf/theme_preview_ci_candidate.json"), help="Where to write candidate benchmark JSON")
ap.add_argument("--multi-pass", action="store_true", help="Run a 2-pass all-themes benchmark and compare warm pass only (optional enhancement)")
args = ap.parse_args(argv)
if not args.baseline.exists():
print(json.dumps({"event":"ci_perf_error","message":"Baseline not found","path":str(args.baseline)}))
return 3
# Run candidate single-pass all-themes benchmark (no extra warm cycles to keep CI fast)
# If multi-pass requested, run two passes over all themes so second pass represents warmed steady-state.
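    # The gate is roughly equivalent to running these two commands by hand
    # (URL and output path shown with their defaults; <baseline.json> is whatever --baseline points at):
    #   python -m code.scripts.preview_perf_benchmark --url http://localhost:8080 --all \
    #       --passes 1 --output logs/perf/theme_preview_ci_candidate.json
    #   python -m code.scripts.preview_perf_compare --baseline <baseline.json> \
    #       --candidate logs/perf/theme_preview_ci_candidate.json --warm-only --p95-threshold 5.0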
passes = "2" if args.multi_pass else "1"
bench_cmd = [sys.executable, "-m", "code.scripts.preview_perf_benchmark", "--url", args.url, "--all", "--passes", passes, "--output", str(args.candidate_output)]
bench_proc = run(bench_cmd)
if bench_proc.returncode != 0:
print(json.dumps({"event":"ci_perf_error","stage":"benchmark","code":bench_proc.returncode,"stderr":bench_proc.stderr}))
return 3
print(bench_proc.stdout)
if not args.candidate_output.exists():
print(json.dumps({"event":"ci_perf_error","message":"Candidate output missing"}))
return 3
compare_cmd = [
sys.executable,
"-m","code.scripts.preview_perf_compare",
"--baseline", str(args.baseline),
"--candidate", str(args.candidate_output),
"--warm-only",
"--p95-threshold", str(args.p95_threshold),
]
cmp_proc = run(compare_cmd)
print(cmp_proc.stdout)
if cmp_proc.returncode == 2:
# Already printed JSON with failure status
return 2
if cmp_proc.returncode != 0:
print(json.dumps({"event":"ci_perf_error","stage":"compare","code":cmp_proc.returncode,"stderr":cmp_proc.stderr}))
return 3
return 0
if __name__ == "__main__": # pragma: no cover
raise SystemExit(main(sys.argv[1:]))

code/scripts/preview_perf_compare.py
@@ -0,0 +1,115 @@
"""Compare two preview benchmark JSON result files and emit delta stats.
Usage:
python -m code.scripts.preview_perf_compare --baseline logs/perf/theme_preview_baseline_all_pass1_20250923.json --candidate logs/perf/new_run.json
Outputs JSON with percentage deltas for p50/p90/p95/avg (positive = regression/slower).
If multi-pass structures are present (combined & passes_results) those are included.
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import Any, Dict
def load(path: Path) -> Dict[str, Any]:
data = json.loads(path.read_text(encoding="utf-8"))
# Multi-pass result may store stats under combined
if "combined" in data:
core = data["combined"].copy()
# Inject representative fields for uniform comparison
core["p50_ms"] = core.get("p50_ms") or data.get("p50_ms")
core["p90_ms"] = core.get("p90_ms") or data.get("p90_ms")
core["p95_ms"] = core.get("p95_ms") or data.get("p95_ms")
core["avg_ms"] = core.get("avg_ms") or data.get("avg_ms")
data["_core_stats"] = core
else:
data["_core_stats"] = {
k: data.get(k) for k in ("p50_ms", "p90_ms", "p95_ms", "avg_ms", "count")
}
return data
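# Result shapes load() accepts (both as emitted by code.scripts.preview_perf_benchmark):
#   flat result:       {"p50_ms": ..., "p90_ms": ..., "p95_ms": ..., "avg_ms": ..., "count": ..., ...}
#   multi-pass result: {"combined": {...same stats...}, "passes_results": [...], "warm_pass_p95_ms": ..., ...}
# Either way the normalized stats end up under data["_core_stats"] for compare().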
def pct_delta(new: float, old: float) -> float:
if old == 0:
return 0.0
return round(((new - old) / old) * 100.0, 2)
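# Quick worked example (sanity check, not used at runtime):
#   pct_delta(105.0, 100.0) == 5.0   -> candidate is 5% slower (regression)
#   pct_delta(95.0, 100.0) == -5.0   -> candidate is 5% faster
# A zero baseline is defined to yield 0.0 to avoid division by zero.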
def compare(baseline: Dict[str, Any], candidate: Dict[str, Any]) -> Dict[str, Any]:
b = baseline["_core_stats"]
c = candidate["_core_stats"]
result = {"baseline_count": b.get("count"), "candidate_count": c.get("count")}
for k in ("p50_ms", "p90_ms", "p95_ms", "avg_ms"):
if b.get(k) is not None and c.get(k) is not None:
result[k] = {
"baseline": b[k],
"candidate": c[k],
"delta_pct": pct_delta(c[k], b[k]),
}
    # If both results have per-pass details, include the first (cold) and last (warm) pass p95/p50
if "passes_results" in baseline and "passes_results" in candidate:
result["passes"] = {
"baseline": {
"cold_p95": baseline.get("cold_pass_p95_ms"),
"warm_p95": baseline.get("warm_pass_p95_ms"),
"cold_p50": baseline.get("cold_pass_p50_ms"),
"warm_p50": baseline.get("warm_pass_p50_ms"),
},
"candidate": {
"cold_p95": candidate.get("cold_pass_p95_ms"),
"warm_p95": candidate.get("warm_pass_p95_ms"),
"cold_p50": candidate.get("cold_pass_p50_ms"),
"warm_p50": candidate.get("warm_pass_p50_ms"),
},
}
return result
def main(argv: list[str]) -> int:
ap = argparse.ArgumentParser(description="Compare two preview benchmark JSON result files")
ap.add_argument("--baseline", required=True, type=Path, help="Baseline JSON path")
ap.add_argument("--candidate", required=True, type=Path, help="Candidate JSON path")
ap.add_argument("--p95-threshold", type=float, default=None, help="Fail (exit 2) if p95 regression exceeds this percent (positive delta)")
ap.add_argument("--warm-only", action="store_true", help="When both results have passes, compare warm pass p95/p50 instead of combined/core")
args = ap.parse_args(argv)
if not args.baseline.exists():
raise SystemExit(f"Baseline not found: {args.baseline}")
if not args.candidate.exists():
raise SystemExit(f"Candidate not found: {args.candidate}")
baseline = load(args.baseline)
candidate = load(args.candidate)
# If warm-only requested and both have warm pass stats, override _core_stats before compare
if args.warm_only and "warm_pass_p95_ms" in baseline and "warm_pass_p95_ms" in candidate:
baseline["_core_stats"] = {
"p50_ms": baseline.get("warm_pass_p50_ms"),
"p90_ms": baseline.get("_core_stats", {}).get("p90_ms"), # p90 not tracked per-pass; retain combined
"p95_ms": baseline.get("warm_pass_p95_ms"),
"avg_ms": baseline.get("_core_stats", {}).get("avg_ms"),
"count": baseline.get("_core_stats", {}).get("count"),
}
candidate["_core_stats"] = {
"p50_ms": candidate.get("warm_pass_p50_ms"),
"p90_ms": candidate.get("_core_stats", {}).get("p90_ms"),
"p95_ms": candidate.get("warm_pass_p95_ms"),
"avg_ms": candidate.get("_core_stats", {}).get("avg_ms"),
"count": candidate.get("_core_stats", {}).get("count"),
}
cmp = compare(baseline, candidate)
payload = {"event": "preview_perf_compare", **cmp}
if args.p95_threshold is not None and "p95_ms" in cmp:
delta = cmp["p95_ms"]["delta_pct"]
payload["threshold"] = {"p95_threshold": args.p95_threshold, "p95_delta_pct": delta}
if delta is not None and delta > args.p95_threshold:
payload["result"] = "fail"
print(json.dumps(payload, indent=2)) # noqa: T201
return 2
payload["result"] = "pass"
print(json.dumps(payload, indent=2)) # noqa: T201
return 0
if __name__ == "__main__": # pragma: no cover
    raise SystemExit(main(sys.argv[1:]))

code/scripts/snapshot_taxonomy.py
@@ -0,0 +1,94 @@
"""Snapshot the current power bracket taxonomy to a dated JSON artifact.
Outputs a JSON file under logs/taxonomy_snapshots/ named
taxonomy_<YYYYMMDD>_<HHMMSS>.json
containing:
{
"generated_at": ISO8601,
"hash": sha256 hex of canonical payload (excluding this top-level wrapper),
"brackets": [ {level,name,short_desc,long_desc,limits} ... ]
}
If a snapshot with an identical hash already exists (regardless of date), creation is
skipped unless --force is provided.
Usage (from repo root):
python -m code.scripts.snapshot_taxonomy
python -m code.scripts.snapshot_taxonomy --force
Intended to provide an auditable evolution trail for taxonomy adjustments
before we implement taxonomy-aware sampling changes.
"""
from __future__ import annotations
import argparse
import json
import hashlib
from datetime import datetime
from pathlib import Path
from typing import Any, Dict
from code.deck_builder.phases.phase0_core import BRACKET_DEFINITIONS
SNAP_DIR = Path("logs/taxonomy_snapshots")
SNAP_DIR.mkdir(parents=True, exist_ok=True)
def _canonical_brackets():
return [
{
"level": b.level,
"name": b.name,
"short_desc": b.short_desc,
"long_desc": b.long_desc,
"limits": b.limits,
}
for b in sorted(BRACKET_DEFINITIONS, key=lambda x: x.level)
]
def compute_hash(brackets) -> str:
# Canonical JSON with sorted keys for repeatable hash
payload = json.dumps(brackets, sort_keys=True, separators=(",", ":"))
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
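# Sanity check of the canonical form above (illustrative, not executed at import):
#   compute_hash([{"b": 1, "a": 2}]) == compute_hash([{"a": 2, "b": 1}])
# because sort_keys=True makes dict key order irrelevant to the serialized payload.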
def find_existing_hashes() -> Dict[str, Path]:
existing = {}
for p in SNAP_DIR.glob("taxonomy_*.json"):
try:
data = json.loads(p.read_text(encoding="utf-8"))
h = data.get("hash")
if h:
existing[h] = p
except Exception:
continue
return existing
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--force", action="store_true", help="Write new snapshot even if identical hash exists today")
args = ap.parse_args()
brackets = _canonical_brackets()
h = compute_hash(brackets)
existing = find_existing_hashes()
if h in existing and not args.force:
print(f"Snapshot identical (hash={h[:12]}...) exists: {existing[h].name}; skipping.")
return 0
ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
out = SNAP_DIR / f"taxonomy_{ts}.json"
wrapper: Dict[str, Any] = {
"generated_at": datetime.utcnow().isoformat() + "Z",
"hash": h,
"brackets": brackets,
}
out.write_text(json.dumps(wrapper, indent=2, sort_keys=True) + "\n", encoding="utf-8")
print(f"Wrote taxonomy snapshot {out} (hash={h[:12]}...)")
return 0
if __name__ == "__main__": # pragma: no cover
raise SystemExit(main())