eval: fold GEMM/allreduce harnesses into self-contained milestone benches
Move the GEMM + allreduce sweep/render logic out of scripts/ and tests/
into two self-contained eval benches so a user can regenerate every
result + figure with one command:
kernbench run --bench milestone-1h-gemm (MILESTONE_FAST=1 reuses JSON)
kernbench run --bench milestone-1h-ccl
- benches/milestone_1h_{gemm,ccl}.py: single home for each domain; the
run(torch) entry drives the sweeps and writes figures into
benches/1H_milestone_output/{gemm,ccl}/ (gitignored), then submits a
sentinel tensor to satisfy the run_bench contract.
- tests/gemm + tests/sccl helpers and scripts/gemm_sweep.py become thin
re-export/wrapper shims over the benches (single source preserved); the
pytest-only param builders + _run_distributed wrapper stay in the shim.
- eval-bench pattern: a bench may drive many configs + build its own
per-config engines (extends ADR-0045 D5; reverses ADR-0044 D1/D2).
ADR-0054 (EN+KO) records the design; ADR-0043/0044/0045 + CLAUDE.md CLI
Semantics amended; ADR INDEX regenerated. Verified: milestone benches run
clean (ok=True, all artifacts), full suite 67 passed, lang-pairs OK.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,77 @@
|
||||
"""Milestone benches: registration + figure/result generation (ADR-0054).
|
||||
|
||||
``milestone-1h-gemm`` / ``milestone-1h-ccl`` are eval benches: run via the
|
||||
normal ``run_bench`` path, they regenerate every GEMM / allreduce figure +
|
||||
CSV into ``benches/1H_milestone_output/{gemm,ccl}/``. The GEMM bench in
|
||||
``MILESTONE_FAST=1`` mode just re-renders the committed sweep JSON (fast,
|
||||
default-run here); the CCL bench drives both full sweeps (slow, opt-in).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from kernbench.benches.registry import resolve
|
||||
from kernbench.runtime_api.bench_runner import run_bench
|
||||
from kernbench.runtime_api.types import resolve_device
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
import kernbench.benches.milestone_1h_ccl as ccl_bench
|
||||
import kernbench.benches.milestone_1h_gemm as gemm_bench
|
||||
|
||||
_NAME_RE = re.compile(r"^[a-z][a-z0-9]*(-[a-z0-9]+)*$")
|
||||
|
||||
|
||||
def _run(name: str):
    """Run the registered bench ``name`` through ``run_bench`` and return its result.

    The topology is obtained by passing ``"topology.yaml"`` to
    ``resolve_topology``, the bench entry point is the ``run`` attribute of
    the spec returned by ``resolve(name)``, and the device comes from
    ``resolve_device(None)``.
    """

    def _engine_factory(t, d):
        # Use the ``topology_obj`` attribute when the topology has one;
        # otherwise hand the object itself to the engine. ``d`` is accepted
        # only to satisfy the two-argument factory call and is not used.
        engine_topology = getattr(t, "topology_obj", t)
        return GraphEngine(engine_topology, enable_data=True)

    topology = resolve_topology("topology.yaml")
    bench_fn = resolve(name).run
    device = resolve_device(None)
    return run_bench(
        topology=topology,
        bench_fn=bench_fn,
        device=device,
        engine_factory=_engine_factory,
    )
|
||||
|
||||
|
||||
def test_milestone_benches_registered():
    """Check that both milestone benches resolve with well-formed metadata.

    For each bench name: the resolved spec reports the same name, the name
    matches ``_NAME_RE``, and the description is non-empty after stripping
    whitespace.
    """
    milestone_names = ("milestone-1h-gemm", "milestone-1h-ccl")
    for bench_name in milestone_names:
        entry = resolve(bench_name)
        assert entry.name == bench_name
        assert _NAME_RE.match(entry.name)
        assert entry.description.strip()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not gemm_bench.DEFAULT_SWEEP_JSON.exists(),
    reason="gemm_sweep.json absent; run scripts/gemm_sweep.py first",
)
def test_milestone_gemm_fast_generates_figures(monkeypatch):
    """Run the GEMM milestone bench with ``MILESTONE_FAST=1`` and check its PNGs.

    Skipped when ``gemm_bench.DEFAULT_SWEEP_JSON`` does not exist. After the
    run completes successfully, each expected figure must be present in
    ``gemm_bench._OUTPUT_DIR``.
    """
    # monkeypatch restores the environment variable when the test finishes.
    monkeypatch.setenv("MILESTONE_FAST", "1")

    outcome = _run("milestone-1h-gemm")
    assert outcome.completion.ok, outcome.completion

    figure_dir = gemm_bench._OUTPUT_DIR
    expected_figures = (
        "gemm_stage_breakdown.png",
        "gemm_mac_utilization_measured.png",
        "gemm_mac_utilization_theoretical_vs_measured.png",
    )
    for png in expected_figures:
        figure_path = figure_dir / png
        assert figure_path.exists(), f"missing {png}"
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_milestone_ccl_generates_figures():
    """Run the CCL milestone bench and check every expected artifact exists.

    Marked ``slow``. After the run completes successfully, the summary CSV
    and each listed PNG must be present in ``ccl_bench._OUTPUT_DIR``.
    """
    outcome = _run("milestone-1h-ccl")
    assert outcome.completion.ok, outcome.completion

    artifact_dir = ccl_bench._OUTPUT_DIR
    expected_artifacts = (
        "summary.csv",
        "topology.png",
        "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png",
        "AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.png",
        "AllReduce_LRAB_Ring1D_6SiP_1x6.png",
        "AllReduce_LRAB_2Dtorus_6SiP_2x3.png",
        "AllReduce_LRAB_2DMesh_6SiP_2x3.png",
    )
    for artifact in expected_artifacts:
        artifact_path = artifact_dir / artifact
        assert artifact_path.exists(), f"missing {artifact}"
|
||||
Reference in New Issue
Block a user