diff --git a/benches/__init__.py b/benches/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/benches/ipcq_allreduce.py b/benches/ipcq_allreduce.py deleted file mode 100644 index 798173e..0000000 --- a/benches/ipcq_allreduce.py +++ /dev/null @@ -1,2 +0,0 @@ -def run(torch): - print("IPCQ all reduce kernel bench") diff --git a/benches/loader.py b/benches/loader.py deleted file mode 100644 index fd6bb24..0000000 --- a/benches/loader.py +++ /dev/null @@ -1,40 +0,0 @@ -from __future__ import annotations - -import importlib -from collections.abc import Callable -from typing import Any - -from kernbench.runtime_api.context import RuntimeContext - -BenchFn = Callable[[RuntimeContext], Any] - - -def _load_module(bench_id: str): - bench_id = bench_id.strip() - if not bench_id: - raise ValueError("Bench id is empty.") - module_path = f"benches.{bench_id}" - try: - return importlib.import_module(module_path) - except ModuleNotFoundError as e: - raise ValueError( - f"Unknown bench '{bench_id}'. Expected module {module_path}.py" - ) from e - - -def resolve_bench(bench_id: str) -> BenchFn: - """Resolve a bench id into its ``run(torch)`` callable. - - Expected layout (repo root): - benches/.py - def run(torch: RuntimeContext) -> Any - """ - mod = _load_module(bench_id) - run_fn = getattr(mod, "run", None) - if run_fn is None: - raise ValueError( - f"Bench module benches.{bench_id} must define 'run(torch)'." - ) - if not callable(run_fn): - raise ValueError(f"'run' in benches.{bench_id} is not callable.") - return run_fn diff --git a/docs/adr-ko/ADR-0010-api-cli-surface-and-semantics.md b/docs/adr-ko/ADR-0010-api-cli-surface-and-semantics.md index 0e29028..2cb0f8a 100644 --- a/docs/adr-ko/ADR-0010-api-cli-surface-and-semantics.md +++ b/docs/adr-ko/ADR-0010-api-cli-surface-and-semantics.md @@ -6,10 +6,11 @@ Accepted ## Context -`kernbench` CLI는 시뮬레이터의 사용자 대면 진입점이다. 세 개의 서브명령을 +`kernbench` CLI는 시뮬레이터의 사용자 대면 진입점이다. 
네 개의 서브명령을 노출한다: - `run` — 토폴로지에 대해 벤치마크를 실행한다. +- `list` — 등록된 벤치마크 목록을 출력한다. - `probe` — 레이턴시 / 대역폭 측정을 위한 진단 유틸리티. - `web` — 인터랙티브 토폴로지 뷰어. @@ -33,8 +34,9 @@ Accepted - `--topology `: 토폴로지 YAML 파일 경로. `resolve_topology()`를 통해 로드된다. -- `--bench `: 벤치마크 이름. `benches.loader.resolve_bench()`를 - 통해 해석된다. +- `--bench `: 벤치마크 식별자. `kernbench.benches.registry.resolve()`를 + 통해 해석되며, 등록된 kebab-case 이름(예: `gemm-single-pe`) 또는 + `kernbench list` 의 숫자 인덱스를 모두 받는다. 선택 인자: @@ -61,7 +63,22 @@ Accepted CLI는 여러 OS 프로세스나 독립된 시뮬레이션 실행을 생성하지 **않는다** — 병렬성은 단일 시뮬레이션 인스턴스 내부에서 일어난다. -### D4. `kernbench probe` — 레이턴시 / 대역폭 진단 유틸리티 +### D4. `kernbench list` — 등록된 벤치마크 목록 출력 + +인자 없음. 각 등록된 벤치의 자동 부여된 인덱스, 등록된 이름, +한 줄 설명을 출력한다. + +벤치는 `@bench(name=..., description=...)` 데코레이터 +(`kernbench.benches.registry`)를 통해 자기 자신을 등록한다. +`kernbench.benches/` 아래의 언더스코어로 시작하지 않는 모든 모듈은 +반드시 최소 하나의 벤치를 등록해야 한다; 데코레이터가 누락되면 +패키지 import 시점에 `RuntimeError`가 발생한다. + +인덱스는 import 시점에 이름의 알파벳 순으로 부여된다. 인덱스는 +`--bench` 의 축약 표기를 위한 CLI 편의 기능이며 안정적인 API가 +아니다 — 알파벳 순으로 새 벤치가 끼면 이후 인덱스가 밀린다. + +### D5. `kernbench probe` — 레이턴시 / 대역폭 진단 유틸리티 필수 인자: @@ -85,7 +102,7 @@ Probe는 추가로 단조성 불변식을 검증한다 — 예를 들어 local-H 레이턴시 / 대역폭 모델을 검증하기 위한 개발자 도구이다; 벤치마크가 아니다. -### D5. `kernbench web` — 토폴로지 뷰어 +### D6. `kernbench web` — 토폴로지 뷰어 선택 인자: @@ -99,7 +116,7 @@ Probe는 추가로 단조성 불변식을 검증한다 — 예를 들어 local-H - `kernbench web`은 인터랙티브이다 — 팬/줌, 컴포넌트 속성 호버, SIP / CUBE / PE 뷰 간 전환. -### D6. runtime API와 시뮬레이션 엔진은 디바이스 스코프를 유지한다 +### D7. runtime API와 시뮬레이션 엔진은 디바이스 스코프를 유지한다 - runtime API 호출은 호출당 하나의 디바이스에서 동작한다. - 시뮬레이션 엔진은 모든 요청을 결정론적으로 스케줄링한다. @@ -108,6 +125,9 @@ Probe는 추가로 단조성 불변식을 검증한다 — 예를 들어 local-H 이 불변식은 각 레이어를 독립적으로 테스트 가능하게 유지한다; 디바이스 열거와 다중 디바이스 팬아웃은 오직 CLI의 `run` 명령에만 존재한다(D3). +`probe` 구현은 `kernbench.probes` 아래에 있다 (`kernbench.benches`와 +분리됨). 이는 probe가 등록된 벤치가 아니라 진단 유틸리티임을 반영한다. 
+ ## Consequences - 벤치마크 작성자는 단일 디바이스 로직을 작성한다; 다중 디바이스 동작은 diff --git a/docs/adr/ADR-0010-api-cli-surface-and-semantics.md b/docs/adr/ADR-0010-api-cli-surface-and-semantics.md index 4925a64..16b2ec6 100644 --- a/docs/adr/ADR-0010-api-cli-surface-and-semantics.md +++ b/docs/adr/ADR-0010-api-cli-surface-and-semantics.md @@ -7,9 +7,10 @@ Accepted ## Context The `kernbench` CLI is the user-facing entry point of the simulator. It -exposes three subcommands: +exposes four subcommands: - `run` — execute a benchmark against a topology. +- `list` — enumerate registered benches. - `probe` — diagnostic utility for latency / BW measurement. - `web` — interactive topology viewer. @@ -33,8 +34,10 @@ Required arguments: - `--topology `: topology YAML file path. Loaded via `resolve_topology()`. -- `--bench `: benchmark name. Resolved via - `benches.loader.resolve_bench()`. +- `--bench `: benchmark identifier. Resolved via + `kernbench.benches.registry.resolve()`, which accepts either the + registered kebab-case name (e.g., `gemm-single-pe`) or a numeric + index from `kernbench list`. Optional arguments: @@ -63,7 +66,21 @@ When `--device all` (or omitted) and the topology has multiple SIPs: The CLI does NOT spawn multiple OS processes or independent simulation runs — parallelism is internal to one simulation instance. -### D4. `kernbench probe` — latency / BW diagnostic utility +### D4. `kernbench list` — enumerate registered benches + +No arguments. Prints each registered bench's auto-assigned index, +registered name, and one-line description. + +Benches register themselves via the `@bench(name=..., description=...)` +decorator (`kernbench.benches.registry`). Every non-underscore module +under `kernbench.benches/` MUST register at least one bench; a missing +decorator raises `RuntimeError` at package import time. + +Indices are assigned alphabetically by name at import time. 
They are a +CLI convenience (shorthand for `--bench`), not a stable API — a new +bench inserted alphabetically will shift later indices. + +### D5. `kernbench probe` — latency / BW diagnostic utility Required argument: @@ -87,7 +104,7 @@ that local-HBM access ≤ cross-PE-within-cube ≤ cross-cube ≤ cross-SIP — and reports violations. Probe is a developer tool for verifying the latency / BW model; it is not a benchmark. -### D5. `kernbench web` — topology viewer +### D6. `kernbench web` — topology viewer Optional arguments: @@ -102,7 +119,7 @@ the browser. Distinct from the static `docs/diagrams/` artifacts: - `kernbench web` is interactive — pan/zoom, hover for component attributes, switch between SIP / CUBE / PE views. -### D6. Runtime API and simulation engine remain device-scoped +### D7. Runtime API and simulation engine remain device-scoped - Runtime API calls operate on one device per invocation. - The simulation engine schedules all requests deterministically. @@ -112,6 +129,10 @@ This invariant keeps each layer testable in isolation; device enumeration and multi-device fan-out live only in the CLI's `run` command (D3). +The `probe` implementation lives under `kernbench.probes` (separate +from `kernbench.benches`), reflecting that probes are diagnostic +utilities, not registered benches. 
+ ## Consequences - Benchmark authors write single-device logic; multi-device behavior diff --git a/pyproject.toml b/pyproject.toml index 8a5863f..7a86c71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,8 +12,8 @@ dependencies = ["pytest", "simpy", "pyyaml", "fastapi>=0.110", "uvicorn[standard kernbench = "kernbench.cli.main:main" [tool.setuptools.packages.find] -where = ["src", "."] -include = ["kernbench*", "benches*"] +where = ["src"] +include = ["kernbench*"] [project.optional-dependencies] dev = [ diff --git a/scripts/gemm_sweep.py b/scripts/gemm_sweep.py index 5400469..c24909e 100644 --- a/scripts/gemm_sweep.py +++ b/scripts/gemm_sweep.py @@ -117,19 +117,19 @@ def _run_one(M: int, K: int, N: int, topology: str, variant: str = "ref_ref") -> os.environ["MATMUL_N"] = str(N) os.environ["MATMUL_VARIANT"] = variant - # Late imports so env vars are read by benches/matmul_composite at module load. + # Late imports so env vars are read by matmul_composite at module load. # Force re-import to pick up new env values. 
- for mod_name in [m for m in list(sys.modules) if m.startswith("benches.matmul_composite")]: + for mod_name in [m for m in list(sys.modules) if m.startswith("kernbench.benches.matmul_composite")]: del sys.modules[mod_name] - from benches.loader import resolve_bench + from kernbench.benches.registry import resolve as resolve_bench from kernbench.runtime_api.bench_runner import run_bench from kernbench.runtime_api.types import resolve_device from kernbench.sim_engine.engine import GraphEngine from kernbench.topology.builder import resolve_topology topo = resolve_topology(topology) - bench = resolve_bench("matmul_composite") + bench = resolve_bench("matmul-composite").run device = resolve_device(None) t0 = time.time() diff --git a/src/kernbench/benches/__init__.py b/src/kernbench/benches/__init__.py new file mode 100644 index 0000000..bb9b3e8 --- /dev/null +++ b/src/kernbench/benches/__init__.py @@ -0,0 +1,9 @@ +"""kernbench.benches: eager-import sibling modules so @bench fires. + +Underscore-prefixed modules are treated as helpers and skipped. +After import, every imported module must have registered at least one +bench, or a RuntimeError is raised by the audit. 
+""" +from kernbench.benches.registry import _eager_import_and_audit + +_eager_import_and_audit(__path__, __name__) diff --git a/benches/ccl_allreduce.py b/src/kernbench/benches/ccl_allreduce.py similarity index 95% rename from benches/ccl_allreduce.py rename to src/kernbench/benches/ccl_allreduce.py index e521da2..3f0b228 100644 --- a/benches/ccl_allreduce.py +++ b/src/kernbench/benches/ccl_allreduce.py @@ -14,6 +14,7 @@ from dataclasses import dataclass import numpy as np +from kernbench.benches.registry import bench from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config from kernbench.policy.placement.dp import DPPolicy @@ -95,6 +96,10 @@ def _worker(rank: int, cfg: _BenchCfg, torch) -> None: _report(tensor.numpy(), cfg) +@bench( + name="ccl-allreduce", + description="CCL all-reduce bench (TP launcher; rank = SIP).", +) def run(torch) -> None: torch.distributed.init_process_group(backend="ahbm") cfg = _resolve_cfg(torch) diff --git a/benches/gemm_single_pe.py b/src/kernbench/benches/gemm_single_pe.py similarity index 89% rename from benches/gemm_single_pe.py rename to src/kernbench/benches/gemm_single_pe.py index 6142ddd..cae025b 100644 --- a/benches/gemm_single_pe.py +++ b/src/kernbench/benches/gemm_single_pe.py @@ -10,6 +10,7 @@ per-tile DMA internally. 
Run: kernbench run gemm_single_pe """ +from kernbench.benches.registry import bench from kernbench.policy.placement.dp import DPPolicy # GEMM dimensions: (M, K) x (K, N) → (M, N) @@ -27,6 +28,10 @@ def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"): tl.wait(h) +@bench( + name="gemm-single-pe", + description="Single-PE GEMM via scheduler_v2 (pe_accel).", +) def run(torch): """Run the single-PE GEMM benchmark.""" dp = DPPolicy(cube="replicate", pe="replicate", diff --git a/benches/gpt3_qkv.py b/src/kernbench/benches/gpt3_qkv.py similarity index 96% rename from benches/gpt3_qkv.py rename to src/kernbench/benches/gpt3_qkv.py index 56e80f1..d6df5cc 100644 --- a/benches/gpt3_qkv.py +++ b/src/kernbench/benches/gpt3_qkv.py @@ -20,6 +20,7 @@ topology.yaml is unchanged. Run: kernbench run gpt3_qkv """ +from kernbench.benches.registry import bench from kernbench.policy.placement.dp import DPPolicy # -- PE configuration (DPPolicy overrides — does not change topology.yaml) ----- @@ -66,6 +67,10 @@ def _gpt3_qkv_kernel(x_ptr, wq_ptr, wk_ptr, wv_ptr, tl.wait(hv) +@bench( + name="gpt3-qkv", + description="GPT-3 QKV projection sharded column-wise across all PEs.", +) def run(torch): """Run the GPT-3 QKV benchmark.""" M = SEQ_LEN diff --git a/src/kernbench/benches/ipcq_allreduce.py b/src/kernbench/benches/ipcq_allreduce.py new file mode 100644 index 0000000..f1f5f5c --- /dev/null +++ b/src/kernbench/benches/ipcq_allreduce.py @@ -0,0 +1,9 @@ +from kernbench.benches.registry import bench + + +@bench( + name="ipcq-allreduce", + description="IPCQ all-reduce kernel bench (placeholder).", +) +def run(torch): + print("IPCQ all reduce kernel bench") diff --git a/benches/matmul_composite.py b/src/kernbench/benches/matmul_composite.py similarity index 93% rename from benches/matmul_composite.py rename to src/kernbench/benches/matmul_composite.py index 0365d6e..938e00b 100644 --- a/benches/matmul_composite.py +++ b/src/kernbench/benches/matmul_composite.py @@ -17,6 +17,7 @@ Run: 
""" import os +from kernbench.benches.registry import bench from kernbench.policy.placement.dp import DPPolicy M = int(os.environ.get("MATMUL_M", "256")) @@ -57,6 +58,10 @@ _KERNELS = { } +@bench( + name="matmul-composite", + description="Single-PE composite GEMM with ref/load variants for perf characterization.", +) def run(torch): if VARIANT not in _KERNELS: raise ValueError(f"unknown MATMUL_VARIANT={VARIANT!r}; " diff --git a/benches/qkv_gemm.py b/src/kernbench/benches/qkv_gemm.py similarity index 91% rename from benches/qkv_gemm.py rename to src/kernbench/benches/qkv_gemm.py index a8ad8f2..f869f38 100644 --- a/benches/qkv_gemm.py +++ b/src/kernbench/benches/qkv_gemm.py @@ -7,6 +7,7 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait() - Tensor a is loaded into TCM via DMA - Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32) """ +from kernbench.benches.registry import bench from kernbench.policy.placement.dp import DPPolicy # GEMM dimensions: (M, K) x (K, N) → (M, N) @@ -28,6 +29,10 @@ def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"): tl.wait(handle) +@bench( + name="qkv-gemm", + description="QKV GEMM (Q*K^T) on a single PE — full host-to-PE pipeline.", +) def run(torch): """Run the QKV GEMM benchmark.""" # DP placement: a=replicate (cube-level), b/out=column_wise (N-axis, single PE) diff --git a/benches/qkv_gemm_multi_pe.py b/src/kernbench/benches/qkv_gemm_multi_pe.py similarity index 90% rename from benches/qkv_gemm_multi_pe.py rename to src/kernbench/benches/qkv_gemm_multi_pe.py index 2aced00..62cbee2 100644 --- a/benches/qkv_gemm_multi_pe.py +++ b/src/kernbench/benches/qkv_gemm_multi_pe.py @@ -7,6 +7,7 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait() - Tensor a is loaded into TCM via DMA - Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32) """ +from kernbench.benches.registry import bench from kernbench.policy.placement.dp import DPPolicy # GEMM dimensions: (M, K) x (K, N) -> (M, 
N) @@ -28,6 +29,10 @@ def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"): tl.wait(handle) +@bench( + name="qkv-gemm-multi-pe", + description="Column-parallel QKV GEMM across all PEs in a cube (multi-PE).", +) def run(torch): """Run the multi-PE QKV GEMM benchmark.""" # DP placement: a=replicate (cube-level), b/out=column_wise (N-axis split) diff --git a/src/kernbench/benches/registry.py b/src/kernbench/benches/registry.py new file mode 100644 index 0000000..58d8af8 --- /dev/null +++ b/src/kernbench/benches/registry.py @@ -0,0 +1,106 @@ +"""Bench registry: @bench decorator + name/index resolution. + +Each bench module under ``kernbench.benches`` MUST register its callable +via ``@bench(name=..., description=...)``. Indices are assigned +alphabetically by name after eager import; they are a CLI convenience, +not a stable API. +""" +from __future__ import annotations + +import re +from collections.abc import Callable +from dataclasses import dataclass +from importlib import import_module +from pkgutil import iter_modules + +BenchFn = Callable[..., object] + +_NAME_RE = re.compile(r"^[a-z][a-z0-9]*(-[a-z0-9]+)*$") + + +@dataclass(frozen=True) +class BenchSpec: + index: int + name: str + description: str + run: BenchFn + + +_PENDING: list[tuple[str, str, BenchFn]] = [] +_REGISTERED_MODULES: set[str] = set() +_REGISTRY: dict[str, BenchSpec] = {} + + +def bench(*, name: str, description: str) -> Callable[[BenchFn], BenchFn]: + if not isinstance(name, str) or not _NAME_RE.match(name): + raise ValueError( + f"bench name {name!r} must be kebab-case (lowercase, digits, dashes; " + f"starts with a letter)." 
+ ) + if not isinstance(description, str) or not description.strip(): + raise ValueError(f"bench {name!r}: description must be a non-empty string.") + + def deco(fn: BenchFn) -> BenchFn: + _PENDING.append((name, description, fn)) + _REGISTERED_MODULES.add(fn.__module__) + return fn + + return deco + + +def _finalize() -> None: + if _REGISTRY: + return + seen: set[str] = set() + for n, _, _ in _PENDING: + if n in seen: + raise RuntimeError(f"duplicate bench name: {n!r}") + seen.add(n) + for i, (n, d, f) in enumerate(sorted(_PENDING, key=lambda t: t[0]), start=1): + _REGISTRY[n] = BenchSpec(index=i, name=n, description=d, run=f) + + +def list_all() -> list[BenchSpec]: + _finalize() + return sorted(_REGISTRY.values(), key=lambda s: s.index) + + +def resolve(identifier: str) -> BenchSpec: + _finalize() + if not isinstance(identifier, str) or not identifier.strip(): + raise ValueError("bench identifier must be a non-empty string.") + ident = identifier.strip() + if ident.isdigit(): + idx = int(ident) + for s in _REGISTRY.values(): + if s.index == idx: + return s + raise ValueError( + f"No bench with index {idx}. Use 'kernbench list' to see options." + ) + if ident in _REGISTRY: + return _REGISTRY[ident] + raise ValueError( + f"Unknown bench {ident!r}. Use 'kernbench list' to see options." + ) + + +def _audit_modules(imported: list[str], registered: set[str]) -> None: + missing = sorted(m for m in imported if m not in registered) + if missing: + raise RuntimeError( + f"Bench module(s) missing @bench decorator: {missing}. " + f"Each file under kernbench.benches/ must register at least one bench " + f"via @bench(...), or be renamed with a leading underscore if it is a " + f"helper." 
+ ) + + +def _eager_import_and_audit(pkg_path: list[str], pkg_name: str) -> None: + imported: list[str] = [] + for m in iter_modules(pkg_path): + if m.name == "registry" or m.name.startswith("_"): + continue + mod = import_module(f"{pkg_name}.{m.name}") + imported.append(mod.__name__) + _audit_modules(imported, _REGISTERED_MODULES) diff --git a/benches/va_offset_verify.py b/src/kernbench/benches/va_offset_verify.py similarity index 90% rename from benches/va_offset_verify.py rename to src/kernbench/benches/va_offset_verify.py index 7c2bdd1..6721870 100644 --- a/benches/va_offset_verify.py +++ b/src/kernbench/benches/va_offset_verify.py @@ -9,6 +9,7 @@ The kernel uses standard Triton patterns: - tl.num_programs(0) for PE count within cube - Shape args are automatically localized by launch() """ +from kernbench.benches.registry import bench from kernbench.policy.placement.dp import DPPolicy M, K = 128, 256 @@ -26,6 +27,10 @@ def _copy_kernel(src_ptr, dst_ptr, M, K, tl, DTYPE="f16"): tl.store(dst_ptr + offset, data) +@bench( + name="va-offset-verify", + description="Triton base_ptr + pid * stride VA addressing verification (TP sharded).", +) def run(torch): """Run the VA offset verification benchmark with full TP sharding.""" dp = DPPolicy(cube="column_wise", pe="column_wise") diff --git a/src/kernbench/cli/main.py b/src/kernbench/cli/main.py index a5c2ec2..958a50f 100644 --- a/src/kernbench/cli/main.py +++ b/src/kernbench/cli/main.py @@ -1,10 +1,10 @@ import argparse import sys -from benches.loader import resolve_bench -from kernbench.cli.probe import cmd_probe +from kernbench.benches.registry import list_all, resolve from kernbench.cli.report import format_report from kernbench.common.types import SimEngine +from kernbench.probes.probe import cmd_probe from kernbench.runtime_api.bench_runner import run_bench from kernbench.runtime_api.types import DeviceSelector, resolve_device from kernbench.sim_engine.engine import GraphEngine @@ -17,7 +17,10 @@ def build_parser() 
-> argparse.ArgumentParser: runp = sub.add_parser("run", help="Run a benchmark") runp.add_argument("--topology", required=True) - runp.add_argument("--bench", required=True) + runp.add_argument( + "--bench", required=True, + help="Bench name (kebab-case) or numeric index from 'kernbench list'", + ) runp.add_argument( "--device", default=None, help="Target device: 'all' or 'sip:' (default: all)" ) @@ -27,6 +30,9 @@ def build_parser() -> argparse.ArgumentParser: ) runp.set_defaults(_handler=cmd_run) + listp = sub.add_parser("list", help="List registered benches") + listp.set_defaults(_handler=cmd_list) + probep = sub.add_parser("probe", help="Probe latency and BW for predefined traffic patterns") probep.add_argument("--topology", required=True) probep.add_argument("--case", default="all", help="Case name or 'all' (default: all)") @@ -53,23 +59,34 @@ def cmd_web(args) -> int: return 0 +def cmd_list(args) -> int: + specs = list_all() + print(f"{'#':>3} {'NAME':<22} DESCRIPTION") + print("-" * 80) + for s in specs: + print(f"{s.index:>3} {s.name:<22} {s.description}") + return 0 + + def cmd_run(args) -> int: print("> Running benchmark with:", args) topo = resolve_topology(args.topology) - bench = resolve_bench(args.bench) + spec_entry = resolve(args.bench) device = resolve_device(args.device) verify_data = getattr(args, "verify_data", False) def _factory(topology, device): return engine_factory(topology, device, enable_data=verify_data) - result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=_factory) + result = run_bench( + topology=topo, bench_fn=spec_entry.run, device=device, engine_factory=_factory, + ) topo_obj = getattr(topo, "topology_obj", topo) spec = getattr(topo_obj, "spec", None) if result.traces: - print(format_report(result.traces, title=args.bench, spec=spec)) + print(format_report(result.traces, title=spec_entry.name, spec=spec)) print(result.summary_text()) # Phase 2 diagnostic summary (ADR-0020). 
The actual Phase 2 replay diff --git a/src/kernbench/probes/__init__.py b/src/kernbench/probes/__init__.py new file mode 100644 index 0000000..7635aa1 --- /dev/null +++ b/src/kernbench/probes/__init__.py @@ -0,0 +1,5 @@ +"""kernbench.probes: latency/BW diagnostic utilities (not benchmarks). + +See ADR-0010 D5. Probe is a developer tool for verifying the latency/BW +model; it bypasses the bench registry. +""" diff --git a/src/kernbench/cli/probe.py b/src/kernbench/probes/probe.py similarity index 100% rename from src/kernbench/cli/probe.py rename to src/kernbench/probes/probe.py diff --git a/tests/test_bench_registry.py b/tests/test_bench_registry.py new file mode 100644 index 0000000..b6687de --- /dev/null +++ b/tests/test_bench_registry.py @@ -0,0 +1,95 @@ +"""Tests for kernbench.benches.registry — @bench decorator + resolve/list.""" +from __future__ import annotations + +import pytest + +from kernbench.benches import registry + + +EXPECTED_NAMES = [ + "ccl-allreduce", + "gemm-single-pe", + "gpt3-qkv", + "ipcq-allreduce", + "matmul-composite", + "qkv-gemm", + "qkv-gemm-multi-pe", + "va-offset-verify", +] + + +def test_registry_lists_all_benches(): + specs = registry.list_all() + names = [s.name for s in specs] + assert names == EXPECTED_NAMES + + +def test_registry_indices_are_1_based_sorted_by_name(): + specs = registry.list_all() + assert [s.index for s in specs] == list(range(1, len(EXPECTED_NAMES) + 1)) + assert sorted(s.name for s in specs) == [s.name for s in specs] + + +def test_resolve_by_name_returns_spec(): + spec = registry.resolve("gemm-single-pe") + assert spec.name == "gemm-single-pe" + assert callable(spec.run) + assert spec.description.strip() + + +def test_resolve_by_index_string_matches_list_order(): + specs = registry.list_all() + third = specs[2] + resolved = registry.resolve(str(third.index)) + assert resolved is third + + +def test_resolve_unknown_name_raises(): + with pytest.raises(ValueError, match="kernbench list"): + 
registry.resolve("does-not-exist") + + +def test_resolve_unknown_index_raises(): + with pytest.raises(ValueError, match="kernbench list"): + registry.resolve("99") + + +def test_resolve_empty_identifier_raises(): + with pytest.raises(ValueError): + registry.resolve("") + + +def test_bench_decorator_rejects_invalid_name(): + with pytest.raises(ValueError, match="kebab-case"): + registry.bench(name="Invalid_Name", description="x") + + +def test_bench_decorator_rejects_empty_description(): + with pytest.raises(ValueError, match="non-empty"): + registry.bench(name="ok-name", description=" ") + + +def test_audit_raises_on_missing_decorator(): + with pytest.raises(RuntimeError, match="missing @bench decorator"): + registry._audit_modules( + imported=["kernbench.benches.fake_no_dec", "kernbench.benches.real"], + registered={"kernbench.benches.real"}, + ) + + +def test_audit_passes_when_all_registered(): + registry._audit_modules( + imported=["kernbench.benches.a", "kernbench.benches.b"], + registered={"kernbench.benches.a", "kernbench.benches.b"}, + ) + + +def test_duplicate_name_at_finalize_fails(monkeypatch): + """_finalize() rejects two pending entries with the same name.""" + monkeypatch.setattr(registry, "_PENDING", [ + ("dup", "d1", lambda: None), + ("dup", "d2", lambda: None), + ]) + monkeypatch.setattr(registry, "_REGISTRY", {}) + with pytest.raises(RuntimeError, match="duplicate bench name"): + registry._finalize() diff --git a/tests/test_cli.py b/tests/test_cli.py index 5482f7e..5fd25a8 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,17 +6,17 @@ def test_cli_main_arg_parsing(monkeypatch): def fake_cmd_run(args) -> int: assert args.cmd == "run" assert args.topology == "topology.yaml" - assert args.bench == "qkv_gemm" + assert args.bench == "qkv-gemm" assert args.device == None return 0 # monkey patch the handler to test arg parsing without running the actual bench monkeypatch.setattr(cli_main, "cmd_run", fake_cmd_run) - rc = cli_main.main(["run", 
"--topology", "topology.yaml", "--bench", "qkv_gemm"]) + rc = cli_main.main(["run", "--topology", "topology.yaml", "--bench", "qkv-gemm"]) assert rc == 0 def test_cli_main(): """CLI bench run on single SIP device.""" - rc = cli_main.main(["run", "--topology", "topology.yaml", "--bench", "qkv_gemm", "--device", "sip:0"]) + rc = cli_main.main(["run", "--topology", "topology.yaml", "--bench", "qkv-gemm", "--device", "sip:0"]) assert rc == 0 diff --git a/tests/test_cli_list.py b/tests/test_cli_list.py new file mode 100644 index 0000000..92d2152 --- /dev/null +++ b/tests/test_cli_list.py @@ -0,0 +1,44 @@ +"""Tests for `kernbench list` subcommand and `--bench ` resolution.""" +from __future__ import annotations + +import kernbench.cli.main as cli_main +from kernbench.benches import registry + + +def test_cli_list_outputs_all_benches(capsys): + rc = cli_main.main(["list"]) + assert rc == 0 + out = capsys.readouterr().out + for spec in registry.list_all(): + assert spec.name in out + assert "DESCRIPTION" in out + + +def test_cli_run_by_index(monkeypatch): + """CLI accepts numeric index for --bench; same callable as the name.""" + qkv_spec = registry.resolve("qkv-gemm") + + captured = {} + + def fake_run_bench(*, topology, bench_fn, device, engine_factory): + captured["bench_fn"] = bench_fn + + class _R: + traces = [] + engine = None + + class completion: + ok = True + + def summary_text(self): + return "" + return _R() + + monkeypatch.setattr(cli_main, "run_bench", fake_run_bench) + rc = cli_main.main([ + "run", "--topology", "topology.yaml", + "--bench", str(qkv_spec.index), + "--device", "sip:0", + ]) + assert rc == 0 + assert captured["bench_fn"] is qkv_spec.run diff --git a/tests/test_cli_verify_data.py b/tests/test_cli_verify_data.py index 9c71f79..de8e405 100644 --- a/tests/test_cli_verify_data.py +++ b/tests/test_cli_verify_data.py @@ -11,7 +11,7 @@ def test_cli_verify_data_flag_parsed(monkeypatch): monkeypatch.setattr(cli_main, "cmd_run", fake_cmd_run) rc = 
cli_main.main([ - "run", "--topology", "topology.yaml", "--bench", "qkv_gemm", + "run", "--topology", "topology.yaml", "--bench", "qkv-gemm", "--verify-data", ]) assert rc == 0 @@ -26,7 +26,7 @@ def test_cli_verify_data_flag_default(monkeypatch): monkeypatch.setattr(cli_main, "cmd_run", fake_cmd_run) rc = cli_main.main([ - "run", "--topology", "topology.yaml", "--bench", "qkv_gemm", + "run", "--topology", "topology.yaml", "--bench", "qkv-gemm", ]) assert rc == 0 @@ -34,7 +34,7 @@ def test_cli_verify_data_flag_default(monkeypatch): def test_cmd_run_verify_data_enables_engine(): """--verify-data runs full pipeline with enable_data=True and DataExecutor.""" rc = cli_main.main([ - "run", "--topology", "topology.yaml", "--bench", "qkv_gemm", + "run", "--topology", "topology.yaml", "--bench", "qkv-gemm", "--device", "sip:0", "--verify-data", ]) assert rc == 0 @@ -43,7 +43,7 @@ def test_cmd_run_verify_data_enables_engine(): def test_cmd_run_without_verify_data_no_op_log(): """Without --verify-data, engine runs in timing-only mode (no op_log).""" rc = cli_main.main([ - "run", "--topology", "topology.yaml", "--bench", "qkv_gemm", + "run", "--topology", "topology.yaml", "--bench", "qkv-gemm", "--device", "sip:0", ]) assert rc == 0 diff --git a/tests/test_mmu_fabric.py b/tests/test_mmu_fabric.py index 62a2ad3..f59acb0 100644 --- a/tests/test_mmu_fabric.py +++ b/tests/test_mmu_fabric.py @@ -235,7 +235,7 @@ def test_qkv_gemm_still_passes(): correlation_id="test_regression", spec=graph.spec, ) - from benches.qkv_gemm import run as bench_run + from kernbench.benches.qkv_gemm import run as bench_run bench_run(ctx) ctx.wait_all() # If we get here without exception, the benchmark succeeded diff --git a/tests/test_pe_components.py b/tests/test_pe_components.py index 99cb024..b5f53c6 100644 --- a/tests/test_pe_components.py +++ b/tests/test_pe_components.py @@ -864,7 +864,7 @@ def test_mcpu_kernel_launch_composite(): def test_qkv_gemm_bench_completes(): """The qkv_gemm benchmark runs 
to completion without error.""" clear_registry() - from benches.qkv_gemm import run as bench_run + from kernbench.benches.qkv_gemm import run as bench_run from kernbench.runtime_api.context import RuntimeContext graph = load_topology(TOPOLOGY_PATH) @@ -958,7 +958,7 @@ def test_mcpu_multi_pe_kernel_launch(): def test_qkv_gemm_bench_multi_pe_completes(): """The qkv_gemm_multi_pe benchmark runs to completion without error.""" clear_registry() - from benches.qkv_gemm_multi_pe import run as bench_run + from kernbench.benches.qkv_gemm_multi_pe import run as bench_run from kernbench.runtime_api.context import RuntimeContext graph = load_topology(TOPOLOGY_PATH) diff --git a/tests/test_probe.py b/tests/test_probe.py index f6a4a22..d20a414 100644 --- a/tests/test_probe.py +++ b/tests/test_probe.py @@ -263,7 +263,7 @@ def test_pe_cross_cube_best_worst(): def test_probe_timestamp_trace(): """_hop_timestamps must return monotonically increasing cumulative timestamps.""" - from kernbench.cli.probe import _hop_timestamps, _build_edge_map + from kernbench.probes.probe import _hop_timestamps, _build_edge_map graph = _graph() edge_map = _build_edge_map(graph) resolver = AddressResolver(graph) @@ -341,7 +341,7 @@ def test_hbm_efficiency_applied(): def test_probe_sweep_saturation(): """Utilization at 1MB must exceed utilization at 4KB for pe-local-hbm.""" - from kernbench.cli.probe import _sweep_util + from kernbench.probes.probe import _sweep_util # pe-local-hbm: ovhd=2ns (router), wire~0.03ns, bn from topology bn = _hbm_effective_bw() u = _sweep_util(2.0, 0.03, bn) diff --git a/tests/test_va_offset.py b/tests/test_va_offset.py index 9ea2eb3..3e4360e 100644 --- a/tests/test_va_offset.py +++ b/tests/test_va_offset.py @@ -143,7 +143,7 @@ def test_2d_bench_completes(): engine=engine, target_device=DeviceSelector("sip:0"), correlation_id="vo3", spec=graph.spec, ) - from benches.va_offset_verify import run as bench_run + from kernbench.benches.va_offset_verify import run as bench_run 
bench_run(ctx) ctx.wait_all()