benches: package as kernbench.benches, add @bench registry + list subcommand

Move benches/ -> src/kernbench/benches/ and src/kernbench/cli/probe.py -> src/kernbench/probes/probe.py. Each bench self-registers via @bench(name=..., description=...); kernbench list enumerates benches with auto-assigned indices, --bench accepts kebab-case name or numeric index. Audit at package-import time fails if any non-underscore module forgets the decorator. ADR-0010 (EN + KO) updated to reflect the new resolver path, list subcommand, and probes package separation. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 14:42:10 -07:00
parent 168b0c89f0
commit 049e3d8bb3
28 changed files with 398 additions and 79 deletions
@@ -0,0 +1,9 @@
+"""kernbench.benches: eager-import sibling modules so @bench fires.
+
+Underscore-prefixed modules are treated as helpers and skipped.
+After import, every imported module must have registered at least one
+bench, or a RuntimeError is raised by the audit.
+"""
+from kernbench.benches.registry import _eager_import_and_audit
+
+_eager_import_and_audit(__path__, __name__)
@@ -0,0 +1,108 @@
+"""CCL all-reduce bench (ADR-0024 + ADR-0027).
+
+Pure TP launcher model: rank = SIP. Each rank owns a ``(N_CUBES, n_elem)``
+tensor sharded row-wise across the cube mesh (pe0 per cube). After
+``dist.all_reduce(op="sum")`` every cube on every rank must hold
+``N_CUBES * sum(1..world_size)``. Rank 0 prints the pass/fail line.
+
+Driven by ``ccl.yaml`` (``defaults.algorithm``, ``n_elem``) + ``topology.yaml``
+(SIP count → world_size, cube_mesh → N_CUBES).
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+
+from kernbench.benches.registry import bench
+from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
+from kernbench.policy.placement.dp import DPPolicy
+
+DEFAULT_N_ELEM = 8
+
+
+@dataclass(frozen=True)
+class _BenchCfg:
+    algorithm: str
+    n_elem: int
+    n_cubes: int
+    world_size: int
+
+
+def _resolve_cfg(torch) -> _BenchCfg:
+    """Read ccl.yaml + topology once at host side."""
+    merged = resolve_algorithm_config(load_ccl_config())
+    ws = torch.distributed.get_world_size()
+    spec = torch.spec or {}
+    n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
+    if ws != n_sips:
+        raise RuntimeError(
+            f"ccl_allreduce bench requires world_size == topology SIP count "
+            f"(world_size={ws}, n_sips={n_sips})."
+        )
+    cm = spec.get("sip", {}).get("cube_mesh", {})
+    n_cubes = int(cm.get("w", 4)) * int(cm.get("h", 4))
+    return _BenchCfg(
+        algorithm=merged["algorithm"],
+        n_elem=int(merged.get("n_elem", DEFAULT_N_ELEM)),
+        n_cubes=n_cubes,
+        world_size=ws,
+    )
+
+
+def _rank_dp(n_cubes: int) -> DPPolicy:
+    return DPPolicy(cube="row_wise", pe="replicate", num_cubes=n_cubes, num_pes=1)
+
+
+def _allocate_rank_tensor(torch, rank: int, cfg: _BenchCfg):
+    """Allocate this rank's ``(n_cubes, n_elem)`` tensor on its SIP."""
+    return torch.zeros(
+        (cfg.n_cubes, cfg.n_elem), dtype="f16",
+        dp=_rank_dp(cfg.n_cubes), name=f"ccl_in_r{rank}",
+    )
+
+
+def _init_with_rank_value(torch, tensor, rank: int, cfg: _BenchCfg) -> None:
+    """Fill all cubes with the scalar ``rank + 1``."""
+    arr = np.full((cfg.n_cubes, cfg.n_elem), float(rank + 1), dtype=np.float16)
+    tensor.copy_(torch.from_numpy(arr))
+
+
+def _report(result: np.ndarray, cfg: _BenchCfg) -> None:
+    """Single-line pass/fail printer (rank 0 only)."""
+    expected = float(cfg.n_cubes * sum(range(1, cfg.world_size + 1)))
+    ok = True
+    for cube_id in range(cfg.n_cubes):
+        if not np.allclose(result[cube_id], expected, rtol=1e-1, atol=1e-1):
+            ok = False
+            break
+    if ok:
+        total = cfg.world_size * cfg.n_cubes
+        print(f"  {cfg.algorithm} (ws={cfg.world_size}): {total} OK")
+        return
+    got = float(result.reshape(-1).mean())
+    print(
+        f"  [FAIL] {cfg.algorithm} (ws={cfg.world_size}): "
+        f"got mean={got:.3f}, expected={expected:.3f}"
+    )
+
+
+def _worker(rank: int, cfg: _BenchCfg, torch) -> None:
+    torch.ahbm.set_device(rank)
+    tensor = _allocate_rank_tensor(torch, rank, cfg)
+    _init_with_rank_value(torch, tensor, rank, cfg)
+    torch.distributed.all_reduce(tensor, op="sum")
+    if rank == 0:
+        _report(tensor.numpy(), cfg)
+
+
+@bench(
+    name="ccl-allreduce",
+    description="CCL all-reduce bench (TP launcher; rank = SIP).",
+)
+def run(torch) -> None:
+    torch.distributed.init_process_group(backend="ahbm")
+    cfg = _resolve_cfg(torch)
+    torch.multiprocessing.spawn(
+        _worker, args=(cfg, torch), nprocs=cfg.world_size,
+    )
@@ -0,0 +1,44 @@
+"""Single-PE GEMM benchmark via scheduler_v2 (pe_accel).
+
+Full host-to-PE pipeline:
+  Host → PCIE_EP → IO_CPU → M_CPU → PE_CPU → SchedulerV2 → PE_DMA → HBM
+
+Single PE: num_cubes=1, num_pes=1 via DPPolicy override.
+Both operands use tl.ref (HBM-resident); scheduler_v2 tiles and streams
+per-tile DMA internally.
+
+Run:
+    kernbench run gemm_single_pe
+"""
+from kernbench.benches.registry import bench
+from kernbench.policy.placement.dp import DPPolicy
+
+# GEMM dimensions: (M, K) x (K, N) → (M, N)
+M, K, N = 32, 128, 32
+DTYPE = "f16"
+
+
+def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
+    """Single-PE GEMM: out = a @ b. Both operands streamed from HBM by scheduler."""
+    M, K, N = int(M), int(K), int(N)
+
+    a = tl.ref(int(a_ptr), shape=(M, K), dtype=DTYPE)
+    b = tl.ref(int(b_ptr), shape=(K, N), dtype=DTYPE)
+    h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
+    tl.wait(h)
+
+
+@bench(
+    name="gemm-single-pe",
+    description="Single-PE GEMM via scheduler_v2 (pe_accel).",
+)
+def run(torch):
+    """Run the single-PE GEMM benchmark."""
+    dp = DPPolicy(cube="replicate", pe="replicate",
+                  num_cubes=1, num_pes=1)
+
+    a   = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
+    b   = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
+    out = torch.empty((M, N), dtype=DTYPE, dp=dp, name="out")
+
+    torch.launch("gemm_single_pe", _gemm_kernel, a, b, out, M, K, N)
@@ -0,0 +1,101 @@
+"""GPT-3 QKV projection benchmark: sharded across PEs via pe_accel_v1.
+
+GPT-3 architecture:
+  d_model  = 12288   (hidden dimension)
+  n_heads  = 96      (attention heads)
+  d_head   = 128     (dimension per head)
+
+Sharding strategy (column-wise across all PEs):
+  X        : (seq_len, d_model)       -- replicated to all PEs
+  W_Q/K/V  : (d_model, d_model)       -- column-wise sharded across cubes × PEs
+  out_Q/K/V: (seq_len, d_model)       -- column-wise sharded across cubes × PEs
+
+Each PE computes:
+  Q_slice = X @ W_Q_slice :  (seq_len, d_model) @ (d_model, cols_per_pe) -> (seq_len, cols_per_pe)
+  K_slice, V_slice: same
+
+PE count is configurable via N_CUBES × N_PE_PER_CUBE (DPPolicy override).
+topology.yaml is unchanged.
+
+Run:
+    kernbench run gpt3_qkv
+"""
+from kernbench.benches.registry import bench
+from kernbench.policy.placement.dp import DPPolicy
+
+# -- PE configuration (DPPolicy overrides — does not change topology.yaml) -----
+N_SIPS        = 1
+N_CUBES       = 16    # cubes per SIP
+N_PE_PER_CUBE = 8     # PEs per cube
+N_PES         = N_CUBES * N_PE_PER_CUBE  # 128 total
+
+# -- GPT-3 architecture -------------------------------------------------------
+GPT3_D_MODEL = 12288
+SEQ_LEN      = 32
+COLS_PER_PE  = GPT3_D_MODEL // N_PES  # 12288 / 128 = 96
+DTYPE         = "f16"
+
+
+def _gpt3_qkv_kernel(x_ptr, wq_ptr, wk_ptr, wv_ptr,
+                      out_q_ptr, out_k_ptr, out_v_ptr,
+                      seq_len, d_model, cols_per_pe, tl, DTYPE="f16"):
+    """GPT-3 QKV sharded: each PE uses program_id to index its VA slice."""
+    pid = tl.program_id(0)
+    bpe = 2  # f16
+
+    M = int(seq_len)
+    K = int(d_model)
+    N = int(cols_per_pe)
+
+    w_slice = K * N * bpe
+    out_slice = M * N * bpe
+
+    x  = tl.load(int(x_ptr), shape=(M, K), dtype=DTYPE)
+    wq = tl.ref(int(wq_ptr) + pid * w_slice, shape=(K, N), dtype=DTYPE)
+    wk = tl.ref(int(wk_ptr) + pid * w_slice, shape=(K, N), dtype=DTYPE)
+    wv = tl.ref(int(wv_ptr) + pid * w_slice, shape=(K, N), dtype=DTYPE)
+
+    hq = tl.composite(op="gemm", a=x, b=wq,
+                       out_ptr=int(out_q_ptr) + pid * out_slice)
+    hk = tl.composite(op="gemm", a=x, b=wk,
+                       out_ptr=int(out_k_ptr) + pid * out_slice)
+    hv = tl.composite(op="gemm", a=x, b=wv,
+                       out_ptr=int(out_v_ptr) + pid * out_slice)
+
+    tl.wait(hq)
+    tl.wait(hk)
+    tl.wait(hv)
+
+
+@bench(
+    name="gpt3-qkv",
+    description="GPT-3 QKV projection sharded column-wise across all PEs.",
+)
+def run(torch):
+    """Run the GPT-3 QKV benchmark."""
+    M = SEQ_LEN
+    K = GPT3_D_MODEL
+    N = COLS_PER_PE
+
+    # ADR-0026: DPPolicy is intra-device only. For multi-SIP execution the
+    # ADR-0024 launcher calls this bench once per SIP (each worker via
+    # torch.ahbm.set_device(rank)); here the policy describes only the
+    # cube × PE layout within a single SIP.
+    # X: replicated across all PEs within the SIP
+    dp_replicate = DPPolicy(cube="replicate", pe="replicate",
+                            num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
+    # W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs within the SIP
+    dp_sharded = DPPolicy(cube="column_wise", pe="column_wise",
+                          num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
+
+    x     = torch.empty((M, K), dtype=DTYPE, dp=dp_replicate, name="x")
+    wq    = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wq")
+    wk    = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wk")
+    wv    = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wv")
+    out_q = torch.empty((M, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="out_q")
+    out_k = torch.empty((M, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="out_k")
+    out_v = torch.empty((M, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="out_v")
+
+    torch.launch("gpt3_qkv", _gpt3_qkv_kernel,
+                 x, wq, wk, wv, out_q, out_k, out_v,
+                 M, K, N)
@@ -0,0 +1,9 @@
+from kernbench.benches.registry import bench
+
+
+@bench(
+    name="ipcq-allreduce",
+    description="IPCQ all-reduce kernel bench (placeholder).",
+)
+def run(torch):
+    print("IPCQ all reduce kernel bench")
@@ -0,0 +1,74 @@
+"""Single-PE composite GEMM for PE_accelerator perf characterization.
+
+Three operand-staging variants are selectable via MATMUL_VARIANT:
+
+  - "ref_ref"   (default):  a = tl.ref,  b = tl.ref
+        Both operands HBM-resident; scheduler streams per-tile DMA.
+  - "load_ref":             a = tl.load, b = tl.ref
+        A eagerly DMA'd into TCM up-front; B streamed per-tile.
+  - "load_load":            a = tl.load, b = tl.load
+        Both eagerly DMA'd into TCM up-front.
+
+Other env vars: MATMUL_M, MATMUL_K, MATMUL_N, MATMUL_DTYPE.
+
+Run:
+    MATMUL_M=256 MATMUL_K=256 MATMUL_N=256 MATMUL_VARIANT=load_ref \
+        kernbench run --topology topology.yaml --bench matmul_composite
+"""
+import os
+
+from kernbench.benches.registry import bench
+from kernbench.policy.placement.dp import DPPolicy
+
+M = int(os.environ.get("MATMUL_M", "256"))
+K = int(os.environ.get("MATMUL_K", "256"))
+N = int(os.environ.get("MATMUL_N", "256"))
+DTYPE = os.environ.get("MATMUL_DTYPE", "f16")
+VARIANT = os.environ.get("MATMUL_VARIANT", "ref_ref")
+
+
+def _kernel_ref_ref(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
+    M, K, N = int(M), int(K), int(N)
+    a = tl.ref(int(a_ptr), shape=(M, K), dtype=DTYPE)
+    b = tl.ref(int(b_ptr), shape=(K, N), dtype=DTYPE)
+    h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
+    tl.wait(h)
+
+
+def _kernel_load_ref(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
+    M, K, N = int(M), int(K), int(N)
+    a = tl.load(int(a_ptr), shape=(M, K), dtype=DTYPE)
+    b = tl.ref(int(b_ptr), shape=(K, N), dtype=DTYPE)
+    h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
+    tl.wait(h)
+
+
+def _kernel_load_load(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
+    M, K, N = int(M), int(K), int(N)
+    a = tl.load(int(a_ptr), shape=(M, K), dtype=DTYPE)
+    b = tl.load(int(b_ptr), shape=(K, N), dtype=DTYPE)
+    h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
+    tl.wait(h)
+
+
+_KERNELS = {
+    "ref_ref":   _kernel_ref_ref,
+    "load_ref":  _kernel_load_ref,
+    "load_load": _kernel_load_load,
+}
+
+
+@bench(
+    name="matmul-composite",
+    description="Single-PE composite GEMM with ref/load variants for perf characterization.",
+)
+def run(torch):
+    if VARIANT not in _KERNELS:
+        raise ValueError(f"unknown MATMUL_VARIANT={VARIANT!r}; "
+                         f"expected one of {list(_KERNELS)}")
+    kernel_fn = _KERNELS[VARIANT]
+    dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
+    a = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
+    b = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
+    out = torch.empty((M, N), dtype=DTYPE, dp=dp, name="out")
+    torch.launch(f"matmul_composite_{VARIANT}", kernel_fn, a, b, out, M, K, N)
@@ -0,0 +1,46 @@
+"""QKV GEMM benchmark: Q*K^T projection on a single PE.
+
+Demonstrates the full host-to-PE kernel launch pipeline:
+  Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_CPU → PE_SCHEDULER → engines
+
+Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
+  - Tensor a is loaded into TCM via DMA
+  - Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
+"""
+from kernbench.benches.registry import bench
+from kernbench.policy.placement.dp import DPPolicy
+
+# GEMM dimensions: (M, K) x (K, N) → (M, N)
+# Small dims (1 tile) for fast regression. The test verifies the full
+# host→PE pipeline, not large-matrix throughput.
+M, K, N = 32, 64, 32
+DTYPE = "f16"
+
+
+def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
+    """QKV GEMM kernel: out = a @ b.
+
+    a is loaded into TCM (DMA_READ).
+    b is referenced in HBM (tl.ref, no DMA — scheduler streams per-tile).
+    """
+    a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
+    b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
+    handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
+    tl.wait(handle)
+
+
+@bench(
+    name="qkv-gemm",
+    description="QKV GEMM (Q*K^T) on a single PE — full host-to-PE pipeline.",
+)
+def run(torch):
+    """Run the QKV GEMM benchmark."""
+    # DP placement: a=replicate (cube-level), b/out=column_wise (N-axis, single PE)
+    a = torch.zeros((M, K), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="replicate"), name="a")
+    b = torch.zeros((K, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="b")
+    out = torch.empty(
+        (M, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="out",
+    )
+
+    # Launch GEMM kernel
+    torch.launch("qkv_gemm", _gemm_kernel, a, b, out, M, K, N)
@@ -0,0 +1,46 @@
+"""QKV GEMM benchmark: Q*K^T projection on all PEs in a cube (multi-PE).
+
+Column-parallel GEMM: a is replicated (cube-level), b/out are column-sharded.
+M_CPU fans out KernelLaunchMsg to all 8 PE_CPUs (ADR-0009 D3).
+
+Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
+  - Tensor a is loaded into TCM via DMA
+  - Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
+"""
+from kernbench.benches.registry import bench
+from kernbench.policy.placement.dp import DPPolicy
+
+# GEMM dimensions: (M, K) x (K, N) -> (M, N)
+# Small dims (1 tile) for fast regression. The test verifies the multi-PE
+# fan-out pipeline, not large-matrix throughput.
+M, K, N = 32, 64, 32
+DTYPE = "f16"
+
+
+def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
+    """QKV GEMM kernel: out = a @ b.
+
+    a is loaded into TCM (DMA_READ).
+    b is referenced in HBM (tl.ref, no DMA -- scheduler streams per-tile).
+    """
+    a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
+    b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
+    handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
+    tl.wait(handle)
+
+
+@bench(
+    name="qkv-gemm-multi-pe",
+    description="Column-parallel QKV GEMM across all PEs in a cube (multi-PE).",
+)
+def run(torch):
+    """Run the multi-PE QKV GEMM benchmark."""
+    # DP placement: a=replicate (cube-level), b/out=column_wise (N-axis split)
+    a = torch.zeros((M, K), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="replicate"), name="a")
+    b = torch.zeros((K, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="b")
+    out = torch.empty(
+        (M, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="out",
+    )
+
+    # Launch GEMM kernel on all PEs
+    torch.launch("qkv_gemm_multi", _gemm_kernel, a, b, out, M, K, N)
@@ -0,0 +1,106 @@
+"""Bench registry: @bench decorator + name/index resolution.
+
+Each bench module under ``kernbench.benches`` MUST register its callable
+via ``@bench(name=..., description=...)``. Indices are assigned
+alphabetically by name after eager import; they are a CLI convenience,
+not a stable API.
+"""
+from __future__ import annotations
+
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+from importlib import import_module
+from pkgutil import iter_modules
+
+BenchFn = Callable[..., object]
+
+_NAME_RE = re.compile(r"^[a-z][a-z0-9]*(-[a-z0-9]+)*$")
+
+
+@dataclass(frozen=True)
+class BenchSpec:
+    index: int
+    name: str
+    description: str
+    run: BenchFn
+
+
+_PENDING: list[tuple[str, str, BenchFn]] = []
+_REGISTERED_MODULES: set[str] = set()
+_REGISTRY: dict[str, BenchSpec] = {}
+
+
+def bench(*, name: str, description: str) -> Callable[[BenchFn], BenchFn]:
+    if not isinstance(name, str) or not _NAME_RE.match(name):
+        raise ValueError(
+            f"bench name {name!r} must be kebab-case (lowercase, digits, dashes; "
+            f"starts with a letter)."
+        )
+    if not isinstance(description, str) or not description.strip():
+        raise ValueError(f"bench {name!r}: description must be a non-empty string.")
+
+    def deco(fn: BenchFn) -> BenchFn:
+        _PENDING.append((name, description, fn))
+        _REGISTERED_MODULES.add(fn.__module__)
+        return fn
+
+    return deco
+
+
+def _finalize() -> None:
+    if _REGISTRY:
+        return
+    seen: set[str] = set()
+    for n, _, _ in _PENDING:
+        if n in seen:
+            raise RuntimeError(f"duplicate bench name: {n!r}")
+        seen.add(n)
+    for i, (n, d, f) in enumerate(sorted(_PENDING, key=lambda t: t[0]), start=1):
+        _REGISTRY[n] = BenchSpec(index=i, name=n, description=d, run=f)
+
+
+def list_all() -> list[BenchSpec]:
+    _finalize()
+    return sorted(_REGISTRY.values(), key=lambda s: s.index)
+
+
+def resolve(identifier: str) -> BenchSpec:
+    _finalize()
+    if not isinstance(identifier, str) or not identifier.strip():
+        raise ValueError("bench identifier must be a non-empty string.")
+    ident = identifier.strip()
+    if ident.isdigit():
+        idx = int(ident)
+        for s in _REGISTRY.values():
+            if s.index == idx:
+                return s
+        raise ValueError(
+            f"No bench with index {idx}. Use 'kernbench list' to see options."
+        )
+    if ident in _REGISTRY:
+        return _REGISTRY[ident]
+    raise ValueError(
+        f"Unknown bench {ident!r}. Use 'kernbench list' to see options."
+    )
+
+
+def _audit_modules(imported: list[str], registered: set[str]) -> None:
+    missing = sorted(m for m in imported if m not in registered)
+    if missing:
+        raise RuntimeError(
+            f"Bench module(s) missing @bench decorator: {missing}. "
+            f"Each file under kernbench.benches/ must register at least one bench "
+            f"via @bench(...), or be renamed with a leading underscore if it is a "
+            f"helper."
+        )
+
+
+def _eager_import_and_audit(pkg_path: list[str], pkg_name: str) -> None:
+    imported: list[str] = []
+    for m in iter_modules(pkg_path):
+        if m.name == "registry" or m.name.startswith("_"):
+            continue
+        mod = import_module(f"{pkg_name}.{m.name}")
+        imported.append(mod.__name__)
+    _audit_modules(imported, _REGISTERED_MODULES)
@@ -0,0 +1,47 @@
+"""VA offset verification benchmark.
+
+Verifies that Triton-style base_ptr + pid * stride addressing works correctly
+with intra-SIP TP sharding (cube/pe column_wise). Each PE loads its own
+block from a sharded tensor and stores it back.
+
+The kernel uses standard Triton patterns:
+  - tl.program_id(0) for PE index within cube
+  - tl.num_programs(0) for PE count within cube
+  - Shape args are automatically localized by launch()
+"""
+from kernbench.benches.registry import bench
+from kernbench.policy.placement.dp import DPPolicy
+
+M, K = 128, 256
+DTYPE = "f16"
+
+
+def _copy_kernel(src_ptr, dst_ptr, M, K, tl, DTYPE="f16"):
+    """Standard Triton copy kernel. M and K are cube-local (set by launch)."""
+    pid = tl.program_id(0)
+    num_pe = tl.num_programs(0)
+    cols_per_pe = K // num_pe
+    elem_bytes = 2  # f16
+    offset = pid * M * cols_per_pe * elem_bytes
+    data = tl.load(src_ptr + offset, shape=(M, cols_per_pe), dtype=DTYPE)
+    tl.store(dst_ptr + offset, data)
+
+
+@bench(
+    name="va-offset-verify",
+    description="Triton base_ptr + pid * stride VA addressing verification (TP sharded).",
+)
+def run(torch):
+    """Run the VA offset verification benchmark with full TP sharding."""
+    dp = DPPolicy(cube="column_wise", pe="column_wise")
+    src = torch.zeros((M, K), dtype=DTYPE, dp=dp, name="src")
+    dst = torch.empty((M, K), dtype=DTYPE, dp=dp, name="dst")
+
+    # launch() automatically converts M, K to cube-local values
+    torch.launch("va_offset_copy", _copy_kernel, src, dst, M, K)
+
+    # Sanity check: kernel completed with non-zero latency
+    kernel_traces = [t for t in torch._traces if t["phase"] == "kernel"]
+    assert len(kernel_traces) > 0, "No kernel traces recorded"
+    for kt in kernel_traces:
+        assert kt["total_ns"] > 0, f"Kernel latency is zero for {kt}"
@@ -1,10 +1,10 @@
 import argparse
 import sys

-from benches.loader import resolve_bench
-from kernbench.cli.probe import cmd_probe
+from kernbench.benches.registry import list_all, resolve
 from kernbench.cli.report import format_report
 from kernbench.common.types import SimEngine
+from kernbench.probes.probe import cmd_probe
 from kernbench.runtime_api.bench_runner import run_bench
 from kernbench.runtime_api.types import DeviceSelector, resolve_device
 from kernbench.sim_engine.engine import GraphEngine
@@ -17,7 +17,10 @@ def build_parser() -> argparse.ArgumentParser:

    runp = sub.add_parser("run", help="Run a benchmark")
    runp.add_argument("--topology", required=True)
-    runp.add_argument("--bench", required=True)
+    runp.add_argument(
+        "--bench", required=True,
+        help="Bench name (kebab-case) or numeric index from 'kernbench list'",
+    )
    runp.add_argument(
        "--device", default=None, help="Target device: 'all' or 'sip:<N>' (default: all)"
    )
@@ -27,6 +30,9 @@ def build_parser() -> argparse.ArgumentParser:
    )
    runp.set_defaults(_handler=cmd_run)

+    listp = sub.add_parser("list", help="List registered benches")
+    listp.set_defaults(_handler=cmd_list)
+
    probep = sub.add_parser("probe", help="Probe latency and BW for predefined traffic patterns")
    probep.add_argument("--topology", required=True)
    probep.add_argument("--case", default="all", help="Case name or 'all' (default: all)")
@@ -53,23 +59,34 @@ def cmd_web(args) -> int:
    return 0


+def cmd_list(args) -> int:
+    specs = list_all()
+    print(f"{'#':>3}  {'NAME':<22}  DESCRIPTION")
+    print("-" * 80)
+    for s in specs:
+        print(f"{s.index:>3}  {s.name:<22}  {s.description}")
+    return 0
+
+
 def cmd_run(args) -> int:
    print("> Running benchmark with:", args)

    topo = resolve_topology(args.topology)
-    bench = resolve_bench(args.bench)
+    spec_entry = resolve(args.bench)
    device = resolve_device(args.device)
    verify_data = getattr(args, "verify_data", False)

    def _factory(topology, device):
        return engine_factory(topology, device, enable_data=verify_data)

-    result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=_factory)
+    result = run_bench(
+        topology=topo, bench_fn=spec_entry.run, device=device, engine_factory=_factory,
+    )

    topo_obj = getattr(topo, "topology_obj", topo)
    spec = getattr(topo_obj, "spec", None)
    if result.traces:
-        print(format_report(result.traces, title=args.bench, spec=spec))
+        print(format_report(result.traces, title=spec_entry.name, spec=spec))
    print(result.summary_text())

    # Phase 2 diagnostic summary (ADR-0020). The actual Phase 2 replay
@@ -0,0 +1,5 @@
+"""kernbench.probes: latency/BW diagnostic utilities (not benchmarks).
+
+See ADR-0010 D4. Probe is a developer tool for verifying the latency/BW
+model; it bypasses the bench registry.
+"""