benches: package as kernbench.benches, add @bench registry + list subcommand

Move benches/ -> src/kernbench/benches/ and src/kernbench/cli/probe.py ->
src/kernbench/probes/probe.py. Each bench self-registers via
@bench(name=..., description=...); kernbench list enumerates benches
with auto-assigned indices, --bench accepts kebab-case name or numeric
index. Audit at package-import time fails if any non-underscore module
forgets the decorator. ADR-0010 (EN + KO) updated to reflect the new
resolver path, list subcommand, and probes package separation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 14:42:10 -07:00
parent 168b0c89f0
commit 049e3d8bb3
28 changed files with 398 additions and 79 deletions
+9
View File
@@ -0,0 +1,9 @@
"""kernbench.benches: eager-import sibling modules so @bench fires.
Underscore-prefixed modules are treated as helpers and skipped.
After import, every imported module must have registered at least one
bench, or a RuntimeError is raised by the audit.
"""
from kernbench.benches.registry import _eager_import_and_audit
_eager_import_and_audit(__path__, __name__)
+108
View File
@@ -0,0 +1,108 @@
"""CCL all-reduce bench (ADR-0024 + ADR-0027).
Pure TP launcher model: rank = SIP. Each rank owns a ``(N_CUBES, n_elem)``
tensor sharded row-wise across the cube mesh (pe0 per cube). After
``dist.all_reduce(op="sum")`` every cube on every rank must hold
``N_CUBES * sum(1..world_size)``. Rank 0 prints the pass/fail line.
Driven by ``ccl.yaml`` (``defaults.algorithm``, ``n_elem``) + ``topology.yaml``
(SIP count → world_size, cube_mesh → N_CUBES).
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from kernbench.benches.registry import bench
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
from kernbench.policy.placement.dp import DPPolicy
DEFAULT_N_ELEM = 8
@dataclass(frozen=True)
class _BenchCfg:
algorithm: str
n_elem: int
n_cubes: int
world_size: int
def _resolve_cfg(torch) -> _BenchCfg:
"""Read ccl.yaml + topology once at host side."""
merged = resolve_algorithm_config(load_ccl_config())
ws = torch.distributed.get_world_size()
spec = torch.spec or {}
n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
if ws != n_sips:
raise RuntimeError(
f"ccl_allreduce bench requires world_size == topology SIP count "
f"(world_size={ws}, n_sips={n_sips})."
)
cm = spec.get("sip", {}).get("cube_mesh", {})
n_cubes = int(cm.get("w", 4)) * int(cm.get("h", 4))
return _BenchCfg(
algorithm=merged["algorithm"],
n_elem=int(merged.get("n_elem", DEFAULT_N_ELEM)),
n_cubes=n_cubes,
world_size=ws,
)
def _rank_dp(n_cubes: int) -> DPPolicy:
return DPPolicy(cube="row_wise", pe="replicate", num_cubes=n_cubes, num_pes=1)
def _allocate_rank_tensor(torch, rank: int, cfg: _BenchCfg):
"""Allocate this rank's ``(n_cubes, n_elem)`` tensor on its SIP."""
return torch.zeros(
(cfg.n_cubes, cfg.n_elem), dtype="f16",
dp=_rank_dp(cfg.n_cubes), name=f"ccl_in_r{rank}",
)
def _init_with_rank_value(torch, tensor, rank: int, cfg: _BenchCfg) -> None:
"""Fill all cubes with the scalar ``rank + 1``."""
arr = np.full((cfg.n_cubes, cfg.n_elem), float(rank + 1), dtype=np.float16)
tensor.copy_(torch.from_numpy(arr))
def _report(result: np.ndarray, cfg: _BenchCfg) -> None:
"""Single-line pass/fail printer (rank 0 only)."""
expected = float(cfg.n_cubes * sum(range(1, cfg.world_size + 1)))
ok = True
for cube_id in range(cfg.n_cubes):
if not np.allclose(result[cube_id], expected, rtol=1e-1, atol=1e-1):
ok = False
break
if ok:
total = cfg.world_size * cfg.n_cubes
print(f" {cfg.algorithm} (ws={cfg.world_size}): {total} OK")
return
got = float(result.reshape(-1).mean())
print(
f" [FAIL] {cfg.algorithm} (ws={cfg.world_size}): "
f"got mean={got:.3f}, expected={expected:.3f}"
)
def _worker(rank: int, cfg: _BenchCfg, torch) -> None:
torch.ahbm.set_device(rank)
tensor = _allocate_rank_tensor(torch, rank, cfg)
_init_with_rank_value(torch, tensor, rank, cfg)
torch.distributed.all_reduce(tensor, op="sum")
if rank == 0:
_report(tensor.numpy(), cfg)
@bench(
name="ccl-allreduce",
description="CCL all-reduce bench (TP launcher; rank = SIP).",
)
def run(torch) -> None:
torch.distributed.init_process_group(backend="ahbm")
cfg = _resolve_cfg(torch)
torch.multiprocessing.spawn(
_worker, args=(cfg, torch), nprocs=cfg.world_size,
)
+44
View File
@@ -0,0 +1,44 @@
"""Single-PE GEMM benchmark via scheduler_v2 (pe_accel).
Full host-to-PE pipeline:
Host → PCIE_EP → IO_CPU → M_CPU → PE_CPU → SchedulerV2 → PE_DMA → HBM
Single PE: num_cubes=1, num_pes=1 via DPPolicy override.
Both operands use tl.ref (HBM-resident); scheduler_v2 tiles and streams
per-tile DMA internally.
Run:
kernbench run gemm_single_pe
"""
from kernbench.benches.registry import bench
from kernbench.policy.placement.dp import DPPolicy
# GEMM dimensions: (M, K) x (K, N) → (M, N)
M, K, N = 32, 128, 32
DTYPE = "f16"
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
"""Single-PE GEMM: out = a @ b. Both operands streamed from HBM by scheduler."""
M, K, N = int(M), int(K), int(N)
a = tl.ref(int(a_ptr), shape=(M, K), dtype=DTYPE)
b = tl.ref(int(b_ptr), shape=(K, N), dtype=DTYPE)
h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
tl.wait(h)
@bench(
name="gemm-single-pe",
description="Single-PE GEMM via scheduler_v2 (pe_accel).",
)
def run(torch):
"""Run the single-PE GEMM benchmark."""
dp = DPPolicy(cube="replicate", pe="replicate",
num_cubes=1, num_pes=1)
a = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
b = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
out = torch.empty((M, N), dtype=DTYPE, dp=dp, name="out")
torch.launch("gemm_single_pe", _gemm_kernel, a, b, out, M, K, N)
+101
View File
@@ -0,0 +1,101 @@
"""GPT-3 QKV projection benchmark: sharded across PEs via pe_accel_v1.
GPT-3 architecture:
d_model = 12288 (hidden dimension)
n_heads = 96 (attention heads)
d_head = 128 (dimension per head)
Sharding strategy (column-wise across all PEs):
X : (seq_len, d_model) -- replicated to all PEs
W_Q/K/V : (d_model, d_model) -- column-wise sharded across cubes × PEs
out_Q/K/V: (seq_len, d_model) -- column-wise sharded across cubes × PEs
Each PE computes:
Q_slice = X @ W_Q_slice : (seq_len, d_model) @ (d_model, cols_per_pe) -> (seq_len, cols_per_pe)
K_slice, V_slice: same
PE count is configurable via N_CUBES × N_PE_PER_CUBE (DPPolicy override).
topology.yaml is unchanged.
Run:
kernbench run gpt3_qkv
"""
from kernbench.benches.registry import bench
from kernbench.policy.placement.dp import DPPolicy
# -- PE configuration (DPPolicy overrides — does not change topology.yaml) -----
N_SIPS = 1
N_CUBES = 16 # cubes per SIP
N_PE_PER_CUBE = 8 # PEs per cube
N_PES = N_CUBES * N_PE_PER_CUBE # 128 total
# -- GPT-3 architecture -------------------------------------------------------
GPT3_D_MODEL = 12288
SEQ_LEN = 32
COLS_PER_PE = GPT3_D_MODEL // N_PES # 12288 / 128 = 96
DTYPE = "f16"
def _gpt3_qkv_kernel(x_ptr, wq_ptr, wk_ptr, wv_ptr,
out_q_ptr, out_k_ptr, out_v_ptr,
seq_len, d_model, cols_per_pe, tl, DTYPE="f16"):
"""GPT-3 QKV sharded: each PE uses program_id to index its VA slice."""
pid = tl.program_id(0)
bpe = 2 # f16
M = int(seq_len)
K = int(d_model)
N = int(cols_per_pe)
w_slice = K * N * bpe
out_slice = M * N * bpe
x = tl.load(int(x_ptr), shape=(M, K), dtype=DTYPE)
wq = tl.ref(int(wq_ptr) + pid * w_slice, shape=(K, N), dtype=DTYPE)
wk = tl.ref(int(wk_ptr) + pid * w_slice, shape=(K, N), dtype=DTYPE)
wv = tl.ref(int(wv_ptr) + pid * w_slice, shape=(K, N), dtype=DTYPE)
hq = tl.composite(op="gemm", a=x, b=wq,
out_ptr=int(out_q_ptr) + pid * out_slice)
hk = tl.composite(op="gemm", a=x, b=wk,
out_ptr=int(out_k_ptr) + pid * out_slice)
hv = tl.composite(op="gemm", a=x, b=wv,
out_ptr=int(out_v_ptr) + pid * out_slice)
tl.wait(hq)
tl.wait(hk)
tl.wait(hv)
@bench(
name="gpt3-qkv",
description="GPT-3 QKV projection sharded column-wise across all PEs.",
)
def run(torch):
"""Run the GPT-3 QKV benchmark."""
M = SEQ_LEN
K = GPT3_D_MODEL
N = COLS_PER_PE
# ADR-0026: DPPolicy is intra-device only. For multi-SIP execution the
# ADR-0024 launcher calls this bench once per SIP (each worker via
# torch.ahbm.set_device(rank)); here the policy describes only the
# cube × PE layout within a single SIP.
# X: replicated across all PEs within the SIP
dp_replicate = DPPolicy(cube="replicate", pe="replicate",
num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
# W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs within the SIP
dp_sharded = DPPolicy(cube="column_wise", pe="column_wise",
num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
x = torch.empty((M, K), dtype=DTYPE, dp=dp_replicate, name="x")
wq = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wq")
wk = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wk")
wv = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wv")
out_q = torch.empty((M, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="out_q")
out_k = torch.empty((M, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="out_k")
out_v = torch.empty((M, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="out_v")
torch.launch("gpt3_qkv", _gpt3_qkv_kernel,
x, wq, wk, wv, out_q, out_k, out_v,
M, K, N)
+9
View File
@@ -0,0 +1,9 @@
from kernbench.benches.registry import bench
@bench(
name="ipcq-allreduce",
description="IPCQ all-reduce kernel bench (placeholder).",
)
def run(torch):
print("IPCQ all reduce kernel bench")
+74
View File
@@ -0,0 +1,74 @@
"""Single-PE composite GEMM for PE_accelerator perf characterization.
Three operand-staging variants are selectable via MATMUL_VARIANT:
- "ref_ref" (default): a = tl.ref, b = tl.ref
Both operands HBM-resident; scheduler streams per-tile DMA.
- "load_ref": a = tl.load, b = tl.ref
A eagerly DMA'd into TCM up-front; B streamed per-tile.
- "load_load": a = tl.load, b = tl.load
Both eagerly DMA'd into TCM up-front.
Other env vars: MATMUL_M, MATMUL_K, MATMUL_N, MATMUL_DTYPE.
Run:
MATMUL_M=256 MATMUL_K=256 MATMUL_N=256 MATMUL_VARIANT=load_ref \
kernbench run --topology topology.yaml --bench matmul_composite
"""
import os
from kernbench.benches.registry import bench
from kernbench.policy.placement.dp import DPPolicy
M = int(os.environ.get("MATMUL_M", "256"))
K = int(os.environ.get("MATMUL_K", "256"))
N = int(os.environ.get("MATMUL_N", "256"))
DTYPE = os.environ.get("MATMUL_DTYPE", "f16")
VARIANT = os.environ.get("MATMUL_VARIANT", "ref_ref")
def _kernel_ref_ref(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
M, K, N = int(M), int(K), int(N)
a = tl.ref(int(a_ptr), shape=(M, K), dtype=DTYPE)
b = tl.ref(int(b_ptr), shape=(K, N), dtype=DTYPE)
h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
tl.wait(h)
def _kernel_load_ref(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
M, K, N = int(M), int(K), int(N)
a = tl.load(int(a_ptr), shape=(M, K), dtype=DTYPE)
b = tl.ref(int(b_ptr), shape=(K, N), dtype=DTYPE)
h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
tl.wait(h)
def _kernel_load_load(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
M, K, N = int(M), int(K), int(N)
a = tl.load(int(a_ptr), shape=(M, K), dtype=DTYPE)
b = tl.load(int(b_ptr), shape=(K, N), dtype=DTYPE)
h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
tl.wait(h)
_KERNELS = {
"ref_ref": _kernel_ref_ref,
"load_ref": _kernel_load_ref,
"load_load": _kernel_load_load,
}
@bench(
name="matmul-composite",
description="Single-PE composite GEMM with ref/load variants for perf characterization.",
)
def run(torch):
if VARIANT not in _KERNELS:
raise ValueError(f"unknown MATMUL_VARIANT={VARIANT!r}; "
f"expected one of {list(_KERNELS)}")
kernel_fn = _KERNELS[VARIANT]
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
a = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
b = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
out = torch.empty((M, N), dtype=DTYPE, dp=dp, name="out")
torch.launch(f"matmul_composite_{VARIANT}", kernel_fn, a, b, out, M, K, N)
+46
View File
@@ -0,0 +1,46 @@
"""QKV GEMM benchmark: Q*K^T projection on a single PE.
Demonstrates the full host-to-PE kernel launch pipeline:
Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_CPU → PE_SCHEDULER → engines
Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
- Tensor a is loaded into TCM via DMA
- Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
"""
from kernbench.benches.registry import bench
from kernbench.policy.placement.dp import DPPolicy
# GEMM dimensions: (M, K) x (K, N) → (M, N)
# Small dims (1 tile) for fast regression. The test verifies the full
# host→PE pipeline, not large-matrix throughput.
M, K, N = 32, 64, 32
DTYPE = "f16"
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
"""QKV GEMM kernel: out = a @ b.
a is loaded into TCM (DMA_READ).
b is referenced in HBM (tl.ref, no DMA — scheduler streams per-tile).
"""
a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
tl.wait(handle)
@bench(
name="qkv-gemm",
description="QKV GEMM (Q*K^T) on a single PE — full host-to-PE pipeline.",
)
def run(torch):
"""Run the QKV GEMM benchmark."""
# DP placement: a=replicate (cube-level), b/out=column_wise (N-axis, single PE)
a = torch.zeros((M, K), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="replicate"), name="a")
b = torch.zeros((K, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="b")
out = torch.empty(
(M, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="out",
)
# Launch GEMM kernel
torch.launch("qkv_gemm", _gemm_kernel, a, b, out, M, K, N)
@@ -0,0 +1,46 @@
"""QKV GEMM benchmark: Q*K^T projection on all PEs in a cube (multi-PE).
Column-parallel GEMM: a is replicated (cube-level), b/out are column-sharded.
M_CPU fans out KernelLaunchMsg to all 8 PE_CPUs (ADR-0009 D3).
Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
- Tensor a is loaded into TCM via DMA
- Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
"""
from kernbench.benches.registry import bench
from kernbench.policy.placement.dp import DPPolicy
# GEMM dimensions: (M, K) x (K, N) -> (M, N)
# Small dims (1 tile) for fast regression. The test verifies the multi-PE
# fan-out pipeline, not large-matrix throughput.
M, K, N = 32, 64, 32
DTYPE = "f16"
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
"""QKV GEMM kernel: out = a @ b.
a is loaded into TCM (DMA_READ).
b is referenced in HBM (tl.ref, no DMA -- scheduler streams per-tile).
"""
a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
tl.wait(handle)
@bench(
name="qkv-gemm-multi-pe",
description="Column-parallel QKV GEMM across all PEs in a cube (multi-PE).",
)
def run(torch):
"""Run the multi-PE QKV GEMM benchmark."""
# DP placement: a=replicate (cube-level), b/out=column_wise (N-axis split)
a = torch.zeros((M, K), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="replicate"), name="a")
b = torch.zeros((K, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="b")
out = torch.empty(
(M, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="out",
)
# Launch GEMM kernel on all PEs
torch.launch("qkv_gemm_multi", _gemm_kernel, a, b, out, M, K, N)
+106
View File
@@ -0,0 +1,106 @@
"""Bench registry: @bench decorator + name/index resolution.
Each bench module under ``kernbench.benches`` MUST register its callable
via ``@bench(name=..., description=...)``. Indices are assigned
alphabetically by name after eager import; they are a CLI convenience,
not a stable API.
"""
from __future__ import annotations
import re
from collections.abc import Callable
from dataclasses import dataclass
from importlib import import_module
from pkgutil import iter_modules
BenchFn = Callable[..., object]
_NAME_RE = re.compile(r"^[a-z][a-z0-9]*(-[a-z0-9]+)*$")
@dataclass(frozen=True)
class BenchSpec:
index: int
name: str
description: str
run: BenchFn
_PENDING: list[tuple[str, str, BenchFn]] = []
_REGISTERED_MODULES: set[str] = set()
_REGISTRY: dict[str, BenchSpec] = {}
def bench(*, name: str, description: str) -> Callable[[BenchFn], BenchFn]:
if not isinstance(name, str) or not _NAME_RE.match(name):
raise ValueError(
f"bench name {name!r} must be kebab-case (lowercase, digits, dashes; "
f"starts with a letter)."
)
if not isinstance(description, str) or not description.strip():
raise ValueError(f"bench {name!r}: description must be a non-empty string.")
def deco(fn: BenchFn) -> BenchFn:
_PENDING.append((name, description, fn))
_REGISTERED_MODULES.add(fn.__module__)
return fn
return deco
def _finalize() -> None:
if _REGISTRY:
return
seen: set[str] = set()
for n, _, _ in _PENDING:
if n in seen:
raise RuntimeError(f"duplicate bench name: {n!r}")
seen.add(n)
for i, (n, d, f) in enumerate(sorted(_PENDING, key=lambda t: t[0]), start=1):
_REGISTRY[n] = BenchSpec(index=i, name=n, description=d, run=f)
def list_all() -> list[BenchSpec]:
_finalize()
return sorted(_REGISTRY.values(), key=lambda s: s.index)
def resolve(identifier: str) -> BenchSpec:
_finalize()
if not isinstance(identifier, str) or not identifier.strip():
raise ValueError("bench identifier must be a non-empty string.")
ident = identifier.strip()
if ident.isdigit():
idx = int(ident)
for s in _REGISTRY.values():
if s.index == idx:
return s
raise ValueError(
f"No bench with index {idx}. Use 'kernbench list' to see options."
)
if ident in _REGISTRY:
return _REGISTRY[ident]
raise ValueError(
f"Unknown bench {ident!r}. Use 'kernbench list' to see options."
)
def _audit_modules(imported: list[str], registered: set[str]) -> None:
missing = sorted(m for m in imported if m not in registered)
if missing:
raise RuntimeError(
f"Bench module(s) missing @bench decorator: {missing}. "
f"Each file under kernbench.benches/ must register at least one bench "
f"via @bench(...), or be renamed with a leading underscore if it is a "
f"helper."
)
def _eager_import_and_audit(pkg_path: list[str], pkg_name: str) -> None:
imported: list[str] = []
for m in iter_modules(pkg_path):
if m.name == "registry" or m.name.startswith("_"):
continue
mod = import_module(f"{pkg_name}.{m.name}")
imported.append(mod.__name__)
_audit_modules(imported, _REGISTERED_MODULES)
+47
View File
@@ -0,0 +1,47 @@
"""VA offset verification benchmark.
Verifies that Triton-style base_ptr + pid * stride addressing works correctly
with intra-SIP TP sharding (cube/pe column_wise). Each PE loads its own
block from a sharded tensor and stores it back.
The kernel uses standard Triton patterns:
- tl.program_id(0) for PE index within cube
- tl.num_programs(0) for PE count within cube
- Shape args are automatically localized by launch()
"""
from kernbench.benches.registry import bench
from kernbench.policy.placement.dp import DPPolicy
M, K = 128, 256
DTYPE = "f16"
def _copy_kernel(src_ptr, dst_ptr, M, K, tl, DTYPE="f16"):
"""Standard Triton copy kernel. M and K are cube-local (set by launch)."""
pid = tl.program_id(0)
num_pe = tl.num_programs(0)
cols_per_pe = K // num_pe
elem_bytes = 2 # f16
offset = pid * M * cols_per_pe * elem_bytes
data = tl.load(src_ptr + offset, shape=(M, cols_per_pe), dtype=DTYPE)
tl.store(dst_ptr + offset, data)
@bench(
name="va-offset-verify",
description="Triton base_ptr + pid * stride VA addressing verification (TP sharded).",
)
def run(torch):
"""Run the VA offset verification benchmark with full TP sharding."""
dp = DPPolicy(cube="column_wise", pe="column_wise")
src = torch.zeros((M, K), dtype=DTYPE, dp=dp, name="src")
dst = torch.empty((M, K), dtype=DTYPE, dp=dp, name="dst")
# launch() automatically converts M, K to cube-local values
torch.launch("va_offset_copy", _copy_kernel, src, dst, M, K)
# Sanity check: kernel completed with non-zero latency
kernel_traces = [t for t in torch._traces if t["phase"] == "kernel"]
assert len(kernel_traces) > 0, "No kernel traces recorded"
for kt in kernel_traces:
assert kt["total_ns"] > 0, f"Kernel latency is zero for {kt}"
+23 -6
View File
@@ -1,10 +1,10 @@
import argparse
import sys
from benches.loader import resolve_bench
from kernbench.cli.probe import cmd_probe
from kernbench.benches.registry import list_all, resolve
from kernbench.cli.report import format_report
from kernbench.common.types import SimEngine
from kernbench.probes.probe import cmd_probe
from kernbench.runtime_api.bench_runner import run_bench
from kernbench.runtime_api.types import DeviceSelector, resolve_device
from kernbench.sim_engine.engine import GraphEngine
@@ -17,7 +17,10 @@ def build_parser() -> argparse.ArgumentParser:
runp = sub.add_parser("run", help="Run a benchmark")
runp.add_argument("--topology", required=True)
runp.add_argument("--bench", required=True)
runp.add_argument(
"--bench", required=True,
help="Bench name (kebab-case) or numeric index from 'kernbench list'",
)
runp.add_argument(
"--device", default=None, help="Target device: 'all' or 'sip:<N>' (default: all)"
)
@@ -27,6 +30,9 @@ def build_parser() -> argparse.ArgumentParser:
)
runp.set_defaults(_handler=cmd_run)
listp = sub.add_parser("list", help="List registered benches")
listp.set_defaults(_handler=cmd_list)
probep = sub.add_parser("probe", help="Probe latency and BW for predefined traffic patterns")
probep.add_argument("--topology", required=True)
probep.add_argument("--case", default="all", help="Case name or 'all' (default: all)")
@@ -53,23 +59,34 @@ def cmd_web(args) -> int:
return 0
def cmd_list(args) -> int:
specs = list_all()
print(f"{'#':>3} {'NAME':<22} DESCRIPTION")
print("-" * 80)
for s in specs:
print(f"{s.index:>3} {s.name:<22} {s.description}")
return 0
def cmd_run(args) -> int:
print("> Running benchmark with:", args)
topo = resolve_topology(args.topology)
bench = resolve_bench(args.bench)
spec_entry = resolve(args.bench)
device = resolve_device(args.device)
verify_data = getattr(args, "verify_data", False)
def _factory(topology, device):
return engine_factory(topology, device, enable_data=verify_data)
result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=_factory)
result = run_bench(
topology=topo, bench_fn=spec_entry.run, device=device, engine_factory=_factory,
)
topo_obj = getattr(topo, "topology_obj", topo)
spec = getattr(topo_obj, "spec", None)
if result.traces:
print(format_report(result.traces, title=args.bench, spec=spec))
print(format_report(result.traces, title=spec_entry.name, spec=spec))
print(result.summary_text())
# Phase 2 diagnostic summary (ADR-0020). The actual Phase 2 replay
+5
View File
@@ -0,0 +1,5 @@
"""kernbench.probes: latency/BW diagnostic utilities (not benchmarks).
See ADR-0010 D4. Probe is a developer tool for verifying the latency/BW
model; it bypasses the bench registry.
"""