benches: package as kernbench.benches, add @bench registry + list subcommand
Move benches/ -> src/kernbench/benches/ and src/kernbench/cli/probe.py -> src/kernbench/probes/probe.py. Each bench self-registers via @bench(name=..., description=...); kernbench list enumerates benches with auto-assigned indices, --bench accepts kebab-case name or numeric index. Audit at package-import time fails if any non-underscore module forgets the decorator. ADR-0010 (EN + KO) updated to reflect the new resolver path, list subcommand, and probes package separation. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
"""kernbench.benches: eager-import sibling modules so @bench fires.
|
||||
|
||||
Underscore-prefixed modules are treated as helpers and skipped.
|
||||
After import, every imported module must have registered at least one
|
||||
bench, or a RuntimeError is raised by the audit.
|
||||
"""
|
||||
from kernbench.benches.registry import _eager_import_and_audit
|
||||
|
||||
_eager_import_and_audit(__path__, __name__)
|
||||
@@ -0,0 +1,108 @@
|
||||
"""CCL all-reduce bench (ADR-0024 + ADR-0027).
|
||||
|
||||
Pure TP launcher model: rank = SIP. Each rank owns a ``(N_CUBES, n_elem)``
|
||||
tensor sharded row-wise across the cube mesh (pe0 per cube). After
|
||||
``dist.all_reduce(op="sum")`` every cube on every rank must hold
|
||||
``N_CUBES * sum(1..world_size)``. Rank 0 prints the pass/fail line.
|
||||
|
||||
Driven by ``ccl.yaml`` (``defaults.algorithm``, ``n_elem``) + ``topology.yaml``
|
||||
(SIP count → world_size, cube_mesh → N_CUBES).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
|
||||
from kernbench.benches.registry import bench
|
||||
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
DEFAULT_N_ELEM = 8
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _BenchCfg:
|
||||
algorithm: str
|
||||
n_elem: int
|
||||
n_cubes: int
|
||||
world_size: int
|
||||
|
||||
|
||||
def _resolve_cfg(torch) -> _BenchCfg:
|
||||
"""Read ccl.yaml + topology once at host side."""
|
||||
merged = resolve_algorithm_config(load_ccl_config())
|
||||
ws = torch.distributed.get_world_size()
|
||||
spec = torch.spec or {}
|
||||
n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
|
||||
if ws != n_sips:
|
||||
raise RuntimeError(
|
||||
f"ccl_allreduce bench requires world_size == topology SIP count "
|
||||
f"(world_size={ws}, n_sips={n_sips})."
|
||||
)
|
||||
cm = spec.get("sip", {}).get("cube_mesh", {})
|
||||
n_cubes = int(cm.get("w", 4)) * int(cm.get("h", 4))
|
||||
return _BenchCfg(
|
||||
algorithm=merged["algorithm"],
|
||||
n_elem=int(merged.get("n_elem", DEFAULT_N_ELEM)),
|
||||
n_cubes=n_cubes,
|
||||
world_size=ws,
|
||||
)
|
||||
|
||||
|
||||
def _rank_dp(n_cubes: int) -> DPPolicy:
|
||||
return DPPolicy(cube="row_wise", pe="replicate", num_cubes=n_cubes, num_pes=1)
|
||||
|
||||
|
||||
def _allocate_rank_tensor(torch, rank: int, cfg: _BenchCfg):
|
||||
"""Allocate this rank's ``(n_cubes, n_elem)`` tensor on its SIP."""
|
||||
return torch.zeros(
|
||||
(cfg.n_cubes, cfg.n_elem), dtype="f16",
|
||||
dp=_rank_dp(cfg.n_cubes), name=f"ccl_in_r{rank}",
|
||||
)
|
||||
|
||||
|
||||
def _init_with_rank_value(torch, tensor, rank: int, cfg: _BenchCfg) -> None:
|
||||
"""Fill all cubes with the scalar ``rank + 1``."""
|
||||
arr = np.full((cfg.n_cubes, cfg.n_elem), float(rank + 1), dtype=np.float16)
|
||||
tensor.copy_(torch.from_numpy(arr))
|
||||
|
||||
|
||||
def _report(result: np.ndarray, cfg: _BenchCfg) -> None:
|
||||
"""Single-line pass/fail printer (rank 0 only)."""
|
||||
expected = float(cfg.n_cubes * sum(range(1, cfg.world_size + 1)))
|
||||
ok = True
|
||||
for cube_id in range(cfg.n_cubes):
|
||||
if not np.allclose(result[cube_id], expected, rtol=1e-1, atol=1e-1):
|
||||
ok = False
|
||||
break
|
||||
if ok:
|
||||
total = cfg.world_size * cfg.n_cubes
|
||||
print(f" {cfg.algorithm} (ws={cfg.world_size}): {total} OK")
|
||||
return
|
||||
got = float(result.reshape(-1).mean())
|
||||
print(
|
||||
f" [FAIL] {cfg.algorithm} (ws={cfg.world_size}): "
|
||||
f"got mean={got:.3f}, expected={expected:.3f}"
|
||||
)
|
||||
|
||||
|
||||
def _worker(rank: int, cfg: _BenchCfg, torch) -> None:
|
||||
torch.ahbm.set_device(rank)
|
||||
tensor = _allocate_rank_tensor(torch, rank, cfg)
|
||||
_init_with_rank_value(torch, tensor, rank, cfg)
|
||||
torch.distributed.all_reduce(tensor, op="sum")
|
||||
if rank == 0:
|
||||
_report(tensor.numpy(), cfg)
|
||||
|
||||
|
||||
@bench(
|
||||
name="ccl-allreduce",
|
||||
description="CCL all-reduce bench (TP launcher; rank = SIP).",
|
||||
)
|
||||
def run(torch) -> None:
|
||||
torch.distributed.init_process_group(backend="ahbm")
|
||||
cfg = _resolve_cfg(torch)
|
||||
torch.multiprocessing.spawn(
|
||||
_worker, args=(cfg, torch), nprocs=cfg.world_size,
|
||||
)
|
||||
@@ -0,0 +1,44 @@
|
||||
"""Single-PE GEMM benchmark via scheduler_v2 (pe_accel).
|
||||
|
||||
Full host-to-PE pipeline:
|
||||
Host → PCIE_EP → IO_CPU → M_CPU → PE_CPU → SchedulerV2 → PE_DMA → HBM
|
||||
|
||||
Single PE: num_cubes=1, num_pes=1 via DPPolicy override.
|
||||
Both operands use tl.ref (HBM-resident); scheduler_v2 tiles and streams
|
||||
per-tile DMA internally.
|
||||
|
||||
Run:
|
||||
kernbench run gemm_single_pe
|
||||
"""
|
||||
from kernbench.benches.registry import bench
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
# GEMM dimensions: (M, K) x (K, N) → (M, N)
|
||||
M, K, N = 32, 128, 32
|
||||
DTYPE = "f16"
|
||||
|
||||
|
||||
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
|
||||
"""Single-PE GEMM: out = a @ b. Both operands streamed from HBM by scheduler."""
|
||||
M, K, N = int(M), int(K), int(N)
|
||||
|
||||
a = tl.ref(int(a_ptr), shape=(M, K), dtype=DTYPE)
|
||||
b = tl.ref(int(b_ptr), shape=(K, N), dtype=DTYPE)
|
||||
h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
|
||||
tl.wait(h)
|
||||
|
||||
|
||||
@bench(
|
||||
name="gemm-single-pe",
|
||||
description="Single-PE GEMM via scheduler_v2 (pe_accel).",
|
||||
)
|
||||
def run(torch):
|
||||
"""Run the single-PE GEMM benchmark."""
|
||||
dp = DPPolicy(cube="replicate", pe="replicate",
|
||||
num_cubes=1, num_pes=1)
|
||||
|
||||
a = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
|
||||
b = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
|
||||
out = torch.empty((M, N), dtype=DTYPE, dp=dp, name="out")
|
||||
|
||||
torch.launch("gemm_single_pe", _gemm_kernel, a, b, out, M, K, N)
|
||||
@@ -0,0 +1,101 @@
|
||||
"""GPT-3 QKV projection benchmark: sharded across PEs via pe_accel_v1.
|
||||
|
||||
GPT-3 architecture:
|
||||
d_model = 12288 (hidden dimension)
|
||||
n_heads = 96 (attention heads)
|
||||
d_head = 128 (dimension per head)
|
||||
|
||||
Sharding strategy (column-wise across all PEs):
|
||||
X : (seq_len, d_model) -- replicated to all PEs
|
||||
W_Q/K/V : (d_model, d_model) -- column-wise sharded across cubes × PEs
|
||||
out_Q/K/V: (seq_len, d_model) -- column-wise sharded across cubes × PEs
|
||||
|
||||
Each PE computes:
|
||||
Q_slice = X @ W_Q_slice : (seq_len, d_model) @ (d_model, cols_per_pe) -> (seq_len, cols_per_pe)
|
||||
K_slice, V_slice: same
|
||||
|
||||
PE count is configurable via N_CUBES × N_PE_PER_CUBE (DPPolicy override).
|
||||
topology.yaml is unchanged.
|
||||
|
||||
Run:
|
||||
kernbench run gpt3_qkv
|
||||
"""
|
||||
from kernbench.benches.registry import bench
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
# -- PE configuration (DPPolicy overrides — does not change topology.yaml) -----
|
||||
N_SIPS = 1
|
||||
N_CUBES = 16 # cubes per SIP
|
||||
N_PE_PER_CUBE = 8 # PEs per cube
|
||||
N_PES = N_CUBES * N_PE_PER_CUBE # 128 total
|
||||
|
||||
# -- GPT-3 architecture -------------------------------------------------------
|
||||
GPT3_D_MODEL = 12288
|
||||
SEQ_LEN = 32
|
||||
COLS_PER_PE = GPT3_D_MODEL // N_PES # 12288 / 128 = 96
|
||||
DTYPE = "f16"
|
||||
|
||||
|
||||
def _gpt3_qkv_kernel(x_ptr, wq_ptr, wk_ptr, wv_ptr,
|
||||
out_q_ptr, out_k_ptr, out_v_ptr,
|
||||
seq_len, d_model, cols_per_pe, tl, DTYPE="f16"):
|
||||
"""GPT-3 QKV sharded: each PE uses program_id to index its VA slice."""
|
||||
pid = tl.program_id(0)
|
||||
bpe = 2 # f16
|
||||
|
||||
M = int(seq_len)
|
||||
K = int(d_model)
|
||||
N = int(cols_per_pe)
|
||||
|
||||
w_slice = K * N * bpe
|
||||
out_slice = M * N * bpe
|
||||
|
||||
x = tl.load(int(x_ptr), shape=(M, K), dtype=DTYPE)
|
||||
wq = tl.ref(int(wq_ptr) + pid * w_slice, shape=(K, N), dtype=DTYPE)
|
||||
wk = tl.ref(int(wk_ptr) + pid * w_slice, shape=(K, N), dtype=DTYPE)
|
||||
wv = tl.ref(int(wv_ptr) + pid * w_slice, shape=(K, N), dtype=DTYPE)
|
||||
|
||||
hq = tl.composite(op="gemm", a=x, b=wq,
|
||||
out_ptr=int(out_q_ptr) + pid * out_slice)
|
||||
hk = tl.composite(op="gemm", a=x, b=wk,
|
||||
out_ptr=int(out_k_ptr) + pid * out_slice)
|
||||
hv = tl.composite(op="gemm", a=x, b=wv,
|
||||
out_ptr=int(out_v_ptr) + pid * out_slice)
|
||||
|
||||
tl.wait(hq)
|
||||
tl.wait(hk)
|
||||
tl.wait(hv)
|
||||
|
||||
|
||||
@bench(
|
||||
name="gpt3-qkv",
|
||||
description="GPT-3 QKV projection sharded column-wise across all PEs.",
|
||||
)
|
||||
def run(torch):
|
||||
"""Run the GPT-3 QKV benchmark."""
|
||||
M = SEQ_LEN
|
||||
K = GPT3_D_MODEL
|
||||
N = COLS_PER_PE
|
||||
|
||||
# ADR-0026: DPPolicy is intra-device only. For multi-SIP execution the
|
||||
# ADR-0024 launcher calls this bench once per SIP (each worker via
|
||||
# torch.ahbm.set_device(rank)); here the policy describes only the
|
||||
# cube × PE layout within a single SIP.
|
||||
# X: replicated across all PEs within the SIP
|
||||
dp_replicate = DPPolicy(cube="replicate", pe="replicate",
|
||||
num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
|
||||
# W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs within the SIP
|
||||
dp_sharded = DPPolicy(cube="column_wise", pe="column_wise",
|
||||
num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
|
||||
|
||||
x = torch.empty((M, K), dtype=DTYPE, dp=dp_replicate, name="x")
|
||||
wq = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wq")
|
||||
wk = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wk")
|
||||
wv = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wv")
|
||||
out_q = torch.empty((M, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="out_q")
|
||||
out_k = torch.empty((M, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="out_k")
|
||||
out_v = torch.empty((M, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="out_v")
|
||||
|
||||
torch.launch("gpt3_qkv", _gpt3_qkv_kernel,
|
||||
x, wq, wk, wv, out_q, out_k, out_v,
|
||||
M, K, N)
|
||||
@@ -0,0 +1,9 @@
|
||||
from kernbench.benches.registry import bench
|
||||
|
||||
|
||||
@bench(
|
||||
name="ipcq-allreduce",
|
||||
description="IPCQ all-reduce kernel bench (placeholder).",
|
||||
)
|
||||
def run(torch):
|
||||
print("IPCQ all reduce kernel bench")
|
||||
@@ -0,0 +1,74 @@
|
||||
"""Single-PE composite GEMM for PE_accelerator perf characterization.
|
||||
|
||||
Three operand-staging variants are selectable via MATMUL_VARIANT:
|
||||
|
||||
- "ref_ref" (default): a = tl.ref, b = tl.ref
|
||||
Both operands HBM-resident; scheduler streams per-tile DMA.
|
||||
- "load_ref": a = tl.load, b = tl.ref
|
||||
A eagerly DMA'd into TCM up-front; B streamed per-tile.
|
||||
- "load_load": a = tl.load, b = tl.load
|
||||
Both eagerly DMA'd into TCM up-front.
|
||||
|
||||
Other env vars: MATMUL_M, MATMUL_K, MATMUL_N, MATMUL_DTYPE.
|
||||
|
||||
Run:
|
||||
MATMUL_M=256 MATMUL_K=256 MATMUL_N=256 MATMUL_VARIANT=load_ref \
|
||||
kernbench run --topology topology.yaml --bench matmul_composite
|
||||
"""
|
||||
import os
|
||||
|
||||
from kernbench.benches.registry import bench
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
M = int(os.environ.get("MATMUL_M", "256"))
|
||||
K = int(os.environ.get("MATMUL_K", "256"))
|
||||
N = int(os.environ.get("MATMUL_N", "256"))
|
||||
DTYPE = os.environ.get("MATMUL_DTYPE", "f16")
|
||||
VARIANT = os.environ.get("MATMUL_VARIANT", "ref_ref")
|
||||
|
||||
|
||||
def _kernel_ref_ref(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
|
||||
M, K, N = int(M), int(K), int(N)
|
||||
a = tl.ref(int(a_ptr), shape=(M, K), dtype=DTYPE)
|
||||
b = tl.ref(int(b_ptr), shape=(K, N), dtype=DTYPE)
|
||||
h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
|
||||
tl.wait(h)
|
||||
|
||||
|
||||
def _kernel_load_ref(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
|
||||
M, K, N = int(M), int(K), int(N)
|
||||
a = tl.load(int(a_ptr), shape=(M, K), dtype=DTYPE)
|
||||
b = tl.ref(int(b_ptr), shape=(K, N), dtype=DTYPE)
|
||||
h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
|
||||
tl.wait(h)
|
||||
|
||||
|
||||
def _kernel_load_load(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
|
||||
M, K, N = int(M), int(K), int(N)
|
||||
a = tl.load(int(a_ptr), shape=(M, K), dtype=DTYPE)
|
||||
b = tl.load(int(b_ptr), shape=(K, N), dtype=DTYPE)
|
||||
h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr))
|
||||
tl.wait(h)
|
||||
|
||||
|
||||
_KERNELS = {
|
||||
"ref_ref": _kernel_ref_ref,
|
||||
"load_ref": _kernel_load_ref,
|
||||
"load_load": _kernel_load_load,
|
||||
}
|
||||
|
||||
|
||||
@bench(
|
||||
name="matmul-composite",
|
||||
description="Single-PE composite GEMM with ref/load variants for perf characterization.",
|
||||
)
|
||||
def run(torch):
|
||||
if VARIANT not in _KERNELS:
|
||||
raise ValueError(f"unknown MATMUL_VARIANT={VARIANT!r}; "
|
||||
f"expected one of {list(_KERNELS)}")
|
||||
kernel_fn = _KERNELS[VARIANT]
|
||||
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
|
||||
a = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
|
||||
b = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
|
||||
out = torch.empty((M, N), dtype=DTYPE, dp=dp, name="out")
|
||||
torch.launch(f"matmul_composite_{VARIANT}", kernel_fn, a, b, out, M, K, N)
|
||||
@@ -0,0 +1,46 @@
|
||||
"""QKV GEMM benchmark: Q*K^T projection on a single PE.
|
||||
|
||||
Demonstrates the full host-to-PE kernel launch pipeline:
|
||||
Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_CPU → PE_SCHEDULER → engines
|
||||
|
||||
Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
|
||||
- Tensor a is loaded into TCM via DMA
|
||||
- Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
|
||||
"""
|
||||
from kernbench.benches.registry import bench
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
# GEMM dimensions: (M, K) x (K, N) → (M, N)
|
||||
# Small dims (1 tile) for fast regression. The test verifies the full
|
||||
# host→PE pipeline, not large-matrix throughput.
|
||||
M, K, N = 32, 64, 32
|
||||
DTYPE = "f16"
|
||||
|
||||
|
||||
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
|
||||
"""QKV GEMM kernel: out = a @ b.
|
||||
|
||||
a is loaded into TCM (DMA_READ).
|
||||
b is referenced in HBM (tl.ref, no DMA — scheduler streams per-tile).
|
||||
"""
|
||||
a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
|
||||
b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
|
||||
handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
|
||||
tl.wait(handle)
|
||||
|
||||
|
||||
@bench(
|
||||
name="qkv-gemm",
|
||||
description="QKV GEMM (Q*K^T) on a single PE — full host-to-PE pipeline.",
|
||||
)
|
||||
def run(torch):
|
||||
"""Run the QKV GEMM benchmark."""
|
||||
# DP placement: a=replicate (cube-level), b/out=column_wise (N-axis, single PE)
|
||||
a = torch.zeros((M, K), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="replicate"), name="a")
|
||||
b = torch.zeros((K, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="b")
|
||||
out = torch.empty(
|
||||
(M, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="out",
|
||||
)
|
||||
|
||||
# Launch GEMM kernel
|
||||
torch.launch("qkv_gemm", _gemm_kernel, a, b, out, M, K, N)
|
||||
@@ -0,0 +1,46 @@
|
||||
"""QKV GEMM benchmark: Q*K^T projection on all PEs in a cube (multi-PE).
|
||||
|
||||
Column-parallel GEMM: a is replicated (cube-level), b/out are column-sharded.
|
||||
M_CPU fans out KernelLaunchMsg to all 8 PE_CPUs (ADR-0009 D3).
|
||||
|
||||
Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
|
||||
- Tensor a is loaded into TCM via DMA
|
||||
- Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
|
||||
"""
|
||||
from kernbench.benches.registry import bench
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
# GEMM dimensions: (M, K) x (K, N) -> (M, N)
|
||||
# Small dims (1 tile) for fast regression. The test verifies the multi-PE
|
||||
# fan-out pipeline, not large-matrix throughput.
|
||||
M, K, N = 32, 64, 32
|
||||
DTYPE = "f16"
|
||||
|
||||
|
||||
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
|
||||
"""QKV GEMM kernel: out = a @ b.
|
||||
|
||||
a is loaded into TCM (DMA_READ).
|
||||
b is referenced in HBM (tl.ref, no DMA -- scheduler streams per-tile).
|
||||
"""
|
||||
a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
|
||||
b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
|
||||
handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
|
||||
tl.wait(handle)
|
||||
|
||||
|
||||
@bench(
|
||||
name="qkv-gemm-multi-pe",
|
||||
description="Column-parallel QKV GEMM across all PEs in a cube (multi-PE).",
|
||||
)
|
||||
def run(torch):
|
||||
"""Run the multi-PE QKV GEMM benchmark."""
|
||||
# DP placement: a=replicate (cube-level), b/out=column_wise (N-axis split)
|
||||
a = torch.zeros((M, K), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="replicate"), name="a")
|
||||
b = torch.zeros((K, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="b")
|
||||
out = torch.empty(
|
||||
(M, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="out",
|
||||
)
|
||||
|
||||
# Launch GEMM kernel on all PEs
|
||||
torch.launch("qkv_gemm_multi", _gemm_kernel, a, b, out, M, K, N)
|
||||
@@ -0,0 +1,106 @@
|
||||
"""Bench registry: @bench decorator + name/index resolution.
|
||||
|
||||
Each bench module under ``kernbench.benches`` MUST register its callable
|
||||
via ``@bench(name=..., description=...)``. Indices are assigned
|
||||
alphabetically by name after eager import; they are a CLI convenience,
|
||||
not a stable API.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from importlib import import_module
|
||||
from pkgutil import iter_modules
|
||||
|
||||
BenchFn = Callable[..., object]
|
||||
|
||||
_NAME_RE = re.compile(r"^[a-z][a-z0-9]*(-[a-z0-9]+)*$")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BenchSpec:
|
||||
index: int
|
||||
name: str
|
||||
description: str
|
||||
run: BenchFn
|
||||
|
||||
|
||||
_PENDING: list[tuple[str, str, BenchFn]] = []
|
||||
_REGISTERED_MODULES: set[str] = set()
|
||||
_REGISTRY: dict[str, BenchSpec] = {}
|
||||
|
||||
|
||||
def bench(*, name: str, description: str) -> Callable[[BenchFn], BenchFn]:
|
||||
if not isinstance(name, str) or not _NAME_RE.match(name):
|
||||
raise ValueError(
|
||||
f"bench name {name!r} must be kebab-case (lowercase, digits, dashes; "
|
||||
f"starts with a letter)."
|
||||
)
|
||||
if not isinstance(description, str) or not description.strip():
|
||||
raise ValueError(f"bench {name!r}: description must be a non-empty string.")
|
||||
|
||||
def deco(fn: BenchFn) -> BenchFn:
|
||||
_PENDING.append((name, description, fn))
|
||||
_REGISTERED_MODULES.add(fn.__module__)
|
||||
return fn
|
||||
|
||||
return deco
|
||||
|
||||
|
||||
def _finalize() -> None:
|
||||
if _REGISTRY:
|
||||
return
|
||||
seen: set[str] = set()
|
||||
for n, _, _ in _PENDING:
|
||||
if n in seen:
|
||||
raise RuntimeError(f"duplicate bench name: {n!r}")
|
||||
seen.add(n)
|
||||
for i, (n, d, f) in enumerate(sorted(_PENDING, key=lambda t: t[0]), start=1):
|
||||
_REGISTRY[n] = BenchSpec(index=i, name=n, description=d, run=f)
|
||||
|
||||
|
||||
def list_all() -> list[BenchSpec]:
|
||||
_finalize()
|
||||
return sorted(_REGISTRY.values(), key=lambda s: s.index)
|
||||
|
||||
|
||||
def resolve(identifier: str) -> BenchSpec:
|
||||
_finalize()
|
||||
if not isinstance(identifier, str) or not identifier.strip():
|
||||
raise ValueError("bench identifier must be a non-empty string.")
|
||||
ident = identifier.strip()
|
||||
if ident.isdigit():
|
||||
idx = int(ident)
|
||||
for s in _REGISTRY.values():
|
||||
if s.index == idx:
|
||||
return s
|
||||
raise ValueError(
|
||||
f"No bench with index {idx}. Use 'kernbench list' to see options."
|
||||
)
|
||||
if ident in _REGISTRY:
|
||||
return _REGISTRY[ident]
|
||||
raise ValueError(
|
||||
f"Unknown bench {ident!r}. Use 'kernbench list' to see options."
|
||||
)
|
||||
|
||||
|
||||
def _audit_modules(imported: list[str], registered: set[str]) -> None:
|
||||
missing = sorted(m for m in imported if m not in registered)
|
||||
if missing:
|
||||
raise RuntimeError(
|
||||
f"Bench module(s) missing @bench decorator: {missing}. "
|
||||
f"Each file under kernbench.benches/ must register at least one bench "
|
||||
f"via @bench(...), or be renamed with a leading underscore if it is a "
|
||||
f"helper."
|
||||
)
|
||||
|
||||
|
||||
def _eager_import_and_audit(pkg_path: list[str], pkg_name: str) -> None:
|
||||
imported: list[str] = []
|
||||
for m in iter_modules(pkg_path):
|
||||
if m.name == "registry" or m.name.startswith("_"):
|
||||
continue
|
||||
mod = import_module(f"{pkg_name}.{m.name}")
|
||||
imported.append(mod.__name__)
|
||||
_audit_modules(imported, _REGISTERED_MODULES)
|
||||
@@ -0,0 +1,47 @@
|
||||
"""VA offset verification benchmark.
|
||||
|
||||
Verifies that Triton-style base_ptr + pid * stride addressing works correctly
|
||||
with intra-SIP TP sharding (cube/pe column_wise). Each PE loads its own
|
||||
block from a sharded tensor and stores it back.
|
||||
|
||||
The kernel uses standard Triton patterns:
|
||||
- tl.program_id(0) for PE index within cube
|
||||
- tl.num_programs(0) for PE count within cube
|
||||
- Shape args are automatically localized by launch()
|
||||
"""
|
||||
from kernbench.benches.registry import bench
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
M, K = 128, 256
|
||||
DTYPE = "f16"
|
||||
|
||||
|
||||
def _copy_kernel(src_ptr, dst_ptr, M, K, tl, DTYPE="f16"):
|
||||
"""Standard Triton copy kernel. M and K are cube-local (set by launch)."""
|
||||
pid = tl.program_id(0)
|
||||
num_pe = tl.num_programs(0)
|
||||
cols_per_pe = K // num_pe
|
||||
elem_bytes = 2 # f16
|
||||
offset = pid * M * cols_per_pe * elem_bytes
|
||||
data = tl.load(src_ptr + offset, shape=(M, cols_per_pe), dtype=DTYPE)
|
||||
tl.store(dst_ptr + offset, data)
|
||||
|
||||
|
||||
@bench(
|
||||
name="va-offset-verify",
|
||||
description="Triton base_ptr + pid * stride VA addressing verification (TP sharded).",
|
||||
)
|
||||
def run(torch):
|
||||
"""Run the VA offset verification benchmark with full TP sharding."""
|
||||
dp = DPPolicy(cube="column_wise", pe="column_wise")
|
||||
src = torch.zeros((M, K), dtype=DTYPE, dp=dp, name="src")
|
||||
dst = torch.empty((M, K), dtype=DTYPE, dp=dp, name="dst")
|
||||
|
||||
# launch() automatically converts M, K to cube-local values
|
||||
torch.launch("va_offset_copy", _copy_kernel, src, dst, M, K)
|
||||
|
||||
# Sanity check: kernel completed with non-zero latency
|
||||
kernel_traces = [t for t in torch._traces if t["phase"] == "kernel"]
|
||||
assert len(kernel_traces) > 0, "No kernel traces recorded"
|
||||
for kt in kernel_traces:
|
||||
assert kt["total_ns"] > 0, f"Kernel latency is zero for {kt}"
|
||||
@@ -1,10 +1,10 @@
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from benches.loader import resolve_bench
|
||||
from kernbench.cli.probe import cmd_probe
|
||||
from kernbench.benches.registry import list_all, resolve
|
||||
from kernbench.cli.report import format_report
|
||||
from kernbench.common.types import SimEngine
|
||||
from kernbench.probes.probe import cmd_probe
|
||||
from kernbench.runtime_api.bench_runner import run_bench
|
||||
from kernbench.runtime_api.types import DeviceSelector, resolve_device
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
@@ -17,7 +17,10 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
|
||||
runp = sub.add_parser("run", help="Run a benchmark")
|
||||
runp.add_argument("--topology", required=True)
|
||||
runp.add_argument("--bench", required=True)
|
||||
runp.add_argument(
|
||||
"--bench", required=True,
|
||||
help="Bench name (kebab-case) or numeric index from 'kernbench list'",
|
||||
)
|
||||
runp.add_argument(
|
||||
"--device", default=None, help="Target device: 'all' or 'sip:<N>' (default: all)"
|
||||
)
|
||||
@@ -27,6 +30,9 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
)
|
||||
runp.set_defaults(_handler=cmd_run)
|
||||
|
||||
listp = sub.add_parser("list", help="List registered benches")
|
||||
listp.set_defaults(_handler=cmd_list)
|
||||
|
||||
probep = sub.add_parser("probe", help="Probe latency and BW for predefined traffic patterns")
|
||||
probep.add_argument("--topology", required=True)
|
||||
probep.add_argument("--case", default="all", help="Case name or 'all' (default: all)")
|
||||
@@ -53,23 +59,34 @@ def cmd_web(args) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def cmd_list(args) -> int:
|
||||
specs = list_all()
|
||||
print(f"{'#':>3} {'NAME':<22} DESCRIPTION")
|
||||
print("-" * 80)
|
||||
for s in specs:
|
||||
print(f"{s.index:>3} {s.name:<22} {s.description}")
|
||||
return 0
|
||||
|
||||
|
||||
def cmd_run(args) -> int:
|
||||
print("> Running benchmark with:", args)
|
||||
|
||||
topo = resolve_topology(args.topology)
|
||||
bench = resolve_bench(args.bench)
|
||||
spec_entry = resolve(args.bench)
|
||||
device = resolve_device(args.device)
|
||||
verify_data = getattr(args, "verify_data", False)
|
||||
|
||||
def _factory(topology, device):
|
||||
return engine_factory(topology, device, enable_data=verify_data)
|
||||
|
||||
result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=_factory)
|
||||
result = run_bench(
|
||||
topology=topo, bench_fn=spec_entry.run, device=device, engine_factory=_factory,
|
||||
)
|
||||
|
||||
topo_obj = getattr(topo, "topology_obj", topo)
|
||||
spec = getattr(topo_obj, "spec", None)
|
||||
if result.traces:
|
||||
print(format_report(result.traces, title=args.bench, spec=spec))
|
||||
print(format_report(result.traces, title=spec_entry.name, spec=spec))
|
||||
print(result.summary_text())
|
||||
|
||||
# Phase 2 diagnostic summary (ADR-0020). The actual Phase 2 replay
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
"""kernbench.probes: latency/BW diagnostic utilities (not benchmarks).
|
||||
|
||||
See ADR-0010 D4. Probe is a developer tool for verifying the latency/BW
|
||||
model; it bypasses the bench registry.
|
||||
"""
|
||||
Reference in New Issue
Block a user