commit - release 1

This commit is contained in:
2026-03-18 11:47:48 -07:00
commit 6f43807900
109 changed files with 14909 additions and 0 deletions
+96
View File
@@ -0,0 +1,96 @@
from __future__ import annotations
from collections.abc import Callable
from enum import Enum
from typing import Any
from kernbench.common.types import Completion, SimEngine, Trace
from .context import RuntimeContext
from .types import BenchResult, DeviceSelector
class CompletionPolicy(str, Enum):
    """Strategy ``run_bench`` uses to derive the overall bench completion.

    Members also subclass ``str``, so each one compares equal to its raw
    string value.
    """

    # Report the completion of the most recently submitted request.
    LAST_SUBMITTED = "last_submitted"
    # Requires trace/timestamps or engine support; stub for now
    # (run_bench currently falls back to the last submitted request).
    LAST_COMPLETED = "last_completed"
    # Check every request in submission order and return the first failure.
    ALL_OK_FAIL_FAST = "all_ok_fail_fast"
# A bench body: receives the RuntimeContext and submits requests through it.
BenchFn = Callable[[RuntimeContext], Any]
# Builds the simulation engine for a given (topology, device selector) pair.
EngineFactory = Callable[[object, DeviceSelector], SimEngine]
def run_bench(
    *,
    topology: object,
    bench_fn: BenchFn,
    device: DeviceSelector,
    engine_factory: EngineFactory,
    correlation_id: str = "bench0",
    completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED,
) -> BenchResult:
    """
    Minimal bench runner.

    - topology: compiled topology object (opaque to runtime here)
    - bench_fn: callable that receives RuntimeContext and submits requests
    - device: DeviceSelector ("all" or "sip:<N>")
    - engine_factory: builds sim_engine for given topology & device
    - correlation_id: identifier copied into the context and the result
    - completion_policy: how to determine overall completion/result

    Returns a BenchResult. When the bench submitted nothing, the result
    carries a failed Completion with error_code "NO_REQUESTS".
    """
    engine = engine_factory(topology, device)

    # The topology may wrap the real graph in ``topology_obj``; otherwise it
    # is used directly. A missing ``spec`` attribute becomes None.
    graph = getattr(topology, "topology_obj", topology)
    ctx = RuntimeContext(
        engine=engine,
        target_device=device,
        correlation_id=correlation_id,
        spec=getattr(graph, "spec", None),
    )

    bench_fn(ctx)
    ctx.wait_all()

    # An empty trace list is reported as None rather than [].
    traces = ctx._traces or None
    submitted = ctx.handles()

    def _result(completion, trace):
        # Every exit path shares the correlation id and the collected traces.
        return BenchResult(
            completion=completion,
            correlation_id=correlation_id,
            trace=trace,
            traces=traces,
        )

    if not submitted:
        return _result(
            Completion(
                ok=False,
                error_code="NO_REQUESTS",
                error_message="Bench submitted no requests",
            ),
            None,
        )

    if completion_policy == CompletionPolicy.ALL_OK_FAIL_FAST:
        newest_trace: Trace | None = None
        for handle in submitted:
            outcome, trace = engine.get_completion(handle)
            if trace is not None:
                newest_trace = trace
            if not outcome.ok:
                # First failure wins; later handles are not inspected.
                return _result(outcome, newest_trace)
        return _result(Completion(ok=True), newest_trace)

    # LAST_SUBMITTED, and the LAST_COMPLETED placeholder (needs engine support
    # for timing), both report the most recently submitted request.
    completion, trace = engine.get_completion(submitted[-1])
    return _result(completion, trace)
+282
View File
@@ -0,0 +1,282 @@
# kernbench/runtime_api/context.py
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
from kernbench.common.types import Completion, RequestHandle, SimEngine
from .types import DeviceSelector
@dataclass
class RuntimeContext:
    """Per-bench handle through which bench code drives the simulation engine.

    It records every submitted request handle, remembers which handles have
    been waited on, lazily builds per-PE memory allocators from ``spec``, and
    offers a small PyTorch-like API (``zeros``, ``empty``, ``launch``).

    Fields:
    - engine: object providing ``get_completion(handle)`` and, optionally,
      ``submit(request)`` / ``wait(handle)``.
    - target_device: device selector this context was created for.
    - correlation_id: identifier copied into every message built here.
    - spec: topology spec dict; required only for tensor operations.
    """

    engine: SimEngine
    target_device: DeviceSelector
    correlation_id: str
    spec: dict | None = None
    # Handles in submission order.
    _handles: list[RequestHandle] = field(default_factory=list, init=False)
    # Handles that wait() has already processed.
    _completed: set[RequestHandle] = field(default_factory=set, init=False)
    # Flat PE index -> allocator, filled by _ensure_allocators().
    _allocators: dict[int, Any] = field(default_factory=dict, init=False)
    # Source of auto-generated tensor names ("t1", "t2", ...).
    _tensor_counter: int = field(default=0, init=False)
    # Trace entries recorded by wait() calls that supplied ``_meta``.
    _traces: list[dict] = field(default_factory=list, init=False)

    def submit(self, request: Any) -> RequestHandle:
        """Submit ``request`` to the engine and remember the returned handle.

        Raises:
            AttributeError: if the engine has no ``submit`` attribute.
        """
        submit_fn = getattr(self.engine, "submit", None)
        if submit_fn is None:
            raise AttributeError("Engine does not implement submit(request) -> RequestHandle.")
        handle: RequestHandle = submit_fn(request)  # type: ignore[call-arg]
        self._handles.append(handle)
        return handle

    def is_completed(self, handle: RequestHandle) -> bool:
        """Return True once ``wait`` has been called for ``handle``."""
        return handle in self._completed

    def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
        """Wait for ``handle`` and return its completion.

        A handle that was already waited on is not waited on again; its
        completion is simply re-read from the engine. When ``_meta`` is given
        and the engine returned a trace, a trace entry (the trace merged with
        ``_meta``) is appended to the collected traces.
        """
        if handle in self._completed:
            completion, _ = self.engine.get_completion(handle)
            return completion
        # Engines without a wait() hook are queried directly.
        wait_fn = getattr(self.engine, "wait", None)
        if wait_fn is not None:
            wait_fn(handle)  # type: ignore[misc]
        completion, trace = self.engine.get_completion(handle)
        self._completed.add(handle)
        if _meta is not None and trace is not None:
            # Non-dict traces are wrapped so the entry is always a dict.
            entry = dict(trace) if isinstance(trace, dict) else {"raw": trace}
            entry.update(_meta)
            self._traces.append(entry)
        return completion

    def wait_all(self) -> None:
        """Wait on every submitted handle that has not been waited on yet."""
        for h in self._handles:
            if h not in self._completed:
                self.wait(h)

    def handles(self) -> list[RequestHandle]:
        """Return a copy of the submitted handles, in submission order."""
        return list(self._handles)

    # ── PyTorch-like tensor API ──────────────────────────────────────

    def _ensure_allocators(self) -> dict:
        """Lazily create PEMemAllocator instances from spec.

        Also caches ``_pes_per_cube``, ``_num_cubes`` and ``_num_sips`` on the
        instance for later placement resolution.

        Raises:
            RuntimeError: if ``spec`` is None.
        """
        if self._allocators:
            return self._allocators
        if self.spec is None:
            raise RuntimeError(
                "RuntimeContext.spec is required for tensor operations. "
                "Pass spec=graph.spec when creating RuntimeContext."
            )
        from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator

        system = self.spec.get("system", {})
        cube = self.spec.get("cube", {})
        mm = cube.get("memory_map", {})
        tcm_cfg = (
            cube.get("pe_template", {})
            .get("components", {})
            .get("pe_tcm", {})
            .get("attrs", {})
        )
        sips = system.get("sips", {})
        pe_layout = cube.get("pe_layout", {})

        # Missing spec entries fall back to the defaults given here.
        sip_count = sips.get("count", 1)
        cubes_per_sip = sips.get("cubes_per_sip", 16)
        pes_per_cube = pe_layout.get("pe_per_corner", 2) * len(
            pe_layout.get("corners", ["NW", "NE", "SW", "SE"])
        )
        hbm_gb = mm.get("hbm_total_gb_per_cube", 48)
        hbm_slices = mm.get("hbm_slices_per_cube", 8)
        tcm_mb = tcm_cfg.get("size_mb", 16)

        cfg = AddressConfig(
            sip_count=sip_count,
            cubes_per_sip=cubes_per_sip,
            pes_per_cube=pes_per_cube,
            hbm_bytes_per_cube=hbm_gb * (1 << 30),
            hbm_slices_per_cube=hbm_slices,
            tcm_bytes_per_pe=tcm_mb * (1 << 20),
            tcm_scheduler_reserved_bytes=4 * (1 << 20),
            sram_bytes_per_cube=32 * (1 << 20),
        )

        # Create allocators for all SIPs × cubes × PEs
        # Flat index: sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id
        self._pes_per_cube = pes_per_cube
        self._num_cubes = cubes_per_sip
        self._num_sips = sip_count
        cubes_x_pes = cubes_per_sip * pes_per_cube
        for sip_id in range(sip_count):
            for cube_id in range(cubes_per_sip):
                for pe_id in range(pes_per_cube):
                    flat_idx = sip_id * cubes_x_pes + cube_id * pes_per_cube + pe_id
                    self._allocators[flat_idx] = PEMemAllocator(
                        rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
                    )
        return self._allocators

    def _next_tensor_name(self) -> str:
        """Return the next auto-generated tensor name ("t1", "t2", ...)."""
        self._tensor_counter += 1
        return f"t{self._tensor_counter}"

    def zeros(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        *,
        placement: list | None = None,
        dp: Any = None,
        name: str | None = None,
    ):
        """Create a tensor and deploy to HBM with zero-fill (like torch.zeros)."""
        return self._create_tensor(shape, dtype, placement, name, pattern="zero", dp=dp)

    def empty(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        *,
        placement: list | None = None,
        dp: Any = None,
        name: str | None = None,
    ):
        """Allocate a tensor in HBM without initialization (like torch.empty)."""
        return self._create_tensor(shape, dtype, placement, name, pattern=None, dp=dp)

    def _create_tensor(
        self,
        shape: tuple[int, ...],
        dtype: str,
        placement: list | None,
        name: str | None,
        pattern: str | None,
        dp: Any = None,
    ):
        """Build a Tensor, allocate its shards and optionally fill them.

        - placement: explicit shard list; ignored when ``dp`` is a DPPolicy.
          When both are absent, a single shard covering the whole tensor at
          pe_index 0 is used.
        - pattern: when not None, one MemoryWriteMsg per shard is submitted
          and waited on before returning.

        Raises:
            ValueError: if the resolved placement contains no shards.
        """
        from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
        from kernbench.runtime_api.kernel import MemoryWriteMsg
        from kernbench.runtime_api.tensor import Tensor, deploy_tensor, dtype_itemsize

        tensor_name = name or self._next_tensor_name()
        t = Tensor(shape=shape, dtype=dtype, name=tensor_name)
        dp_policy: DPPolicy | None = None

        # Resolve placement: dp= takes priority over placement=
        if dp is not None and isinstance(dp, DPPolicy):
            dp_policy = dp
            # Needed here for its side effect: it sets the topology counts
            # (_num_sips, _num_cubes, _pes_per_cube) read just below.
            self._ensure_allocators()
            itemsize = dtype_itemsize(dtype)
            # NOTE(review): DP placement reads only the first two dims, so a
            # 1-D shape raises IndexError here — confirm 2-D is the contract.
            shape_2d = (shape[0], shape[1])  # type: tuple[int, int]
            total_cubes = self._num_sips * self._num_cubes
            placement = resolve_dp_policy(
                dp, shape=shape_2d, itemsize=itemsize,
                num_pe=self._pes_per_cube, num_cubes=total_cubes,
            )
        elif placement is None:
            placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=t.nbytes)]

        # An empty placement used to surface as a bare StopIteration from
        # next(iter(...)) below; report it explicitly instead.
        if not placement:
            raise ValueError(
                f"Tensor '{tensor_name}' has an empty placement; "
                "at least one shard is required."
            )

        # Infer target_pe from placement: multi-PE → "all", single PE → pe_index
        pe_indices = {s.pe_index for s in placement}
        target_pe: int | str = "all" if len(pe_indices) > 1 else next(iter(pe_indices))
        t.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy)

        # Allocate PAs via PEMemAllocator
        allocators = self._ensure_allocators()
        handle = deploy_tensor(
            name=tensor_name,
            shape=shape,
            dtype=dtype,
            placement=placement,
            allocators=allocators,
        )
        t._handle = handle

        # Submit MemoryWriteMsg per shard (deploy data to device)
        if pattern is not None:
            for shard in handle.shards:
                h = self.submit(MemoryWriteMsg(
                    correlation_id=self.correlation_id,
                    request_id=f"deploy_{tensor_name}_pe{shard.pe}",
                    dst_sip=shard.sip, dst_cube=shard.cube, dst_pe=shard.pe,
                    dst_pa=shard.pa, nbytes=shard.nbytes, pattern=pattern,
                    target_cubes=(shard.cube,), target_pe=shard.pe,
                ))
                self.wait(h, _meta={
                    "phase": "memory_write", "name": tensor_name,
                    "sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
                    "nbytes": shard.nbytes,
                })
        return t

    def launch(
        self,
        kernel_name: str,
        kernel_fn: Any,
        *args: Any,
        **kwargs: Any,
    ) -> RequestHandle:
        """Register and launch a kernel (like a fused torch op).

        Positional args: Tensor objects become TensorArg, int/float become ScalarArg.
        Keyword args: become ScalarArg (name is discarded, order preserved).
        Arguments of any other type are skipped.

        The launch is submitted and waited on before the handle is returned.
        """
        from kernbench.runtime_api.kernel import (
            KernelLaunchMsg,
            KernelRef,
            ScalarArg,
        )
        from kernbench.runtime_api.tensor import Tensor
        from kernbench.triton_emu.registry import register_kernel

        # Register kernel (idempotent): the ValueError raised by
        # register_kernel is deliberately ignored so repeated launches work.
        try:
            register_kernel(kernel_name, kernel_fn)
        except ValueError:
            pass

        def _as_scalar(value):
            # Floats map to "f32"; every other accepted number maps to "i32".
            dtype_str = "f32" if isinstance(value, float) else "i32"
            return ScalarArg(dtype=dtype_str, value=value)

        # Build kernel args from positional + keyword args
        kernel_args: list = []
        target_pe: int | str = 0
        for a in args:
            if isinstance(a, Tensor):
                kernel_args.append(a.to_tensor_arg())
                # Infer target_pe from tensor DP metadata; "all" is sticky
                # once any tensor requests it.
                if a._dp_metadata is not None:
                    dp_target = a._dp_metadata.target_pe
                    if dp_target == "all":
                        target_pe = "all"
                    elif isinstance(dp_target, int) and target_pe != "all":
                        target_pe = dp_target
            elif isinstance(a, (int, float)):
                kernel_args.append(_as_scalar(a))
        for v in kwargs.values():
            if isinstance(v, (int, float)):
                kernel_args.append(_as_scalar(v))

        # Determine target cubes from all tensor shards
        cube_set: set[int] = {
            s.cube
            for a in args
            if isinstance(a, Tensor) and a._handle is not None
            for s in a._handle.shards
        }
        target_cubes = tuple(sorted(cube_set)) if cube_set else (0,)

        # Collect scalar values for GEMM FLOP calculation
        scalar_vals = [a.value for a in kernel_args if hasattr(a, "value")]

        h = self.submit(KernelLaunchMsg(
            correlation_id=self.correlation_id,
            request_id=kernel_name,
            kernel_ref=KernelRef(name=kernel_name, kind="builtin"),
            args=tuple(kernel_args),
            target_cubes=target_cubes,
            target_pe=target_pe,
        ))
        self.wait(h, _meta={
            "phase": "kernel", "name": kernel_name,
            "target_pe": target_pe, "scalars": scalar_vals,
        })
        return h
+123
View File
@@ -0,0 +1,123 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Literal, TypeAlias
@dataclass(frozen=True)
class MemoryWriteMsg:
    """Immutable memory-write request message (``msg_type == "memory_write"``)."""

    correlation_id: str
    request_id: str

    # Destination coordinates (SIP / cube / PE) and PA of the write.
    dst_sip: int
    dst_cube: int
    dst_pe: int
    dst_pa: int
    nbytes: int

    # Where the written data comes from; defaults to a fill pattern.
    src_kind: Literal["pattern", "host_buffer_ref"] = "pattern"
    pattern: str | None = None

    # Targeting of the message; "all" is the default for both.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"

    # Discriminator tag for this message kind.
    msg_type: Literal["memory_write"] = "memory_write"
@dataclass(frozen=True)
class MemoryReadMsg:
    """Immutable memory-read request message (``msg_type == "memory_read"``)."""

    correlation_id: str
    request_id: str

    # Source coordinates (SIP / cube / PE) and PA of the read.
    src_sip: int
    src_cube: int
    src_pe: int
    src_pa: int
    nbytes: int

    # Targeting of the message; "all" is the default for both.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"

    # Discriminator tag for this message kind.
    msg_type: Literal["memory_read"] = "memory_read"
@dataclass(frozen=True)
class KernelRef:
    """Reference to a kernel binary or builtin timing model.

    Kernel binaries must be pre-deployed to device memory via MemoryWriteMsg.
    KernelLaunchMsg references the deployed location by PA — source code or IR
    MUST NOT be embedded in launch messages.

    - "deployed": kernel binary pre-deployed to HBM/SRAM at deploy_pa.
    - "builtin": simulator built-in timing model, identified by name.
    """

    name: str
    kind: Literal["deployed", "builtin"]

    # Location and size of the deployed binary; deploy_pa defaults to None
    # and the remaining fields default to 0.
    deploy_pa: int | None = None
    deploy_sip: int = 0
    deploy_cube: int = 0
    deploy_pe: int = 0
    nbytes_code: int = 0
@dataclass(frozen=True)
class TensorArgShard:
    """One shard of a tensor kernel argument: where it lives and how big it is."""

    # Coordinates of the shard.
    sip: int
    cube: int
    pe: int
    # PA of the shard and its size in bytes.
    pa: int
    nbytes: int
    # Byte offset of this shard (presumably within the full tensor — confirm).
    offset_bytes: int
@dataclass(frozen=True)
class TensorArg:
    """Tensor kernel argument, described by its shards (``arg_kind == "tensor"``)."""

    shards: tuple[TensorArgShard, ...]
    # Discriminator tag distinguishing tensor arguments from scalar ones.
    arg_kind: Literal["tensor"] = "tensor"
@dataclass(frozen=True)
class ScalarArg:
    """Scalar kernel argument: a dtype tag plus its numeric value."""

    dtype: str
    value: float | int
    # Discriminator tag distinguishing scalar arguments from tensor ones.
    arg_kind: Literal["scalar"] = "scalar"
# A kernel launch argument is either a tensor (shard list) or a scalar value.
KernelArg: TypeAlias = TensorArg | ScalarArg
@dataclass(frozen=True)
class KernelLaunchMsg:
    """Immutable kernel-launch request message (``msg_type == "kernel_launch"``)."""

    correlation_id: str
    request_id: str

    # Which kernel to run, and its arguments in order.
    kernel_ref: KernelRef
    args: tuple[KernelArg, ...]

    # Targeting of the launch; "all" is the default for both.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"

    # Discriminator tag for this message kind.
    msg_type: Literal["kernel_launch"] = "kernel_launch"
@dataclass(frozen=True)
class ResponseMsg:
    """Device→Host response carrying PE execution result."""

    correlation_id: str
    request_id: str
    # Cube and PE the response originates from.
    src_cube: int
    src_pe: int
    # Whether execution succeeded.
    success: bool
    # Discriminator tag for this message kind.
    msg_type: Literal["response"] = "response"
@dataclass(frozen=True)
class PeDmaMsg:
    """Direct PE DMA request: host injects a transfer at PE_DMA level.

    Used by the probe utility to measure PE→HBM latency without requiring
    the full PE_CPU → scheduler → DMA pipeline.
    """

    correlation_id: str
    request_id: str

    # PE that issues the transfer.
    src_sip: int
    src_cube: int
    src_pe: int

    # Target PA and transfer size in bytes.
    dst_pa: int
    nbytes: int

    # Transfer direction flag; defaults to False (not a write).
    is_write: bool = False

    # Discriminator tag for this message kind.
    msg_type: Literal["pe_dma"] = "pe_dma"
+166
View File
@@ -0,0 +1,166 @@
from __future__ import annotations
import math
from dataclasses import dataclass
from typing import Literal
from kernbench.policy.address.allocator import PEMemAllocator
from kernbench.policy.placement.dp import DPPolicy, ShardSpec
from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
@dataclass(frozen=True)
class TensorShard:
    """One allocated shard of a deployed tensor."""

    # Coordinates taken from the allocator that served this shard.
    sip: int
    cube: int
    pe: int
    # Encoded PA returned by the allocator, and the shard size in bytes.
    pa: int
    nbytes: int
    # Byte offset copied from the shard's placement spec.
    offset_bytes: int
@dataclass(frozen=True)
class TensorHandle:
    """Immutable record of a deployed tensor and its allocated shards."""

    name: str
    shape: tuple[int, ...]
    dtype: str
    # Size of one element in bytes.
    itemsize: int
    shards: tuple[TensorShard, ...]

    @property
    def nbytes(self) -> int:
        """Total tensor size in bytes: product of ``shape`` times ``itemsize``."""
        element_count = math.prod(self.shape)
        return element_count * self.itemsize
# Bytes per element for every accepted dtype spelling.
_DTYPE_ITEMSIZE = {
    # 16-bit floating point
    "fp16": 2,
    "float16": 2,
    "f16": 2,
    "bf16": 2,
    # 32-bit floating point
    "fp32": 4,
    "float32": 4,
    "f32": 4,
    # integers
    "int8": 1,
    "i8": 1,
    "int16": 2,
    "i16": 2,
    "int32": 4,
    "i32": 4,
}


def dtype_itemsize(dtype: str) -> int:
    """Return the element size in bytes for ``dtype``.

    Raises:
        ValueError: if ``dtype`` is not one of the known spellings.
    """
    size = _DTYPE_ITEMSIZE.get(dtype)
    if size is None:
        raise ValueError(f"unsupported dtype: {dtype}")
    return size
def deploy_tensor(
    *,
    name: str,
    shape: tuple[int, ...],
    dtype: str,
    placement: list[ShardSpec],
    allocators: dict[int, PEMemAllocator],
    mem_kind: Literal["hbm", "tcm"] = "hbm",
) -> TensorHandle:
    """Allocate memory for every shard in ``placement`` and return a handle.

    - name / shape / dtype: recorded on the returned TensorHandle.
    - placement: shard specs; each spec's ``pe_index`` selects an allocator
      and its ``nbytes`` is the amount allocated.
    - allocators: allocators keyed by the indices used in ``placement``.
    - mem_kind: "hbm" allocates with ``alloc_hbm``, "tcm" with ``alloc_tcm``.

    Returns a TensorHandle with one TensorShard per spec, in placement order.

    Raises:
        ValueError: if ``mem_kind`` is neither "hbm" nor "tcm", or ``dtype``
            is unsupported.
        KeyError: if a spec's ``pe_index`` has no allocator.
    """
    # Fail fast on an unknown memory kind: previously any value other than
    # "hbm" (a typo, wrong case) silently allocated TCM instead.
    if mem_kind not in ("hbm", "tcm"):
        raise ValueError(
            f"unsupported mem_kind: {mem_kind!r} (expected 'hbm' or 'tcm')"
        )

    # Validate the dtype before any allocation happens.
    isize = dtype_itemsize(dtype)

    shards: list[TensorShard] = []
    for spec in placement:
        alloc = allocators[spec.pe_index]
        if mem_kind == "hbm":
            pa = alloc.alloc_hbm(spec.nbytes)
        else:
            pa = alloc.alloc_tcm(spec.nbytes)
        shards.append(TensorShard(
            sip=alloc._sip_id,
            cube=alloc._cube_id,
            pe=alloc._pe_id,
            # The allocator returns an address object; store its encoded form.
            pa=pa.encode(),
            nbytes=spec.nbytes,
            offset_bytes=spec.offset_bytes,
        ))

    return TensorHandle(
        name=name,
        shape=shape,
        dtype=dtype,
        itemsize=isize,
        shards=tuple(shards),
    )
# ── PyTorch-like Tensor API ──────────────────────────────────────────
@dataclass(frozen=True)
class DPMetadata:
    """Data-parallel placement metadata (stored as Tensor._dp_metadata)."""

    # Shard layout of the tensor.
    placement: list[ShardSpec]
    # Policy the placement was derived from, when one was used.
    dp_policy: DPPolicy | None = None
    sip: int = 0
    cube: int = 0
    # int → single PE, "all" → all PEs
    target_pe: int | str = 0
class Tensor:
    """PyTorch-like tensor for benchmark code.

    Usage::

        a = ctx.zeros((M, K), dtype="f16")
        a = ctx.zeros((M, K), dtype="f16", placement=dp.replicate(num_pe=8))
        ctx.launch("kernel_name", kernel_fn, a, b, out, M=M, K=K)
    """

    def __init__(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        name: str = "",
    ) -> None:
        """Record shape, dtype and name; the tensor starts out undeployed."""
        self.shape = shape
        self.dtype = dtype
        self.name = name
        # Set by to(); None until placement metadata is attached.
        self._dp_metadata: DPMetadata | None = None
        # Assigned externally once the tensor is deployed; None until then.
        self._handle: TensorHandle | None = None

    @property
    def itemsize(self) -> int:
        """Element size in bytes for this tensor's dtype."""
        return dtype_itemsize(self.dtype)

    @property
    def nbytes(self) -> int:
        """Total size in bytes: product of ``shape`` times ``itemsize``."""
        element_count = math.prod(self.shape)
        return element_count * self.itemsize

    @property
    def pa(self) -> int:
        """Primary PA (first shard). Used as kernel pointer argument.

        Raises:
            RuntimeError: if the tensor has no handle or the handle has no shards.
        """
        handle = self._handle
        if handle is None or not handle.shards:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
        first_shard = handle.shards[0]
        return first_shard.pa

    def to(
        self,
        placement: list[ShardSpec] | None = None,
        *,
        dp_policy: DPPolicy | None = None,
        sip: int = 0,
        cube: int = 0,
        target_pe: int | str = 0,
    ) -> Tensor:
        """Set DP placement metadata (like torch.Tensor.to()).

        Without an explicit ``placement`` the whole tensor becomes a single
        shard at pe_index 0. Returns ``self`` so calls can be chained.
        """
        if placement is None:
            whole = ShardSpec(pe_index=0, offset_bytes=0, nbytes=self.nbytes)
            placement = [whole]
        self._dp_metadata = DPMetadata(
            placement=placement,
            dp_policy=dp_policy,
            sip=sip,
            cube=cube,
            target_pe=target_pe,
        )
        return self

    def to_tensor_arg(self) -> TensorArg:
        """Convert deployed shards to KernelLaunchMsg TensorArg.

        Raises:
            RuntimeError: if the tensor has no handle yet.
        """
        handle = self._handle
        if handle is None:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
        arg_shards = tuple(
            TensorArgShard(
                sip=shard.sip,
                cube=shard.cube,
                pe=shard.pe,
                pa=shard.pa,
                nbytes=shard.nbytes,
                offset_bytes=shard.offset_bytes,
            )
            for shard in handle.shards
        )
        return TensorArg(shards=arg_shards)
+71
View File
@@ -0,0 +1,71 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from kernbench.common.types import Completion, Trace
@dataclass(frozen=True)
class BenchResult:
    """Immutable outcome of a bench run."""

    completion: Completion
    correlation_id: str
    # Single trace attached to the result, if any.
    trace: Trace | None = None
    # Collected trace entries, if any.
    traces: list[dict] | None = None

    def summary_text(self) -> str:
        """Return a one-line ``[OK]`` / ``[FAIL:<code>]`` summary.

        A missing error code is shown as "ERROR"; a missing error message is
        omitted and trailing whitespace is stripped.
        """
        outcome = self.completion
        if outcome.ok:
            return f"[OK] correlation_id={self.correlation_id}"
        code = outcome.error_code or "ERROR"
        detail = outcome.error_message or ""
        line = f"[FAIL:{code}] correlation_id={self.correlation_id} {detail}"
        return line.rstrip()
@dataclass(frozen=True)
class DeviceSelector:
    """
    Device selector.

    Supported:
    - "all" : all SIPs in the tray topology
    - "sip:<N>" : a single SIP index
    """

    raw: str  # "all" or "sip:<N>"

    @property
    def is_all(self) -> bool:
        """True when the selector is exactly "all"."""
        return self.raw == "all"

    @property
    def sip_index(self) -> int:
        """Return N for a "sip:<N>" selector.

        Raises:
            ValueError: if the selector is "all" or is not of the form "sip:<N>".
        """
        if self.is_all:
            raise ValueError("DeviceSelector is 'all'; no single sip_index.")
        match = re.fullmatch(r"sip:(\d+)", self.raw)
        if match is None:
            raise ValueError(
                f"Invalid device '{self.raw}'. Expected 'all' or 'sip:<N>' (e.g., sip:0)."
            )
        return int(match[1])
def resolve_device(raw: str | None) -> DeviceSelector:
    """
    Resolve the CLI --device string into a DeviceSelector.

    Semantics:
    - if omitted/empty -> "all"
    - else accept "all" or "sip:<N>"

    Input is stripped and lower-cased before matching.

    Raises:
        ValueError: if the normalized value is neither "all" nor "sip:<N>".
    """
    # None, empty and whitespace-only input all collapse to the empty string.
    normalized = "" if raw is None else raw.strip().lower()
    if normalized in ("", "all"):
        return DeviceSelector(raw="all")
    if re.fullmatch(r"sip:(\d+)", normalized) is None:
        raise ValueError(
            f"Invalid device '{normalized}'. Expected 'all' or 'sip:<N>' (e.g., sip:0)."
        )
    return DeviceSelector(raw=normalized)