commit - release 1
This commit is contained in:
@@ -0,0 +1,96 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Callable
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from kernbench.common.types import Completion, SimEngine, Trace
|
||||
|
||||
from .context import RuntimeContext
|
||||
from .types import BenchResult, DeviceSelector
|
||||
|
||||
|
||||
class CompletionPolicy(str, Enum):
    """How ``run_bench`` derives the overall completion of a bench run."""

    # Report the completion of the most recently submitted request.
    LAST_SUBMITTED = "last_submitted"

    # Requires trace/timestamps or engine support; stub for now.
    LAST_COMPLETED = "last_completed"

    # Report the first failing completion, or success when every request is OK.
    ALL_OK_FAIL_FAST = "all_ok_fail_fast"
|
||||
|
||||
|
||||
BenchFn = Callable[[RuntimeContext], Any]
|
||||
EngineFactory = Callable[[object, DeviceSelector], SimEngine]
|
||||
|
||||
|
||||
def run_bench(
    *,
    topology: object,
    bench_fn: BenchFn,
    device: DeviceSelector,
    engine_factory: EngineFactory,
    correlation_id: str = "bench0",
    completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED,
) -> BenchResult:
    """Run a bench function against a freshly built engine and summarize it.

    Args:
        topology: Compiled topology object (opaque to the runtime here). If it
            has a ``topology_obj`` attribute, ``spec`` is read from that
            object; otherwise from ``topology`` itself.
        bench_fn: Callable that receives the RuntimeContext and submits
            requests.
        device: DeviceSelector ("all" or "sip:<N>").
        engine_factory: Builds the sim engine for the given topology & device.
        correlation_id: Identifier copied into the context and the result.
        completion_policy: How to determine the overall completion/result.

    Returns:
        A BenchResult. When the bench submitted nothing, its completion is a
        failure with error code ``NO_REQUESTS``.
    """
    engine = engine_factory(topology, device)

    # Accept either a handle wrapping the topology or the topology itself;
    # a missing ``spec`` is tolerated and simply becomes None.
    spec = getattr(getattr(topology, "topology_obj", topology), "spec", None)

    ctx = RuntimeContext(
        engine=engine,
        target_device=device,
        correlation_id=correlation_id,
        spec=spec,
    )
    bench_fn(ctx)
    ctx.wait_all()

    # An empty trace list is reported as None rather than [].
    collected_traces = ctx._traces or None
    handles = ctx.handles()

    def _result(completion, trace):
        return BenchResult(
            completion=completion,
            correlation_id=correlation_id,
            trace=trace,
            traces=collected_traces,
        )

    if not handles:
        return _result(
            Completion(
                ok=False, error_code="NO_REQUESTS", error_message="Bench submitted no requests"
            ),
            None,
        )

    use_last_submitted = completion_policy == CompletionPolicy.LAST_SUBMITTED
    if not use_last_submitted and completion_policy == CompletionPolicy.ALL_OK_FAIL_FAST:
        last_trace: Trace | None = None
        for handle in handles:
            outcome, handle_trace = engine.get_completion(handle)
            if handle_trace is not None:
                last_trace = handle_trace
            if not outcome.ok:
                # Fail fast: stop at the first unsuccessful request.
                return _result(outcome, last_trace)
        return _result(Completion(ok=True), last_trace)

    # LAST_SUBMITTED, and LAST_COMPLETED as a placeholder (needs engine
    # support for timing), both report the most recently submitted request.
    completion, trace = engine.get_completion(handles[-1])
    return _result(completion, trace)
|
||||
@@ -0,0 +1,282 @@
|
||||
# kernbench/runtime_api/context.py
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from kernbench.common.types import Completion, RequestHandle, SimEngine
|
||||
|
||||
from .types import DeviceSelector
|
||||
|
||||
|
||||
@dataclass
class RuntimeContext:
    """Bench-facing handle onto a simulation engine.

    Records every request handle returned by ``submit``, remembers which of
    them have been waited on, collects annotated trace entries, and offers a
    PyTorch-like tensor/kernel API (``zeros``, ``empty``, ``launch``).
    """

    engine: SimEngine
    target_device: DeviceSelector
    correlation_id: str
    # Topology spec mapping; only required for tensor operations.
    spec: dict | None = None

    # Handles in submission order.
    _handles: list[RequestHandle] = field(default_factory=list, init=False)
    # Handles that have already been waited on.
    _completed: set[RequestHandle] = field(default_factory=set, init=False)
    # Flat PE index -> allocator, filled lazily by _ensure_allocators().
    _allocators: dict[int, Any] = field(default_factory=dict, init=False)
    # Source of auto-generated tensor names ("t1", "t2", ...).
    _tensor_counter: int = field(default=0, init=False)
    # Trace entries recorded by wait() calls that supplied ``_meta``.
    _traces: list[dict] = field(default_factory=list, init=False)

    def submit(self, request: Any) -> RequestHandle:
        """Forward ``request`` to the engine and remember the returned handle.

        Raises:
            AttributeError: If the engine has no ``submit`` attribute.
        """
        engine_submit = getattr(self.engine, "submit", None)
        if engine_submit is None:
            raise AttributeError("Engine does not implement submit(request) -> RequestHandle.")
        new_handle: RequestHandle = engine_submit(request)  # type: ignore[call-arg]
        self._handles.append(new_handle)
        return new_handle

    def is_completed(self, handle: RequestHandle) -> bool:
        """Return True once ``handle`` has been waited on through this context."""
        return handle in self._completed

    def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
        """Wait for ``handle`` and return its completion.

        The engine's optional ``wait`` hook runs only the first time a handle
        is waited on. On that first wait, if ``_meta`` is given and the engine
        returned a trace, a trace entry merged with ``_meta`` is recorded.
        """
        first_wait = handle not in self._completed
        if first_wait:
            engine_wait = getattr(self.engine, "wait", None)
            if engine_wait is not None:
                engine_wait(handle)  # type: ignore[misc]

        completion, trace = self.engine.get_completion(handle)
        if not first_wait:
            return completion

        self._completed.add(handle)
        if _meta is not None and trace is not None:
            if isinstance(trace, dict):
                entry = dict(trace)
            else:
                # Non-dict traces are wrapped so metadata can sit alongside.
                entry = {"raw": trace}
            entry.update(_meta)
            self._traces.append(entry)
        return completion

    def wait_all(self) -> None:
        """Wait on every submitted handle that has not been waited on yet."""
        for pending in self._handles:
            if pending in self._completed:
                continue
            self.wait(pending)

    def handles(self) -> list[RequestHandle]:
        """Return a copy of the submitted handles, in submission order."""
        return [*self._handles]

    # ── PyTorch-like tensor API ──────────────────────────────────────

    def _ensure_allocators(self) -> dict:
        """Lazily create PEMemAllocator instances from spec.

        Also stores the topology dimensions read from the spec on
        ``_pes_per_cube``, ``_num_cubes`` and ``_num_sips``.

        Raises:
            RuntimeError: If ``spec`` is None.
        """
        if self._allocators:
            return self._allocators
        if self.spec is None:
            raise RuntimeError(
                "RuntimeContext.spec is required for tensor operations. "
                "Pass spec=graph.spec when creating RuntimeContext."
            )
        from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator

        system_cfg = self.spec.get("system", {})
        cube_cfg = self.spec.get("cube", {})
        memory_map = cube_cfg.get("memory_map", {})
        tcm_attrs = (
            cube_cfg.get("pe_template", {})
            .get("components", {})
            .get("pe_tcm", {})
            .get("attrs", {})
        )

        sips_cfg = system_cfg.get("sips", {})
        sip_count = sips_cfg.get("count", 1)
        cubes_per_sip = sips_cfg.get("cubes_per_sip", 16)

        pe_layout = cube_cfg.get("pe_layout", {})
        per_corner = pe_layout.get("pe_per_corner", 2)
        corners = pe_layout.get("corners", ["NW", "NE", "SW", "SE"])
        pes_per_cube = per_corner * len(corners)

        hbm_gb = memory_map.get("hbm_total_gb_per_cube", 48)
        hbm_slices = memory_map.get("hbm_slices_per_cube", 8)
        tcm_mb = tcm_attrs.get("size_mb", 16)

        gib = 1 << 30
        mib = 1 << 20
        cfg = AddressConfig(
            sip_count=sip_count,
            cubes_per_sip=cubes_per_sip,
            pes_per_cube=pes_per_cube,
            hbm_bytes_per_cube=hbm_gb * gib,
            hbm_slices_per_cube=hbm_slices,
            tcm_bytes_per_pe=tcm_mb * mib,
            tcm_scheduler_reserved_bytes=4 * mib,
            sram_bytes_per_cube=32 * mib,
        )

        self._pes_per_cube = pes_per_cube
        self._num_cubes = cubes_per_sip
        self._num_sips = sip_count

        # One allocator per (sip, cube, pe), keyed by the flat index
        # sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id.
        pes_per_sip = cubes_per_sip * pes_per_cube
        for sip_id in range(sip_count):
            sip_base = sip_id * pes_per_sip
            for cube_id in range(cubes_per_sip):
                cube_base = sip_base + cube_id * pes_per_cube
                for pe_id in range(pes_per_cube):
                    self._allocators[cube_base + pe_id] = PEMemAllocator(
                        rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
                    )
        return self._allocators

    def _next_tensor_name(self) -> str:
        """Return the next auto-generated tensor name ("t1", "t2", ...)."""
        self._tensor_counter += 1
        return f"t{self._tensor_counter}"

    def zeros(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        *,
        placement: list | None = None,
        dp: Any = None,
        name: str | None = None,
    ):
        """Create a tensor and deploy to HBM with zero-fill (like torch.zeros)."""
        return self._create_tensor(shape, dtype, placement, name, pattern="zero", dp=dp)

    def empty(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        *,
        placement: list | None = None,
        dp: Any = None,
        name: str | None = None,
    ):
        """Allocate a tensor in HBM without initialization (like torch.empty)."""
        return self._create_tensor(shape, dtype, placement, name, pattern=None, dp=dp)

    def _create_tensor(
        self,
        shape: tuple[int, ...],
        dtype: str,
        placement: list | None,
        name: str | None,
        pattern: str | None,
        dp: Any = None,
    ):
        """Build a Tensor, allocate its shards and optionally write a pattern.

        ``dp`` (when it is a DPPolicy) takes priority over ``placement``; with
        neither, the whole tensor becomes one shard on PE index 0. When
        ``pattern`` is not None, one MemoryWriteMsg per shard is submitted and
        waited on before returning.
        """
        from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
        from kernbench.runtime_api.kernel import MemoryWriteMsg
        from kernbench.runtime_api.tensor import Tensor, deploy_tensor, dtype_itemsize

        tensor_name = name or self._next_tensor_name()
        tensor = Tensor(shape=shape, dtype=dtype, name=tensor_name)

        dp_policy: DPPolicy | None = None
        if dp is not None and isinstance(dp, DPPolicy):
            dp_policy = dp
            # Populates the topology dimensions read just below.
            self._ensure_allocators()
            itemsize = dtype_itemsize(dtype)
            # Only the first two dimensions are handed to the policy resolver.
            shape_2d = (shape[0], shape[1])  # type: tuple[int, int]
            placement = resolve_dp_policy(
                dp,
                shape=shape_2d,
                itemsize=itemsize,
                num_pe=self._pes_per_cube,
                num_cubes=self._num_sips * self._num_cubes,
            )
        elif placement is None:
            placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=tensor.nbytes)]

        # Infer target_pe from placement: multi-PE → "all", single PE → pe_index
        pe_indices = {shard_spec.pe_index for shard_spec in placement}
        target_pe: int | str
        if len(pe_indices) > 1:
            target_pe = "all"
        else:
            target_pe = next(iter(pe_indices))
        tensor.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy)

        # Allocate PAs via PEMemAllocator
        tensor._handle = deploy_tensor(
            name=tensor_name,
            shape=shape,
            dtype=dtype,
            placement=placement,
            allocators=self._ensure_allocators(),
        )

        if pattern is None:
            return tensor

        # Submit MemoryWriteMsg per shard (deploy data to device)
        for shard in tensor._handle.shards:
            write_handle = self.submit(MemoryWriteMsg(
                correlation_id=self.correlation_id,
                request_id=f"deploy_{tensor_name}_pe{shard.pe}",
                dst_sip=shard.sip, dst_cube=shard.cube, dst_pe=shard.pe,
                dst_pa=shard.pa, nbytes=shard.nbytes, pattern=pattern,
                target_cubes=(shard.cube,), target_pe=shard.pe,
            ))
            self.wait(write_handle, _meta={
                "phase": "memory_write", "name": tensor_name,
                "sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
                "nbytes": shard.nbytes,
            })
        return tensor

    def launch(
        self,
        kernel_name: str,
        kernel_fn: Any,
        *args: Any,
        **kwargs: Any,
    ) -> RequestHandle:
        """Register and launch a kernel (like a fused torch op).

        Positional args: Tensor objects become TensorArg, int/float become ScalarArg.
        Keyword args: become ScalarArg (name is discarded, order preserved).
        Arguments of any other type are skipped.

        The launch is submitted and waited on before the handle is returned.
        """
        from kernbench.runtime_api.kernel import (
            KernelLaunchMsg,
            KernelRef,
            ScalarArg,
        )
        from kernbench.runtime_api.tensor import Tensor
        from kernbench.triton_emu.registry import register_kernel

        # Register kernel (idempotent): a ValueError from the registry is ignored.
        try:
            register_kernel(kernel_name, kernel_fn)
        except ValueError:
            pass

        def _scalar(value):
            return ScalarArg(dtype="f32" if isinstance(value, float) else "i32", value=value)

        kernel_args: list = []
        target_pe: int | str = 0
        for arg in args:
            if isinstance(arg, Tensor):
                kernel_args.append(arg.to_tensor_arg())
                meta = arg._dp_metadata
                if meta is None:
                    continue
                # "all" from any tensor wins; otherwise the latest int target.
                if meta.target_pe == "all":
                    target_pe = "all"
                elif isinstance(meta.target_pe, int) and target_pe != "all":
                    target_pe = meta.target_pe
            elif isinstance(arg, (int, float)):
                kernel_args.append(_scalar(arg))

        kernel_args.extend(
            _scalar(value) for value in kwargs.values() if isinstance(value, (int, float))
        )

        # Target cubes are the cubes of all shards of all deployed tensor args.
        cube_set: set[int] = {
            shard.cube
            for arg in args
            if isinstance(arg, Tensor) and arg._handle is not None
            for shard in arg._handle.shards
        }
        target_cubes = tuple(sorted(cube_set)) if cube_set else (0,)

        # Scalar values are attached to the trace entry (for GEMM FLOP calculation).
        scalar_vals = [ka.value for ka in kernel_args if hasattr(ka, "value")]

        launch_handle = self.submit(KernelLaunchMsg(
            correlation_id=self.correlation_id,
            request_id=kernel_name,
            kernel_ref=KernelRef(name=kernel_name, kind="builtin"),
            args=tuple(kernel_args),
            target_cubes=target_cubes,
            target_pe=target_pe,
        ))
        self.wait(launch_handle, _meta={
            "phase": "kernel", "name": kernel_name,
            "target_pe": target_pe, "scalars": scalar_vals,
        })
        return launch_handle
|
||||
@@ -0,0 +1,123 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, TypeAlias
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class MemoryWriteMsg:
|
||||
correlation_id: str
|
||||
request_id: str
|
||||
dst_sip: int
|
||||
dst_cube: int
|
||||
dst_pe: int
|
||||
dst_pa: int
|
||||
nbytes: int
|
||||
src_kind: Literal["pattern", "host_buffer_ref"] = "pattern"
|
||||
pattern: str | None = None
|
||||
target_cubes: tuple[int, ...] | Literal["all"] = "all"
|
||||
target_pe: int | Literal["all"] = "all"
|
||||
msg_type: Literal["memory_write"] = "memory_write"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class MemoryReadMsg:
|
||||
correlation_id: str
|
||||
request_id: str
|
||||
src_sip: int
|
||||
src_cube: int
|
||||
src_pe: int
|
||||
src_pa: int
|
||||
nbytes: int
|
||||
target_cubes: tuple[int, ...] | Literal["all"] = "all"
|
||||
target_pe: int | Literal["all"] = "all"
|
||||
msg_type: Literal["memory_read"] = "memory_read"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class KernelRef:
|
||||
"""Reference to a kernel binary or builtin timing model.
|
||||
|
||||
Kernel binaries must be pre-deployed to device memory via MemoryWriteMsg.
|
||||
KernelLaunchMsg references the deployed location by PA — source code or IR
|
||||
MUST NOT be embedded in launch messages.
|
||||
|
||||
- "deployed": kernel binary pre-deployed to HBM/SRAM at deploy_pa.
|
||||
- "builtin": simulator built-in timing model, identified by name.
|
||||
"""
|
||||
|
||||
name: str
|
||||
kind: Literal["deployed", "builtin"]
|
||||
deploy_pa: int | None = None
|
||||
deploy_sip: int = 0
|
||||
deploy_cube: int = 0
|
||||
deploy_pe: int = 0
|
||||
nbytes_code: int = 0
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TensorArgShard:
    """One shard of a tensor kernel argument: where it lives and how big it is."""

    # Location of the shard.
    sip: int
    cube: int
    pe: int
    pa: int

    # Shard size and its byte offset.
    nbytes: int
    offset_bytes: int
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TensorArg:
    """Kernel argument that refers to a tensor through its shards."""

    shards: tuple[TensorArgShard, ...]

    # Argument discriminator.
    arg_kind: Literal["tensor"] = "tensor"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ScalarArg:
|
||||
dtype: str
|
||||
value: float | int
|
||||
arg_kind: Literal["scalar"] = "scalar"
|
||||
|
||||
|
||||
KernelArg: TypeAlias = TensorArg | ScalarArg
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class KernelLaunchMsg:
    """Immutable request to launch the kernel named by ``kernel_ref``."""

    # Request identification.
    correlation_id: str
    request_id: str

    # What to run and with which arguments (order is significant).
    kernel_ref: KernelRef
    args: tuple[KernelArg, ...]

    # Targeting; "all" is the default for both.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"

    # Message discriminator.
    msg_type: Literal["kernel_launch"] = "kernel_launch"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ResponseMsg:
    """Device→Host response carrying PE execution result."""

    # Identification of the request being answered.
    correlation_id: str
    request_id: str

    # Responding cube / PE and the outcome flag.
    src_cube: int
    src_pe: int
    success: bool

    # Message discriminator.
    msg_type: Literal["response"] = "response"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PeDmaMsg:
    """Direct PE DMA request: host injects a transfer at PE_DMA level.

    Used by the probe utility to measure PE→HBM latency without requiring
    the full PE_CPU → scheduler → DMA pipeline.
    """

    # Request identification.
    correlation_id: str
    request_id: str

    # PE that issues the transfer.
    src_sip: int
    src_cube: int
    src_pe: int

    # Target address and transfer size.
    dst_pa: int
    nbytes: int

    # Transfer direction flag; False by default.
    is_write: bool = False

    # Message discriminator.
    msg_type: Literal["pe_dma"] = "pe_dma"
|
||||
@@ -0,0 +1,166 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
from kernbench.policy.address.allocator import PEMemAllocator
|
||||
from kernbench.policy.placement.dp import DPPolicy, ShardSpec
|
||||
from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TensorShard:
    """One allocated piece of a deployed tensor, as produced by ``deploy_tensor``."""

    # Identity of the allocator the shard was carved from.
    sip: int
    cube: int
    pe: int

    # Encoded address returned by the allocator.
    pa: int

    # Size of the shard and its byte offset, copied from the placement spec.
    nbytes: int
    offset_bytes: int
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TensorHandle:
    """Immutable record of a deployed tensor and the shards that back it."""

    name: str
    shape: tuple[int, ...]
    dtype: str
    # Bytes per element for ``dtype``.
    itemsize: int
    shards: tuple[TensorShard, ...]

    @property
    def nbytes(self) -> int:
        """Total size in bytes: number of elements times ``itemsize``."""
        element_count = math.prod(self.shape)
        return element_count * self.itemsize
|
||||
|
||||
|
||||
_DTYPE_ITEMSIZE = {
|
||||
"fp16": 2, "float16": 2, "f16": 2,
|
||||
"fp32": 4, "float32": 4, "f32": 4,
|
||||
"bf16": 2,
|
||||
"int8": 1, "i8": 1,
|
||||
"int16": 2, "i16": 2,
|
||||
"int32": 4, "i32": 4,
|
||||
}
|
||||
|
||||
|
||||
def dtype_itemsize(dtype: str) -> int:
|
||||
if dtype not in _DTYPE_ITEMSIZE:
|
||||
raise ValueError(f"unsupported dtype: {dtype}")
|
||||
return _DTYPE_ITEMSIZE[dtype]
|
||||
|
||||
|
||||
def deploy_tensor(
    *,
    name: str,
    shape: tuple[int, ...],
    dtype: str,
    placement: list[ShardSpec],
    allocators: dict[int, PEMemAllocator],
    mem_kind: Literal["hbm", "tcm"] = "hbm",
) -> TensorHandle:
    """Allocate memory for every shard in ``placement`` and return a handle.

    Args:
        name: Tensor name stored on the handle.
        shape: Tensor shape stored on the handle.
        dtype: Dtype name; must be accepted by ``dtype_itemsize``.
        placement: Shard specs; each one selects an allocator through its
            ``pe_index`` and supplies ``nbytes`` / ``offset_bytes``.
        allocators: Allocators keyed by the index used in ``placement``.
        mem_kind: "hbm" allocates with ``alloc_hbm``, "tcm" with ``alloc_tcm``.

    Returns:
        A TensorHandle with one TensorShard per placement entry, in order.

    Raises:
        ValueError: If ``mem_kind`` is neither "hbm" nor "tcm", or ``dtype``
            is unsupported.
        KeyError: If a shard's ``pe_index`` has no entry in ``allocators``.
    """
    # The Literal annotation is not enforced at runtime; without this check
    # any unexpected value would silently allocate from TCM. Validate before
    # touching an allocator so a bad call has no side effects.
    if mem_kind not in ("hbm", "tcm"):
        raise ValueError(
            f"unsupported mem_kind: {mem_kind!r} (expected 'hbm' or 'tcm')"
        )
    use_hbm = mem_kind == "hbm"

    isize = dtype_itemsize(dtype)
    shards: list[TensorShard] = []
    for spec in placement:
        alloc = allocators[spec.pe_index]
        pa = alloc.alloc_hbm(spec.nbytes) if use_hbm else alloc.alloc_tcm(spec.nbytes)
        shards.append(TensorShard(
            # The shard records the identity of the allocator that owns it.
            sip=alloc._sip_id,
            cube=alloc._cube_id,
            pe=alloc._pe_id,
            pa=pa.encode(),
            nbytes=spec.nbytes,
            offset_bytes=spec.offset_bytes,
        ))
    return TensorHandle(
        name=name,
        shape=shape,
        dtype=dtype,
        itemsize=isize,
        shards=tuple(shards),
    )
|
||||
|
||||
|
||||
# ── PyTorch-like Tensor API ──────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DPMetadata:
    """Data-parallel placement metadata (stored as Tensor._dp_metadata)."""

    # Shard layout of the tensor.
    placement: list[ShardSpec]

    # Policy recorded alongside the placement, if one was supplied.
    dp_policy: DPPolicy | None = None

    sip: int = 0
    cube: int = 0

    # int → single PE, "all" → all PEs
    target_pe: int | str = 0
|
||||
|
||||
|
||||
class Tensor:
    """PyTorch-like tensor for benchmark code.

    Usage::

        a = ctx.zeros((M, K), dtype="f16")
        a = ctx.zeros((M, K), dtype="f16", placement=dp.replicate(num_pe=8))
        ctx.launch("kernel_name", kernel_fn, a, b, out, M=M, K=K)
    """

    def __init__(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        name: str = "",
    ) -> None:
        """Create an undeployed tensor with no placement metadata."""
        self.shape = shape
        self.dtype = dtype
        self.name = name
        # Filled in by to().
        self._dp_metadata: DPMetadata | None = None
        # Stays None until a TensorHandle is attached.
        self._handle: TensorHandle | None = None

    @property
    def itemsize(self) -> int:
        """Bytes per element for this tensor's dtype."""
        return dtype_itemsize(self.dtype)

    @property
    def nbytes(self) -> int:
        """Total size in bytes: number of elements times ``itemsize``."""
        element_count = math.prod(self.shape)
        return element_count * self.itemsize

    @property
    def pa(self) -> int:
        """Primary PA (first shard). Used as kernel pointer argument.

        Raises:
            RuntimeError: If no handle is attached or it has no shards.
        """
        handle = self._handle
        if handle is None or not handle.shards:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
        return handle.shards[0].pa

    def to(
        self,
        placement: list[ShardSpec] | None = None,
        *,
        dp_policy: DPPolicy | None = None,
        sip: int = 0,
        cube: int = 0,
        target_pe: int | str = 0,
    ) -> Tensor:
        """Set DP placement metadata (like torch.Tensor.to()).

        Without ``placement`` the whole tensor is described as a single shard
        on PE index 0. Returns ``self`` so calls can be chained.
        """
        if placement is None:
            placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=self.nbytes)]
        self._dp_metadata = DPMetadata(
            placement=placement,
            dp_policy=dp_policy,
            sip=sip,
            cube=cube,
            target_pe=target_pe,
        )
        return self

    def to_tensor_arg(self) -> TensorArg:
        """Convert deployed shards to KernelLaunchMsg TensorArg.

        Raises:
            RuntimeError: If no handle is attached.
        """
        if self._handle is None:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
        arg_shards = [
            TensorArgShard(
                sip=shard.sip,
                cube=shard.cube,
                pe=shard.pe,
                pa=shard.pa,
                nbytes=shard.nbytes,
                offset_bytes=shard.offset_bytes,
            )
            for shard in self._handle.shards
        ]
        return TensorArg(shards=tuple(arg_shards))
|
||||
@@ -0,0 +1,71 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
from kernbench.common.types import Completion, Trace
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class BenchResult:
    """Outcome of one bench run: the overall completion plus optional traces."""

    completion: Completion
    correlation_id: str
    # Trace returned together with the reported completion, if any.
    trace: Trace | None = None
    # Trace entries collected during the run, if any.
    traces: list[dict] | None = None

    def summary_text(self) -> str:
        """Return a one-line ``[OK]`` / ``[FAIL:<code>]`` summary of the run.

        A missing error code is shown as ``ERROR``; a missing error message
        leaves no trailing whitespace.
        """
        outcome = self.completion
        if outcome.ok:
            return f"[OK] correlation_id={self.correlation_id}"
        code = outcome.error_code or "ERROR"
        msg = outcome.error_message or ""
        line = f"[FAIL:{code}] correlation_id={self.correlation_id} {msg}"
        return line.rstrip()
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DeviceSelector:
    """
    Device selector.

    Supported:
    - "all"     : all SIPs in the tray topology
    - "sip:<N>" : a single SIP index
    """

    raw: str  # "all" or "sip:<N>"

    @property
    def is_all(self) -> bool:
        """True when the selector is exactly the string "all"."""
        return self.raw == "all"

    @property
    def sip_index(self) -> int:
        """Return N for a "sip:<N>" selector.

        Raises:
            ValueError: If the selector is "all" or is not of the form
                "sip:<N>".
        """
        if self.is_all:
            raise ValueError("DeviceSelector is 'all'; no single sip_index.")
        parsed = re.fullmatch(r"sip:(\d+)", self.raw)
        if parsed is not None:
            return int(parsed.group(1))
        raise ValueError(
            f"Invalid device '{self.raw}'. Expected 'all' or 'sip:<N>' (e.g., sip:0)."
        )
|
||||
|
||||
|
||||
def resolve_device(raw: str | None) -> DeviceSelector:
    """
    Resolve the CLI --device string into a DeviceSelector.

    The input is stripped and lower-cased before matching.

    Semantics:
    - if omitted/empty -> "all"
    - else accept "all" or "sip:<N>"

    Raises:
        ValueError: If the normalized string is neither "all" nor "sip:<N>".
    """
    if raw is None:
        return DeviceSelector(raw="all")

    normalized = raw.strip().lower()
    if normalized in ("", "all"):
        return DeviceSelector(raw="all")

    if re.fullmatch(r"sip:(\d+)", normalized) is None:
        raise ValueError(
            f"Invalid device '{normalized}'. Expected 'all' or 'sip:<N>' (e.g., sip:0)."
        )
    return DeviceSelector(raw=normalized)
|
||||
Reference in New Issue
Block a user