commit - release 1

2026-03-18 11:47:48 -07:00
commit 6f43807900
109 changed files with 14909 additions and 0 deletions
@@ -0,0 +1,166 @@
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from typing import Literal
+
+from kernbench.policy.address.allocator import PEMemAllocator
+from kernbench.policy.placement.dp import DPPolicy, ShardSpec
+from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
+
+
+@dataclass(frozen=True)
+class TensorShard:
+    """One allocated piece of a deployed tensor.
+
+    ``deploy_tensor`` creates one instance per placement entry, and
+    ``Tensor.to_tensor_arg`` copies the fields one-for-one into a
+    ``TensorArgShard``.
+    """
+
+    # Identifiers copied from the allocator that served this shard.
+    # NOTE(review): presumably the SIP / cube / PE coordinates of the target
+    # processing element -- confirm against PEMemAllocator.
+    sip: int
+    cube: int
+    pe: int
+    # Encoded address of the allocation: the result of calling ``.encode()``
+    # on the value returned by the allocator.
+    pa: int
+    # Size of this shard in bytes, taken from the placement entry.
+    nbytes: int
+    # Byte offset taken from the placement entry; presumably the shard's
+    # position within the full tensor -- TODO confirm.
+    offset_bytes: int
+
+
+@dataclass(frozen=True)
+class TensorHandle:
+    name: str
+    shape: tuple[int, ...]
+    dtype: str
+    itemsize: int
+    shards: tuple[TensorShard, ...]
+
+    @property
+    def nbytes(self) -> int:
+        return math.prod(self.shape) * self.itemsize
+
+
+_DTYPE_ITEMSIZE = {
+    "fp16": 2, "float16": 2, "f16": 2,
+    "fp32": 4, "float32": 4, "f32": 4,
+    "bf16": 2,
+    "int8": 1, "i8": 1,
+    "int16": 2, "i16": 2,
+    "int32": 4, "i32": 4,
+}
+
+
+def dtype_itemsize(dtype: str) -> int:
+    if dtype not in _DTYPE_ITEMSIZE:
+        raise ValueError(f"unsupported dtype: {dtype}")
+    return _DTYPE_ITEMSIZE[dtype]
+
+
+def deploy_tensor(
+    *,
+    name: str,
+    shape: tuple[int, ...],
+    dtype: str,
+    placement: list[ShardSpec],
+    allocators: dict[int, PEMemAllocator],
+    mem_kind: Literal["hbm", "tcm"] = "hbm",
+) -> TensorHandle:
+    isize = dtype_itemsize(dtype)
+    shards: list[TensorShard] = []
+    for spec in placement:
+        alloc = allocators[spec.pe_index]
+        if mem_kind == "hbm":
+            pa = alloc.alloc_hbm(spec.nbytes)
+        else:
+            pa = alloc.alloc_tcm(spec.nbytes)
+        shards.append(TensorShard(
+            sip=alloc._sip_id,
+            cube=alloc._cube_id,
+            pe=alloc._pe_id,
+            pa=pa.encode(),
+            nbytes=spec.nbytes,
+            offset_bytes=spec.offset_bytes,
+        ))
+    return TensorHandle(
+        name=name,
+        shape=shape,
+        dtype=dtype,
+        itemsize=isize,
+        shards=tuple(shards),
+    )
+
+
+# ── PyTorch-like Tensor API ──────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class DPMetadata:
+    """Data-parallel placement metadata (stored as Tensor._dp_metadata).
+
+    ``Tensor.to`` builds an instance from its arguments. ``frozen=True`` only
+    blocks attribute reassignment: while ``placement`` is a list its contents
+    can still be mutated in place, and hashing the instance fails.
+    """
+
+    # Shard layout; ``Tensor.to`` falls back to a single whole-tensor entry
+    # on pe_index 0 when the caller gives none.
+    placement: list[ShardSpec]
+    # Policy object associated with ``placement``.
+    # NOTE(review): how it is consumed is not visible in this file.
+    dp_policy: DPPolicy | None = None
+    # NOTE(review): presumably the SIP / cube the tensor targets -- confirm.
+    sip: int = 0
+    cube: int = 0
+    target_pe: int | str = 0  # int → single PE, "all" → all PEs
+
+
+class Tensor:
+    """PyTorch-like tensor for benchmark code.
+
+    Usage::
+
+        a = ctx.zeros((M, K), dtype="f16")
+        a = ctx.zeros((M, K), dtype="f16", placement=dp.replicate(num_pe=8))
+        ctx.launch("kernel_name", kernel_fn, a, b, out, M=M, K=K)
+    """
+
+    def __init__(
+        self,
+        shape: tuple[int, ...],
+        dtype: str = "f16",
+        name: str = "",
+    ) -> None:
+        self.shape = shape
+        self.dtype = dtype
+        self.name = name
+        self._dp_metadata: DPMetadata | None = None
+        self._handle: TensorHandle | None = None
+
+    @property
+    def itemsize(self) -> int:
+        return dtype_itemsize(self.dtype)
+
+    @property
+    def nbytes(self) -> int:
+        return math.prod(self.shape) * self.itemsize
+
+    @property
+    def pa(self) -> int:
+        """Primary PA (first shard). Used as kernel pointer argument."""
+        if self._handle is None or not self._handle.shards:
+            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
+        return self._handle.shards[0].pa
+
+    def to(
+        self,
+        placement: list[ShardSpec] | None = None,
+        *,
+        dp_policy: DPPolicy | None = None,
+        sip: int = 0,
+        cube: int = 0,
+        target_pe: int | str = 0,
+    ) -> Tensor:
+        """Set DP placement metadata (like torch.Tensor.to())."""
+        if placement is None:
+            placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=self.nbytes)]
+        self._dp_metadata = DPMetadata(
+            placement=placement, dp_policy=dp_policy,
+            sip=sip, cube=cube, target_pe=target_pe,
+        )
+        return self
+
+    def to_tensor_arg(self) -> TensorArg:
+        """Convert deployed shards to KernelLaunchMsg TensorArg."""
+        if self._handle is None:
+            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
+        return TensorArg(
+            shards=tuple(
+                TensorArgShard(
+                    sip=s.sip, cube=s.cube, pe=s.pe,
+                    pa=s.pa, nbytes=s.nbytes, offset_bytes=s.offset_bytes,
+                )
+                for s in self._handle.shards
+            ),
+        )