Add SIP-level tensor parallelism, component registry YAML, VA offset verification

- DPPolicy: 3-level (sip/cube/pe), unified naming (column_wise/row_wise)
- PE_CPU: auto num_programs from cube shard count
- context.launch(): per-SIP KernelLaunchMsg with local va_base + auto local shape
- deploy_tensor: removed mmus param; MMU mapping is a context-only responsibility
- ComponentRegistry: YAML-based lazy loading (components.yaml), impls→builtin rename
- VA offset bench + tests: 2D/1D, standard Triton kernel pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 01:13:17 -07:00
parent 08812eda58
commit 63669f82cb
35 changed files with 813 additions and 219 deletions
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import PeEngineBase
+
+if TYPE_CHECKING:
+    from kernbench.common.pe_commands import PeInternalTxn
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+# dtype → bit width (for TFLOPS scaling)
+_DTYPE_BITS: dict[str, int] = {
+    "f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
+    "f32": 32, "fp32": 32, "float32": 32,
+    "i8": 8, "int8": 8,
+    "i16": 16, "int16": 16,
+    "i32": 32, "int32": 32,
+}
+
+
+class PeGemmComponent(PeEngineBase):
+    """PE_GEMM: matrix multiplication engine sharing accel_slot (ADR-0014 D4).
+
+    Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
+    exclusive with PE_MATH within the same PE.
+
+    Compute latency model:
+      FLOPs = 2 * M * K * N
+      effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
+      compute_ns = FLOPs / (effective_tflops * 1e3)
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._accel: simpy.Resource | None = None
+        self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))
+
+    def init_resources(self, env: simpy.Environment) -> None:
+        resource_name = self.node.attrs.get("shared_resource")
+        if resource_name and self.ctx:
+            self._accel = self.ctx.get_shared_resource(
+                env, f"{self._pe_prefix}.{resource_name}"
+            )
+
+    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
+        """Compute GEMM latency in nanoseconds."""
+        if self._peak_tflops_f16 <= 0:
+            return float(self.node.attrs.get("overhead_ns", 0.0))
+        dtype_bits = _DTYPE_BITS.get(dtype, 16)
+        effective_tflops = self._peak_tflops_f16 * (16.0 / dtype_bits)
+        flops = 2.0 * m * k * n
+        return flops / (effective_tflops * 1e3)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(overhead_ns)
+
+    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        from kernbench.common.pe_commands import GemmCmd
+
+        cmd = pe_txn.command
+        if self._accel:
+            with self._accel.request() as req:
+                yield req
+                if isinstance(cmd, GemmCmd):
+                    ns = self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype)
+                    yield env.timeout(ns)
+                else:
+                    yield from self.run(env, 0)
+        else:
+            if isinstance(cmd, GemmCmd):
+                ns = self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype)
+                yield env.timeout(ns)
+            else:
+                yield from self.run(env, 0)
+        pe_txn.done.succeed()
+
+    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Transaction forwarding with accel_slot acquisition."""
+        if self._accel:
+            with self._accel.request() as req:
+                yield req
+                yield from super()._forward_txn(env, txn)
+        else:
+            yield from super()._forward_txn(env, txn)