Add SIP-level tensor parallelism, component registry YAML, VA offset verification

- DPPolicy: 3-level (sip/cube/pe), unified naming (column_wise/row_wise)
- PE_CPU: auto num_programs from cube shard count
- context.launch(): per-SIP KernelLaunchMsg with local va_base + auto local shape
- deploy_tensor: removed mmus param, MMU mapping is context-only responsibility
- ComponentRegistry: YAML-based lazy loading (components.yaml), impls→builtin rename
- VA offset bench + tests: 2D/1D, standard Triton kernel pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 01:13:17 -07:00
parent 08812eda58
commit 63669f82cb
35 changed files with 813 additions and 219 deletions
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class HbmCtrlComponent(ComponentBase):
+    """HBM controller: terminal component that models HBM access latency.
+
+    Dual-channel model: separate read and write resources (each capacity=1)
+    allowing concurrent read/write like PE_DMA. Multiple reads or multiple
+    writes still serialize within their respective channel.
+
+    On completion, creates a ResponseMsg and sends it back on the reverse path
+    so that response latency is modeled through the fabric.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._read: simpy.Resource | None = None
+        self._write: simpy.Resource | None = None
+
+    def start(self, env: simpy.Environment) -> None:
+        capacity = int(self.node.attrs.get("capacity", 1))
+        self._read = simpy.Resource(env, capacity=capacity)
+        self._write = simpy.Resource(env, capacity=capacity)
+        super().start(env)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(overhead_ns)
+
+    def _select_channel(self, txn: Any) -> simpy.Resource:
+        """Select channel based on request type: write requests → write, else → read."""
+        from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
+
+        assert self._read is not None and self._write is not None
+        req = txn.request
+        if isinstance(req, MemoryWriteMsg):
+            return self._write
+        if isinstance(req, PeDmaMsg) and req.is_write:
+            return self._write
+        return self._read
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Dispatch each incoming txn to a concurrent process for channel-level parallelism."""
+        while True:
+            txn: Any = yield self._inbox.get()
+            env.process(self._handle_txn(env, txn))
+
+    def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Acquire channel, run, apply drain, send response."""
+        channel = self._select_channel(txn)
+        with channel.request() as req:
+            yield req
+            yield from self.run(env, txn.nbytes)
+            drain = getattr(txn, "drain_ns", 0.0)
+            if drain > 0:
+                yield env.timeout(drain)
+        yield from self._send_response(env, txn)
+
+    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Route completion based on path type.
+
+        - PeDmaMsg: succeed done directly (probe).
+        - Bypass path (no m_cpu): MemoryWrite succeeds done; MemoryRead sends
+          data back on reverse path with original done event.
+        - M_CPU DMA path: send ResponseMsg for m_cpu/io_cpu aggregation.
+        """
+        from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg
+
+        if isinstance(txn.request, PeDmaMsg):
+            reverse_path = list(reversed(txn.path))
+            if len(reverse_path) >= 2:
+                resp_txn = Transaction(
+                    request=txn.request, path=reverse_path, step=0,
+                    nbytes=0, done=txn.done, is_response=True,
+                )
+                yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+                return
+            txn.done.succeed()
+            return
+
+        # Bypass path: no m_cpu in the transaction path
+        is_bypass = not any("m_cpu" in n for n in txn.path)
+        if is_bypass:
+            if isinstance(txn.request, MemoryReadMsg):
+                # D2H: send data back on reverse path to pcie_ep
+                reverse_path = list(reversed(txn.path))
+                if len(reverse_path) >= 2:
+                    resp_txn = Transaction(
+                        request=txn.request, path=reverse_path, step=0,
+                        nbytes=txn.request.nbytes, done=txn.done,
+                    )
+                    yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+                    return
+            # MemoryWrite bypass or short path: done
+            txn.done.succeed()
+            return
+
+        # M_CPU DMA path: send ResponseMsg for aggregation
+        reverse_path = list(reversed(txn.path))
+        if len(reverse_path) >= 2 and self.ctx:
+            from kernbench.runtime_api.kernel import ResponseMsg
+
+            parts = self.node.id.split(".")
+            cube_id = int(parts[1].replace("cube", ""))
+            pe_id = int(parts[3].replace("slice", ""))
+            resp_msg = ResponseMsg(
+                correlation_id=txn.request.correlation_id,
+                request_id=txn.request.request_id,
+                src_cube=cube_id, src_pe=pe_id, success=True,
+            )
+            resp_txn = Transaction(
+                request=resp_msg, path=reverse_path, step=0,
+                nbytes=0, done=env.event(), is_response=True,
+            )
+            yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+        else:
+            txn.done.succeed()