Add SIP-level tensor parallelism, component registry YAML, VA offset verification

- DPPolicy: 3-level (sip/cube/pe), unified naming (column_wise/row_wise)
- PE_CPU: auto num_programs from cube shard count
- context.launch(): per-SIP KernelLaunchMsg with local va_base + auto local shape
- deploy_tensor: removed mmus param; MMU mapping is context-only responsibility
- ComponentRegistry: YAML-based lazy loading (components.yaml), impls→builtin rename
- VA offset bench + tests: 2D/1D, standard Triton kernel pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 01:13:17 -07:00
parent 08812eda58
commit 63669f82cb
35 changed files with 813 additions and 219 deletions
@@ -0,0 +1,138 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import PeEngineBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.common.pe_commands import PeInternalTxn
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeDmaComponent(PeEngineBase):
+    """PE_DMA: dual-channel DMA engine with READ and WRITE resources.
+
+    Each channel has capacity=1 (ADR-0014 D4):
+      - DMA_READ and DMA_WRITE may execute concurrently.
+      - Multiple READs cannot overlap; multiple WRITEs cannot overlap.
+
+    Handles two message types:
+      - Transaction: external fabric messages (PeDmaMsg probes, M_CPU DMA)
+      - PeInternalTxn: PE-internal commands from PE_SCHEDULER
+        (DmaReadCmd → HBM read, DmaWriteCmd → HBM write)
+
+    The channel resources are created in ``init_resources``; until that has
+    been called both are ``None`` and the command/forward paths will fail
+    their precondition check.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        """Create the component with no DMA channels and no MMU attached."""
+        super().__init__(node, ctx)
+        self._dma_read: simpy.Resource | None = None
+        self._dma_write: simpy.Resource | None = None
+        # PeMMU instance, set by engine wiring. When left as None, addresses
+        # in PE-internal commands are used as physical addresses unchanged.
+        self._mmu: Any = None
+
+    def init_resources(self, env: simpy.Environment) -> None:
+        """Create the READ and WRITE channel resources (capacity 1 each)."""
+        self._dma_read = simpy.Resource(env, capacity=1)
+        self._dma_write = simpy.Resource(env, capacity=1)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """Yield a single zero-length timeout; ``nbytes`` is not used here."""
+        yield env.timeout(0)
+
+    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Handle PE-internal DMA command: resolve PA → HBM path → transfer.
+
+        Steps:
+          1. Pick the channel and address from the command type
+             (DmaReadCmd → READ channel / ``src_addr``,
+             DmaWriteCmd → WRITE channel / ``dst_addr``).
+          2. Translate the address through the MMU when one is attached;
+             a PageFault falls back to using the address as a PA.
+          3. Resolve the destination node and path, then issue a
+             sub-Transaction carrying a PeDmaMsg while holding the channel.
+          4. Wait for the sub-Transaction to complete and then succeed
+             ``pe_txn.done``.
+
+        Any other command type is acknowledged immediately without a
+        transfer.
+        """
+        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd
+        from kernbench.policy.address.phyaddr import PhysAddr
+        from kernbench.runtime_api.kernel import PeDmaMsg
+
+        cmd = pe_txn.command
+        assert self._dma_read is not None and self._dma_write is not None, (
+            "init_resources() must be called before handle_command()"
+        )
+
+        # Determine direction and target address (VA → PA via MMU)
+        if isinstance(cmd, DmaReadCmd):
+            dma_res = self._dma_read
+            raw_addr = cmd.src_addr
+            is_write = False
+        elif isinstance(cmd, DmaWriteCmd):
+            dma_res = self._dma_write
+            raw_addr = cmd.dst_addr
+            is_write = True
+        else:
+            # Not a DMA command: nothing to transfer, acknowledge right away.
+            pe_txn.done.succeed()
+            return
+
+        # Translate VA → PA via MMU (if available), then resolve HBM node.
+        # If MMU has no mapping for this address (PageFault), treat as PA
+        # directly (backward-compatible with PA-only mode).
+        if self._mmu is None:
+            target_pa = raw_addr  # fallback: treat as PA directly
+        else:
+            from kernbench.policy.address.pe_mmu import PageFault
+            try:
+                # Keep the try body to the one call that can fault.
+                target_pa = self._mmu.translate(raw_addr)
+            except PageFault:
+                target_pa = raw_addr
+            else:
+                # Translation latency is only charged on a successful lookup.
+                if self._mmu.overhead_ns > 0:
+                    yield env.timeout(self._mmu.overhead_ns)
+        pa = PhysAddr.decode(target_pa)
+        dst_node = self.ctx.resolver.resolve(pa)
+        path = self.ctx.router.find_path(self._pe_prefix, dst_node)
+        drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)
+
+        # Completion event of the sub-Transaction; awaited after the channel
+        # has been released.
+        sub_done = env.event()
+
+        # Acquire DMA channel (command issue serialization)
+        with dma_res.request() as req:
+            yield req
+            # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly)
+            # NOTE(review): src_sip/src_cube/src_pe are hard-coded to 0 rather
+            # than derived from this PE's identity — confirm this is intended.
+            sub_request = PeDmaMsg(
+                correlation_id="pe_internal",
+                request_id=f"dma_{id(pe_txn)}",
+                src_sip=0, src_cube=0, src_pe=0,
+                dst_pa=target_pa, nbytes=cmd.nbytes,
+                is_write=is_write,
+            )
+            sub_txn = Transaction(
+                request=sub_request, path=path, step=0,
+                nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
+            )
+            # Send to next hop (path[0] is pe_dma itself, path[1] is xbar)
+            if len(path) > 1:
+                yield self.out_ports[path[1]].put(sub_txn.advance())
+            else:
+                # No next hop: nothing downstream will ever trigger sub_done,
+                # so complete it here (after the drain delay, matching the
+                # terminal branch of _forward_txn) instead of waiting forever.
+                if drain_ns > 0:
+                    yield env.timeout(drain_ns)
+                sub_done.succeed()
+        # DMA channel released after issue
+
+        # Wait for HBM transfer completion
+        yield sub_done
+        pe_txn.done.succeed()
+
+    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition.
+
+        Responses (``txn.is_response`` truthy) are passed on, or completed if
+        there is no next hop, without taking a DMA channel. Every other
+        transaction first acquires the channel chosen by ``_select_channel``;
+        while holding it the transaction is either forwarded to its next hop
+        or, when this component is the last hop, completed after waiting
+        ``txn.drain_ns`` (treated as 0.0 when the attribute is absent).
+        """
+        # Response transactions bypass DMA channel (no outbound resource needed)
+        if getattr(txn, "is_response", False):
+            next_hop = txn.next_hop
+            if next_hop:
+                yield self.out_ports[next_hop].put(txn.advance())
+            else:
+                txn.done.succeed()
+            return
+
+        dma_res = self._select_channel(txn)
+        with dma_res.request() as req:
+            yield req
+            next_hop = txn.next_hop
+            if next_hop:
+                yield self.out_ports[next_hop].put(txn.advance())
+            else:
+                # Terminal hop: model the drain delay before signalling done.
+                drain = getattr(txn, "drain_ns", 0.0)
+                if drain > 0:
+                    yield env.timeout(drain)
+                txn.done.succeed()
+
+    def _select_channel(self, txn: Any) -> simpy.Resource:
+        """Select DMA channel based on request type.
+
+        Returns the WRITE channel when ``txn.request`` is a MemoryWriteMsg
+        and the READ channel for every other request type.
+        """
+        from kernbench.runtime_api.kernel import MemoryWriteMsg
+
+        assert self._dma_read is not None and self._dma_write is not None, (
+            "init_resources() must be called before _select_channel()"
+        )
+        if isinstance(txn.request, MemoryWriteMsg):
+            return self._dma_write
+        return self._dma_read