Add SIP-level tensor parallelism, component registry YAML, VA offset verification

- DPPolicy: 3-level (sip/cube/pe), unified naming (column_wise/row_wise)
- PE_CPU: auto num_programs from cube shard count
- context.launch(): per-SIP KernelLaunchMsg with local va_base + auto local shape
- deploy_tensor: removed mmus param, MMU mapping is context-only responsibility
- ComponentRegistry: YAML-based lazy loading (components.yaml), impls→builtin rename
- VA offset bench + tests: 2D/1D, standard Triton kernel pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-26 01:13:17 -07:00
parent 08812eda58
commit 63669f82cb
35 changed files with 813 additions and 219 deletions
+138
View File
@@ -0,0 +1,138 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import PeEngineBase
from kernbench.sim_engine.transaction import Transaction
if TYPE_CHECKING:
from kernbench.common.pe_commands import PeInternalTxn
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class PeDmaComponent(PeEngineBase):
"""PE_DMA: dual-channel DMA engine with READ and WRITE resources.
Each channel has capacity=1 (ADR-0014 D4):
- DMA_READ and DMA_WRITE may execute concurrently.
- Multiple READs cannot overlap; multiple WRITEs cannot overlap.
Handles two message types:
- Transaction: external fabric messages (PeDmaMsg probes, M_CPU DMA)
- PeInternalTxn: PE-internal commands from PE_SCHEDULER
(DmaReadCmd → HBM read, DmaWriteCmd → HBM write)
"""
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
super().__init__(node, ctx)
self._dma_read: simpy.Resource | None = None
self._dma_write: simpy.Resource | None = None
self._mmu = None # PeMMU instance, set by engine wiring
def init_resources(self, env: simpy.Environment) -> None:
self._dma_read = simpy.Resource(env, capacity=1)
self._dma_write = simpy.Resource(env, capacity=1)
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
yield env.timeout(0)
def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
"""Handle PE-internal DMA command: resolve PA → HBM path → transfer."""
from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import PeDmaMsg
cmd = pe_txn.command
assert self._dma_read is not None and self._dma_write is not None
# Determine direction and target address (VA → PA via MMU)
if isinstance(cmd, DmaReadCmd):
dma_res = self._dma_read
raw_addr = cmd.src_addr
is_write = False
elif isinstance(cmd, DmaWriteCmd):
dma_res = self._dma_write
raw_addr = cmd.dst_addr
is_write = True
else:
pe_txn.done.succeed()
return
# Translate VA → PA via MMU (if available), then resolve HBM node
# If MMU has no mapping for this address (PageFault), treat as PA directly
# (backward-compatible with PA-only mode)
if self._mmu is not None:
from kernbench.policy.address.pe_mmu import PageFault
try:
target_pa = self._mmu.translate(raw_addr)
if self._mmu.overhead_ns > 0:
yield env.timeout(self._mmu.overhead_ns)
except PageFault:
target_pa = raw_addr
else:
target_pa = raw_addr # fallback: treat as PA directly
pa = PhysAddr.decode(target_pa)
dst_node = self.ctx.resolver.resolve(pa)
path = self.ctx.router.find_path(self._pe_prefix, dst_node)
drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)
# Acquire DMA channel (command issue serialization)
with dma_res.request() as req:
yield req
# Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly)
sub_done = env.event()
sub_request = PeDmaMsg(
correlation_id="pe_internal",
request_id=f"dma_{id(pe_txn)}",
src_sip=0, src_cube=0, src_pe=0,
dst_pa=target_pa, nbytes=cmd.nbytes,
is_write=is_write,
)
sub_txn = Transaction(
request=sub_request, path=path, step=0,
nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
)
# Send to next hop (path[0] is pe_dma itself, path[1] is xbar)
if len(path) > 1:
yield self.out_ports[path[1]].put(sub_txn.advance())
# DMA channel released after issue
# Wait for HBM transfer completion
yield sub_done
pe_txn.done.succeed()
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
"""Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
# Response transactions bypass DMA channel (no outbound resource needed)
if getattr(txn, "is_response", False):
next_hop = txn.next_hop
if next_hop:
yield self.out_ports[next_hop].put(txn.advance())
else:
txn.done.succeed()
return
dma_res = self._select_channel(txn)
with dma_res.request() as req:
yield req
next_hop = txn.next_hop
if next_hop:
yield self.out_ports[next_hop].put(txn.advance())
else:
drain = getattr(txn, "drain_ns", 0.0)
if drain > 0:
yield env.timeout(drain)
txn.done.succeed()
def _select_channel(self, txn: Any) -> simpy.Resource:
"""Select DMA channel based on request type."""
from kernbench.runtime_api.kernel import MemoryWriteMsg
assert self._dma_read is not None and self._dma_write is not None
if isinstance(txn.request, MemoryWriteMsg):
return self._dma_write
return self._dma_read