Add SIP-level tensor parallelism, component registry YAML, VA offset verification
- DPPolicy: 3-level (sip/cube/pe), unified naming (column_wise/row_wise)
- PE_CPU: auto num_programs from cube shard count
- context.launch(): per-SIP KernelLaunchMsg with local va_base + auto local shape
- deploy_tensor: removed mmus param, MMU mapping is context-only responsibility
- ComponentRegistry: YAML-based lazy loading (components.yaml), impls→builtin rename
- VA offset bench + tests: 2D/1D, standard Triton kernel pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,138 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Generator
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import simpy
|
||||
|
||||
from kernbench.components.base import PeEngineBase
|
||||
from kernbench.sim_engine.transaction import Transaction
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from kernbench.common.pe_commands import PeInternalTxn
|
||||
from kernbench.components.context import ComponentContext
|
||||
from kernbench.topology.types import Node
|
||||
|
||||
|
||||
class PeDmaComponent(PeEngineBase):
    """PE_DMA: dual-channel DMA engine with READ and WRITE resources.

    Each channel has capacity=1 (ADR-0014 D4):
    - DMA_READ and DMA_WRITE may execute concurrently.
    - Multiple READs cannot overlap; multiple WRITEs cannot overlap.

    Handles two message types:
    - Transaction: external fabric messages (PeDmaMsg probes, M_CPU DMA)
    - PeInternalTxn: PE-internal commands from PE_SCHEDULER
      (DmaReadCmd → HBM read, DmaWriteCmd → HBM write)
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Create the component; both channels stay unset until init_resources()."""
        super().__init__(node, ctx)
        self._dma_read: simpy.Resource | None = None
        self._dma_write: simpy.Resource | None = None
        self._mmu: Any = None  # PeMMU instance, set by engine wiring

    def init_resources(self, env: simpy.Environment) -> None:
        """Create the READ and WRITE channels, each a capacity-1 simpy.Resource."""
        self._dma_read = simpy.Resource(env, capacity=1)
        self._dma_write = simpy.Resource(env, capacity=1)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield a single zero-delay timeout; ``nbytes`` is not used here."""
        yield env.timeout(0)

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Handle PE-internal DMA command: resolve PA → HBM path → transfer.

        Steps:
        1. Pick the channel and the address from the command type
           (DmaReadCmd → READ channel / ``src_addr``,
           DmaWriteCmd → WRITE channel / ``dst_addr``). Any other command
           type is completed immediately without doing anything.
        2. Translate the address through the MMU when one is wired; a
           PageFault (or no MMU) means the raw address is used as a PA.
        3. Resolve the destination node, the path to it and the drain time.
        4. Hold the channel only while the sub-transaction is issued to the
           next hop, then wait for its completion and signal ``pe_txn.done``.
        """
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd
        from kernbench.policy.address.phyaddr import PhysAddr
        from kernbench.runtime_api.kernel import PeDmaMsg

        cmd = pe_txn.command
        assert self._dma_read is not None and self._dma_write is not None

        # Determine direction and target address (VA → PA via MMU)
        if isinstance(cmd, DmaReadCmd):
            dma_res = self._dma_read
            raw_addr = cmd.src_addr
            is_write = False
        elif isinstance(cmd, DmaWriteCmd):
            dma_res = self._dma_write
            raw_addr = cmd.dst_addr
            is_write = True
        else:
            # Not a DMA command: nothing to transfer, just complete it.
            pe_txn.done.succeed()
            return

        # Translate VA → PA via MMU (if available), then resolve HBM node.
        # If MMU has no mapping for this address (PageFault), treat as PA
        # directly (backward-compatible with PA-only mode).
        if self._mmu is None:
            target_pa = raw_addr  # fallback: treat as PA directly
        else:
            from kernbench.policy.address.pe_mmu import PageFault

            # Only translate() is guarded; the overhead wait runs on success.
            try:
                target_pa = self._mmu.translate(raw_addr)
            except PageFault:
                target_pa = raw_addr
            else:
                if self._mmu.overhead_ns > 0:
                    yield env.timeout(self._mmu.overhead_ns)

        pa = PhysAddr.decode(target_pa)
        dst_node = self.ctx.resolver.resolve(pa)
        path = self.ctx.router.find_path(self._pe_prefix, dst_node)
        drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)

        # path[0] is pe_dma itself, path[1] is xbar; without a second entry
        # there is nowhere to send the sub-transaction.
        issued = len(path) > 1

        # Acquire DMA channel (command issue serialization)
        with dma_res.request() as req:
            yield req
            # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly)
            sub_done = env.event()
            sub_request = PeDmaMsg(
                correlation_id="pe_internal",
                request_id=f"dma_{id(pe_txn)}",
                src_sip=0, src_cube=0, src_pe=0,
                dst_pa=target_pa, nbytes=cmd.nbytes,
                is_write=is_write,
            )
            sub_txn = Transaction(
                request=sub_request, path=path, step=0,
                nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
            )
            if issued:
                yield self.out_ports[path[1]].put(sub_txn.advance())
        # DMA channel released after issue

        # Wait for HBM transfer completion. When nothing was issued no one
        # holds sub_done, so waiting on it would block this process forever
        # and pe_txn.done would never fire.
        if issued:
            yield sub_done
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition.

        Response transactions are forwarded (or completed, when there is no
        next hop) without taking a channel. Any other transaction takes the
        channel chosen by ``_select_channel`` and is then either forwarded to
        its next hop or, when there is none, completed after waiting its
        ``drain_ns`` (if positive).
        """
        # Response transactions bypass DMA channel (no outbound resource needed)
        if getattr(txn, "is_response", False):
            next_hop = txn.next_hop
            if next_hop:
                yield self.out_ports[next_hop].put(txn.advance())
            else:
                txn.done.succeed()
            return

        dma_res = self._select_channel(txn)
        with dma_res.request() as req:
            yield req
            next_hop = txn.next_hop
            if next_hop:
                yield self.out_ports[next_hop].put(txn.advance())
            else:
                # Final stop: the channel stays held for the drain time.
                drain = getattr(txn, "drain_ns", 0.0)
                if drain > 0:
                    yield env.timeout(drain)
                txn.done.succeed()

    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Select DMA channel based on request type.

        Returns the WRITE channel for a MemoryWriteMsg and for any request
        that carries a truthy ``is_write`` flag (as PeDmaMsg does), so that
        external writes use the same channel as the writes issued by
        ``handle_command``. Everything else uses the READ channel.
        """
        from kernbench.runtime_api.kernel import MemoryWriteMsg

        assert self._dma_read is not None and self._dma_write is not None
        request = txn.request
        if isinstance(request, MemoryWriteMsg) or getattr(request, "is_write", False):
            return self._dma_write
        return self._dma_read
|
||||
Reference in New Issue
Block a user