Add SIP-level tensor parallelism, component registry YAML, VA offset verification

- DPPolicy: 3-level (sip/cube/pe), unified naming (column_wise/row_wise)
- PE_CPU: auto num_programs from cube shard count
- context.launch(): per-SIP KernelLaunchMsg with local va_base + auto local shape
- deploy_tensor: removed mmus param, MMU mapping is context-only responsibility
- ComponentRegistry: YAML-based lazy loading (components.yaml), impls→builtin rename
- VA offset bench + tests: 2D/1D, standard Triton kernel pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-26 01:13:17 -07:00
parent 08812eda58
commit 63669f82cb
35 changed files with 813 additions and 219 deletions
@@ -0,0 +1,90 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import PeEngineBase
if TYPE_CHECKING:
from kernbench.common.pe_commands import PeInternalTxn
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
# dtype → bit width (for TFLOPS scaling)
_DTYPE_BITS: dict[str, int] = {
"f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
"f32": 32, "fp32": 32, "float32": 32,
"i8": 8, "int8": 8,
"i16": 16, "int16": 16,
"i32": 32, "int32": 32,
}
class PeGemmComponent(PeEngineBase):
"""PE_GEMM: matrix multiplication engine sharing accel_slot (ADR-0014 D4).
Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
exclusive with PE_MATH within the same PE.
Compute latency model:
FLOPs = 2 * M * K * N
effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
compute_ns = FLOPs / (effective_tflops * 1e3)
"""
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
super().__init__(node, ctx)
self._accel: simpy.Resource | None = None
self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))
def init_resources(self, env: simpy.Environment) -> None:
resource_name = self.node.attrs.get("shared_resource")
if resource_name and self.ctx:
self._accel = self.ctx.get_shared_resource(
env, f"{self._pe_prefix}.{resource_name}"
)
def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
"""Compute GEMM latency in nanoseconds."""
if self._peak_tflops_f16 <= 0:
return float(self.node.attrs.get("overhead_ns", 0.0))
dtype_bits = _DTYPE_BITS.get(dtype, 16)
effective_tflops = self._peak_tflops_f16 * (16.0 / dtype_bits)
flops = 2.0 * m * k * n
return flops / (effective_tflops * 1e3)
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
yield env.timeout(overhead_ns)
def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
from kernbench.common.pe_commands import GemmCmd
cmd = pe_txn.command
if self._accel:
with self._accel.request() as req:
yield req
if isinstance(cmd, GemmCmd):
ns = self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype)
yield env.timeout(ns)
else:
yield from self.run(env, 0)
else:
if isinstance(cmd, GemmCmd):
ns = self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype)
yield env.timeout(ns)
else:
yield from self.run(env, 0)
pe_txn.done.succeed()
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
"""Transaction forwarding with accel_slot acquisition."""
if self._accel:
with self._accel.request() as req:
yield req
yield from super()._forward_txn(env, txn)
else:
yield from super()._forward_txn(env, txn)