Add SIP-level tensor parallelism, component registry YAML, VA offset verification
- DPPolicy: 3-level (sip/cube/pe), unified naming (column_wise/row_wise)
- PE_CPU: auto num_programs from cube shard count
- context.launch(): per-SIP KernelLaunchMsg with local va_base + auto local shape
- deploy_tensor: removed mmus param, MMU mapping is context-only responsibility
- ComponentRegistry: YAML-based lazy loading (components.yaml), impls→builtin rename
- VA offset bench + tests: 2D/1D, standard Triton kernel pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,90 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Generator
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import simpy
|
||||
|
||||
from kernbench.components.base import PeEngineBase
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from kernbench.common.pe_commands import PeInternalTxn
|
||||
from kernbench.components.context import ComponentContext
|
||||
from kernbench.topology.types import Node
|
||||
|
||||
|
||||
# dtype → bit width (for TFLOPS scaling)
|
||||
_DTYPE_BITS: dict[str, int] = {
|
||||
"f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
|
||||
"f32": 32, "fp32": 32, "float32": 32,
|
||||
"i8": 8, "int8": 8,
|
||||
"i16": 16, "int16": 16,
|
||||
"i32": 32, "int32": 32,
|
||||
}
|
||||
|
||||
|
||||
class PeGemmComponent(PeEngineBase):
    """PE_GEMM: matrix multiplication engine sharing accel_slot (ADR-0014 D4).

    Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
    exclusive with PE_MATH within the same PE.

    Compute latency model:
        FLOPs = 2 * M * K * N
        effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
        compute_ns = FLOPs / (effective_tflops * 1e3)

    Node attributes read by this component:
        peak_tflops_f16: peak f16 throughput; a value <= 0 (the default)
            disables the FLOP model and falls back to ``overhead_ns``.
        overhead_ns: fixed latency used for non-GEMM commands and as the
            fallback GEMM latency (default 0.0).
        shared_resource: name of the shared resource to arbitrate on, looked
            up under this PE's prefix; optional.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Stays None until init_resources() resolves a shared resource; while
        # it is None every command runs without arbitration.
        self._accel: simpy.Resource | None = None
        self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))

    def init_resources(self, env: simpy.Environment) -> None:
        """Resolve the shared accelerator slot named by ``shared_resource``.

        Nothing is looked up when the attribute is unset/empty or when the
        component was built without a context.
        """
        resource_name = self.node.attrs.get("shared_resource")
        if resource_name and self.ctx:
            self._accel = self.ctx.get_shared_resource(
                env, f"{self._pe_prefix}.{resource_name}"
            )

    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
        """Return the modelled latency, in nanoseconds, of an M x K x N GEMM.

        Falls back to the node's ``overhead_ns`` attribute when no positive
        ``peak_tflops_f16`` is configured. A dtype that is not listed in
        ``_DTYPE_BITS`` is treated as 16 bits wide.
        """
        if self._peak_tflops_f16 <= 0:
            return float(self.node.attrs.get("overhead_ns", 0.0))
        dtype_bits = _DTYPE_BITS.get(dtype, 16)
        effective_tflops = self._peak_tflops_f16 * (16.0 / dtype_bits)
        flops = 2.0 * m * k * n
        # 1 TFLOP/s == 1e3 FLOP/ns, hence the 1e3 factor.
        return flops / (effective_tflops * 1e3)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for the node's fixed ``overhead_ns``; ``nbytes`` is not used."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)

    def _execute_gemm_cmd(self, env: simpy.Environment, cmd: Any) -> Generator:
        """Spend the simulated time for one command (no slot arbitration).

        A ``GemmCmd`` waits for its modelled compute latency; any other
        command waits for the fixed overhead via ``run``.
        """
        # Imported at call time, as the original handler did, rather than at
        # module import time.
        from kernbench.common.pe_commands import GemmCmd

        if isinstance(cmd, GemmCmd):
            yield env.timeout(self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype))
        else:
            yield from self.run(env, 0)

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Execute ``pe_txn.command`` and then trigger ``pe_txn.done``.

        When a shared accelerator slot is configured, it is requested first
        and held for the whole duration of the command.
        """
        cmd = pe_txn.command
        if self._accel is None:
            yield from self._execute_gemm_cmd(env, cmd)
        else:
            with self._accel.request() as req:
                yield req
                yield from self._execute_gemm_cmd(env, cmd)
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Transaction forwarding with accel_slot acquisition."""
        if self._accel is None:
            yield from super()._forward_txn(env, txn)
            return
        with self._accel.request() as req:
            yield req
            yield from super()._forward_txn(env, txn)
|
||||
Reference in New Issue
Block a user