14d800b0ae
- KernelLaunchMsg gains target_start_ns: IO_CPU stamps a global barrier (max path latency across every target PE), M_CPU passes it through, PE_CPU yields until it before recording pe_exec_start. Every PE in a launch begins kernel execution at the same env.now regardless of its dispatch path length — eliminates per-PE dispatch-offset artifact in cross-PE and cross-cube latency measurements. - PE_DMA._handle_ipcq_inbound now pays Transaction.drain_ns at the top, matching the terminal-drain behavior of ComponentBase._forward_txn for every non-IPCQ Transaction. SRC-side tl.send stays fire-and-forget (sender doesn't yield on sub_done); tl.recv now blocks until bytes have actually drained into its inbox. - ComponentContext: new compute_path_latency_ns helper + node_overhead_ns field populated by GraphEngine. - tests/test_kernel_launch_sync.py: asserts all PEs in one launch produce identical pe_exec_ns for a no-op kernel (zero spread). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
187 lines
5.3 KiB
Python
187 lines
5.3 KiB
Python
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Literal, TypeAlias
|
|
|
|
|
|
@dataclass(frozen=True)
class MemoryWriteMsg:
    """Immutable ``"memory_write"`` message: write ``nbytes`` at a destination.

    Instances are frozen, so they compare by value and are hashable.
    """

    # Identifiers carried on every message in this module. Presumably used to
    # pair a request with its ResponseMsg (same two fields) -- TODO confirm.
    correlation_id: str
    request_id: str

    # Destination of the write. Presumably (dst_sip, dst_cube, dst_pe) select
    # the target PE and dst_pa is a physical address -- confirm with consumer.
    dst_sip: int
    dst_cube: int
    dst_pe: int
    dst_pa: int
    nbytes: int

    # Payload source selector. NOTE(review): `pattern` looks meaningful only
    # when src_kind == "pattern" -- confirm against the handler.
    src_kind: Literal["pattern", "host_buffer_ref"] = "pattern"
    pattern: str | None = None

    # Target selectors; "all" is the default. Presumably "all" means every
    # cube / every PE -- TODO confirm.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"

    # Discriminator tag; the annotation permits only this one value.
    msg_type: Literal["memory_write"] = "memory_write"
|
|
|
|
|
|
@dataclass(frozen=True)
class MemoryReadMsg:
    """Immutable ``"memory_read"`` message: read ``nbytes`` from a source.

    Instances are frozen, so they compare by value and are hashable.
    """

    # Identifiers carried on every message in this module. Presumably used to
    # pair a request with its ResponseMsg (same two fields) -- TODO confirm.
    correlation_id: str
    request_id: str

    # Source of the read. Presumably (src_sip, src_cube, src_pe) select the PE
    # and src_pa is a physical address -- confirm with consumer.
    src_sip: int
    src_cube: int
    src_pe: int
    src_pa: int
    nbytes: int

    # Target selectors; "all" is the default. Presumably "all" means every
    # cube / every PE -- TODO confirm.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"

    # Discriminator tag; the annotation permits only this one value.
    msg_type: Literal["memory_read"] = "memory_read"
|
|
|
|
|
|
@dataclass(frozen=True)
class KernelRef:
    """Reference to a kernel binary or builtin timing model.

    Kernel binaries must be pre-deployed to device memory via MemoryWriteMsg.
    KernelLaunchMsg references the deployed location by PA — source code or IR
    MUST NOT be embedded in launch messages.

    - "deployed": kernel binary pre-deployed to HBM/SRAM at deploy_pa.
    - "builtin": simulator built-in timing model, identified by name.
    """

    name: str
    kind: Literal["deployed", "builtin"]

    # Location of the deployed binary. deploy_pa defaults to None; nothing in
    # this class enforces that it is set when kind == "deployed".
    deploy_pa: int | None = None
    deploy_sip: int = 0
    deploy_cube: int = 0
    deploy_pe: int = 0

    # Presumably the size in bytes of the deployed code -- TODO confirm.
    nbytes_code: int = 0
|
|
|
|
|
|
@dataclass(frozen=True)
class TensorArgShard:
    """One shard of a tensor argument; ``TensorArg.shards`` holds a tuple of these.

    All six fields are required positional ints with no defaults.
    """

    # Presumably the coordinates of the PE that holds this shard -- TODO confirm.
    sip: int
    cube: int
    pe: int

    # Presumably the shard's physical address and its size in bytes -- confirm.
    pa: int
    nbytes: int

    # Presumably the shard's byte offset within the whole tensor -- TODO confirm.
    offset_bytes: int
|
|
|
|
|
|
@dataclass(frozen=True)
class TensorArg:
    """Kernel argument describing a tensor as a tuple of shards.

    ``arg_kind`` is fixed to ``"tensor"`` so this variant can be told apart
    from ``ScalarArg`` within the ``KernelArg`` union.
    """

    shards: tuple[TensorArgShard, ...]

    # Discriminator tag; the annotation permits only this one value.
    arg_kind: Literal["tensor"] = "tensor"

    # VA base address for the entire tensor.
    va_base: int = 0
|
|
|
|
|
|
@dataclass(frozen=True)
class ScalarArg:
    """Kernel argument carrying a single numeric value.

    ``arg_kind`` is fixed to ``"scalar"`` so this variant can be told apart
    from ``TensorArg`` within the ``KernelArg`` union.
    """

    # Name of the value's data type; the accepted spellings are not defined
    # in this class -- TODO document where they are validated.
    dtype: str
    value: float | int

    # Discriminator tag; the annotation permits only this one value.
    arg_kind: Literal["scalar"] = "scalar"
|
|
|
|
|
|
# Union of the argument variants a kernel launch can carry; each variant has
# an `arg_kind` literal field ("tensor" / "scalar") that distinguishes it.
KernelArg: TypeAlias = TensorArg | ScalarArg
|
|
|
|
|
|
@dataclass(frozen=True)
class KernelLaunchMsg:
    """Immutable ``"kernel_launch"`` message: run ``kernel_ref`` with ``args``.

    The kernel is referenced through ``kernel_ref`` rather than embedded in
    the message.
    """

    # Identifiers carried on every message in this module. Presumably used to
    # pair a request with its ResponseMsg (same two fields) -- TODO confirm.
    correlation_id: str
    request_id: str

    kernel_ref: KernelRef
    args: tuple[KernelArg, ...]

    # Target selectors; "all" is the default. Unlike the memory messages,
    # target_pe may also be a tuple of PE indices.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | tuple[int, ...] | Literal["all"] = "all"

    # ADR-0009 D5: synchronized kernel start. When set, each PE_CPU yields
    # until env.now >= target_start_ns before beginning kernel execution,
    # so every PE in a launch starts at the same simulated time regardless
    # of its dispatch path length. Stamped by IO_CPU as a global barrier
    # (max path latency across every target PE); M_CPU passes it through.
    # NOTE(review): this comment previously said "Stamped by M_CPU fan-out";
    # confirm the stamping component against ADR-0009 D5.
    target_start_ns: float | None = None

    # Discriminator tag; the annotation permits only this one value.
    msg_type: Literal["kernel_launch"] = "kernel_launch"
|
|
|
|
|
|
@dataclass(frozen=True)
class ResponseMsg:
    """Device→Host response carrying PE execution result."""

    # Same identifier pair the request messages carry. Presumably echoed from
    # the request being answered -- TODO confirm.
    correlation_id: str
    request_id: str

    # Presumably the cube / PE that produced this response -- TODO confirm.
    src_cube: int
    src_pe: int

    # The reported outcome; this message carries no error detail beyond it.
    success: bool

    # Discriminator tag; the annotation permits only this one value.
    msg_type: Literal["response"] = "response"
|
|
|
|
|
|
@dataclass(frozen=True)
class PeDmaMsg:
    """Direct PE DMA request: host injects a transfer at PE_DMA level.

    Used by the probe utility to measure PE→HBM latency without requiring
    the full PE_CPU → scheduler → DMA pipeline.
    """

    # Identifiers carried on every message in this module. Presumably used to
    # pair a request with its ResponseMsg (same two fields) -- TODO confirm.
    correlation_id: str
    request_id: str

    # Presumably the PE whose DMA engine issues the transfer -- TODO confirm.
    src_sip: int
    src_cube: int
    src_pe: int

    dst_pa: int
    nbytes: int

    # Transfer direction flag. Presumably False (the default) means a read
    # -- TODO confirm against the PE_DMA handler.
    is_write: bool = False

    # Discriminator tag; the annotation permits only this one value.
    msg_type: Literal["pe_dma"] = "pe_dma"
|
|
|
|
|
|
@dataclass(frozen=True)
class MmuMapMsg:
    """MMU mapping install: broadcast VA→PA entries to target PEs.

    Sent via fabric: Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_MMU.
    target_sips controls which SIPs receive the message.
    """

    # Identifiers carried on every message in this module. Presumably used to
    # pair a request with its ResponseMsg (same two fields) -- TODO confirm.
    correlation_id: str
    request_id: str

    # Mappings to install. Because the elements are dicts, an instance with
    # non-empty entries is not hashable even though the dataclass is frozen.
    entries: tuple[dict, ...]  # ({"va": int, "pa": int, "size": int}, ...)

    # Target selectors; each defaults to "all".
    target_sips: tuple[int, ...] | Literal["all"] = "all"
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"

    # Discriminator tag; the annotation permits only this one value.
    msg_type: Literal["mmu_map"] = "mmu_map"
|
|
|
|
|
|
@dataclass(frozen=True)
class MmuUnmapMsg:
    """MMU mapping removal: broadcast VA ranges to unmap from the target PEs.

    The three ``target_*`` selectors default to ``"all"``.
    """

    # Identifiers carried on every message in this module. Presumably used to
    # pair a request with its ResponseMsg (same two fields) -- TODO confirm.
    correlation_id: str
    request_id: str

    # Ranges to remove. Because the elements are dicts, an instance with
    # non-empty entries is not hashable even though the dataclass is frozen.
    entries: tuple[dict, ...]  # ({"va": int, "size": int}, ...)

    # Target selectors; each defaults to "all".
    target_sips: tuple[int, ...] | Literal["all"] = "all"
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"

    # Discriminator tag; the annotation permits only this one value.
    msg_type: Literal["mmu_unmap"] = "mmu_unmap"
|
|
|
|
|
|
@dataclass(frozen=True)
class IpcqInitMsg:
    """IPCQ neighbor table install (sideband fan-out, ADR-0023 D10/D12).

    Backend issues this at ``init_process_group`` time to install per-PE
    IPCQ neighbor tables. Each entry covers one direction (N/S/E/W) and
    carries the peer's IpcqEndpoint plus this PE's own rx_buffer base
    and a pre-wired SimPy Store for credit return fast path (D9).

    Routing is similar to MmuMapMsg.
    """

    correlation_id: str
    request_id: str

    # Target selectors; each defaults to "all". target_pe may also be a tuple
    # of PE indices.
    target_sips: tuple[int, ...] | Literal["all"] = "all"
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | tuple[int, ...] | Literal["all"] = "all"

    # entries: tuple[IpcqInitEntry, ...] — kept as tuple of plain objects to
    # avoid a runtime import cycle (IpcqInitEntry lives in
    # kernbench.common.ipcq_types).
    entries: tuple = ()

    # Both are plain `str`, so the value sets in the trailing comments are
    # not enforced by the annotations.
    backpressure_mode: str = "sleep"  # "poll" | "sleep"
    buffer_kind: str = "tcm"  # "tcm" | "hbm" | "sram"
    credit_size_bytes: int = 16

    # Discriminator tag; the annotation permits only this one value.
    msg_type: Literal["ipcq_init"] = "ipcq_init"
|