Kernel-launch sync (ADR-0009 D5) and IPCQ drain at inbound (ADR-0023)
- KernelLaunchMsg gains target_start_ns: IO_CPU stamps a global barrier (max path latency across every target PE), M_CPU passes it through, PE_CPU yields until it before recording pe_exec_start. Every PE in a launch begins kernel execution at the same env.now regardless of its dispatch path length — eliminates per-PE dispatch-offset artifact in cross-PE and cross-cube latency measurements. - PE_DMA._handle_ipcq_inbound now pays Transaction.drain_ns at the top, matching the terminal-drain behavior of ComponentBase._forward_txn for every non-IPCQ Transaction. SRC-side tl.send stays fire-and-forget (sender doesn't yield on sub_done); tl.recv now blocks until bytes have actually drained into its inbox. - ComponentContext: new compute_path_latency_ns helper + node_overhead_ns field populated by GraphEngine. - tests/test_kernel_launch_sync.py: asserts all PEs in one launch produce identical pe_exec_ns for a no-op kernel (zero spread). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -58,7 +58,18 @@ class IoCpuComponent(ComponentBase):
|
||||
self._pending[key] = (expected, received, parent_done)
|
||||
|
||||
def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||
"""Fan out sub-Transactions to target cube M_CPUs, wait for responses."""
|
||||
"""Fan out sub-Transactions to target cube M_CPUs, wait for responses.
|
||||
|
||||
ADR-0009 D5 (extended): for KernelLaunchMsg, stamp a single global
|
||||
target_start_ns = env.now + max(IO_CPU → any target PE_CPU path
|
||||
latency across all target cubes). M_CPU passes this value through
|
||||
unchanged; every PE in every cube yields until the same sim-time
|
||||
before beginning kernel execution. Without this, cross-cube
|
||||
launches would have each cube's M_CPU compute its own per-cube
|
||||
barrier relative to its local env.now, leaving PEs on different
|
||||
cubes out of sync (the "h3/h4 dispatch-offset artifact").
|
||||
"""
|
||||
import dataclasses
|
||||
from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg
|
||||
|
||||
request = txn.request
|
||||
@@ -72,6 +83,36 @@ class IoCpuComponent(ComponentBase):
|
||||
txn.done.succeed()
|
||||
return
|
||||
|
||||
# For KernelLaunchMsg, compute the global barrier once here so
|
||||
# every downstream PE_CPU uses the same target_start_ns.
|
||||
if isinstance(request, KernelLaunchMsg):
|
||||
global_max_latency = 0.0
|
||||
pe_ids = self._resolve_pe_ids(
|
||||
getattr(request, "target_pe", "all")
|
||||
)
|
||||
for sip, cube in cube_targets:
|
||||
for pe_id in pe_ids:
|
||||
pe_cpu_id = (
|
||||
f"sip{sip}.cube{cube}.pe{pe_id}.pe_cpu"
|
||||
)
|
||||
try:
|
||||
path = self.ctx.router.find_node_path(
|
||||
self.node.id, pe_cpu_id,
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
if len(path) < 2:
|
||||
continue
|
||||
latency = self.ctx.compute_path_latency_ns(
|
||||
path, nbytes=0,
|
||||
)
|
||||
if latency > global_max_latency:
|
||||
global_max_latency = latency
|
||||
request = dataclasses.replace(
|
||||
request,
|
||||
target_start_ns=float(env.now) + global_max_latency,
|
||||
)
|
||||
|
||||
# Setup aggregation
|
||||
self._pending[request.request_id] = (len(cube_targets), 0, txn.done)
|
||||
|
||||
@@ -91,6 +132,19 @@ class IoCpuComponent(ComponentBase):
|
||||
)
|
||||
yield self.out_ports[path[1]].put(sub_txn.advance())
|
||||
|
||||
def _resolve_pe_ids(self, target_pe: Any) -> list[int]:
|
||||
"""Resolve target_pe → list of PE indices (mirrors M_CPU logic)."""
|
||||
if isinstance(target_pe, int):
|
||||
return [target_pe]
|
||||
if isinstance(target_pe, tuple):
|
||||
return list(target_pe)
|
||||
# "all": all PEs in a cube
|
||||
n_slices = 8
|
||||
if self.ctx and self.ctx.spec:
|
||||
mm = self.ctx.spec.get("cube", {}).get("memory_map", {})
|
||||
n_slices = mm.get("hbm_slices_per_cube", 8)
|
||||
return list(range(n_slices))
|
||||
|
||||
def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]:
|
||||
"""Return list of (sip, cube) pairs to fan out to."""
|
||||
from kernbench.runtime_api.kernel import (
|
||||
|
||||
@@ -162,7 +162,11 @@ class MCpuComponent(ComponentBase):
|
||||
Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
|
||||
PE_CPU sends ResponseMsg back via NOC → M_CPU on completion.
|
||||
Then sends aggregate ResponseMsg back to IO_CPU on the reverse path.
|
||||
|
||||
ADR-0009 D5: stamps target_start_ns so every PE in this fanout
|
||||
starts executing at the same env.now regardless of dispatch path.
|
||||
"""
|
||||
import dataclasses
|
||||
request = txn.request
|
||||
target_pe = getattr(request, "target_pe", "all")
|
||||
cube_prefix = self.node.id.rsplit(".", 1)[0] # e.g. "sip0.cube0"
|
||||
@@ -172,9 +176,13 @@ class MCpuComponent(ComponentBase):
|
||||
txn.done.succeed()
|
||||
return
|
||||
|
||||
# Fan out to each PE_CPU, using response-based aggregation
|
||||
sub_txns: list[Transaction] = []
|
||||
n_dispatched = 0
|
||||
# Resolve per-PE paths. If IO_CPU already stamped a global
|
||||
# target_start_ns (ADR-0009 D5 extended), pass it through
|
||||
# unchanged so every PE across every cube uses the same barrier.
|
||||
# Otherwise (e.g. direct-to-M_CPU launch in a unit test) compute
|
||||
# a per-cube barrier from env.now.
|
||||
per_pe: list[tuple[int, list[str], float]] = []
|
||||
max_latency = 0.0
|
||||
for pe_id in pe_ids:
|
||||
pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
|
||||
try:
|
||||
@@ -183,8 +191,24 @@ class MCpuComponent(ComponentBase):
|
||||
continue
|
||||
if len(path) < 2:
|
||||
continue
|
||||
latency = self.ctx.compute_path_latency_ns(path, nbytes=0)
|
||||
per_pe.append((pe_id, path, latency))
|
||||
if latency > max_latency:
|
||||
max_latency = latency
|
||||
|
||||
if getattr(request, "target_start_ns", None) is not None:
|
||||
stamped_request = request
|
||||
else:
|
||||
stamped_request = dataclasses.replace(
|
||||
request, target_start_ns=float(env.now) + max_latency,
|
||||
)
|
||||
|
||||
# Fan out to each PE_CPU, using response-based aggregation
|
||||
sub_txns: list[Transaction] = []
|
||||
n_dispatched = 0
|
||||
for pe_id, path, _lat in per_pe:
|
||||
sub_txn = Transaction(
|
||||
request=request, path=path, step=0,
|
||||
request=stamped_request, path=path, step=0,
|
||||
nbytes=0, done=env.event(),
|
||||
)
|
||||
yield self.out_ports[path[1]].put(sub_txn.advance())
|
||||
|
||||
@@ -95,6 +95,13 @@ class PeCpuComponent(ComponentBase):
|
||||
request = txn.request
|
||||
yield from self.run(env, 0)
|
||||
|
||||
# ADR-0009 D5: synchronized launch barrier. If M_CPU stamped a
|
||||
# target_start_ns, wait until then so every PE in this launch
|
||||
# begins pe_exec measurement at the same simulated time.
|
||||
target_start = getattr(request, "target_start_ns", None)
|
||||
if target_start is not None and target_start > env.now:
|
||||
yield env.timeout(float(target_start) - env.now)
|
||||
|
||||
kernel_fn = get_kernel(request.kernel_ref.name)
|
||||
num_programs = self._derive_num_programs(request)
|
||||
kernel_args = self._unpack_kernel_args(request)
|
||||
|
||||
@@ -186,13 +186,37 @@ class PeDmaComponent(PeEngineBase):
|
||||
# ── IPCQ inbound (fabric → PE_DMA → MemoryStore + PE_IPCQ) ──────
|
||||
|
||||
def _handle_ipcq_inbound(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||
"""At destination PE_DMA: atomically write data and forward metadata.
|
||||
"""At destination PE_DMA: pay terminal drain, then atomically write
|
||||
data and forward metadata.
|
||||
|
||||
ADR-0023 D9 (drain at inbound terminal): the Transaction carries
|
||||
``drain_ns = nbytes / bottleneck_bw_on_path`` stamped by the sender
|
||||
PE_DMA. Like every other Transaction terminal in the simulator (see
|
||||
``ComponentBase._forward_txn``), this drain must be paid when the
|
||||
Transaction reaches its destination. SRC-side ``tl.send`` is
|
||||
fire-and-forget — it never yields on ``sub_done`` — so paying the
|
||||
drain here does NOT delay the sender. What it DOES delay is the
|
||||
IpcqMetaArrival forwarded below: that delay is the only signal
|
||||
``tl.recv`` on DST blocks on, which is exactly the desired
|
||||
semantics — "send dispatches and returns; recv waits until the
|
||||
bytes have actually landed in its inbox".
|
||||
|
||||
The drain MUST be paid before the atomic block — inserting a yield
|
||||
inside would break invariant I6.
|
||||
|
||||
I6 (MUST): no SimPy yield between MemoryStore.write and the
|
||||
IpcqMetaArrival put into PE_IPCQ.
|
||||
"""
|
||||
from kernbench.common.ipcq_types import IpcqMetaArrival
|
||||
|
||||
# Pay terminal BW drain before the atomic write/metadata forward.
|
||||
# Without this, IPCQ effectively got fabric bandwidth for free at
|
||||
# the terminal (only intermediate-hop overhead_ns was charged),
|
||||
# making IPCQ lower than raw DMA at large sizes in benchmarks.
|
||||
drain = getattr(txn, "drain_ns", 0.0)
|
||||
if drain > 0:
|
||||
yield env.timeout(drain)
|
||||
|
||||
token = txn.request
|
||||
|
||||
# ── ATOMIC: do not introduce yield between these two operations ──
|
||||
|
||||
Reference in New Issue
Block a user