Kernel-launch sync (ADR-0009 D5) and IPCQ drain at inbound (ADR-0023)

- KernelLaunchMsg gains target_start_ns: IO_CPU stamps a global barrier
  (max path latency across every target PE), M_CPU passes it through,
  PE_CPU yields until it before recording pe_exec_start. Every PE in a
  launch begins kernel execution at the same env.now regardless of its
  dispatch path length — this eliminates the per-PE dispatch-offset
  artifact in cross-PE and cross-cube latency measurements.

- PE_DMA._handle_ipcq_inbound now pays Transaction.drain_ns at the top,
  matching the terminal-drain behavior of ComponentBase._forward_txn for
  every non-IPCQ Transaction. SRC-side tl.send stays fire-and-forget
  (sender doesn't yield on sub_done); tl.recv now blocks until bytes
  have actually drained into its inbox.

- ComponentContext: new compute_path_latency_ns helper + node_overhead_ns
  field populated by GraphEngine.

- tests/test_kernel_launch_sync.py: asserts all PEs in one launch
  produce identical pe_exec_ns for a no-op kernel (zero spread).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-23 15:30:29 -07:00
parent 6918e6e906
commit 14d800b0ae
14 changed files with 409 additions and 17 deletions
+55 -1
View File
@@ -58,7 +58,18 @@ class IoCpuComponent(ComponentBase):
self._pending[key] = (expected, received, parent_done)
def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator:
"""Fan out sub-Transactions to target cube M_CPUs, wait for responses."""
"""Fan out sub-Transactions to target cube M_CPUs, wait for responses.
ADR-0009 D5 (extended): for KernelLaunchMsg, stamp a single global
target_start_ns = env.now + max(IO_CPU → any target PE_CPU path
latency across all target cubes). M_CPU passes this value through
unchanged; every PE in every cube yields until the same sim-time
before beginning kernel execution. Without this, cross-cube
launches would have each cube's M_CPU compute its own per-cube
barrier relative to its local env.now, leaving PEs on different
cubes out of sync (the "h3/h4 dispatch-offset artifact").
"""
import dataclasses
from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg
request = txn.request
@@ -72,6 +83,36 @@ class IoCpuComponent(ComponentBase):
txn.done.succeed()
return
# For KernelLaunchMsg, compute the global barrier once here so
# every downstream PE_CPU uses the same target_start_ns.
if isinstance(request, KernelLaunchMsg):
global_max_latency = 0.0
pe_ids = self._resolve_pe_ids(
getattr(request, "target_pe", "all")
)
for sip, cube in cube_targets:
for pe_id in pe_ids:
pe_cpu_id = (
f"sip{sip}.cube{cube}.pe{pe_id}.pe_cpu"
)
try:
path = self.ctx.router.find_node_path(
self.node.id, pe_cpu_id,
)
except Exception:
continue
if len(path) < 2:
continue
latency = self.ctx.compute_path_latency_ns(
path, nbytes=0,
)
if latency > global_max_latency:
global_max_latency = latency
request = dataclasses.replace(
request,
target_start_ns=float(env.now) + global_max_latency,
)
# Setup aggregation
self._pending[request.request_id] = (len(cube_targets), 0, txn.done)
@@ -91,6 +132,19 @@ class IoCpuComponent(ComponentBase):
)
yield self.out_ports[path[1]].put(sub_txn.advance())
def _resolve_pe_ids(self, target_pe: Any) -> list[int]:
    """Resolve ``target_pe`` into the list of PE indices inside one cube.

    Intended to mirror the M_CPU resolution logic, so keep the two in sync.

    Args:
        target_pe: An ``int`` selects that single PE, a ``tuple`` selects
            exactly the listed PEs, and any other value (e.g. ``"all"``)
            selects every PE in a cube.

    Returns:
        The PE indices to consider. For the "all" case the count is read
        from ``spec["cube"]["memory_map"]["hbm_slices_per_cube"]`` and
        defaults to 8 when the context, the spec, or any of those
        sections is missing or empty.
    """
    # Single source of truth for the fallback PE count.
    default_pe_count = 8

    if isinstance(target_pe, int):
        return [target_pe]
    if isinstance(target_pe, tuple):
        return list(target_pe)

    # "all": every PE in a cube.
    spec = self.ctx.spec if self.ctx else None
    if not spec:
        return list(range(default_pe_count))
    # ``or {}`` lets a section that is present but null fall back to the
    # default instead of raising AttributeError on ``None.get``.
    memory_map = (spec.get("cube") or {}).get("memory_map") or {}
    n_slices = memory_map.get("hbm_slices_per_cube", default_pe_count)
    return list(range(n_slices))
def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]:
"""Return list of (sip, cube) pairs to fan out to."""
from kernbench.runtime_api.kernel import (
+28 -4
View File
@@ -162,7 +162,11 @@ class MCpuComponent(ComponentBase):
Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
PE_CPU sends ResponseMsg back via NOC → M_CPU on completion.
Then sends aggregate ResponseMsg back to IO_CPU on the reverse path.
ADR-0009 D5: stamps target_start_ns so every PE in this fanout
starts executing at the same env.now regardless of dispatch path.
"""
import dataclasses
request = txn.request
target_pe = getattr(request, "target_pe", "all")
cube_prefix = self.node.id.rsplit(".", 1)[0] # e.g. "sip0.cube0"
@@ -172,9 +176,13 @@ class MCpuComponent(ComponentBase):
txn.done.succeed()
return
# Fan out to each PE_CPU, using response-based aggregation
sub_txns: list[Transaction] = []
n_dispatched = 0
# Resolve per-PE paths. If IO_CPU already stamped a global
# target_start_ns (ADR-0009 D5 extended), pass it through
# unchanged so every PE across every cube uses the same barrier.
# Otherwise (e.g. direct-to-M_CPU launch in a unit test) compute
# a per-cube barrier from env.now.
per_pe: list[tuple[int, list[str], float]] = []
max_latency = 0.0
for pe_id in pe_ids:
pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
try:
@@ -183,8 +191,24 @@ class MCpuComponent(ComponentBase):
continue
if len(path) < 2:
continue
latency = self.ctx.compute_path_latency_ns(path, nbytes=0)
per_pe.append((pe_id, path, latency))
if latency > max_latency:
max_latency = latency
if getattr(request, "target_start_ns", None) is not None:
stamped_request = request
else:
stamped_request = dataclasses.replace(
request, target_start_ns=float(env.now) + max_latency,
)
# Fan out to each PE_CPU, using response-based aggregation
sub_txns: list[Transaction] = []
n_dispatched = 0
for pe_id, path, _lat in per_pe:
sub_txn = Transaction(
request=request, path=path, step=0,
request=stamped_request, path=path, step=0,
nbytes=0, done=env.event(),
)
yield self.out_ports[path[1]].put(sub_txn.advance())
@@ -95,6 +95,13 @@ class PeCpuComponent(ComponentBase):
request = txn.request
yield from self.run(env, 0)
# ADR-0009 D5: synchronized launch barrier. If the request carries a
# target_start_ns (stamped by IO_CPU, or by M_CPU as a fallback), wait
# until then so every PE in this launch begins pe_exec measurement at
# the same simulated time.
target_start = getattr(request, "target_start_ns", None)
if target_start is not None and target_start > env.now:
yield env.timeout(float(target_start) - env.now)
kernel_fn = get_kernel(request.kernel_ref.name)
num_programs = self._derive_num_programs(request)
kernel_args = self._unpack_kernel_args(request)
+25 -1
View File
@@ -186,13 +186,37 @@ class PeDmaComponent(PeEngineBase):
# ── IPCQ inbound (fabric → PE_DMA → MemoryStore + PE_IPCQ) ──────
def _handle_ipcq_inbound(self, env: simpy.Environment, txn: Any) -> Generator:
"""At destination PE_DMA: atomically write data and forward metadata.
"""At destination PE_DMA: pay terminal drain, then atomically write
data and forward metadata.
ADR-0023 D9 (drain at inbound terminal): the Transaction carries
``drain_ns = nbytes / bottleneck_bw_on_path`` stamped by the sender
PE_DMA. Like every other Transaction terminal in the simulator (see
``ComponentBase._forward_txn``), this drain must be paid when the
Transaction reaches its destination. SRC-side ``tl.send`` is
fire-and-forget — it never yields on ``sub_done`` — so paying the
drain here does NOT delay the sender. What it DOES delay is the
IpcqMetaArrival forwarded below: that delay is the only signal
``tl.recv`` on DST blocks on, which is exactly the desired
semantics — "send dispatches and returns; recv waits until the
bytes have actually landed in its inbox".
The drain MUST be paid before the atomic block — inserting a yield
inside would break invariant I6.
I6 (MUST): no SimPy yield between MemoryStore.write and the
IpcqMetaArrival put into PE_IPCQ.
"""
from kernbench.common.ipcq_types import IpcqMetaArrival
# Pay terminal BW drain before the atomic write/metadata forward.
# Without this, IPCQ effectively got fabric bandwidth for free at
# the terminal (only intermediate-hop overhead_ns was charged),
# making IPCQ latency come out lower than raw DMA at large sizes in benchmarks.
drain = getattr(txn, "drain_ns", 0.0)
if drain > 0:
yield env.timeout(drain)
token = txn.request
# ── ATOMIC: do not introduce yield between these two operations ──
+19
View File
@@ -26,6 +26,9 @@ class ComponentContext:
spec: dict = field(default_factory=dict) # topology spec (cube layout, PE count, etc.)
memory_store: Any = None # MemoryStore for Phase 1 data-aware execution (ADR-0020)
op_logger: Any = None # OpLogger for Phase 1 op recording (ADR-0020)
# node_id -> overhead_ns (ADR-0009 D5: used by IO_CPU and M_CPU to compute
# per-PE dispatch latency when stamping target_start_ns on KernelLaunchMsg).
node_overhead_ns: dict[str, float] = field(default_factory=dict)
def get_shared_resource(
self, env: simpy.Environment, key: str, capacity: int = 1,
@@ -52,3 +55,19 @@ class ComponentContext:
if min_bw == float("inf"):
return 0.0
return nbytes / min_bw
def compute_path_latency_ns(self, path: list[str], nbytes: int = 0) -> float:
    """Return the formula latency along ``path``: wire + per-node overhead + drain.

    ADR-0009 D5: IO_CPU and M_CPU call this (with ``nbytes=0``) to compute
    per-PE dispatch latency when stamping ``target_start_ns`` on a
    KernelLaunchMsg fanout.

    Args:
        path: Ordered node ids from source to destination.
        nbytes: Payload size forwarded to ``compute_drain_ns``.

    Returns:
        The sum of three terms:
        * ``distance_mm * ns_per_mm`` for every consecutive node pair that
          has an entry in ``edge_map`` (pairs without one add nothing);
        * ``node_overhead_ns`` for every node in ``path`` (0.0 if absent);
        * ``compute_drain_ns(path, nbytes)``.
    """
    total = 0.0
    # Walk consecutive (src, dst) hops; the tuple is the edge_map key.
    for hop in zip(path, path[1:]):
        edge = self.edge_map.get(hop)
        if edge:
            total += edge.distance_mm * self.ns_per_mm
    for node_id in path:
        total += self.node_overhead_ns.get(node_id, 0.0)
    total += self.compute_drain_ns(path, nbytes)
    return total
@@ -58,7 +58,13 @@ class IoCpuComponent(ComponentBase):
self._pending[key] = (expected, received, parent_done)
def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator:
"""Fan out sub-Transactions to target cube M_CPUs, wait for responses."""
"""Fan out sub-Transactions to target cube M_CPUs, wait for responses.
ADR-0009 D5 (extended): stamp a global target_start_ns on
KernelLaunchMsg so every PE across every target cube starts at
the same env.now. See the non-legacy builtin for full rationale.
"""
import dataclasses
from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg
request = txn.request
@@ -72,6 +78,34 @@ class IoCpuComponent(ComponentBase):
txn.done.succeed()
return
if isinstance(request, KernelLaunchMsg):
global_max_latency = 0.0
pe_ids = self._resolve_pe_ids(
getattr(request, "target_pe", "all")
)
for sip, cube in cube_targets:
for pe_id in pe_ids:
pe_cpu_id = (
f"sip{sip}.cube{cube}.pe{pe_id}.pe_cpu"
)
try:
path = self.ctx.router.find_node_path(
self.node.id, pe_cpu_id,
)
except Exception:
continue
if len(path) < 2:
continue
latency = self.ctx.compute_path_latency_ns(
path, nbytes=0,
)
if latency > global_max_latency:
global_max_latency = latency
request = dataclasses.replace(
request,
target_start_ns=float(env.now) + global_max_latency,
)
# Setup aggregation
self._pending[request.request_id] = (len(cube_targets), 0, txn.done)
@@ -91,6 +125,18 @@ class IoCpuComponent(ComponentBase):
)
yield self.out_ports[path[1]].put(sub_txn.advance())
def _resolve_pe_ids(self, target_pe: Any) -> list[int]:
    """Resolve ``target_pe`` into the list of PE indices inside one cube.

    Intended to mirror the M_CPU resolution logic, so keep the two in sync.

    Args:
        target_pe: An ``int`` selects that single PE, a ``tuple`` selects
            exactly the listed PEs, and any other value (e.g. ``"all"``)
            selects every PE in a cube.

    Returns:
        The PE indices to consider. For the "all" case the count is read
        from ``spec["cube"]["memory_map"]["hbm_slices_per_cube"]`` and
        defaults to 8 when the context, the spec, or any of those
        sections is missing or empty.
    """
    # Single source of truth for the fallback PE count.
    default_pe_count = 8

    if isinstance(target_pe, int):
        return [target_pe]
    if isinstance(target_pe, tuple):
        return list(target_pe)

    # "all": every PE in a cube.
    spec = self.ctx.spec if self.ctx else None
    if not spec:
        return list(range(default_pe_count))
    # ``or {}`` lets a section that is present but null fall back to the
    # default instead of raising AttributeError on ``None.get``.
    memory_map = (spec.get("cube") or {}).get("memory_map") or {}
    n_slices = memory_map.get("hbm_slices_per_cube", default_pe_count)
    return list(range(n_slices))
def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]:
"""Return list of (sip, cube) pairs to fan out to."""
from kernbench.runtime_api.kernel import (
@@ -162,7 +162,11 @@ class MCpuComponent(ComponentBase):
Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
PE_CPU sends ResponseMsg back via NOC → M_CPU on completion.
Then sends aggregate ResponseMsg back to IO_CPU on the reverse path.
ADR-0009 D5: stamps target_start_ns so every PE in this fanout
starts executing at the same env.now regardless of dispatch path.
"""
import dataclasses
request = txn.request
target_pe = getattr(request, "target_pe", "all")
cube_prefix = self.node.id.rsplit(".", 1)[0] # e.g. "sip0.cube0"
@@ -172,9 +176,10 @@ class MCpuComponent(ComponentBase):
txn.done.succeed()
return
# Fan out to each PE_CPU, using response-based aggregation
sub_txns: list[Transaction] = []
n_dispatched = 0
# Resolve per-PE paths. If IO_CPU already stamped a global
# target_start_ns (ADR-0009 D5 extended), pass it through.
per_pe: list[tuple[int, list[str], float]] = []
max_latency = 0.0
for pe_id in pe_ids:
pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
try:
@@ -183,8 +188,24 @@ class MCpuComponent(ComponentBase):
continue
if len(path) < 2:
continue
latency = self.ctx.compute_path_latency_ns(path, nbytes=0)
per_pe.append((pe_id, path, latency))
if latency > max_latency:
max_latency = latency
if getattr(request, "target_start_ns", None) is not None:
stamped_request = request
else:
stamped_request = dataclasses.replace(
request, target_start_ns=float(env.now) + max_latency,
)
# Fan out to each PE_CPU, using response-based aggregation
sub_txns: list[Transaction] = []
n_dispatched = 0
for pe_id, path, _lat in per_pe:
sub_txn = Transaction(
request=request, path=path, step=0,
request=stamped_request, path=path, step=0,
nbytes=0, done=env.event(),
)
yield self.out_ports[path[1]].put(sub_txn.advance())
@@ -71,6 +71,13 @@ class PeCpuComponent(ComponentBase):
request = txn.request
yield from self.run(env, 0)
# ADR-0009 D5: synchronized launch barrier. If the request carries a
# target_start_ns (stamped by IO_CPU, or by M_CPU as a fallback), wait
# until then so every PE in this launch begins pe_exec measurement at
# the same simulated time.
target_start = getattr(request, "target_start_ns", None)
if target_start is not None and target_start > env.now:
yield env.timeout(float(target_start) - env.now)
kernel_fn = get_kernel(request.kernel_ref.name)
num_programs = self._derive_num_programs(request)
kernel_args = self._unpack_kernel_args(request)