ADR-0009 D5: chain-aware target_start_ns + zero-byte launch fanout
The single-walk predictor (find_node_path(io_cpu, pe_cpu) +
compute_path_latency_ns) under-shot actual dispatch latency for far
cubes -- the routing graph could pick a path bypassing M_CPU, and
non-zero-nbytes launch sub-txns serialized on shared first hops.
Far PEs arrived at _execute_kernel after target_start_ns, silently
skipped the barrier yield, and started pe_exec_start late. Their
reported pe_exec_ns under-counted by exactly the late_ns amount
(63 ns observed at h4 cube4.pe0 in the IPCQ test, up to 113 ns
worst case for cubes 9-11), producing the suspicious flat region
in the h4 IPCQ curve at 8192/10240 bytes.
Fix:
- IO_CPU predictor uses the explicit two-leg chain
(IO_CPU->M_CPU + M_CPU->PE_CPU - io.overhead - m.overhead), so
every PE on every targeted cube has a barrier >= its real
dispatch arrival.
- Kernel-launch fanout sub-txns carry nbytes=0 (control-plane,
not data-plane), removing the per-cube fanout serialization
that pushed far M_CPUs past the predictor.
- Legacy io_cpu mirror updated.
ADR-0009 D5 mechanism updated to specify the two-leg formula and
the nbytes=0 requirement. New tests/test_d5_barrier_invariant.py
asserts (a) no PE enters _execute_kernel after target_start_ns and
(b) every PE in a multi-cube launch has identical pe_exec_start --
both regressions silently pass on the existing
tests/test_kernel_launch_sync.py because that test only inspects
post-aggregation max(pe_exec_ns).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,194 @@
|
||||
"""ADR-0009 D5 invariant: all PEs targeted by a single kernel launch MUST
|
||||
begin executing the kernel body at the same simulated time, regardless of
|
||||
their dispatch path length.
|
||||
|
||||
These tests directly verify the invariant by capturing per-PE state at the
|
||||
top of `_execute_kernel`:
|
||||
|
||||
test_no_pe_arrives_after_target_start_ns
|
||||
Asserts: for every PE that enters _execute_kernel during a multi-cube
|
||||
launch, `env.now` at entry must be <= target_start_ns. Otherwise the
|
||||
PE's barrier yield would be a no-op and `pe_exec_start` would be set
|
||||
late, breaking the D5 "same simulated time" mandate.
|
||||
|
||||
test_all_pes_have_identical_pe_exec_start
|
||||
Asserts: every PE's `pe_exec_start` (the value of `env.now` recorded
|
||||
immediately AFTER the barrier yield) is identical across all PEs in
|
||||
the launch.
|
||||
|
||||
Both tests are expected to FAIL today and become the regression check the
|
||||
Phase 2 D5 predictor + fallback fix must make pass.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
|
||||
|
||||
def _capture_per_pe_d5_state():
|
||||
"""Monkey-patch PeCpuComponent._execute_kernel to record, per PE:
|
||||
|
||||
- entry_now: env.now at function entry (before any yield)
|
||||
- target_start_ns: the value carried by the request
|
||||
- barrier_yielded: True if the barrier yield fired (entry_now < target)
|
||||
- pe_exec_start: env.now immediately after the barrier check
|
||||
(i.e. the value the original code sets)
|
||||
|
||||
Returns (records: list[dict], restore: callable).
|
||||
"""
|
||||
import kernbench.components.builtin.pe_cpu as pe_cpu_mod
|
||||
|
||||
records: list[dict] = []
|
||||
original = pe_cpu_mod.PeCpuComponent._execute_kernel
|
||||
|
||||
def patched(self, env, txn):
|
||||
request = txn.request
|
||||
target_start = getattr(request, "target_start_ns", None)
|
||||
entry_now = float(env.now)
|
||||
rec = {
|
||||
"node_id": self.node.id,
|
||||
"entry_now": entry_now,
|
||||
"target_start_ns": (
|
||||
float(target_start) if target_start is not None else None
|
||||
),
|
||||
"barrier_yielded": (
|
||||
target_start is not None
|
||||
and float(target_start) > entry_now
|
||||
),
|
||||
"pe_exec_start": None, # filled below by sniff
|
||||
"late_ns": (
|
||||
None if target_start is None
|
||||
else max(0.0, entry_now - float(target_start))
|
||||
),
|
||||
}
|
||||
records.append(rec)
|
||||
|
||||
# We can't easily inject a callback at the original's
|
||||
# `pe_exec_start = env.now` line without rewriting it. Approximate:
|
||||
# if the original yields the barrier, env.now after the yield is
|
||||
# target_start_ns; otherwise pe_exec_start is entry_now (skipped).
|
||||
if rec["barrier_yielded"]:
|
||||
rec["pe_exec_start"] = float(target_start)
|
||||
else:
|
||||
rec["pe_exec_start"] = entry_now
|
||||
|
||||
yield from original(self, env, txn)
|
||||
|
||||
pe_cpu_mod.PeCpuComponent._execute_kernel = patched
|
||||
|
||||
def restore():
|
||||
pe_cpu_mod.PeCpuComponent._execute_kernel = original
|
||||
|
||||
return records, restore
|
||||
|
||||
|
||||
def _run_multicube_launch():
|
||||
"""Drive a no-op kernel launch across all 16 cubes x 8 PEs and return
|
||||
the per-PE D5 records collected by the monkey-patch."""
|
||||
records, restore = _capture_per_pe_d5_state()
|
||||
try:
|
||||
topo = resolve_topology(str(TOPOLOGY_PATH))
|
||||
engine = GraphEngine(topo.topology_obj, enable_data=True)
|
||||
spec = topo.topology_obj.spec
|
||||
|
||||
with RuntimeContext(
|
||||
engine=engine, target_device=DeviceSelector("all"),
|
||||
correlation_id="d5_barrier", spec=spec,
|
||||
) as ctx:
|
||||
dp = DPPolicy(
|
||||
cube="row_wise", pe="column_wise",
|
||||
num_cubes=16, num_pes=8,
|
||||
)
|
||||
|
||||
def kernel(t_ptr, n_elem, tl):
|
||||
pass # no-op
|
||||
|
||||
ctx.ahbm.set_device(0)
|
||||
t = ctx.zeros(
|
||||
(16, 8 * 64), dtype="f16", dp=dp, name="probe",
|
||||
)
|
||||
t.copy_(ctx.from_numpy(
|
||||
np.zeros((16, 8 * 64), dtype=np.float16),
|
||||
))
|
||||
|
||||
pending = ctx.launch(
|
||||
"d5_probe", kernel, t, 64, _defer_wait=True,
|
||||
)
|
||||
for h, _sip, meta in pending:
|
||||
ctx.wait(h, _meta=meta)
|
||||
finally:
|
||||
restore()
|
||||
return records
|
||||
|
||||
|
||||
def test_no_pe_arrives_after_target_start_ns():
|
||||
"""ADR-0009 D5: no PE may enter `_execute_kernel` after target_start_ns.
|
||||
|
||||
Today this fails because IO_CPU's predictor under-shoots actual
|
||||
dispatch latency for far cubes (cube4, cube9-15). Phase 2 fix:
|
||||
chain-aware predictor in IO_CPU + monotonic upward re-stamp in M_CPU.
|
||||
"""
|
||||
records = _run_multicube_launch()
|
||||
assert records, "expected per-PE _execute_kernel records"
|
||||
|
||||
late = [
|
||||
r for r in records
|
||||
if r["target_start_ns"] is not None
|
||||
and r["late_ns"] is not None
|
||||
and r["late_ns"] > 1e-6
|
||||
]
|
||||
|
||||
if late:
|
||||
# Provide actionable diagnostic in the failure.
|
||||
worst = sorted(late, key=lambda r: -r["late_ns"])[:5]
|
||||
details = "\n".join(
|
||||
f" {r['node_id']}: late by {r['late_ns']:.2f} ns "
|
||||
f"(entry_now={r['entry_now']:.2f}, "
|
||||
f"target_start_ns={r['target_start_ns']:.2f})"
|
||||
for r in worst
|
||||
)
|
||||
pytest.fail(
|
||||
f"ADR-0009 D5 violated: {len(late)}/{len(records)} PEs "
|
||||
f"entered _execute_kernel AFTER target_start_ns "
|
||||
f"(barrier yield silently skipped). "
|
||||
f"Worst offenders:\n{details}"
|
||||
)
|
||||
|
||||
|
||||
def test_all_pes_have_identical_pe_exec_start():
|
||||
"""ADR-0009 D5: every PE's pe_exec_start must be identical.
|
||||
|
||||
With D5 honored, every PE either yields to target_start_ns (start =
|
||||
target_start_ns) or, if late, would still be aligned by the M_CPU
|
||||
upward re-stamp (Phase 2). Today: 75/128 PEs in this launch have
|
||||
distinct pe_exec_start values because they skipped the barrier.
|
||||
"""
|
||||
records = _run_multicube_launch()
|
||||
assert records, "expected per-PE _execute_kernel records"
|
||||
|
||||
starts = sorted({round(r["pe_exec_start"], 6) for r in records})
|
||||
if len(starts) > 1:
|
||||
spread = max(starts) - min(starts)
|
||||
# Distribution of how many PEs at each distinct start time
|
||||
from collections import Counter
|
||||
bucket = Counter(round(r["pe_exec_start"], 6) for r in records)
|
||||
details = "\n".join(
|
||||
f" pe_exec_start={t}: {n} PEs"
|
||||
for t, n in sorted(bucket.items())
|
||||
)
|
||||
pytest.fail(
|
||||
f"ADR-0009 D5 violated: PEs have {len(starts)} distinct "
|
||||
f"pe_exec_start values (spread = {spread:.2f} ns); "
|
||||
f"D5 mandates a single common value. "
|
||||
f"Distribution:\n{details}"
|
||||
)
|
||||
Reference in New Issue
Block a user