ADR-0009 D5: chain-aware target_start_ns + zero-byte launch fanout
The single-walk predictor (find_node_path(io_cpu, pe_cpu) +
compute_path_latency_ns) undershot actual dispatch latency for far
cubes -- the routing graph could pick a path bypassing M_CPU, and
non-zero-nbytes launch sub-txns serialized on shared first hops.
Far PEs arrived at _execute_kernel after target_start_ns, silently
skipped the barrier yield, and started pe_exec_start late. Their
reported pe_exec_ns was under-counted by exactly the late_ns amount
(63 ns observed at h4 cube4.pe0 in the IPCQ test, up to 113 ns
worst case for cubes 9-11), producing the suspicious flat region
in the h4 IPCQ curve at 8192/10240 bytes.
Fix:
- IO_CPU predictor uses the explicit two-leg chain
(IO_CPU->M_CPU + M_CPU->PE_CPU - io.overhead - m.overhead), so
every PE on every targeted cube has a barrier >= its real
dispatch arrival.
- Kernel-launch fanout sub-txns carry nbytes=0 (control-plane,
not data-plane), removing the per-cube fanout serialization
that pushed far M_CPUs past the predictor.
- Legacy io_cpu mirror updated.
ADR-0009 D5 mechanism updated to specify the two-leg formula and
the nbytes=0 requirement. New tests/test_d5_barrier_invariant.py
asserts (a) no PE enters _execute_kernel after target_start_ns and
(b) every PE in a multi-cube launch has identical pe_exec_start --
both regressions silently pass on the existing
tests/test_kernel_launch_sync.py because that test only inspects
post-aggregation max(pe_exec_ns).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -86,26 +86,41 @@ class IoCpuComponent(ComponentBase):
|
||||
# For KernelLaunchMsg, compute the global barrier once here so
|
||||
# every downstream PE_CPU uses the same target_start_ns.
|
||||
if isinstance(request, KernelLaunchMsg):
|
||||
io_overhead = self.ctx.node_overhead_ns.get(self.node.id, 0.0)
|
||||
global_max_latency = 0.0
|
||||
pe_ids = self._resolve_pe_ids(
|
||||
getattr(request, "target_pe", "all")
|
||||
)
|
||||
for sip, cube in cube_targets:
|
||||
try:
|
||||
m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
|
||||
io_to_m_path = self.ctx.router.find_node_path(
|
||||
self.node.id, m_cpu_id,
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
if len(io_to_m_path) < 2:
|
||||
continue
|
||||
leg1 = self.ctx.compute_path_latency_ns(
|
||||
io_to_m_path, nbytes=0,
|
||||
)
|
||||
m_overhead = self.ctx.node_overhead_ns.get(m_cpu_id, 0.0)
|
||||
for pe_id in pe_ids:
|
||||
pe_cpu_id = (
|
||||
f"sip{sip}.cube{cube}.pe{pe_id}.pe_cpu"
|
||||
)
|
||||
try:
|
||||
path = self.ctx.router.find_node_path(
|
||||
self.node.id, pe_cpu_id,
|
||||
m_to_pe_path = self.ctx.router.find_node_path(
|
||||
m_cpu_id, pe_cpu_id,
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
if len(path) < 2:
|
||||
if len(m_to_pe_path) < 2:
|
||||
continue
|
||||
latency = self.ctx.compute_path_latency_ns(
|
||||
path, nbytes=0,
|
||||
leg2 = self.ctx.compute_path_latency_ns(
|
||||
m_to_pe_path, nbytes=0,
|
||||
)
|
||||
latency = leg1 + leg2 - io_overhead - m_overhead
|
||||
if latency > global_max_latency:
|
||||
global_max_latency = latency
|
||||
request = dataclasses.replace(
|
||||
@@ -116,7 +131,12 @@ class IoCpuComponent(ComponentBase):
|
||||
# Setup aggregation
|
||||
self._pending[request.request_id] = (len(cube_targets), 0, txn.done)
|
||||
|
||||
# Fan out to each target cube's M_CPU
|
||||
# Fan out to each target cube's M_CPU. Kernel-launch fanout
|
||||
# carries control metadata only; nbytes is forced to 0 for
|
||||
# KernelLaunchMsg so the launch sub-txns do not occupy data-fabric
|
||||
# BW (would otherwise serialize 16 cubes worth of fanout on the
|
||||
# shared first hop and break ADR-0009 D5's barrier prediction).
|
||||
is_kernel_launch = isinstance(request, KernelLaunchMsg)
|
||||
for sip, cube in cube_targets:
|
||||
try:
|
||||
m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
|
||||
@@ -127,7 +147,8 @@ class IoCpuComponent(ComponentBase):
|
||||
continue
|
||||
sub_txn = Transaction(
|
||||
request=request, path=path, step=0,
|
||||
nbytes=txn.nbytes, done=env.event(),
|
||||
nbytes=0 if is_kernel_launch else txn.nbytes,
|
||||
done=env.event(),
|
||||
result_data=txn.result_data,
|
||||
)
|
||||
yield self.out_ports[path[1]].put(sub_txn.advance())
|
||||
|
||||
@@ -79,26 +79,41 @@ class IoCpuComponent(ComponentBase):
|
||||
return
|
||||
|
||||
if isinstance(request, KernelLaunchMsg):
|
||||
io_overhead = self.ctx.node_overhead_ns.get(self.node.id, 0.0)
|
||||
global_max_latency = 0.0
|
||||
pe_ids = self._resolve_pe_ids(
|
||||
getattr(request, "target_pe", "all")
|
||||
)
|
||||
for sip, cube in cube_targets:
|
||||
try:
|
||||
m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
|
||||
io_to_m_path = self.ctx.router.find_node_path(
|
||||
self.node.id, m_cpu_id,
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
if len(io_to_m_path) < 2:
|
||||
continue
|
||||
leg1 = self.ctx.compute_path_latency_ns(
|
||||
io_to_m_path, nbytes=0,
|
||||
)
|
||||
m_overhead = self.ctx.node_overhead_ns.get(m_cpu_id, 0.0)
|
||||
for pe_id in pe_ids:
|
||||
pe_cpu_id = (
|
||||
f"sip{sip}.cube{cube}.pe{pe_id}.pe_cpu"
|
||||
)
|
||||
try:
|
||||
path = self.ctx.router.find_node_path(
|
||||
self.node.id, pe_cpu_id,
|
||||
m_to_pe_path = self.ctx.router.find_node_path(
|
||||
m_cpu_id, pe_cpu_id,
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
if len(path) < 2:
|
||||
if len(m_to_pe_path) < 2:
|
||||
continue
|
||||
latency = self.ctx.compute_path_latency_ns(
|
||||
path, nbytes=0,
|
||||
leg2 = self.ctx.compute_path_latency_ns(
|
||||
m_to_pe_path, nbytes=0,
|
||||
)
|
||||
latency = leg1 + leg2 - io_overhead - m_overhead
|
||||
if latency > global_max_latency:
|
||||
global_max_latency = latency
|
||||
request = dataclasses.replace(
|
||||
@@ -109,7 +124,7 @@ class IoCpuComponent(ComponentBase):
|
||||
# Setup aggregation
|
||||
self._pending[request.request_id] = (len(cube_targets), 0, txn.done)
|
||||
|
||||
# Fan out to each target cube's M_CPU
|
||||
is_kernel_launch = isinstance(request, KernelLaunchMsg)
|
||||
for sip, cube in cube_targets:
|
||||
try:
|
||||
m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
|
||||
@@ -120,7 +135,8 @@ class IoCpuComponent(ComponentBase):
|
||||
continue
|
||||
sub_txn = Transaction(
|
||||
request=request, path=path, step=0,
|
||||
nbytes=txn.nbytes, done=env.event(),
|
||||
nbytes=0 if is_kernel_launch else txn.nbytes,
|
||||
done=env.event(),
|
||||
result_data=txn.result_data,
|
||||
)
|
||||
yield self.out_ports[path[1]].put(sub_txn.advance())
|
||||
|
||||
Reference in New Issue
Block a user