Fix Phase 1 slot-overwrite race + PE_MATH latency model (n_slots=4 safe)
Root cause: In ring all-reduce, PE_IPCQ's recv handler advances my_tail and issues a credit return immediately. With tight credit latency (0.12ns intra-cube), the sender can refill the slot BEFORE the receiver's outbound PE_DMA reads from it for the next send. The outbound snapshot then captures data from a later round instead of the round being forwarded.

Fix: Propagate TensorHandle.data (captured at recv-time, before credit return) through the entire send chain: tl.send(src=handle) → IpcqSendCmd.data → IpcqDmaToken.data. PE_DMA outbound already prefers token.data over a MemoryStore read, so the recv-time snapshot is used for the in-flight data. This eliminates the race: the snapshot is captured before the slot can be overwritten.

Additional fixes:
- PE_MATH handle_command: compute SIMD latency from the output tensor element count via _compute_ns(), using max(overhead_ns, compute_ns). Previously used overhead_ns=0.0 for all standalone MathCmd, making math ops take 0ns in SimPy.
- DataExecutor secondary sort: ops with the same t_start are sorted by op_kind (memory < gemm < math) so IPCQ slot writes execute before math reads.
- ipcq_copy is recorded at INBOUND time (receiver PE_DMA arrival) instead of outbound. Inbound time is after fabric propagation, so it sorts correctly relative to the receiver's math.
- record_copy accepts an explicit snapshot parameter (from token.data).

Result: N_ELEM=32 + 256-rank + n_slots=4 + cross-SIP now passes. n_slots reverted to 4 (the deeper buffer was a workaround and is no longer needed). 502 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -154,23 +154,13 @@ class PeDmaComponent(PeEngineBase):
|
||||
except Exception:
|
||||
token.data = None
|
||||
|
||||
# Record the IPCQ copy in op_log at OUTBOUND time. ADR-0020 D6:
|
||||
# Phase 2 replays the copy in t_start order; using outbound time
|
||||
# (rather than inbound) ensures the copy executes before any later
|
||||
# local op at the sender that might overwrite token.src_addr (e.g.
|
||||
# a tl.store after a recv).
|
||||
if self._op_logger is not None:
|
||||
try:
|
||||
self._op_logger.record_copy(
|
||||
t_start=float(env.now), t_end=float(env.now),
|
||||
component_id=self.node.id,
|
||||
src_space=token.src_space, src_addr=token.src_addr,
|
||||
dst_space=peer.buffer_kind,
|
||||
dst_addr=token.dst_addr,
|
||||
shape=token.shape, dtype=token.dtype, nbytes=token.nbytes,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
# Note: ipcq_copy is recorded at INBOUND time (in _handle_ipcq_inbound),
|
||||
# not here. Outbound time is too early — it precedes fabric propagation,
|
||||
# so in Phase 2 a later round's copy can sort before the receiver's
|
||||
# math for an earlier round, causing slot data corruption.
|
||||
# The secondary sort in DataExecutor (memory ops before math at the
|
||||
# same t_start) ensures the inbound copy runs before the local math
|
||||
# that reads the slot.
|
||||
|
||||
try:
|
||||
path = self.ctx.router.find_path(self._pe_prefix, peer_pe_dma)
|
||||
@@ -223,6 +213,28 @@ class PeDmaComponent(PeEngineBase):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Record the IPCQ copy at INBOUND time with embedded data snapshot.
|
||||
# The snapshot (token.data) was captured on the sender side, either
|
||||
# at recv time (forwarded handle) or by the outbound PE_DMA. Phase 2 writes it directly to
|
||||
# dst — it does NOT re-read from MemoryStore[src_addr], which may
|
||||
# have been mutated by a different PE's Phase 2 ops by that point.
|
||||
# DataExecutor's secondary sort (memory before math at same
|
||||
# t_start) ensures the write completes before the local math
|
||||
# that reads the slot.
|
||||
if self._op_logger is not None:
|
||||
try:
|
||||
self._op_logger.record_copy(
|
||||
t_start=float(env.now), t_end=float(env.now),
|
||||
component_id=self.node.id,
|
||||
src_space=token.src_space, src_addr=token.src_addr,
|
||||
dst_space=token.dst_endpoint.buffer_kind,
|
||||
dst_addr=token.dst_addr,
|
||||
shape=token.shape, dtype=token.dtype, nbytes=token.nbytes,
|
||||
snapshot=token.data,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 2. Forward IpcqMetaArrival to local PE_IPCQ
|
||||
ipcq_id = f"{self._pe_prefix}.pe_ipcq"
|
||||
if ipcq_id in self.out_ports:
|
||||
|
||||
@@ -221,6 +221,10 @@ class PeIpcqComponent(ComponentBase):
|
||||
handle_id=cmd.handle_id,
|
||||
shape=cmd.shape,
|
||||
dtype=cmd.dtype,
|
||||
# Carry the handle's recv-time data snapshot so the outbound
|
||||
# PE_DMA doesn't need to re-read from MemoryStore (which may
|
||||
# have been overwritten by a later inbound in the meantime).
|
||||
data=getattr(cmd, "data", None),
|
||||
sender_seq=qp["my_head"],
|
||||
src_sip=self._self_sip,
|
||||
src_cube=self._self_cube,
|
||||
|
||||
@@ -92,13 +92,31 @@ class PeMathComponent(PeEngineBase):
|
||||
token.pipeline_ctx.complete_tile()
|
||||
|
||||
def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
|
||||
"""Legacy PeInternalTxn handling."""
|
||||
"""PeInternalTxn handling for standalone MathCmd (CCL kernels).
|
||||
|
||||
Latency = max(overhead_ns, _compute_ns(num_elements)):
|
||||
- overhead_ns: fixed per-invocation setup cost (from node attrs).
|
||||
- _compute_ns: SIMD cycle-based model (from vector_width + clock_freq).
|
||||
The larger of the two dominates (setup-bound vs compute-bound).
|
||||
"""
|
||||
from kernbench.common.pe_commands import MathCmd
|
||||
import math as _math
|
||||
|
||||
cmd = pe_txn.command
|
||||
num_elements = 0
|
||||
if isinstance(cmd, MathCmd) and cmd.out.shape:
|
||||
num_elements = _math.prod(cmd.out.shape)
|
||||
|
||||
overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
|
||||
compute_ns = self._compute_ns(num_elements)
|
||||
ns = max(overhead_ns, compute_ns)
|
||||
|
||||
if self._accel:
|
||||
with self._accel.request() as req:
|
||||
yield req
|
||||
yield from self.run(env, 0)
|
||||
yield env.timeout(ns)
|
||||
else:
|
||||
yield from self.run(env, 0)
|
||||
yield env.timeout(ns)
|
||||
pe_txn.done.succeed()
|
||||
|
||||
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||
|
||||
Reference in New Issue
Block a user