"""Diagnostic for the inter-cube RAW > IPCQ asymmetry on h3/h4 plots. Single-shot run at h3 (sip0.cube0.pe0 -> sip0.cube1.pe0), nbytes=4096. Captures per-PE pe_exec_ns and the actual path / drain / per-node overhead breakdown for the RAW sub-txn (PE_DMA -> remote HBM_CTRL) vs the IPCQ outbound sub-txn (PE_DMA -> peer PE_DMA), so we can localize the gap to one of: (a) drain at HBM-BW (RAW) vs fabric-BW (IPCQ) (b) path-length / per-node overhead asymmetry (c) RAW SRC paying tl.load (local HBM read) on top of remote tl.store while IPCQ DST only pays inbound traversal+drain. Phase 1 / test-only. No production code is modified. """ from __future__ import annotations from pathlib import Path import numpy as np import pytest from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip from kernbench.policy.placement.dp import DPPolicy from kernbench.runtime_api.context import RuntimeContext from kernbench.runtime_api.types import DeviceSelector from kernbench.sim_engine.engine import GraphEngine from kernbench.topology.builder import resolve_topology TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" import os # Allow the test to be re-run for h4 (inter-cube vertical) at multiple sizes # to investigate why IPCQ slope flattens past 8192 B (path may differ). NBYTES = int(os.environ.get("DIAG_NBYTES", "4096")) ELEM_BYTES = 2 N_ELEM = NBYTES // ELEM_BYTES N_CUBES = 16 N_PES = 8 HOP = os.environ.get("DIAG_HOP", "h3") if HOP == "h4": SRC = (0, 0, 0) DST = (0, 4, 0) # h4 inter-cube vertical else: SRC = (0, 0, 0) DST = (0, 1, 0) # h3 inter-cube horizontal # ── Per-PE pe_exec_ns capture via monkey-patch ─────────────────────── def _install_barrier_capture(): """Wrap PeCpuComponent._execute_kernel to log, for every PE that enters: env.now at entry, target_start_ns the request carried, whether the barrier yield fired (i.e. env.now < target_start_ns), and env.now at pe_exec_start. """ import kernbench.components.builtin.pe_cpu as pe_cpu_mod log: list[dict] = [] original = pe_cpu_mod.PeCpuComponent._execute_kernel def patched(self, env, txn): request = txn.request target_start = getattr(request, "target_start_ns", None) entry_now = float(env.now) log_entry = { "node_id": self.node.id, "entry_now": entry_now, "target_start_ns": ( float(target_start) if target_start is not None else None ), "barrier_skipped": ( target_start is None or float(target_start) <= entry_now ), "delta_late_ns": ( None if target_start is None else max(0.0, entry_now - float(target_start)) ), } log.append(log_entry) yield from original(self, env, txn) pe_cpu_mod.PeCpuComponent._execute_kernel = patched def restore(): pe_cpu_mod.PeCpuComponent._execute_kernel = original return log, restore def _install_per_pe_capture(): """Wrap PeCpuComponent._execute_kernel so we record (node_id -> pe_exec_ns) for every PE that executes a kernel during the run. Returns (capture_dict, restore_callable). """ import kernbench.components.builtin.pe_cpu as pe_cpu_mod captured: dict[str, float] = {} original = pe_cpu_mod.PeCpuComponent._execute_kernel def patched(self, env, txn): gen = original(self, env, txn) try: value = yield from gen finally: v = txn.result_data.get("pe_exec_ns") if v is not None: captured[self.node.id] = float(v) return value pe_cpu_mod.PeCpuComponent._execute_kernel = patched def restore(): pe_cpu_mod.PeCpuComponent._execute_kernel = original return captured, restore def _install_recv_capture(target_node_id: str): """Wrap PeIpcqComponent._handle_recv to log entry/exit times and the peer_head_cache/my_tail values seen at the start. This pins down whether recv ever blocked on a wait_event, or whether it consumed without waiting (i.e. peer_head_cache > my_tail at entry). """ import kernbench.components.builtin.pe_ipcq as pe_ipcq_mod log: list[dict] = [] original = pe_ipcq_mod.PeIpcqComponent._handle_recv def patched(self, env, req, cmd): if self.node.id != target_node_id: yield from original(self, env, req, cmd) return # Snapshot state before dispatch d = cmd.direction qp = self._queue_pairs.get(d, {}) log.append({ "phase": "enter", "t": float(env.now), "direction": d, "peer_head_cache": qp.get("peer_head_cache"), "my_tail": qp.get("my_tail"), }) yield from original(self, env, req, cmd) qp = self._queue_pairs.get(d, {}) log.append({ "phase": "exit", "t": float(env.now), "direction": d, "peer_head_cache": qp.get("peer_head_cache"), "my_tail": qp.get("my_tail"), }) pe_ipcq_mod.PeIpcqComponent._handle_recv = patched def restore(): pe_ipcq_mod.PeIpcqComponent._handle_recv = original return log, restore def _install_meta_arrival_capture(target_node_id: str): """Log every IpcqMetaArrival that lands on ``target_node_id`` PE_IPCQ. Records (env_now, sender_seq, dst_addr, matched_direction, peer_head_cache_before, my_tail_before). """ import kernbench.components.builtin.pe_ipcq as pe_ipcq_mod log: list[dict] = [] original = pe_ipcq_mod.PeIpcqComponent._handle_meta_arrival def patched(self, msg): if self.node.id == target_node_id: token = msg.token now = float(self._env.now) if hasattr(self, "_env") else 0.0 # _env is not stored on the component; use ctx? Fall back to # introspection via self._inbox._env (SimPy stores reference). try: now = float(self._inbox._env.now) except Exception: pass entry = { "t": now, "sender_seq": getattr(token, "sender_seq", None), "dst_addr": getattr(token, "dst_addr", None), "src_sip": getattr(token, "src_sip", None), "src_cube": getattr(token, "src_cube", None), "src_pe": getattr(token, "src_pe", None), "src_direction": getattr(token, "src_direction", None), "nbytes": getattr(token, "nbytes", None), "matched_direction": None, "peer_head_cache_before": {}, "my_tail_before": {}, } for d, qp in self._queue_pairs.items(): entry["peer_head_cache_before"][d] = qp["peer_head_cache"] entry["my_tail_before"][d] = qp["my_tail"] base = qp["my_rx_base_pa"] size = qp["n_slots"] * qp["slot_size"] if base <= entry["dst_addr"] < base + size: entry["matched_direction"] = d log.append(entry) return original(self, msg) pe_ipcq_mod.PeIpcqComponent._handle_meta_arrival = patched def restore(): pe_ipcq_mod.PeIpcqComponent._handle_meta_arrival = original return log, restore def _snapshot_qp_state(engine, target_node_id: str) -> dict: """Snapshot every direction's qp state on the target PE_IPCQ now. Captures peer_head_cache, my_tail, my_rx_base_pa, n_slots, slot_size for each installed direction. """ comp = engine._components.get(target_node_id) if comp is None: return {} return { d: { "peer_head_cache": qp["peer_head_cache"], "my_tail": qp["my_tail"], "my_rx_base_pa": qp["my_rx_base_pa"], "n_slots": qp["n_slots"], "slot_size": qp["slot_size"], "rx_range": ( qp["my_rx_base_pa"], qp["my_rx_base_pa"] + qp["n_slots"] * qp["slot_size"], ), } for d, qp in comp.queue_pairs.items() } # ── Path / drain breakdown using engine ctx ────────────────────────── def _path_breakdown(ctx, path: list[str], nbytes: int) -> dict: edge_total_ns = 0.0 edge_details = [] min_bw = float("inf") for i in range(len(path) - 1): edge = ctx.edge_map.get((path[i], path[i + 1])) if edge is None: edge_details.append((path[i], path[i + 1], None, None, None)) continue prop_ns = edge.distance_mm * ctx.ns_per_mm edge_total_ns += prop_ns bw = getattr(edge, "bw_gbs", None) or 0.0 if bw > 0 and bw < min_bw: min_bw = bw edge_details.append( (path[i], path[i + 1], edge.distance_mm, prop_ns, bw), ) overhead_total_ns = 0.0 overhead_details = [] for nid in path: oh = float(ctx.node_overhead_ns.get(nid, 0.0)) overhead_total_ns += oh overhead_details.append((nid, oh)) drain_ns = ctx.compute_drain_ns(path, nbytes) bottleneck_bw = None if min_bw == float("inf") else min_bw return { "path": path, "edges": edge_details, "edge_total_ns": edge_total_ns, "overheads": overhead_details, "overhead_total_ns": overhead_total_ns, "drain_ns": drain_ns, "bottleneck_bw_gbs": bottleneck_bw, "expected_total_ns": edge_total_ns + overhead_total_ns + drain_ns, } def _print_breakdown(label: str, br: dict) -> None: print(f"\n {label}") print(f" path ({len(br['path'])} nodes):") for nid in br["path"]: print(f" - {nid}") print(f" edges (prop. delay):") for src, dst, dist_mm, prop_ns, bw in br["edges"]: if dist_mm is None: print(f" ! {src} -> {dst} EDGE NOT FOUND IN edge_map") continue print( f" {src} -> {dst} " f"dist={dist_mm:.3f}mm prop={prop_ns:.2f}ns " f"bw={bw or 0:.2f}GB/s" ) print(f" per-node overhead_ns:") for nid, oh in br["overheads"]: if oh > 0: print(f" {nid:<60s} overhead_ns={oh:.2f}") print(f" edge_total_ns = {br['edge_total_ns']:.2f}") print(f" overhead_total_ns = {br['overhead_total_ns']:.2f}") print(f" bottleneck_bw_gbs = {br['bottleneck_bw_gbs']}") print(f" drain_ns (nbytes={NBYTES}) = {br['drain_ns']:.2f}") print(f" expected_total_ns = {br['expected_total_ns']:.2f}") # ── RAW path scenario ──────────────────────────────────────────────── def _dump_src_op_records(engine, src_sip, src_cube, src_pe, label) -> None: """Print op_logger records for ops on the SRC PE. The op log captures t_start/t_end for memory/math/gemm/copy ops on every component, so we can see how long tl.load vs tl.store vs tl.send actually took at the engine level. """ op_logger = getattr(engine, "_op_logger", None) if op_logger is None: print(f" ({label}) op_logger not available") return src_prefix = f"sip{src_sip}.cube{src_cube}.pe{src_pe}." recs = [r for r in op_logger.records if r.component_id.startswith(src_prefix)] print(f" ({label}) op_logger records on SRC PE ({src_prefix}*):") for r in recs[:40]: dur = r.t_end - r.t_start comp_short = r.component_id.replace(src_prefix, "") params_short = "" if "nbytes" in r.params: params_short = f" nbytes={r.params['nbytes']}" if "src_addr" in r.params: params_short += f" src_addr={r.params['src_addr']}" if "dst_addr" in r.params: params_short += f" dst_addr={r.params['dst_addr']}" print( f" t=[{r.t_start:7.2f}..{r.t_end:7.2f}] dur={dur:6.2f}ns " f"{comp_short:<25s} {r.op_kind:<8s} {r.op_name:<12s}{params_short}" ) def _run_raw(): captured, restore = _install_per_pe_capture() try: topo = resolve_topology(str(TOPOLOGY_PATH)) engine = GraphEngine(topo.topology_obj, enable_data=True) spec = topo.topology_obj.spec src_sip, src_cube, src_pe = SRC dst_sip, dst_cube, dst_pe = DST assert src_sip == dst_sip src_off = (src_cube * N_PES + src_pe) * N_ELEM * ELEM_BYTES dst_off = (dst_cube * N_PES + dst_pe) * N_ELEM * ELEM_BYTES with RuntimeContext( engine=engine, target_device=DeviceSelector("all"), correlation_id="diag_raw", spec=spec, ) as rt: dp = DPPolicy( cube="row_wise", pe="column_wise", num_cubes=N_CUBES, num_pes=N_PES, ) rt.ahbm.set_device(src_sip) t = rt.zeros( (N_CUBES, N_PES * N_ELEM), dtype="f16", dp=dp, name="raw_tensor", ) t.copy_(rt.from_numpy( np.full((N_CUBES, N_PES * N_ELEM), 1.0, dtype=np.float16), )) def kernel(t_ptr, n_elem, tl): pe_id = tl.program_id(axis=0) cube_id = tl.program_id(axis=1) if cube_id == src_cube and pe_id == src_pe: data = tl.load( t_ptr + src_off, shape=(n_elem,), dtype="f16", ) tl.store(t_ptr + dst_off, data) pending = rt.launch( "diag_raw_kernel", kernel, t, N_ELEM, _defer_wait=True, ) for h, _sip, meta in pending: rt.wait(h, _meta=meta) # Compute the RAW sub-txn path: src PE_DMA -> dst HBM_CTRL from kernbench.policy.address.phyaddr import PhysAddr ctx = next(iter(engine._components.values())).ctx src_pe_prefix = f"sip{src_sip}.cube{src_cube}.pe{src_pe}" # Resolve dst PA to HBM controller node # The raw store kernel issues DmaWriteCmd on dst VA; in the engine # this is translated via PE_MMU. For diagnostic we approximate # the destination as the dst cube's HBM controller for slice # belonging to dst_pe. # Use the resolver on a constructed PA matching the same memory # slice the kernel writes to. # The tensor is "row_wise" sharded across cubes, so each cube # owns row[cube_id, :], with each PE owning a column slice. # The actual dst PA depends on the AHBM allocator; we read it # via the tensor's shard map. shard_map = getattr(t, "_shard_map", None) or getattr(t, "shard_map", None) # Fallback: query the resolver directly by constructing a PA in # the dst cube's HBM region. If shard_map is unavailable, still # show the breakdown for src-PE-DMA -> first reachable HBM_CTRL # in dst cube. dst_hbm_id = f"sip{dst_sip}.cube{dst_cube}.hbm_ctrl" if dst_hbm_id not in engine._components: # try alternate naming for nid in engine._components.keys(): if ( nid.startswith(f"sip{dst_sip}.cube{dst_cube}.") and "hbm" in nid ): dst_hbm_id = nid break # find_path() prepends ".pe_dma" to src_pe automatically try: raw_path = ctx.router.find_path(src_pe_prefix, dst_hbm_id) except Exception as e: raw_path = [] print(f" WARN: find_path raw failed: {e}") if not raw_path: # Try other HBM-related node names in dst cube for nid in engine._components.keys(): if not nid.startswith(f"sip{dst_sip}.cube{dst_cube}."): continue if "hbm" not in nid: continue try: p = ctx.router.find_path(src_pe_prefix, nid) except Exception: p = [] if p: raw_path = p print(f" (fallback raw dst node: {nid})") break return captured, ctx, raw_path, engine finally: restore() # ── IPCQ path scenario ─────────────────────────────────────────────── def _run_ipcq(): captured, restore = _install_per_pe_capture() dst_pe_ipcq_id = ( f"sip{DST[0]}.cube{DST[1]}.pe{DST[2]}.pe_ipcq" ) arrival_log, restore_arrival = _install_meta_arrival_capture( dst_pe_ipcq_id, ) recv_log, restore_recv = _install_recv_capture(dst_pe_ipcq_id) barrier_log, restore_barrier = _install_barrier_capture() try: topo = resolve_topology(str(TOPOLOGY_PATH)) engine = GraphEngine(topo.topology_obj, enable_data=True) spec = topo.topology_obj.spec src_sip, src_cube, src_pe = SRC dst_sip, dst_cube, dst_pe = DST cfg = load_ccl_config() merged = resolve_algorithm_config(cfg, name="lrab_hierarchical_allreduce") merged["slot_size"] = max(int(merged.get("slot_size", 4096)), NBYTES) with RuntimeContext( engine=engine, target_device=DeviceSelector("all"), correlation_id="diag_ipcq", spec=spec, ) as rt: configure_sfr_intercube_multisip(engine, spec, merged) dp = DPPolicy( cube="row_wise", pe="column_wise", num_cubes=N_CUBES, num_pes=N_PES, ) def kernel(t_ptr, n_elem, tl): pe_id = tl.program_id(axis=0) cube_id = tl.program_id(axis=1) if cube_id == src_cube and pe_id == src_pe: data = tl.load(t_ptr, shape=(n_elem,), dtype="f16") tl.send(dir=("E" if HOP == "h3" else "S"), src=data) elif cube_id == dst_cube and pe_id == dst_pe: tl.recv( dir=("W" if HOP == "h3" else "N"), shape=(n_elem,), dtype="f16", ) tensors = [] for s in sorted({src_sip, dst_sip}): rt.ahbm.set_device(s) t = rt.zeros( (N_CUBES, N_PES * N_ELEM), dtype="f16", dp=dp, name=f"sip{s}", ) t.copy_(rt.from_numpy( np.full((N_CUBES, N_PES * N_ELEM), 1.0, dtype=np.float16), )) tensors.append(t) all_pending = [] for tt in tensors: pending = rt.launch( "diag_ipcq_kernel", kernel, tt, N_ELEM, _defer_wait=True, ) all_pending.extend(pending) for h, _sip, meta in all_pending: rt.wait(h, _meta=meta) ctx = next(iter(engine._components.values())).ctx src_pe_prefix = f"sip{src_sip}.cube{src_cube}.pe{src_pe}" dst_pe_dma = f"sip{dst_sip}.cube{dst_cube}.pe{dst_pe}.pe_dma" try: ipcq_path = ctx.router.find_path(src_pe_prefix, dst_pe_dma) except Exception as e: ipcq_path = [] print(f" WARN: find_path ipcq failed: {e}") # Snapshot DST PE_IPCQ qp state at end-of-run so we can see what # peer_head_cache/my_tail looked like (and at which directions). qp_state = _snapshot_qp_state(engine, dst_pe_ipcq_id) return (captured, ctx, ipcq_path, engine, arrival_log, qp_state, recv_log, barrier_log) finally: restore_barrier() restore_recv() restore_arrival() restore() # ── Test entry ─────────────────────────────────────────────────────── @pytest.mark.diagnostic def test_pe_to_pe_diagnostic_h3(): print("\n" + "=" * 78) print(f" Diagnostic: h3 inter-cube horizontal, nbytes={NBYTES}") print(f" src={SRC} dst={DST}") print("=" * 78) # ── RAW scenario print("\n[RAW] tl.load + tl.store (sender pays both legs)") raw_per_pe, raw_ctx, raw_path, raw_engine = _run_raw() print(f" per-PE pe_exec_ns ({len(raw_per_pe)} entries):") src_id = f"sip{SRC[0]}.cube{SRC[1]}.pe{SRC[2]}.pe_cpu" dst_id = f"sip{DST[0]}.cube{DST[1]}.pe{DST[2]}.pe_cpu" for nid in (src_id, dst_id): if nid in raw_per_pe: print(f" {nid:<60s} {raw_per_pe[nid]:.2f} ns <-- key PE") nonzero = {k: v for k, v in raw_per_pe.items() if v > 0.5} if nonzero: print(f" other PEs with pe_exec_ns > 0.5 ns:") for nid, v in sorted(nonzero.items(), key=lambda kv: -kv[1])[:6]: if nid not in (src_id, dst_id): print(f" {nid:<60s} {v:.2f} ns") print(f" max(pe_exec_ns) = " f"{max(raw_per_pe.values()) if raw_per_pe else 0:.2f} ns") if raw_path: br = _path_breakdown(raw_ctx, raw_path, NBYTES) _print_breakdown("RAW sub-txn path (src.pe_dma -> dst.hbm_ctrl)", br) _dump_src_op_records(raw_engine, *SRC, "RAW") # ── IPCQ scenario print("\n[IPCQ] tl.send + tl.recv (recv pays inbound traversal+drain)") (ipcq_per_pe, ipcq_ctx, ipcq_path, ipcq_engine, arrival_log, qp_state, recv_log, barrier_log) = _run_ipcq() print(f"\n [BARRIER LOG] {len(barrier_log)} _execute_kernel entries:") src_id = f"sip{SRC[0]}.cube{SRC[1]}.pe{SRC[2]}.pe_cpu" dst_id = f"sip{DST[0]}.cube{DST[1]}.pe{DST[2]}.pe_cpu" n_skipped = 0 src_entry = None dst_entry = None for e in barrier_log: if e["barrier_skipped"]: n_skipped += 1 if e["node_id"] == src_id: src_entry = e if e["node_id"] == dst_id: dst_entry = e print(f" PEs entering _execute_kernel: {len(barrier_log)}") print(f" PEs that SKIPPED barrier (env.now > target_start): {n_skipped}") if src_entry: print( f" SRC pe ({src_id}): entry_now={src_entry['entry_now']:.2f} " f"target_start={src_entry['target_start_ns']:.2f} " f"skipped={src_entry['barrier_skipped']} " f"late_ns={src_entry['delta_late_ns']:.2f}" ) if dst_entry: print( f" DST pe ({dst_id}): entry_now={dst_entry['entry_now']:.2f} " f"target_start={dst_entry['target_start_ns']:.2f} " f"skipped={dst_entry['barrier_skipped']} " f"late_ns={dst_entry['delta_late_ns']:.2f}" ) # Top 5 latest arrivals sorted_late = sorted( [e for e in barrier_log if e["delta_late_ns"] is not None], key=lambda e: -e["delta_late_ns"], )[:5] print(f" Top 5 latest PE arrivals (positive = barrier missed):") for e in sorted_late: if e["delta_late_ns"] > 0: print( f" {e['node_id']}: late by {e['delta_late_ns']:.2f} ns " f"(entry={e['entry_now']:.2f}, target={e['target_start_ns']:.2f})" ) print(f"\n [RECV LOG on dst pe_ipcq] {len(recv_log)} entries:") for e in recv_log: print( f" {e['phase']:5s} t={e['t']:8.2f} ns " f"dir={e['direction']} " f"peer_head_cache={e['peer_head_cache']} " f"my_tail={e['my_tail']}" ) print(f"\n [META-ARRIVAL LOG on dst pe_ipcq] {len(arrival_log)} arrivals:") for i, e in enumerate(arrival_log): print( f" #{i:2d} t={e['t']:8.2f} ns " f"src=(sip{e['src_sip']},cube{e['src_cube']},pe{e['src_pe']}) " f"dir={e['src_direction']} " f"sender_seq={e['sender_seq']} " f"matched_dir={e['matched_direction']} " f"nbytes={e['nbytes']}" ) for d, ph in e["peer_head_cache_before"].items(): mt = e["my_tail_before"][d] if ph != 0 or mt != 0 or d == e["matched_direction"]: print( f" before: dir={d} peer_head_cache={ph} my_tail={mt}" ) print(f"\n [QP STATE END-OF-RUN on dst pe_ipcq]:") for d, st in qp_state.items(): print( f" dir={d} peer_head_cache={st['peer_head_cache']} " f"my_tail={st['my_tail']} rx_range=[{st['rx_range'][0]}..." f"{st['rx_range'][1]}) n_slots={st['n_slots']} " f"slot_size={st['slot_size']}" ) print(f" per-PE pe_exec_ns ({len(ipcq_per_pe)} entries):") for nid in (src_id, dst_id): if nid in ipcq_per_pe: print(f" {nid:<60s} {ipcq_per_pe[nid]:.2f} ns <-- key PE") nonzero = {k: v for k, v in ipcq_per_pe.items() if v > 0.5} if nonzero: print(f" other PEs with pe_exec_ns > 0.5 ns:") for nid, v in sorted(nonzero.items(), key=lambda kv: -kv[1])[:6]: if nid not in (src_id, dst_id): print(f" {nid:<60s} {v:.2f} ns") print(f" max(pe_exec_ns) = " f"{max(ipcq_per_pe.values()) if ipcq_per_pe else 0:.2f} ns") if ipcq_path: br = _path_breakdown(ipcq_ctx, ipcq_path, NBYTES) _print_breakdown("IPCQ sub-txn path (src.pe_dma -> peer.pe_dma)", br) _dump_src_op_records(ipcq_engine, *SRC, "IPCQ") _dump_src_op_records(ipcq_engine, *DST, "IPCQ DST") # ── Credit-return path analysis (where the missing IPCQ "ack" lives) print("\n" + "-" * 78) print("Credit-return path (current modeling)") print("-" * 78) src_pe_prefix = f"sip{SRC[0]}.cube{SRC[1]}.pe{SRC[2]}" dst_pe_prefix = f"sip{DST[0]}.cube{DST[1]}.pe{DST[2]}" # PE_IPCQ._credit_latency_ns calls # ctx.router.find_path(self._pe_prefix, peer_pe_prefix) # where the *destination* lacks the ".pe_dma" suffix. find_path() # only auto-appends to the source, so this raises -> the except # clause silently returns 0.0. Effectively credit latency = 0. try: ipcq_ctx.router.find_path(dst_pe_prefix, src_pe_prefix) bug_caught = False except Exception as e: bug_caught = True print(f" CONFIRMED BUG in _credit_latency_ns: dest lacks '.pe_dma' " f"-> find_path raises -> caught exception -> returns 0.0") print(f" Error: {e}") # The intended credit path is recv -> sender (reverse data direction) try: credit_path = ipcq_ctx.router.find_path( dst_pe_prefix, f"{src_pe_prefix}.pe_dma", ) except Exception as e: credit_path = [] print(f" WARN: corrected find_path credit failed: {e}") if credit_path: credit_size = 16 # PE_IPCQ default _credit_size_bytes # Today's modeling: drain only, 16 bytes -> ~0.125 ns cur = ipcq_ctx.compute_drain_ns(credit_path, credit_size) # Proposed modeling: full path latency (edges + node overhead + drain) proposed = ipcq_ctx.compute_path_latency_ns(credit_path, credit_size) print(f" credit path nodes = {len(credit_path)} (recv -> sender)") for nid in credit_path[:6]: print(f" {nid}") if len(credit_path) > 6: print(f" ... {len(credit_path) - 6} more nodes") br = _path_breakdown(ipcq_ctx, credit_path, credit_size) print(f" edge_total_ns = {br['edge_total_ns']:.2f}") print(f" overhead_total_ns = {br['overhead_total_ns']:.2f}") print(f" drain_ns(16 bytes) = {br['drain_ns']:.2f}") print(f" CURRENT _credit_latency_ns (drain only) = {cur:.3f} ns") print(f" PROPOSED (compute_path_latency_ns) = {proposed:.2f} ns") print(f" delta = {proposed - cur:+.2f} ns") # ── Comparison summary print("\n" + "-" * 78) print("Summary") print("-" * 78) raw_max = max(raw_per_pe.values()) if raw_per_pe else 0.0 ipcq_max = max(ipcq_per_pe.values()) if ipcq_per_pe else 0.0 print(f" RAW max(pe_exec_ns) = {raw_max:.2f} ns") print(f" IPCQ max(pe_exec_ns) (current) = {ipcq_max:.2f} ns") print(f" delta (RAW - IPCQ current) = {raw_max - ipcq_max:+.2f} ns") if credit_path: ipcq_with_credit = ipcq_max + (proposed - cur) print( f" IPCQ projected w/ blocking credit + full path overhead " f"= {ipcq_with_credit:.2f} ns" ) print( f" delta (RAW - IPCQ projected) = " f"{raw_max - ipcq_with_credit:+.2f} ns " f"(<= 0 means IPCQ >= RAW)" ) # No assertions — this is observational. assert raw_per_pe, "no RAW pe_exec_ns recorded" assert ipcq_per_pe, "no IPCQ pe_exec_ns recorded"