diff --git a/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.csv b/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.csv index c41cccf..2496d4e 100644 --- a/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.csv +++ b/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.csv @@ -1,12 +1,12 @@ buffer_kind,sip_topology,n_sips,n_elem,bytes_per_pe,latency_ns -hbm,torus_2d,6,128,256,2002.0399999999827 -hbm,torus_2d,6,1024,2048,3541.0399999999827 -hbm,torus_2d,6,8192,16384,15889.03999999999 -hbm,torus_2d,6,32768,65536,58225.03999999998 -sram,torus_2d,6,128,256,1762.0399999999827 -sram,torus_2d,6,1024,2048,2293.0399999999827 -sram,torus_2d,6,8192,16384,6577.039999999986 -sram,torus_2d,6,32768,65536,21265.03999999992 +hbm,torus_2d,6,128,256,1858.0399999999827 +hbm,torus_2d,6,1024,2048,2389.0399999999827 +hbm,torus_2d,6,8192,16384,6673.039999999986 +hbm,torus_2d,6,32768,65536,21361.03999999992 +sram,torus_2d,6,128,256,1774.0399999999827 +sram,torus_2d,6,1024,2048,2389.0399999999827 +sram,torus_2d,6,8192,16384,7345.039999999986 +sram,torus_2d,6,32768,65536,24337.039999999935 tcm,torus_2d,6,128,256,1678.0399999999827 tcm,torus_2d,6,1024,2048,1957.0399999999827 tcm,torus_2d,6,8192,16384,4225.039999999986 diff --git a/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.png b/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.png index 194b975..3eb62b8 100644 Binary files a/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.png and b/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.png differ diff --git a/src/kernbench/common/ipcq_types.py b/src/kernbench/common/ipcq_types.py index fa499fc..200e138 100644 --- a/src/kernbench/common/ipcq_types.py +++ b/src/kernbench/common/ipcq_types.py @@ -135,6 +135,13 @@ class IpcqRecvCmd: "return_slot" — return slot address as-is (default, zero-copy). Kernel uses the slot memory directly. "copy_to_dst" — copy slot data to dst_addr, then return. + + ``consume`` (DIAGNOSTIC ONLY): when False, recv still blocks until the + payload lands in the slot, but skips the slot-read latency charge + (slot-IO + PE↔bank fabric drain for SRAM/HBM tiers). This exists + solely so the pe2pe overview plot can compare apples-to-apples + against tl.store (a one-sided write that pays no read on DST). Real + kernels always need the data they receive — leave this True. """ direction: str | None # None → round-robin (weak fairness, D4) @@ -146,6 +153,7 @@ class IpcqRecvCmd: dst_space: str = "" # used only when recv_mode == "copy_to_dst" blocking: bool = True data_op: bool = True + consume: bool = True # DIAGNOSTIC: see docstring # ── D12: IpcqDmaToken (PE_IPCQ → PE_DMA, vc_comm) ─────────────────── diff --git a/src/kernbench/components/builtin/pe_dma.py b/src/kernbench/components/builtin/pe_dma.py index 35fca81..7e11b8f 100644 --- a/src/kernbench/components/builtin/pe_dma.py +++ b/src/kernbench/components/builtin/pe_dma.py @@ -222,10 +222,24 @@ class PeDmaComponent(PeEngineBase): # ADR-0023 D9.7: charge IPCQ slot-WRITE latency against the # backing-memory tier (tcm/sram/hbm) before the atomic block. # Must come BEFORE the atomic write→IpcqMetaArrival pair (I6). + # SRAM/HBM also pay a PE_DMA→bank fabric drain (slot lives on + # the cube NoC); TCM is per-PE local and skips this hop. from kernbench.common.ipcq_types import slot_io_latency_ns - slot_write_ns = slot_io_latency_ns( - token.dst_endpoint.buffer_kind, token.nbytes, - ) + buffer_kind = token.dst_endpoint.buffer_kind + if buffer_kind in ("sram", "hbm") and self.ctx is not None: + cube_prefix = self._pe_prefix.rsplit(".", 1)[0] + bank_node = ( + f"{cube_prefix}.sram" if buffer_kind == "sram" + else f"{cube_prefix}.hbm_ctrl" + ) + try: + path = self.ctx.router.find_path(self._pe_prefix, bank_node) + bank_drain_ns = self.ctx.compute_drain_ns(path, token.nbytes) + if bank_drain_ns > 0: + yield env.timeout(bank_drain_ns) + except Exception: + pass + slot_write_ns = slot_io_latency_ns(buffer_kind, token.nbytes) if slot_write_ns > 0: yield env.timeout(slot_write_ns) diff --git a/src/kernbench/components/builtin/pe_ipcq.py b/src/kernbench/components/builtin/pe_ipcq.py index 43a456c..d2c96a5 100644 --- a/src/kernbench/components/builtin/pe_ipcq.py +++ b/src/kernbench/components/builtin/pe_ipcq.py @@ -332,12 +332,37 @@ class PeIpcqComponent(ComponentBase): # ADR-0023 D9.7: charge IPCQ slot-READ latency against the # backing-memory tier (tcm/sram/hbm). Recv blocks for the # kernel-side slot consume; pe_exec_ns reflects this cost. + # SRAM/HBM live on the cube NoC behind a router-attached link, + # so reading a slot also pays a PE→bank fabric drain. TCM is + # per-PE local and skips this hop. + # + # cmd.consume is a DIAGNOSTIC flag (default True). When False, + # the read charges below are skipped — used only by the pe2pe + # overview plot for an apples-to-apples comparison against + # tl.store (one-sided write, no read on DST). Real kernels + # always consume; this branch must not be exercised in + # production code paths. from kernbench.common.ipcq_types import slot_io_latency_ns - slot_read_ns = slot_io_latency_ns( - self._buffer_kind, req.result_data.get("nbytes", 0), - ) - if slot_read_ns > 0: - yield env.timeout(slot_read_ns) + nbytes = req.result_data.get("nbytes", 0) + if cmd.consume: + if self._buffer_kind in ("sram", "hbm") and self.ctx is not None: + cube_prefix = self._pe_prefix.rsplit(".", 1)[0] + bank_node = ( + f"{cube_prefix}.sram" if self._buffer_kind == "sram" + else f"{cube_prefix}.hbm_ctrl" + ) + try: + path = self.ctx.router.find_path( + self._pe_prefix, bank_node, + ) + bank_drain_ns = self.ctx.compute_drain_ns(path, nbytes) + if bank_drain_ns > 0: + yield env.timeout(bank_drain_ns) + except Exception: + pass + slot_read_ns = slot_io_latency_ns(self._buffer_kind, nbytes) + if slot_read_ns > 0: + yield env.timeout(slot_read_ns) # Diagnostics trace (D14) from kernbench.ccl import diagnostics diff --git a/tests/test_ipcq_buffer_kind_latency.py b/tests/test_ipcq_buffer_kind_latency.py index 027a107..20e26d7 100644 --- a/tests/test_ipcq_buffer_kind_latency.py +++ b/tests/test_ipcq_buffer_kind_latency.py @@ -43,20 +43,30 @@ from tests.test_allreduce_multidevice import ( ) -# Expected per-tier BW + overhead (Phase 2 will encode this in -# pe_ipcq.py). Mirrors topology.yaml component values. -_EXPECTED_BW = { - "tcm": (512.0, 0.0), - "sram": (512.0, 2.0), - "hbm": (256.0, 6.0), +# Expected per-tier (slot intrinsic BW, fixed overhead, PE↔bank hop BW). +# Slot intrinsic mirrors _BUFFER_KIND_BW in src/kernbench/common/ipcq_types.py. +# PE↔bank hop reflects topology.yaml link BWs: +# - TCM is per-PE local → no hop, encoded as inf. +# - SRAM bank sits on cube NoC behind sram_to_router_bw_gbs = 128 GB/s. +# - HBM ctrl sits on cube NoC behind hbm_to_router_bw_gbs = 256 GB/s. +_EXPECTED_TIER = { + "tcm": {"slot_bw_gbs": 512.0, "overhead_ns": 0.0, "bank_hop_bw_gbs": float("inf")}, + "sram": {"slot_bw_gbs": 512.0, "overhead_ns": 2.0, "bank_hop_bw_gbs": 128.0}, + "hbm": {"slot_bw_gbs": 256.0, "overhead_ns": 6.0, "bank_hop_bw_gbs": 256.0}, } def _expected_slot_io_ns(buffer_kind: str, nbytes: int) -> float: - """Per-access latency the model is expected to add (write OR read).""" - bw_gbs, overhead_ns = _EXPECTED_BW[buffer_kind] - # 1 GB/s = 1 byte/ns - return nbytes / bw_gbs + overhead_ns + """Per-access latency the model is expected to add (write OR read). + + Includes the PE↔bank fabric hop for non-TCM tiers — SRAM and HBM + live on the cube NoC behind a router-attached link, so each slot + access pays a fabric drain in addition to the intrinsic slot-IO. + """ + tier = _EXPECTED_TIER[buffer_kind] + bank_hop_ns = nbytes / tier["bank_hop_bw_gbs"] + slot_io_ns = nbytes / tier["slot_bw_gbs"] + tier["overhead_ns"] + return bank_hop_ns + slot_io_ns def _run_torus_allreduce( @@ -114,12 +124,19 @@ def _run_torus_allreduce( # ── Phase 1 assertions ─────────────────────────────────────────────── -def test_slot_write_latency_orders_tcm_sram_hbm(tmp_path): - """tcm < sram < hbm at 8192 B per send. +def test_slot_write_latency_orders_tcm_hbm_sram(tmp_path): + """tcm < hbm < sram at 8192 B per send. - Pre-Phase-2: all three return the same pe_exec_ns and this - assertion fails. Post-Phase-2: the per-tier BW + overhead make - hbm visibly slower than sram, which is slower than tcm. + The ordering is set by the topology link BWs, NOT the intrinsic slot + cell rates: SRAM and HBM both live on the cube NoC behind a router + link, and SRAM's link (128 GB/s) is the narrowest in the system — + narrower than HBM's (256 GB/s). So once the PE↔bank hop is charged, + SRAM ends up the slowest tier even though its slot cell array has + the same intrinsic BW as TCM. + + Pre-fix model misses the PE↔bank hop entirely → assertion FAILS + (today's ordering is tcm < sram < hbm). Post-fix model includes the + hop → assertion PASSES. """ n_elem = 4096 # 8192 B per slot lat_tcm = _run_torus_allreduce(tmp_path, buffer_kind="tcm", n_elem=n_elem) @@ -130,21 +147,22 @@ def test_slot_write_latency_orders_tcm_sram_hbm(tmp_path): exp_tcm = 2 * _expected_slot_io_ns("tcm", n_elem * 2) exp_sram = 2 * _expected_slot_io_ns("sram", n_elem * 2) exp_hbm = 2 * _expected_slot_io_ns("hbm", n_elem * 2) - # Floor margin: 50% of the raw expected per-access delta — lets Phase 2 - # implementation choose to charge only one side without breaking the test, - # but still requires a clearly observable gap. - margin_sram_tcm = 0.5 * (exp_sram - exp_tcm) - margin_hbm_sram = 0.5 * (exp_hbm - exp_sram) + # Floor margin: 50% of the raw expected per-access delta — lets the + # implementation choose to charge only one side without breaking the + # test, but still requires a clearly observable gap. + margin_hbm_tcm = 0.5 * (exp_hbm - exp_tcm) + margin_sram_hbm = 0.5 * (exp_sram - exp_hbm) - assert lat_sram > lat_tcm + margin_sram_tcm, ( - f"sram should be slower than tcm by ≥ {margin_sram_tcm:.1f} ns " - f"per allreduce, got sram={lat_sram:.1f} tcm={lat_tcm:.1f} " - f"(delta={lat_sram - lat_tcm:.1f})" + assert lat_hbm > lat_tcm + margin_hbm_tcm, ( + f"hbm should be slower than tcm by ≥ {margin_hbm_tcm:.1f} ns " + f"per allreduce, got hbm={lat_hbm:.1f} tcm={lat_tcm:.1f} " + f"(delta={lat_hbm - lat_tcm:.1f})" ) - assert lat_hbm > lat_sram + margin_hbm_sram, ( - f"hbm should be slower than sram by ≥ {margin_hbm_sram:.1f} ns " - f"per allreduce, got hbm={lat_hbm:.1f} sram={lat_sram:.1f} " - f"(delta={lat_hbm - lat_sram:.1f})" + assert lat_sram > lat_hbm + margin_sram_hbm, ( + f"sram should be slower than hbm by ≥ {margin_sram_hbm:.1f} ns " + f"per allreduce (sram bank link 128 GB/s is narrower than hbm " + f"link 256 GB/s), got sram={lat_sram:.1f} hbm={lat_hbm:.1f} " + f"(delta={lat_sram - lat_hbm:.1f})" ) diff --git a/tests/test_ipcq_buffer_kind_locations.py b/tests/test_ipcq_buffer_kind_locations.py new file mode 100644 index 0000000..e273b7a --- /dev/null +++ b/tests/test_ipcq_buffer_kind_locations.py @@ -0,0 +1,208 @@ +"""Phase 1 micro-tests for IPCQ slot-memory PHYSICAL placement. + +The current model in ``_BUFFER_KIND_BW`` (src/kernbench/common/ipcq_types.py) +charges only an intrinsic-memory term for IPCQ slot read/write:: + + TCM: nbytes/512 + 0 + SRAM: nbytes/512 + 2 + HBM: nbytes/256 + 6 + +This treats SRAM and HBM as if they were per-PE local. The topology +declares the opposite — both live on the cube NoC, behind their own +router-attached link:: + + topology.yaml:130 sram_to_router_bw_gbs: 128.0 + topology.yaml:129 hbm_to_router_bw_gbs: 256.0 + +So a correct model must charge a PE→bank fabric drain for SRAM and HBM +on both ``tl.send`` (writer landing bytes into the cube SRAM/HBM bank +via PE_DMA → router → bank) and ``tl.recv`` (reader pulling bytes back +across the same link). TCM stays free of that hop because it is +genuinely per-PE local. + +The three tests below run the existing torus_2d 6-SIP allreduce harness +with ``buffer_kind`` flipped between tcm/sram/hbm and assert invariants +that the post-fix model must satisfy. They EXPECT TO FAIL today because +the simulator under-charges SRAM and HBM by skipping the PE↔bank hop. + +Phase 2 will edit: + - src/kernbench/components/builtin/pe_ipcq.py (_handle_recv: add + compute_drain_ns(pe→bank, nbytes) for sram/hbm) + - src/kernbench/components/builtin/pe_dma.py (_handle_ipcq_inbound: + add second-leg drain for sram/hbm-destined slots) + +Tests must NEVER be weakened to make Phase 2 pass — invariants below +follow from physics (link BW × payload), so any model reflecting the +topology will satisfy them by construction. +""" +from __future__ import annotations + +from pathlib import Path + +import pytest +import yaml + +from kernbench.runtime_api.context import RuntimeContext +from kernbench.runtime_api.types import DeviceSelector +from kernbench.sim_engine.engine import GraphEngine +from kernbench.topology.builder import resolve_topology + +from tests.test_allreduce_multidevice import ( + _write_temp_configs, + run_allreduce, +) + + +def _run_allreduce_with_buffer_kind( + tmp_path: Path, *, buffer_kind: str, n_elem: int, +) -> float: + """Run one torus_2d 6-SIP allreduce with the given buffer_kind and + return critical-path pe_exec_ns (max across all PEs). + + Mirrors the sweep harness in test_allreduce_buffer_kind_sweep.py + so the assertions below compare apples-to-apples against that PNG. + """ + sub = tmp_path / f"{buffer_kind}_{n_elem}" + sub.mkdir() + topo_path, ccl_path = _write_temp_configs( + sub, + sip_topology="torus_2d", + n_sips=6, + algorithm="intercube_allreduce", + sip_w=3, sip_h=2, + n_elem_override=n_elem, + ) + + with open(ccl_path) as f: + ccl_cfg = yaml.safe_load(f) + ccl_cfg.setdefault("defaults", {})["buffer_kind"] = buffer_kind + ccl_cfg.setdefault("algorithms", {}).setdefault( + "intercube_allreduce", {}, + )["buffer_kind"] = buffer_kind + with open(ccl_path, "w") as f: + yaml.dump(ccl_cfg, f, default_flow_style=False) + + topo = resolve_topology(topo_path) + engine = GraphEngine(topo.topology_obj, enable_data=True) + spec = topo.topology_obj.spec + + with RuntimeContext( + engine=engine, + target_device=DeviceSelector("all"), + correlation_id=f"loc_{buffer_kind}_{n_elem}", + spec=spec, + ) as ctx: + result = run_allreduce( + ctx, engine, spec, + algorithm="intercube_allreduce", ccl_yaml=ccl_path, + ) + assert result["ok_cubes"] > 0, "allreduce did not validate" + + pe_exec_vals = [ + float(tr.get("pe_exec_ns", 0.0) or 0.0) + for _, (_, tr) in engine._results.items() + if isinstance(tr, dict) + ] + return max(pe_exec_vals) if pe_exec_vals else 0.0 + + +# ── Phase 1 assertions ─────────────────────────────────────────────── + + +def test_sram_meaningfully_slower_than_tcm_at_large_payload(tmp_path): + """At 32 KB / PE the SRAM-backed allreduce must take meaningfully + longer than the TCM-backed one because every IPCQ slot access goes + through the 128 GB/s SRAM↔router link, while TCM stays per-PE local. + + Floor justification (physics, not implementation): + Per-IPCQ-roundtrip the SRAM tier adds 2 × nbytes/128 ns over TCM + (one PE→SRAM hop on send-inbound, one SRAM→PE hop on recv). + At 32 KB: 2 × 32768/128 = 512 ns added per slot exchange. + With ≥ 10 critical-path exchanges in a 6-SIP torus_2d allreduce + this is ≥ 5_120 ns. The threshold below is half that to leave + room for differing critical-path counting. + + Pre-Phase-2: gap is constant 48 ns (just the SRAM overhead × 24 + slot accesses); test FAILS. + Post-Phase-2: gap scales with payload; test PASSES. + """ + n_elem = 16384 # 32 KB / PE + lat_tcm = _run_allreduce_with_buffer_kind( + tmp_path, buffer_kind="tcm", n_elem=n_elem, + ) + lat_sram = _run_allreduce_with_buffer_kind( + tmp_path, buffer_kind="sram", n_elem=n_elem, + ) + delta = lat_sram - lat_tcm + THRESHOLD_NS = 2_500.0 + assert delta > THRESHOLD_NS, ( + f"SRAM should be ≥ {THRESHOLD_NS:.0f} ns slower than TCM at 32 KB " + f"because each IPCQ access pays a 128 GB/s PE↔SRAM hop. " + f"got tcm={lat_tcm:.1f} sram={lat_sram:.1f} delta={delta:.1f} ns" + ) + + +def test_sram_tcm_gap_scales_with_payload(tmp_path): + """The SRAM-vs-TCM gap must grow roughly linearly with payload size. + + Pre-Phase-2: the only difference between TCM and SRAM is the SRAM + per-access ``overhead_ns = 2``, which does NOT scale with payload — + so the gap is the same constant 48 ns at 8 KB and at 32 KB. Ratio = 1. + + Post-Phase-2: the dominant term is 2 × nbytes/128 (PE↔SRAM hop on + write+read) which IS linear in payload. Going 8 KB → 32 KB (4×) + should produce a gap roughly 4× larger. + + Threshold below is 3× to keep slack for fixed-overhead effects. + """ + lat_tcm_small = _run_allreduce_with_buffer_kind( + tmp_path, buffer_kind="tcm", n_elem=4096, # 8 KB + ) + lat_sram_small = _run_allreduce_with_buffer_kind( + tmp_path, buffer_kind="sram", n_elem=4096, + ) + lat_tcm_large = _run_allreduce_with_buffer_kind( + tmp_path, buffer_kind="tcm", n_elem=16384, # 32 KB + ) + lat_sram_large = _run_allreduce_with_buffer_kind( + tmp_path, buffer_kind="sram", n_elem=16384, + ) + + gap_small = lat_sram_small - lat_tcm_small + gap_large = lat_sram_large - lat_tcm_large + + assert gap_small > 0, ( + f"sanity: SRAM should never be FASTER than TCM, " + f"got gap_small={gap_small:.1f} ns" + ) + assert gap_large > 3.0 * gap_small, ( + f"4× payload should produce ≥3× SRAM/TCM gap (linear in nbytes " + f"because of the 128 GB/s PE↔SRAM hop). " + f"got gap_small={gap_small:.1f} (8KB), gap_large={gap_large:.1f} " + f"(32KB), ratio={gap_large / max(gap_small, 1e-9):.2f}" + ) + + +def test_hbm_pe_hop_charged_at_large_payload(tmp_path): + """At 32 KB / PE the HBM-vs-TCM gap must exceed the gap that comes + purely from HBM's 256 GB/s intrinsic slot-IO disadvantage. + + Pre-Phase-2 the entire HBM/TCM gap is just the slot-IO term + (24 × (nbytes/512 + 6) ≈ 1_700 ns at 32 KB). Post-fix adds another + 24 × (nbytes/256) × 2 ≈ 6_144 ns from the PE↔HBM hop on send and + recv, so the total HBM/TCM gap should clearly clear 4 µs. + """ + n_elem = 16384 # 32 KB / PE + lat_tcm = _run_allreduce_with_buffer_kind( + tmp_path, buffer_kind="tcm", n_elem=n_elem, + ) + lat_hbm = _run_allreduce_with_buffer_kind( + tmp_path, buffer_kind="hbm", n_elem=n_elem, + ) + delta = lat_hbm - lat_tcm + THRESHOLD_NS = 4_000.0 + assert delta > THRESHOLD_NS, ( + f"HBM should be ≥ {THRESHOLD_NS:.0f} ns slower than TCM at 32 KB " + f"once the 256 GB/s PE↔HBM hop is charged on each IPCQ access. " + f"got tcm={lat_tcm:.1f} hbm={lat_hbm:.1f} delta={delta:.1f} ns" + )