ADR-0023 D9.7+: charge PE↔bank fabric hop for SRAM/HBM IPCQ slots

Cube SRAM and HBM live on the cube NoC behind router-attached links (sram_to_router_bw_gbs=128, hbm_to_router_bw_gbs=256). Previously the slot-IO model treated them as if they were per-PE local, so the buffer_kind sweep showed TCM ≈ SRAM at 64 KB / PE. pe_ipcq._handle_recv and pe_dma._handle_ipcq_inbound now charge a PE→bank compute_drain_ns on top of the intrinsic slot-IO for SRAM/HBM. TCM stays free of this hop. Adds an internal IpcqRecvCmd.consume field that gates the recv-side hop+slot-IO charges (used by a follow-up diagnostic API; default True keeps current behavior). Post-fix at 64 KB / PE: TCM 12.0 µs < HBM 21.4 µs < SRAM 24.3 µs. SRAM is slowest because its 128 GB/s bank link is the narrowest in the system — narrower than HBM's 256 GB/s. The existing ordering test is rewritten from tcm<sram<hbm to tcm<hbm<sram and a new test_ipcq_buffer_kind_locations adds 3 invariants on the gap. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 18:20:28 -07:00
parent 533e699299
commit 9c129d6131
7 changed files with 317 additions and 44 deletions
@@ -1,12 +1,12 @@
 buffer_kind,sip_topology,n_sips,n_elem,bytes_per_pe,latency_ns
-hbm,torus_2d,6,128,256,2002.0399999999827
+hbm,torus_2d,6,128,256,1858.0399999999827
-hbm,torus_2d,6,1024,2048,3541.0399999999827
+hbm,torus_2d,6,1024,2048,2389.0399999999827
-hbm,torus_2d,6,8192,16384,15889.03999999999
+hbm,torus_2d,6,8192,16384,6673.039999999986
-hbm,torus_2d,6,32768,65536,58225.03999999998
+hbm,torus_2d,6,32768,65536,21361.03999999992
-sram,torus_2d,6,128,256,1762.0399999999827
+sram,torus_2d,6,128,256,1774.0399999999827
-sram,torus_2d,6,1024,2048,2293.0399999999827
+sram,torus_2d,6,1024,2048,2389.0399999999827
-sram,torus_2d,6,8192,16384,6577.039999999986
+sram,torus_2d,6,8192,16384,7345.039999999986
-sram,torus_2d,6,32768,65536,21265.03999999992
+sram,torus_2d,6,32768,65536,24337.039999999935
 tcm,torus_2d,6,128,256,1678.0399999999827
 tcm,torus_2d,6,1024,2048,1957.0399999999827
 tcm,torus_2d,6,8192,16384,4225.039999999986
@@ -135,6 +135,13 @@ class IpcqRecvCmd:
        "return_slot" — return slot address as-is (default, zero-copy).
                        Kernel uses the slot memory directly.
        "copy_to_dst" — copy slot data to dst_addr, then return.
    ``consume`` (DIAGNOSTIC ONLY): when False, recv still blocks until the
    payload lands in the slot, but skips the slot-read latency charge
    (slot-IO + PE↔bank fabric drain for SRAM/HBM tiers). This exists
    solely so the pe2pe overview plot can compare apples-to-apples
    against tl.store (a one-sided write that pays no read on DST). Real
    kernels always need the data they receive — leave this True.
    """
    direction: str | None        # None → round-robin (weak fairness, D4)
@@ -146,6 +153,7 @@ class IpcqRecvCmd:
    dst_space: str = ""          # used only when recv_mode == "copy_to_dst"
    blocking: bool = True
    data_op: bool = True
    consume: bool = True         # DIAGNOSTIC: see docstring
 # ── D12: IpcqDmaToken (PE_IPCQ → PE_DMA, vc_comm) ───────────────────
@@ -222,10 +222,24 @@ class PeDmaComponent(PeEngineBase):
        # ADR-0023 D9.7: charge IPCQ slot-WRITE latency against the
        # backing-memory tier (tcm/sram/hbm) before the atomic block.
        # Must come BEFORE the atomic write→IpcqMetaArrival pair (I6).
        # SRAM/HBM also pay a PE_DMA→bank fabric drain (slot lives on
        # the cube NoC); TCM is per-PE local and skips this hop.
        from kernbench.common.ipcq_types import slot_io_latency_ns
-        slot_write_ns = slot_io_latency_ns(
+        buffer_kind = token.dst_endpoint.buffer_kind
-            token.dst_endpoint.buffer_kind, token.nbytes,
+        if buffer_kind in ("sram", "hbm") and self.ctx is not None:
-        )
+            cube_prefix = self._pe_prefix.rsplit(".", 1)[0]
            bank_node = (
                f"{cube_prefix}.sram" if buffer_kind == "sram"
                else f"{cube_prefix}.hbm_ctrl"
            )
            try:
                path = self.ctx.router.find_path(self._pe_prefix, bank_node)
                bank_drain_ns = self.ctx.compute_drain_ns(path, token.nbytes)
                if bank_drain_ns > 0:
                    yield env.timeout(bank_drain_ns)
            except Exception:
                pass
        slot_write_ns = slot_io_latency_ns(buffer_kind, token.nbytes)
        if slot_write_ns > 0:
            yield env.timeout(slot_write_ns)
@@ -332,12 +332,37 @@ class PeIpcqComponent(ComponentBase):
        # ADR-0023 D9.7: charge IPCQ slot-READ latency against the
        # backing-memory tier (tcm/sram/hbm). Recv blocks for the
        # kernel-side slot consume; pe_exec_ns reflects this cost.
        # SRAM/HBM live on the cube NoC behind a router-attached link,
        # so reading a slot also pays a PE→bank fabric drain. TCM is
        # per-PE local and skips this hop.
        #
        # cmd.consume is a DIAGNOSTIC flag (default True). When False,
        # the read charges below are skipped — used only by the pe2pe
        # overview plot for an apples-to-apples comparison against
        # tl.store (one-sided write, no read on DST). Real kernels
        # always consume; this branch must not be exercised in
        # production code paths.
        from kernbench.common.ipcq_types import slot_io_latency_ns
-        slot_read_ns = slot_io_latency_ns(
+        nbytes = req.result_data.get("nbytes", 0)
-            self._buffer_kind, req.result_data.get("nbytes", 0),
+        if cmd.consume:
-        )
+            if self._buffer_kind in ("sram", "hbm") and self.ctx is not None:
-        if slot_read_ns > 0:
+                cube_prefix = self._pe_prefix.rsplit(".", 1)[0]
-            yield env.timeout(slot_read_ns)
+                bank_node = (
                    f"{cube_prefix}.sram" if self._buffer_kind == "sram"
                    else f"{cube_prefix}.hbm_ctrl"
                )
                try:
                    path = self.ctx.router.find_path(
                        self._pe_prefix, bank_node,
                    )
                    bank_drain_ns = self.ctx.compute_drain_ns(path, nbytes)
                    if bank_drain_ns > 0:
                        yield env.timeout(bank_drain_ns)
                except Exception:
                    pass
            slot_read_ns = slot_io_latency_ns(self._buffer_kind, nbytes)
            if slot_read_ns > 0:
                yield env.timeout(slot_read_ns)
        # Diagnostics trace (D14)
        from kernbench.ccl import diagnostics
@@ -43,20 +43,30 @@ from tests.test_allreduce_multidevice import (
 )
-# Expected per-tier BW + overhead (Phase 2 will encode this in
+# Expected per-tier (slot intrinsic BW, fixed overhead, PE↔bank hop BW).
-# pe_ipcq.py). Mirrors topology.yaml component values.
+# Slot intrinsic mirrors _BUFFER_KIND_BW in src/kernbench/common/ipcq_types.py.
-_EXPECTED_BW = {
+# PE↔bank hop reflects topology.yaml link BWs:
-    "tcm":  (512.0, 0.0),
+#   - TCM is per-PE local → no hop, encoded as inf.
-    "sram": (512.0, 2.0),
+#   - SRAM bank sits on cube NoC behind sram_to_router_bw_gbs = 128 GB/s.
-    "hbm":  (256.0, 6.0),
+#   - HBM ctrl sits on cube NoC behind hbm_to_router_bw_gbs  = 256 GB/s.
 _EXPECTED_TIER = {
    "tcm":  {"slot_bw_gbs": 512.0, "overhead_ns": 0.0, "bank_hop_bw_gbs": float("inf")},
    "sram": {"slot_bw_gbs": 512.0, "overhead_ns": 2.0, "bank_hop_bw_gbs": 128.0},
    "hbm":  {"slot_bw_gbs": 256.0, "overhead_ns": 6.0, "bank_hop_bw_gbs": 256.0},
 }
 def _expected_slot_io_ns(buffer_kind: str, nbytes: int) -> float:
-    """Per-access latency the model is expected to add (write OR read)."""
+    """Per-access latency the model is expected to add (write OR read).
-    bw_gbs, overhead_ns = _EXPECTED_BW[buffer_kind]
+
-    # 1 GB/s = 1 byte/ns
+    Includes the PE↔bank fabric hop for non-TCM tiers — SRAM and HBM
-    return nbytes / bw_gbs + overhead_ns
+    live on the cube NoC behind a router-attached link, so each slot
    access pays a fabric drain in addition to the intrinsic slot-IO.
    """
    tier = _EXPECTED_TIER[buffer_kind]
    bank_hop_ns = nbytes / tier["bank_hop_bw_gbs"]
    slot_io_ns = nbytes / tier["slot_bw_gbs"] + tier["overhead_ns"]
    return bank_hop_ns + slot_io_ns
 def _run_torus_allreduce(
@@ -114,12 +124,19 @@ def _run_torus_allreduce(
 # ── Phase 1 assertions ───────────────────────────────────────────────
-def test_slot_write_latency_orders_tcm_sram_hbm(tmp_path):
+def test_slot_write_latency_orders_tcm_hbm_sram(tmp_path):
-    """tcm < sram < hbm at 8192 B per send.
+    """tcm < hbm < sram at 8192 B per send.
-    Pre-Phase-2: all three return the same pe_exec_ns and this
+    The ordering is set by the topology link BWs, NOT the intrinsic slot
-    assertion fails. Post-Phase-2: the per-tier BW + overhead make
+    cell rates: SRAM and HBM both live on the cube NoC behind a router
-    hbm visibly slower than sram, which is slower than tcm.
+    link, and SRAM's link (128 GB/s) is the narrowest in the system —
    narrower than HBM's (256 GB/s). So once the PE↔bank hop is charged,
    SRAM ends up the slowest tier even though its slot cell array has
    the same intrinsic BW as TCM.
    Pre-fix model misses the PE↔bank hop entirely → assertion FAILS
    (today's ordering is tcm < sram < hbm). Post-fix model includes the
    hop → assertion PASSES.
    """
    n_elem = 4096  # 8192 B per slot
    lat_tcm = _run_torus_allreduce(tmp_path, buffer_kind="tcm",  n_elem=n_elem)
@@ -130,21 +147,22 @@ def test_slot_write_latency_orders_tcm_sram_hbm(tmp_path):
    exp_tcm = 2 * _expected_slot_io_ns("tcm",  n_elem * 2)
    exp_sram = 2 * _expected_slot_io_ns("sram", n_elem * 2)
    exp_hbm = 2 * _expected_slot_io_ns("hbm",  n_elem * 2)
-    # Floor margin: 50% of the raw expected per-access delta — lets Phase 2
+    # Floor margin: 50% of the raw expected per-access delta — lets the
-    # implementation choose to charge only one side without breaking the test,
+    # implementation choose to charge only one side without breaking the
-    # but still requires a clearly observable gap.
+    # test, but still requires a clearly observable gap.
-    margin_sram_tcm = 0.5 * (exp_sram - exp_tcm)
+    margin_hbm_tcm = 0.5 * (exp_hbm - exp_tcm)
-    margin_hbm_sram = 0.5 * (exp_hbm - exp_sram)
+    margin_sram_hbm = 0.5 * (exp_sram - exp_hbm)
-    assert lat_sram > lat_tcm + margin_sram_tcm, (
+    assert lat_hbm > lat_tcm + margin_hbm_tcm, (
-        f"sram should be slower than tcm by ≥ {margin_sram_tcm:.1f} ns "
+        f"hbm should be slower than tcm by ≥ {margin_hbm_tcm:.1f} ns "
-        f"per allreduce, got sram={lat_sram:.1f} tcm={lat_tcm:.1f} "
+        f"per allreduce, got hbm={lat_hbm:.1f} tcm={lat_tcm:.1f} "
-        f"(delta={lat_sram - lat_tcm:.1f})"
+        f"(delta={lat_hbm - lat_tcm:.1f})"
    )
-    assert lat_hbm > lat_sram + margin_hbm_sram, (
+    assert lat_sram > lat_hbm + margin_sram_hbm, (
-        f"hbm should be slower than sram by ≥ {margin_hbm_sram:.1f} ns "
+        f"sram should be slower than hbm by ≥ {margin_sram_hbm:.1f} ns "
-        f"per allreduce, got hbm={lat_hbm:.1f} sram={lat_sram:.1f} "
+        f"per allreduce (sram bank link 128 GB/s is narrower than hbm "
-        f"(delta={lat_hbm - lat_sram:.1f})"
+        f"link 256 GB/s), got sram={lat_sram:.1f} hbm={lat_hbm:.1f} "
        f"(delta={lat_sram - lat_hbm:.1f})"
    )
@@ -0,0 +1,208 @@
 """Phase 1 micro-tests for IPCQ slot-memory PHYSICAL placement.
 The current model in ``_BUFFER_KIND_BW`` (src/kernbench/common/ipcq_types.py)
 charges only an intrinsic-memory term for IPCQ slot read/write::
    TCM:  nbytes/512 + 0
    SRAM: nbytes/512 + 2
    HBM:  nbytes/256 + 6
 This treats SRAM and HBM as if they were per-PE local. The topology
 declares the opposite — both live on the cube NoC, behind their own
 router-attached link::
    topology.yaml:130   sram_to_router_bw_gbs: 128.0
    topology.yaml:129   hbm_to_router_bw_gbs:  256.0
 So a correct model must charge a PE→bank fabric drain for SRAM and HBM
 on both ``tl.send`` (writer landing bytes into the cube SRAM/HBM bank
 via PE_DMA → router → bank) and ``tl.recv`` (reader pulling bytes back
 across the same link). TCM stays free of that hop because it is
 genuinely per-PE local.
 The three tests below run the existing torus_2d 6-SIP allreduce harness
 with ``buffer_kind`` flipped between tcm/sram/hbm and assert invariants
 that the post-fix model must satisfy. They EXPECT TO FAIL today because
 the simulator under-charges SRAM and HBM by skipping the PE↔bank hop.
 Phase 2 will edit:
  - src/kernbench/components/builtin/pe_ipcq.py   (_handle_recv: add
        compute_drain_ns(pe→bank, nbytes) for sram/hbm)
  - src/kernbench/components/builtin/pe_dma.py    (_handle_ipcq_inbound:
        add second-leg drain for sram/hbm-destined slots)
 Tests must NEVER be weakened to make Phase 2 pass — invariants below
 follow from physics (link BW × payload), so any model reflecting the
 topology will satisfy them by construction.
 """
 from __future__ import annotations
 from pathlib import Path
 import pytest
 import yaml
 from kernbench.runtime_api.context import RuntimeContext
 from kernbench.runtime_api.types import DeviceSelector
 from kernbench.sim_engine.engine import GraphEngine
 from kernbench.topology.builder import resolve_topology
 from tests.test_allreduce_multidevice import (
    _write_temp_configs,
    run_allreduce,
 )
 def _run_allreduce_with_buffer_kind(
    tmp_path: Path, *, buffer_kind: str, n_elem: int,
 ) -> float:
    """Run one torus_2d 6-SIP allreduce with the given buffer_kind and
    return critical-path pe_exec_ns (max across all PEs).
    Mirrors the sweep harness in test_allreduce_buffer_kind_sweep.py
    so the assertions below compare apples-to-apples against that PNG.
    """
    sub = tmp_path / f"{buffer_kind}_{n_elem}"
    sub.mkdir()
    topo_path, ccl_path = _write_temp_configs(
        sub,
        sip_topology="torus_2d",
        n_sips=6,
        algorithm="intercube_allreduce",
        sip_w=3, sip_h=2,
        n_elem_override=n_elem,
    )
    with open(ccl_path) as f:
        ccl_cfg = yaml.safe_load(f)
    ccl_cfg.setdefault("defaults", {})["buffer_kind"] = buffer_kind
    ccl_cfg.setdefault("algorithms", {}).setdefault(
        "intercube_allreduce", {},
    )["buffer_kind"] = buffer_kind
    with open(ccl_path, "w") as f:
        yaml.dump(ccl_cfg, f, default_flow_style=False)
    topo = resolve_topology(topo_path)
    engine = GraphEngine(topo.topology_obj, enable_data=True)
    spec = topo.topology_obj.spec
    with RuntimeContext(
        engine=engine,
        target_device=DeviceSelector("all"),
        correlation_id=f"loc_{buffer_kind}_{n_elem}",
        spec=spec,
    ) as ctx:
        result = run_allreduce(
            ctx, engine, spec,
            algorithm="intercube_allreduce", ccl_yaml=ccl_path,
        )
        assert result["ok_cubes"] > 0, "allreduce did not validate"
    pe_exec_vals = [
        float(tr.get("pe_exec_ns", 0.0) or 0.0)
        for _, (_, tr) in engine._results.items()
        if isinstance(tr, dict)
    ]
    return max(pe_exec_vals) if pe_exec_vals else 0.0
 # ── Phase 1 assertions ───────────────────────────────────────────────
 def test_sram_meaningfully_slower_than_tcm_at_large_payload(tmp_path):
    """At 32 KB / PE the SRAM-backed allreduce must take meaningfully
    longer than the TCM-backed one because every IPCQ slot access goes
    through the 128 GB/s SRAM↔router link, while TCM stays per-PE local.
    Floor justification (physics, not implementation):
        Per-IPCQ-roundtrip the SRAM tier adds 2 × nbytes/128 ns over TCM
        (one PE→SRAM hop on send-inbound, one SRAM→PE hop on recv).
        At 32 KB:  2 × 32768/128 = 512 ns added per slot exchange.
        With ≥ 10 critical-path exchanges in a 6-SIP torus_2d allreduce
        this is ≥ 5_120 ns. The threshold below is half that to leave
        room for differing critical-path counting.
    Pre-Phase-2: gap is constant 48 ns (just the SRAM overhead × 24
    slot accesses); test FAILS.
    Post-Phase-2: gap scales with payload; test PASSES.
    """
    n_elem = 16384  # 32 KB / PE
    lat_tcm = _run_allreduce_with_buffer_kind(
        tmp_path, buffer_kind="tcm", n_elem=n_elem,
    )
    lat_sram = _run_allreduce_with_buffer_kind(
        tmp_path, buffer_kind="sram", n_elem=n_elem,
    )
    delta = lat_sram - lat_tcm
    THRESHOLD_NS = 2_500.0
    assert delta > THRESHOLD_NS, (
        f"SRAM should be ≥ {THRESHOLD_NS:.0f} ns slower than TCM at 32 KB "
        f"because each IPCQ access pays a 128 GB/s PE↔SRAM hop. "
        f"got tcm={lat_tcm:.1f} sram={lat_sram:.1f} delta={delta:.1f} ns"
    )
 def test_sram_tcm_gap_scales_with_payload(tmp_path):
    """The SRAM-vs-TCM gap must grow roughly linearly with payload size.
    Pre-Phase-2: the only difference between TCM and SRAM is the SRAM
    per-access ``overhead_ns = 2``, which does NOT scale with payload —
    so the gap is the same constant 48 ns at 8 KB and at 32 KB. Ratio = 1.
    Post-Phase-2: the dominant term is 2 × nbytes/128 (PE↔SRAM hop on
    write+read) which IS linear in payload. Going 8 KB → 32 KB (4×)
    should produce a gap roughly 4× larger.
    Threshold below is 3× to keep slack for fixed-overhead effects.
    """
    lat_tcm_small = _run_allreduce_with_buffer_kind(
        tmp_path, buffer_kind="tcm", n_elem=4096,    # 8 KB
    )
    lat_sram_small = _run_allreduce_with_buffer_kind(
        tmp_path, buffer_kind="sram", n_elem=4096,
    )
    lat_tcm_large = _run_allreduce_with_buffer_kind(
        tmp_path, buffer_kind="tcm", n_elem=16384,   # 32 KB
    )
    lat_sram_large = _run_allreduce_with_buffer_kind(
        tmp_path, buffer_kind="sram", n_elem=16384,
    )
    gap_small = lat_sram_small - lat_tcm_small
    gap_large = lat_sram_large - lat_tcm_large
    assert gap_small > 0, (
        f"sanity: SRAM should never be FASTER than TCM, "
        f"got gap_small={gap_small:.1f} ns"
    )
    assert gap_large > 3.0 * gap_small, (
        f"4× payload should produce ≥3× SRAM/TCM gap (linear in nbytes "
        f"because of the 128 GB/s PE↔SRAM hop). "
        f"got gap_small={gap_small:.1f} (8KB), gap_large={gap_large:.1f} "
        f"(32KB), ratio={gap_large / max(gap_small, 1e-9):.2f}"
    )
 def test_hbm_pe_hop_charged_at_large_payload(tmp_path):
    """At 32 KB / PE the HBM-vs-TCM gap must exceed the gap that comes
    purely from HBM's 256 GB/s intrinsic slot-IO disadvantage.
    Pre-Phase-2 the entire HBM/TCM gap is just the slot-IO term
    (24 × (nbytes/512 + 6) ≈ 1_700 ns at 32 KB). Post-fix adds another
    24 × (nbytes/256) × 2 ≈ 6_144 ns from the PE↔HBM hop on send and
    recv, so the total HBM/TCM gap should clearly clear 4 µs.
    """
    n_elem = 16384  # 32 KB / PE
    lat_tcm = _run_allreduce_with_buffer_kind(
        tmp_path, buffer_kind="tcm", n_elem=n_elem,
    )
    lat_hbm = _run_allreduce_with_buffer_kind(
        tmp_path, buffer_kind="hbm", n_elem=n_elem,
    )
    delta = lat_hbm - lat_tcm
    THRESHOLD_NS = 4_000.0
    assert delta > THRESHOLD_NS, (
        f"HBM should be ≥ {THRESHOLD_NS:.0f} ns slower than TCM at 32 KB "
        f"once the 256 GB/s PE↔HBM hop is charged on each IPCQ access. "
        f"got tcm={lat_tcm:.1f} hbm={lat_hbm:.1f} delta={delta:.1f} ns"
    )