ADR-0023 D9.7+: charge PE↔bank fabric hop for SRAM/HBM IPCQ slots
Cube SRAM and HBM live on the cube NoC behind router-attached links (sram_to_router_bw_gbs=128, hbm_to_router_bw_gbs=256). Previously the slot-IO model treated them as if they were per-PE local, so the buffer_kind sweep showed TCM ≈ SRAM at 64 KB / PE.

pe_ipcq._handle_recv and pe_dma._handle_ipcq_inbound now charge a PE→bank compute_drain_ns on top of the intrinsic slot-IO for SRAM/HBM. TCM stays free of this hop. Adds an internal IpcqRecvCmd.consume field that gates the recv-side hop+slot-IO charges (used by a follow-up diagnostic API; default True keeps current behavior).

Post-fix at 64 KB / PE: TCM 12.0 µs < HBM 21.4 µs < SRAM 24.3 µs. SRAM is slowest because its 128 GB/s bank link is the narrowest in the system — narrower than HBM's 256 GB/s. The existing ordering test is rewritten from tcm<sram<hbm to tcm<hbm<sram and a new test_ipcq_buffer_kind_locations adds 3 invariants on the gap.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -43,20 +43,30 @@ from tests.test_allreduce_multidevice import (
|
||||
)
|
||||
|
||||
|
||||
# Expected per-tier BW + overhead (Phase 2 will encode this in
|
||||
# pe_ipcq.py). Mirrors topology.yaml component values.
|
||||
_EXPECTED_BW = {
|
||||
"tcm": (512.0, 0.0),
|
||||
"sram": (512.0, 2.0),
|
||||
"hbm": (256.0, 6.0),
|
||||
# Expected per-tier (slot intrinsic BW, fixed overhead, PE↔bank hop BW).
|
||||
# Slot intrinsic mirrors _BUFFER_KIND_BW in src/kernbench/common/ipcq_types.py.
|
||||
# PE↔bank hop reflects topology.yaml link BWs:
|
||||
# - TCM is per-PE local → no hop, encoded as inf.
|
||||
# - SRAM bank sits on cube NoC behind sram_to_router_bw_gbs = 128 GB/s.
|
||||
# - HBM ctrl sits on cube NoC behind hbm_to_router_bw_gbs = 256 GB/s.
|
||||
_EXPECTED_TIER = {
|
||||
"tcm": {"slot_bw_gbs": 512.0, "overhead_ns": 0.0, "bank_hop_bw_gbs": float("inf")},
|
||||
"sram": {"slot_bw_gbs": 512.0, "overhead_ns": 2.0, "bank_hop_bw_gbs": 128.0},
|
||||
"hbm": {"slot_bw_gbs": 256.0, "overhead_ns": 6.0, "bank_hop_bw_gbs": 256.0},
|
||||
}
|
||||
|
||||
|
||||
def _expected_slot_io_ns(buffer_kind: str, nbytes: int) -> float:
|
||||
"""Per-access latency the model is expected to add (write OR read)."""
|
||||
bw_gbs, overhead_ns = _EXPECTED_BW[buffer_kind]
|
||||
# 1 GB/s = 1 byte/ns
|
||||
return nbytes / bw_gbs + overhead_ns
|
||||
"""Per-access latency the model is expected to add (write OR read).
|
||||
|
||||
Includes the PE↔bank fabric hop for non-TCM tiers — SRAM and HBM
|
||||
live on the cube NoC behind a router-attached link, so each slot
|
||||
access pays a fabric drain in addition to the intrinsic slot-IO.
|
||||
"""
|
||||
tier = _EXPECTED_TIER[buffer_kind]
|
||||
bank_hop_ns = nbytes / tier["bank_hop_bw_gbs"]
|
||||
slot_io_ns = nbytes / tier["slot_bw_gbs"] + tier["overhead_ns"]
|
||||
return bank_hop_ns + slot_io_ns
|
||||
|
||||
|
||||
def _run_torus_allreduce(
|
||||
@@ -114,12 +124,19 @@ def _run_torus_allreduce(
|
||||
# ── Phase 1 assertions ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_slot_write_latency_orders_tcm_sram_hbm(tmp_path):
|
||||
"""tcm < sram < hbm at 8192 B per send.
|
||||
def test_slot_write_latency_orders_tcm_hbm_sram(tmp_path):
|
||||
"""tcm < hbm < sram at 8192 B per send.
|
||||
|
||||
Pre-Phase-2: all three return the same pe_exec_ns and this
|
||||
assertion fails. Post-Phase-2: the per-tier BW + overhead make
|
||||
hbm visibly slower than sram, which is slower than tcm.
|
||||
The ordering is set by the topology link BWs, NOT the intrinsic slot
|
||||
cell rates: SRAM and HBM both live on the cube NoC behind a router
|
||||
link, and SRAM's link (128 GB/s) is the narrowest in the system —
|
||||
narrower than HBM's (256 GB/s). So once the PE↔bank hop is charged,
|
||||
SRAM ends up the slowest tier even though its slot cell array has
|
||||
the same intrinsic BW as TCM.
|
||||
|
||||
Pre-fix model misses the PE↔bank hop entirely → assertion FAILS
|
||||
(today's ordering is tcm < sram < hbm). Post-fix model includes the
|
||||
hop → assertion PASSES.
|
||||
"""
|
||||
n_elem = 4096 # 8192 B per slot
|
||||
lat_tcm = _run_torus_allreduce(tmp_path, buffer_kind="tcm", n_elem=n_elem)
|
||||
@@ -130,21 +147,22 @@ def test_slot_write_latency_orders_tcm_sram_hbm(tmp_path):
|
||||
exp_tcm = 2 * _expected_slot_io_ns("tcm", n_elem * 2)
|
||||
exp_sram = 2 * _expected_slot_io_ns("sram", n_elem * 2)
|
||||
exp_hbm = 2 * _expected_slot_io_ns("hbm", n_elem * 2)
|
||||
# Floor margin: 50% of the raw expected per-access delta — lets Phase 2
|
||||
# implementation choose to charge only one side without breaking the test,
|
||||
# but still requires a clearly observable gap.
|
||||
margin_sram_tcm = 0.5 * (exp_sram - exp_tcm)
|
||||
margin_hbm_sram = 0.5 * (exp_hbm - exp_sram)
|
||||
# Floor margin: 50% of the expected write+read delta (one access's worth) — lets the
|
||||
# implementation choose to charge only one side without breaking the
|
||||
# test, but still requires a clearly observable gap.
|
||||
margin_hbm_tcm = 0.5 * (exp_hbm - exp_tcm)
|
||||
margin_sram_hbm = 0.5 * (exp_sram - exp_hbm)
|
||||
|
||||
assert lat_sram > lat_tcm + margin_sram_tcm, (
|
||||
f"sram should be slower than tcm by ≥ {margin_sram_tcm:.1f} ns "
|
||||
f"per allreduce, got sram={lat_sram:.1f} tcm={lat_tcm:.1f} "
|
||||
f"(delta={lat_sram - lat_tcm:.1f})"
|
||||
assert lat_hbm > lat_tcm + margin_hbm_tcm, (
|
||||
f"hbm should be slower than tcm by ≥ {margin_hbm_tcm:.1f} ns "
|
||||
f"per allreduce, got hbm={lat_hbm:.1f} tcm={lat_tcm:.1f} "
|
||||
f"(delta={lat_hbm - lat_tcm:.1f})"
|
||||
)
|
||||
assert lat_hbm > lat_sram + margin_hbm_sram, (
|
||||
f"hbm should be slower than sram by ≥ {margin_hbm_sram:.1f} ns "
|
||||
f"per allreduce, got hbm={lat_hbm:.1f} sram={lat_sram:.1f} "
|
||||
f"(delta={lat_hbm - lat_sram:.1f})"
|
||||
assert lat_sram > lat_hbm + margin_sram_hbm, (
|
||||
f"sram should be slower than hbm by ≥ {margin_sram_hbm:.1f} ns "
|
||||
f"per allreduce (sram bank link 128 GB/s is narrower than hbm "
|
||||
f"link 256 GB/s), got sram={lat_sram:.1f} hbm={lat_hbm:.1f} "
|
||||
f"(delta={lat_sram - lat_hbm:.1f})"
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user