Files
kernbench2/tests/test_ipcq_buffer_kind_latency.py
T
ywkang 533e699299 IPCQ-DMA co-design HW design doc + fix IPCQ slot BW model
Add hardware design document (docs/ipcq-dma-codesign-hw.md) covering
PE_IPCQ high-level architecture, simulator verification, proposed HW
implementation, and alternatives analysis. Include D2 block diagrams
for baseline and proposed PE architectures.

Fix IPCQ slot-memory bandwidth parameters to match topology.yaml:
SRAM 128→512 GB/s (intrinsic BW, NoC-bottlenecked at 128),
HBM 32→256 GB/s (was per-channel, now per-PE aggregate).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-28 13:31:02 -07:00

220 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Phase 1 micro-tests for IPCQ slot-memory latency model.
These tests assert the TARGET behavior expected after Phase 2 wires
``buffer_kind`` (tcm/sram/hbm) into the IPCQ slot read/write latency
charges. They are written BEFORE the production change and are
EXPECTED TO FAIL today.
Failure semantics today:
- Slot access is latency-free, so the tcm/sram/hbm runs produce
identical pe_exec_ns. The ordering assertion therefore fails with
"tcm == sram == hbm" — proving the test harness is wired and that
Phase 2 production work is what makes them pass.
Reference (Phase 2 will edit these):
- src/kernbench/components/builtin/pe_dma.py — _handle_ipcq_inbound
- src/kernbench/components/builtin/pe_ipcq.py — _handle_recv,
_BUFFER_KIND_BW table
- src/kernbench/runtime_api/kernel.py — IpcqDmaToken adds
buffer_kind field
- ccl.yaml — algorithm.buffer_kind
The tests reuse the existing config-driven allreduce app
(``run_allreduce`` in tests/test_allreduce_multidevice.py) with a 6-SIP
torus_2d (3x2) topology and a SMALL n_elem so they finish fast (~3-5 s each).
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import pytest
from kernbench.runtime_api.context import RuntimeContext
from kernbench.runtime_api.types import DeviceSelector
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology
# Reuse the test app's helpers so this micro-test file does not
# duplicate the run-allreduce + write-temp-configs plumbing.
from tests.test_allreduce_multidevice import (
_write_temp_configs,
run_allreduce,
)
# Expected per-tier BW + overhead (Phase 2 will encode this in
# pe_ipcq.py). Mirrors topology.yaml component values.
# Each entry maps a buffer_kind to a (bandwidth in GB/s, fixed overhead
# in ns) pair — the order in which _expected_slot_io_ns unpacks it.
_EXPECTED_BW = {
    "tcm": (512.0, 0.0),
    "sram": (512.0, 2.0),
    "hbm": (256.0, 6.0),
}
def _expected_slot_io_ns(buffer_kind: str, nbytes: int) -> float:
    """Return the modeled latency, in ns, of ONE slot access (write OR read).

    The result is the transfer time of ``nbytes`` at the tier's bandwidth
    plus the tier's fixed overhead, both looked up in ``_EXPECTED_BW``.
    An unknown ``buffer_kind`` raises ``KeyError``.
    """
    tier_bw_gbs, tier_overhead_ns = _EXPECTED_BW[buffer_kind]
    # GB/s and bytes/ns are the same unit, so bytes / (GB/s) is already ns.
    transfer_ns = nbytes / tier_bw_gbs
    return transfer_ns + tier_overhead_ns
def _run_torus_allreduce(
    tmp_path: Path, *, buffer_kind: str, n_elem: int,
) -> float:
    """Run one torus_2d 6-SIP (3x2) allreduce and return the critical-path
    ``pe_exec_ns``.

    Args:
        tmp_path: Scratch directory. A per-run subdirectory named
            ``{buffer_kind}_{n_elem}`` is created under it for the configs.
        buffer_kind: Slot-memory tier to request. It is written into
            ccl.yaml under both ``defaults`` and
            ``algorithms.intercube_allreduce``.
        n_elem: Element count forwarded to ``_write_temp_configs`` as
            ``n_elem_override``.

    Returns:
        The largest ``pe_exec_ns`` among the engine's dict-typed result
        entries, or ``0.0`` when there are none.

    Raises:
        AssertionError: If the allreduce reports no validated cubes.
    """
    # Local import: only this helper needs PyYAML.
    import yaml

    # exist_ok so that re-running the same (buffer_kind, n_elem) pair inside
    # one test does not crash on the already-created directory.
    sub = tmp_path / f"{buffer_kind}_{n_elem}"
    sub.mkdir(exist_ok=True)
    topo_path, ccl_path = _write_temp_configs(
        sub,
        sip_topology="torus_2d",
        n_sips=6,
        algorithm="intercube_allreduce",
        sip_w=3, sip_h=2,
        n_elem_override=n_elem,
    )

    # Patch ccl.yaml in-place so the algorithm picks up buffer_kind.
    # Explicit UTF-8 keeps the round trip independent of the host locale;
    # ``or {}`` covers an empty document, for which safe_load returns None.
    with open(ccl_path, encoding="utf-8") as f:
        ccl_cfg = yaml.safe_load(f) or {}
    ccl_cfg.setdefault("defaults", {})["buffer_kind"] = buffer_kind
    ccl_cfg.setdefault("algorithms", {}).setdefault(
        "intercube_allreduce", {},
    )["buffer_kind"] = buffer_kind
    with open(ccl_path, "w", encoding="utf-8") as f:
        yaml.dump(ccl_cfg, f, default_flow_style=False)

    topo = resolve_topology(topo_path)
    engine = GraphEngine(topo.topology_obj, enable_data=True)
    spec = topo.topology_obj.spec
    with RuntimeContext(
        engine=engine,
        target_device=DeviceSelector("all"),
        correlation_id=f"bk_{buffer_kind}_{n_elem}",
        spec=spec,
    ) as ctx:
        result = run_allreduce(
            ctx, engine, spec,
            algorithm="intercube_allreduce", ccl_yaml=ccl_path,
        )
    assert result["ok_cubes"] > 0, "allreduce did not validate"

    # Each result value is a 2-tuple whose second item is the trace; only
    # dict traces carry pe_exec_ns. A missing or falsy value counts as 0.0.
    pe_exec_vals = [
        float(tr.get("pe_exec_ns", 0.0) or 0.0)
        for _, tr in engine._results.values()
        if isinstance(tr, dict)
    ]
    return max(pe_exec_vals, default=0.0)
# ── Phase 1 assertions ───────────────────────────────────────────────
def test_slot_write_latency_orders_tcm_sram_hbm(tmp_path):
    """Check the latency ordering tcm < sram < hbm at 8192 B per send.

    Before Phase 2 the three runs report the same pe_exec_ns, so the
    assertions fail. After Phase 2 the per-tier BW + overhead should make
    hbm visibly slower than sram, and sram slower than tcm.
    """
    tiers = ("tcm", "sram", "hbm")
    n_elem = 4096  # 8192 B per slot
    slot_bytes = n_elem * 2

    # Measured critical-path latency per tier (run in tcm, sram, hbm order).
    measured = {
        kind: _run_torus_allreduce(tmp_path, buffer_kind=kind, n_elem=n_elem)
        for kind in tiers
    }
    lat_tcm, lat_sram, lat_hbm = (measured[kind] for kind in tiers)

    # Modeled cost of one write plus one read of the slot, per tier.
    modeled = {
        kind: 2 * _expected_slot_io_ns(kind, slot_bytes) for kind in tiers
    }

    # Require only half of the modeled write+read gap, so an implementation
    # that charges just one side still passes while the gap stays visible.
    margin_sram_tcm = 0.5 * (modeled["sram"] - modeled["tcm"])
    margin_hbm_sram = 0.5 * (modeled["hbm"] - modeled["sram"])

    assert lat_sram > lat_tcm + margin_sram_tcm, (
        f"sram should be slower than tcm by ≥ {margin_sram_tcm:.1f} ns "
        f"per allreduce, got sram={lat_sram:.1f} tcm={lat_tcm:.1f} "
        f"(delta={lat_sram - lat_tcm:.1f})"
    )
    assert lat_hbm > lat_sram + margin_hbm_sram, (
        f"hbm should be slower than sram by ≥ {margin_hbm_sram:.1f} ns "
        f"per allreduce, got hbm={lat_hbm:.1f} sram={lat_sram:.1f} "
        f"(delta={lat_hbm - lat_sram:.1f})"
    )
def test_slot_io_scales_linearly_with_nbytes(tmp_path):
    """For buffer_kind=hbm, doubling nbytes should add ~nbytes/256 ns
    of latency to each slot access. Sanity-checks the slope.

    The 256 GB/s figure is the HBM entry of ``_EXPECTED_BW``; the threshold
    below is derived from ``_expected_slot_io_ns`` so it follows that table.

    Pre-Phase-2: latency does not respond to nbytes via memory BW
    (only via fabric drain), so the observed slope is dominated by
    fabric BW and does NOT match 1/256 ns/B.
    """
    n_elem_4k, n_elem_8k = 2048, 4096  # 2 B per element: 4096 B and 8192 B
    lat_4k = _run_torus_allreduce(tmp_path, buffer_kind="hbm", n_elem=n_elem_4k)
    lat_8k = _run_torus_allreduce(tmp_path, buffer_kind="hbm", n_elem=n_elem_8k)

    # Modeled extra cost of ONE slot access when going from 4k to 8k. The
    # fixed overhead cancels, leaving 4096 / 256 = 16 ns on HBM.
    per_access_delta = (
        _expected_slot_io_ns("hbm", n_elem_8k * 2)
        - _expected_slot_io_ns("hbm", n_elem_4k * 2)
    )
    # Very conservative floor: half of a single access (≈ 8 ns). Multiple
    # slot accesses on the critical path should make the observed delta
    # meaningfully larger.
    expected_min_delta = 0.5 * per_access_delta

    assert lat_8k - lat_4k > expected_min_delta, (
        f"doubling nbytes on hbm should add ≥ {expected_min_delta:.1f} ns "
        f"of slot-IO latency, got delta={lat_8k - lat_4k:.1f} ns "
        f"(lat_4k={lat_4k:.1f}, lat_8k={lat_8k:.1f})"
    )
def test_buffer_kind_sensitivity_grows_with_payload(tmp_path):
    """The tcm-vs-hbm latency gap must grow with payload size.

    Credit-return cost is fabric-only by design (16 B packet); only the
    data slot-IO charge depends on ``buffer_kind``. The gap at a small
    payload must therefore be a small fraction of the gap at a large one.

    Invariant checked:
        gap_small / gap_large < 0.10

    Pre-Phase-2: both gaps are 0, so the first assertion (gap_large must
    exceed 1000 ns) fails before the ratio is ever evaluated.
    Post-Phase-2: at small nbytes the slot-IO charge is dominated by the
    constant ``overhead_ns`` term, while at large nbytes it is dominated
    by the ``nbytes / bw_gbs`` term — so gap_large grows linearly while
    gap_small stays small.
    """

    def _tcm_then_hbm(n_elem):
        # Run tcm first, then hbm, at the same payload size.
        tcm_ns = _run_torus_allreduce(
            tmp_path, buffer_kind="tcm", n_elem=n_elem,
        )
        hbm_ns = _run_torus_allreduce(
            tmp_path, buffer_kind="hbm", n_elem=n_elem,
        )
        return tcm_ns, hbm_ns

    # 8 elements  -> 16 B per slot  (overhead-bound)
    lat_tcm_small, lat_hbm_small = _tcm_then_hbm(8)
    # 16384 elements -> 32 KB per slot (bandwidth-bound)
    lat_tcm_large, lat_hbm_large = _tcm_then_hbm(16384)

    gap_small = abs(lat_hbm_small - lat_tcm_small)
    gap_large = abs(lat_hbm_large - lat_tcm_large)

    # Checked first so the ratio below never divides by zero.
    assert gap_large > 1000.0, (
        f"large-payload buffer_kind gap must be observably large "
        f"(this is the sweep's whole point). got gap_large={gap_large:.1f} ns "
        f"(lat_tcm_large={lat_tcm_large:.1f}, lat_hbm_large={lat_hbm_large:.1f})"
    )
    assert gap_small / gap_large < 0.10, (
        f"buffer_kind sensitivity should grow with payload — "
        f"small-payload gap should be < 10% of large-payload gap. "
        f"got gap_small={gap_small:.1f} ns, gap_large={gap_large:.1f} ns, "
        f"ratio={gap_small / gap_large:.3f}"
    )