Files
kernbench2/tests/test_hbm_pc_striping.py
T
ywkang 5fdb6f8797 Latency model: HBM PC striping + chunk-loop drain (ADR-0033)
Previous model double-counted slow-upstream paths (e.g., 64KB via UCIe
128 GB/s was ~2x pessimistic). HBM CTRL now distributes bursts across
8 pseudo-channels via global round-robin, with per-chunk commit timing
that pipelines correctly against the bottleneck link's data arrival.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 21:59:07 -07:00

331 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for HBM CTRL per-pseudo-channel (PC) striping model (ADR-0033).
Replaces the prior dual-channel `simpy.Resource(capacity=1)` model with a
stateless per-PC `available_at[N]` array, global round-robin chunking, and
read/write sharing per PC. Burst granularity is `burst_bytes` (default 256B).
These tests are written BEFORE the production change and are expected to
FAIL on current code (which serializes via Resource cap=1). Phase 2 must
make them PASS without weakening assertions.
Verification matrix references ADR-0033 D1 (modeled) and D2 (approximated).
"""
from __future__ import annotations
from pathlib import Path
import pytest
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import MemoryReadMsg, MemoryWriteMsg
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology, resolve_topology
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
def _engine() -> GraphEngine:
return GraphEngine(load_topology(TOPOLOGY_PATH))
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0, offset: int = 0x1000) -> int:
slice_bytes = 48 * (1 << 30) // 8
return PhysAddr.pe_hbm_addr(
sip_id=sip, die_id=cube, pe_id=pe_id,
pe_local_hbm_offset=offset, slice_size_bytes=slice_bytes,
).encode()
def _write_msg(req_id: str, *, cube: int, pe: int, nbytes: int) -> MemoryWriteMsg:
return MemoryWriteMsg(
correlation_id="pc-striping", request_id=req_id,
dst_sip=0, dst_cube=cube, dst_pe=pe,
dst_pa=_hbm_pa(sip=0, cube=cube, pe_id=pe), nbytes=nbytes,
pattern="zero", target_pe=pe,
)
def _single_write_ns(nbytes: int, cube: int = 0, pe: int = 0) -> float:
eng = _engine()
msg = _write_msg(f"single-{cube}-{pe}-{nbytes}", cube=cube, pe=pe, nbytes=nbytes)
h = eng.submit(msg)
eng.wait(h)
_, t = eng.get_completion(h)
return t["total_ns"]
def _path_drain_for_write(eng: GraphEngine, msg: MemoryWriteMsg) -> float:
"""Compute engine path drain dynamically (test-time access to engine internals)."""
pcie_ep_id = eng._resolver.find_pcie_ep(msg.dst_sip)
pa = PhysAddr.decode(msg.dst_pa)
hbm_node = eng._resolver.resolve(pa)
path = eng._router.find_memory_path(pcie_ep_id, hbm_node)
return eng._path_drain_ns(path, msg.nbytes)
# ── 1. Builder derives pc_bw_gbs ──────────────────────────────────
def test_builder_derives_pc_bw_gbs():
"""Topology builder must inject `pc_bw_gbs = hbm_to_router_bw_gbs / num_pcs`
as an attr on every hbm_ctrl node. Enforces ADR-0019 D9 invariant
(channels_per_PE × per-PC BW = aggregated link BW) at build time.
"""
handle = resolve_topology(str(TOPOLOGY_PATH))
topo = handle.topology_obj
spec = topo.spec
expected_total_bw = float(spec["cube"]["links"]["hbm_to_router_bw_gbs"])
expected_num_pcs = int(spec["cube"]["memory_map"]["hbm_channels_per_pe"])
expected_pc_bw = expected_total_bw / expected_num_pcs
hbm_nodes = [n for n in topo.nodes.values() if "hbm_ctrl" in n.id]
assert hbm_nodes, "no hbm_ctrl nodes found in topology"
for node in hbm_nodes:
assert "num_pcs" in node.attrs, f"{node.id} missing num_pcs"
assert int(node.attrs["num_pcs"]) == expected_num_pcs, (
f"{node.id} num_pcs={node.attrs['num_pcs']} != {expected_num_pcs}"
)
assert "pc_bw_gbs" in node.attrs, f"{node.id} missing builder-derived pc_bw_gbs"
assert abs(float(node.attrs["pc_bw_gbs"]) - expected_pc_bw) < 1e-6, (
f"{node.id} pc_bw_gbs={node.attrs['pc_bw_gbs']} != {expected_pc_bw}"
)
# ── 2. PC parallelism: concurrent writes do NOT serialize at HBM CTRL ──
def test_two_concurrent_writes_parallel_across_pcs():
"""Two concurrent writes to the same cube (different PEs) must use
different PCs (via global round-robin) and finish in less than 2x
the single-write latency.
Current model (Resource cap=1) serializes them → max ≈ 2x single.
PC striping must give max < 1.7x single (allowing for shared wire BW
occupancy, which remains).
"""
nbytes = 1024
single_ns = _single_write_ns(nbytes)
eng = _engine()
msg_a = _write_msg("conc-a", cube=0, pe=0, nbytes=nbytes)
msg_b = _write_msg("conc-b", cube=0, pe=1, nbytes=nbytes)
ha = eng.submit(msg_a)
hb = eng.submit(msg_b)
eng.wait(ha)
eng.wait(hb)
_, ta = eng.get_completion(ha)
_, tb = eng.get_completion(hb)
max_ns = max(ta["total_ns"], tb["total_ns"])
assert max_ns < single_ns * 1.7, (
f"PC striping: 2 concurrent 1KB writes should not serialize at HBM CTRL. "
f"single={single_ns:.2f}ns, concurrent max={max_ns:.2f}ns, "
f"ratio={max_ns/single_ns:.2f} (expected < 1.7)"
)
def test_eight_concurrent_writes_makespan():
"""8 concurrent 1KB writes (one per PE in cube0) must achieve makespan
significantly less than 8x single-write latency.
With 8 PCs and global round-robin, each write maps to a distinct set of
PCs; the makespan is dominated by wire BW (shared 256 GB/s pipe), not
by HBM-side serialization.
Current cap=1 model: makespan ≈ 8x single. Target: < 4x single.
"""
nbytes = 1024
single_ns = _single_write_ns(nbytes)
eng = _engine()
handles = []
for pe in range(8):
msg = _write_msg(f"8way-{pe}", cube=0, pe=pe, nbytes=nbytes)
handles.append(eng.submit(msg))
for h in handles:
eng.wait(h)
times = [eng.get_completion(h)[1]["total_ns"] for h in handles]
makespan = max(times)
assert makespan < single_ns * 4.0, (
f"8 concurrent 1KB writes: makespan={makespan:.2f}ns, "
f"single={single_ns:.2f}ns, ratio={makespan/single_ns:.2f} "
f"(expected < 4.0 with PC striping; current cap=1 gives ~8x)"
)
# ── 3. Large transfer not 2x pessimistic ──────────────────────────
def test_large_transfer_not_double_counted():
"""64KB write must not be ~2x the wire transfer time.
With cut-through (head_arrived event) + PC striping, the HBM PC commit
time overlaps with wire arrival. For 64KB at 256 GB/s aggregate:
- Wire transfer: ~256ns
- PC commit (parallel across 8 PCs, 32 chunks each): ~256ns
- Overlapped real-HW total: ~256ns (one of them dominates)
- Current sequential model: ~512ns (~2x)
Assert: total < 1.5x of (wire transfer time alone).
"""
nbytes = 65536 # 64KB
# Path bottleneck (dynamic) — for MemoryWrite this is UCIe 128 GB/s.
eng = _engine()
msg = _write_msg("64kb-probe", cube=0, pe=0, nbytes=nbytes)
drain = _path_drain_for_write(eng, msg)
total = _single_write_ns(nbytes)
assert total < drain * 1.5, (
f"64KB write should not be ~2x path bottleneck transfer time. "
f"drain={drain:.2f}ns, total={total:.2f}ns, "
f"ratio={total/drain:.2f} (expected < 1.5)"
)
# ── 4. Read/write share per-PC available_at ──────────────────────
def test_read_write_share_pc_array():
"""Read and write requests targeting overlapping PC regions must
serialize on the shared `pc_avail` array (NOT proceed in parallel like
the prior dual-channel model).
Strategy: a read and a write to the same PE/cube should land on the
same set of PCs (since global round-robin advances by chunk count, and
chunk count of 256B == 1 chunk consumes 1 PC). With single-chunk read+write
submitted concurrently, the second to acquire its chunk's PC must wait.
We assert: makespan of (concurrent read + write) > single_write_ns.
If they ran in parallel on disjoint resources (old dual-channel),
makespan ≈ single. With shared PC, makespan > single.
"""
nbytes = 256 # 1 chunk
pa = _hbm_pa(sip=0, cube=0, pe_id=0)
single_w = _single_write_ns(nbytes)
eng = _engine()
w_msg = _write_msg("rw-write", cube=0, pe=0, nbytes=nbytes)
r_msg = MemoryReadMsg(
correlation_id="pc-striping", request_id="rw-read",
src_sip=0, src_cube=0, src_pe=0,
src_pa=pa, nbytes=nbytes,
)
hw = eng.submit(w_msg)
hr = eng.submit(r_msg)
eng.wait(hw)
eng.wait(hr)
_, tw = eng.get_completion(hw)
_, tr = eng.get_completion(hr)
makespan = max(tw["total_ns"], tr["total_ns"])
# When R and W share the same first PC, the second one to acquire pays
# the burst time of the first. Assert makespan strictly > single,
# demonstrating sharing (vs the prior dual-channel parallelism).
assert makespan > single_w * 1.05, (
f"Read+Write should share per-PC slot when targeting the same starting "
f"PC. single_write={single_w:.2f}ns, R+W makespan={makespan:.2f}ns "
f"(expected > 1.05x single, demonstrating PC sharing)"
)
# ── 5. Switch penalty: default 0, mechanism wired up ─────────────
def _makespan(eng: GraphEngine, handles: list) -> float:
for h in handles:
eng.wait(h)
return max(eng.get_completion(h)[1]["total_ns"] for h in handles)
def _engine_with_switch_penalty(switch_penalty_ns: float) -> GraphEngine:
"""Build a GraphEngine, overriding switch_penalty_ns on every hbm_ctrl
node. None means leave the attr absent (i.e., test the default)."""
graph = load_topology(TOPOLOGY_PATH)
if switch_penalty_ns is not None:
for node in graph.nodes.values():
if "hbm_ctrl" in node.id:
node.attrs["switch_penalty_ns"] = switch_penalty_ns
return GraphEngine(graph)
def _rw_write_time(eng: GraphEngine, nbytes: int) -> float:
"""Submit one read followed by one write of the same size; return the
write's completion time. With `nbytes >= num_pcs * burst_bytes`, the
read populates PCs 0..N-1 with last_dir='R' and the write then wraps
back to PC 0, so every chunk of the write sees an R→W direction
switch. The write's completion time is the direct observable for the
switch-penalty mechanism (the read's time is dominated by the
response-path latency and would mask the effect)."""
r = MemoryReadMsg(
correlation_id="pc-striping", request_id="rw-1",
src_sip=0, src_cube=0, src_pe=0,
src_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=nbytes,
)
w = _write_msg("rw-2", cube=0, pe=0, nbytes=nbytes)
hr = eng.submit(r)
hw = eng.submit(w)
eng.wait(hr); eng.wait(hw)
return eng.get_completion(hw)[1]["total_ns"]
def test_switch_penalty_default_zero():
"""Default (no `switch_penalty_ns` attr) must behave identically to
explicit `switch_penalty_ns=0`.
This documents Tier 0 (ADR-0033 D2): we assume an ideal HBM scheduler
amortizes switching cost; the mechanism exists but is dormant.
"""
nbytes = 2048
rw_default = _rw_write_time(_engine_with_switch_penalty(None), nbytes)
rw_zero = _rw_write_time(_engine_with_switch_penalty(0.0), nbytes)
diff = abs(rw_default - rw_zero)
assert diff < 0.01, (
f"Default (no attr) must match explicit switch_penalty_ns=0. "
f"default={rw_default:.2f}ns, explicit_zero={rw_zero:.2f}ns, "
f"diff={diff:.4f}ns"
)
def test_switch_penalty_mechanism_when_enabled():
"""When `switch_penalty_ns` is set non-zero via attr, R→W on the same
PC must show that extra delay.
Phase 2 must wire up the mechanism so that overriding the attr at
runtime (or via a modified topology) produces the expected delay.
Default config keeps it 0; this test creates an engine with an
explicit override.
"""
# Use nbytes that span all 8 PCs so the write back-wraps to PCs that
# were just touched by the read, forcing an R→W switch on each PC.
# 8 PCs × 256B burst = 2048B fills every PC exactly once.
nbytes = 2048
switch_penalty = 20.0 # large enough to be visible
# R+W with explicit switch_penalty=0: baseline (W observed time)
rw_zero = _rw_write_time(_engine_with_switch_penalty(0.0), nbytes)
# R+W with explicit switch_penalty=20: mechanism engaged
rw_pen = _rw_write_time(_engine_with_switch_penalty(switch_penalty), nbytes)
delta = rw_pen - rw_zero
# The switch penalty applies once on the second txn's first chunk.
# Conservative: assert at least half the switch_penalty shows up.
assert delta >= switch_penalty * 0.4, (
f"switch_penalty_ns={switch_penalty} should add measurable delay "
f"when R→W on same PC. R+W@0={rw_zero:.2f}ns, "
f"R+W@{switch_penalty}={rw_pen:.2f}ns, delta={delta:.2f}ns "
f"(expected >= {switch_penalty*0.4:.2f}ns)"
)
# ── 6. Backwards compat sanity ───────────────────────────────────
def test_existing_single_txn_latency_positive():
"""Sanity: single write still produces positive latency (no regression
of basic engine behavior). Companion to test_bw_occupancy.py."""
t = _single_write_ns(4096)
assert t > 0