Files
kernbench2/tests/test_wire_cut_through.py
ywkang 5fdb6f8797 Latency model: HBM PC striping + chunk-loop drain (ADR-0033)
Previous model double-counted slow-upstream paths (e.g., 64KB via UCIe
128 GB/s was ~2x pessimistic). HBM CTRL now distributes bursts across
8 pseudo-channels via global round-robin, with per-chunk commit timing
that pipelines correctly against the bottleneck link's data arrival.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 21:59:07 -07:00

143 lines
5.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for wire cut-through via `Transaction.head_arrived` event (ADR-0033 D1).
The wire model (ADR-0015 D2) currently delivers a message to the destination
in_port only after the full nbytes/bw_gbs transfer time has elapsed
(store-and-forward). Phase 2 adds a `head_arrived` SimPy event on the
Transaction that fires at `prop_ns + FLIT_BYTES / bw_gbs` — letting opted-in
destinations (e.g., HBM CTRL) start processing the leading flit before the
tail arrives. The wire's BW occupancy (`available_at`) is unchanged.
These tests assert the *behavioral* consequence: when both the wire and
HBM CTRL contribute meaningfully to total latency, the model must not
double-count their time. They are written BEFORE Phase 2 production
changes and expected to FAIL on current code.
"""
from __future__ import annotations
from pathlib import Path
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import MemoryWriteMsg
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology
# Topology description loaded by every test in this module: ``topology.yaml``
# in the directory one level above the directory that contains this file.
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
def _engine() -> GraphEngine:
    """Build a fresh ``GraphEngine`` from the topology at ``TOPOLOGY_PATH``."""
    topology = load_topology(TOPOLOGY_PATH)
    return GraphEngine(topology)
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return the encoded HBM physical address used as the write target.

    ``sip`` and ``pe_id`` are forwarded unchanged to
    ``PhysAddr.pe_hbm_addr``; ``cube`` is forwarded as its ``die_id``.
    The PE-local offset is fixed at ``0x1000`` and the slice size is
    48 GiB divided by 8.
    """
    gib = 1 << 30
    per_slice_bytes = (48 * gib) // 8
    address = PhysAddr.pe_hbm_addr(
        sip_id=sip,
        die_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    )
    return address.encode()
def _path_drain_for_write(eng: GraphEngine, msg: MemoryWriteMsg) -> float:
    """Ask the engine for the path drain of the route this write will take.

    The route runs from the PCIe endpoint found for ``msg.dst_sip`` to the
    node that the decoded ``msg.dst_pa`` resolves to; the drain is then
    evaluated by the engine for ``msg.nbytes`` bytes over that route.
    """
    source_ep = eng._resolver.find_pcie_ep(msg.dst_sip)
    target_node = eng._resolver.resolve(PhysAddr.decode(msg.dst_pa))
    route = eng._router.find_memory_path(source_ep, target_node)
    return eng._path_drain_ns(route, msg.nbytes)
def _write_ns(nbytes: int) -> tuple[float, float]:
    """Run one MemoryWrite of ``nbytes`` on a fresh engine and time it.

    Returns ``(total_ns, path_drain_ns)``: the ``"total_ns"`` entry of the
    completion record, and the path drain computed for the same message
    before it is submitted.
    """
    engine = _engine()
    write = MemoryWriteMsg(
        correlation_id="cut-through",
        request_id=f"w-{nbytes}",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=nbytes,
        pattern="zero",
        target_pe=0,
    )
    path_drain = _path_drain_for_write(engine, write)

    handle = engine.submit(write)
    engine.wait(handle)
    _unused, timing = engine.get_completion(handle)
    return timing["total_ns"], path_drain
# ── 1. Effective slope: total_ns vs nbytes should grow at the rate of
# the bottleneck BW, not 2x that rate (which double-counts wire+HBM).
def test_effective_slope_single_bw_not_doubled():
    """The effective ns-per-byte slope should match the path bottleneck rate
    (= 1 / bottleneck_bw), NOT 2× that rate (which would double-count wire
    and HBM drain). Drain is computed dynamically from the engine path.

    Measurement: linear fit between two large transfer sizes. Constants
    cancel; slope is the discriminator.
    """
    n1, n2 = 32768, 131072  # 32KB and 128KB
    # Only the larger transfer's drain is used as the reference slope, so
    # the drain reported for the smaller transfer is discarded.
    t1, _ = _write_ns(n1)
    t2, drain2 = _write_ns(n2)

    slope = (t2 - t1) / (n2 - n1)  # ns per byte
    expected_slope = drain2 / n2  # = 1 / bottleneck_bw (ns/byte)

    # 50% headroom above the ideal slope leaves room for propagation
    # (prop_ns) effects at large N while staying well below the 2× slope
    # that a double-counting (wire + HBM drain) model would produce.
    assert slope < expected_slope * 1.5, (
        f"Effective slope {slope*1000:.4f} ps/byte too steep; "
        f"expected ~{expected_slope*1000:.4f} ps/byte at path bottleneck. "
        f"A doubled (wire + HBM drain) model would give ~"
        f"{expected_slope*2*1000:.4f} ps/byte."
    )
# ── 2. Absolute upper bound: a 1MB transfer must not take 2x the wire time ──
def test_1mb_transfer_upper_bound():
    """A 1MB write should complete in roughly the path-bottleneck transfer
    time, plus modest fixed overhead. A doubled (wire + HBM drain) model
    would give ~2× that.
    """
    nbytes = 1 << 20  # 1 MB
    total, drain = _write_ns(nbytes)

    # Validate the reference value first: the failure message below divides
    # by ``drain``, so a zero drain would otherwise surface as a
    # ZeroDivisionError while formatting the message rather than as a
    # readable assertion failure.
    assert drain > 0, f"path drain must be positive, got {drain}ns"

    assert total < drain * 1.5, (
        f"1MB write should not be ~2x bottleneck transfer time. "
        f"drain={drain:.2f}ns, total={total:.2f}ns, "
        f"ratio={total/drain:.2f} (expected < 1.5)"
    )
# ── 3. Small transfer: cut-through dominated by component overhead ──
def test_small_transfer_remains_finite_and_positive():
    """Sanity check for a small (single-chunk) write.

    The 256-byte transfer must still report a strictly positive latency
    (cut-through must not introduce zero-latency bugs) and stay under
    1000 ns.
    """
    latency_ns = _write_ns(256)[0]
    # Kept as two separate assertions so a non-positive latency and an
    # over-long latency are reported as distinct failures.
    assert latency_ns > 0
    assert latency_ns < 1000.0, f"256B write should be << 1us, got {latency_ns}ns"
# ── 4. Monotonicity preserved under cut-through ──
def test_monotonicity_at_extreme_sizes():
    """Once payload is large enough to be wire-dominated, monotonicity
    must hold: a much larger write takes more time than a smaller one.

    Note: in the PC parallelism regime (ADR-0033 D1), a small single-PC
    transfer can actually be slower than a small few-PC transfer (a 1KB
    write spans 4 PCs in parallel and finishes around the same wall-clock
    time as a 256B write that only loads 1 PC). This is physically
    correct and matches real-HW behavior; strict monotonicity over the
    sub-PC regime is not asserted. We assert it only across an extreme
    range where the wire-transfer term dominates.
    """
    small, _ = _write_ns(256)
    large, _ = _write_ns(65536)  # 64 KiB
    # NOTE(review): the 256GB/s figure quoted in the message is hard-coded
    # text, not read from the topology — confirm it matches topology.yaml.
    assert large > small, (
        f"64KB ({large:.2f}ns) must exceed 256B ({small:.2f}ns) — "
        f"wire transfer at 256GB/s alone is 256ns for 64KB, so total "
        f"must dominate any sub-microsecond small-transfer time."
    )