Latency model: HBM PC striping + chunk-loop drain (ADR-0033)
Previous model double-counted slow-upstream paths (e.g., 64KB via UCIe 128 GB/s was ~2x pessimistic). HBM CTRL now distributes bursts across 8 pseudo-channels via global round-robin, with per-chunk commit timing that pipelines correctly against the bottleneck link's data arrival. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,142 @@
|
||||
"""Tests for wire cut-through via `Transaction.head_arrived` event (ADR-0033 D1).
|
||||
|
||||
The wire model (ADR-0015 D2) currently delivers a message to the destination
|
||||
in_port only after the full nbytes/bw_gbs transfer time has elapsed
|
||||
(store-and-forward). Phase 2 adds a `head_arrived` SimPy event on the
|
||||
Transaction that fires at `prop_ns + FLIT_BYTES / bw_gbs` — letting opted-in
|
||||
destinations (e.g., HBM CTRL) start processing the leading flit before the
|
||||
tail arrives. The wire's BW occupancy (`available_at`) is unchanged.
|
||||
|
||||
These tests assert the *behavioral* consequence: when both the wire and
|
||||
HBM CTRL contribute meaningfully to total latency, the model must not
|
||||
double-count their time. They are written BEFORE Phase 2 production
|
||||
changes and expected to FAIL on current code.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from kernbench.policy.address.phyaddr import PhysAddr
|
||||
from kernbench.runtime_api.kernel import MemoryWriteMsg
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import load_topology
|
||||
|
||||
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
|
||||
|
||||
def _engine() -> GraphEngine:
    """Build a fresh GraphEngine from the topology file at TOPOLOGY_PATH."""
    topology = load_topology(TOPOLOGY_PATH)
    return GraphEngine(topology)
|
||||
|
||||
|
||||
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return the encoded physical address of a PE-local HBM location.

    The address is built for local offset 0x1000 of PE ``pe_id`` on die
    ``cube`` of SiP ``sip``. The slice size handed to the address builder
    is 48 GiB divided by 8.
    """
    gib = 1 << 30
    per_slice_bytes = (48 * gib) // 8
    addr = PhysAddr.pe_hbm_addr(
        sip_id=sip,
        die_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    )
    return addr.encode()
|
||||
|
||||
|
||||
def _path_drain_for_write(eng: GraphEngine, msg: MemoryWriteMsg) -> float:
    """Return the engine's path drain for the route this write takes.

    The route runs from the PCIe endpoint found for ``msg.dst_sip`` to the
    node that the decoded ``msg.dst_pa`` resolves to. The drain is computed
    by the engine's own ``_path_drain_ns`` helper for ``msg.nbytes`` bytes,
    so the tests track whatever the engine currently models.
    """
    source_ep = eng._resolver.find_pcie_ep(msg.dst_sip)
    decoded_pa = PhysAddr.decode(msg.dst_pa)
    target_node = eng._resolver.resolve(decoded_pa)
    route = eng._router.find_memory_path(source_ep, target_node)
    return eng._path_drain_ns(route, msg.nbytes)
|
||||
|
||||
|
||||
def _write_ns(nbytes: int) -> tuple[float, float]:
    """Run one MemoryWrite of ``nbytes`` bytes and report its timing.

    A new engine is created for every call. Returns
    ``(total_ns, path_drain_ns)``: the ``"total_ns"`` entry of the
    completion record, and the path drain computed for the same message
    before it is submitted.
    """
    engine = _engine()
    write = MemoryWriteMsg(
        correlation_id="cut-through",
        request_id=f"w-{nbytes}",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=nbytes,
        pattern="zero",
        target_pe=0,
    )
    # Compute the drain before submitting, mirroring the original ordering.
    path_drain_ns = _path_drain_for_write(engine, write)
    handle = engine.submit(write)
    engine.wait(handle)
    _result, timing = engine.get_completion(handle)
    return timing["total_ns"], path_drain_ns
|
||||
|
||||
|
||||
# ── 1. Effective slope: total_ns vs nbytes should grow at the rate of
|
||||
# the bottleneck BW, not 2x that rate (which double-counts wire+HBM).
|
||||
|
||||
|
||||
def test_effective_slope_single_bw_not_doubled():
    """The effective ns-per-byte slope should match the path bottleneck rate
    (= 1 / bottleneck_bw), NOT 2× that rate (which would double-count wire
    and HBM drain). Drain is computed dynamically from the engine path.

    Measurement: linear fit between two large transfer sizes. Constants
    cancel; slope is the discriminator.
    """
    n1, n2 = 32768, 131072  # 32KB and 128KB
    # Only the larger transfer's drain is needed for the expected slope.
    t1, _ = _write_ns(n1)
    t2, drain2 = _write_ns(n2)
    slope = (t2 - t1) / (n2 - n1)  # ns per byte
    expected_slope = drain2 / n2  # = 1 / bottleneck_bw (ns/byte)

    # Allow up to 50% above the ideal slope to absorb propagation (prop_ns)
    # effects at large N; that is still well below the 2× slope a
    # double-counting model would produce.
    assert slope < expected_slope * 1.5, (
        f"Effective slope {slope*1000:.4f} ps/byte too steep; "
        f"expected ~{expected_slope*1000:.4f} ps/byte at path bottleneck. "
        f"A doubled (wire + HBM drain) model would give ~"
        f"{expected_slope*2*1000:.4f} ps/byte."
    )
|
||||
|
||||
|
||||
# ── 2. Absolute upper bound: 1MB transfer not 2x wire time ──
|
||||
|
||||
|
||||
def test_1mb_transfer_upper_bound():
    """A 1MB write should complete in roughly the path-bottleneck transfer
    time, plus modest fixed overhead. A doubled (wire + HBM drain) model
    would give ~2× that.
    """
    nbytes = 1 << 20  # 1 MB
    total, drain = _write_ns(nbytes)
    # Guard first: the failure message below divides by drain, so a zero
    # drain would otherwise surface as ZeroDivisionError rather than a
    # readable assertion failure.
    assert drain > 0, f"path drain must be positive, got {drain}"
    assert total < drain * 1.5, (
        f"1MB write should not be ~2x bottleneck transfer time. "
        f"drain={drain:.2f}ns, total={total:.2f}ns, "
        f"ratio={total/drain:.2f} (expected < 1.5)"
    )
|
||||
|
||||
|
||||
# ── 3. Small transfer: cut-through dominated by component overhead ──
|
||||
|
||||
|
||||
def test_small_transfer_remains_finite_and_positive():
    """Sanity check for a small, 256-byte write.

    The reported total latency must be strictly positive and below
    1000 ns, so cut-through must not collapse the latency to zero.
    """
    latency_ns, _drain = _write_ns(256)
    assert latency_ns > 0
    upper_bound_ns = 1000.0
    assert latency_ns < upper_bound_ns, (
        f"256B write should be << 1us, got {latency_ns}ns"
    )
|
||||
|
||||
|
||||
# ── 4. Monotonicity preserved under cut-through ──
|
||||
|
||||
|
||||
def test_monotonicity_at_extreme_sizes():
    """Once payload is large enough to be wire-dominated, monotonicity
    must hold: a much larger write takes more time than a smaller one.

    Note: in the PC parallelism regime (ADR-0033 D1), a small single-PC
    transfer can actually be slower than a small few-PC transfer (a 1KB
    write spans 4 PCs in parallel and finishes around the same wall-clock
    time as a 256B write that only loads 1 PC). This is physically
    correct and matches real-HW behavior; strict monotonicity over the
    sub-PC regime is not asserted. We assert it only across an extreme
    range where the wire-transfer term dominates.
    """
    small, _ = _write_ns(256)
    large, _ = _write_ns(65536)  # 64KB
    # The message labels the large transfer "64KB" (65536 bytes) so it
    # agrees with the wire-time figure quoted in the same message.
    assert large > small, (
        f"64KB ({large:.2f}ns) must exceed 256B ({small:.2f}ns) — "
        f"wire transfer at 256GB/s alone is 256ns for 64KB, so total "
        f"must dominate any sub-microsecond small-transfer time."
    )
|
||||
Reference in New Issue
Block a user