Files
kernbench2/tests/test_bw_occupancy.py
T
ywkang d75da439c6 Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep
- Probe CLI: restructured output (tables first, routes below), per-hop
  timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix
  cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing
  effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 01:16:18 -07:00

386 lines
14 KiB
Python

"""Tests for per-link BW occupancy (available_at) model.
Verifies that:
- Single transactions see no extra delay from BW tracking.
- Back-to-back transactions on the same link see BW contention.
- Transactions on independent paths see no contention.
- Response messages (nbytes=0) do not occupy BW.
"""
from pathlib import Path
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology
# Topology file shared by every test in this module: ``topology.yaml`` sits
# one directory above the directory that contains this test file.
TOPOLOGY_PATH = Path(__file__).parent.parent.joinpath("topology.yaml")
def _engine():
    """Return a new ``GraphEngine`` built from the topology at ``TOPOLOGY_PATH``.

    Each call invokes ``load_topology`` and constructs a new engine object, so
    two calls never hand back the same engine instance.
    """
    topology = load_topology(TOPOLOGY_PATH)
    return GraphEngine(topology)
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return the encoded HBM physical address for one PE.

    The address is built by ``PhysAddr.pe_hbm_addr`` for rack 0 and the given
    SIP / cube / PE, at PE-local HBM offset ``0x1000``, and then encoded.

    Args:
        sip: SIP index passed as ``sip_id``.
        cube: Cube index passed as ``cube_id``.
        pe_id: PE index passed as ``pe_id``.
    """
    # Slice size is 48 GiB divided by 8.
    total_bytes = 48 * (1 << 30)
    per_slice_bytes = total_bytes // 8
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    )
    return address.encode()
# ── 1. Single transaction: available_at does not add delay ────────
def test_single_txn_latency_unchanged():
    """An isolated H2D write reports identical latency on two fresh engines.

    The same 4096-byte write message is submitted to two independently built
    engines. With nothing else in flight there is no other transaction to
    contend with, so the two ``total_ns`` values must match exactly and be
    strictly positive.
    """
    msg = MemoryWriteMsg(
        correlation_id="bw",
        request_id="single",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
        pattern="zero",
        target_pe=0,
    )

    def _run_alone(request):
        # Fresh engine per run: submit, block until done, return the trace.
        engine = _engine()
        handle = engine.submit(request)
        engine.wait(handle)
        _, trace = engine.get_completion(handle)
        return trace

    t1 = _run_alone(msg)
    t2 = _run_alone(msg)

    assert t1["total_ns"] == t2["total_ns"], (
        f"Single txn must be deterministic: {t1['total_ns']} vs {t2['total_ns']}"
    )
    assert t1["total_ns"] > 0
# ── 2. Back-to-back transactions: BW contention on shared links ──
def test_back_to_back_same_cube_bw_contention():
    """Two concurrent H2D writes to one cube finish later than a lone write.

    A baseline engine runs a single 4096-byte write to cube 0 / PE 0. A second
    engine receives two writes to cube 0 (PE 0 and PE 1) before either is
    waited on. The slower of the two must take strictly longer than the
    baseline, which is the signature of BW occupancy on the links they share.
    """

    def _write(request_id, pe):
        # All writes in this test differ only in request id and target PE.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=4096,
            pattern="zero",
            target_pe=pe,
        )

    # Reference: one write with nothing else in flight.
    solo_engine = _engine()
    solo_handle = solo_engine.submit(_write("baseline", 0))
    solo_engine.wait(solo_handle)
    _, solo_trace = solo_engine.get_completion(solo_handle)
    single_ns = solo_trace["total_ns"]

    # Contended: both writes are submitted before waiting on either.
    shared_engine = _engine()
    requests = [_write("conc-a", 0), _write("conc-b", 1)]
    handles = [shared_engine.submit(request) for request in requests]
    for handle in handles:
        shared_engine.wait(handle)

    totals = []
    for handle in handles:
        _, trace = shared_engine.get_completion(handle)
        totals.append(trace["total_ns"])
    max_ns = max(totals)

    assert max_ns > single_ns, (
        f"BW contention: concurrent max ({max_ns:.2f}ns) must > "
        f"single ({single_ns:.2f}ns) due to link BW occupancy"
    )
def test_back_to_back_bw_delay_magnitude():
    """The extra delay from a concurrent write has a sensible lower bound.

    ``delta`` is the slower concurrent latency minus the isolated latency for
    4096-byte writes to cube 0. It must be positive, and at least half of
    ``nbytes / 768.0`` ns, where 768 GB/s (PCIe) is taken as the highest
    bandwidth any link on the path could have. Only a lower bound is checked.
    """
    nbytes = 4096

    def _write(request_id, pe):
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=nbytes,
            pattern="zero",
            target_pe=pe,
        )

    # Isolated reference run.
    lone_engine = _engine()
    lone_handle = lone_engine.submit(_write("mag-single", 0))
    lone_engine.wait(lone_handle)
    _, lone_trace = lone_engine.get_completion(lone_handle)
    single_ns = lone_trace["total_ns"]

    # Two writes in flight together on one engine.
    pair_engine = _engine()
    first = _write("mag-a", 0)
    second = _write("mag-b", 1)
    handle_a = pair_engine.submit(first)
    handle_b = pair_engine.submit(second)
    pair_engine.wait(handle_a)
    pair_engine.wait(handle_b)
    _, trace_a = pair_engine.get_completion(handle_a)
    _, trace_b = pair_engine.get_completion(handle_b)

    max_ns = max(trace_a["total_ns"], trace_b["total_ns"])
    delta = max_ns - single_ns

    # Contention must exist at all.
    assert delta > 0, f"Expected positive BW contention delta, got {delta:.4f}"

    # Fastest-link occupancy time (~5.3ns); the check allows half of it.
    min_expected_delta = nbytes / 768.0
    assert delta >= min_expected_delta * 0.5, (
        f"BW contention delta ({delta:.2f}ns) too small, "
        f"expected >= ~{min_expected_delta:.2f}ns"
    )
# ── 3. Independent paths: no cross-contention ────────────────────
def test_independent_paths_no_contention():
    """Concurrent writes to cube 0 and cube 3 barely slow each other down.

    The cube 0 write is first timed alone, then timed again while a write to
    cube 3 is in flight on the same engine. The increase for cube 0 must stay
    below 50% of its isolated latency. The tolerance leaves room for the early
    links both requests share (pcie_ep -> io_cpu) before their paths diverge.
    Only the cube 0 latency is asserted on.
    """

    def _write(request_id, cube):
        # Writes here always target PE 0; only the destination cube varies.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=cube,
            dst_pe=0,
            dst_pa=_hbm_pa(sip=0, cube=cube, pe_id=0),
            nbytes=4096,
            pattern="zero",
            target_pe=0,
        )

    # Cube 0 on its own.
    alone_engine = _engine()
    alone_handle = alone_engine.submit(_write("indep-single", 0))
    alone_engine.wait(alone_handle)
    _, alone_trace = alone_engine.get_completion(alone_handle)
    single_ns_cube0 = alone_trace["total_ns"]

    # Cube 0 together with cube 3.
    both_engine = _engine()
    to_cube0 = _write("indep-a", 0)
    to_cube3 = _write("indep-b", 3)
    handle_cube0 = both_engine.submit(to_cube0)
    handle_cube3 = both_engine.submit(to_cube3)
    both_engine.wait(handle_cube0)
    both_engine.wait(handle_cube3)
    _, trace_cube0 = both_engine.get_completion(handle_cube0)
    # The cube 3 completion is retrieved but not asserted on.
    both_engine.get_completion(handle_cube3)

    delta_cube0 = trace_cube0["total_ns"] - single_ns_cube0
    assert delta_cube0 < single_ns_cube0 * 0.5, (
        f"Independent path cube0 delta ({delta_cube0:.2f}ns) too large; "
        f"paths should mostly be independent"
    )
# ── 4. Response messages: no BW occupancy ────────────────────────
def test_response_no_bw_occupancy():
    """Two concurrent writes to one cube both complete successfully.

    Responses carry ``nbytes=0`` and are meant not to occupy link BW. This
    test submits writes to PE 0 and PE 2 of cube 0 on one engine and checks
    that each completion reports ``ok is True``.
    """
    engine = _engine()

    requests = [
        MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=4096,
            pattern="zero",
            target_pe=pe,
        )
        for request_id, pe in (("resp-a", 0), ("resp-b", 2))
    ]

    # Submit everything first so both writes are in flight together.
    handles = [engine.submit(request) for request in requests]
    for handle in handles:
        engine.wait(handle)

    completions = []
    for handle in handles:
        completion, _ = engine.get_completion(handle)
        completions.append(completion)

    for completion in completions:
        assert completion.ok is True
# ── 5. PE DMA: formula is a lower bound of actual for single txn ──
def test_pe_dma_formula_lower_bound():
    """PE DMA single transaction: formula latency is a lower bound of actual.

    PE DMA routes through the NOC, which applies internal mesh traversal
    latency (XY routing distance) that the formula does not capture (edge
    distance_mm=0 for distributed NOC). The formula is therefore only a
    lower bound: formula <= actual.

    A single 4096-byte ``PeDmaMsg`` from PE 0 of cube 0 is run on a fresh
    engine. If the trace carries a ``formula_ns`` entry, it must not exceed
    ``total_ns`` by more than 0.01 ns; ``total_ns`` must always be positive.
    """
    engine = GraphEngine(load_topology(TOPOLOGY_PATH))

    msg = PeDmaMsg(
        correlation_id="bw", request_id="formula-check",
        src_sip=0, src_cube=0, src_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
    )
    h = engine.submit(msg)
    engine.wait(h)
    _, trace = engine.get_completion(h)

    actual = trace["total_ns"]
    # The formula value is optional in the trace; the bound is only checked
    # when the engine reported one.
    formula = trace.get("formula_ns")
    if formula is not None:
        # 0.01 ns of slack on the comparison.
        assert formula <= actual + 0.01, (
            f"PE DMA formula ({formula:.4f}) must be <= actual ({actual:.4f})"
        )
    assert actual > 0
# ── 6. Existing probe invariants preserved ───────────────────────
def test_h2d_monotonicity_preserved():
    """Isolated H2D write latency strictly increases across cubes 0, 4, 8.

    Each cube is measured on its own fresh engine, so no measurement can be
    affected by BW occupancy left behind by another. The latencies must be
    strictly increasing in the listed cube order (1hop < 2hop < 3hop).
    """

    def _isolated_latency(cube):
        # One 4096-byte write to PE 0 of the given cube, on a fresh engine.
        engine = _engine()
        request = MemoryWriteMsg(
            correlation_id="bw",
            request_id=f"mono-c{cube}",
            dst_sip=0,
            dst_cube=cube,
            dst_pe=0,
            dst_pa=_hbm_pa(sip=0, cube=cube, pe_id=0),
            nbytes=4096,
            pattern="zero",
            target_pe=0,
        )
        handle = engine.submit(request)
        engine.wait(handle)
        _, trace = engine.get_completion(handle)
        return trace["total_ns"]

    cubes = [0, 4, 8]
    latencies = [_isolated_latency(cube) for cube in cubes]

    # Compare each measurement with its successor.
    for i in range(len(cubes) - 1):
        assert latencies[i] < latencies[i + 1], (
            f"Monotonicity: cube{cubes[i]}({latencies[i]:.2f}) "
            f"must < cube{cubes[i+1]}({latencies[i+1]:.2f})"
        )
# ── 7. BW contention scales with payload size ───────────────────
def test_bw_contention_scales_with_nbytes():
    """Contention delay grows with payload size.

    For 4 KiB and 64 KiB payloads, the contention delta is the slower of two
    concurrent writes to cube 0 minus the latency of a lone write of the same
    size. Link occupancy is nbytes / bw, so the 64 KiB delta must be more
    than twice the 4 KiB delta. A delta fixed by component resources (for
    example DMA capacity) would not scale this way, so the test targets
    link-level BW modeling specifically.

    NOTE: expected to fail without the available_at link-occupancy model and
    to pass once it is in place.
    """

    def _measure(nbytes: int) -> tuple[float, float]:
        """Return (isolated latency, slower concurrent latency) in ns."""

        def _write(request_id, pe):
            return MemoryWriteMsg(
                correlation_id="bw",
                request_id=request_id,
                dst_sip=0,
                dst_cube=0,
                dst_pe=pe,
                dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
                nbytes=nbytes,
                pattern="zero",
                target_pe=pe,
            )

        # Lone write.
        lone_engine = _engine()
        lone_handle = lone_engine.submit(_write(f"scale-s-{nbytes}", 0))
        lone_engine.wait(lone_handle)
        _, lone_trace = lone_engine.get_completion(lone_handle)

        # Pair of writes submitted back to back on a second engine.
        pair_engine = _engine()
        first = _write(f"scale-a-{nbytes}", 0)
        second = _write(f"scale-b-{nbytes}", 1)
        handle_a = pair_engine.submit(first)
        handle_b = pair_engine.submit(second)
        pair_engine.wait(handle_a)
        pair_engine.wait(handle_b)
        _, trace_a = pair_engine.get_completion(handle_a)
        _, trace_b = pair_engine.get_completion(handle_b)

        slowest = max(trace_a["total_ns"], trace_b["total_ns"])
        return lone_trace["total_ns"], slowest

    single_4k, max_4k = _measure(4096)
    single_64k, max_64k = _measure(65536)

    delta_4k = max_4k - single_4k
    delta_64k = max_64k - single_64k

    # 64 KiB occupies a link 16x longer than 4 KiB; require at least 2x.
    assert delta_64k > delta_4k * 2.0, (
        f"BW contention must scale with payload: "
        f"delta_64k ({delta_64k:.2f}ns) should be >> delta_4k ({delta_4k:.2f}ns)"
    )