81cc32c46b
Remove rack_id (4 bits), rename sip_seg→die_id, shift fields to enable 42-bit local_offset (4 TB per die). Define PE_LOCAL/MCPU_LOCAL/CUBE_SRAM sub-unit tables for AHBM dies and IOCPU sub-unit table for IOCHIPLET dies (1 TB window). Supersedes ADR-0031. Also fixes latent VA/PA confusion in pe_dma pipeline DMA path where virtual addresses were decoded as physical addresses without MMU translation — previously masked by coincidental bit-position alignment. 529 passed (+6 recovered), 10 pre-existing failures unchanged. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
386 lines
14 KiB
Python
386 lines
14 KiB
Python
"""Tests for per-link BW occupancy (available_at) model.

Verifies that:
- Single transactions see no extra delay from BW tracking.
- Back-to-back transactions on the same link see BW contention.
- Transactions on independent paths see no contention.
- Response messages (nbytes=0) do not occupy BW.
"""
|
|
from pathlib import Path
|
|
|
|
from kernbench.policy.address.phyaddr import PhysAddr
|
|
from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
|
|
from kernbench.sim_engine.engine import GraphEngine
|
|
from kernbench.topology.builder import load_topology
|
|
|
|
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
|
|
|
|
|
def _engine():
    """Return a new GraphEngine built from the topology at TOPOLOGY_PATH.

    Every call loads the topology again, so each caller gets an engine that
    shares no state with engines created earlier.
    """
    topology = load_topology(TOPOLOGY_PATH)
    return GraphEngine(topology)
|
|
|
|
|
|
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return the encoded address of offset 0x1000 in one PE's HBM slice.

    Args:
        sip: Forwarded to ``PhysAddr.pe_hbm_addr`` as ``sip_id``.
        cube: Forwarded to ``PhysAddr.pe_hbm_addr`` as ``die_id``.
        pe_id: Forwarded unchanged as ``pe_id``.

    Returns:
        The result of calling ``encode()`` on the constructed address.
    """
    # 48 * 2**30 bytes divided into 8 equal slices.
    # NOTE(review): presumably 48 GiB of HBM per cube shared by 8 PEs —
    # confirm against topology.yaml.
    total_bytes = 48 * (1 << 30)
    slice_count = 8
    return PhysAddr.pe_hbm_addr(
        sip_id=sip,
        die_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=total_bytes // slice_count,
    ).encode()
|
|
|
|
|
|
# ── 1. Single transaction: available_at does not add delay ────────
|
|
|
|
|
|
def test_single_txn_latency_unchanged():
    """An isolated H2D write reports the same latency on two fresh engines.

    The same 4096-byte write message is submitted to two independently
    constructed engines. The reported ``total_ns`` values must be exactly
    equal and strictly positive.

    Per the original test notes, the intent is that with no contention the
    BW occupancy tracking adds no wait to a lone transaction.
    """
    write = MemoryWriteMsg(
        correlation_id="bw",
        request_id="single",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
        pattern="zero",
        target_pe=0,
    )

    def _total_ns_on_fresh_engine():
        # Build a new engine, run the one write to completion, return latency.
        engine = _engine()
        handle = engine.submit(write)
        engine.wait(handle)
        _, trace = engine.get_completion(handle)
        return trace["total_ns"]

    first_ns = _total_ns_on_fresh_engine()
    second_ns = _total_ns_on_fresh_engine()

    assert first_ns == second_ns, (
        f"Single txn must be deterministic: {first_ns} vs {second_ns}"
    )
    assert first_ns > 0
|
|
|
|
|
|
# ── 2. Back-to-back transactions: BW contention on shared links ──
|
|
|
|
|
|
def test_back_to_back_same_cube_bw_contention():
    """Two concurrent writes to one cube finish later than a lone write.

    A baseline engine runs a single 4096-byte write to PE 0 of cube 0. A
    second engine receives two such writes (PE 0 and PE 1 of the same cube)
    before either is waited on. The larger ``total_ns`` of the pair must
    strictly exceed the baseline.

    Per the original test notes, both writes share the same command path, so
    the later one is expected to wait on link BW occupancy.
    """

    def _write_to_pe(request_id, pe):
        # 4 KiB zero-pattern write aimed at the given PE of sip 0 / cube 0.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=4096,
            pattern="zero",
            target_pe=pe,
        )

    # Baseline: one write alone on its own engine.
    solo_engine = _engine()
    solo_handle = solo_engine.submit(_write_to_pe("baseline", 0))
    solo_engine.wait(solo_handle)
    _, solo_trace = solo_engine.get_completion(solo_handle)
    single_ns = solo_trace["total_ns"]

    # Contended: both writes are submitted before waiting on either.
    shared_engine = _engine()
    pair = [_write_to_pe("conc-a", 0), _write_to_pe("conc-b", 1)]
    handles = [shared_engine.submit(message) for message in pair]
    for handle in handles:
        shared_engine.wait(handle)
    traces = [shared_engine.get_completion(handle)[1] for handle in handles]

    max_ns = max(trace["total_ns"] for trace in traces)
    assert max_ns > single_ns, (
        f"BW contention: concurrent max ({max_ns:.2f}ns) must > "
        f"single ({single_ns:.2f}ns) due to link BW occupancy"
    )
|
|
|
|
|
|
def test_back_to_back_bw_delay_magnitude():
    """The contention delay of a concurrent pair has a sane minimum size.

    Measures ``delta = max(concurrent pair) - single`` for 4096-byte writes
    to PE 0 and PE 1 of cube 0 and asserts that:

    - ``delta`` is strictly positive, and
    - ``delta`` is at least half of ``nbytes / 768.0``.

    No upper bound on ``delta`` is asserted.
    """
    nbytes = 4096

    def _write_to_pe(request_id, pe):
        # Zero-pattern write of ``nbytes`` to the given PE of sip 0 / cube 0.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=nbytes,
            pattern="zero",
            target_pe=pe,
        )

    # Isolated reference run.
    solo_engine = _engine()
    solo_handle = solo_engine.submit(_write_to_pe("mag-single", 0))
    solo_engine.wait(solo_handle)
    _, solo_trace = solo_engine.get_completion(solo_handle)
    single_ns = solo_trace["total_ns"]

    # Concurrent pair on one engine, both submitted before any wait.
    shared_engine = _engine()
    pair = [_write_to_pe("mag-a", 0), _write_to_pe("mag-b", 1)]
    handles = [shared_engine.submit(message) for message in pair]
    for handle in handles:
        shared_engine.wait(handle)
    traces = [shared_engine.get_completion(handle)[1] for handle in handles]

    delta = max(trace["total_ns"] for trace in traces) - single_ns

    # Contention must exist at all.
    assert delta > 0, f"Expected positive BW contention delta, got {delta:.4f}"

    # NOTE(review): 768.0 is taken to be the fastest link bandwidth in GB/s
    # (the original note says PCIe), giving roughly 5.3 ns for 4096 bytes —
    # confirm against topology.yaml. Only half of that figure is required.
    min_expected_delta = nbytes / 768.0
    assert delta >= min_expected_delta * 0.5, (
        f"BW contention delta ({delta:.2f}ns) too small, "
        f"expected >= ~{min_expected_delta:.2f}ns"
    )
|
|
|
|
|
|
# ── 3. Independent paths: no cross-contention ────────────────────
|
|
|
|
|
|
def test_independent_paths_no_contention():
    """A write to cube 0 is barely slowed by a concurrent write to cube 3.

    A baseline engine runs one 4096-byte write to cube 0. A second engine
    runs that write concurrently with an identical write to cube 3. The
    cube 0 latency in the concurrent run may exceed the baseline by less
    than 50% of the baseline.

    Only the cube 0 latency is asserted on; the cube 3 write is run to
    completion but its trace is not inspected.

    NOTE(review): the original notes state that cube 0 and cube 3 sit in
    different columns of a 4x4 mesh, so their paths diverge after IO_CPU —
    confirm against topology.yaml.
    """

    def _write_to_cube(request_id, cube):
        # 4 KiB zero-pattern write to PE 0 of the given cube on sip 0.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=cube,
            dst_pe=0,
            dst_pa=_hbm_pa(sip=0, cube=cube, pe_id=0),
            nbytes=4096,
            pattern="zero",
            target_pe=0,
        )

    # Baseline: cube 0 alone.
    solo_engine = _engine()
    solo_handle = solo_engine.submit(_write_to_cube("indep-single", 0))
    solo_engine.wait(solo_handle)
    _, solo_trace = solo_engine.get_completion(solo_handle)
    single_ns_cube0 = solo_trace["total_ns"]

    # Concurrent: cube 0 and cube 3 submitted back to back on one engine.
    shared_engine = _engine()
    write_cube0 = _write_to_cube("indep-a", 0)
    write_cube3 = _write_to_cube("indep-b", 3)
    handle_cube0 = shared_engine.submit(write_cube0)
    handle_cube3 = shared_engine.submit(write_cube3)
    shared_engine.wait(handle_cube0)
    shared_engine.wait(handle_cube3)
    _, trace_cube0 = shared_engine.get_completion(handle_cube0)
    # The cube 3 completion is fetched, as before, but intentionally unused.
    shared_engine.get_completion(handle_cube3)

    # Some slowdown is tolerated (the original notes attribute it to shared
    # early links), but it must stay under half the isolated latency.
    delta_cube0 = trace_cube0["total_ns"] - single_ns_cube0
    assert delta_cube0 < single_ns_cube0 * 0.5, (
        f"Independent path cube0 delta ({delta_cube0:.2f}ns) too large; "
        "paths should mostly be independent"
    )
|
|
|
|
|
|
# ── 4. Response messages: no BW occupancy ────────────────────────
|
|
|
|
|
|
def test_response_no_bw_occupancy():
    """Two concurrent writes to one cube both complete successfully.

    Submits two 4096-byte writes (PE 0 and PE 2 of cube 0) to one engine,
    waits for both, and asserts that each completion reports ``ok is True``.

    Per the original test notes, responses carry ``nbytes=0`` and should not
    occupy link BW on the reverse path. This test does not measure latency,
    so it only guards against the pair failing to complete.
    """

    def _write_to_pe(request_id, pe):
        # 4 KiB zero-pattern write to the given PE of sip 0 / cube 0.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=4096,
            pattern="zero",
            target_pe=pe,
        )

    engine = _engine()
    pair = [_write_to_pe("resp-a", 0), _write_to_pe("resp-b", 2)]
    handles = [engine.submit(message) for message in pair]
    for handle in handles:
        engine.wait(handle)
    completions = [engine.get_completion(handle)[0] for handle in handles]

    for completion in completions:
        assert completion.ok is True
|
|
|
|
|
|
# ── 5. PE DMA: formula is a lower bound of actual for single txn ──
|
|
|
|
|
|
def test_pe_dma_formula_lower_bound():
    """PE DMA single transaction: formula latency must not exceed actual.

    Submits one 4096-byte ``PeDmaMsg`` from sip 0 / cube 0 / PE 0 to that
    same PE's HBM address on a fresh engine, then compares the trace's
    ``formula_ns`` entry against ``total_ns``.

    Per the original test notes, PE DMA routes through the NOC, whose
    internal mesh traversal latency is not captured by the formula, so the
    formula is expected to be a lower bound (formula <= actual) —
    NOTE(review): confirm against the engine implementation.

    If the trace has no ``formula_ns`` entry the bound check is skipped and
    only ``total_ns > 0`` is asserted.
    """
    # Use the module's shared engine factory, like the other tests here.
    engine = _engine()

    msg = PeDmaMsg(
        correlation_id="bw",
        request_id="formula-check",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
    )
    h = engine.submit(msg)
    engine.wait(h)
    _, trace = engine.get_completion(h)
    actual = trace["total_ns"]
    formula = trace.get("formula_ns")

    if formula is not None:
        # 0.01 ns of slack; presumably there to absorb float rounding.
        assert formula <= actual + 0.01, (
            f"PE DMA formula ({formula:.4f}) must be <= actual ({actual:.4f})"
        )
    assert actual > 0
|
|
|
|
|
|
# ── 6. Existing probe invariants preserved ───────────────────────
|
|
|
|
|
|
def test_h2d_monotonicity_preserved():
    """Isolated H2D write latency is non-decreasing across cubes 0, 4, 8.

    Each cube is measured on its own fresh engine with a single 4096-byte
    write to PE 0. Every adjacent pair in the order 0, 4, 8 must satisfy
    ``latency[earlier] <= latency[later]``.

    NOTE(review): the original notes describe these cubes as 1, 2 and 3 hops
    away — confirm against topology.yaml.
    """

    def _isolated_write_ns(cube):
        # One write to PE 0 of ``cube`` on a brand-new engine.
        engine = _engine()
        write = MemoryWriteMsg(
            correlation_id="bw",
            request_id=f"mono-c{cube}",
            dst_sip=0,
            dst_cube=cube,
            dst_pe=0,
            dst_pa=_hbm_pa(sip=0, cube=cube, pe_id=0),
            nbytes=4096,
            pattern="zero",
            target_pe=0,
        )
        handle = engine.submit(write)
        engine.wait(handle)
        _, trace = engine.get_completion(handle)
        return trace["total_ns"]

    cubes = [0, 4, 8]
    samples = [(cube, _isolated_write_ns(cube)) for cube in cubes]

    # Compare each measurement with the one that follows it.
    for (near_cube, near_ns), (far_cube, far_ns) in zip(samples, samples[1:]):
        assert near_ns <= far_ns, (
            f"Monotonicity: cube{near_cube}({near_ns:.2f}) "
            f"must <= cube{far_cube}({far_ns:.2f})"
        )
|
|
|
|
|
|
# ── 7. BW contention scales with payload size ───────────────────
|
|
|
|
|
|
def test_bw_contention_scales_with_nbytes():
    """Contention delay grows with payload size.

    For each of two payload sizes (4096 and 65536 bytes) the test measures
    ``delta = max(concurrent pair) - single`` using writes to PE 0 and PE 1
    of cube 0, then asserts ``delta_64k > 2.0 * delta_4k``.

    Per the original test notes, link occupancy is ``nbytes / bw`` while
    component-level contention is fixed regardless of size, so a delta that
    does not grow with payload indicates that link-level BW occupancy is not
    being modelled.
    """

    def _write(tag, pe, nbytes):
        # Zero-pattern write of ``nbytes`` to the given PE of sip 0 / cube 0.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=f"scale-{tag}-{nbytes}",
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=nbytes,
            pattern="zero",
            target_pe=pe,
        )

    def _solo_and_contended_ns(nbytes: int) -> tuple[float, float]:
        """Return (single_ns, concurrent_max_ns) for the given payload."""
        # Isolated run on its own engine.
        solo_engine = _engine()
        solo_handle = solo_engine.submit(_write("s", 0, nbytes))
        solo_engine.wait(solo_handle)
        _, solo_trace = solo_engine.get_completion(solo_handle)

        # Concurrent pair: both submitted before either is waited on.
        shared_engine = _engine()
        pair = [_write("a", 0, nbytes), _write("b", 1, nbytes)]
        handles = [shared_engine.submit(message) for message in pair]
        for handle in handles:
            shared_engine.wait(handle)
        traces = [shared_engine.get_completion(handle)[1] for handle in handles]

        slowest_ns = max(trace["total_ns"] for trace in traces)
        return solo_trace["total_ns"], slowest_ns

    single_4k, max_4k = _solo_and_contended_ns(4096)
    single_64k, max_64k = _solo_and_contended_ns(65536)

    delta_4k = max_4k - single_4k
    delta_64k = max_64k - single_64k

    # The payload is 16x larger; the test only demands more than a 2x delta.
    assert delta_64k > delta_4k * 2.0, (
        "BW contention must scale with payload: "
        f"delta_64k ({delta_64k:.2f}ns) should be >> delta_4k ({delta_4k:.2f}ns)"
    )
|