Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep
- Probe CLI: restructured output (tables first, routes below), per-hop timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+119
-7
@@ -7,7 +7,7 @@ from pathlib import Path
|
||||
|
||||
from kernbench.policy.address.phyaddr import PhysAddr
|
||||
from kernbench.policy.routing.router import AddressResolver, PathRouter
|
||||
from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
|
||||
from kernbench.runtime_api.kernel import MemoryReadMsg, MemoryWriteMsg, PeDmaMsg
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import load_topology
|
||||
|
||||
@@ -189,16 +189,16 @@ def test_pe_dma_local_completes():
|
||||
assert trace["total_ns"] > 0
|
||||
|
||||
|
||||
def test_pe_dma_local_bottleneck_hbm():
    """PE DMA pe0→slice0 (local): bottleneck = HBM effective BW (256 * 0.8 = 204.8)."""
    import math

    bn = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=0)
    # The expected value is a product of floats (256 * 0.8), so compare with a
    # tolerance instead of relying on bit-exact equality.
    assert math.isclose(bn, 204.8), f"Local PE DMA bottleneck {bn}, expected 204.8"
|
||||
|
||||
|
||||
def test_pe_dma_same_half_bottleneck_hbm():
    """PE DMA pe0→slice1 (same half via xbar_top): bottleneck = HBM effective BW."""
    import math

    bn = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=1)
    # 204.8 is the derived value 256 * 0.8; compare with a tolerance rather
    # than exact float equality.
    assert math.isclose(bn, 204.8), f"Same-half PE DMA bottleneck {bn}, expected 204.8"
|
||||
|
||||
|
||||
def test_pe_dma_deterministic():
|
||||
@@ -219,3 +219,115 @@ def test_pe_dma_deterministic():
|
||||
_, t2 = e2.get_completion(h2)
|
||||
|
||||
assert t1["total_ns"] == t2["total_ns"]
|
||||
|
||||
|
||||
# ── 7. PE DMA cross-cube best vs worst ──────────────────────────
|
||||
|
||||
|
||||
def _pe_dma_cross_cube_latency(dst_cube: int) -> float:
    """Submit one 4096-byte PE DMA from sip0/cube0/pe0 and return its ``total_ns``.

    The destination address is whatever ``_hbm_pa(sip=0, cube=dst_cube, pe_id=0)``
    yields; a fresh engine from ``_engine()`` is used for every call.
    """
    sim = _engine()
    request = PeDmaMsg(
        correlation_id="probe",
        request_id=f"dma-cross-c{dst_cube}",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=dst_cube, pe_id=0),
        nbytes=4096,
    )
    handle = sim.submit(request)
    sim.wait(handle)
    # Only the trace (second element of the completion pair) is needed.
    _result, completion_trace = sim.get_completion(handle)
    return completion_trace["total_ns"]
|
||||
|
||||
|
||||
def test_pe_cross_cube_best_worst():
    """DMA latency to cube1 (best case) must be strictly below latency to cube15 (worst case)."""
    latency_by_cube = {
        cube: _pe_dma_cross_cube_latency(dst_cube=cube) for cube in (1, 15)
    }
    best = latency_by_cube[1]
    worst = latency_by_cube[15]
    assert best < worst, (
        f"Best (cube1) {best:.2f}ns must < worst (cube15) {worst:.2f}ns"
    )
|
||||
|
||||
|
||||
# ── 8. Probe timestamp trace ──────────────────────────────────
|
||||
|
||||
|
||||
def test_probe_timestamp_trace():
    """``_hop_timestamps`` yields one entry per path node with non-decreasing times.

    The path runs from ``sip0.cube0.pe0`` to the node the resolver maps
    ``_hbm_pa(sip=0, cube=0, pe_id=0)`` to; element ``[1]`` of each entry is
    the value compared between consecutive hops.
    """
    from kernbench.cli.probe import _hop_timestamps, _build_edge_map

    topo = _graph()
    edges = _build_edge_map(topo)
    addr_resolver = AddressResolver(topo)
    path_router = PathRouter(topo)

    target = addr_resolver.resolve(PhysAddr.decode(_hbm_pa(sip=0, cube=0, pe_id=0)))
    route = path_router.find_path("sip0.cube0.pe0", target)

    stamps = _hop_timestamps(route, 4096, edges, topo)
    assert len(stamps) == len(route)

    # Walk consecutive pairs; ``hop`` is the index of the earlier entry.
    for nxt in range(1, len(stamps)):
        hop = nxt - 1
        assert stamps[hop][1] <= stamps[nxt][1], (
            f"Timestamps not monotonic at hop {hop}: "
            f"{stamps[hop][1]:.4f} > {stamps[nxt][1]:.4f}"
        )
|
||||
|
||||
|
||||
# ── 9. D2H Read latency monotonicity ────────────────────────────
|
||||
|
||||
|
||||
def _d2h_latency(src_cube: int) -> float:
    """Submit one 4096-byte ``MemoryReadMsg`` for *src_cube* and return its ``total_ns``.

    The read targets the address from ``_hbm_pa(sip=0, cube=src_cube, pe_id=0)``
    and runs on a fresh engine from ``_engine()``.
    """
    sim = _engine()
    read_request = MemoryReadMsg(
        correlation_id="probe",
        request_id=f"d2h-c{src_cube}",
        src_sip=0,
        src_cube=src_cube,
        src_pe=0,
        src_pa=_hbm_pa(sip=0, cube=src_cube, pe_id=0),
        nbytes=4096,
    )
    handle = sim.submit(read_request)
    sim.wait(handle)
    # The completion is a pair; only the trace part carries the latency.
    _result, completion_trace = sim.get_completion(handle)
    return completion_trace["total_ns"]
|
||||
|
||||
|
||||
def test_d2h_latency_monotonic():
    """D2H read latency strictly increases across cubes 0, 4, 8, 12.

    The original author labels these as the 1-, 2-, 3- and 4-hop cases.
    """
    # Measure everything first, then compare neighbours in the list.
    measured = [(cube, _d2h_latency(cube)) for cube in (0, 4, 8, 12)]
    for (near_cube, near_ns), (far_cube, far_ns) in zip(measured, measured[1:]):
        assert near_ns < far_ns, (
            f"cube{near_cube}({near_ns:.2f}) "
            f"must < cube{far_cube}({far_ns:.2f})"
        )
|
||||
|
||||
|
||||
def test_d2h_latency_gte_h2d():
    """D2H read latency must be at least 80% of H2D write latency for the same cube.

    Despite the test name, this is a lower bound with slack: the check is
    ``d2h >= h2d * 0.8`` for cubes 0, 4 and 8, not a strict ``d2h >= h2d``.
    """
    # Fraction of the H2D latency that the D2H latency must reach.
    min_ratio = 0.8
    for cube in [0, 4, 8]:
        h2d = _h2d_latency(dst_cube=cube, dst_pe=0)
        d2h = _d2h_latency(src_cube=cube)
        assert d2h >= h2d * min_ratio, (
            f"cube{cube}: D2H ({d2h:.2f}ns) should be >= 80% of H2D ({h2d:.2f}ns)"
        )
|
||||
|
||||
|
||||
# ── 10. HBM efficiency applied ──────────────────────────────────
|
||||
|
||||
|
||||
def test_hbm_efficiency_applied():
    """HBM edge BW should reflect efficiency factor (256 * 0.8 = 204.8).

    Looks up the ``sip0.cube0.xbar_top`` -> ``sip0.cube0.hbm_ctrl.slice0`` edge
    in the graph returned by ``_graph()`` and checks its ``bw_gbs``.
    """
    import math

    graph = _graph()
    edges_by_endpoints = {(edge.src, edge.dst): edge for edge in graph.edges}
    hbm_edge = edges_by_endpoints.get(
        ("sip0.cube0.xbar_top", "sip0.cube0.hbm_ctrl.slice0")
    )
    assert hbm_edge is not None, "xbar_top -> hbm_ctrl.slice0 edge missing"
    # The bandwidth is a derived float (256 * 0.8); compare with a tolerance
    # so the test does not depend on how the product was computed.
    assert math.isclose(hbm_edge.bw_gbs, 204.8), (
        f"HBM edge BW {hbm_edge.bw_gbs}, expected 204.8 (256*0.8)"
    )
|
||||
|
||||
|
||||
# ── 11. Sweep saturation ──────────────────────────────────────
|
||||
|
||||
|
||||
def test_probe_sweep_saturation():
    """Last sweep point must beat the first and exceed 99% utilization.

    Per the assertion messages, the first entry of ``_sweep_util``'s result is
    the 4KB point and the last is the 1MB point (pe-local-hbm case).
    """
    from kernbench.cli.probe import _sweep_util

    # pe-local-hbm: ovhd=2ns (xbar), wire~0.03ns, bn=204.8 GB/s
    overhead_ns, wire_ns, bottleneck_gbs = 2.0, 0.03, 204.8
    util = _sweep_util(overhead_ns, wire_ns, bottleneck_gbs)

    assert util[-1] > util[0], (
        f"1MB util ({util[-1]:.1f}%) must exceed 4KB util ({util[0]:.1f}%)"
    )
    assert util[-1] > 99.0, f"1MB util ({util[-1]:.1f}%) should be >99%"
|
||||
|
||||
Reference in New Issue
Block a user