commit - release 1
This commit is contained in:
@@ -0,0 +1,221 @@
|
||||
"""Tests for H2D writes and PE DMA probe latency invariants.
|
||||
|
||||
H2D tests use MemoryWriteMsg (pcie_ep → io_cpu → m_cpu → hbm_ctrl → response).
|
||||
PE DMA tests use PeDmaMsg (direct pe_dma → xbar → hbm_ctrl injection).
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
from kernbench.policy.address.phyaddr import PhysAddr
|
||||
from kernbench.policy.routing.router import AddressResolver, PathRouter
|
||||
from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import load_topology
|
||||
|
||||
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
|
||||
|
||||
def _engine():
    """Build a fresh GraphEngine from the topology file at TOPOLOGY_PATH."""
    topology = load_topology(TOPOLOGY_PATH)
    return GraphEngine(topology)
|
||||
|
||||
|
||||
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return the encoded physical address of a probe location in a PE's HBM slice.

    The address targets offset 0x1000 inside the slice owned by ``pe_id`` on
    rack 0, SIP ``sip``, cube ``cube``.
    """
    gib = 1 << 30
    # Evaluates to 6 GiB per slice; presumably a 48 GiB HBM split 8 ways --
    # TODO confirm against the topology definition.
    per_slice_bytes = 48 * gib // 8
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    )
    return address.encode()
|
||||
|
||||
|
||||
def _h2d_latency(dst_cube: int, dst_pe: int = 0) -> float:
    """Run one 4096-byte MemoryWriteMsg on a fresh engine and return its latency.

    The write targets PE ``dst_pe`` of cube ``dst_cube`` on SIP 0; the value
    returned is the ``total_ns`` entry of the completion trace.
    """
    target_pa = _hbm_pa(sip=0, cube=dst_cube, pe_id=dst_pe)
    write = MemoryWriteMsg(
        correlation_id="probe",
        request_id=f"h2d-c{dst_cube}-p{dst_pe}",
        dst_sip=0,
        dst_cube=dst_cube,
        dst_pe=dst_pe,
        dst_pa=target_pa,
        nbytes=4096,
        pattern="zero",
        target_pe=dst_pe,
    )
    sim = _engine()
    handle = sim.submit(write)
    sim.wait(handle)
    # Only the trace half of the (completion, trace) pair is needed here.
    _, trace = sim.get_completion(handle)
    return trace["total_ns"]
|
||||
|
||||
|
||||
# ── 1. Single-PE write completes ──────────────────────────────────
|
||||
|
||||
|
||||
def test_single_pe_write_completes():
    """A MemoryWriteMsg aimed at cube 0 / PE 0 finishes ok with positive latency."""
    write = MemoryWriteMsg(
        correlation_id="probe",
        request_id="pe-local",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
        pattern="zero",
        target_pe=0,
    )
    sim = _engine()
    handle = sim.submit(write)
    sim.wait(handle)
    completion, trace = sim.get_completion(handle)

    assert completion.ok is True
    assert trace["total_ns"] > 0
|
||||
|
||||
|
||||
# ── 2. Cross-cube write positive latency ─────────────────────────
|
||||
|
||||
|
||||
def test_cross_cube_write_positive():
    """A MemoryWriteMsg to cube 1 / PE 0 must report a latency above zero."""
    cross_cube_ns = _h2d_latency(dst_cube=1, dst_pe=0)
    assert cross_cube_ns > 0
|
||||
|
||||
|
||||
# ── 3. H2D latency monotonicity ──────────────────────────────────
|
||||
|
||||
|
||||
def test_h2d_latency_monotonic():
    """H2D latency must strictly increase over cubes 0, 4, 8, 12.

    Original intent: 1hop < 2hop < 3hop < 4hop.
    """
    # Measure every cube first so all probes run before any comparison fails.
    samples: list[tuple[int, float]] = [
        (cube, _h2d_latency(dst_cube=cube, dst_pe=0)) for cube in (0, 4, 8, 12)
    ]

    # Compare each measurement with its immediate successor.
    for (near_cube, near_ns), (far_cube, far_ns) in zip(samples, samples[1:]):
        assert near_ns < far_ns, (
            f"cube{near_cube}({near_ns:.2f}) "
            f"must < cube{far_cube}({far_ns:.2f})"
        )
|
||||
|
||||
|
||||
# ── 4. Single-PE write deterministic ─────────────────────────────
|
||||
|
||||
|
||||
def test_single_pe_write_deterministic():
    """The same MemoryWriteMsg on two independent engines yields equal latency."""
    write = MemoryWriteMsg(
        correlation_id="probe",
        request_id="det",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
        pattern="zero",
        target_pe=0,
    )

    # Run the identical message twice, each time on a freshly built engine.
    totals = []
    for _ in range(2):
        sim = _engine()
        handle = sim.submit(write)
        sim.wait(handle)
        _, trace = sim.get_completion(handle)
        totals.append(trace["total_ns"])

    first_ns, second_ns = totals
    assert first_ns == second_ns
|
||||
|
||||
|
||||
# ── 5. Cut-through (wormhole) wire model invariants ──────────────
|
||||
|
||||
|
||||
def test_h2d_local_cube_cut_through():
    """H2D to local cube with cut-through should be < 65ns for 4096B.

    Full command path: pcie_ep → io_cpu → ucie → noc → m_cpu
    DMA: m_cpu → noc → xbar → hbm_ctrl (drain once at terminal)
    Plus response path back.
    With store-and-forward each hop would serialize; cut-through keeps it low.
    """
    # Single source of truth for the bound, so the check and its failure
    # message cannot drift apart (the docstring previously said 50ns while
    # the assertion enforced 65ns).
    limit_ns = 65.0
    lat = _h2d_latency(dst_cube=0, dst_pe=0)
    assert lat < limit_ns, (
        f"Local H2D {lat:.2f}ns; cut-through expects < {limit_ns:.0f}ns"
    )
|
||||
|
||||
|
||||
def test_h2d_remote_cube_cut_through():
    """H2D to cube 4 (1-hop remote) must stay under 80ns with cut-through.

    Under store-and-forward every hop would serialize the 4096B payload,
    giving a total far above 100ns. With cut-through the drain happens only
    once, at the bottleneck, so drain time dominates rather than per-hop
    serialization.
    """
    remote_ns = _h2d_latency(dst_cube=4, dst_pe=0)
    within_budget = remote_ns < 80.0
    assert within_budget, f"Remote H2D {remote_ns:.2f}ns; cut-through expects < 80ns"
|
||||
|
||||
|
||||
# ── 6. PE DMA: direct injection tests ─────────────────────────
|
||||
|
||||
|
||||
def _graph():
    """Load and return the topology graph described by TOPOLOGY_PATH."""
    topology = load_topology(TOPOLOGY_PATH)
    return topology
|
||||
|
||||
|
||||
def _pe_dma_latency(src_cube: int, src_pe: int, dst_pe: int) -> float:
    """Run one 4096-byte PeDmaMsg on a fresh engine and return its latency.

    The DMA is issued by PE ``src_pe`` of cube ``src_cube`` on SIP 0 and
    targets the HBM slice of ``dst_pe`` in that same cube. The value returned
    is the ``total_ns`` entry of the completion trace.
    """
    target_pa = _hbm_pa(sip=0, cube=src_cube, pe_id=dst_pe)
    dma = PeDmaMsg(
        correlation_id="probe",
        request_id=f"dma-c{src_cube}-p{src_pe}-s{dst_pe}",
        src_sip=0,
        src_cube=src_cube,
        src_pe=src_pe,
        dst_pa=target_pa,
        nbytes=4096,
    )
    sim = _engine()
    handle = sim.submit(dma)
    sim.wait(handle)
    # Only the trace half of the (completion, trace) pair is needed here.
    _, trace = sim.get_completion(handle)
    return trace["total_ns"]
|
||||
|
||||
|
||||
def _pe_dma_bottleneck(src_cube: int, src_pe: int, dst_pe: int) -> float | None:
    """Return the lowest edge bandwidth on the routed PE-DMA path.

    The path runs from ``sip0.cube{src_cube}.pe{src_pe}`` to the node that
    the HBM address of ``dst_pe`` (same cube) resolves to. Hops with no
    matching edge, or whose ``bw_gbs`` is falsy, are ignored. Returns None
    when no hop contributes a bandwidth.
    """
    graph = _graph()
    edges_by_endpoints = {(edge.src, edge.dst): edge for edge in graph.edges}

    # Resolve the destination address to a graph node.
    encoded_pa = _hbm_pa(sip=0, cube=src_cube, pe_id=dst_pe)
    target_node = AddressResolver(graph).resolve(PhysAddr.decode(encoded_pa))

    # Route from the source PE to that node.
    source_ref = f"sip0.cube{src_cube}.pe{src_pe}"
    route = PathRouter(graph).find_path(source_ref, target_node)

    # Look up the edge for each consecutive pair of nodes along the route.
    hop_edges = (edges_by_endpoints.get(hop) for hop in zip(route, route[1:]))
    bandwidths: list[float] = [
        edge.bw_gbs for edge in hop_edges if edge and edge.bw_gbs
    ]
    return min(bandwidths, default=None)
|
||||
|
||||
|
||||
def test_pe_dma_local_completes():
    """A PeDmaMsg from cube 0 / PE 0 to its own slice finishes ok, latency > 0."""
    dma = PeDmaMsg(
        correlation_id="probe",
        request_id="dma-local",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
    )
    sim = _engine()
    handle = sim.submit(dma)
    sim.wait(handle)
    completion, trace = sim.get_completion(handle)

    assert completion.ok is True
    assert trace["total_ns"] > 0
|
||||
|
||||
|
||||
def test_pe_dma_local_bottleneck_256():
    """Local PE DMA (pe0 → slice0) must have a 256 GB/s path bottleneck.

    Original note: direct xbar→hbm.
    """
    bottleneck = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=0)
    assert bottleneck == 256.0, (
        f"Local PE DMA bottleneck {bottleneck}, expected 256.0"
    )
|
||||
|
||||
|
||||
def test_pe_dma_chain_bottleneck_128():
    """PE DMA pe0 → slice1 (xbar chain) must have a 128 GB/s path bottleneck."""
    bottleneck = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=1)
    assert bottleneck == 128.0, (
        f"Chain PE DMA bottleneck {bottleneck}, expected 128.0"
    )
|
||||
|
||||
|
||||
def test_pe_dma_deterministic():
    """The same PeDmaMsg on two independent engines yields equal latency."""
    dma = PeDmaMsg(
        correlation_id="probe",
        request_id="det",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
    )

    # Run the identical message twice, each time on a freshly built engine.
    totals = []
    for _ in range(2):
        sim = _engine()
        handle = sim.submit(dma)
        sim.wait(handle)
        _, trace = sim.get_completion(handle)
        totals.append(trace["total_ns"])

    first_ns, second_ns = totals
    assert first_ns == second_ns
|
||||
Reference in New Issue
Block a user