Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep
- Probe CLI: restructured output (tables first, routes below), per-hop timestamps, split cross-cube into best/worst cases, D2H read section - UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix cross-cube-best < cross-half latency inversion - HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing effective BW from 256 to 204.8 GB/s - Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases - Probe default data size: 4KB -> 32KB for more realistic measurements - IOChiplet NOC + D2H topology and tests - NOC mesh, xbar, BW occupancy components and tests - Cube mesh visualization diagram 278 tests pass. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,320 @@
|
||||
"""Tests for IOChiplet NOC + D2H (combined #3+#4).
|
||||
|
||||
Validates:
|
||||
- IOChiplet topology: io_noc, io_ucie PHY, conn nodes
|
||||
- H2D MemoryWrite: data flows pcie_ep → io_noc → cube → hbm (m_cpu bypass)
|
||||
- D2H MemoryRead: data flows hbm → cube → io_noc → pcie_ep (host drain)
|
||||
- KernelLaunch: still routes through m_cpu → PE
|
||||
- Latency invariants preserved
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
from kernbench.policy.address.phyaddr import PhysAddr
|
||||
from kernbench.policy.routing.router import AddressResolver, PathRouter
|
||||
from kernbench.runtime_api.kernel import (
|
||||
KernelLaunchMsg,
|
||||
KernelRef,
|
||||
MemoryReadMsg,
|
||||
MemoryWriteMsg,
|
||||
TensorArg,
|
||||
TensorArgShard,
|
||||
)
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import load_topology
|
||||
|
||||
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
|
||||
|
||||
def _graph():
|
||||
return load_topology(TOPOLOGY_PATH)
|
||||
|
||||
|
||||
def _engine():
|
||||
return GraphEngine(_graph())
|
||||
|
||||
|
||||
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
|
||||
slice_bytes = 48 * (1 << 30) // 8
|
||||
pa = PhysAddr.pe_hbm_addr(
|
||||
rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id,
|
||||
pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
|
||||
)
|
||||
return pa.encode()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# 1. IOChiplet Topology Structure
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
def test_io_chiplet_has_noc_node():
|
||||
"""Each IOChiplet instance must have an io_noc node."""
|
||||
graph = _graph()
|
||||
assert "sip0.io0.noc" in graph.nodes, "io_noc node missing"
|
||||
node = graph.nodes["sip0.io0.noc"]
|
||||
assert node.kind == "io_noc"
|
||||
|
||||
|
||||
def test_io_chiplet_has_ucie_phy_nodes():
|
||||
"""Each IOChiplet PHY must exist as a separate node."""
|
||||
graph = _graph()
|
||||
for phy in ["P0", "P1", "P2", "P3"]:
|
||||
node_id = f"sip0.io0.ucie-{phy}"
|
||||
assert node_id in graph.nodes, f"io_ucie PHY node {node_id} missing"
|
||||
|
||||
|
||||
def test_io_chiplet_has_conn_nodes():
|
||||
"""Each IOChiplet PHY must have conn nodes (NOC ↔ conn ↔ io_ucie pattern)."""
|
||||
graph = _graph()
|
||||
for phy in ["P0", "P1", "P2", "P3"]:
|
||||
for ci in range(4): # n_connections=4
|
||||
conn_id = f"sip0.io0.ucie-{phy}.conn{ci}"
|
||||
assert conn_id in graph.nodes, f"conn node {conn_id} missing"
|
||||
|
||||
|
||||
def test_io_noc_connects_to_pcie_ep():
|
||||
"""pcie_ep must connect to io_noc (bidirectional)."""
|
||||
graph = _graph()
|
||||
edge_set = {(e.src, e.dst) for e in graph.edges}
|
||||
assert ("sip0.io0.pcie_ep", "sip0.io0.noc") in edge_set
|
||||
assert ("sip0.io0.noc", "sip0.io0.pcie_ep") in edge_set
|
||||
|
||||
|
||||
def test_io_noc_connects_to_io_cpu():
|
||||
"""io_cpu must connect to io_noc (bidirectional)."""
|
||||
graph = _graph()
|
||||
edge_set = {(e.src, e.dst) for e in graph.edges}
|
||||
assert ("sip0.io0.io_cpu", "sip0.io0.noc") in edge_set
|
||||
assert ("sip0.io0.noc", "sip0.io0.io_cpu") in edge_set
|
||||
|
||||
|
||||
def test_io_noc_connects_to_conn_nodes():
|
||||
"""io_noc must connect to conn nodes (per PHY, bidirectional)."""
|
||||
graph = _graph()
|
||||
edge_set = {(e.src, e.dst) for e in graph.edges}
|
||||
conn_id = "sip0.io0.ucie-P0.conn0"
|
||||
assert ("sip0.io0.noc", conn_id) in edge_set
|
||||
assert (conn_id, "sip0.io0.noc") in edge_set
|
||||
|
||||
|
||||
def test_no_direct_io_cpu_to_cube_edges():
|
||||
"""io_cpu must NOT have direct edges to cube UCIe ports (replaced by io_noc path)."""
|
||||
graph = _graph()
|
||||
for e in graph.edges:
|
||||
if e.src == "sip0.io0.io_cpu" and "cube" in e.dst:
|
||||
raise AssertionError(
|
||||
f"Direct io_cpu→cube edge found: {e.src} → {e.dst}. "
|
||||
f"All cube traffic should route through io_noc."
|
||||
)
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# 2. H2D MemoryWrite (data path: pcie_ep → io_noc → cube → hbm)
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
def test_h2d_write_completes():
|
||||
"""H2D MemoryWrite must complete with ok=True and positive latency."""
|
||||
engine = _engine()
|
||||
msg = MemoryWriteMsg(
|
||||
correlation_id="noc", request_id="h2d-w",
|
||||
dst_sip=0, dst_cube=0, dst_pe=0,
|
||||
dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
|
||||
pattern="zero", target_pe=0,
|
||||
)
|
||||
h = engine.submit(msg)
|
||||
engine.wait(h)
|
||||
comp, trace = engine.get_completion(h)
|
||||
assert comp.ok is True
|
||||
assert trace["total_ns"] > 0
|
||||
|
||||
|
||||
def test_h2d_write_cross_cube_completes():
|
||||
"""H2D MemoryWrite to remote cube must complete."""
|
||||
engine = _engine()
|
||||
msg = MemoryWriteMsg(
|
||||
correlation_id="noc", request_id="h2d-remote",
|
||||
dst_sip=0, dst_cube=4, dst_pe=0,
|
||||
dst_pa=_hbm_pa(sip=0, cube=4, pe_id=0), nbytes=4096,
|
||||
pattern="zero", target_pe=0,
|
||||
)
|
||||
h = engine.submit(msg)
|
||||
engine.wait(h)
|
||||
comp, trace = engine.get_completion(h)
|
||||
assert comp.ok is True
|
||||
assert trace["total_ns"] > 0
|
||||
|
||||
|
||||
def test_h2d_write_deterministic():
|
||||
"""Same H2D write on two engines must produce identical latency."""
|
||||
msg = MemoryWriteMsg(
|
||||
correlation_id="noc", request_id="h2d-det",
|
||||
dst_sip=0, dst_cube=0, dst_pe=0,
|
||||
dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
|
||||
pattern="zero", target_pe=0,
|
||||
)
|
||||
e1, e2 = _engine(), _engine()
|
||||
h1 = e1.submit(msg)
|
||||
e1.wait(h1)
|
||||
_, t1 = e1.get_completion(h1)
|
||||
|
||||
h2 = e2.submit(msg)
|
||||
e2.wait(h2)
|
||||
_, t2 = e2.get_completion(h2)
|
||||
|
||||
assert t1["total_ns"] == t2["total_ns"]
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# 3. D2H MemoryRead (data path: hbm → cube → io_noc → pcie_ep)
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
def test_d2h_read_completes():
|
||||
"""D2H MemoryRead must complete with ok=True and positive latency."""
|
||||
engine = _engine()
|
||||
msg = MemoryReadMsg(
|
||||
correlation_id="noc", request_id="d2h-r",
|
||||
src_sip=0, src_cube=0, src_pe=0,
|
||||
src_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
|
||||
)
|
||||
h = engine.submit(msg)
|
||||
engine.wait(h)
|
||||
comp, trace = engine.get_completion(h)
|
||||
assert comp.ok is True
|
||||
assert trace["total_ns"] > 0
|
||||
|
||||
|
||||
def test_d2h_read_includes_host_drain():
|
||||
"""D2H MemoryRead latency must include host-side drain.
|
||||
|
||||
Read data (nbytes>0) flows hbm → ... → pcie_ep with BW occupancy.
|
||||
D2H should take longer than H2D for the same address because:
|
||||
- H2D: data pcie_ep→hbm (forward only)
|
||||
- D2H: command pcie_ep→hbm (forward) + data hbm→pcie_ep (reverse, with nbytes)
|
||||
"""
|
||||
engine_w = _engine()
|
||||
msg_w = MemoryWriteMsg(
|
||||
correlation_id="noc", request_id="drain-w",
|
||||
dst_sip=0, dst_cube=0, dst_pe=0,
|
||||
dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
|
||||
pattern="zero", target_pe=0,
|
||||
)
|
||||
hw = engine_w.submit(msg_w)
|
||||
engine_w.wait(hw)
|
||||
_, tw = engine_w.get_completion(hw)
|
||||
|
||||
engine_r = _engine()
|
||||
msg_r = MemoryReadMsg(
|
||||
correlation_id="noc", request_id="drain-r",
|
||||
src_sip=0, src_cube=0, src_pe=0,
|
||||
src_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
|
||||
)
|
||||
hr = engine_r.submit(msg_r)
|
||||
engine_r.wait(hr)
|
||||
_, tr = engine_r.get_completion(hr)
|
||||
|
||||
# D2H read should include reverse data path + host drain
|
||||
# so it should be >= H2D write latency
|
||||
assert tr["total_ns"] >= tw["total_ns"] * 0.8, (
|
||||
f"D2H read ({tr['total_ns']:.2f}ns) should be comparable to or "
|
||||
f"greater than H2D write ({tw['total_ns']:.2f}ns) due to host drain"
|
||||
)
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# 4. KernelLaunch (still routes through m_cpu)
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
def test_kernel_launch_still_works():
|
||||
"""KernelLaunch must still complete via m_cpu → PE path."""
|
||||
from kernbench.triton_emu.registry import clear_registry, register_kernel
|
||||
|
||||
clear_registry()
|
||||
|
||||
def gemm_kernel(a_ptr, tl):
|
||||
a = tl.load(a_ptr, shape=(4, 4), dtype="f16")
|
||||
tl.store(a_ptr, a)
|
||||
|
||||
register_kernel("gemm", gemm_kernel)
|
||||
|
||||
engine = _engine()
|
||||
shard0 = TensorArgShard(
|
||||
sip=0, cube=0, pe=0,
|
||||
pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0,
|
||||
)
|
||||
msg = KernelLaunchMsg(
|
||||
correlation_id="noc", request_id="kern",
|
||||
kernel_ref=KernelRef(name="gemm", kind="builtin"),
|
||||
args=(TensorArg(shards=(shard0,)),),
|
||||
)
|
||||
h = engine.submit(msg)
|
||||
engine.wait(h)
|
||||
comp, trace = engine.get_completion(h)
|
||||
assert comp.ok is True
|
||||
assert trace["total_ns"] > 0
|
||||
clear_registry()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# 5. Latency Invariants
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
def test_h2d_latency_monotonicity():
|
||||
"""H2D write: closer cube = lower latency (1hop < 2hop < 3hop)."""
|
||||
cubes = [0, 4, 8]
|
||||
latencies = []
|
||||
for cube in cubes:
|
||||
engine = _engine()
|
||||
msg = MemoryWriteMsg(
|
||||
correlation_id="noc", request_id=f"mono-c{cube}",
|
||||
dst_sip=0, dst_cube=cube, dst_pe=0,
|
||||
dst_pa=_hbm_pa(sip=0, cube=cube, pe_id=0), nbytes=4096,
|
||||
pattern="zero", target_pe=0,
|
||||
)
|
||||
h = engine.submit(msg)
|
||||
engine.wait(h)
|
||||
_, t = engine.get_completion(h)
|
||||
latencies.append(t["total_ns"])
|
||||
|
||||
for i in range(len(latencies) - 1):
|
||||
assert latencies[i] < latencies[i + 1], (
|
||||
f"Monotonicity: cube{cubes[i]}({latencies[i]:.2f}) "
|
||||
f"must < cube{cubes[i+1]}({latencies[i+1]:.2f})"
|
||||
)
|
||||
|
||||
|
||||
def test_h2d_path_includes_io_noc():
|
||||
"""H2D path from pcie_ep to hbm must traverse io_noc."""
|
||||
graph = _graph()
|
||||
resolver = AddressResolver(graph)
|
||||
router = PathRouter(graph)
|
||||
|
||||
pcie_ep = resolver.find_pcie_ep(0)
|
||||
pa = _hbm_pa(sip=0, cube=0, pe_id=0)
|
||||
hbm_target = resolver.resolve(PhysAddr.decode(pa))
|
||||
|
||||
path = router.find_memory_path(pcie_ep, hbm_target)
|
||||
assert "sip0.io0.noc" in path, (
|
||||
f"H2D path must include io_noc. Path: {path}"
|
||||
)
|
||||
|
||||
|
||||
def test_h2d_path_excludes_m_cpu():
|
||||
"""H2D MemoryWrite path must NOT include m_cpu (direct to hbm)."""
|
||||
graph = _graph()
|
||||
resolver = AddressResolver(graph)
|
||||
router = PathRouter(graph)
|
||||
|
||||
pcie_ep = resolver.find_pcie_ep(0)
|
||||
pa = _hbm_pa(sip=0, cube=0, pe_id=0)
|
||||
hbm_target = resolver.resolve(PhysAddr.decode(pa))
|
||||
|
||||
path = router.find_memory_path(pcie_ep, hbm_target)
|
||||
m_cpu_nodes = [n for n in path if "m_cpu" in n]
|
||||
assert len(m_cpu_nodes) == 0, (
|
||||
f"H2D MemoryWrite path must not include m_cpu. "
|
||||
f"Path: {path}, m_cpu nodes: {m_cpu_nodes}"
|
||||
)
|
||||
Reference in New Issue
Block a user