Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep

- Probe CLI: restructured output (tables first, routes below), per-hop
  timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix the latency
  inversion where the cross-cube best case came out faster than cross-half
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing
  effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-19 01:16:18 -07:00
parent 6f43807900
commit d75da439c6
24 changed files with 3456 additions and 501 deletions
+122 -106
View File
@@ -1,5 +1,6 @@
from pathlib import Path
from kernbench.policy.routing.router import PathRouter
from kernbench.topology.builder import load_topology
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
@@ -15,28 +16,32 @@ def _graph():
def test_full_graph_node_count():
g = _graph()
# 1 switch
# + 2 SIPs × (1 IO × 2 comps + 16 cubes × (cube_comps + 8 PEs × 6 pe_comps))
# + 2 SIPs × (1 IO × (3 comps + 4 io_ucie + 16 io_conn)
# + 16 cubes × (cube_comps + 8 PEs × 6 pe_comps))
# IO: pcie_ep + io_cpu + io_noc + 4 io_ucie + 4*4 io_conn = 23
# cube_comps: 9 (noc, m_cpu, sram, 2 bridge, 4 ucie)
# + 8 xbar.pe{0..7} [replaced xbar.top/xbar.bottom]
# + 8 hbm_slices = 25
# = 1 + 2*(2 + 16*(25+48)) = 1 + 2*(2+1168) = 1 + 2340 = 2341
assert len(g.nodes) == 2341
# + 16 ucie_conn (4 ports × 4 connections)
# + 2 xbar_top/bot
# + 8 hbm_slices = 35
# = 1 + 2*(23 + 16*(35+48)) = 1 + 2*(23+1328) = 1 + 2702 = 2703
assert len(g.nodes) == 2703
def test_full_graph_edge_count():
g = _graph()
# Per cube: 144 (88 cube-fabric + 56 PE-internal)
# cube-fabric: 8 pe→xbar.pe + 8 pe→noc + 8 noc→pe_cpu
# + 8 xbar.pe→slice + 8 slice→xbar.pe (bidirectional for response)
# + 12 xbar chain (3 pairs × 2 dir × 2 halves)
# + 8 xbar.pe↔bridge (pe0↔bL, pe4↔bL, pe3↔bR, pe7↔bR, ×2 dir each)
# + 4 noc→ucie + 4 ucie→noc (bidirectional)
# + 8 noc→xbar.pe + 8 xbar.pe→noc (bidirectional for response)
# + 1 m_cpu→noc + 1 noc→m_cpu + 1 noc→sram + 1 sram→noc = 88
# Per SIP: 16*144 + 48 inter-cube(bidirectional) + 8 io↔cube(bidirectional)
# + 1 io_internal + 1 switch→io = 2362
# Total: 2 * 2362 = 4724
assert len(g.edges) == 4724
# Per cube: 168
# PE-internal: 56
# PE_DMA→noc: 8, noc→pe_cpu: 8
# xbar_top→hbm{0..3}: 4+4=8, xbar_bot→hbm{4..7}: 4+4=8
# noc↔xbar_top: 2, noc↔xbar_bot: 2
# xbar_top↔bridge.left: 2, bridge.left↔xbar_bot: 2
# xbar_top↔bridge.right: 2, bridge.right↔xbar_bot: 2
# ucie: 64, m_cpu↔noc: 2, noc↔sram: 2
# Total: 56+8+8+8+8+2+2+2+2+2+2+64+2+2 = 168
# IO edges per SIP: 77
# Per SIP: 16*168 + 48 inter-cube + 77 IO = 2813
# Total: 2 * 2813 = 5626
assert len(g.edges) == 5626
# ── Full graph: specific nodes exist ─────────────────────────────────
@@ -62,16 +67,12 @@ def test_cube_component_nodes_exist():
for name in ("noc", "m_cpu",
"bridge.left", "bridge.right",
"ucie-N", "ucie-S", "ucie-E", "ucie-W",
"sram"):
"sram", "xbar_top", "xbar_bot"):
assert f"{cp}.{name}" in g.nodes
# xbar.top/xbar.bottom replaced by per-PE xbar entry nodes
assert "sip0.cube0.xbar.top" not in g.nodes
assert "sip0.cube0.xbar.bottom" not in g.nodes
# Per-PE xbar entry nodes no longer exist
for pe in range(8):
node_id = f"{cp}.xbar.pe{pe}"
assert node_id in g.nodes, f"{node_id} missing"
assert g.nodes[node_id].kind == "xbar"
# HBM slices (one per PE)
assert f"{cp}.xbar.pe{pe}" not in g.nodes
# HBM slices
for s in range(8):
assert f"{cp}.hbm_ctrl.slice{s}" in g.nodes
assert g.nodes[f"{cp}.hbm_ctrl.slice{s}"].kind == "hbm_ctrl"
@@ -131,9 +132,9 @@ def test_inter_cube_ucie_edges():
def test_io_to_cube_edges():
es = _edge_set(_graph())
# io0 connects to cubes (0,0)..(3,0) on N side
assert ("sip0.io0.io_cpu", "sip0.cube0.ucie-N") in es
assert ("sip0.io0.io_cpu", "sip0.cube3.ucie-N") in es
# io0 connects io_ucie PHYs to cube UCIe ports on N side
assert ("sip0.io0.ucie-P0", "sip0.cube0.ucie-N") in es
assert ("sip0.io0.ucie-P3", "sip0.cube3.ucie-N") in es
def test_switch_to_io_edges():
@@ -142,15 +143,15 @@ def test_switch_to_io_edges():
assert ("fabric.switch0", "sip1.io0.pcie_ep") in es
def test_pe_to_xbar_edges():
def test_pe_dma_to_noc_only():
"""PE_DMA connects only to NOC (no direct xbar connection)."""
es = _edge_set(_graph())
cp = "sip0.cube0"
# Each PE connects to its own xbar entry (per-PE chain model)
for pe in range(8):
assert (f"{cp}.pe{pe}.pe_dma", f"{cp}.xbar.pe{pe}") in es
# Old shared xbar.top/bottom edges must NOT exist
assert (f"{cp}.pe0.pe_dma", f"{cp}.xbar.top") not in es
assert (f"{cp}.pe4.pe_dma", f"{cp}.xbar.bottom") not in es
assert (f"{cp}.pe{pe}.pe_dma", f"{cp}.noc") in es
# No direct pe_dma → xbar edges
assert (f"{cp}.pe{pe}.pe_dma", f"{cp}.xbar_top") not in es
assert (f"{cp}.pe{pe}.pe_dma", f"{cp}.xbar_bot") not in es
def test_command_path_m_cpu_noc_pe_cpu():
@@ -176,17 +177,17 @@ def test_pe_internal_edges():
assert (f"{pp}.pe_math", f"{pp}.pe_tcm") in es
def test_xbar_to_hbm_slice_edges():
"""Each xbar.pe{i} connects only to its own (local) HBM slice."""
def test_xbar_top_bot_to_hbm_slice_edges():
"""xbar_top connects to slices 0-3, xbar_bot to slices 4-7."""
es = _edge_set(_graph())
cp = "sip0.cube0"
# xbar.pe_i -> slice_i only (local Y-direction access)
for pe in range(8):
assert (f"{cp}.xbar.pe{pe}", f"{cp}.hbm_ctrl.slice{pe}") in es
# Negative: xbar.pe_i must NOT directly connect to a different slice
assert (f"{cp}.xbar.pe0", f"{cp}.hbm_ctrl.slice1") not in es
assert (f"{cp}.xbar.pe0", f"{cp}.hbm_ctrl.slice4") not in es
assert (f"{cp}.xbar.pe4", f"{cp}.hbm_ctrl.slice0") not in es
for i in range(4):
assert (f"{cp}.xbar_top", f"{cp}.hbm_ctrl.slice{i}") in es
for i in range(4, 8):
assert (f"{cp}.xbar_bot", f"{cp}.hbm_ctrl.slice{i}") in es
# Negative: xbar_top must NOT connect to bottom slices
assert (f"{cp}.xbar_top", f"{cp}.hbm_ctrl.slice4") not in es
assert (f"{cp}.xbar_bot", f"{cp}.hbm_ctrl.slice0") not in es
# ── Views: system ────────────────────────────────────────────────────
@@ -235,9 +236,12 @@ def test_cube_view_has_all_components():
expected = {"ucie-N", "ucie-S", "ucie-W", "ucie-E",
"m_cpu", "hbm_ctrl",
"bridge.left", "bridge.right", "noc", "sram",
"xbar.pe0", "xbar.pe1", "xbar.pe2", "xbar.pe3",
"xbar.pe4", "xbar.pe5", "xbar.pe6", "xbar.pe7",
"xbar_top", "xbar_bot",
"pe0", "pe1", "pe2", "pe3", "pe4", "pe5", "pe6", "pe7"}
# Add UCIe connection nodes (4 ports × 4 connections)
for port in ("N", "S", "E", "W"):
for ci in range(4):
expected.add(f"ucie-{port}.conn{ci}")
assert set(v.nodes.keys()) == expected
@@ -249,15 +253,12 @@ def test_cube_view_hbm_at_center():
assert v.height_mm == 14.0
def test_cube_view_pe_corner_mapping():
def test_cube_view_pe_to_noc():
"""PEs connect to NOC in cube view (no per-PE xbar)."""
v = _graph().cube_view
ves = {(e.src, e.dst) for e in v.edges}
# Each PE connects to its own xbar entry (chain model)
for i in range(8):
assert (f"pe{i}", f"xbar.pe{i}") in ves
# Old shared xbar.top/bottom mapping must not exist
assert ("pe0", "xbar.top") not in ves
assert ("pe4", "xbar.bottom") not in ves
assert (f"pe{i}", "noc") in ves
# ── Views: PE ────────────────────────────────────────────────────────
@@ -311,24 +312,16 @@ def test_pe_dma_to_noc_edges():
# ── Bridge connects XBAR halves (not NOC) ──────────────────────────
def test_bridge_connects_xbar_halves():
"""bridge.left connects leftmost PE nodes (pe0 top, pe4 bottom).
bridge.right connects rightmost PE nodes (pe3 top, pe7 bottom)."""
def test_bridge_connects_xbar_top_bot():
"""Bridges connect xbar_top ↔ xbar_bot (bidirectional)."""
es = _edge_set(_graph())
cp = "sip0.cube0"
# bridge.left ↔ pe0 (top-left) and pe4 (bottom-left)
assert (f"{cp}.xbar.pe0", f"{cp}.bridge.left") in es
assert (f"{cp}.bridge.left", f"{cp}.xbar.pe0") in es
assert (f"{cp}.xbar.pe4", f"{cp}.bridge.left") in es
assert (f"{cp}.bridge.left", f"{cp}.xbar.pe4") in es
# bridge.right ↔ pe3 (top-right) and pe7 (bottom-right)
assert (f"{cp}.xbar.pe3", f"{cp}.bridge.right") in es
assert (f"{cp}.bridge.right", f"{cp}.xbar.pe3") in es
assert (f"{cp}.xbar.pe7", f"{cp}.bridge.right") in es
assert (f"{cp}.bridge.right", f"{cp}.xbar.pe7") in es
# Old xbar.top/bottom ↔ bridge edges must NOT exist
assert (f"{cp}.xbar.top", f"{cp}.bridge.left") not in es
assert (f"{cp}.xbar.bottom", f"{cp}.bridge.left") not in es
for bname in ("left", "right"):
br = f"{cp}.bridge.{bname}"
assert (f"{cp}.xbar_top", br) in es
assert (br, f"{cp}.xbar_top") in es
assert (f"{cp}.xbar_bot", br) in es
assert (br, f"{cp}.xbar_bot") in es
def test_no_bridge_to_noc_edges():
@@ -341,7 +334,8 @@ def test_no_bridge_to_noc_edges():
# ── Cube view: new edges ────────────────────────────────────────────
def test_cube_view_pe_to_noc():
def test_cube_view_pe_to_noc_edges():
"""All PEs connect to NOC in cube view."""
v = _graph().cube_view
ves = {(e.src, e.dst) for e in v.edges}
for i in range(8):
@@ -357,53 +351,75 @@ def test_cube_view_sram():
def test_cube_view_bridge_xbar():
"""Cube view bridges connect xbar_top ↔ xbar_bot."""
v = _graph().cube_view
ves = {(e.src, e.dst) for e in v.edges}
# bridge.left connects pe0 (top-left) ↔ pe4 (bottom-left)
assert ("xbar.pe0", "bridge.left") in ves
assert ("bridge.left", "xbar.pe0") in ves
assert ("xbar.pe4", "bridge.left") in ves
assert ("bridge.left", "xbar.pe4") in ves
# bridge.right connects pe3 (top-right) ↔ pe7 (bottom-right)
assert ("xbar.pe3", "bridge.right") in ves
assert ("bridge.right", "xbar.pe3") in ves
assert ("xbar.pe7", "bridge.right") in ves
assert ("bridge.right", "xbar.pe7") in ves
# ── Chain xbar: new topology edges ──────────────────────────────────
def test_xbar_chain_edges():
"""Adjacent xbar.pe nodes within each half are bidirectionally connected."""
es = _edge_set(_graph())
cp = "sip0.cube0"
# Top chain: pe0 ↔ pe1 ↔ pe2 ↔ pe3 (NW→NE direction)
for a, b in [(0, 1), (1, 2), (2, 3)]:
assert (f"{cp}.xbar.pe{a}", f"{cp}.xbar.pe{b}") in es, f"missing pe{a}→pe{b}"
assert (f"{cp}.xbar.pe{b}", f"{cp}.xbar.pe{a}") in es, f"missing pe{b}→pe{a}"
# Bottom chain: pe4 ↔ pe5 ↔ pe6 ↔ pe7
for a, b in [(4, 5), (5, 6), (6, 7)]:
assert (f"{cp}.xbar.pe{a}", f"{cp}.xbar.pe{b}") in es, f"missing pe{a}→pe{b}"
assert (f"{cp}.xbar.pe{b}", f"{cp}.xbar.pe{a}") in es, f"missing pe{b}→pe{a}"
# Negative: no cross-chain direct edges
assert (f"{cp}.xbar.pe0", f"{cp}.xbar.pe2") not in es
assert (f"{cp}.xbar.pe0", f"{cp}.xbar.pe4") not in es
for bname in ("left", "right"):
br = f"bridge.{bname}"
assert ("xbar_top", br) in ves
assert (br, "xbar_top") in ves
assert ("xbar_bot", br) in ves
assert (br, "xbar_bot") in ves
def test_ucie_noc_reverse_edges():
"""UCIe ports must have reverse edges back to NOC (bidirectional)."""
"""UCIe ports connect to NOC via conn nodes (bidirectional)."""
es = _edge_set(_graph())
cp = "sip0.cube1" # non-edge cube to avoid io-cube edges
for port in ("N", "S", "E", "W"):
assert (f"{cp}.ucie-{port}", f"{cp}.noc") in es, \
f"missing ucie-{port}->noc reverse edge"
# Direct ucie→noc no longer exists; path goes through conn nodes
assert (f"{cp}.ucie-{port}", f"{cp}.noc") not in es
# Each conn has edges: ucie↔conn, conn↔noc
for ci in range(4):
conn = f"{cp}.ucie-{port}.conn{ci}"
assert (f"{cp}.ucie-{port}", conn) in es, \
f"missing ucie-{port}->conn{ci}"
assert (conn, f"{cp}.noc") in es, \
f"missing conn{ci}->noc"
assert (f"{cp}.noc", conn) in es, \
f"missing noc->conn{ci}"
assert (conn, f"{cp}.ucie-{port}") in es, \
f"missing conn{ci}->ucie-{port}"
def test_noc_to_xbar_pe_edges():
"""NOC connects to all xbar.pe nodes (for remote cube HBM access)."""
def test_ucie_conn_nodes_exist():
    """Check that all four UCIe ports of sip0.cube0 own conn0..conn3 nodes.

    Every such node must be present in the graph, be of kind ``ucie_conn``,
    and carry an ``overhead_ns`` attribute equal to 0.0.
    """
    nodes = _graph().nodes
    prefix = "sip0.cube0"
    # Enumerate ports in N, S, E, W order, four connections per port.
    conn_ids = [
        f"{prefix}.ucie-{side}.conn{idx}"
        for side in ("N", "S", "E", "W")
        for idx in range(4)
    ]
    for conn_id in conn_ids:
        assert conn_id in nodes, f"missing {conn_id}"
        node = nodes[conn_id]
        assert node.kind == "ucie_conn"
        assert node.attrs["overhead_ns"] == 0.0
def test_ucie_conn_edge_bw():
    """Check that conn↔NOC edges in sip0.cube0 report ``bw_gbs == 128.0``.

    Both directions (conn→noc and noc→conn) are looked up for each of the
    four connections on each of the four UCIe ports; a missing edge surfaces
    as a ``KeyError`` from the lookup table.
    """
    # Index edges by (src, dst) so each direction is a single dict lookup.
    by_endpoints = {(edge.src, edge.dst): edge for edge in _graph().edges}
    prefix = "sip0.cube0"
    noc_id = f"{prefix}.noc"
    for side in ("N", "S", "E", "W"):
        for idx in range(4):
            conn_id = f"{prefix}.ucie-{side}.conn{idx}"
            e = by_endpoints[(conn_id, noc_id)]
            assert e.bw_gbs == 128.0, f"{conn_id}→noc bw={e.bw_gbs}"
            reverse = by_endpoints[(noc_id, conn_id)]
            assert reverse.bw_gbs == 128.0
def test_cross_cube_path_includes_conn():
    """Check that the pe0(cube0) → hbm slice0(cube1) route crosses conn nodes.

    The path returned by ``PathRouter.find_path`` must contain at least two
    node ids with the ``.conn`` substring.
    """
    route = PathRouter(_graph()).find_path(
        "sip0.cube0.pe0", "sip0.cube1.hbm_ctrl.slice0"
    )
    conn_nodes = [hop for hop in route if ".conn" in hop]
    assert len(conn_nodes) >= 2, f"Expected >=2 conn nodes in path, got {conn_nodes}"
def test_noc_to_xbar_top_bot_edges():
"""NOC connects to xbar_top and xbar_bot."""
es = _edge_set(_graph())
cp = "sip0.cube0"
for pe in range(8):
assert (f"{cp}.noc", f"{cp}.xbar.pe{pe}") in es, \
f"missing noc->xbar.pe{pe}"
assert (f"{cp}.noc", f"{cp}.xbar_top") in es
assert (f"{cp}.noc", f"{cp}.xbar_bot") in es