Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep

- Probe CLI: restructured output (tables first, routes below), per-hop
  timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix
  cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing
  effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-19 01:16:18 -07:00
parent 6f43807900
commit d75da439c6
24 changed files with 3456 additions and 501 deletions
+3 -1
View File
@@ -18,13 +18,14 @@ from kernbench.components.impls.pe_math import PeMathComponent
from kernbench.components.impls.pe_scheduler import PeSchedulerComponent
from kernbench.components.impls.pe_tcm import PeTcmComponent
from kernbench.components.impls.sram import SramComponent
from kernbench.components.impls.xbar import PositionAwareXbarComponent
# Register each component implementation class under its component-type key.
ComponentRegistry.register("forwarding_v1", TransitComponent)
ComponentRegistry.register("switch_v1", TransitComponent)
ComponentRegistry.register("noc_v1", TransitComponent)
ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent)
ComponentRegistry.register("ucie_v1", TransitComponent)
# NOTE(review): "xbar_v1" is registered twice below. In this rendered diff the
# TransitComponent line is presumably the removed side and the
# PositionAwareXbarComponent line its replacement -- confirm that only one
# registration for "xbar_v1" remains in the actual file.
ComponentRegistry.register("xbar_v1", TransitComponent)
ComponentRegistry.register("xbar_v1", PositionAwareXbarComponent)
ComponentRegistry.register("pcie_ep_v1", PcieEpComponent)
ComponentRegistry.register("io_cpu_v1", IoCpuComponent)
ComponentRegistry.register("m_cpu_v1", MCpuComponent)
@@ -50,5 +51,6 @@ __all__ = [
"PeTcmComponent",
"TransitComponent",
"TwoDMeshNocComponent",
"PositionAwareXbarComponent",
"SramComponent",
]
+24 -4
View File
@@ -69,17 +69,37 @@ class HbmCtrlComponent(ComponentBase):
yield from self._send_response(env, txn)
def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
"""Create ResponseMsg and send on reverse path back to originator.
"""Route completion based on path type.
PeDmaMsg is a direct probe with no IO_CPU/M_CPU aggregation in the path,
so we succeed txn.done directly instead of sending a response Transaction.
- PeDmaMsg: succeed done directly (probe).
- Bypass path (no m_cpu): MemoryWrite succeeds done; MemoryRead sends
data back on reverse path with original done event.
- M_CPU DMA path: send ResponseMsg for m_cpu/io_cpu aggregation.
"""
from kernbench.runtime_api.kernel import PeDmaMsg
from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg
if isinstance(txn.request, PeDmaMsg):
txn.done.succeed()
return
# Bypass path: no m_cpu in the transaction path
is_bypass = not any("m_cpu" in n for n in txn.path)
if is_bypass:
if isinstance(txn.request, MemoryReadMsg):
# D2H: send data back on reverse path to pcie_ep
reverse_path = list(reversed(txn.path))
if len(reverse_path) >= 2:
resp_txn = Transaction(
request=txn.request, path=reverse_path, step=0,
nbytes=txn.request.nbytes, done=txn.done,
)
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
return
# MemoryWrite bypass or short path: done
txn.done.succeed()
return
# M_CPU DMA path: send ResponseMsg for aggregation
reverse_path = list(reversed(txn.path))
if len(reverse_path) >= 2 and self.ctx:
from kernbench.runtime_api.kernel import ResponseMsg
+37
View File
@@ -52,6 +52,26 @@ class TwoDMeshNocComponent(ComponentBase):
def _build_grid(self) -> None:
    """Populate the X/Y grid, preferring the mesh spec when one is present.

    Does nothing without a component context. When the context spec carries
    a non-empty "_mesh" entry the grid is built from it; otherwise the grid
    is inferred from node positions.
    """
    if not self.ctx:
        return

    spec = self.ctx.spec
    mesh = spec.get("_mesh") if spec else None
    if not mesh:
        # No mesh description available: fall back to position inference.
        self._build_grid_from_positions()
        return
    self._build_grid_from_mesh(mesh)
def _build_grid_from_mesh(self, mesh: dict) -> None:
    """Derive the sorted X and Y grid lines from the mesh router positions.

    Each non-None entry under ``mesh["routers"]`` contributes its "pos_mm"
    coordinates, offset by this cube's origin and rounded to two decimals.
    Duplicate coordinates collapse to one grid line; None entries are skipped.
    """
    base_x, base_y = self._cube_origin()
    placed = [r for r in mesh.get("routers", {}).values() if r is not None]

    # Build both coordinate sets before touching self so a malformed router
    # entry leaves the previous grid untouched.
    x_lines = {round(base_x + r["pos_mm"][0], 2) for r in placed}
    y_lines = {round(base_y + r["pos_mm"][1], 2) for r in placed}

    self._x_grid = sorted(x_lines)
    self._y_grid = sorted(y_lines)
def _build_grid_from_positions(self) -> None:
"""Fallback: infer grid from all node positions in the cube."""
cube_prefix = self.node.id.rsplit(".", 1)[0]
xs: set[float] = set()
ys: set[float] = set()
@@ -62,6 +82,23 @@ class TwoDMeshNocComponent(ComponentBase):
self._x_grid = sorted(xs)
self._y_grid = sorted(ys)
def _cube_origin(self) -> tuple[float, float]:
    """Return the absolute (x, y) origin of this cube, derived from its cube id.

    The cube id is the integer suffix of the first dotted segment of the node
    id that starts with "cube". Cubes are placed row-major on a grid that is
    ``sip.cube_mesh.w`` columns wide; the pitch in each direction is the cube
    size plus the inter-cube seam distance. Spec entries that are missing fall
    back to the defaults written in the lookups below.
    """
    cube_segments = [seg for seg in self.node.id.split(".") if seg.startswith("cube")]
    cube_index = int(cube_segments[0][4:])

    spec = self.ctx.spec
    sip_cfg = spec.get("sip", {})
    cube_cfg = spec.get("cube", {})

    columns = sip_cfg.get("cube_mesh", {}).get("w", 4)
    width = cube_cfg.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
    height = cube_cfg.get("geometry", {}).get("cube_mm", {}).get("h", 14.0)
    seam = sip_cfg.get("links", {}).get("inter_cube_mesh", {}).get(
        "distance_mm_across_seam", 1.0)

    row, col = divmod(cube_index, columns)
    return (col * (width + seam), row * (height + seam))
def _get_link(self, key: tuple) -> simpy.Resource:
if key not in self._links:
assert self._env is not None
+168
View File
@@ -0,0 +1,168 @@
"""Position-aware XBAR component.
Models crossbar latency as base_overhead_ns + internal_distance * ns_per_mm,
where internal_distance is the Manhattan distance between the entry port
(PE router attachment) and exit port (HBM slice logical position) within
the crossbar matrix.
PE router positions come from cube_mesh.yaml (via ctx.spec["_mesh"]).
HBM slice positions are uniformly distributed across the HBM physical width.
"""
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import ComponentBase
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class PositionAwareXbarComponent(ComponentBase):
    """XBAR whose forwarding latency grows with the X distance crossed.

    Latency = base_overhead_ns + |entry_port_x - exit_port_x| * ns_per_mm

    Entry/exit port X positions are resolved from the hops adjacent to this
    XBAR on the transaction path:
    - PE_DMA nodes: X of the router they attach to in the mesh spec
    - HBM slices: evenly spaced across the HBM width
    - Bridge nodes: topology X position made relative to the cube origin
    - NOC: X of the first PE_DMA node found on the path

    When no mesh spec is available, or either port cannot be resolved, only
    the base overhead is applied.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._base_overhead_ns = float(node.attrs.get("overhead_ns", 0.0))
        # Cube-relative X per port, keyed by full node id; filled in start().
        self._pe_router_xs: dict[str, float] = {}
        self._slice_xs: dict[str, float] = {}
        self._bridge_xs: dict[str, float] = {}
        # Stays 0.0 (distance term disabled) until a mesh spec is found.
        self._ns_per_mm: float = 0.0

    def start(self, env: simpy.Environment) -> None:
        """Build the port position maps, then start the base component."""
        self._build_position_map()
        super().start(env)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for the base overhead only; ``nbytes`` is not used here."""
        yield env.timeout(self._base_overhead_ns)

    # ── Position map construction ─────────────────────────────────

    def _build_position_map(self) -> None:
        """Fill the PE-router, HBM-slice and bridge X maps from the spec.

        Leaves every map empty and ``_ns_per_mm`` at 0.0 when there is no
        context, no spec, or no "_mesh" entry in the spec.
        """
        if not self.ctx or not self.ctx.spec:
            return
        spec = self.ctx.spec
        mesh = spec.get("_mesh")
        if not mesh:
            return

        self._ns_per_mm = self.ctx.ns_per_mm

        id_parts = self.node.id.rsplit(".", 1)
        cube_prefix, xbar_name = id_parts[0], id_parts[1]
        # Any name other than "xbar_top" is treated as the bottom XBAR.
        is_top = xbar_name == "xbar_top"

        self._map_pe_routers(mesh, "top" if is_top else "bottom", cube_prefix)
        self._map_hbm_slices(spec.get("cube", {}), is_top, cube_prefix)
        self._map_bridges(cube_prefix)

    def _map_pe_routers(self, mesh: dict, xbar_key: str, cube_prefix: str) -> None:
        """Record router X for every PE DMA attached to this XBAR's routers."""
        router_ids = mesh.get("xbar", {}).get(xbar_key, {}).get("routers", [])
        # A missing routers table is handled like an unknown router id
        # (skipped) instead of raising KeyError.
        routers = mesh.get("routers", {})
        for router_id in router_ids:
            router = routers.get(router_id)
            if router is None:
                continue
            router_x = router["pos_mm"][0]
            for attach in router.get("attach", []):
                if attach.endswith(".dma"):
                    pe_name = attach.split(".")[0]
                    self._pe_router_xs[f"{cube_prefix}.{pe_name}.pe_dma"] = router_x

    def _map_hbm_slices(self, cube_spec: dict, is_top: bool, cube_prefix: str) -> None:
        """Spread this XBAR's half of the HBM slices evenly across the HBM width."""
        geometry = cube_spec.get("geometry", {})
        cube_w = geometry.get("cube_mm", {}).get("w", 17.0)
        hbm_w = geometry.get("hbm_mm", {}).get("w", 9.0)
        n_slices = cube_spec.get("memory_map", {}).get("hbm_slices_per_cube", 8)

        half = n_slices // 2
        # The HBM is taken to be centred horizontally within the cube.
        hbm_left = (cube_w - hbm_w) / 2
        # Top XBAR owns the first half of the slices, bottom XBAR the rest.
        slice_range = range(half) if is_top else range(half, n_slices)
        count = len(slice_range)
        for i, sl in enumerate(slice_range):
            if count > 1:
                x = hbm_left + i * hbm_w / (count - 1)
            else:
                # A single slice sits at the cube centre.
                x = cube_w / 2
            self._slice_xs[f"{cube_prefix}.hbm_ctrl.slice{sl}"] = x

    def _map_bridges(self, cube_prefix: str) -> None:
        """Record cube-relative X for each positioned bridge node of this cube."""
        bridge_prefix = cube_prefix + ".bridge."
        origin_x: float | None = None
        for node_id, pos in self.ctx.positions.items():
            if not node_id.startswith(bridge_prefix) or pos is None:
                continue
            if origin_x is None:
                # Loop-invariant: resolved once, and only if a bridge exists.
                origin_x = self._cube_origin_x()
            self._bridge_xs[node_id] = pos[0] - origin_x

    def _cube_origin_x(self) -> float:
        """Compute absolute X origin of this cube from the cube id in the node id."""
        parts = self.node.id.split(".")
        cube_str = [p for p in parts if p.startswith("cube")][0]
        cube_id = int(cube_str[4:])
        spec = self.ctx.spec
        sip_spec = spec.get("sip", {})
        cube_spec = spec.get("cube", {})
        mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4)
        cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
        seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get(
            "distance_mm_across_seam", 1.0)
        col = cube_id % mesh_w
        return col * (cube_w + seam)

    # ── Worker override ───────────────────────────────────────────

    def _worker(self, env: simpy.Environment) -> Generator:
        """Take transactions from the inbox and forward each in its own process."""
        while True:
            txn: Any = yield self._inbox.get()
            env.process(self._position_aware_forward(env, txn))

    def _position_aware_forward(
        self, env: simpy.Environment, txn: Any,
    ) -> Generator:
        """Delay ``txn`` by the position-dependent overhead, then pass it on.

        With a next hop, the advanced transaction is put on that hop's output
        port. Without one, any positive ``drain_ns`` on the transaction is
        waited out and ``txn.done`` is succeeded.
        """
        prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
        next_hop = txn.next_hop

        overhead = self._base_overhead_ns
        if prev_hop and next_hop and self._ns_per_mm > 0:
            entry_x = self._get_port_x(prev_hop, txn.path)
            exit_x = self._get_port_x(next_hop, txn.path)
            # The distance term applies only when both ports resolve.
            if entry_x is not None and exit_x is not None:
                overhead = self._base_overhead_ns + abs(entry_x - exit_x) * self._ns_per_mm

        yield env.timeout(overhead)

        if next_hop:
            yield self.out_ports[next_hop].put(txn.advance())
            return

        drain = getattr(txn, "drain_ns", 0.0)
        if drain > 0:
            yield env.timeout(drain)
        txn.done.succeed()

    def _get_port_x(self, node_id: str, path: list[str]) -> float | None:
        """Resolve the X position of an XBAR port, or None when unknown."""
        # Direct lookups, in priority order: PE DMA, HBM slice, bridge.
        for table in (self._pe_router_xs, self._slice_xs, self._bridge_xs):
            if node_id in table:
                return table[node_id]

        # NOC: use the first PE DMA node on the path as the port position.
        if "noc" in node_id:
            for hop in path:
                if hop in self._pe_router_xs:
                    return self._pe_router_xs[hop]

        return None