Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep
- Probe CLI: restructured output (tables first, routes below), per-hop timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix the cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -18,13 +18,14 @@ from kernbench.components.impls.pe_math import PeMathComponent
|
||||
from kernbench.components.impls.pe_scheduler import PeSchedulerComponent
|
||||
from kernbench.components.impls.pe_tcm import PeTcmComponent
|
||||
from kernbench.components.impls.sram import SramComponent
|
||||
from kernbench.components.impls.xbar import PositionAwareXbarComponent
|
||||
|
||||
ComponentRegistry.register("forwarding_v1", TransitComponent)
|
||||
ComponentRegistry.register("switch_v1", TransitComponent)
|
||||
ComponentRegistry.register("noc_v1", TransitComponent)
|
||||
ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent)
|
||||
ComponentRegistry.register("ucie_v1", TransitComponent)
|
||||
ComponentRegistry.register("xbar_v1", TransitComponent)
|
||||
ComponentRegistry.register("xbar_v1", PositionAwareXbarComponent)
|
||||
ComponentRegistry.register("pcie_ep_v1", PcieEpComponent)
|
||||
ComponentRegistry.register("io_cpu_v1", IoCpuComponent)
|
||||
ComponentRegistry.register("m_cpu_v1", MCpuComponent)
|
||||
@@ -50,5 +51,6 @@ __all__ = [
|
||||
"PeTcmComponent",
|
||||
"TransitComponent",
|
||||
"TwoDMeshNocComponent",
|
||||
"PositionAwareXbarComponent",
|
||||
"SramComponent",
|
||||
]
|
||||
|
||||
@@ -69,17 +69,37 @@ class HbmCtrlComponent(ComponentBase):
|
||||
yield from self._send_response(env, txn)
|
||||
|
||||
def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||
"""Create ResponseMsg and send on reverse path back to originator.
|
||||
"""Route completion based on path type.
|
||||
|
||||
PeDmaMsg is a direct probe with no IO_CPU/M_CPU aggregation in the path,
|
||||
so we succeed txn.done directly instead of sending a response Transaction.
|
||||
- PeDmaMsg: succeed done directly (probe).
|
||||
- Bypass path (no m_cpu): MemoryWrite succeeds done; MemoryRead sends
|
||||
data back on reverse path with original done event.
|
||||
- M_CPU DMA path: send ResponseMsg for m_cpu/io_cpu aggregation.
|
||||
"""
|
||||
from kernbench.runtime_api.kernel import PeDmaMsg
|
||||
from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg
|
||||
|
||||
if isinstance(txn.request, PeDmaMsg):
|
||||
txn.done.succeed()
|
||||
return
|
||||
|
||||
# Bypass path: no m_cpu in the transaction path
|
||||
is_bypass = not any("m_cpu" in n for n in txn.path)
|
||||
if is_bypass:
|
||||
if isinstance(txn.request, MemoryReadMsg):
|
||||
# D2H: send data back on reverse path to pcie_ep
|
||||
reverse_path = list(reversed(txn.path))
|
||||
if len(reverse_path) >= 2:
|
||||
resp_txn = Transaction(
|
||||
request=txn.request, path=reverse_path, step=0,
|
||||
nbytes=txn.request.nbytes, done=txn.done,
|
||||
)
|
||||
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
|
||||
return
|
||||
# MemoryWrite bypass or short path: done
|
||||
txn.done.succeed()
|
||||
return
|
||||
|
||||
# M_CPU DMA path: send ResponseMsg for aggregation
|
||||
reverse_path = list(reversed(txn.path))
|
||||
if len(reverse_path) >= 2 and self.ctx:
|
||||
from kernbench.runtime_api.kernel import ResponseMsg
|
||||
|
||||
@@ -52,6 +52,26 @@ class TwoDMeshNocComponent(ComponentBase):
|
||||
def _build_grid(self) -> None:
    """Build the grid, choosing the data source from the component spec.

    Without a component context nothing is built. When the spec carries a
    non-empty ``"_mesh"`` entry the grid is derived from it; in every other
    case (no spec, no mesh, empty mesh) the grid is inferred from node
    positions instead.
    """
    ctx = self.ctx
    if not ctx:
        return

    spec = ctx.spec
    mesh = spec.get("_mesh") if spec else None
    if not mesh:
        # No usable mesh description: fall back to position inference.
        self._build_grid_from_positions()
        return

    self._build_grid_from_mesh(mesh)
|
||||
|
||||
def _build_grid_from_mesh(self, mesh: dict) -> None:
    """Build XY grid from cube_mesh.yaml router positions (authoritative).

    Each non-``None`` entry under ``mesh["routers"]`` contributes its
    ``pos_mm`` X and Y, offset by this cube's origin and rounded to two
    decimals. The distinct values are stored sorted in ``self._x_grid``
    and ``self._y_grid``.

    Args:
        mesh: Mesh description; only its ``"routers"`` mapping is read.
            A missing ``"routers"`` key yields empty grids.
    """
    origin_x, origin_y = self._cube_origin()
    xs: set[float] = set()
    ys: set[float] = set()
    # Only the router records matter here; their keys are not used.
    for router in mesh.get("routers", {}).values():
        if router is None:
            continue
        pos = router["pos_mm"]
        # Rounding collapses float noise so routers sharing a row/column
        # land on the same grid coordinate.
        xs.add(round(origin_x + pos[0], 2))
        ys.add(round(origin_y + pos[1], 2))
    self._x_grid = sorted(xs)
    self._y_grid = sorted(ys)
|
||||
|
||||
def _build_grid_from_positions(self) -> None:
|
||||
"""Fallback: infer grid from all node positions in the cube."""
|
||||
cube_prefix = self.node.id.rsplit(".", 1)[0]
|
||||
xs: set[float] = set()
|
||||
ys: set[float] = set()
|
||||
@@ -62,6 +82,23 @@ class TwoDMeshNocComponent(ComponentBase):
|
||||
self._x_grid = sorted(xs)
|
||||
self._y_grid = sorted(ys)
|
||||
|
||||
def _cube_origin(self) -> tuple[float, float]:
    """Return the absolute (x, y) origin (top-left) of this cube.

    The cube index is parsed from the first dot-separated component of
    ``self.node.id`` that starts with ``"cube"``. Cubes are laid out
    row-major on a mesh ``sip.cube_mesh.w`` wide (default 4); the pitch
    between neighbouring cubes is the cube size (defaults 17.0 x 14.0)
    plus the inter-cube seam distance (default 1.0).
    """
    cube_tokens = [tok for tok in self.node.id.split(".") if tok.startswith("cube")]
    cube_id = int(cube_tokens[0][len("cube"):])

    spec = self.ctx.spec
    sip_spec = spec.get("sip", {})
    cube_mm = spec.get("cube", {}).get("geometry", {}).get("cube_mm", {})

    mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4)
    cube_w = cube_mm.get("w", 17.0)
    cube_h = cube_mm.get("h", 14.0)
    inter_cube = sip_spec.get("links", {}).get("inter_cube_mesh", {})
    seam = inter_cube.get("distance_mm_across_seam", 1.0)

    # Row-major placement: quotient is the row, remainder the column.
    row, col = divmod(cube_id, mesh_w)
    pitch_x = cube_w + seam
    pitch_y = cube_h + seam
    return (col * pitch_x, row * pitch_y)
|
||||
|
||||
def _get_link(self, key: tuple) -> simpy.Resource:
|
||||
if key not in self._links:
|
||||
assert self._env is not None
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
"""Position-aware XBAR component.
|
||||
|
||||
Models crossbar latency as base_overhead_ns + internal_distance * ns_per_mm,
|
||||
where internal_distance is the Manhattan distance between the entry port
|
||||
(PE router attachment) and exit port (HBM slice logical position) within
|
||||
the crossbar matrix.
|
||||
|
||||
PE router positions come from cube_mesh.yaml (via ctx.spec["_mesh"]).
|
||||
HBM slice positions are uniformly distributed across the HBM physical width.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Generator
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import simpy
|
||||
|
||||
from kernbench.components.base import ComponentBase
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from kernbench.components.context import ComponentContext
|
||||
from kernbench.topology.types import Node
|
||||
|
||||
|
||||
class PositionAwareXbarComponent(ComponentBase):
    """XBAR with position-dependent latency based on PE-to-slice distance.

    Latency = base_overhead_ns + |entry_port_x - exit_port_x| * ns_per_mm

    Entry/exit port X positions are determined from the transaction path:
      - PE_DMA nodes: router X from cube_mesh.yaml
      - HBM slices: uniformly distributed across HBM physical width
      - Bridge nodes: physical X from topology positions
      - NOC: resolved by scanning path for PE_DMA node

    When no position can be resolved for either port (or no mesh data was
    available at start), the latency falls back to ``base_overhead_ns``.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Read the base overhead from node attrs and create empty position maps.

        Args:
            node: Topology node; ``node.attrs["overhead_ns"]`` (default 0.0)
                is the fixed per-traversal overhead.
            ctx: Optional component context supplying spec and positions.
        """
        super().__init__(node, ctx)
        self._base_overhead_ns = float(node.attrs.get("overhead_ns", 0.0))
        # Node id -> X position of the XBAR port that node connects through.
        self._pe_router_xs: dict[str, float] = {}
        self._slice_xs: dict[str, float] = {}
        self._bridge_xs: dict[str, float] = {}
        # Stays 0.0 until a mesh is found; 0.0 disables the distance term.
        self._ns_per_mm: float = 0.0

    def start(self, env: simpy.Environment) -> None:
        """Build the port position maps, then start the base component."""
        self._build_position_map()
        super().start(env)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for the fixed base overhead only (no distance term).

        ``nbytes`` is accepted for interface compatibility and not used.
        """
        yield env.timeout(self._base_overhead_ns)

    # ── Position map construction ─────────────────────────────────

    def _build_position_map(self) -> None:
        """Populate the PE-router, HBM-slice and bridge X-position maps.

        Does nothing (leaving position-aware latency disabled) when there
        is no context, no spec, or no ``"_mesh"`` entry in the spec.
        """
        if not self.ctx or not self.ctx.spec:
            return
        mesh = self.ctx.spec.get("_mesh")
        if not mesh:
            return

        self._ns_per_mm = self.ctx.ns_per_mm
        id_parts = self.node.id.rsplit(".", 1)
        cube_prefix = id_parts[0]
        xbar_name = id_parts[1]
        # Any name other than "xbar_top" is treated as the bottom crossbar.
        is_top = xbar_name == "xbar_top"

        self._map_pe_routers(mesh, cube_prefix, "top" if is_top else "bottom")
        self._map_hbm_slices(cube_prefix, is_top)
        self._map_bridges(cube_prefix)

    def _map_pe_routers(self, mesh: dict, cube_prefix: str, xbar_key: str) -> None:
        """Record router X for every PE DMA attached to this crossbar's routers."""
        routers = mesh.get("routers", {})
        attached = mesh.get("xbar", {}).get(xbar_key, {}).get("routers", [])
        for router_id in attached:
            router_data = routers.get(router_id)
            if router_data is None:
                continue
            router_x = router_data["pos_mm"][0]
            for attach in router_data.get("attach", []):
                if attach.endswith(".dma"):
                    # Attachment "<pe>.dma" maps to node id "<cube>.<pe>.pe_dma".
                    pe_name = attach.split(".")[0]
                    self._pe_router_xs[f"{cube_prefix}.{pe_name}.pe_dma"] = router_x

    def _map_hbm_slices(self, cube_prefix: str, is_top: bool) -> None:
        """Spread this crossbar's HBM slices evenly across the HBM width."""
        cube_spec = self.ctx.spec.get("cube", {})
        geometry = cube_spec.get("geometry", {})
        cube_w = geometry.get("cube_mm", {}).get("w", 17.0)
        hbm_w = geometry.get("hbm_mm", {}).get("w", 9.0)
        n_slices = cube_spec.get("memory_map", {}).get("hbm_slices_per_cube", 8)
        half = n_slices // 2
        # HBM is taken to be horizontally centred within the cube.
        hbm_left = (cube_w - hbm_w) / 2

        # The top crossbar owns the first half of the slices, bottom the rest.
        slice_range = range(half) if is_top else range(half, n_slices)
        n = len(slice_range)
        for i, sl in enumerate(slice_range):
            if n > 1:
                # First slice at the left HBM edge, last at the right edge.
                x = hbm_left + i * hbm_w / (n - 1)
            else:
                x = cube_w / 2
            self._slice_xs[f"{cube_prefix}.hbm_ctrl.slice{sl}"] = x

    def _map_bridges(self, cube_prefix: str) -> None:
        """Record X for this cube's bridge nodes, relative to the cube origin."""
        bridge_prefix = cube_prefix + ".bridge."
        bridges = {
            node_id: pos
            for node_id, pos in self.ctx.positions.items()
            if node_id.startswith(bridge_prefix) and pos is not None
        }
        if not bridges:
            return
        # The cube origin is the same for every bridge: compute it once,
        # and only when at least one bridge actually needs it.
        origin_x = self._cube_origin_x()
        for node_id, pos in bridges.items():
            self._bridge_xs[node_id] = pos[0] - origin_x

    def _cube_origin_x(self) -> float:
        """Compute absolute X origin of this cube.

        The cube index comes from the first ``"cube…"`` component of the
        node id; the column within the cube mesh (width default 4) is
        multiplied by cube width (default 17.0) plus seam (default 1.0).
        """
        parts = self.node.id.split(".")
        cube_str = [p for p in parts if p.startswith("cube")][0]
        cube_id = int(cube_str[4:])
        spec = self.ctx.spec
        sip_spec = spec.get("sip", {})
        cube_spec = spec.get("cube", {})
        mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4)
        cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
        seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get(
            "distance_mm_across_seam", 1.0)
        col = cube_id % mesh_w
        return col * (cube_w + seam)

    # ── Worker override ───────────────────────────────────────────

    def _worker(self, env: simpy.Environment) -> Generator:
        """Take transactions from the inbox and forward each in its own process."""
        while True:
            txn: Any = yield self._inbox.get()
            # A separate process per transaction keeps one transaction's
            # delay from blocking the inbox loop.
            env.process(self._position_aware_forward(env, txn))

    def _position_aware_forward(
        self, env: simpy.Environment, txn: Any,
    ) -> Generator:
        """Delay a transaction by the position-dependent latency, then forward it.

        With a next hop the advanced transaction is put on that hop's out
        port. Without one, the transaction's optional ``drain_ns`` is
        waited out and ``txn.done`` is succeeded.
        """
        prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
        next_hop = txn.next_hop

        overhead = self._base_overhead_ns
        if prev_hop and next_hop and self._ns_per_mm > 0:
            entry_x = self._get_port_x(prev_hop, txn.path)
            exit_x = self._get_port_x(next_hop, txn.path)
            # The distance term is added only when both ports resolve.
            if entry_x is not None and exit_x is not None:
                overhead = self._base_overhead_ns + abs(entry_x - exit_x) * self._ns_per_mm

        yield env.timeout(overhead)

        if next_hop:
            yield self.out_ports[next_hop].put(txn.advance())
        else:
            drain = getattr(txn, "drain_ns", 0.0)
            if drain > 0:
                yield env.timeout(drain)
            txn.done.succeed()

    def _get_port_x(self, node_id: str, path: list[str]) -> float | None:
        """Resolve the X position of an XBAR port from node context.

        Args:
            node_id: Neighbouring node on the transaction path.
            path: Full transaction path, scanned only for NOC neighbours.

        Returns:
            The mapped X position, or ``None`` when it cannot be resolved.
        """
        # Direct lookups, in priority order: PE DMA, HBM slice, bridge.
        for table in (self._pe_router_xs, self._slice_xs, self._bridge_xs):
            if node_id in table:
                return table[node_id]
        # NOC: use the first PE DMA node on the path that this XBAR knows.
        if "noc" in node_id:
            for hop in path:
                if hop in self._pe_router_xs:
                    return self._pe_router_xs[hop]
        return None
|
||||
Reference in New Issue
Block a user