Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep

- Probe CLI: restructured output (tables first, routes below), per-hop
  timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix the
  cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing
  effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 01:16:18 -07:00
parent 6f43807900
commit d75da439c6
24 changed files with 3456 additions and 501 deletions
@@ -18,13 +18,14 @@ from kernbench.components.impls.pe_math import PeMathComponent
 from kernbench.components.impls.pe_scheduler import PeSchedulerComponent
 from kernbench.components.impls.pe_tcm import PeTcmComponent
 from kernbench.components.impls.sram import SramComponent
+from kernbench.components.impls.xbar import PositionAwareXbarComponent

 ComponentRegistry.register("forwarding_v1", TransitComponent)
 ComponentRegistry.register("switch_v1", TransitComponent)
 ComponentRegistry.register("noc_v1", TransitComponent)
 ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent)
 ComponentRegistry.register("ucie_v1", TransitComponent)
-ComponentRegistry.register("xbar_v1", TransitComponent)
+ComponentRegistry.register("xbar_v1", PositionAwareXbarComponent)
 ComponentRegistry.register("pcie_ep_v1", PcieEpComponent)
 ComponentRegistry.register("io_cpu_v1", IoCpuComponent)
 ComponentRegistry.register("m_cpu_v1", MCpuComponent)
@@ -50,5 +51,6 @@ __all__ = [
    "PeTcmComponent",
    "TransitComponent",
    "TwoDMeshNocComponent",
+    "PositionAwareXbarComponent",
    "SramComponent",
 ]
@@ -69,17 +69,37 @@ class HbmCtrlComponent(ComponentBase):
        yield from self._send_response(env, txn)

    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
-        """Create ResponseMsg and send on reverse path back to originator.
+        """Route completion based on path type.

-        PeDmaMsg is a direct probe with no IO_CPU/M_CPU aggregation in the path,
-        so we succeed txn.done directly instead of sending a response Transaction.
+        - PeDmaMsg: succeed done directly (probe).
+        - Bypass path (no m_cpu): MemoryWrite succeeds done; MemoryRead sends
+          data back on reverse path with original done event.
+        - M_CPU DMA path: send ResponseMsg for m_cpu/io_cpu aggregation.
        """
-        from kernbench.runtime_api.kernel import PeDmaMsg
+        from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg

        if isinstance(txn.request, PeDmaMsg):
            txn.done.succeed()
            return

+        # Bypass path: no m_cpu in the transaction path
+        is_bypass = not any("m_cpu" in n for n in txn.path)
+        if is_bypass:
+            if isinstance(txn.request, MemoryReadMsg):
+                # D2H: send data back on reverse path to pcie_ep
+                reverse_path = list(reversed(txn.path))
+                if len(reverse_path) >= 2:
+                    resp_txn = Transaction(
+                        request=txn.request, path=reverse_path, step=0,
+                        nbytes=txn.request.nbytes, done=txn.done,
+                    )
+                    yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+                    return
+            # MemoryWrite bypass or short path: done
+            txn.done.succeed()
+            return
+
+        # M_CPU DMA path: send ResponseMsg for aggregation
        reverse_path = list(reversed(txn.path))
        if len(reverse_path) >= 2 and self.ctx:
            from kernbench.runtime_api.kernel import ResponseMsg
@@ -52,6 +52,26 @@ class TwoDMeshNocComponent(ComponentBase):
     def _build_grid(self) -> None:
+        # Populate the X/Y grid, preferring the mesh spec when one is present.
+        # Does nothing when the component has no context.
         if not self.ctx:
             return
+        # A falsy spec is treated the same as a missing "_mesh" entry.
+        mesh = self.ctx.spec.get("_mesh") if self.ctx.spec else None
+        if mesh:
+            self._build_grid_from_mesh(mesh)
+        else:
+            # No mesh spec available: fall back to the position-based builder.
+            self._build_grid_from_positions()
+
+    def _build_grid_from_mesh(self, mesh: dict) -> None:
+        """Build XY grid from cube_mesh.yaml router positions (authoritative).
+
+        Each non-null router's ``pos_mm`` is offset by this cube's origin and
+        rounded to 2 decimals; the distinct X and Y values, sorted ascending,
+        become ``_x_grid`` and ``_y_grid``.
+
+        Args:
+            mesh: Mesh spec. Its optional ``"routers"`` mapping holds router
+                entries carrying a ``"pos_mm"`` sequence (x at index 0, y at
+                index 1), or ``None`` for entries that are skipped.
+        """
+        origin_x, origin_y = self._cube_origin()
+        xs: set[float] = set()
+        ys: set[float] = set()
+        # Router ids are not needed here, so iterate the entries only.
+        for router in mesh.get("routers", {}).values():
+            if router is None:
+                continue
+            pos_mm = router["pos_mm"]
+            xs.add(round(origin_x + pos_mm[0], 2))
+            ys.add(round(origin_y + pos_mm[1], 2))
+        self._x_grid = sorted(xs)
+        self._y_grid = sorted(ys)
+
+    def _build_grid_from_positions(self) -> None:
+        """Fallback: infer grid from all node positions in the cube."""
        cube_prefix = self.node.id.rsplit(".", 1)[0]
        xs: set[float] = set()
        ys: set[float] = set()
@@ -62,6 +82,23 @@ class TwoDMeshNocComponent(ComponentBase):
        self._x_grid = sorted(xs)
        self._y_grid = sorted(ys)

+    def _cube_origin(self) -> tuple[float, float]:
+        """Return the absolute (x, y) origin of this cube within the cube mesh.
+
+        The cube index is parsed from the first dotted segment of the node id
+        that starts with ``"cube"``. Its column/row in a grid that is
+        ``cube_mesh.w`` cubes wide is scaled by the cube pitch (cube size plus
+        the seam distance). Values missing from the spec fall back to the
+        defaults written below.
+        """
+        cube_segments = [
+            seg for seg in self.node.id.split(".") if seg.startswith("cube")
+        ]
+        cube_index = int(cube_segments[0][4:])  # drop the "cube" prefix
+
+        full_spec = self.ctx.spec
+        sip_cfg = full_spec.get("sip", {})
+        cube_cfg = full_spec.get("cube", {})
+
+        cubes_per_row = sip_cfg.get("cube_mesh", {}).get("w", 4)
+        cube_dims = cube_cfg.get("geometry", {}).get("cube_mm", {})
+        width = cube_dims.get("w", 17.0)
+        height = cube_dims.get("h", 14.0)
+        seam_gap = (
+            sip_cfg.get("links", {})
+            .get("inter_cube_mesh", {})
+            .get("distance_mm_across_seam", 1.0)
+        )
+
+        # Cubes are numbered row-major across the mesh.
+        row, col = divmod(cube_index, cubes_per_row)
+        return (col * (width + seam_gap), row * (height + seam_gap))
+
    def _get_link(self, key: tuple) -> simpy.Resource:
        if key not in self._links:
            assert self._env is not None
@@ -0,0 +1,168 @@
+"""Position-aware XBAR component.
+
+Models crossbar latency as base_overhead_ns + internal_distance * ns_per_mm,
+where internal_distance is the distance along the X axis (a one-dimensional
+Manhattan distance, |entry_x - exit_x|) between the entry port (PE router
+attachment) and the exit port (HBM slice logical position) within the
+crossbar matrix.
+
+PE router positions come from cube_mesh.yaml (via ctx.spec["_mesh"]).
+HBM slice positions are uniformly distributed across the HBM physical width.
+"""
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PositionAwareXbarComponent(ComponentBase):
+    """XBAR with position-dependent latency based on PE-to-slice distance.
+
+    Latency = base_overhead_ns + |entry_port_x - exit_port_x| * ns_per_mm
+
+    Entry/exit port X positions are determined from the transaction path:
+      - PE_DMA nodes: router X from cube_mesh.yaml
+      - HBM slices: uniformly distributed across HBM physical width
+      - Bridge nodes: physical X from topology positions
+      - NOC: resolved by scanning path for PE_DMA node
+
+    When no position map could be built (no context, no spec, or no
+    ``_mesh`` entry) or either port cannot be resolved, only the base
+    overhead is charged.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        # Fixed per-traversal cost, read from the node's "overhead_ns" attr.
+        self._base_overhead_ns = float(node.attrs.get("overhead_ns", 0.0))
+        # Port X lookup tables keyed by node id; filled by
+        # _build_position_map().
+        self._pe_router_xs: dict[str, float] = {}
+        self._slice_xs: dict[str, float] = {}
+        self._bridge_xs: dict[str, float] = {}
+        # Stays 0.0 (distance term disabled) until a mesh spec is found.
+        self._ns_per_mm: float = 0.0
+
+    def start(self, env: simpy.Environment) -> None:
+        """Build the port position map, then start the base component."""
+        self._build_position_map()
+        super().start(env)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """Wait for the base overhead only; ``nbytes`` is not used."""
+        yield env.timeout(self._base_overhead_ns)
+
+    # ── Position map construction ─────────────────────────────────
+
+    def _build_position_map(self) -> None:
+        """Fill the PE-router, HBM-slice and bridge X lookup tables.
+
+        Returns without changing anything when there is no context, no spec,
+        or no ``_mesh`` entry; ``_ns_per_mm`` then stays 0.0, so forwarding
+        charges the base overhead only.
+        """
+        if not self.ctx or not self.ctx.spec:
+            return
+        mesh = self.ctx.spec.get("_mesh")
+        if not mesh:
+            return
+
+        self._ns_per_mm = self.ctx.ns_per_mm
+        cube_prefix = self.node.id.rsplit(".", 1)[0]
+        xbar_name = self.node.id.rsplit(".", 1)[1]
+        # Any XBAR not named "xbar_top" is treated as the bottom one.
+        is_top = xbar_name == "xbar_top"
+        xbar_key = "top" if is_top else "bottom"
+
+        # PE router X positions from mesh attachments. A missing router table
+        # is tolerated the same way as a missing/null router entry.
+        routers = mesh.get("routers", {})
+        routers_list = mesh.get("xbar", {}).get(xbar_key, {}).get("routers", [])
+        for router_id in routers_list:
+            router_data = routers.get(router_id)
+            if router_data is None:
+                continue
+            router_x = router_data["pos_mm"][0]
+            for attach in router_data.get("attach", []):
+                if attach.endswith(".dma"):
+                    pe_name = attach.split(".")[0]
+                    pe_dma_id = f"{cube_prefix}.{pe_name}.pe_dma"
+                    self._pe_router_xs[pe_dma_id] = router_x
+
+        # HBM slice X positions: uniformly distributed across HBM width,
+        # with the HBM centred horizontally in the cube.
+        cube_spec = self.ctx.spec.get("cube", {})
+        cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
+        hbm_w = cube_spec.get("geometry", {}).get("hbm_mm", {}).get("w", 9.0)
+        n_slices = cube_spec.get("memory_map", {}).get("hbm_slices_per_cube", 8)
+        half = n_slices // 2
+        hbm_left = (cube_w - hbm_w) / 2
+
+        # The top XBAR gets the first half of the slices, the bottom the rest.
+        slice_range = range(half) if is_top else range(half, n_slices)
+        n = len(slice_range)
+        for i, sl in enumerate(slice_range):
+            if n > 1:
+                x = hbm_left + i * hbm_w / (n - 1)
+            else:
+                # A single slice sits at the cube centre.
+                x = cube_w / 2
+            self._slice_xs[f"{cube_prefix}.hbm_ctrl.slice{sl}"] = x
+
+        # Bridge X positions from topology positions, shifted by the cube
+        # origin so they are relative to the cube like the values above.
+        bridge_prefix = cube_prefix + ".bridge."
+        bridge_positions = {
+            node_id: pos
+            for node_id, pos in self.ctx.positions.items()
+            if node_id.startswith(bridge_prefix) and pos is not None
+        }
+        if bridge_positions:
+            # Loop-invariant: resolve the origin once, and only when at
+            # least one bridge actually needs it.
+            origin_x = self._cube_origin_x()
+            for node_id, pos in bridge_positions.items():
+                self._bridge_xs[node_id] = pos[0] - origin_x
+
+    def _cube_origin_x(self) -> float:
+        """Compute absolute X origin of this cube.
+
+        The cube index comes from the first dotted segment of the node id
+        starting with ``"cube"``; its column in the cube mesh is multiplied by
+        the cube width plus the seam distance.
+        """
+        parts = self.node.id.split(".")
+        cube_str = [p for p in parts if p.startswith("cube")][0]
+        cube_id = int(cube_str[4:])
+        spec = self.ctx.spec
+        sip_spec = spec.get("sip", {})
+        cube_spec = spec.get("cube", {})
+        mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4)
+        cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
+        seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get(
+            "distance_mm_across_seam", 1.0)
+        col = cube_id % mesh_w
+        return col * (cube_w + seam)
+
+    # ── Worker override ───────────────────────────────────────────
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Take transactions from the inbox and forward each one.
+
+        Every transaction is handled in its own simpy process, so the worker
+        returns to the inbox without waiting for that transaction's delay.
+        """
+        while True:
+            txn: Any = yield self._inbox.get()
+            env.process(self._position_aware_forward(env, txn))
+
+    def _position_aware_forward(
+        self, env: simpy.Environment, txn: Any,
+    ) -> Generator:
+        """Delay ``txn`` by the position-aware overhead, then pass it on.
+
+        The distance term is added only when both neighbouring hops exist,
+        ``_ns_per_mm`` is positive, and both port X positions resolve.
+        With no next hop the transaction ends here: an optional positive
+        ``txn.drain_ns`` delay is applied and ``txn.done`` is succeeded.
+        """
+        prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
+        next_hop = txn.next_hop
+
+        overhead = self._base_overhead_ns
+        if prev_hop and next_hop and self._ns_per_mm > 0:
+            entry_x = self._get_port_x(prev_hop, txn.path)
+            exit_x = self._get_port_x(next_hop, txn.path)
+            if entry_x is not None and exit_x is not None:
+                overhead = self._base_overhead_ns + abs(entry_x - exit_x) * self._ns_per_mm
+
+        yield env.timeout(overhead)
+
+        if next_hop:
+            yield self.out_ports[next_hop].put(txn.advance())
+        else:
+            drain = getattr(txn, "drain_ns", 0.0)
+            if drain > 0:
+                yield env.timeout(drain)
+            txn.done.succeed()
+
+    def _get_port_x(self, node_id: str, path: list[str]) -> float | None:
+        """Resolve the X position of an XBAR port from node context.
+
+        Args:
+            node_id: Id of the hop adjacent to this XBAR.
+            path: Full transaction path, used only for NOC hops.
+
+        Returns:
+            The port X position, or ``None`` when the node is unknown.
+        """
+        # Direct lookups, in priority order: PE DMA, HBM slice, bridge.
+        for table in (self._pe_router_xs, self._slice_xs, self._bridge_xs):
+            if node_id in table:
+                return table[node_id]
+        # NOC: use the first PE DMA node found on the path.
+        if "noc" in node_id:
+            for hop in path:
+                if hop in self._pe_router_xs:
+                    return self._pe_router_xs[hop]
+        return None