Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep

- Probe CLI: restructured output (tables first, routes below), per-hop timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 01:16:18 -07:00
parent 6f43807900
commit d75da439c6
24 changed files with 3456 additions and 501 deletions
@@ -18,11 +18,10 @@ from kernbench.topology.types import Edge, TopologyGraph
 class GraphEngine:
    """simpy-based discrete-event simulation engine.

-    Phase B: engine injects a Transaction into the PCIE_EP host queue for
-    each request. Components handle their own routing:
-      Path 1: PCIE_EP → IO_CPU   (engine-computed path, pre-loaded in Transaction)
-      Path 2: IO_CPU → M_CPU     (IO_CPU dispatches, fire-and-forget callback)
-      Path 3: M_CPU.DMA → HBM   (M_CPU dispatches, fire-and-forget callback)
+    Request routing:
+      MemoryWrite/Read: pcie_ep → io_noc → cube → xbar → hbm_ctrl (m_cpu bypass)
+      KernelLaunch:     pcie_ep → io_noc → io_cpu → io_noc → cube → m_cpu → PE
+      PeDmaMsg:         pe_dma → xbar → hbm_ctrl (direct probe)

    Component implementations are DI-injectable via component_overrides (ADR-0007 D3).
    """
@@ -68,18 +67,20 @@ class GraphEngine:
            src_comp.out_ports[e.dst] = store
            dst_comp.in_ports[e.src] = store

-        # Wire processes: propagation delay per edge (ADR-0015 D2)
-        # Cut-through (wormhole) model: wires apply propagation only.
-        # Serialization (drain) is computed per-path and applied once at the terminal.
+        # Wire processes: propagation delay + BW occupancy per edge (ADR-0015 D2)
+        # Cut-through (wormhole) model: wires apply propagation delay per hop.
+        # BW occupancy (available_at) tracks when each directed link becomes free
+        # for the next transaction, modeling back-to-back serialization contention.
        for e in graph.edges:
            src_comp = self._components.get(e.src)
            dst_comp = self._components.get(e.dst)
            if src_comp is None or dst_comp is None:
                continue
            prop_ns = e.distance_mm * self._ns_per_mm
+            bw_gbs = e.bw_gbs or 0.0
            self._env.process(
                self._wire(src_comp.out_ports[e.dst], dst_comp.in_ports[e.src],
-                           prop_ns)
+                           prop_ns, bw_gbs)
            )

        # Attach host queues to PCIE_EP in_ports before start() (ADR-0015 D3)
@@ -125,14 +126,33 @@ class GraphEngine:
        out_port: simpy.Store,
        in_port: simpy.Store,
        prop_ns: float,
+        bw_gbs: float = 0.0,
    ):
-        """SimPy process: relay messages with propagation delay only.
+        """SimPy process: relay messages with propagation delay and BW occupancy.

-        Cut-through (wormhole) model: serialization (drain) is computed per-path
-        and applied once at the terminal component, not at every wire hop.
+        Each directed edge maintains an ``available_at`` timestamp tracking when
+        the link becomes free for the next transaction.  When a transaction of
+        ``nbytes`` uses a link with ``bw_gbs``, the link is occupied for
+        ``nbytes / bw_gbs`` ns.  The *next* transaction on the same directed
+        link must wait until ``available_at`` passes (back-to-back serialization).
+
+        The *current* transaction is NOT delayed by its own occupancy — only by
+        a prior transaction's occupancy that has not yet cleared.  This avoids
+        double-drain: terminal drain_ns handles single-transaction serialization,
+        while available_at handles inter-transaction BW contention.
        """
+        available_at = 0.0
        while True:
            msg = yield out_port.get()
+            # BW occupancy: wait for link to become free, then mark busy
+            if bw_gbs > 0:
+                nbytes = getattr(msg, "nbytes", 0)
+                if nbytes > 0:
+                    wait = available_at - self._env.now
+                    if wait > 0:
+                        yield self._env.timeout(wait)
+                    available_at = self._env.now + (nbytes / bw_gbs)
+            # Propagation delay
            if prop_ns > 0:
                yield self._env.timeout(prop_ns)
            yield in_port.put(msg)
@@ -142,6 +162,10 @@ class GraphEngine:
            yield from self._process_pe_dma(key, request, done)
            return

+        if isinstance(request, (MemoryWriteMsg, MemoryReadMsg)):
+            yield from self._process_memory_direct(key, request, done)
+            return
+
        entries = self._entry_points(request)
        if not entries:
            self._results[key] = (
@@ -200,6 +224,44 @@ class GraphEngine:
        )
        done.succeed()

+    def _process_memory_direct(self, key: str, request: Any, done: simpy.Event):
+        """Direct memory path: pcie_ep → io_noc → cube → xbar → hbm_ctrl.
+
+        MemoryWrite: data flows forward (nbytes on wires), drain at hbm_ctrl terminal.
+        MemoryRead: command flows forward (nbytes=0), hbm_ctrl sends data back on
+                    reverse path with nbytes=request.nbytes.
+        """
+        if isinstance(request, MemoryWriteMsg):
+            sip, pa_val = request.dst_sip, request.dst_pa
+        else:
+            sip, pa_val = request.src_sip, request.src_pa
+
+        pcie_ep_id = self._resolver.find_pcie_ep(sip)
+        pa = PhysAddr.decode(pa_val)
+        hbm_node = self._resolver.resolve(pa)
+        path = self._router.find_memory_path(pcie_ep_id, hbm_node)
+        drain_ns = self._path_drain_ns(path, request.nbytes)
+
+        start_ns = self._env.now
+        txn_done = self._env.event()
+
+        is_write = isinstance(request, MemoryWriteMsg)
+        txn = Transaction(
+            request=request, path=path, step=0,
+            nbytes=request.nbytes if is_write else 0,
+            done=txn_done, drain_ns=drain_ns,
+        )
+
+        yield self._host_queues[pcie_ep_id].put(txn)
+        yield txn_done
+
+        total_ns = self._env.now - start_ns
+        self._results[key] = (
+            Completion(ok=True),
+            {"total_ns": total_ns, "nbytes": request.nbytes},
+        )
+        done.succeed()
+
    def _process_pe_dma(self, key: str, request: PeDmaMsg, done: simpy.Event):
        """Inject a Transaction directly at PE_DMA for PE→HBM latency measurement."""
        pe_prefix = f"sip{request.src_sip}.cube{request.src_cube}.pe{request.src_pe}"
@@ -260,25 +322,8 @@ class GraphEngine:
    def _entry_points(self, request: Any) -> list[tuple[str, str, int]]:
        """Return list of (pcie_ep_id, io_cpu_id, nbytes) per target SIP.

-        For Memory{Write,Read}: single SIP entry.
-        For KernelLaunchMsg: one entry per distinct SIP in tensor shards.
+        Only handles KernelLaunchMsg. MemoryWrite/Read use _process_memory_direct.
        """
-        if isinstance(request, MemoryWriteMsg):
-            sip = request.dst_sip
-            return [(
-                self._resolver.find_pcie_ep(sip),
-                self._resolver.find_io_cpu(sip),
-                request.nbytes,
-            )]
-
-        if isinstance(request, MemoryReadMsg):
-            sip = request.src_sip
-            return [(
-                self._resolver.find_pcie_ep(sip),
-                self._resolver.find_io_cpu(sip),
-                request.nbytes,
-            )]
-
        if isinstance(request, KernelLaunchMsg):
            seen: set[int] = set()
            entries: list[tuple[str, str, int]] = []