Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep

- Probe CLI: restructured output (tables first, routes below), per-hop timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 01:16:18 -07:00
parent 6f43807900
commit d75da439c6
24 changed files with 3456 additions and 501 deletions
@@ -110,7 +110,7 @@ class PathRouter:
    def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_slice_id: str) -> list[str]:
        """M_CPU DMA path: never routes through PE-internal nodes (ADR-0015 D5).

-        Same-cube: deterministic [m_cpu, noc, xbar.pe_i, hbm_ctrl.slice_i].
+        Same-cube: deterministic [m_cpu, noc, xbar_top/bot, hbm_ctrl.slice_i].
        Cross-cube: Dijkstra via _adj_mcpu_dma (pe_internal/pe_to_xbar excluded)
                    → routes through NOC → UCIe → target cube NOC → xbar → HBM.
        """
@@ -118,14 +118,23 @@ class PathRouter:
        d_cube = ".".join(dst_hbm_slice_id.split(".")[:2])
        if m_cube == d_cube:
            slice_idx = int(dst_hbm_slice_id.rsplit("slice", 1)[1])
+            xbar = "xbar_top" if slice_idx < 4 else "xbar_bot"
            return [
                m_cpu_id,
                f"{m_cube}.noc",
-                f"{m_cube}.xbar.pe{slice_idx}",
+                f"{m_cube}.{xbar}",
                dst_hbm_slice_id,
            ]
        return self._run_dijkstra(self._adj_mcpu_dma, m_cpu_id, dst_hbm_slice_id)

+    def find_memory_path(self, src: str, dst: str) -> list[str]:
+        """Direct memory path: pcie_ep → io_noc → cube → xbar → hbm_ctrl.
+
+        Uses _adj_mcpu_dma which excludes pe_internal and pe_to_xbar edges,
+        preventing routing through PE pipeline nodes.
+        """
+        return self._run_dijkstra(self._adj_mcpu_dma, src, dst)
+
    def find_node_path(self, src: str, dst: str) -> list[str]:
        """General routing between arbitrary nodes, including command edges.