Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep

- Probe CLI: restructured output (tables first, routes below), per-hop timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 01:16:18 -07:00
parent 6f43807900
commit d75da439c6
24 changed files with 3456 additions and 501 deletions
@@ -21,11 +21,17 @@ sip:
    components:
      pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } }
      io_cpu:  { kind: io_cpu,  impl: io_cpu_v1,  attrs: { overhead_ns: 10.0 } }
+      io_noc:  { kind: io_noc,  impl: forwarding_v1, attrs: { overhead_ns: 0.0 } }
    links:
-      pcie_ep_to_io_cpu_bw_gbs: 256.0   # matches system.links.io_ep_to_switch.bw_gbs_per_ep
-      pcie_ep_to_io_cpu_mm: 1.0
-      io_cpu_to_ucie_bw_gbs: 512.0      # matches ucie.phy_bw_gbs per PHY
-      io_cpu_to_ucie_mm: 1.5
+      pcie_ep_to_noc_bw_gbs: 256.0
+      pcie_ep_to_noc_mm: 1.0
+      io_cpu_to_noc_bw_gbs: 256.0
+      io_cpu_to_noc_mm: 0.5
+    ucie:
+      overhead_ns: 8.0
+      n_connections: 4
+      per_connection_bw_gbs: 128.0   # 4 × 128 = 512 GB/s = PHY BW
+      noc_to_ucie_mm: 0.5
    instances:
      - id: io0
        place: { side: N, offset_norm: 0.5 }
@@ -82,41 +88,33 @@ cube:
    noc:     { kind: noc,     impl: noc_2d_mesh_v1, attrs: { overhead_ns: 0.0 } }
    m_cpu:   { kind: m_cpu,   impl: m_cpu_v1,   attrs: { overhead_ns: 5.0 } }
    xbar:
-      pe:     { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
+      top:    { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
+      bottom: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
      bridges:
        - { id: left,  kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
        - { id: right, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
-    hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1 } }
+    hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1, efficiency: 0.8 } }
    sram:     { kind: sram,     impl: sram_v1,     attrs: { size_mb: 32, overhead_ns: 2.0 } }

  ucie:
    decompose: true
    ports: [N, S, E, W]
-    overhead_ns: 1.0
+    overhead_ns: 8.0
+    n_connections: 4               # independent NOC↔UCIe connections per port
+    per_connection_bw_gbs: 128.0   # BW per connection; 4 × 128 = 512 GB/s = UCIe PHY BW

  links:
-    pe_to_xbar_bw_gbs: 256.0         # per-PE effective (2048 / 8 PEs)
-    xbar_to_hbm_bw_gbs: 256.0        # per-PE effective (2048 / 8 PEs)
-    xbar_to_bridge_bw_gbs: 128.0     # bridge BW (same as xbar chain BW)
-    xbar_x_bw_gbs: 128.0             # X-direction BW for xbar chain traversal
-    xbar_chain_intra_corner_mm: 2.0  # xbar wire distance within same corner PE pair
-    xbar_chain_inter_corner_mm: 10.0 # xbar wire distance between corner pairs (NW↔NE, SW↔SE)
-    xbar_row_n_to_bridge_mm: 3.0
-    xbar_row_s_to_bridge_mm: 3.0
+    xbar_to_hbm_bw_gbs: 256.0        # per-slice effective (2048 / 8 slices)
+    xbar_to_bridge_bw_gbs: 128.0     # bridge BW (xbar_top/bot ↔ bridge)
+    xbar_to_bridge_mm: 3.0           # xbar ↔ bridge wire distance
    xbar_to_hbm_mm: 2.5
-    pe_to_xbar_row_n_mm: 6.0
-    pe_to_xbar_row_s_mm: 6.0
-    pe_dma_to_noc_mm: 0.0        # noc is distributed; distance modeled as 0
-    pe_dma_to_noc_bw_gbs: 512.0  # PE non-HBM data path BW
-    noc_to_xbar:
-      per_connection_bw_gbs: 128.0   # BW per NOC connection
+    pe_dma_to_noc_bw_gbs: 256.0  # PE → NOC BW (= HBM slice BW, no bottleneck)
+    noc_to_xbar_mm: 0.0          # noc is distributed; distance modeled as 0
+    noc_to_xbar_bw_gbs: 256.0    # NOC → xbar_top/bot BW (= HBM slice BW)
    noc_to_sram_mm: 0.0          # noc is distributed; distance modeled as 0
    noc_to_sram:
      per_connection_bw_gbs: 128.0   # BW per NOC connection
      n_connections: 4               # 4 × 128 = 512 GB/s aggregate
-    noc_to_ucie:
-      per_connection_bw_gbs: 128.0   # BW per NOC connection
-      n_connections: 4               # 4 × 128 = 512 GB/s = UCIe PHY BW
    m_cpu_to_noc_mm: 0.0         # noc is distributed; distance modeled as 0
    noc_to_pe_cpu_mm: 0.0        # noc is distributed; distance modeled as 0