Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep

- Probe CLI: restructured output (tables first, routes below), per-hop
  timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix the
  latency inversion where cross-cube-best came out lower than cross-half
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing
  effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-19 01:16:18 -07:00
parent 6f43807900
commit d75da439c6
24 changed files with 3456 additions and 501 deletions
+22 -24
View File
@@ -21,11 +21,17 @@ sip:
# Component table under `sip` (per the hunk header). Each entry is a flow
# mapping giving the component kind, the implementation name, and its attrs.
# NOTE(review): indentation was lost in this rendering, so the nesting of
# the entries below must be confirmed against the real file.
components:
pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } }
io_cpu: { kind: io_cpu, impl: io_cpu_v1, attrs: { overhead_ns: 10.0 } }
# io_noc uses the forwarding_v1 impl with an overhead_ns of 0.0.
io_noc: { kind: io_noc, impl: forwarding_v1, attrs: { overhead_ns: 0.0 } }
# Link parameters, given as `<a>_to_<b>_bw_gbs` / `<a>_to_<b>_mm` pairs
# (presumably bandwidth in GB/s and wire length in mm — confirm with the
# consumer of this file).
# NOTE(review): this is a rendered diff without +/- markers. The direct
# pcie_ep_to_io_cpu_* and io_cpu_to_ucie_* keys and the *_to_noc_* keys are
# presumably the old and new sides of the change; verify which set exists
# in the final file.
links:
pcie_ep_to_io_cpu_bw_gbs: 256.0 # matches system.links.io_ep_to_switch.bw_gbs_per_ep
pcie_ep_to_io_cpu_mm: 1.0
io_cpu_to_ucie_bw_gbs: 512.0 # matches ucie.phy_bw_gbs per PHY
io_cpu_to_ucie_mm: 1.5
# Links from pcie_ep and io_cpu into the NOC; both carry 256.0 bw_gbs.
pcie_ep_to_noc_bw_gbs: 256.0
pcie_ep_to_noc_mm: 1.0
io_cpu_to_noc_bw_gbs: 256.0
io_cpu_to_noc_mm: 0.5
# UCIe settings for this section. n_connections × per_connection_bw_gbs
# (4 × 128.0) gives the 512 GB/s PHY bandwidth noted inline.
ucie:
# 8.0 matches the per-port UCIe overhead stated in the commit description.
overhead_ns: 8.0
n_connections: 4
per_connection_bw_gbs: 128.0 # 4 × 128 = 512 GB/s = PHY BW
noc_to_ucie_mm: 0.5
instances:
- id: io0
place: { side: N, offset_norm: 0.5 }
@@ -82,41 +88,33 @@ cube:
# Component entries inside `cube` (per the hunk header); the start of the
# enclosing mapping is outside this view.
noc: { kind: noc, impl: noc_2d_mesh_v1, attrs: { overhead_ns: 0.0 } }
m_cpu: { kind: m_cpu, impl: m_cpu_v1, attrs: { overhead_ns: 5.0 } }
# xbar group: pe/top/bottom entries at overhead_ns 2.0, plus a `bridges`
# list (left, right) at overhead_ns 1.0.
# NOTE(review): some of these lines may be the removed side of the diff;
# confirm which xbar entries remain in the real file.
xbar:
pe: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
top: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
bottom: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
bridges:
- { id: left, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
- { id: right, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
# NOTE(review): hbm_ctrl appears twice; the second entry only adds
# `efficiency: 0.8` (the factor named in the commit description). This is
# presumably the before/after pair of the diff. Taken literally it is a
# duplicate key, where most parsers silently keep the last value — confirm
# only the second line exists in the real file.
hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1 } }
hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1, efficiency: 0.8 } }
sram: { kind: sram, impl: sram_v1, attrs: { size_mb: 32, overhead_ns: 2.0 } }
# UCIe settings for the cube: four ports (N, S, E, W), each with
# n_connections × per_connection_bw_gbs = 4 × 128.0 = 512 GB/s as noted inline.
ucie:
decompose: true
ports: [N, S, E, W]
# NOTE(review): overhead_ns appears twice (1.0, then 8.0). The commit
# description says the per-port overhead went from 1ns to 8ns, so 1.0 is
# presumably the removed diff line. Taken literally this is a duplicate key;
# confirm only 8.0 remains in the real file.
overhead_ns: 1.0
overhead_ns: 8.0
n_connections: 4 # independent NOC↔UCIe connections per port
per_connection_bw_gbs: 128.0 # BW per connection; 4 × 128 = 512 GB/s = UCIe PHY BW
# Link bandwidths (`*_bw_gbs`) and wire distances (`*_mm`) for the cube.
# NOTE(review): this span is a rendered diff with the +/- markers stripped,
# so removed and added lines are interleaved and several keys occur twice.
# Taken literally that is a duplicate-key error (most parsers silently keep
# the last value); confirm which line of each pair survives in the real file.
links:
pe_to_xbar_bw_gbs: 256.0 # per-PE effective (2048 / 8 PEs)
# NOTE(review): xbar_to_hbm_bw_gbs and xbar_to_bridge_bw_gbs recur further
# down with the same values but reworded comments (PE -> slice wording).
xbar_to_hbm_bw_gbs: 256.0 # per-PE effective (2048 / 8 PEs)
xbar_to_bridge_bw_gbs: 128.0 # bridge BW (same as xbar chain BW)
xbar_x_bw_gbs: 128.0 # X-direction BW for xbar chain traversal
xbar_chain_intra_corner_mm: 2.0 # xbar wire distance within same corner PE pair
xbar_chain_inter_corner_mm: 10.0 # xbar wire distance between corner pairs (NW↔NE, SW↔SE)
# NOTE(review): the per-row bridge distances (both 3.0) look superseded by
# the single xbar_to_bridge_mm: 3.0 key below — verify.
xbar_row_n_to_bridge_mm: 3.0
xbar_row_s_to_bridge_mm: 3.0
xbar_to_hbm_bw_gbs: 256.0 # per-slice effective (2048 / 8 slices)
xbar_to_bridge_bw_gbs: 128.0 # bridge BW (xbar_top/bot ↔ bridge)
xbar_to_bridge_mm: 3.0 # xbar ↔ bridge wire distance
xbar_to_hbm_mm: 2.5
pe_to_xbar_row_n_mm: 6.0
pe_to_xbar_row_s_mm: 6.0
pe_dma_to_noc_mm: 0.0 # noc is distributed; distance modeled as 0
# NOTE(review): pe_dma_to_noc_bw_gbs appears as 512.0 here and as 256.0 a
# few lines below; the two values conflict, so confirm which one is current.
pe_dma_to_noc_bw_gbs: 512.0 # PE non-HBM data path BW
# NOTE(review): the nested noc_to_xbar / noc_to_sram / noc_to_ucie mappings
# sit beside flat noc_to_xbar_mm / noc_to_xbar_bw_gbs / noc_to_sram_mm keys;
# these are likely the old and new forms of the same settings — verify
# which form the loader expects.
noc_to_xbar:
per_connection_bw_gbs: 128.0 # BW per NOC connection
pe_dma_to_noc_bw_gbs: 256.0 # PE → NOC BW (= HBM slice BW, no bottleneck)
noc_to_xbar_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_xbar_bw_gbs: 256.0 # NOC → xbar_top/bot BW (= HBM slice BW)
noc_to_sram_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_sram:
per_connection_bw_gbs: 128.0 # BW per NOC connection
n_connections: 4 # 4 × 128 = 512 GB/s aggregate
noc_to_ucie:
per_connection_bw_gbs: 128.0 # BW per NOC connection
n_connections: 4 # 4 × 128 = 512 GB/s = UCIe PHY BW
m_cpu_to_noc_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_pe_cpu_mm: 0.0 # noc is distributed; distance modeled as 0