Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep
- Probe CLI: restructured output (tables first, routes below), per-hop timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+22
-24
@@ -21,11 +21,17 @@ sip:
|
||||
components:
|
||||
pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } }
|
||||
io_cpu: { kind: io_cpu, impl: io_cpu_v1, attrs: { overhead_ns: 10.0 } }
|
||||
io_noc: { kind: io_noc, impl: forwarding_v1, attrs: { overhead_ns: 0.0 } }
|
||||
links:
|
||||
pcie_ep_to_io_cpu_bw_gbs: 256.0 # matches system.links.io_ep_to_switch.bw_gbs_per_ep
|
||||
pcie_ep_to_io_cpu_mm: 1.0
|
||||
io_cpu_to_ucie_bw_gbs: 512.0 # matches ucie.phy_bw_gbs per PHY
|
||||
io_cpu_to_ucie_mm: 1.5
|
||||
pcie_ep_to_noc_bw_gbs: 256.0
|
||||
pcie_ep_to_noc_mm: 1.0
|
||||
io_cpu_to_noc_bw_gbs: 256.0
|
||||
io_cpu_to_noc_mm: 0.5
|
||||
ucie:
|
||||
overhead_ns: 8.0
|
||||
n_connections: 4
|
||||
per_connection_bw_gbs: 128.0 # 4 × 128 = 512 GB/s = PHY BW
|
||||
noc_to_ucie_mm: 0.5
|
||||
instances:
|
||||
- id: io0
|
||||
place: { side: N, offset_norm: 0.5 }
|
||||
@@ -82,41 +88,33 @@ cube:
|
||||
noc: { kind: noc, impl: noc_2d_mesh_v1, attrs: { overhead_ns: 0.0 } }
|
||||
m_cpu: { kind: m_cpu, impl: m_cpu_v1, attrs: { overhead_ns: 5.0 } }
|
||||
xbar:
|
||||
pe: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
|
||||
top: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
|
||||
bottom: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
|
||||
bridges:
|
||||
- { id: left, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
|
||||
- { id: right, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
|
||||
hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1 } }
|
||||
hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1, efficiency: 0.8 } }
|
||||
sram: { kind: sram, impl: sram_v1, attrs: { size_mb: 32, overhead_ns: 2.0 } }
|
||||
|
||||
ucie:
|
||||
decompose: true
|
||||
ports: [N, S, E, W]
|
||||
overhead_ns: 1.0
|
||||
overhead_ns: 8.0
|
||||
n_connections: 4 # independent NOC↔UCIe connections per port
|
||||
per_connection_bw_gbs: 128.0 # BW per connection; 4 × 128 = 512 GB/s = UCIe PHY BW
|
||||
|
||||
links:
|
||||
pe_to_xbar_bw_gbs: 256.0 # per-PE effective (2048 / 8 PEs)
|
||||
xbar_to_hbm_bw_gbs: 256.0 # per-PE effective (2048 / 8 PEs)
|
||||
xbar_to_bridge_bw_gbs: 128.0 # bridge BW (same as xbar chain BW)
|
||||
xbar_x_bw_gbs: 128.0 # X-direction BW for xbar chain traversal
|
||||
xbar_chain_intra_corner_mm: 2.0 # xbar wire distance within same corner PE pair
|
||||
xbar_chain_inter_corner_mm: 10.0 # xbar wire distance between corner pairs (NW↔NE, SW↔SE)
|
||||
xbar_row_n_to_bridge_mm: 3.0
|
||||
xbar_row_s_to_bridge_mm: 3.0
|
||||
xbar_to_hbm_bw_gbs: 256.0 # per-slice effective (2048 / 8 slices)
|
||||
xbar_to_bridge_bw_gbs: 128.0 # bridge BW (xbar_top/bot ↔ bridge)
|
||||
xbar_to_bridge_mm: 3.0 # xbar ↔ bridge wire distance
|
||||
xbar_to_hbm_mm: 2.5
|
||||
pe_to_xbar_row_n_mm: 6.0
|
||||
pe_to_xbar_row_s_mm: 6.0
|
||||
pe_dma_to_noc_mm: 0.0 # noc is distributed; distance modeled as 0
|
||||
pe_dma_to_noc_bw_gbs: 512.0 # PE non-HBM data path BW
|
||||
noc_to_xbar:
|
||||
per_connection_bw_gbs: 128.0 # BW per NOC connection
|
||||
pe_dma_to_noc_bw_gbs: 256.0 # PE → NOC BW (= HBM slice BW, no bottleneck)
|
||||
noc_to_xbar_mm: 0.0 # noc is distributed; distance modeled as 0
|
||||
noc_to_xbar_bw_gbs: 256.0 # NOC → xbar_top/bot BW (= HBM slice BW)
|
||||
noc_to_sram_mm: 0.0 # noc is distributed; distance modeled as 0
|
||||
noc_to_sram:
|
||||
per_connection_bw_gbs: 128.0 # BW per NOC connection
|
||||
n_connections: 4 # 4 × 128 = 512 GB/s aggregate
|
||||
noc_to_ucie:
|
||||
per_connection_bw_gbs: 128.0 # BW per NOC connection
|
||||
n_connections: 4 # 4 × 128 = 512 GB/s = UCIe PHY BW
|
||||
m_cpu_to_noc_mm: 0.0 # noc is distributed; distance modeled as 0
|
||||
noc_to_pe_cpu_mm: 0.0 # noc is distributed; distance modeled as 0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user