Files
kernbench2/topology.yaml
T
ywkang d75da439c6 Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep
- Probe CLI: restructured output (tables first, routes below), per-hop
  timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix
  cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing
  effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 01:16:18 -07:00

125 lines
4.7 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
system:
ns_per_mm: 0.01 # wire propagation delay: 10 ps/mm (on-chip silicon)
sips:
count: 2
components:
switch: { kind: switch, impl: switch_v1, attrs: { overhead_ns: 5.0 } }
links:
io_ep_to_switch:
kind: pcie
bw_gbs_per_ep: 768.0
distance_mm: 20.0
sip:
cube_mesh: { w: 4, h: 4 }
iochiplet:
components:
pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } }
io_cpu: { kind: io_cpu, impl: io_cpu_v1, attrs: { overhead_ns: 10.0 } }
io_noc: { kind: io_noc, impl: forwarding_v1, attrs: { overhead_ns: 0.0 } }
links:
pcie_ep_to_noc_bw_gbs: 256.0
pcie_ep_to_noc_mm: 1.0
io_cpu_to_noc_bw_gbs: 256.0
io_cpu_to_noc_mm: 0.5
ucie:
overhead_ns: 8.0
n_connections: 4
per_connection_bw_gbs: 128.0 # 4 × 128 = 512 GB/s = PHY BW
noc_to_ucie_mm: 0.5
instances:
- id: io0
place: { side: N, offset_norm: 0.5 }
ucie: { phy_bw_gbs: 512.0, phys: [P0, P1, P2, P3] }
cube_ports:
- { cube: {xy: [0,0]}, cube_side: N, phy: P0, distance_mm: 2.0 }
- { cube: {xy: [1,0]}, cube_side: N, phy: P1, distance_mm: 2.0 }
- { cube: {xy: [2,0]}, cube_side: N, phy: P2, distance_mm: 2.0 }
- { cube: {xy: [3,0]}, cube_side: N, phy: P3, distance_mm: 2.0 }
links:
inter_cube_mesh:
bw_gbs_per_ucie_phy: 512.0
distance_mm_across_seam: 1.0
routing: { algo: xy }
cube:
geometry:
cube_mm: { w: 17.0, h: 14.0 }
hbm_mm: { w: 9.0, h: 5.0 }
ucie_mm: { size: 2.0 }
pe_layout:
corners: [NW, NE, SW, SE] # N corners → xbar top row; S corners → xbar bottom row
pe_per_corner: 2 # total PEs per cube: 4 * 2 = 8
pe_template:
components:
pe_cpu: { kind: pe_cpu, impl: pe_cpu_v1, attrs: { overhead_ns: 2.0 } }
pe_scheduler: { kind: pe_scheduler, impl: pe_scheduler_v1, attrs: { overhead_ns: 1.0 } }
pe_dma: { kind: pe_dma, impl: pe_dma_v1, attrs: { rd_engines: 1, wr_engines: 1 } }
pe_gemm: { kind: pe_gemm, impl: pe_gemm_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } }
pe_math: { kind: pe_math, impl: pe_math_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
pe_tcm: { kind: pe_tcm, impl: pe_tcm_v1, attrs:
{ size_mb: 16 } }
links:
pe_cpu_to_scheduler_mm: 0.5
scheduler_to_dma_mm: 0.5
scheduler_to_gemm_mm: 0.5
scheduler_to_math_mm: 0.5
dma_to_tcm_bw_gbs: 512.0
dma_to_tcm_mm: 0.5
gemm_to_tcm_bw_gbs: 512.0 # GEMM reads inputs from TCM (ADR-0014 D5)
gemm_to_tcm_mm: 0.5
math_to_tcm_bw_gbs: 512.0
math_to_tcm_mm: 0.5
memory_map:
hbm_total_gb_per_cube: 48
hbm_slices_per_cube: 8
hbm_total_bw_gbs: 1024.0
components:
noc: { kind: noc, impl: noc_2d_mesh_v1, attrs: { overhead_ns: 0.0 } }
m_cpu: { kind: m_cpu, impl: m_cpu_v1, attrs: { overhead_ns: 5.0 } }
xbar:
top: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
bottom: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
bridges:
- { id: left, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
- { id: right, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1, efficiency: 0.8 } }
sram: { kind: sram, impl: sram_v1, attrs: { size_mb: 32, overhead_ns: 2.0 } }
ucie:
decompose: true
ports: [N, S, E, W]
overhead_ns: 8.0
n_connections: 4 # independent NOC↔UCIe connections per port
per_connection_bw_gbs: 128.0 # BW per connection; 4 × 128 = 512 GB/s = UCIe PHY BW
links:
xbar_to_hbm_bw_gbs: 256.0 # per-slice effective (2048 / 8 slices)
xbar_to_bridge_bw_gbs: 128.0 # bridge BW (xbar_top/bot ↔ bridge)
xbar_to_bridge_mm: 3.0 # xbar ↔ bridge wire distance
xbar_to_hbm_mm: 2.5
pe_dma_to_noc_bw_gbs: 256.0 # PE → NOC BW (= HBM slice BW, no bottleneck)
noc_to_xbar_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_xbar_bw_gbs: 256.0 # NOC → xbar_top/bot BW (= HBM slice BW)
noc_to_sram_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_sram:
per_connection_bw_gbs: 128.0 # BW per NOC connection
n_connections: 4 # 4 × 128 = 512 GB/s aggregate
m_cpu_to_noc_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_pe_cpu_mm: 0.0 # noc is distributed; distance modeled as 0
visualization:
emit_views: [system, sip, cube]
sip_ids: [0]
cubes: [0, 9, 15]