Files
kernbench2/topology.yaml
T
2026-03-18 11:47:48 -07:00

127 lines
4.9 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# System-level parameters: global wire-delay constant, number of SIPs,
# the system switch component, and the IO-endpoint-to-switch link.
# Key suffixes carry the units: *_ns = nanoseconds, *_mm = millimetres,
# *_gbs = GB/s.
system:
ns_per_mm: 0.01 # wire propagation delay: 0.01 ns/mm = 10 ps/mm (on-chip silicon)
sips:
count: 2
components:
# Fixed per-traversal overhead of the switch, in ns.
switch: { kind: switch, impl: switch_v1, attrs: { overhead_ns: 5.0 } }
links:
# Link between an IO chiplet PCIe endpoint and the system switch.
io_ep_to_switch:
kind: pcie
# Bandwidth is specified per endpoint, not as an aggregate.
bw_gbs_per_ep: 768.0
distance_mm: 20.0
# Per-SIP description: cube mesh dimensions, the IO chiplet (components,
# internal links, placed instances), inter-cube links, and routing.
sip:
# 4 x 4 mesh = 16 cubes per SIP.
cube_mesh: { w: 4, h: 4 }
iochiplet:
components:
pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } }
io_cpu: { kind: io_cpu, impl: io_cpu_v1, attrs: { overhead_ns: 10.0 } }
links:
# NOTE(review): this was documented as matching
# system.links.io_ep_to_switch.bw_gbs_per_ep, but that key is 768.0 while
# this one is 256.0 — confirm which value is intended.
pcie_ep_to_io_cpu_bw_gbs: 256.0
pcie_ep_to_io_cpu_mm: 1.0
io_cpu_to_ucie_bw_gbs: 512.0 # matches ucie.phy_bw_gbs per PHY
io_cpu_to_ucie_mm: 1.5
instances:
# One IO chiplet instance with four UCIe PHYs (P0..P3).
- id: io0
place: { side: N, offset_norm: 0.5 }
ucie: { phy_bw_gbs: 512.0, phys: [P0, P1, P2, P3] }
# Each PHY attaches to the N side of one cube in row y=0 (x = 0..3).
cube_ports:
- { cube: {xy: [0,0]}, cube_side: N, phy: P0, distance_mm: 2.0 }
- { cube: {xy: [1,0]}, cube_side: N, phy: P1, distance_mm: 2.0 }
- { cube: {xy: [2,0]}, cube_side: N, phy: P2, distance_mm: 2.0 }
- { cube: {xy: [3,0]}, cube_side: N, phy: P3, distance_mm: 2.0 }
links:
# Cube-to-cube mesh links; bandwidth is given per UCIe PHY.
inter_cube_mesh:
bw_gbs_per_ucie_phy: 512.0
distance_mm_across_seam: 1.0
routing: { algo: xy }
# Per-cube description. This first part covers physical geometry (in mm)
# and how PEs are laid out around the cube corners.
cube:
geometry:
# Footprints in millimetres (width x height).
cube_mm: { w: 17.0, h: 14.0 }
hbm_mm: { w: 9.0, h: 5.0 }
ucie_mm: { size: 2.0 }
pe_layout:
corners: [NW, NE, SW, SE] # N corners → xbar top row; S corners → xbar bottom row
pe_per_corner: 2 # total PEs per cube: 4 corners * 2 = 8
# Template for a single PE: its internal components and the links
# (distance in mm, bandwidth in GB/s) between them.
pe_template:
components:
pe_cpu: { kind: pe_cpu, impl: pe_cpu_v1, attrs: { overhead_ns: 2.0 } }
pe_scheduler: { kind: pe_scheduler, impl: pe_scheduler_v1, attrs: { overhead_ns: 1.0 } }
pe_dma: { kind: pe_dma, impl: pe_dma_v1, attrs: { rd_engines: 1, wr_engines: 1 } }
# pe_gemm and pe_math both name shared_resource: accel_slot.
# NOTE(review): presumably they contend for one slot — confirm in the impls.
pe_gemm: { kind: pe_gemm, impl: pe_gemm_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } }
pe_math: { kind: pe_math, impl: pe_math_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
pe_tcm: { kind: pe_tcm, impl: pe_tcm_v1, attrs:
{ size_mb: 16 } }
links:
# Control-path links carry only a distance; no bandwidth is specified.
pe_cpu_to_scheduler_mm: 0.5
scheduler_to_dma_mm: 0.5
scheduler_to_gemm_mm: 0.5
scheduler_to_math_mm: 0.5
# Data-path links into TCM: all three are 512 GB/s over 0.5 mm.
dma_to_tcm_bw_gbs: 512.0
dma_to_tcm_mm: 0.5
gemm_to_tcm_bw_gbs: 512.0 # GEMM reads inputs from TCM (ADR-0014 D5)
gemm_to_tcm_mm: 0.5
math_to_tcm_bw_gbs: 512.0
math_to_tcm_mm: 0.5
# HBM capacity and bandwidth totals for one cube.
memory_map:
hbm_total_gb_per_cube: 48
# NOTE(review): if slices are equal-sized this is 48 / 8 = 6 GB per slice — confirm.
hbm_slices_per_cube: 8
# Aggregate HBM bandwidth in GB/s.
hbm_total_bw_gbs: 1024.0
# Cube-level components: NoC, management CPU, crossbars (PE xbar plus left and
# right bridges), HBM controller, SRAM, and the UCIe block.
components:
noc: { kind: noc, impl: noc_2d_mesh_v1, attrs: { overhead_ns: 0.0 } }
m_cpu: { kind: m_cpu, impl: m_cpu_v1, attrs: { overhead_ns: 5.0 } }
xbar:
# PE-facing crossbar (2.0 ns) and two bridge crossbars (1.0 ns each);
# all use the same xbar_v1 implementation.
pe: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
bridges:
- { id: left, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
- { id: right, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
# NOTE(review): the unit/meaning of `capacity: 1` is not stated here — confirm in hbm_ctrl_v1.
hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1 } }
sram: { kind: sram, impl: sram_v1, attrs: { size_mb: 32, overhead_ns: 2.0 } }
ucie:
# One UCIe port per cube side; overhead_ns is in ns.
# NOTE(review): the effect of `decompose: true` is not visible here — confirm in the consumer.
decompose: true
ports: [N, S, E, W]
overhead_ns: 1.0
# Cube-level links: bandwidths in GB/s (*_bw_gbs) and wire distances in mm (*_mm).
links:
# NOTE(review): the "2048" aggregate below is not defined elsewhere in this
# file, and memory_map.hbm_total_bw_gbs is 1024.0 — confirm the intended source.
pe_to_xbar_bw_gbs: 256.0 # per-PE effective (2048 / 8 PEs)
xbar_to_hbm_bw_gbs: 256.0 # per-PE effective (2048 / 8 PEs)
xbar_to_bridge_bw_gbs: 128.0 # bridge BW (same as xbar chain BW)
xbar_x_bw_gbs: 128.0 # X-direction BW for xbar chain traversal
xbar_chain_intra_corner_mm: 2.0 # xbar wire distance within same corner PE pair
xbar_chain_inter_corner_mm: 10.0 # xbar wire distance between corner pairs (NW↔NE, SW↔SE)
xbar_row_n_to_bridge_mm: 3.0
xbar_row_s_to_bridge_mm: 3.0
xbar_to_hbm_mm: 2.5
pe_to_xbar_row_n_mm: 6.0
pe_to_xbar_row_s_mm: 6.0
pe_dma_to_noc_mm: 0.0 # noc is distributed; distance modeled as 0
pe_dma_to_noc_bw_gbs: 512.0 # PE non-HBM data path BW
# NOTE(review): unlike noc_to_sram and noc_to_ucie, no n_connections is given
# for noc_to_xbar — confirm whether the consumer applies a default.
noc_to_xbar:
per_connection_bw_gbs: 128.0 # BW per NOC connection
noc_to_sram_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_sram:
per_connection_bw_gbs: 128.0 # BW per NOC connection
n_connections: 4 # 4 * 128 = 512 GB/s aggregate
noc_to_ucie:
per_connection_bw_gbs: 128.0 # BW per NOC connection
n_connections: 4 # 4 * 128 = 512 GB/s = UCIe PHY BW
m_cpu_to_noc_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_pe_cpu_mm: 0.0 # noc is distributed; distance modeled as 0
# Selects which views to emit and which SIPs and cubes to include.
visualization:
emit_views: [system, sip, cube]
sip_ids: [0]
# NOTE(review): presumably linear cube indices into the 4 x 4 cube_mesh
# (valid range 0..15) — confirm the index-to-xy mapping in the consumer.
cubes: [0, 9, 15]