Files
kernbench2/topology.yaml
T
ywkang 62fb01ae18 Add reverse path response latency for PE DMA and PE_CPU→M_CPU
Model fabric response hop latency for PE-internal operations:
- HBM_CTRL sends PeDmaMsg response on reverse path instead of direct done signal
- PE_CPU sends ResponseMsg via NOC→M_CPU on kernel completion
- Add NOC→PE_DMA and PE_CPU→NOC edges in topology builder
- Make HBM BW test assertions dynamic based on topology efficiency

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-20 15:40:56 -07:00

125 lines
4.7 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
system:
ns_per_mm: 0.01 # wire propagation delay: 10 ps/mm (on-chip silicon)
sips:
count: 2
components:
switch: { kind: switch, impl: switch_v1, attrs: { overhead_ns: 5.0 } }
links:
io_ep_to_switch:
kind: pcie
bw_gbs_per_ep: 768.0
distance_mm: 20.0
sip:
cube_mesh: { w: 4, h: 4 }
iochiplet:
components:
pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } }
io_cpu: { kind: io_cpu, impl: io_cpu_v1, attrs: { overhead_ns: 10.0 } }
io_noc: { kind: io_noc, impl: forwarding_v1, attrs: { overhead_ns: 0.0 } }
links:
pcie_ep_to_noc_bw_gbs: 256.0
pcie_ep_to_noc_mm: 1.0
io_cpu_to_noc_bw_gbs: 256.0
io_cpu_to_noc_mm: 0.5
ucie:
overhead_ns: 8.0
n_connections: 4
per_connection_bw_gbs: 128.0 # 4 × 128 = 512 GB/s = PHY BW
noc_to_ucie_mm: 0.5
instances:
- id: io0
place: { side: N, offset_norm: 0.5 }
ucie: { phy_bw_gbs: 512.0, phys: [P0, P1, P2, P3] }
cube_ports:
- { cube: {xy: [0,0]}, cube_side: N, phy: P0, distance_mm: 2.0 }
- { cube: {xy: [1,0]}, cube_side: N, phy: P1, distance_mm: 2.0 }
- { cube: {xy: [2,0]}, cube_side: N, phy: P2, distance_mm: 2.0 }
- { cube: {xy: [3,0]}, cube_side: N, phy: P3, distance_mm: 2.0 }
links:
inter_cube_mesh:
bw_gbs_per_ucie_phy: 512.0
distance_mm_across_seam: 1.0
routing: { algo: xy }
cube:
geometry:
cube_mm: { w: 17.0, h: 14.0 }
hbm_mm: { w: 9.0, h: 5.0 }
ucie_mm: { size: 2.0 }
pe_layout:
corners: [NW, NE, SW, SE] # N corners → xbar top row; S corners → xbar bottom row
pe_per_corner: 2 # total PEs per cube: 4 * 2 = 8
pe_template:
components:
pe_cpu: { kind: pe_cpu, impl: pe_cpu_v1, attrs: { overhead_ns: 2.0 } }
pe_scheduler: { kind: pe_scheduler, impl: pe_scheduler_v1, attrs: { overhead_ns: 1.0 } }
pe_dma: { kind: pe_dma, impl: pe_dma_v1, attrs: { rd_engines: 1, wr_engines: 1 } }
pe_gemm: { kind: pe_gemm, impl: pe_gemm_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } }
pe_math: { kind: pe_math, impl: pe_math_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
pe_tcm: { kind: pe_tcm, impl: pe_tcm_v1, attrs:
{ size_mb: 16 } }
links:
pe_cpu_to_scheduler_mm: 0.5
scheduler_to_dma_mm: 0.5
scheduler_to_gemm_mm: 0.5
scheduler_to_math_mm: 0.5
dma_to_tcm_bw_gbs: 512.0
dma_to_tcm_mm: 0.5
gemm_to_tcm_bw_gbs: 512.0 # GEMM reads inputs from TCM (ADR-0014 D5)
gemm_to_tcm_mm: 0.5
math_to_tcm_bw_gbs: 512.0
math_to_tcm_mm: 0.5
memory_map:
hbm_total_gb_per_cube: 48
hbm_slices_per_cube: 8
hbm_total_bw_gbs: 1024.0
components:
noc: { kind: noc, impl: noc_2d_mesh_v1, attrs: { overhead_ns: 0.0 } }
m_cpu: { kind: m_cpu, impl: m_cpu_v1, attrs: { overhead_ns: 5.0 } }
xbar:
top: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
bottom: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
bridges:
- { id: left, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
- { id: right, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1, efficiency: 1.0 } }
sram: { kind: sram, impl: sram_v1, attrs: { size_mb: 32, overhead_ns: 2.0 } }
ucie:
decompose: true
ports: [N, S, E, W]
overhead_ns: 8.0
n_connections: 4 # independent NOC↔UCIe connections per port
per_connection_bw_gbs: 128.0 # BW per connection; 4 × 128 = 512 GB/s = UCIe PHY BW
links:
xbar_to_hbm_bw_gbs: 256.0 # per-slice effective (2048 / 8 slices)
xbar_to_bridge_bw_gbs: 128.0 # bridge BW (xbar_top/bot ↔ bridge)
xbar_to_bridge_mm: 3.0 # xbar ↔ bridge wire distance
xbar_to_hbm_mm: 2.5
pe_dma_to_noc_bw_gbs: 256.0 # PE → NOC BW (= HBM slice BW, no bottleneck)
noc_to_xbar_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_xbar_bw_gbs: 256.0 # NOC → xbar_top/bot BW (= HBM slice BW)
noc_to_sram_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_sram:
per_connection_bw_gbs: 128.0 # BW per NOC connection
n_connections: 4 # 4 × 128 = 512 GB/s aggregate
m_cpu_to_noc_mm: 0.0 # noc is distributed; distance modeled as 0
noc_to_pe_cpu_mm: 0.0 # noc is distributed; distance modeled as 0
visualization:
emit_views: [system, sip, cube]
sip_ids: [0]
cubes: [0, 9, 15]