Files
kernbench2/topology.yaml
T
ywkang 5917b3497c Replace xbar/bridge/single-NOC with explicit router mesh (ADR-0019)
- Remove xbar_top/bot, bridge, single noc node from topology
- Each cube_mesh.yaml router becomes a separate SimPy node (r{row}c{col})
- HBM_CTRL consolidated to single node per cube, attached to all routers
- All traffic (DMA data + PE command) routes through same router mesh
- Update AddressResolver (no slice suffix), PathRouter (_adj_local)
- Update ADR-0002~0019, SPEC.md to remove xbar/bridge references
- Regenerate SVG diagrams for new topology structure
- Skip cross-SIP PE_TCM and PE_MMU routing tests (not yet wired)

326 passed, 13 skipped

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-04 17:51:28 -07:00

120 lines
4.6 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# System-level settings: wire propagation delay, SIP count, the switch
# component, and the PCIe link between an IO endpoint and the switch.
system:
ns_per_mm: 0.01 # wire propagation delay: 10 ps/mm (on-chip silicon)
sips:
count: 2
components:
switch: { kind: switch, impl: switch_v1, attrs: { overhead_ns: 5.0 } }
links:
# IO endpoint -> switch link; bandwidth is specified per endpoint.
io_ep_to_switch:
kind: pcie
bw_gbs_per_ep: 768.0
distance_mm: 20.0
# Per-SIP layout: a 4x4 cube mesh, the IO chiplet (components, internal links,
# UCIe settings, instances), inter-cube mesh links, and the routing algorithm.
sip:
cube_mesh: { w: 4, h: 4 }
iochiplet:
components:
pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } }
io_cpu: { kind: io_cpu, impl: io_cpu_v1, attrs: { overhead_ns: 10.0 } }
io_noc: { kind: io_noc, impl: forwarding_v1, attrs: { overhead_ns: 0.0 } }
# Key suffixes used below: *_bw_gbs is a bandwidth in GB/s, *_mm a distance in mm.
links:
pcie_ep_to_noc_bw_gbs: 256.0
pcie_ep_to_noc_mm: 1.0
io_cpu_to_noc_bw_gbs: 256.0
io_cpu_to_noc_mm: 0.5
ucie:
overhead_ns: 8.0
n_connections: 4
per_connection_bw_gbs: 128.0 # 4 × 128 = 512 GB/s = PHY BW
noc_to_ucie_mm: 0.5
# One IO chiplet instance (io0) placed on side N. Its four PHYs (P0-P3) each
# connect to the N side of one cube in row y=0: (0,0), (1,0), (2,0), (3,0).
instances:
- id: io0
place: { side: N, offset_norm: 0.5 }
ucie: { phy_bw_gbs: 512.0, phys: [P0, P1, P2, P3] }
cube_ports:
- { cube: {xy: [0,0]}, cube_side: N, phy: P0, distance_mm: 2.0 }
- { cube: {xy: [1,0]}, cube_side: N, phy: P1, distance_mm: 2.0 }
- { cube: {xy: [2,0]}, cube_side: N, phy: P2, distance_mm: 2.0 }
- { cube: {xy: [3,0]}, cube_side: N, phy: P3, distance_mm: 2.0 }
links:
# Cube-to-cube mesh link: bandwidth per UCIe PHY and distance across the seam.
inter_cube_mesh:
bw_gbs_per_ucie_phy: 512.0
distance_mm_across_seam: 1.0
routing: { algo: xy }
# Cube definition: physical geometry (mm) and PE placement.
cube:
geometry:
cube_mm: { w: 17.0, h: 14.0 }
hbm_mm: { w: 9.0, h: 5.0 }
ucie_mm: { size: 2.0 }
pe_layout:
# NOTE(review): this comment previously mapped N corners to the "xbar top row"
# and S corners to the "xbar bottom row". The link settings in this file
# describe a router mesh (ADR-0019) and define no xbar, so that mapping looks
# stale - confirm which routers the corner PEs attach to.
corners: [NW, NE, SW, SE] # four PE placement corners
pe_per_corner: 2 # total PEs per cube: 4 * 2 = 8
# Template for one PE: its sub-components and the links between them.
pe_template:
components:
pe_cpu: { kind: pe_cpu, impl: pe_cpu_v1, attrs: { overhead_ns: 2.0 } }
pe_scheduler: { kind: pe_scheduler, impl: pe_scheduler_v2, attrs: { overhead_ns: 1.0 } }
pe_dma: { kind: pe_dma, impl: pe_dma_v1, attrs: { rd_engines: 1, wr_engines: 1 } }
# pe_gemm and pe_math both name the same shared_resource (accel_slot);
# presumably they contend for it - verify against the component implementations.
pe_gemm: { kind: pe_gemm, impl: pe_gemm_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } }
pe_math: { kind: pe_math, impl: pe_math_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
pe_mmu: { kind: pe_mmu, impl: pe_mmu_v1, attrs: { tlb_overhead_ns: 0.5, page_size: 4096 } }
pe_tcm: { kind: pe_tcm, impl: pe_tcm_v1, attrs:
{ size_mb: 16 } }
# Intra-PE links: *_mm keys are distances in mm, *_bw_gbs keys are bandwidths
# in GB/s. Only the three *_to_tcm links specify a bandwidth.
links:
pe_cpu_to_scheduler_mm: 0.5
scheduler_to_dma_mm: 0.5
scheduler_to_gemm_mm: 0.5
scheduler_to_math_mm: 0.5
dma_to_tcm_bw_gbs: 512.0
dma_to_tcm_mm: 0.5
gemm_to_tcm_bw_gbs: 512.0 # GEMM reads inputs from TCM (ADR-0014 D5)
gemm_to_tcm_mm: 0.5
math_to_tcm_bw_gbs: 512.0
math_to_tcm_mm: 0.5
# HBM capacity, slicing, and channel configuration for one cube.
# NOTE(review): 64 pseudo channels x 32.0 GB/s per channel = 2048 GB/s, which
# does not match hbm_total_bw_gbs (1024.0) - confirm which figure is
# authoritative or whether they measure different things.
memory_map:
hbm_total_gb_per_cube: 48
hbm_slices_per_cube: 8
hbm_total_bw_gbs: 1024.0
hbm_mapping_mode: n_to_one # one_to_one | n_to_one (ADR-0019)
hbm_pseudo_channels: 64 # total pseudo channels per cube
hbm_channels_per_pe: 8 # = pseudo_channels / pes_per_cube
hbm_channel_bw_gbs: 32.0 # per-channel bandwidth (GB/s)
# Cube-level components: NOC router, management CPU, HBM controller, SRAM.
components:
noc_router: { kind: noc_router, impl: forwarding_v1, attrs: { overhead_ns: 2.0 } }
m_cpu: { kind: m_cpu, impl: m_cpu_v1, attrs: { overhead_ns: 5.0 } }
hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1, efficiency: 1.0 } }
sram: { kind: sram, impl: sram_v1, attrs: { size_mb: 32, overhead_ns: 2.0 } }
# Cube-side UCIe: one port on each of the four sides, each with 4 connections
# of 128 GB/s (512 GB/s per port).
ucie:
decompose: true
ports: [N, S, E, W]
overhead_ns: 8.0
n_connections: 4 # independent NOC↔UCIe connections per port
per_connection_bw_gbs: 128.0 # BW per connection; 4 × 128 = 512 GB/s = UCIe PHY BW
# Cube-level link parameters for the router mesh and the endpoints on it.
links:
# Router mesh links (ADR-0019)
router_link_bw_gbs: 256.0 # inter-router XY mesh link BW
# Same value as noc_router.attrs.overhead_ns (2.0) in the components section.
router_overhead_ns: 2.0 # per-router switching overhead
pe_to_router_bw_gbs: 256.0 # PE_DMA ↔ router (= N × channel_bw)
hbm_to_router_bw_gbs: 256.0 # HBM_CTRL ↔ router (= N × channel_bw)
sram_to_router_bw_gbs: 128.0 # SRAM ↔ router
m_cpu_to_router_mm: 0.0 # M_CPU ↔ router distance
# NOTE(review): pe_dma_to_noc_bw_gbs carries the same value as
# pe_to_router_bw_gbs and, per both comments, describes the same PE-router
# link - confirm whether this "noc"-named key is still read. Its "= HBM slice
# BW" remark also disagrees with hbm_total_bw_gbs / hbm_slices_per_cube
# (1024 / 8 = 128 GB/s); verify.
pe_dma_to_noc_bw_gbs: 256.0 # PE → router BW (= HBM slice BW, no bottleneck)
noc_to_pe_cpu_mm: 0.0 # router → PE_CPU distance (command path)
# Diagram output selection: which views to emit and which SIPs/cubes to draw.
visualization:
emit_views: [system, sip, cube]
sip_ids: [0]
# presumably indices into the 4x4 cube mesh (0..15) - confirm the numbering order
cubes: [0, 9, 15]