Files
kernbench2/topology.yaml
T
ywkang 1d95df4bee Restructure legacy backups, remove pe_accel, fix DMA self-routing
- Move builtin_legacy/ → legacy/builtin/ (cleaner structure)
- Move pe_accel_legacy/ → legacy/pe_accel/
- Remove custom/pe_accel/ (replaced by new builtin)
- Remove pe_scheduler_v2 from components.yaml
- Switch topology.yaml to pe_scheduler_v1 (new builtin)
- Fix PE_DMA self-routing: handle consecutive DMA_READ stages
  (same component consecutive stages processed in-place, not via port)

382 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 00:02:26 -07:00

134 lines
5.5 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
system:
ns_per_mm: 0.01 # wire propagation delay: 10 ps/mm (on-chip silicon)
sips:
count: 2
components:
switch: { kind: switch, impl: switch_v1, attrs: { overhead_ns: 5.0 } }
links:
io_ep_to_switch:
kind: pcie
bw_gbs_per_ep: 768.0
distance_mm: 20.0
sip:
cube_mesh: { w: 4, h: 4 }
iochiplet:
components:
pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } }
io_cpu: { kind: io_cpu, impl: io_cpu_v1, attrs: { overhead_ns: 10.0 } }
io_noc: { kind: io_noc, impl: forwarding_v1, attrs: { overhead_ns: 0.0 } }
links:
pcie_ep_to_noc_bw_gbs: 256.0
pcie_ep_to_noc_mm: 1.0
io_cpu_to_noc_bw_gbs: 256.0
io_cpu_to_noc_mm: 0.5
ucie:
overhead_ns: 8.0
n_connections: 4
per_connection_bw_gbs: 128.0 # 4 × 128 = 512 GB/s = PHY BW
noc_to_ucie_mm: 0.5
instances:
- id: io0
place: { side: N, offset_norm: 0.5 }
ucie: { phy_bw_gbs: 512.0, phys: [P0, P1, P2, P3] }
cube_ports:
- { cube: {xy: [0,0]}, cube_side: N, phy: P0, distance_mm: 2.0 }
- { cube: {xy: [1,0]}, cube_side: N, phy: P1, distance_mm: 2.0 }
- { cube: {xy: [2,0]}, cube_side: N, phy: P2, distance_mm: 2.0 }
- { cube: {xy: [3,0]}, cube_side: N, phy: P3, distance_mm: 2.0 }
links:
inter_cube_mesh:
bw_gbs_per_ucie_phy: 512.0
distance_mm_across_seam: 1.0
routing: { algo: xy }
cube:
geometry:
cube_mm: { w: 17.0, h: 14.0 }
hbm_mm: { w: 9.0, h: 5.0 }
ucie_mm: { size: 2.0 }
pe_layout:
corners: [NW, NE, SW, SE] # N corners → top PE rows; S corners → bottom PE rows
pe_per_corner: 2 # total PEs per cube: 4 * 2 = 8
pe_template:
components:
pe_cpu: { kind: pe_cpu, impl: pe_cpu_v1, attrs: { overhead_ns: 2.0 } }
pe_scheduler: { kind: pe_scheduler, impl: pe_scheduler_v1, attrs: { overhead_ns: 1.0 } }
pe_dma: { kind: pe_dma, impl: pe_dma_v1, attrs: { rd_engines: 1, wr_engines: 1 } }
pe_gemm: { kind: pe_gemm, impl: pe_gemm_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } }
pe_math: { kind: pe_math, impl: pe_math_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
pe_fetch_store: { kind: pe_fetch_store, impl: pe_fetch_store_v1, attrs: { overhead_ns: 0.0 } }
pe_mmu: { kind: pe_mmu, impl: pe_mmu_v1, attrs: { tlb_overhead_ns: 0.5, page_size: 4096 } }
pe_tcm: { kind: pe_tcm, impl: pe_tcm_v1, attrs: { size_mb: 16, read_bw_gbs: 512.0, write_bw_gbs: 512.0 } }
links:
pe_cpu_to_scheduler_mm: 0.5
scheduler_to_dma_mm: 0.5
scheduler_to_gemm_mm: 0.5
scheduler_to_math_mm: 0.5
scheduler_to_fetch_store_mm: 0.5
dma_to_tcm_bw_gbs: 512.0
dma_to_tcm_mm: 0.5
dma_to_fetch_store_mm: 0.0 # DMA → fetch_store chaining (ADR-0021)
fetch_store_to_tcm_bw_gbs: 512.0
fetch_store_to_tcm_mm: 0.0
fetch_store_to_gemm_mm: 0.0 # fetch → GEMM chaining (ADR-0021)
fetch_store_to_math_mm: 0.0 # fetch → MATH chaining (ADR-0021)
gemm_to_fetch_store_mm: 0.0 # GEMM → store chaining (ADR-0021)
math_to_fetch_store_mm: 0.0 # MATH → store chaining (ADR-0021)
fetch_store_to_dma_mm: 0.0 # store → DMA writeback chaining (ADR-0021)
gemm_to_tcm_bw_gbs: 512.0
gemm_to_tcm_mm: 0.5
math_to_tcm_bw_gbs: 512.0
math_to_tcm_mm: 0.5
memory_map:
hbm_total_gb_per_cube: 48
hbm_slices_per_cube: 8
hbm_total_bw_gbs: 1024.0
hbm_mapping_mode: n_to_one # one_to_one | n_to_one (ADR-0019)
hbm_pseudo_channels: 64 # total pseudo channels per cube
hbm_channels_per_pe: 8 # = pseudo_channels / pes_per_cube
hbm_channel_bw_gbs: 32.0 # per-channel bandwidth (GB/s)
components:
noc_router: { kind: noc_router, impl: forwarding_v1, attrs: { overhead_ns: 2.0 } }
m_cpu: { kind: m_cpu, impl: m_cpu_v1, attrs: { overhead_ns: 5.0 } }
hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1, efficiency: 1.0 } }
sram: { kind: sram, impl: sram_v1, attrs: { size_mb: 32, overhead_ns: 2.0 } }
# Physical placement of non-PE components (mm coordinates)
placement:
m_cpu: { pos_mm: [7.5, 3.0] } # top center, below UCIe-N
sram: { pos_mm: [1.5, 9.0] } # left side, below HBM zone
ucie:
decompose: true
ports: [N, S, E, W]
overhead_ns: 8.0
n_connections: 4 # independent NOC↔UCIe connections per port
per_connection_bw_gbs: 128.0 # BW per connection; 4 × 128 = 512 GB/s = UCIe PHY BW
links:
# Router mesh links (ADR-0019)
router_link_bw_gbs: 256.0 # inter-router XY mesh link BW
router_overhead_ns: 2.0 # per-router switching overhead
pe_to_router_bw_gbs: 256.0 # PE_DMA ↔ router (= N × channel_bw)
hbm_to_router_bw_gbs: 256.0 # HBM_CTRL ↔ router (= N × channel_bw)
sram_to_router_bw_gbs: 128.0 # SRAM ↔ router
m_cpu_to_router_mm: 0.0 # M_CPU ↔ router distance
pe_dma_to_noc_bw_gbs: 256.0 # PE → router BW (= HBM slice BW, no bottleneck)
noc_to_pe_cpu_mm: 0.0 # router → PE_CPU distance (command path)
visualization:
emit_views: [system, sip, cube]
sip_ids: [0]
cubes: [0, 9, 15]