---
# Hardware topology description: system-level fabric, SIP (cube mesh plus IO
# chiplet), per-cube internals (PE template, memory map, NoC) and the views to
# emit for visualization.
#
# NOTE(review): this file was rebuilt from a whitespace-collapsed copy. Key
# order and every value are unchanged; the nesting was inferred from key names
# and the inline comments - confirm it against the loader's schema.
# Compass labels (N/S/E/W and corner names) are quoted so they always load as
# strings: a bare N is a boolean under the YAML 1.1 type rules.

system:
  ns_per_mm: 0.01  # wire propagation delay: 10 ps/mm (on-chip silicon)
  sips:
    count: 2
    topology: ring_1d
  components:
    switch:
      kind: switch
      impl: builtin.switch
      attrs: { overhead_ns: 5.0 }
  links:
    io_ep_to_switch:
      kind: pcie
      bw_gbs_per_ep: 768.0
      distance_mm: 20.0

sip:
  cube_mesh: { w: 4, h: 4 }
  iochiplet:
    components:
      pcie_ep:
        kind: pcie_ep
        impl: builtin.pcie_ep
        attrs: { overhead_ns: 5.0 }
      io_cpu:
        kind: io_cpu
        impl: builtin.io_cpu
        attrs: { overhead_ns: 10.0 }
      io_noc:
        kind: io_noc
        impl: builtin.forwarding
        attrs: { overhead_ns: 0.0 }
    links:
      pcie_ep_to_noc_bw_gbs: 256.0
      pcie_ep_to_noc_mm: 1.0
      io_cpu_to_noc_bw_gbs: 256.0
      io_cpu_to_noc_mm: 0.5
    ucie:
      overhead_ns: 8.0
      n_connections: 4
      per_connection_bw_gbs: 128.0  # 4 × 128 = 512 GB/s = PHY BW
      # NOTE(review): placed under ucie because it follows the ucie keys;
      # confirm it is not meant to be a direct child of iochiplet.
      noc_to_ucie_mm: 0.5
    instances:
      - id: io0
        place: { side: 'N', offset_norm: 0.5 }
        ucie:
          phy_bw_gbs: 512.0
          phys: [P0, P1, P2, P3]
        cube_ports:
          - { cube: { xy: [0, 0] }, cube_side: 'N', phy: P0, distance_mm: 2.0 }
          - { cube: { xy: [1, 0] }, cube_side: 'N', phy: P1, distance_mm: 2.0 }
          - { cube: { xy: [2, 0] }, cube_side: 'N', phy: P2, distance_mm: 2.0 }
          - { cube: { xy: [3, 0] }, cube_side: 'N', phy: P3, distance_mm: 2.0 }
  links:
    inter_cube_mesh:
      bw_gbs_per_ucie_phy: 512.0
      distance_mm_across_seam: 1.0
  # NOTE(review): assumed to be a sibling of links (SIP-level routing);
  # confirm it does not belong under links or inter_cube_mesh.
  routing: { algo: xy }

cube:
  geometry:
    cube_mm: { w: 17.0, h: 14.0 }
    hbm_mm: { w: 9.0, h: 5.0 }
    ucie_mm: { size: 2.0 }

  pe_layout:
    # N corners → top PE rows; S corners → bottom PE rows
    corners: ['NW', 'NE', 'SW', 'SE']
    pe_per_corner: 2  # total PEs per cube: 4 * 2 = 8

  pe_template:
    components:
      pe_cpu:
        kind: pe_cpu
        impl: builtin.pe_cpu
        attrs: { overhead_ns: 2.0 }
      pe_scheduler:
        kind: pe_scheduler
        impl: builtin.pe_scheduler
        attrs: { overhead_ns: 1.0 }
      pe_dma:
        kind: pe_dma
        impl: builtin.pe_dma
        attrs: { rd_engines: 1, wr_engines: 1 }
      pe_gemm:
        kind: pe_gemm
        impl: builtin.pe_gemm
        attrs:
          overhead_ns: 0.0
          shared_resource: accel_slot
          peak_tflops_f16: 8.0
      pe_math:
        kind: pe_math
        impl: builtin.pe_math
        attrs: { overhead_ns: 0.0, shared_resource: accel_slot }
      pe_fetch_store:
        kind: pe_fetch_store
        impl: builtin.pe_fetch_store
        attrs: { overhead_ns: 0.0 }
      pe_mmu:
        kind: pe_mmu
        impl: builtin.pe_mmu
        attrs: { tlb_overhead_ns: 0.5, page_size: 4096 }
      pe_tcm:
        kind: pe_tcm
        impl: builtin.pe_tcm
        attrs:
          size_mb: 16
          read_bw_gbs: 512.0
          write_bw_gbs: 512.0
          kernel_scratch_mb: 1
      pe_ipcq:
        kind: pe_ipcq
        impl: builtin.pe_ipcq
        attrs: { overhead_ns: 0.0 }
    links:
      pe_cpu_to_scheduler_mm: 0.5
      scheduler_to_dma_mm: 0.5
      scheduler_to_gemm_mm: 0.5
      scheduler_to_math_mm: 0.5
      scheduler_to_fetch_store_mm: 0.5
      dma_to_tcm_bw_gbs: 512.0
      dma_to_tcm_mm: 0.5
      dma_to_fetch_store_mm: 0.0  # DMA → fetch_store chaining (ADR-0014 D6)
      fetch_store_to_tcm_bw_gbs: 512.0
      fetch_store_to_tcm_mm: 0.0
      fetch_store_to_gemm_mm: 0.0  # fetch → GEMM chaining (ADR-0014 D6)
      fetch_store_to_math_mm: 0.0  # fetch → MATH chaining (ADR-0014 D6)
      gemm_to_fetch_store_mm: 0.0  # GEMM → store chaining (ADR-0014 D6)
      gemm_to_math_mm: 0.0  # GEMM → MATH epilogue chaining (ADR-0014 D6)
      math_to_fetch_store_mm: 0.0  # MATH → store chaining (ADR-0014 D6)
      fetch_store_to_dma_mm: 0.0  # store → DMA writeback chaining (ADR-0014 D6)
      gemm_to_tcm_bw_gbs: 512.0
      gemm_to_tcm_mm: 0.5
      math_to_tcm_bw_gbs: 512.0
      math_to_tcm_mm: 0.5
      cpu_to_ipcq_mm: 0.5  # PE_CPU → PE_IPCQ (ADR-0023)
      ipcq_to_dma_mm: 0.0  # PE_IPCQ → PE_DMA token forwarding (ADR-0023)
      dma_to_ipcq_mm: 0.0  # PE_DMA → PE_IPCQ metadata arrival (ADR-0023)

  # NOTE(review): assumed to be cube-level (all keys are per-cube HBM figures)
  # rather than part of pe_template - confirm.
  memory_map:
    hbm_total_gb_per_cube: 48
    hbm_slices_per_cube: 8
    # NOTE(review): 64 pseudo channels × 32.0 GB/s = 2048 GB/s, which does not
    # match the 1024.0 below - confirm which figure is intended.
    hbm_total_bw_gbs: 1024.0
    hbm_mapping_mode: n_to_one  # one_to_one | n_to_one (ADR-0017 D8)
    hbm_pseudo_channels: 64  # total pseudo channels per cube
    hbm_channels_per_pe: 8  # = pseudo_channels / pes_per_cube
    hbm_channel_bw_gbs: 32.0  # per-channel bandwidth (GB/s)

  components:
    noc_router:
      kind: noc_router
      impl: builtin.forwarding
      attrs: { overhead_ns: 2.0 }
    m_cpu:
      kind: m_cpu
      impl: builtin.m_cpu
      attrs: { overhead_ns: 5.0 }
    hbm_ctrl:
      kind: hbm_ctrl
      impl: builtin.hbm_ctrl
      attrs:
        capacity: 1
        efficiency: 1.0
        num_pcs: 8
        burst_bytes: 256
        switch_penalty_ns: 0.0
    sram:
      kind: sram
      impl: builtin.sram
      attrs: { size_mb: 32, overhead_ns: 2.0 }

  # Physical placement of non-PE components (mm coordinates)
  placement:
    m_cpu: { pos_mm: [7.5, 3.0] }  # top center, below UCIe-N
    sram: { pos_mm: [1.5, 9.0] }  # left side, below HBM zone

  ucie:
    decompose: true
    ports: ['N', 'S', 'E', 'W']
    overhead_ns: 8.0
    n_connections: 4  # independent NOC↔UCIe connections per port
    per_connection_bw_gbs: 128.0  # BW per connection; 4 × 128 = 512 GB/s = UCIe PHY BW

  links:
    # Router mesh links (ADR-0017 D5)
    router_link_bw_gbs: 256.0  # inter-router XY mesh link BW
    router_overhead_ns: 2.0  # per-router switching overhead
    pe_to_router_bw_gbs: 256.0  # PE_DMA ↔ router (= N × channel_bw)
    hbm_to_router_bw_gbs: 256.0  # HBM_CTRL ↔ router (= N × channel_bw)
    sram_to_router_bw_gbs: 128.0  # SRAM ↔ router
    m_cpu_to_router_mm: 0.0  # M_CPU ↔ router distance
    pe_dma_to_noc_bw_gbs: 256.0  # PE → router BW (= HBM slice BW, no bottleneck)
    noc_to_pe_cpu_mm: 0.0  # router → PE_CPU distance (command path)

visualization:
  emit_views: [system, sip, cube]
  sip_ids: [0]
  cubes: [0, 9, 15]