kernbench2/topology.yaml


# System level: SIP count plus the switch and link that sit above the SIPs.
system:
  # Wire propagation delay in ns per mm (0.01 ns = 10 ps/mm, on-chip silicon).
  ns_per_mm: 0.01

  sips:
    count: 2

  # System-level components, keyed by component name.
  components:
    switch:
      kind: switch
      impl: builtin.switch
      attrs:
        overhead_ns: 5.0

  # System-level links, keyed by link name.
  links:
    # PCIe link between an I/O endpoint and the switch; bandwidth is given
    # per endpoint.
    io_ep_to_switch:
      kind: pcie
      bw_gbs_per_ep: 768.0
      distance_mm: 20.0

# One SIP: a mesh of cubes plus the I/O chiplet instances attached to it.
sip:
  # Cube mesh dimensions, in cubes.
  cube_mesh:
    w: 4
    h: 4

  iochiplet:
    # Components inside an I/O chiplet, keyed by component name.
    components:
      pcie_ep:
        kind: pcie_ep
        impl: builtin.pcie_ep
        attrs:
          overhead_ns: 5.0
      io_cpu:
        kind: io_cpu
        impl: builtin.io_cpu
        attrs:
          overhead_ns: 10.0
      io_noc:
        kind: io_noc
        impl: builtin.forwarding
        attrs:
          overhead_ns: 0.0

    # Links from each component to the I/O NoC: bandwidth and wire length.
    links:
      pcie_ep_to_noc_bw_gbs: 256.0
      pcie_ep_to_noc_mm: 1.0
      io_cpu_to_noc_bw_gbs: 256.0
      io_cpu_to_noc_mm: 0.5

    ucie:
      overhead_ns: 8.0
      n_connections: 4
      # 4 connections x 128 GB/s = 512 GB/s, equal to the PHY bandwidth.
      per_connection_bw_gbs: 128.0
      noc_to_ucie_mm: 0.5

    instances:
      - id: io0
        # Side letters are quoted: a bare N is a boolean in the YAML 1.1
        # bool type and would load as false on parsers that implement it.
        place:
          side: 'N'
          offset_norm: 0.5
        ucie:
          phy_bw_gbs: 512.0
          phys: [P0, P1, P2, P3]
        # One entry per PHY, naming the cube (by xy) and the cube side it
        # connects to.
        cube_ports:
          - cube: { xy: [0, 0] }
            cube_side: 'N'
            phy: P0
            distance_mm: 2.0
          - cube: { xy: [1, 0] }
            cube_side: 'N'
            phy: P1
            distance_mm: 2.0
          - cube: { xy: [2, 0] }
            cube_side: 'N'
            phy: P2
            distance_mm: 2.0
          - cube: { xy: [3, 0] }
            cube_side: 'N'
            phy: P3
            distance_mm: 2.0

  links:
    # Cube-to-cube mesh links.
    inter_cube_mesh:
      bw_gbs_per_ucie_phy: 512.0
      distance_mm_across_seam: 1.0
      routing:
        algo: xy

# One cube: geometry, PE layout and PE internals, HBM map, NoC and UCIe.
cube:
  # Physical footprints in mm.
  geometry:
    cube_mm:
      w: 17.0
      h: 14.0
    hbm_mm:
      w: 9.0
      h: 5.0
    ucie_mm:
      size: 2.0

  pe_layout:
    # N corners map to the top PE rows, S corners to the bottom PE rows.
    corners: [NW, NE, SW, SE]
    # Total PEs per cube: 4 corners x 2 = 8.
    pe_per_corner: 2

  # Components and links that make up a PE.
  pe_template:
    components:
      pe_cpu:
        kind: pe_cpu
        impl: builtin.pe_cpu
        attrs:
          overhead_ns: 2.0
      pe_scheduler:
        kind: pe_scheduler
        impl: builtin.pe_scheduler
        attrs:
          overhead_ns: 1.0
      pe_dma:
        kind: pe_dma
        impl: builtin.pe_dma
        attrs:
          rd_engines: 1
          wr_engines: 1
      # pe_gemm and pe_math both name the same shared_resource (accel_slot).
      pe_gemm:
        kind: pe_gemm
        impl: builtin.pe_gemm
        attrs:
          overhead_ns: 0.0
          shared_resource: accel_slot
          peak_tflops_f16: 8.0
      pe_math:
        kind: pe_math
        impl: builtin.pe_math
        attrs:
          overhead_ns: 0.0
          shared_resource: accel_slot
      pe_fetch_store:
        kind: pe_fetch_store
        impl: builtin.pe_fetch_store
        attrs:
          overhead_ns: 0.0
      pe_mmu:
        kind: pe_mmu
        impl: builtin.pe_mmu
        attrs:
          tlb_overhead_ns: 0.5
          page_size: 4096
      pe_tcm:
        kind: pe_tcm
        impl: builtin.pe_tcm
        attrs:
          size_mb: 16
          read_bw_gbs: 512.0
          write_bw_gbs: 512.0

    # Intra-PE links: *_mm is wire length in mm, *_bw_gbs is bandwidth.
    links:
      pe_cpu_to_scheduler_mm: 0.5
      scheduler_to_dma_mm: 0.5
      scheduler_to_gemm_mm: 0.5
      scheduler_to_math_mm: 0.5
      scheduler_to_fetch_store_mm: 0.5
      dma_to_tcm_bw_gbs: 512.0
      dma_to_tcm_mm: 0.5
      # Chaining link (ADR-0021): DMA -> fetch_store.
      dma_to_fetch_store_mm: 0.0
      fetch_store_to_tcm_bw_gbs: 512.0
      fetch_store_to_tcm_mm: 0.0
      # Chaining links (ADR-0021): fetch -> GEMM and fetch -> MATH.
      fetch_store_to_gemm_mm: 0.0
      fetch_store_to_math_mm: 0.0
      # Chaining links (ADR-0021): GEMM -> store and MATH -> store.
      gemm_to_fetch_store_mm: 0.0
      math_to_fetch_store_mm: 0.0
      # Chaining link (ADR-0021): store -> DMA writeback.
      fetch_store_to_dma_mm: 0.0
      gemm_to_tcm_bw_gbs: 512.0
      gemm_to_tcm_mm: 0.5
      math_to_tcm_bw_gbs: 512.0
      math_to_tcm_mm: 0.5

  memory_map:
    hbm_total_gb_per_cube: 48
    hbm_slices_per_cube: 8
    # NOTE(review): 64 pseudo channels x 32 GB/s = 2048 GB/s, which does not
    # match the 1024.0 below. Confirm which figure is intended.
    hbm_total_bw_gbs: 1024.0
    # Allowed values: one_to_one | n_to_one (ADR-0019).
    hbm_mapping_mode: n_to_one
    # Total pseudo channels per cube.
    hbm_pseudo_channels: 64
    # Equals pseudo_channels / pes_per_cube (64 / 8).
    hbm_channels_per_pe: 8
    # Per-channel bandwidth in GB/s.
    hbm_channel_bw_gbs: 32.0

  # Cube-level (non-PE) components, keyed by component name.
  components:
    noc_router:
      kind: noc_router
      impl: builtin.forwarding
      attrs:
        overhead_ns: 2.0
    m_cpu:
      kind: m_cpu
      impl: builtin.m_cpu
      attrs:
        overhead_ns: 5.0
    hbm_ctrl:
      kind: hbm_ctrl
      impl: builtin.hbm_ctrl
      attrs:
        capacity: 1
        efficiency: 1.0
    sram:
      kind: sram
      impl: builtin.sram
      attrs:
        size_mb: 32
        overhead_ns: 2.0

  # Physical placement of non-PE components (mm coordinates).
  placement:
    # Top center, below UCIe-N.
    m_cpu:
      pos_mm: [7.5, 3.0]
    # Left side, below the HBM zone.
    sram:
      pos_mm: [1.5, 9.0]

  ucie:
    decompose: true
    # Side letters are quoted: a bare N is a boolean in the YAML 1.1 bool
    # type and would load as false on parsers that implement it.
    ports: ['N', 'S', 'E', 'W']
    overhead_ns: 8.0
    # Independent NOC <-> UCIe connections per port.
    n_connections: 4
    # Bandwidth per connection; 4 x 128 = 512 GB/s = UCIe PHY bandwidth.
    per_connection_bw_gbs: 128.0

  links:
    # Router mesh links (ADR-0019).
    # Inter-router XY mesh link bandwidth.
    router_link_bw_gbs: 256.0
    # Per-router switching overhead.
    # NOTE(review): same value as components.noc_router.attrs.overhead_ns;
    # confirm which of the two is actually read.
    router_overhead_ns: 2.0
    # PE_DMA <-> router (= N x channel_bw).
    pe_to_router_bw_gbs: 256.0
    # HBM_CTRL <-> router (= N x channel_bw).
    hbm_to_router_bw_gbs: 256.0
    # SRAM <-> router.
    sram_to_router_bw_gbs: 128.0
    # M_CPU <-> router distance.
    m_cpu_to_router_mm: 0.0
    # PE -> router bandwidth (= HBM slice bandwidth, no bottleneck).
    # NOTE(review): looks like the same link as pe_to_router_bw_gbs above;
    # confirm both keys are still needed.
    pe_dma_to_noc_bw_gbs: 256.0
    # Router -> PE_CPU distance (command path).
    noc_to_pe_cpu_mm: 0.0

# Visualization output selection.
visualization:
  # Views to emit.
  emit_views:
    - system
    - sip
    - cube
  # NOTE(review): presumably the SIP indices to render — confirm.
  sip_ids: [0]
  # NOTE(review): presumably cube indices within a SIP — confirm.
  cubes: [0, 9, 15]