# kernbench2/topology.yaml


# System-level settings: global wire-delay constant, sip count, the switch
# component, and the `io_ep_to_switch` link (kind: pcie).
system:
  # Wire propagation delay per millimetre: 0.01 ns/mm = 10 ps/mm (on-chip silicon).
  ns_per_mm: 0.01

  sips:
    count: 2

  components:
    switch:
      kind: switch
      impl: switch_v1
      attrs:
        overhead_ns: 5.0

  links:
    io_ep_to_switch:
      kind: pcie
      bw_gbs_per_ep: 768.0
      distance_mm: 20.0

# Per-sip description: cube mesh dimensions, the IO chiplet (components,
# internal links, UCIe settings, placed instances) and the inter-cube mesh link.
sip:
  cube_mesh: { w: 4, h: 4 }

  iochiplet:
    components:
      pcie_ep:
        kind: pcie_ep
        impl: pcie_ep_v1
        attrs: { overhead_ns: 5.0 }
      io_cpu:
        kind: io_cpu
        impl: io_cpu_v1
        attrs: { overhead_ns: 10.0 }
      io_noc:
        kind: io_noc
        impl: forwarding_v1
        attrs: { overhead_ns: 0.0 }
    links:
      pcie_ep_to_noc_bw_gbs: 256.0
      pcie_ep_to_noc_mm: 1.0
      io_cpu_to_noc_bw_gbs: 256.0
      io_cpu_to_noc_mm: 0.5
    ucie:
      overhead_ns: 8.0
      n_connections: 4
      # 4 × 128 = 512 GB/s = PHY BW (see phy_bw_gbs on the instance below)
      per_connection_bw_gbs: 128.0
      noc_to_ucie_mm: 0.5
    instances:
      - id: io0
        # Side letters are quoted on purpose: a bare N resolves to boolean
        # false under YAML 1.1 implicit typing in some parsers.
        place: { side: 'N', offset_norm: 0.5 }
        ucie: { phy_bw_gbs: 512.0, phys: [P0, P1, P2, P3] }
        # One entry per PHY (P0..P3), each attached to the cube at xy = [i, 0].
        cube_ports:
          - { cube: { xy: [0, 0] }, cube_side: 'N', phy: P0, distance_mm: 2.0 }
          - { cube: { xy: [1, 0] }, cube_side: 'N', phy: P1, distance_mm: 2.0 }
          - { cube: { xy: [2, 0] }, cube_side: 'N', phy: P2, distance_mm: 2.0 }
          - { cube: { xy: [3, 0] }, cube_side: 'N', phy: P3, distance_mm: 2.0 }

  links:
    inter_cube_mesh:
      bw_gbs_per_ucie_phy: 512.0
      distance_mm_across_seam: 1.0
      routing: { algo: xy }

# Per-cube description: physical geometry, PE layout and per-PE template,
# HBM memory map, cube-level components, UCIe port settings and internal links.
cube:
  geometry:
    cube_mm: { w: 17.0, h: 14.0 }
    hbm_mm: { w: 9.0, h: 5.0 }
    ucie_mm: { size: 2.0 }

  pe_layout:
    # N corners → xbar top row; S corners → xbar bottom row
    corners: [NW, NE, SW, SE]
    # total PEs per cube: 4 corners * 2 = 8
    pe_per_corner: 2

  pe_template:
    components:
      pe_cpu:
        kind: pe_cpu
        impl: pe_cpu_v1
        attrs: { overhead_ns: 2.0 }
      pe_scheduler:
        kind: pe_scheduler
        impl: pe_scheduler_v2
        attrs: { overhead_ns: 1.0 }
      pe_dma:
        kind: pe_dma
        impl: pe_dma_v1
        attrs: { rd_engines: 1, wr_engines: 1 }
      # pe_gemm and pe_math both name the same shared_resource (accel_slot).
      pe_gemm:
        kind: pe_gemm
        impl: pe_gemm_v1
        attrs:
          overhead_ns: 0.0
          shared_resource: accel_slot
          peak_tflops_f16: 8.0
      pe_math:
        kind: pe_math
        impl: pe_math_v1
        attrs:
          overhead_ns: 0.0
          shared_resource: accel_slot
      pe_mmu:
        kind: pe_mmu
        impl: pe_mmu_v1
        attrs: { tlb_overhead_ns: 0.5, page_size: 4096 }
      # Previously a flow mapping wrapped across two lines with the continuation
      # at the key's own indent; written as a block mapping so every parser
      # nests attrs under pe_tcm.
      pe_tcm:
        kind: pe_tcm
        impl: pe_tcm_v1
        attrs: { size_mb: 16 }
    links:
      pe_cpu_to_scheduler_mm: 0.5
      scheduler_to_dma_mm: 0.5
      scheduler_to_gemm_mm: 0.5
      scheduler_to_math_mm: 0.5
      dma_to_tcm_bw_gbs: 512.0
      dma_to_tcm_mm: 0.5
      # GEMM reads inputs from TCM (ADR-0014 D5)
      gemm_to_tcm_bw_gbs: 512.0
      gemm_to_tcm_mm: 0.5
      math_to_tcm_bw_gbs: 512.0
      math_to_tcm_mm: 0.5

  memory_map:
    hbm_total_gb_per_cube: 48
    hbm_slices_per_cube: 8
    # NOTE(review): links.xbar_to_hbm_bw_gbs below is commented as
    # "2048 / 8 slices" = 256, but 1024.0 / 8 slices = 128. Confirm which
    # total is intended.
    hbm_total_bw_gbs: 1024.0

  components:
    noc:
      kind: noc
      impl: noc_2d_mesh_v1
      attrs: { overhead_ns: 0.0 }
    m_cpu:
      kind: m_cpu
      impl: m_cpu_v1
      attrs: { overhead_ns: 5.0 }
    xbar:
      top:
        kind: xbar
        impl: xbar_v1
        attrs: { overhead_ns: 2.0 }
      bottom:
        kind: xbar
        impl: xbar_v1
        attrs: { overhead_ns: 2.0 }
      bridges:
        - id: left
          kind: xbar
          impl: xbar_v1
          attrs: { overhead_ns: 1.0 }
        - id: right
          kind: xbar
          impl: xbar_v1
          attrs: { overhead_ns: 1.0 }
    hbm_ctrl:
      kind: hbm_ctrl
      impl: hbm_ctrl_v1
      attrs: { capacity: 1, efficiency: 1.0 }
    sram:
      kind: sram
      impl: sram_v1
      attrs: { size_mb: 32, overhead_ns: 2.0 }

  ucie:
    decompose: true
    # Quoted on purpose: a bare N resolves to boolean false under YAML 1.1
    # implicit typing in some parsers; the other sides are quoted to match.
    ports: ['N', 'S', 'E', 'W']
    overhead_ns: 8.0
    # independent NOC↔UCIe connections per port
    n_connections: 4
    # BW per connection; 4 × 128 = 512 GB/s = UCIe PHY BW
    per_connection_bw_gbs: 128.0

  links:
    # per-slice effective (2048 / 8 slices)
    # NOTE(review): memory_map.hbm_total_bw_gbs is 1024.0, not 2048 — confirm.
    xbar_to_hbm_bw_gbs: 256.0
    # bridge BW (xbar_top/bot ↔ bridge)
    xbar_to_bridge_bw_gbs: 128.0
    # xbar ↔ bridge wire distance
    xbar_to_bridge_mm: 3.0
    xbar_to_hbm_mm: 2.5
    # PE → NOC BW (= HBM slice BW, no bottleneck)
    pe_dma_to_noc_bw_gbs: 256.0
    # The noc is distributed, so every noc_* / *_to_noc distance is modeled as 0.
    noc_to_xbar_mm: 0.0
    # NOC → xbar_top/bot BW (= HBM slice BW)
    noc_to_xbar_bw_gbs: 256.0
    noc_to_sram_mm: 0.0
    noc_to_sram:
      # BW per NOC connection
      per_connection_bw_gbs: 128.0
      # 4 × 128 = 512 GB/s aggregate
      n_connections: 4
    m_cpu_to_noc_mm: 0.0
    noc_to_pe_cpu_mm: 0.0

# Visualization settings: the views to emit and the sip / cube ids to include.
visualization:
  emit_views:
    - system
    - sip
    - cube
  sip_ids:
    - 0
  # NOTE(review): presumably indices into the 4×4 cube mesh (0..15) — confirm
  # the numbering scheme against the consumer.
  cubes:
    - 0
    - 9
    - 15