Files
kernbench2/topology.yaml
T
mukesh a7fe785e5f tl.composite: fused epilogue ops with per-op scope
Extend tl.composite() with an ordered epilogue list. Each op carries
a scope flag - output_tile (default, runs once per (m,n) before
STORE), k_tile (every K-tile right after GEMM), or kernel. Plan
generator slots MATH stages by scope; pe_math reuses pe_dma's
local-loop pattern so chained epilogues (bias->relu) skip the port
hop. op_log captures per-stage params for telemetry. Topology
gains a gemm->math edge (snapshot test updated).

API stays backward-compatible - `epilogue=` is opt-in.

Example:
    h = tl.composite(
        op="gemm", a=a, b=b, out_ptr=int(out),
        epilogue=[
            {"op": "dequant", "scale": s_per_k, "scope": "k_tile"},
            {"op": "bias",    "bias":  bias_vec},
            {"op": "relu"},
            {"op": "scale",   "factor": 0.5},
        ],
    )
    tl.wait(h)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 10:16:47 -07:00

140 lines
6.1 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# System-level settings: wire delay, the set of sips, and the components and
# links declared at this level.
# NOTE(review): nesting below was reconstructed from a copy whose indentation
# had been stripped - confirm against the topology loader's schema.
system:
  ns_per_mm: 0.01  # wire propagation delay: 10 ps/mm (on-chip silicon)
  sips:
    count: 2
    topology: ring_1d
  components:
    switch: { kind: switch, impl: builtin.switch, attrs: { overhead_ns: 5.0 } }
  links:
    io_ep_to_switch:
      kind: pcie
      bw_gbs_per_ep: 768.0
      distance_mm: 20.0
# Per-sip layout: the cube mesh, the IO chiplet (components, internal links,
# UCIe settings, placed instances) and the inter-cube mesh links.
# NOTE(review): nesting below was reconstructed from a copy whose indentation
# had been stripped - confirm against the topology loader's schema.
sip:
  cube_mesh: { w: 4, h: 4 }
  iochiplet:
    components:
      pcie_ep: { kind: pcie_ep, impl: builtin.pcie_ep, attrs: { overhead_ns: 5.0 } }
      io_cpu: { kind: io_cpu, impl: builtin.io_cpu, attrs: { overhead_ns: 10.0 } }
      io_noc: { kind: io_noc, impl: builtin.forwarding, attrs: { overhead_ns: 0.0 } }
    links:
      pcie_ep_to_noc_bw_gbs: 256.0
      pcie_ep_to_noc_mm: 1.0
      io_cpu_to_noc_bw_gbs: 256.0
      io_cpu_to_noc_mm: 0.5
    ucie:
      overhead_ns: 8.0
      n_connections: 4
      per_connection_bw_gbs: 128.0  # 4 * 128 = 512 GB/s = PHY BW
      noc_to_ucie_mm: 0.5
    instances:
      - id: io0
        # Side letters are quoted: a bare N is a boolean under YAML 1.1 rules.
        place: { side: 'N', offset_norm: 0.5 }
        ucie: { phy_bw_gbs: 512.0, phys: [P0, P1, P2, P3] }
        # One entry per cube in row y=0, each on its own PHY.
        cube_ports:
          - { cube: { xy: [0, 0] }, cube_side: 'N', phy: P0, distance_mm: 2.0 }
          - { cube: { xy: [1, 0] }, cube_side: 'N', phy: P1, distance_mm: 2.0 }
          - { cube: { xy: [2, 0] }, cube_side: 'N', phy: P2, distance_mm: 2.0 }
          - { cube: { xy: [3, 0] }, cube_side: 'N', phy: P3, distance_mm: 2.0 }
  links:
    inter_cube_mesh:
      bw_gbs_per_ucie_phy: 512.0
      distance_mm_across_seam: 1.0
  # NOTE(review): `routing` may instead belong inside links.inter_cube_mesh;
  # the stripped copy does not say - verify against the loader.
  routing: { algo: xy }
# Per-cube description: physical geometry, PE layout and per-PE template,
# HBM memory map, non-PE components with their placement, UCIe ports, and
# the router-mesh links.
# NOTE(review): nesting below was reconstructed from a copy whose indentation
# had been stripped - confirm against the topology loader's schema.
cube:
  geometry:
    cube_mm: { w: 17.0, h: 14.0 }
    hbm_mm: { w: 9.0, h: 5.0 }
    ucie_mm: { size: 2.0 }
  pe_layout:
    corners: [NW, NE, SW, SE]  # N corners → top PE rows; S corners → bottom PE rows
    pe_per_corner: 2           # total PEs per cube: 4 * 2 = 8
  pe_template:
    components:
      pe_cpu: { kind: pe_cpu, impl: builtin.pe_cpu, attrs: { overhead_ns: 2.0 } }
      pe_scheduler: { kind: pe_scheduler, impl: builtin.pe_scheduler, attrs: { overhead_ns: 1.0 } }
      pe_dma: { kind: pe_dma, impl: builtin.pe_dma, attrs: { rd_engines: 1, wr_engines: 1 } }
      # pe_gemm and pe_math name the same shared_resource (accel_slot).
      pe_gemm: { kind: pe_gemm, impl: builtin.pe_gemm, attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } }
      pe_math: { kind: pe_math, impl: builtin.pe_math, attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
      pe_fetch_store: { kind: pe_fetch_store, impl: builtin.pe_fetch_store, attrs: { overhead_ns: 0.0 } }
      pe_mmu: { kind: pe_mmu, impl: builtin.pe_mmu, attrs: { tlb_overhead_ns: 0.5, page_size: 4096 } }
      pe_tcm: { kind: pe_tcm, impl: builtin.pe_tcm, attrs: { size_mb: 16, read_bw_gbs: 512.0, write_bw_gbs: 512.0, kernel_scratch_mb: 1 } }
      pe_ipcq: { kind: pe_ipcq, impl: builtin.pe_ipcq, attrs: { overhead_ns: 0.0 } }
    links:
      pe_cpu_to_scheduler_mm: 0.5
      scheduler_to_dma_mm: 0.5
      scheduler_to_gemm_mm: 0.5
      scheduler_to_math_mm: 0.5
      scheduler_to_fetch_store_mm: 0.5
      dma_to_tcm_bw_gbs: 512.0
      dma_to_tcm_mm: 0.5
      dma_to_fetch_store_mm: 0.0   # DMA → fetch_store chaining (ADR-0021)
      fetch_store_to_tcm_bw_gbs: 512.0
      fetch_store_to_tcm_mm: 0.0
      fetch_store_to_gemm_mm: 0.0  # fetch → GEMM chaining (ADR-0021)
      fetch_store_to_math_mm: 0.0  # fetch → MATH chaining (ADR-0021)
      gemm_to_fetch_store_mm: 0.0  # GEMM → store chaining (ADR-0021)
      gemm_to_math_mm: 0.0         # GEMM → MATH epilogue chaining (ADR-0021)
      math_to_fetch_store_mm: 0.0  # MATH → store chaining (ADR-0021)
      fetch_store_to_dma_mm: 0.0   # store → DMA writeback chaining (ADR-0021)
      gemm_to_tcm_bw_gbs: 512.0
      gemm_to_tcm_mm: 0.5
      math_to_tcm_bw_gbs: 512.0
      math_to_tcm_mm: 0.5
      cpu_to_ipcq_mm: 0.5          # PE_CPU → PE_IPCQ (ADR-0023)
      ipcq_to_dma_mm: 0.0          # PE_IPCQ → PE_DMA token forwarding (ADR-0023)
      dma_to_ipcq_mm: 0.0          # PE_DMA → PE_IPCQ metadata arrival (ADR-0023)
  memory_map:
    hbm_total_gb_per_cube: 48
    hbm_slices_per_cube: 8
    # NOTE(review): 64 pseudo channels * 32 GB/s = 2048 GB/s, and 8 slices *
    # 256 GB/s (hbm_to_router_bw_gbs) = 2048 GB/s, but the total below is
    # 1024.0 - confirm which figure is intended.
    hbm_total_bw_gbs: 1024.0
    hbm_mapping_mode: n_to_one   # one_to_one | n_to_one (ADR-0019)
    hbm_pseudo_channels: 64      # total pseudo channels per cube
    hbm_channels_per_pe: 8       # = pseudo_channels / pes_per_cube
    hbm_channel_bw_gbs: 32.0     # per-channel bandwidth (GB/s)
  components:
    noc_router: { kind: noc_router, impl: builtin.forwarding, attrs: { overhead_ns: 2.0 } }
    m_cpu: { kind: m_cpu, impl: builtin.m_cpu, attrs: { overhead_ns: 5.0 } }
    hbm_ctrl: { kind: hbm_ctrl, impl: builtin.hbm_ctrl, attrs: { capacity: 1, efficiency: 1.0, num_pcs: 8, burst_bytes: 256, switch_penalty_ns: 0.0 } }
    sram: { kind: sram, impl: builtin.sram, attrs: { size_mb: 32, overhead_ns: 2.0 } }
  # Physical placement of non-PE components (mm coordinates)
  placement:
    m_cpu: { pos_mm: [7.5, 3.0] }  # top center, below UCIe-N
    sram: { pos_mm: [1.5, 9.0] }   # left side, below HBM zone
  ucie:
    decompose: true
    # Side letters are quoted: a bare N is a boolean under YAML 1.1 rules.
    ports: ['N', 'S', 'E', 'W']
    overhead_ns: 8.0
    n_connections: 4              # independent NOC↔UCIe connections per port
    per_connection_bw_gbs: 128.0  # BW per connection; 4 * 128 = 512 GB/s = UCIe PHY BW
  links:
    # Router mesh links (ADR-0019)
    router_link_bw_gbs: 256.0     # inter-router XY mesh link BW
    router_overhead_ns: 2.0       # per-router switching overhead
    pe_to_router_bw_gbs: 256.0    # PE_DMA ↔ router (= N * channel_bw)
    hbm_to_router_bw_gbs: 256.0   # HBM_CTRL ↔ router (= N * channel_bw)
    sram_to_router_bw_gbs: 128.0  # SRAM ↔ router
    m_cpu_to_router_mm: 0.0       # M_CPU ↔ router distance
    pe_dma_to_noc_bw_gbs: 256.0   # PE → router BW (= HBM slice BW, no bottleneck)
    noc_to_pe_cpu_mm: 0.0         # router → PE_CPU distance (command path)
# Visualization output selection: which views to emit and which sip / cube
# ids to include.
visualization:
  emit_views: [system, sip, cube]
  sip_ids: [0]
  cubes: [0, 9, 15]