---
# ccl.yaml — CCL backend (ahbm) configuration (ADR-0023 D11)
#
# Loaded by AhbmCCLBackend at init_process_group time.
# defaults.algorithm chooses which kernel + topology is installed
# into PE_IPCQ neighbor tables. Host code is unaware of these settings.

defaults:
  # Algorithm to run for this benchmark execution.
  # Must name an entry under the top-level `algorithms` mapping below.
  algorithm: intercube_allreduce

  # IPCQ ring buffer location.
  #   tcm  — PE-local TCM (fast, small, conflicts with compute TCM access)
  #   hbm  — PE-local HBM (large, slower DMA latency)
  #   sram — Cube-shared SRAM (medium, cube-internal contention)
  buffer_kind: tcm

  # Backpressure mode.
  #   poll  — spin-loop polling of cached peer pointers
  #   sleep — yield SimPy event, wake on credit return
  backpressure: sleep

  # Ring depth: number of slots per (direction, tx|rx) buffer.
  n_slots: 4

  # Slot size in bytes (must hold one tile worth of data).
  slot_size: 4096

  # PE_DMA virtual channel chunk size (D8).
  vc_chunk_size: 256

  # Credit return fast path message size (D9).
  ipcq_credit_size_bytes: 16

algorithms:
  # ── intercube all-reduce (pe0-only, cube mesh + inter-SIP) ──
  # Reduces across the 4×4 cube mesh within each SIP, then inter-SIP
  # exchange on root cube, then broadcast back. SIP topology is read
  # from topology.yaml → system.sips.topology. Kernel auto-selects
  # ring / torus / mesh inter-SIP exchange pattern.
  intercube_allreduce:
    module: kernbench.ccl.algorithms.intercube_allreduce
    # Quoted so it is unambiguously the string "none" (not YAML null).
    topology: "none"
    # NOTE(review): same value as defaults.buffer_kind; presumably a
    # per-algorithm override — confirm precedence in the loader.
    buffer_kind: tcm
    n_elem: 8
    root_cube: 15