---
# ccl.yaml — CCL backend (ahbm) configuration (ADR-0023 D11)
#
# Loaded by AhbmCCLBackend at init_process_group time.
# defaults.algorithm chooses which kernel + topology is installed
# into PE_IPCQ neighbor tables. Host code is unaware of these settings.

defaults:
  # Algorithm to run for this benchmark execution. The value must be the
  # name of one of the entries under ``algorithms`` below.
  algorithm: ring_allreduce_tcm

  # NOTE: world_size is not set here by default. AhbmCCLBackend derives it
  # from the chosen algorithm's entry (if it sets ``world_size``) or from
  # topology.yaml (``sips × cubes_per_sip × pes_per_cube``). This mirrors
  # real PyTorch DDP where ranks/world_size come from env vars, not code.

  # IPCQ ring buffer location.
  #   tcm  — PE-local TCM (fast, small, conflicts with compute TCM access)
  #   hbm  — PE-local HBM (large, slower DMA latency)
  #   sram — Cube-shared SRAM (medium, cube-internal contention)
  buffer_kind: tcm

  # Backpressure mode.
  #   poll  — spin-loop polling of cached peer pointers
  #   sleep — yield SimPy event, wake on credit return
  backpressure: sleep

  # Ring depth: number of slots per (direction, tx|rx) buffer.
  n_slots: 4

  # Slot size in bytes (must hold one tile's worth of data).
  slot_size: 4096

  # PE_DMA virtual channel chunk size (D8). First implementation does not
  # use chunk-level interleave; this is reserved for future precision.
  vc_chunk_size: 256

  # Credit return fast path message size in bytes (D9). Used by the
  # bottleneck-BW latency calculation. 16-64 bytes typical.
  ipcq_credit_size_bytes: 16

algorithms:
  # ── ring all-reduce, buffer in PE_TCM ──
  # Defaults to topology-derived world_size (full system, 256 ranks).
  # Use a smaller tile size at high rank counts so f16 sums stay within
  # the verification tolerance and op_log replay scales.
  ring_allreduce_tcm:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: tcm
    n_elem: 8

  # ── ring all-reduce, buffer in PE-local HBM ──
  ring_allreduce_hbm:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: hbm
    n_elem: 8

  # ── ring all-reduce, buffer in cube SRAM ──
  ring_allreduce_sram:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: sram
    n_elem: 8

  # ── 2D mesh all-reduce: perfect square only (2×2 = 4 PEs) ──
  mesh_allreduce_4:
    module: kernbench.ccl.algorithms.mesh_allreduce
    topology: mesh_2d
    buffer_kind: tcm
    world_size: 4
    n_elem: 16

  # ── tree all-reduce (binary, 7 PEs) ──
  tree_allreduce_7:
    module: kernbench.ccl.algorithms.tree_allreduce
    topology: tree_binary
    buffer_kind: tcm
    world_size: 7
    n_elem: 16