# kernbench2/ccl.yaml

# ccl.yaml — CCL backend (ahbm) configuration (ADR-0023 D11)
#
# Loaded by AhbmCCLBackend at init_process_group time.
# defaults.algorithm chooses which kernel + topology is installed
# into PE_IPCQ neighbor tables. Host code is unaware of these settings.

defaults:
  # Run-wide settings for the CCL backend. Some keys (e.g. ``buffer_kind``)
  # also appear on individual entries in the ``algorithms`` section.
  # NOTE(review): presumably the per-algorithm value overrides the one
  # given here — confirm against AhbmCCLBackend.

  # Algorithm to run for this benchmark execution. The value is the key of
  # an entry in the ``algorithms`` section of this file.
  algorithm: ring_allreduce_tcm

  # NOTE: world_size is deliberately not set here. AhbmCCLBackend takes it
  # from the chosen algorithm's entry when that entry sets ``world_size``;
  # otherwise it derives it from topology.yaml
  # (``sips × cubes_per_sip × pes_per_cube``). This mirrors real PyTorch DDP,
  # where ranks/world_size come from environment variables, not from code.

  # IPCQ ring buffer location. One of:
  #   tcm  — PE-local TCM (fast, small, conflicts with compute TCM access)
  #   hbm  — PE-local HBM (large, slower DMA latency)
  #   sram — Cube-shared SRAM (medium, cube-internal contention)
  buffer_kind: tcm

  # Backpressure mode. One of:
  #   poll  — spin-loop polling of cached peer pointers
  #   sleep — yield a SimPy event and wake on credit return
  backpressure: sleep

  # Ring depth: number of slots in each (direction, tx|rx) buffer.
  n_slots: 4

  # Size of one slot in bytes. A slot must be able to hold one tile's worth
  # of data.
  slot_size: 4096

  # PE_DMA virtual-channel chunk size (D8). The first implementation does
  # not use chunk-level interleaving; this value is reserved for future
  # precision improvements.
  vc_chunk_size: 256

  # Size in bytes of a credit-return fast-path message (D9). Used by the
  # bottleneck-bandwidth latency calculation. Typical values are 16-64 bytes.
  ipcq_credit_size_bytes: 16

algorithms:
  # Catalogue of selectable algorithms, keyed by the name used in
  # ``defaults.algorithm``. Every entry gives the implementing ``module``
  # (dotted path), a ``topology`` name, a ``buffer_kind`` and ``n_elem``;
  # ``world_size`` is optional and, when absent, is derived from
  # topology.yaml.
  # NOTE(review): ``n_elem`` is presumably the tile size in elements —
  # confirm against the algorithm modules.

  # ── ring all-reduce, buffer in PE_TCM ──
  # No ``world_size`` here, so it is topology-derived (full system,
  # 256 ranks). A smaller tile size is used at high rank counts so that f16
  # sums stay within the verification tolerance and op_log replay scales.
  ring_allreduce_tcm:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: tcm
    n_elem: 8

  # ── ring all-reduce, buffer in PE-local HBM ──
  ring_allreduce_hbm:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: hbm
    n_elem: 8

  # ── ring all-reduce, buffer in cube SRAM ──
  ring_allreduce_sram:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: sram
    n_elem: 8

  # ── 2D mesh all-reduce ──
  # Works for perfect-square world sizes only; this entry uses 2×2 = 4 PEs.
  mesh_allreduce_4:
    module: kernbench.ccl.algorithms.mesh_allreduce
    topology: mesh_2d
    buffer_kind: tcm
    world_size: 4
    n_elem: 16

  # ── tree all-reduce (binary tree, 7 PEs) ──
  tree_allreduce_7:
    module: kernbench.ccl.algorithms.tree_allreduce
    topology: tree_binary
    buffer_kind: tcm
    world_size: 7
    n_elem: 16

  # ── hierarchical all-reduce ──
  # Three levels: intra-cube → inter-cube → inter-SIP. Uses a bidirectional
  # ring reduce plus a chain broadcast: ~25 rounds vs 255 flat.
  # ``topology`` is the literal string 'none'. It is quoted so that it cannot
  # be misread as a YAML null (which is spelled ``null`` or ``~``); the
  # parsed value is the same string as the unquoted form.
  hierarchical_allreduce:
    module: kernbench.ccl.algorithms.hierarchical_allreduce
    topology: 'none'
    buffer_kind: tcm
    n_elem: 16