# kernbench2/ccl.yaml

# ccl.yaml — CCL backend (ahbm) configuration (ADR-0023 D11)
#
# Loaded by AhbmCCLBackend at init_process_group time.
# defaults.algorithm chooses which kernel + topology is installed
# into PE_IPCQ neighbor tables. Host code is unaware of these settings.

# Baseline CCL settings for the run. Note that `buffer_kind` is also set
# under algorithms.<name> further down in this file.
# NOTE(review): presumably a per-algorithm key overrides the same-named key
# here — confirm against AhbmCCLBackend.
defaults:
  # Algorithm to run for this benchmark execution.
  # The value matches an entry name under the top-level `algorithms:` map.
  # NOTE(review): presumably it must match one — confirm how an unknown
  # name is handled by the loader.
  algorithm: lrab_hierarchical_allreduce

  # IPCQ ring buffer location.
  #   tcm  — PE-local TCM (fast, small, conflicts with compute TCM access)
  #   hbm  — PE-local HBM (large, slower DMA latency)
  #   sram — Cube-shared SRAM (medium, cube-internal contention)
  buffer_kind: tcm

  # Backpressure mode.
  #   poll  — spin-loop polling of cached peer pointers
  #   sleep — yield SimPy event, wake on credit return
  backpressure: sleep

  # Ring depth: number of slots per (direction, tx|rx) buffer.
  # Together with slot_size below, one such buffer holds
  # n_slots * slot_size = 4 * 4096 = 16384 bytes of slot payload
  # (assuming no per-slot header — TODO confirm).
  n_slots: 4

  # Slot size in bytes (must hold one tile worth of data).
  slot_size: 4096

  # PE_DMA virtual channel chunk size (D8).
  # NOTE(review): the unit is not stated here; presumably bytes, like
  # slot_size — confirm.
  vc_chunk_size: 256

  # Credit return fast path message size (D9), in bytes per the key name.
  ipcq_credit_size_bytes: 16

# Per-algorithm entries, keyed by algorithm name. Per the file header,
# defaults.algorithm selects which entry's kernel + topology is installed.
algorithms:
  # ── intercube all-reduce (pe0-only, cube mesh + inter-SIP) ──
  # Reduces across the 4×4 cube mesh within each SIP, then inter-SIP
  # exchange on root cube, then broadcast back. SIP topology is read
  # from topology.yaml → system.sips.topology. Kernel auto-selects
  # ring / torus / mesh inter-SIP exchange pattern.
  lrab_hierarchical_allreduce:
    # Dotted module path of the kernel implementation.
    # NOTE(review): presumably imported by AhbmCCLBackend — confirm.
    module: kernbench.ccl.algorithms.lrab_hierarchical_allreduce
    # The plain scalar `none` is loaded by YAML as the string "none", not
    # as null (null is spelled `null` or `~`).
    # NOTE(review): presumably means no fixed topology is declared here
    # because the kernel derives it from topology.yaml — confirm.
    topology: none
    # Same value as defaults.buffer_kind.
    buffer_kind: tcm
    # NOTE(review): the meaning and unit of n_elem are not documented in
    # this file — confirm against the kernel module.
    n_elem: 8
    # root_cube: the kernel currently elects the root dynamically as the
    # geometric center of the cube mesh (root = (h//2)*w + (w//2)) to
    # minimize the intra-SIP critical path, so this value is NOT read today.
    # Kept as a placeholder for a future explicit-root override / runtime
    # election hook (see ADR-0032 D1 + Non-goals).
    # For the 4×4 mesh described above, that formula gives
    # (4//2)*4 + (4//2) = 10, so the 15 below is not the elected root.
    root_cube: 15