kernbench2/tests/test_ccl_performance.py

"""CCL performance validation tests (ADR-0023 D13 T5).

Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench
under different ``ccl.yaml`` algorithm choices:

  - All buffer kinds finish in non-zero simulated time.
  - Latency stays under a deliberately loose 100 ms simulated bound
    (small tiles are expected to finish well under 1 ms).

These are sanity checks on the model itself, not on absolute numbers.
"""
from __future__ import annotations

import importlib
import os
from contextlib import contextmanager

import pytest

from kernbench.runtime_api.bench_runner import run_bench
from kernbench.runtime_api.types import resolve_device
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology


def _engine_factory(topology, device):
    """Create the data-enabled ``GraphEngine`` handed to ``run_bench``.

    ``topology`` may either expose the underlying graph as a
    ``topology_obj`` attribute or be that graph itself; when the attribute
    is absent the argument is used as-is. ``device`` is accepted to match
    the factory signature but is not used here.
    """
    graph = getattr(topology, "topology_obj", topology)
    return GraphEngine(graph, enable_data=True)


@contextmanager
def _ccl_yaml_override(algorithm: str, world_size: int | None = None):
    """Write a tmp ccl.yaml that forces a specific algorithm + world_size."""
    import tempfile
    entry_extra = f"\n    world_size: {world_size}" if world_size is not None else ""
    body = f"""
defaults:
  algorithm: {algorithm}
  buffer_kind: tcm
  backpressure: sleep
  n_slots: 4
  slot_size: 4096
  vc_chunk_size: 256
  ipcq_credit_size_bytes: 16

algorithms:
  ring_allreduce_tcm:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: tcm
  ring_allreduce_hbm:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: hbm
  ring_allreduce_sram:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: sram{entry_extra if algorithm.startswith("ring") else ""}
  {algorithm}:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: tcm{entry_extra}
""" if world_size is not None else f"""
defaults:
  algorithm: {algorithm}
  buffer_kind: tcm
  backpressure: sleep
  n_slots: 4
  slot_size: 4096
  vc_chunk_size: 256
  ipcq_credit_size_bytes: 16

algorithms:
  ring_allreduce_tcm:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: tcm
  ring_allreduce_hbm:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: hbm
  ring_allreduce_sram:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: sram
"""
    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, "ccl.yaml")
        with open(path, "w") as f:
            f.write(body)
        old_cwd = os.getcwd()
        os.chdir(tmp)
        try:
            yield path
        finally:
            os.chdir(old_cwd)


def _run_unified(algorithm: str, world_size: int | None = None) -> float:
    """Execute ``benches.ccl_allreduce`` with a temporary ``ccl.yaml`` in effect.

    The bench runs against ``../topology.yaml`` (relative to this file) on
    device ``"all"``. Returns the ``total_ns`` of the last trace whose
    ``phase`` is ``"kernel"`` as a float (``0.0`` if that trace has no
    ``total_ns`` key). Fails an assertion if the run did not complete or
    produced no kernel trace.
    """
    topology_path = os.path.join(os.path.dirname(__file__), "..", "topology.yaml")
    with _ccl_yaml_override(algorithm, world_size):
        result = run_bench(
            topology=resolve_topology(topology_path),
            bench_fn=importlib.import_module("benches.ccl_allreduce").run,
            device=resolve_device("all"),
            engine_factory=_engine_factory,
        )
    assert result.completion.ok, f"{algorithm} did not complete"

    kernel_traces = [
        trace for trace in (result.traces or [])
        if trace.get("phase") == "kernel"
    ]
    assert kernel_traces, f"{algorithm} produced no kernel trace"
    # Only the final kernel-phase trace is reported.
    return float(kernel_traces[-1].get("total_ns", 0.0))


@pytest.mark.parametrize(
    "algorithm",
    ["ring_allreduce_tcm", "ring_allreduce_hbm", "ring_allreduce_sram"],
)
def test_ccl_latency_positive(algorithm):
    """Each ring all-reduce buffer variant reports a strictly positive latency."""
    simulated_ns = _run_unified(algorithm)
    assert 0 < simulated_ns


def test_ccl_latency_under_reasonable_bound():
    """Sanity bound on the tcm ring all-reduce latency.

    The run is expected to be overhead-dominated and finish well under
    1 ms simulated, but the asserted ceiling is a deliberately loose
    100 ms so the check only catches gross model regressions.
    """
    loose_bound_ns = 100_000_000  # 100 ms simulated
    simulated_ns = _run_unified("ring_allreduce_tcm")
    assert simulated_ns < loose_bound_ns