# kernbench2/tests/test_ccl_performance.py

"""CCL performance validation tests (ADR-0023 D13 T5).

Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench.

Uses 8-rank (single cube) for all buffer variants — the latency model
is topology-aware, so buffer_kind differences are visible even at small
scale. Full-system (256-rank) cross-SIP latency is covered by the
``test_ccl_allreduce_matrix[ring_full_system]`` slow test.
"""
from __future__ import annotations

import importlib
import os

import pytest

from kernbench.runtime_api.bench_runner import run_bench
from kernbench.runtime_api.types import resolve_device
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology


def _engine_factory(topology, device):
    """Create a ``GraphEngine`` with data enabled for ``topology``.

    If ``topology`` exposes a ``topology_obj`` attribute, that object is
    handed to the engine; otherwise ``topology`` itself is used. ``device``
    is accepted as part of the factory signature but is not used here.
    """
    engine_input = getattr(topology, "topology_obj", topology)
    return GraphEngine(engine_input, enable_data=True)


def _run_8rank(algorithm: str, buffer_kind: str = "tcm") -> float:
    """Run the unified ``ccl_allreduce`` bench with a temporary ccl.yaml.

    A ``ccl.yaml`` describing an 8-rank ``ring_1d`` configuration for
    ``algorithm`` / ``buffer_kind`` is written into a temporary directory,
    and the process working directory is switched there while the bench
    runs (presumably so the bench picks the overlay up from the cwd —
    verify against ``benches.ccl_allreduce``).

    Returns the ``total_ns`` of the last trace whose ``phase`` is
    ``"kernel"`` (0.0 if that trace has no ``total_ns`` entry). Asserts that
    the run completed and that at least one kernel trace exists.
    """
    import tempfile

    overlay = f"""\
defaults:
  algorithm: {algorithm}
  buffer_kind: {buffer_kind}
  backpressure: sleep
  n_slots: 4
  slot_size: 4096
  vc_chunk_size: 256
  ipcq_credit_size_bytes: 16

algorithms:
  {algorithm}:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: {buffer_kind}
    world_size: 8
    n_elem: 32
"""
    # topology.yaml is looked up one directory above this test file.
    tests_dir = os.path.dirname(__file__)
    project_root = os.path.abspath(os.path.join(tests_dir, ".."))

    with tempfile.TemporaryDirectory() as workdir:
        overlay_path = os.path.join(workdir, "ccl.yaml")
        with open(overlay_path, "w") as handle:
            handle.write(overlay)

        previous_cwd = os.getcwd()
        os.chdir(workdir)
        try:
            topology_path = os.path.join(project_root, "topology.yaml")
            topo = resolve_topology(topology_path)
            bench_mod = importlib.import_module("benches.ccl_allreduce")
            result = run_bench(
                topology=topo,
                bench_fn=bench_mod.run,
                device=resolve_device("all"),
                engine_factory=_engine_factory,
            )
        finally:
            # Always restore the cwd, even if the bench raises, and do so
            # before the temporary directory is removed.
            os.chdir(previous_cwd)

    assert result.completion.ok, f"{algorithm} did not complete"

    # Keep only kernel-phase traces; the last one carries the reported time.
    kernel_traces = [
        trace for trace in (result.traces or [])
        if trace.get("phase") == "kernel"
    ]
    assert kernel_traces, f"{algorithm} produced no kernel trace"
    return float(kernel_traces[-1].get("total_ns", 0.0))


@pytest.mark.parametrize("buffer_kind", ["tcm", "hbm", "sram"])
def test_ccl_latency_positive(buffer_kind):
    """The 8-rank run must report a strictly positive latency.

    Parametrized over the ``tcm``, ``hbm`` and ``sram`` buffer kinds; the
    algorithm entry is named ``ring_allreduce_<buffer_kind>``.
    """
    latency_ns = _run_8rank(f"ring_allreduce_{buffer_kind}", buffer_kind)
    assert latency_ns > 0


def test_ccl_latency_under_reasonable_bound():
    """The 8-rank TCM ring all-reduce (n_elem=32) must stay below 1 ms simulated."""
    one_ms_in_ns = 1_000_000
    latency_ns = _run_8rank("ring_allreduce_tcm", buffer_kind="tcm")
    assert latency_ns < one_ms_in_ns