"""CCL performance validation tests (ADR-0023 D13 T5). Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench under different ``ccl.yaml`` algorithm choices: - All buffer kinds finish in non-zero simulated time. - Latency is bounded well under 1 ms for small tiles. These are sanity checks on the model itself, not on absolute numbers. """ from __future__ import annotations import importlib import os from contextlib import contextmanager import pytest from kernbench.runtime_api.bench_runner import run_bench from kernbench.runtime_api.types import resolve_device from kernbench.sim_engine.engine import GraphEngine from kernbench.topology.builder import resolve_topology def _engine_factory(topology, device): return GraphEngine(getattr(topology, "topology_obj", topology), enable_data=True) @contextmanager def _ccl_yaml_override(algorithm: str, world_size: int | None = None): """Write a tmp ccl.yaml that forces a specific algorithm + world_size.""" import tempfile entry_extra = f"\n world_size: {world_size}" if world_size is not None else "" body = f""" defaults: algorithm: {algorithm} buffer_kind: tcm backpressure: sleep n_slots: 4 slot_size: 4096 vc_chunk_size: 256 ipcq_credit_size_bytes: 16 algorithms: ring_allreduce_tcm: module: kernbench.ccl.algorithms.ring_allreduce topology: ring_1d buffer_kind: tcm ring_allreduce_hbm: module: kernbench.ccl.algorithms.ring_allreduce topology: ring_1d buffer_kind: hbm ring_allreduce_sram: module: kernbench.ccl.algorithms.ring_allreduce topology: ring_1d buffer_kind: sram{entry_extra if algorithm.startswith("ring") else ""} {algorithm}: module: kernbench.ccl.algorithms.ring_allreduce topology: ring_1d buffer_kind: tcm{entry_extra} """ if world_size is not None else f""" defaults: algorithm: {algorithm} buffer_kind: tcm backpressure: sleep n_slots: 4 slot_size: 4096 vc_chunk_size: 256 ipcq_credit_size_bytes: 16 algorithms: ring_allreduce_tcm: module: kernbench.ccl.algorithms.ring_allreduce topology: ring_1d buffer_kind: tcm ring_allreduce_hbm: module: kernbench.ccl.algorithms.ring_allreduce topology: ring_1d buffer_kind: hbm ring_allreduce_sram: module: kernbench.ccl.algorithms.ring_allreduce topology: ring_1d buffer_kind: sram """ with tempfile.TemporaryDirectory() as tmp: path = os.path.join(tmp, "ccl.yaml") with open(path, "w") as f: f.write(body) old_cwd = os.getcwd() os.chdir(tmp) try: yield path finally: os.chdir(old_cwd) def _run_unified(algorithm: str, world_size: int | None = None) -> float: """Run the unified ccl_allreduce bench under a ccl.yaml override, return simulated kernel total_ns.""" with _ccl_yaml_override(algorithm, world_size): topo = resolve_topology( os.path.join(os.path.dirname(__file__), "..", "topology.yaml") ) bench_mod = importlib.import_module("benches.ccl_allreduce") result = run_bench( topology=topo, bench_fn=bench_mod.run, device=resolve_device("all"), engine_factory=_engine_factory, ) assert result.completion.ok, f"{algorithm} did not complete" last_kernel = None for tr in (result.traces or []): if tr.get("phase") == "kernel": last_kernel = tr assert last_kernel is not None, f"{algorithm} produced no kernel trace" return float(last_kernel.get("total_ns", 0.0)) @pytest.mark.parametrize("algorithm", [ "ring_allreduce_tcm", "ring_allreduce_hbm", "ring_allreduce_sram", ]) def test_ccl_latency_positive(algorithm): """Every buffer kind must produce a positive simulated latency.""" ns = _run_unified(algorithm) assert ns > 0 def test_ccl_latency_under_reasonable_bound(): """Sanity bound: ring all-reduce (tile=32 f16) should finish in well under 1 ms simulated. Way overhead-dominated for small tiles.""" ns = _run_unified("ring_allreduce_tcm") assert ns < 100_000_000 # < 100 ms simulated — very loose bound