"""CCL performance validation tests (ADR-0023 D13 T5). Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench under the rank = SIP TP launcher model (ADR-0024 / ADR-0027). Uses the topology-derived world_size (= 2 in the shipped topology); the latency model is topology-aware, so buffer_kind differences remain visible even at this scale. """ from __future__ import annotations import importlib import os import pytest from kernbench.runtime_api.bench_runner import run_bench from kernbench.runtime_api.types import resolve_device from kernbench.sim_engine.engine import GraphEngine from kernbench.topology.builder import resolve_topology def _engine_factory(topology, device): return GraphEngine(getattr(topology, "topology_obj", topology), enable_data=True) def _run_ring(algorithm: str, buffer_kind: str = "tcm") -> float: """Run a rank = SIP ring all-reduce via the unified bench with a tmp ccl.yaml overlay. Returns simulated kernel total_ns.""" import tempfile body = f"""\ defaults: algorithm: {algorithm} buffer_kind: {buffer_kind} backpressure: sleep n_slots: 4 slot_size: 4096 vc_chunk_size: 256 ipcq_credit_size_bytes: 16 algorithms: {algorithm}: module: kernbench.ccl.algorithms.ring_allreduce topology: ring_1d buffer_kind: {buffer_kind} n_elem: 32 """ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) with tempfile.TemporaryDirectory() as tmp: with open(os.path.join(tmp, "ccl.yaml"), "w") as f: f.write(body) old_cwd = os.getcwd() os.chdir(tmp) try: topo = resolve_topology(os.path.join(project_root, "topology.yaml")) bench_mod = importlib.import_module("benches.ccl_allreduce") result = run_bench( topology=topo, bench_fn=bench_mod.run, device=resolve_device("all"), engine_factory=_engine_factory, ) finally: os.chdir(old_cwd) assert result.completion.ok, f"{algorithm} did not complete" last_kernel = None for tr in (result.traces or []): if tr.get("phase") == "kernel": last_kernel = tr assert last_kernel is not None, f"{algorithm} produced no kernel trace" return float(last_kernel.get("total_ns", 0.0)) @pytest.mark.parametrize("buffer_kind", ["tcm", "hbm", "sram"]) def test_ccl_latency_positive(buffer_kind): """Every buffer kind must produce a positive simulated latency.""" algo = f"ring_allreduce_{buffer_kind}" ns = _run_ring(algo, buffer_kind) assert ns > 0 def test_ccl_latency_under_reasonable_bound(): """rank = SIP ring all-reduce (tile=32 f16) should finish well under 1ms.""" ns = _run_ring("ring_allreduce_tcm", "tcm") assert ns < 1_000_000 # < 1 ms simulated