"""End-to-end matrix tests for the unified ``ccl_allreduce`` bench. Only covers the rank = SIP TP launcher path (ADR-0024 + ADR-0027). Each case writes a tmp ``ccl.yaml`` that selects a specific (algorithm, buffer_kind) pair; ``world_size`` is always derived from topology SIP count (2 in the shipped topology). The legacy rank = PE single-driver path was removed; intra-SIP PE-level collectives are expressed inside the kernel via ``tl.program_id`` and do not require a host-side ``ProcessGroup``. """ from __future__ import annotations import os import textwrap import pytest import kernbench.cli.main as cli_main CCL_YAML_TEMPLATE = textwrap.dedent("""\ defaults: algorithm: {algorithm} buffer_kind: {buffer_kind} backpressure: sleep n_slots: 4 slot_size: 4096 vc_chunk_size: 256 ipcq_credit_size_bytes: 16 algorithms: {algorithm}: module: {module} topology: {topology} buffer_kind: {buffer_kind} """) def _write_ccl_yaml( tmp_path, *, algorithm: str, module: str, topology: str, buffer_kind: str, ) -> str: body = CCL_YAML_TEMPLATE.format( algorithm=algorithm, module=module, topology=topology, buffer_kind=buffer_kind, ) (tmp_path / "ccl.yaml").write_text(body) return str(tmp_path) CASES = [ # Ring all-reduce across SIPs (ws == topology SIP count = 2), # one case per IPCQ buffer location. pytest.param( "ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce", "ring_1d", "tcm", id="ring_tcm", ), pytest.param( "ring_allreduce_hbm", "kernbench.ccl.algorithms.ring_allreduce", "ring_1d", "hbm", id="ring_hbm", ), pytest.param( "ring_allreduce_sram", "kernbench.ccl.algorithms.ring_allreduce", "ring_1d", "sram", id="ring_sram", ), ] @pytest.mark.parametrize("algorithm,module,topology,buffer_kind", CASES) def test_ccl_allreduce_matrix( tmp_path, capsys, monkeypatch, algorithm, module, topology, buffer_kind, ): """Each (algorithm × buffer_kind) combo passes through the unified rank = SIP bench and yields ``ws OK`` where ``ws == topology SIP count``.""" project_root = os.path.abspath( os.path.join(os.path.dirname(__file__), "..") ) yaml_dir = _write_ccl_yaml( tmp_path, algorithm=algorithm, module=module, topology=topology, buffer_kind=buffer_kind, ) monkeypatch.chdir(yaml_dir) rc = cli_main.main([ "run", "--topology", os.path.join(project_root, "topology.yaml"), "--bench", "ccl_allreduce", "--verify-data", ]) assert rc == 0 out = capsys.readouterr().out assert "FAIL" not in out, f"unexpected FAIL in output:\n{out}" assert f"{algorithm}" in out and "OK" in out, ( f"expected pass line for '{algorithm}' in output:\n{out}" )