# kernbench2/tests/test_ccl_allreduce_matrix.py

"""End-to-end matrix tests for the unified ``ccl_allreduce`` bench.

Only covers the rank = SIP TP launcher path (ADR-0024 + ADR-0027). Each
case writes a tmp ``ccl.yaml`` that selects a specific (algorithm,
buffer_kind) pair; ``world_size`` is always derived from topology SIP
count (2 in the shipped topology).

The legacy rank = PE single-driver path was removed; intra-SIP PE-level
collectives are expressed inside the kernel via ``tl.program_id`` and do
not require a host-side ``ProcessGroup``.
"""
from __future__ import annotations

import os
import textwrap

import pytest

import kernbench.cli.main as cli_main


# Text of the temporary ``ccl.yaml`` written for each test case.  The
# ``{algorithm}``, ``{module}``, ``{topology}`` and ``{buffer_kind}``
# placeholders are substituted via ``str.format``; all other values are fixed.
CCL_YAML_TEMPLATE = "\n".join((
    "defaults:",
    "  algorithm: {algorithm}",
    "  buffer_kind: {buffer_kind}",
    "  backpressure: sleep",
    "  n_slots: 4",
    "  slot_size: 4096",
    "  vc_chunk_size: 256",
    "  ipcq_credit_size_bytes: 16",
    "",
    "algorithms:",
    "  {algorithm}:",
    "    module: {module}",
    "    topology: {topology}",
    "    buffer_kind: {buffer_kind}",
    "",  # empty final element makes the text end with a newline
))


def _write_ccl_yaml(
    tmp_path,
    *,
    algorithm: str,
    module: str,
    topology: str,
    buffer_kind: str,
) -> str:
    """Render ``CCL_YAML_TEMPLATE`` into ``tmp_path / "ccl.yaml"``.

    Args:
        tmp_path: Directory that receives the file; it must support the
            ``/`` operator (e.g. pytest's ``tmp_path`` fixture).
        algorithm: Value substituted for ``{algorithm}`` in the template.
        module: Value substituted for ``{module}``.
        topology: Value substituted for ``{topology}``.
        buffer_kind: Value substituted for ``{buffer_kind}``.

    Returns:
        ``str(tmp_path)`` -- the directory holding the written file, not the
        path of the file itself.
    """
    substitutions = {
        "algorithm": algorithm,
        "module": module,
        "topology": topology,
        "buffer_kind": buffer_kind,
    }
    yaml_file = tmp_path / "ccl.yaml"
    yaml_file.write_text(CCL_YAML_TEMPLATE.format(**substitutions))
    return str(tmp_path)


# Ring all-reduce across SIPs (ws == topology SIP count = 2): one parametrized
# case per IPCQ buffer location.  Each case carries
# (algorithm, module, topology, buffer_kind) and is identified as
# ``ring_<buffer_kind>``.
CASES = [
    pytest.param(
        f"ring_allreduce_{buffer_kind}",
        "kernbench.ccl.algorithms.ring_allreduce",
        "ring_1d",
        buffer_kind,
        id=f"ring_{buffer_kind}",
    )
    for buffer_kind in ("tcm", "hbm", "sram")
]


@pytest.mark.parametrize("algorithm,module,topology,buffer_kind", CASES)
def test_ccl_allreduce_matrix(
    tmp_path, capsys, monkeypatch,
    algorithm, module, topology, buffer_kind,
):
    """Run the ``ccl_allreduce`` bench for one (algorithm, buffer_kind) case.

    A case-specific ``ccl.yaml`` is written into ``tmp_path`` and the working
    directory is switched there before the CLI is invoked (presumably so the
    CLI picks the file up from the current directory -- confirm against the
    CLI's config lookup).  The run must exit with 0, print no ``FAIL``, and
    print both the algorithm name and ``OK``.
    """
    # topology.yaml is taken from the parent of the directory holding this file.
    tests_dir = os.path.dirname(__file__)
    project_root = os.path.abspath(os.path.join(tests_dir, ".."))
    topology_file = os.path.join(project_root, "topology.yaml")

    yaml_dir = _write_ccl_yaml(
        tmp_path,
        algorithm=algorithm,
        module=module,
        topology=topology,
        buffer_kind=buffer_kind,
    )
    monkeypatch.chdir(yaml_dir)

    argv = [
        "run",
        "--topology", topology_file,
        "--bench", "ccl_allreduce",
        "--verify-data",
    ]
    rc = cli_main.main(argv)
    assert rc == 0

    out = capsys.readouterr().out
    assert "FAIL" not in out, f"unexpected FAIL in output:\n{out}"
    assert algorithm in out and "OK" in out, (
        f"expected pass line for '{algorithm}' in output:\n{out}"
    )