# kernbench2/tests/test_ccl_hello_world_guide.py

"""Validate the hello-world example from docs/ccl-author-guide.md.

This is the simplest possible CCL kernel — each PE sends its tile E
and receives a tile from W. After running, each rank's slice should
contain the data of the previous rank.
"""
from __future__ import annotations

import numpy as np

from kernbench.ccl.algorithms import hello_send
from kernbench.ccl.testing import run_kernel_in_mock


def test_hello_send_4_ranks_mock():
    """Check the hello-send kernel on a 4-rank ``ring_1d`` mock run.

    Rank ``r`` is seeded with a float16 buffer filled with ``r + 1``. After
    the kernel runs, the output for rank ``r`` must equal the buffer that
    was seeded for rank ``(r - 1) % 4``.
    """
    world_size = 4
    n_elem = 8

    # One constant-filled buffer per rank: 1.0, 2.0, 3.0, 4.0.
    inputs = [
        np.full((n_elem,), float(fill), dtype=np.float16)
        for fill in range(1, world_size + 1)
    ]

    outputs = run_kernel_in_mock(
        kernel_fn=hello_send.kernel,
        world_size=world_size,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem,),
    )

    # Expected output per rank is the input of its ring predecessor
    # (rank 0 wraps around to the last rank).
    expected = [inputs[(r - 1) % world_size] for r in range(world_size)]
    for r in range(world_size):
        assert np.array_equal(outputs[r], expected[r]), f"rank {r}: got {outputs[r]}"


def test_hello_send_via_simpy_runner():
    """Run the hello-send kernel through ``run_bench`` with a ``GraphEngine``.

    Same scenario as the mock-based test, but through real SimPy + IPCQ:
    each rank's slice of a single HBM tensor is seeded with ``rank + 1``,
    the kernel is launched, and every slice is then expected to hold the
    value that was seeded for the previous rank (wrapping at rank 0).
    The test finally asserts that the bench run itself reports success.
    """
    from kernbench.policy.placement.dp import DPPolicy
    from kernbench.runtime_api.bench_runner import run_bench
    from kernbench.runtime_api.types import resolve_device
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology

    topo = resolve_topology("topology.yaml")
    n_elem = 8
    world_size = 8
    # Byte distance between consecutive ranks' slices. Derived from the
    # float16 element size so it cannot drift from the dtype used below.
    slice_nbytes = n_elem * np.dtype(np.float16).itemsize

    def run(torch):
        # World size for this hello test is 8 (one cube). ccl.yaml no
        # longer carries a default world_size — pass it explicitly.
        # The call is made for its side effect; its return value is unused.
        torch.install_ipcq(
            algorithm="ring_allreduce_tcm", world_size_override=world_size,
        )
        a = torch.zeros(
            (1, world_size * n_elem), dtype="f16",
            dp=DPPolicy(
                sip="replicate", cube="replicate", pe="column_wise",
                num_sips=1, num_cubes=1,
            ),
            name="hello_in",
        )
        store = torch.engine.memory_store
        # NOTE(review): `or` also falls back to the first shard's address
        # when va_base is 0 — confirm 0 is never a valid virtual base.
        base = a._handle.va_base or a._handle.shards[0].pa

        # Seed rank r's slice with the constant r + 1.
        for r in range(world_size):
            store.write("hbm", base + r * slice_nbytes,
                        np.full((n_elem,), float(r + 1), dtype=np.float16))

        torch.launch("hello_send", hello_send.kernel, a, n_elem)

        # Each rank should hold the previous rank's data after the round
        for r in range(world_size):
            arr = store.read(
                "hbm", base + r * slice_nbytes, shape=(n_elem,), dtype="f16",
            )
            prev_value = float(((r - 1) % world_size) + 1)
            assert np.allclose(arr, prev_value), f"rank {r}: got {arr}, expected {prev_value}"

    result = run_bench(
        topology=topo, bench_fn=run,
        device=resolve_device("all"),
        engine_factory=lambda t, d: GraphEngine(
            getattr(t, "topology_obj", t), enable_data=True
        ),
    )
    assert result.completion.ok