"""Validate the hello-world example from docs/ccl-author-guide.md.

This is the simplest possible CCL kernel — each PE sends its tile E and
receives a tile from W. After running, each rank's slice should contain
the data of the previous rank.
"""
from __future__ import annotations

import numpy as np

from kernbench.ccl.algorithms import hello_send
from kernbench.ccl.testing import run_kernel_in_mock


def test_hello_send_4_ranks_mock() -> None:
    """Run ``hello_send.kernel`` on a 4-rank ``ring_1d`` mock.

    Rank ``r`` starts with a tile filled with ``r + 1``; after the kernel
    runs, each rank's output must equal the input of rank
    ``(r - 1) % world_size``.
    """
    n_elem = 8
    world_size = 4
    # Fill each rank's tile with a distinct value (r + 1) so that the
    # origin of every received tile is unambiguous.
    inputs = [
        np.full((n_elem,), float(r + 1), dtype=np.float16)
        for r in range(world_size)
    ]
    outputs = run_kernel_in_mock(
        kernel_fn=hello_send.kernel,
        world_size=world_size,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem,),
    )
    # rank r should have rank (r-1) % world_size's data
    for r in range(world_size):
        prev = inputs[(r - 1) % world_size]
        assert np.array_equal(outputs[r], prev), f"rank {r}: got {outputs[r]}"


def test_hello_send_via_simpy_runner() -> None:
    """Same but through real SimPy + IPCQ.

    Seeds one ``n_elem``-element float16 slice per rank in the ``"hbm"``
    memory store, launches ``hello_send.kernel`` through ``run_bench``, and
    checks that every slice afterwards holds the previous rank's value.
    """
    # Imported inside the test, as in the original, so the simulator stack
    # is only loaded when this test actually runs.
    from kernbench.policy.placement.dp import DPPolicy
    from kernbench.runtime_api.bench_runner import run_bench
    from kernbench.runtime_api.types import resolve_device
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology

    topo = resolve_topology("topology.yaml")
    n_elem = 8
    world_size = 8

    def run(torch):
        """Bench body: seed per-rank slices, launch the kernel, verify."""
        # World size for this hello test is 8 (one cube). ccl.yaml no
        # longer carries a default world_size — pass it explicitly.
        # The return value is not needed here; the call is made for its
        # setup effect only.
        torch.install_ipcq(
            algorithm="ring_allreduce_tcm",
            world_size_override=world_size,
        )
        a = torch.zeros(
            (1, world_size * n_elem),
            dtype="f16",
            dp=DPPolicy(
                cube="replicate",
                pe="column_wise",
                num_cubes=1,
            ),
            name="hello_in",
        )
        store = torch.engine.memory_store
        # NOTE(review): `or` also falls back to the first shard's `pa` when
        # `va_base` is 0 — confirm 0 is never a valid base address.
        base = a._handle.va_base or a._handle.shards[0].pa
        # Byte size of one rank's slice: n_elem float16 values, 2 bytes each.
        nbytes = n_elem * 2
        for r in range(world_size):
            store.write(
                "hbm",
                base + r * nbytes,
                np.full((n_elem,), float(r + 1), dtype=np.float16),
            )

        torch.launch("hello_send", hello_send.kernel, a, n_elem)

        # Each rank should hold the previous rank's data after the round
        for r in range(world_size):
            arr = store.read(
                "hbm", base + r * nbytes, shape=(n_elem,), dtype="f16"
            )
            prev_value = float(((r - 1) % world_size) + 1)
            assert np.allclose(arr, prev_value), f"rank {r}: got {arr}, expected {prev_value}"

    result = run_bench(
        topology=topo,
        bench_fn=run,
        device=resolve_device("all"),
        engine_factory=lambda t, d: GraphEngine(
            getattr(t, "topology_obj", t), enable_data=True
        ),
    )
    assert result.completion.ok