cfc2d74ec4
The unified ccl_allreduce bench previously carried two execution models
in one worker with ``if world_size == n_sips:`` branching:
- TP mode (rank = SIP, ADR-0024/0027): proper ProcessGroup semantics.
- Legacy rank = PE mode: single-driver worker allocating one big tensor
distributed across all PEs via _derive_dp, with kernel-level SPMD via
program_id.
The second model is unnecessary — intra-SIP PE-level collectives are
expressed inside the kernel (tl.send/tl.recv with program_id, IPCQ) and
do not need a host-side ProcessGroup. Removing it lets the bench be a
clean reference implementation of the TP launcher.
benches/ccl_allreduce.py:
- Config resolved once in run() via _resolve_cfg -> _BenchCfg dataclass.
- world_size != n_sips (i.e. any rank = PE configuration) now raises RuntimeError explicitly.
- _worker / _allocate_rank_tile / _init_with_rank_value / _report each
have one concern; duplicated init + verification paths collapsed.
- _derive_dp and the second verify+print block deleted.
- 166 lines -> 91 lines.
ccl.yaml:
- mesh_allreduce_4 (world_size: 4) and tree_allreduce_7 (world_size: 7)
algorithm entries removed (rank = PE only).
- Algorithm kernel files (kernbench.ccl.algorithms.mesh_allreduce,
tree_allreduce) kept as-is for direct-dispatch future use.
tests/test_ccl_allreduce_matrix.py:
- Matrix shrinks from 7 cases to 3: ring × {tcm, hbm, sram} at ws =
topology SIP count (= 2). mesh_2x2, tree_binary_7, ring_multi_cube,
and the three ring_*_8 cases removed.
tests/test_ccl_performance.py:
- _run_8rank renamed to _run_ring; world_size: 8 override dropped; now
exercises rank = SIP ring all-reduce.
tests/test_mp_spawn.py, tests/test_ccl_ddp_launcher.py:
- Monkeypatch target updated from bench.worker to bench._worker
(signature now takes BenchCfg instead of (rank, world_size)).
555 passed, 1 intentional skip. Tests that directly call
install_ipcq(world_size_override=N) for kernel-level sanity
(test_ccl_hello_world_guide, test_recv_copy_to_dst, test_tl_recv_async,
test_ccl_deadlock_detection) are unchanged — they never went through
the bench and still exercise the kernel-only path.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
86 lines
2.8 KiB
Python
86 lines
2.8 KiB
Python
"""CCL performance validation tests (ADR-0023 D13 T5).

Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench
under the rank = SIP TP launcher model (ADR-0024 / ADR-0027). Uses the
topology-derived world_size (= 2 in the shipped topology); the latency
model is topology-aware, so buffer_kind differences remain visible even
at this scale.
"""
|
|
from __future__ import annotations
|
|
|
|
import importlib
|
|
import os
|
|
|
|
import pytest
|
|
|
|
from kernbench.runtime_api.bench_runner import run_bench
|
|
from kernbench.runtime_api.types import resolve_device
|
|
from kernbench.sim_engine.engine import GraphEngine
|
|
from kernbench.topology.builder import resolve_topology
|
|
|
|
|
|
def _engine_factory(topology, device):
    """Build the data-enabled ``GraphEngine`` handed to ``run_bench``.

    ``topology`` may be a wrapper exposing a ``topology_obj`` attribute or
    the topology object itself; whichever is found is passed to the engine.
    ``device`` is accepted to satisfy the factory signature and is unused.
    """
    topo_obj = getattr(topology, "topology_obj", topology)
    return GraphEngine(topo_obj, enable_data=True)
|
|
|
|
|
|
def _run_ring(algorithm: str, buffer_kind: str = "tcm") -> float:
    """Run the unified ``ccl_allreduce`` bench against a temporary ccl.yaml.

    A throwaway directory receives a ``ccl.yaml`` that registers
    ``algorithm`` (backed by the ring all-reduce module) with the requested
    ``buffer_kind``. The process changes into that directory for the
    duration of the ``run_bench`` call and always changes back afterwards.

    Args:
        algorithm: Name used both as the default algorithm and as the key
            of the single entry under ``algorithms``.
        buffer_kind: Value written to both ``buffer_kind`` fields.

    Returns:
        ``total_ns`` of the last trace whose ``phase`` is ``"kernel"``
        (``0.0`` if that trace has no ``total_ns`` key).

    Raises:
        AssertionError: If the run did not complete, or produced no
            kernel-phase trace.
    """
    import tempfile

    # NOTE(review): the YAML nesting below was reconstructed from the key
    # structure (the listing this was reviewed from had lost leading
    # whitespace) — confirm against the original file.
    overlay = f"""\
defaults:
  algorithm: {algorithm}
  buffer_kind: {buffer_kind}
  backpressure: sleep
  n_slots: 4
  slot_size: 4096
  vc_chunk_size: 256
  ipcq_credit_size_bytes: 16

algorithms:
  {algorithm}:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: {buffer_kind}
    n_elem: 32
"""
    # Resolve to an absolute path before chdir so it stays valid afterwards.
    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    topology_path = os.path.join(repo_root, "topology.yaml")

    with tempfile.TemporaryDirectory() as workdir:
        with open(os.path.join(workdir, "ccl.yaml"), "w") as fh:
            fh.write(overlay)

        previous_cwd = os.getcwd()
        # presumably the bench picks up ccl.yaml from the cwd — verify
        # against benches/ccl_allreduce.
        os.chdir(workdir)
        try:
            topo = resolve_topology(topology_path)
            bench_mod = importlib.import_module("benches.ccl_allreduce")
            result = run_bench(
                topology=topo,
                bench_fn=bench_mod.run,
                device=resolve_device("all"),
                engine_factory=_engine_factory,
            )
        finally:
            # Restore the cwd even on failure so later tests are unaffected.
            os.chdir(previous_cwd)

    assert result.completion.ok, f"{algorithm} did not complete"

    kernel_traces = [
        tr for tr in (result.traces or []) if tr.get("phase") == "kernel"
    ]
    assert kernel_traces, f"{algorithm} produced no kernel trace"
    return float(kernel_traces[-1].get("total_ns", 0.0))
|
|
|
|
|
|
@pytest.mark.parametrize("buffer_kind", ["tcm", "hbm", "sram"])
def test_ccl_latency_positive(buffer_kind):
    """The ring all-reduce reports a strictly positive latency for each buffer kind."""
    latency_ns = _run_ring(f"ring_allreduce_{buffer_kind}", buffer_kind)
    assert latency_ns > 0
|
|
|
|
|
|
def test_ccl_latency_under_reasonable_bound():
    """The tcm ring all-reduce (n_elem = 32 overlay) stays below 1 ms simulated."""
    one_ms_in_ns = 1_000_000
    assert _run_ring("ring_allreduce_tcm", "tcm") < one_ms_in_ns  # < 1 ms simulated
|