1d8b9401e5
New intercube allreduce kernel replacing the old flat ring algorithms. Reduces across the 4x4 cube mesh within each SIP (pe0-only, same-lane), then inter-SIP exchange on root cube, then broadcast back. Supports ring_1d, torus_2d, and mesh_2d_no_wrap SIP topologies driven by topology.yaml. Integrated with dist.init_process_group / dist.all_reduce. New files: - src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel) - src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip) - tests/test_allreduce_multidevice.py (config-driven, 3 topologies) - tests/test_distributed_intercube_allreduce.py (full distributed path) - tests/test_intercube_sfr_config.py (SFR wiring verification) Modified: - distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip - topologies.py: added torus_2d, mesh_2d_no_wrap - install.py: global_E/W/N/S in _OPPOSITE_DIR - topology.yaml: added system.sips.topology - ccl.yaml: single intercube_allreduce algorithm - benches/ccl_allreduce.py: row_wise cube-mesh tensor layout Removed old flat-ring algorithms and their tests. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
120 lines
3.5 KiB
Python
120 lines
3.5 KiB
Python
"""End-to-end distributed test for intercube allreduce.
|
|
|
|
Exercises the full process-group path:
|
|
dist.init_process_group(backend="ahbm")
|
|
→ mp.spawn(nprocs=n_sips)
|
|
→ each worker: set_device → allocate → fill → dist.all_reduce → verify
|
|
|
|
This is the same flow a real DDP training script would use.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import textwrap
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
# topology.yaml sits one directory above the directory containing this file.
TOPOLOGY_PATH = Path(__file__).parent.parent.joinpath("topology.yaml")

# Leading dimension of the test tensor; also passed as ``num_cubes`` to the
# placement policy in ``_worker``.
N_CUBES = 16
# Trailing dimension of the test tensor (elements per row).
N_ELEM = 8
|
|
|
|
|
|
def _write_ccl_yaml(tmp_path) -> str:
|
|
body = textwrap.dedent("""\
|
|
defaults:
|
|
algorithm: intercube_allreduce
|
|
buffer_kind: tcm
|
|
backpressure: sleep
|
|
n_slots: 4
|
|
slot_size: 4096
|
|
vc_chunk_size: 256
|
|
ipcq_credit_size_bytes: 16
|
|
|
|
algorithms:
|
|
intercube_allreduce:
|
|
module: kernbench.ccl.algorithms.intercube_allreduce
|
|
topology: none
|
|
buffer_kind: tcm
|
|
n_elem: 8
|
|
root_cube: 15
|
|
""")
|
|
(tmp_path / "ccl.yaml").write_text(body)
|
|
return str(tmp_path)
|
|
|
|
|
|
def _worker(rank: int, n_sips: int, torch) -> None:
    """Run one worker's allocate / fill / all_reduce / verify sequence.

    Args:
        rank: Index of this worker. Used as the device id, in the tensor
            name, and to derive the fill value ``rank + 1``.
        n_sips: Number of workers taking part; determines the expected sum.
        torch: Object exposing ``ahbm``, ``zeros``, ``from_numpy`` and
            ``distributed`` (the test passes its runtime context here).

    Raises:
        AssertionError: If any of the ``N_CUBES`` rows is not close to the
            expected reduced value.
    """
    from kernbench.policy.placement.dp import DPPolicy

    torch.ahbm.set_device(rank)

    placement = DPPolicy(
        cube="row_wise",
        pe="replicate",
        num_pes=1,
        num_cubes=N_CUBES,
    )
    shape = (N_CUBES, N_ELEM)
    tensor = torch.zeros(shape, dtype="f16", dp=placement, name=f"sip{rank}")

    # Every element owned by this worker starts out as rank + 1.
    host_init = np.full(shape, float(rank + 1), dtype=np.float16)
    tensor.copy_(torch.from_numpy(host_init))

    before = tensor.numpy()
    print(f"[SIP {rank}] input cube0[:4] = {before[0][:4].tolist()}")

    torch.distributed.all_reduce(tensor, op="sum")

    reduced = tensor.numpy()
    # The check expects N_CUBES * (1 + 2 + ... + n_sips) in every element.
    rank_total = sum(range(1, n_sips + 1))
    expected = float(N_CUBES * rank_total)

    # Row 15 is the same index as ``root_cube`` in the ccl.yaml written by
    # this test — presumably why it is printed next to row 0; confirm.
    print(f"[SIP {rank}] output cube0[:4] = {reduced[0][:4].tolist()}")
    print(f"[SIP {rank}] output cube15[:4] = {reduced[15][:4].tolist()}")

    for cube_id in range(N_CUBES):
        row = reduced[cube_id]
        assert np.allclose(row, expected, rtol=1e-1, atol=1e-1), (
            f"SIP{rank} cube {cube_id}: "
            f"got {row[:4]}, expected {expected}"
        )

    # Only the first worker prints the summary line.
    if rank == 0:
        print(f"\n  intercube_allreduce (ws={n_sips}): "
              f"{n_sips * N_CUBES} OK")
|
|
|
|
|
|
def test_distributed_intercube_allreduce(tmp_path, monkeypatch):
    """Drive the whole path: init_process_group, spawn workers, all_reduce.

    Builds a simulation engine from ``TOPOLOGY_PATH``, initialises the
    ``ahbm`` process group, spawns one ``_worker`` per SIP listed in the
    topology spec, and prints the simulated time the spawn took.
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology

    # Run from the directory holding the generated ccl.yaml (presumably the
    # backend looks the file up relative to the working directory — confirm).
    workdir = _write_ccl_yaml(tmp_path)
    monkeypatch.chdir(workdir)

    resolved = resolve_topology(str(TOPOLOGY_PATH))
    engine = GraphEngine(resolved.topology_obj, enable_data=True)
    spec = resolved.topology_obj.spec
    n_sips = int(spec["system"]["sips"]["count"])

    runtime = RuntimeContext(
        engine=engine,
        target_device=DeviceSelector("all"),
        correlation_id="dist_intercube_ar",
        spec=spec,
    )
    with runtime as ctx:
        ctx.distributed.init_process_group(backend="ahbm")

        # World size must equal the SIP count read from the topology spec.
        assert ctx.distributed.get_world_size() == n_sips

        t_start = engine._env.now

        # Each worker is invoked as _worker(rank, n_sips, ctx).
        ctx.multiprocessing.spawn(
            _worker,
            args=(n_sips, ctx),
            nprocs=n_sips,
        )

        t_end = engine._env.now
        elapsed_ns = t_end - t_start
        print(f"\n[distributed] sim latency = "
              f"{elapsed_ns:.1f} ns ({elapsed_ns / 1000:.3f} us)")
|