Intercube allreduce: pe0 cube-mesh reduce + multi-SIP ring/torus/mesh

New intercube allreduce kernel replacing the old flat ring algorithms. Reduces across the 4x4 cube mesh within each SIP (pe0-only, same-lane), then inter-SIP exchange on root cube, then broadcast back. Supports ring_1d, torus_2d, and mesh_2d_no_wrap SIP topologies driven by topology.yaml. Integrated with dist.init_process_group / dist.all_reduce. New files: - src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel) - src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip) - tests/test_allreduce_multidevice.py (config-driven, 3 topologies) - tests/test_distributed_intercube_allreduce.py (full distributed path) - tests/test_intercube_sfr_config.py (SFR wiring verification) Modified: - distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip - topologies.py: added torus_2d, mesh_2d_no_wrap - install.py: global_E/W/N/S in _OPPOSITE_DIR - topology.yaml: added system.sips.topology - ccl.yaml: single intercube_allreduce algorithm - benches/ccl_allreduce.py: row_wise cube-mesh tensor layout Removed old flat-ring algorithms and their tests. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 17:33:42 -07:00
parent cfc2d74ec4
commit 1d8b9401e5
30 changed files with 876 additions and 2892 deletions
@@ -0,0 +1,119 @@
+"""End-to-end distributed test for intercube allreduce.
+
+Exercises the full process-group path:
+    dist.init_process_group(backend="ahbm")
+    → mp.spawn(nprocs=n_sips)
+    → each worker: set_device → allocate → fill → dist.all_reduce → verify
+
+This is the same flow a real DDP training script would use.
+"""
+from __future__ import annotations
+
+import os
+import textwrap
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
+
+N_CUBES = 16
+N_ELEM = 8
+
+
+def _write_ccl_yaml(tmp_path) -> str:
+    body = textwrap.dedent("""\
+        defaults:
+          algorithm: intercube_allreduce
+          buffer_kind: tcm
+          backpressure: sleep
+          n_slots: 4
+          slot_size: 4096
+          vc_chunk_size: 256
+          ipcq_credit_size_bytes: 16
+
+        algorithms:
+          intercube_allreduce:
+            module: kernbench.ccl.algorithms.intercube_allreduce
+            topology: none
+            buffer_kind: tcm
+            n_elem: 8
+            root_cube: 15
+    """)
+    (tmp_path / "ccl.yaml").write_text(body)
+    return str(tmp_path)
+
+
+def _worker(rank: int, n_sips: int, torch) -> None:
+    """Per-SIP worker: allocate, fill, all_reduce, verify."""
+    from kernbench.policy.placement.dp import DPPolicy
+
+    torch.ahbm.set_device(rank)
+
+    dp = DPPolicy(
+        cube="row_wise", pe="replicate",
+        num_pes=1, num_cubes=N_CUBES,
+    )
+    tensor = torch.zeros(
+        (N_CUBES, N_ELEM), dtype="f16", dp=dp,
+        name=f"sip{rank}",
+    )
+
+    init_arr = np.full((N_CUBES, N_ELEM), float(rank + 1), dtype=np.float16)
+    tensor.copy_(torch.from_numpy(init_arr))
+
+    print(f"[SIP {rank}] input  cube0[:4] = {tensor.numpy()[0][:4].tolist()}")
+
+    torch.distributed.all_reduce(tensor, op="sum")
+
+    arr = tensor.numpy()
+    expected = float(N_CUBES * sum(range(1, n_sips + 1)))
+
+    print(f"[SIP {rank}] output cube0[:4]  = {arr[0][:4].tolist()}")
+    print(f"[SIP {rank}] output cube15[:4] = {arr[15][:4].tolist()}")
+
+    for cube_id in range(N_CUBES):
+        assert np.allclose(arr[cube_id], expected, rtol=1e-1, atol=1e-1), (
+            f"SIP{rank} cube {cube_id}: "
+            f"got {arr[cube_id][:4]}, expected {expected}"
+        )
+
+    if rank == 0:
+        print(f"\n  intercube_allreduce (ws={n_sips}): "
+              f"{n_sips * N_CUBES} OK")
+
+
+def test_distributed_intercube_allreduce(tmp_path, monkeypatch):
+    """Full distributed path: init_process_group → mp.spawn → all_reduce."""
+    from kernbench.runtime_api.context import RuntimeContext
+    from kernbench.runtime_api.types import DeviceSelector
+    from kernbench.sim_engine.engine import GraphEngine
+    from kernbench.topology.builder import resolve_topology
+
+    monkeypatch.chdir(_write_ccl_yaml(tmp_path))
+
+    topo = resolve_topology(str(TOPOLOGY_PATH))
+    engine = GraphEngine(topo.topology_obj, enable_data=True)
+    spec = topo.topology_obj.spec
+    n_sips = int(spec["system"]["sips"]["count"])
+
+    with RuntimeContext(
+        engine=engine,
+        target_device=DeviceSelector("all"),
+        correlation_id="dist_intercube_ar",
+        spec=spec,
+    ) as ctx:
+        ctx.distributed.init_process_group(backend="ahbm")
+
+        assert ctx.distributed.get_world_size() == n_sips
+
+        t_start = engine._env.now
+
+        ctx.multiprocessing.spawn(
+            _worker, args=(n_sips, ctx), nprocs=n_sips,
+        )
+
+        t_end = engine._env.now
+        print(f"\n[distributed] sim latency = "
+              f"{t_end - t_start:.1f} ns ({(t_end - t_start) / 1000:.3f} us)")