Intercube allreduce: pe0 cube-mesh reduce + multi-SIP ring/torus/mesh

New intercube allreduce kernel replacing the old flat ring algorithms.
It first reduces across the 4x4 cube mesh within each SIP (pe0-only,
same-lane), then performs the inter-SIP exchange on the root cube, and
finally broadcasts the result back. It supports the ring_1d, torus_2d,
and mesh_2d_no_wrap SIP topologies, selected via topology.yaml, and is
integrated with dist.init_process_group / dist.all_reduce.

New files:
- src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel)
- src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip)
- tests/test_allreduce_multidevice.py (config-driven, 3 topologies)
- tests/test_distributed_intercube_allreduce.py (full distributed path)
- tests/test_intercube_sfr_config.py (SFR wiring verification)

Modified:
- distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip
- topologies.py: added torus_2d, mesh_2d_no_wrap
- install.py: global_E/W/N/S in _OPPOSITE_DIR
- topology.yaml: added system.sips.topology
- ccl.yaml: single intercube_allreduce algorithm
- benches/ccl_allreduce.py: row_wise cube-mesh tensor layout

Removed old flat-ring algorithms and their tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-16 17:33:42 -07:00
parent cfc2d74ec4
commit 1d8b9401e5
30 changed files with 876 additions and 2892 deletions
+46 -29
View File
@@ -23,6 +23,7 @@ Host bench code uses only real-PyTorch names:
from __future__ import annotations
import importlib
import math
from typing import Any
@@ -40,31 +41,35 @@ class AhbmCCLBackend:
self._merged = resolve_algorithm_config(self._cfg_all)
self._algo_module = importlib.import_module(self._merged["module"])
self._world_size = self._resolve_world_size()
# ADR-0024 D7: handles pending drain by the main scheduler.
# Worker greenlets extend this list after submitting their collective
# kernel, then yield. The bench `run()` loop drains the list after
# all workers yielded (so all sibling kernels are live in SimPy
# before any rank waits, avoiding cross-rank deadlock).
self._pending_collective_handles: list = []
self._dist_ctx: Any = None
# Eager IPCQ install — ``init_process_group`` time. Mirrors NCCL
# communicator creation: done once, reused across every subsequent
# collective call on the same process group.
# ADR-0024 D2: rank → SIP representative PE mapping when world_size
# fits in the topology's SIP count. Legacy "rank = flat PE index" is
# preserved when ccl.yaml explicitly overrides world_size > SIP count
# (backward compat path).
spec = self.ctx.spec or {}
n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
if self._world_size <= n_sips:
rank_to_pe = [(r, 0, 0) for r in range(self._world_size)]
else:
rank_to_pe = None
self.ctx.install_ipcq(
algorithm=self._merged["algorithm"],
world_size_override=self._world_size,
rank_to_pe=rank_to_pe,
self._n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
self._sip_topo = str(
spec.get("system", {}).get("sips", {}).get("topology", "ring_1d")
)
cm = spec.get("sip", {}).get("cube_mesh", {})
self._cube_w = int(cm.get("w", 4))
self._cube_h = int(cm.get("h", 4))
# Resolve SIP topology dims for the kernel
topo_map = getattr(self._algo_module, "TOPO_NAME_TO_KIND", None)
if topo_map is not None:
self._sip_topo_kind = topo_map.get(self._sip_topo, 0)
else:
self._sip_topo_kind = 0
if self._sip_topo == "ring_1d":
self._sip_topo_w, self._sip_topo_h = 0, 0
else:
side = int(round(math.sqrt(self._n_sips)))
self._sip_topo_w, self._sip_topo_h = side, side
# IPCQ install: wire all pe0s across all cubes and SIPs
engine = getattr(self.ctx, "engine", None)
if engine is not None:
from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
configure_sfr_intercube_multisip(engine, spec, self._merged)
def _resolve_world_size(self) -> int:
"""Derive world_size (priority: algorithm override > defaults > topology).
@@ -109,15 +114,26 @@ class AhbmCCLBackend:
n_elem = shards[0].nbytes // tensor.itemsize
kernel_fn = self._algo_module.kernel
kernel_args = self._algo_module.kernel_args(self._world_size, n_elem)
# ADR-0024 D7: submit + yield. When running under the multi-greenlet
# bench launcher, the scheduler (not the worker) drains the pending
# handles. This is required because env.run must be invoked from the
# MAIN greenlet — otherwise kernel_runner's spawned kernel-greenlet
# captures the worker-greenlet as its `_parent`, and kernel
# switch_to_simpy() returns control to the main scheduler loop
# mid-wait, causing nested re-entry and the scheduler to spin.
# Resolve sip_rank from the current greenlet's bound rank
from greenlet import getcurrent as _gc
g = _gc()
dist_ctx = getattr(self, "_dist_ctx", None)
if dist_ctx is not None:
sip_rank = int(dist_ctx._rank_by_greenlet.get(g, 0))
else:
sip_rank = 0
extra_args = (
sip_rank,
self._sip_topo_kind,
self._sip_topo_w,
self._sip_topo_h,
)
pending = self.ctx.launch(
self._merged["algorithm"], kernel_fn, tensor, *kernel_args,
self._merged["algorithm"], kernel_fn, tensor,
*kernel_args, *extra_args,
_defer_wait=True,
)
from greenlet import getcurrent
@@ -181,6 +197,7 @@ class DistributedContext:
"DistributedContext not bound to a RuntimeContext"
)
self._backend = AhbmCCLBackend(torch_ctx=ctx)
self._backend._dist_ctx = self
def is_initialized(self) -> bool:
return self._backend is not None