Intercube allreduce: pe0 cube-mesh reduce + multi-SIP ring/torus/mesh
New intercube allreduce kernel replacing the old flat ring algorithms. It reduces across the 4x4 cube mesh within each SIP (pe0-only, same-lane), then performs the inter-SIP exchange on the root cube, then broadcasts the result back. Supports the ring_1d, torus_2d, and mesh_2d_no_wrap SIP topologies, driven by topology.yaml. Integrated with dist.init_process_group / dist.all_reduce.

New files:
- src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel)
- src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip)
- tests/test_allreduce_multidevice.py (config-driven, 3 topologies)
- tests/test_distributed_intercube_allreduce.py (full distributed path)
- tests/test_intercube_sfr_config.py (SFR wiring verification)

Modified:
- distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip
- topologies.py: added torus_2d, mesh_2d_no_wrap
- install.py: global_E/W/N/S in _OPPOSITE_DIR
- topology.yaml: added system.sips.topology
- ccl.yaml: single intercube_allreduce algorithm
- benches/ccl_allreduce.py: row_wise cube-mesh tensor layout

Removed the old flat-ring algorithms and their tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -23,6 +23,7 @@ Host bench code uses only real-PyTorch names:
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import math
|
||||
from typing import Any
|
||||
|
||||
|
||||
@@ -40,31 +41,35 @@ class AhbmCCLBackend:
|
||||
self._merged = resolve_algorithm_config(self._cfg_all)
|
||||
self._algo_module = importlib.import_module(self._merged["module"])
|
||||
self._world_size = self._resolve_world_size()
|
||||
# ADR-0024 D7: handles pending drain by the main scheduler.
|
||||
# Worker greenlets extend this list after submitting their collective
|
||||
# kernel, then yield. The bench `run()` loop drains the list after
|
||||
# all workers yielded (so all sibling kernels are live in SimPy
|
||||
# before any rank waits, avoiding cross-rank deadlock).
|
||||
self._pending_collective_handles: list = []
|
||||
self._dist_ctx: Any = None
|
||||
|
||||
# Eager IPCQ install — ``init_process_group`` time. Mirrors NCCL
|
||||
# communicator creation: done once, reused across every subsequent
|
||||
# collective call on the same process group.
|
||||
# ADR-0024 D2: rank → SIP representative PE mapping when world_size
|
||||
# fits in the topology's SIP count. Legacy "rank = flat PE index" is
|
||||
# preserved when ccl.yaml explicitly overrides world_size > SIP count
|
||||
# (backward compat path).
|
||||
spec = self.ctx.spec or {}
|
||||
n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
|
||||
if self._world_size <= n_sips:
|
||||
rank_to_pe = [(r, 0, 0) for r in range(self._world_size)]
|
||||
else:
|
||||
rank_to_pe = None
|
||||
self.ctx.install_ipcq(
|
||||
algorithm=self._merged["algorithm"],
|
||||
world_size_override=self._world_size,
|
||||
rank_to_pe=rank_to_pe,
|
||||
self._n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
|
||||
self._sip_topo = str(
|
||||
spec.get("system", {}).get("sips", {}).get("topology", "ring_1d")
|
||||
)
|
||||
cm = spec.get("sip", {}).get("cube_mesh", {})
|
||||
self._cube_w = int(cm.get("w", 4))
|
||||
self._cube_h = int(cm.get("h", 4))
|
||||
|
||||
# Resolve SIP topology dims for the kernel
|
||||
topo_map = getattr(self._algo_module, "TOPO_NAME_TO_KIND", None)
|
||||
if topo_map is not None:
|
||||
self._sip_topo_kind = topo_map.get(self._sip_topo, 0)
|
||||
else:
|
||||
self._sip_topo_kind = 0
|
||||
if self._sip_topo == "ring_1d":
|
||||
self._sip_topo_w, self._sip_topo_h = 0, 0
|
||||
else:
|
||||
side = int(round(math.sqrt(self._n_sips)))
|
||||
self._sip_topo_w, self._sip_topo_h = side, side
|
||||
|
||||
# IPCQ install: wire all pe0s across all cubes and SIPs
|
||||
engine = getattr(self.ctx, "engine", None)
|
||||
if engine is not None:
|
||||
from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
|
||||
configure_sfr_intercube_multisip(engine, spec, self._merged)
|
||||
|
||||
def _resolve_world_size(self) -> int:
|
||||
"""Derive world_size (priority: algorithm override > defaults > topology).
|
||||
@@ -109,15 +114,26 @@ class AhbmCCLBackend:
|
||||
n_elem = shards[0].nbytes // tensor.itemsize
|
||||
kernel_fn = self._algo_module.kernel
|
||||
kernel_args = self._algo_module.kernel_args(self._world_size, n_elem)
|
||||
# ADR-0024 D7: submit + yield. When running under the multi-greenlet
|
||||
# bench launcher, the scheduler (not the worker) drains the pending
|
||||
# handles. This is required because env.run must be invoked from the
|
||||
# MAIN greenlet — otherwise kernel_runner's spawned kernel-greenlet
|
||||
# captures the worker-greenlet as its `_parent`, and kernel
|
||||
# switch_to_simpy() returns control to the main scheduler loop
|
||||
# mid-wait, causing nested re-entry and the scheduler to spin.
|
||||
|
||||
# Resolve sip_rank from the current greenlet's bound rank
|
||||
from greenlet import getcurrent as _gc
|
||||
g = _gc()
|
||||
dist_ctx = getattr(self, "_dist_ctx", None)
|
||||
if dist_ctx is not None:
|
||||
sip_rank = int(dist_ctx._rank_by_greenlet.get(g, 0))
|
||||
else:
|
||||
sip_rank = 0
|
||||
|
||||
extra_args = (
|
||||
sip_rank,
|
||||
self._sip_topo_kind,
|
||||
self._sip_topo_w,
|
||||
self._sip_topo_h,
|
||||
)
|
||||
|
||||
pending = self.ctx.launch(
|
||||
self._merged["algorithm"], kernel_fn, tensor, *kernel_args,
|
||||
self._merged["algorithm"], kernel_fn, tensor,
|
||||
*kernel_args, *extra_args,
|
||||
_defer_wait=True,
|
||||
)
|
||||
from greenlet import getcurrent
|
||||
@@ -181,6 +197,7 @@ class DistributedContext:
|
||||
"DistributedContext not bound to a RuntimeContext"
|
||||
)
|
||||
self._backend = AhbmCCLBackend(torch_ctx=ctx)
|
||||
self._backend._dist_ctx = self
|
||||
|
||||
def is_initialized(self) -> bool:
|
||||
return self._backend is not None
|
||||
|
||||
Reference in New Issue
Block a user