Intercube allreduce: pe0 cube-mesh reduce + multi-SIP ring/torus/mesh

New intercube allreduce kernel replacing the old flat ring algorithms. It reduces across the 4x4 cube mesh within each SIP (pe0-only, same-lane), then performs the inter-SIP exchange on the root cube, then broadcasts back. Supports the ring_1d, torus_2d, and mesh_2d_no_wrap SIP topologies, driven by topology.yaml. Integrated with dist.init_process_group / dist.all_reduce.

New files:
- src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel)
- src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip)
- tests/test_allreduce_multidevice.py (config-driven, 3 topologies)
- tests/test_distributed_intercube_allreduce.py (full distributed path)
- tests/test_intercube_sfr_config.py (SFR wiring verification)

Modified:
- distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip
- topologies.py: added torus_2d, mesh_2d_no_wrap
- install.py: global_E/W/N/S in _OPPOSITE_DIR
- topology.yaml: added system.sips.topology
- ccl.yaml: single intercube_allreduce algorithm
- benches/ccl_allreduce.py: row_wise cube-mesh tensor layout

Removed old flat-ring algorithms and their tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 17:33:42 -07:00
parent cfc2d74ec4
commit 1d8b9401e5
30 changed files with 876 additions and 2892 deletions
@@ -23,6 +23,7 @@ Host bench code uses only real-PyTorch names:
 from __future__ import annotations

 import importlib
+import math
 from typing import Any


@@ -40,31 +41,35 @@ class AhbmCCLBackend:
        self._merged = resolve_algorithm_config(self._cfg_all)
        self._algo_module = importlib.import_module(self._merged["module"])
        self._world_size = self._resolve_world_size()
-        # ADR-0024 D7: handles pending drain by the main scheduler.
-        # Worker greenlets extend this list after submitting their collective
-        # kernel, then yield. The bench `run()` loop drains the list after
-        # all workers yielded (so all sibling kernels are live in SimPy
-        # before any rank waits, avoiding cross-rank deadlock).
        self._pending_collective_handles: list = []
+        self._dist_ctx: Any = None

-        # Eager IPCQ install — ``init_process_group`` time. Mirrors NCCL
-        # communicator creation: done once, reused across every subsequent
-        # collective call on the same process group.
-        # ADR-0024 D2: rank → SIP representative PE mapping when world_size
-        # fits in the topology's SIP count. Legacy "rank = flat PE index" is
-        # preserved when ccl.yaml explicitly overrides world_size > SIP count
-        # (backward compat path).
        spec = self.ctx.spec or {}
-        n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
-        if self._world_size <= n_sips:
-            rank_to_pe = [(r, 0, 0) for r in range(self._world_size)]
-        else:
-            rank_to_pe = None
-        self.ctx.install_ipcq(
-            algorithm=self._merged["algorithm"],
-            world_size_override=self._world_size,
-            rank_to_pe=rank_to_pe,
+        self._n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
+        self._sip_topo = str(
+            spec.get("system", {}).get("sips", {}).get("topology", "ring_1d")
        )
+        cm = spec.get("sip", {}).get("cube_mesh", {})
+        self._cube_w = int(cm.get("w", 4))
+        self._cube_h = int(cm.get("h", 4))
+
+        # Resolve SIP topology dims for the kernel
+        topo_map = getattr(self._algo_module, "TOPO_NAME_TO_KIND", None)
+        if topo_map is not None:
+            self._sip_topo_kind = topo_map.get(self._sip_topo, 0)
+        else:
+            self._sip_topo_kind = 0
+        if self._sip_topo == "ring_1d":
+            self._sip_topo_w, self._sip_topo_h = 0, 0
+        else:
+            side = int(round(math.sqrt(self._n_sips)))
+            self._sip_topo_w, self._sip_topo_h = side, side
+
+        # IPCQ install: wire all pe0s across all cubes and SIPs
+        engine = getattr(self.ctx, "engine", None)
+        if engine is not None:
+            from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
+            configure_sfr_intercube_multisip(engine, spec, self._merged)

    def _resolve_world_size(self) -> int:
        """Derive world_size (priority: algorithm override > defaults > topology).
@@ -109,15 +114,26 @@ class AhbmCCLBackend:
        n_elem = shards[0].nbytes // tensor.itemsize
        kernel_fn = self._algo_module.kernel
        kernel_args = self._algo_module.kernel_args(self._world_size, n_elem)
-        # ADR-0024 D7: submit + yield. When running under the multi-greenlet
-        # bench launcher, the scheduler (not the worker) drains the pending
-        # handles. This is required because env.run must be invoked from the
-        # MAIN greenlet — otherwise kernel_runner's spawned kernel-greenlet
-        # captures the worker-greenlet as its `_parent`, and kernel
-        # switch_to_simpy() returns control to the main scheduler loop
-        # mid-wait, causing nested re-entry and the scheduler to spin.
+
+        # Resolve sip_rank from the current greenlet's bound rank
+        from greenlet import getcurrent as _gc
+        g = _gc()
+        dist_ctx = getattr(self, "_dist_ctx", None)
+        if dist_ctx is not None:
+            sip_rank = int(dist_ctx._rank_by_greenlet.get(g, 0))
+        else:
+            sip_rank = 0
+
+        extra_args = (
+            sip_rank,
+            self._sip_topo_kind,
+            self._sip_topo_w,
+            self._sip_topo_h,
+        )
+
        pending = self.ctx.launch(
-            self._merged["algorithm"], kernel_fn, tensor, *kernel_args,
+            self._merged["algorithm"], kernel_fn, tensor,
+            *kernel_args, *extra_args,
            _defer_wait=True,
        )
        from greenlet import getcurrent
@@ -181,6 +197,7 @@ class DistributedContext:
                "DistributedContext not bound to a RuntimeContext"
            )
        self._backend = AhbmCCLBackend(torch_ctx=ctx)
+        self._backend._dist_ctx = self

    def is_initialized(self) -> bool:
        return self._backend is not None