Intercube allreduce: pe0 cube-mesh reduce + multi-SIP ring/torus/mesh

New intercube allreduce kernel replacing the old flat ring algorithms.
Reduces across the 4x4 cube mesh within each SIP (pe0-only, same-lane),
then inter-SIP exchange on root cube, then broadcast back. Supports
ring_1d, torus_2d, and mesh_2d_no_wrap SIP topologies driven by
topology.yaml. Integrated with dist.init_process_group / dist.all_reduce.

New files:
- src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel)
- src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip)
- tests/test_allreduce_multidevice.py (config-driven, 3 topologies)
- tests/test_distributed_intercube_allreduce.py (full distributed path)
- tests/test_intercube_sfr_config.py (SFR wiring verification)

Modified:
- distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip
- topologies.py: added torus_2d, mesh_2d_no_wrap
- install.py: global_E/W/N/S in _OPPOSITE_DIR
- topology.yaml: added system.sips.topology
- ccl.yaml: single intercube_allreduce algorithm
- benches/ccl_allreduce.py: row_wise cube-mesh tensor layout

Removed old flat-ring algorithms and their tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-16 17:33:42 -07:00
parent cfc2d74ec4
commit 1d8b9401e5
30 changed files with 876 additions and 2892 deletions
+113
View File
@@ -0,0 +1,113 @@
"""Tests for configure_sfr_intercube_multisip neighbor table wiring.
Verifies that IPCQ neighbor tables are correctly installed for
intercube (pe0, 4×4 mesh N/S/E/W) + inter-SIP (pe0, all cubes,
global_E/global_W) communication.
"""
from __future__ import annotations
from pathlib import Path
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
N_CUBES = 16
def _engine_and_spec():
topo = resolve_topology(str(TOPOLOGY_PATH))
engine = GraphEngine(topo.topology_obj, enable_data=True)
return engine, topo.topology_obj.spec
def _merged_cfg():
cfg = load_ccl_config()
return resolve_algorithm_config(cfg, name="intercube_allreduce")
class TestConfigureSfrNeighborTables:
def test_world_size_and_rank_to_pe(self):
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
plan = configure_sfr_intercube_multisip(engine, spec, cfg)
n_sips = int(spec["system"]["sips"]["count"])
assert plan["world_size"] == n_sips * N_CUBES
assert len(plan["rank_to_pe"]) == n_sips * N_CUBES
for pe_idx, (sip, cube, pe) in enumerate(plan["rank_to_pe"]):
assert pe == 0, f"pe_idx {pe_idx}: pe must be 0, got {pe}"
def test_corner_cube0_has_E_and_S_only(self):
"""Cube 0 (row=0, col=0) is NW corner: only E and S neighbors."""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"]
qp = ipcq.queue_pairs
assert "E" in qp, "cube 0 must have E neighbor"
assert "S" in qp, "cube 0 must have S neighbor"
assert "W" not in qp, "cube 0 (col=0) must NOT have W neighbor"
assert "N" not in qp, "cube 0 (row=0) must NOT have N neighbor"
assert qp["E"]["peer"].cube == 1
assert qp["S"]["peer"].cube == 4
def test_interior_cube5_has_all_four(self):
"""Cube 5 (row=1, col=1) is interior: N/S/E/W all present."""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
ipcq = engine._components["sip0.cube5.pe0.pe_ipcq"]
qp = ipcq.queue_pairs
assert qp["N"]["peer"].cube == 1
assert qp["S"]["peer"].cube == 9
assert qp["E"]["peer"].cube == 6
assert qp["W"]["peer"].cube == 4
def test_root_cube15_has_inter_sip(self):
"""Cube 15 (root, SE corner) has N, W + global_E/global_W."""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
ipcq0 = engine._components["sip0.cube15.pe0.pe_ipcq"]
qp0 = ipcq0.queue_pairs
assert "N" in qp0
assert "W" in qp0
assert "E" not in qp0, "cube 15 (col=3) must NOT have E"
assert "S" not in qp0, "cube 15 (row=3) must NOT have S"
assert "global_E" in qp0, "root cube must have global_E"
assert "global_W" in qp0, "root cube must have global_W"
assert qp0["global_E"]["peer"].sip == 1
assert qp0["global_E"]["peer"].cube == 15
ipcq1 = engine._components["sip1.cube15.pe0.pe_ipcq"]
qp1 = ipcq1.queue_pairs
assert qp1["global_E"]["peer"].sip == 0
assert qp1["global_E"]["peer"].cube == 15
def test_all_cubes_have_inter_sip(self):
"""ALL cubes (not just root) are wired for inter-SIP."""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
root_cube = int(cfg.get("root_cube", N_CUBES - 1))
for cube_id in range(N_CUBES):
ipcq = engine._components[f"sip0.cube{cube_id}.pe0.pe_ipcq"]
qp = ipcq.queue_pairs
assert "global_E" in qp, (
f"sip0.cube{cube_id}.pe0 missing global_E"
)
assert "global_W" in qp, (
f"sip0.cube{cube_id}.pe0 missing global_W"
)
if cube_id == root_cube:
assert qp["global_E"]["peer"].sip != 0, (
f"root cube {root_cube} global_E must point to another SIP"
)