Intercube allreduce: pe0 cube-mesh reduce + multi-SIP ring/torus/mesh

Add a new intercube allreduce kernel that replaces the old flat-ring
algorithms. The kernel first reduces across the 4x4 cube mesh within each SIP
(pe0-only, same-lane), then performs the inter-SIP exchange on the root cube,
and finally broadcasts back. It supports the ring_1d, torus_2d, and
mesh_2d_no_wrap SIP topologies, driven by topology.yaml, and is integrated
with dist.init_process_group / dist.all_reduce.

New files:
- src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel)
- src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip)
- tests/test_allreduce_multidevice.py (config-driven, 3 topologies)
- tests/test_distributed_intercube_allreduce.py (full distributed path)
- tests/test_intercube_sfr_config.py (SFR wiring verification)

Modified:
- distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip
- topologies.py: added torus_2d, mesh_2d_no_wrap
- install.py: global_E/W/N/S in _OPPOSITE_DIR
- topology.yaml: added system.sips.topology
- ccl.yaml: single intercube_allreduce algorithm
- benches/ccl_allreduce.py: row_wise cube-mesh tensor layout

Removed old flat-ring algorithms and their tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-16 17:33:42 -07:00
parent cfc2d74ec4
commit 1d8b9401e5
30 changed files with 876 additions and 2892 deletions
-117
View File
@@ -2,7 +2,6 @@
from __future__ import annotations
from kernbench.ccl.install import (
install_ipcq,
linear_rank_to_pe,
load_ccl_config,
resolve_algorithm_config,
@@ -26,28 +25,14 @@ def test_resolve_algorithm_config_default():
cfg = load_ccl_config()
merged = resolve_algorithm_config(cfg)
assert merged["algorithm"] == cfg["defaults"]["algorithm"]
# ccl.yaml no longer carries defaults.world_size — backend derives
# it from topology.yaml at install time. Just check the field is
# absent here (verified per-test where install_ipcq is called).
assert "world_size" not in merged or merged["world_size"] >= 1
def test_resolve_algorithm_config_override():
    """Resolving a named algorithm applies its overrides on top of the defaults."""
    algo_name = "ring_allreduce_hbm"
    ccl_cfg = load_ccl_config()
    resolved = resolve_algorithm_config(ccl_cfg, name=algo_name)

    # Values supplied by the algorithm's own entry.
    assert resolved["algorithm"] == algo_name
    assert resolved["buffer_kind"] == "hbm"

    # Anything the algorithm entry does not override still comes from
    # the defaults section.
    assert resolved["n_slots"] == ccl_cfg["defaults"]["n_slots"]
def test_linear_rank_to_pe():
engine, topo = _engine()
spec = topo.spec
# Cube 0 of SIP 0
assert linear_rank_to_pe(0, spec) == (0, 0, 0)
assert linear_rank_to_pe(7, spec) == (0, 0, 7)
# Should not exceed total PE count
pes_per_sip = (
spec["sip"]["cube_mesh"]["w"] * spec["sip"]["cube_mesh"]["h"]
* spec["cube"]["pe_layout"]["pe_per_corner"]
@@ -56,105 +41,3 @@ def test_linear_rank_to_pe():
sips = spec["system"]["sips"]["count"]
total = sips * pes_per_sip
assert total >= 8
def test_install_ipcq_neighbors_correct():
    """Check the E/W neighbor wiring produced by an 8-rank TCM ring install."""
    engine, topo = _engine()
    ring_cfg = resolve_algorithm_config(
        load_ccl_config(), name="ring_allreduce_tcm"
    )
    # Pin the install to 8 ranks (a single cube); the assertions below
    # depend on that size.
    ring_cfg["world_size"] = 8
    plan = install_ipcq(engine, topo.spec, ring_cfg)

    assert plan["world_size"] == 8
    assert plan["buffer_kind"] == "tcm"

    # Every rank in the plan needs both an E and a W entry.
    for neighbors in plan["neighbor_table"].values():
        assert "E" in neighbors
        assert "W" in neighbors

    # Inspect what was actually installed on rank 0's PE_IPCQ.
    rank0_qps = engine._components["sip0.cube0.pe0.pe_ipcq"].queue_pairs
    east = rank0_qps["E"]
    west = rank0_qps["W"]
    assert east["peer"].pe == 1  # rank 0's E neighbor is rank 1
    assert west["peer"].pe == 7  # rank 0's W neighbor is rank 7

    # The two directions must not share an rx base address.
    assert east["my_rx_base_pa"] != west["my_rx_base_pa"]
def test_install_ipcq_credit_stores_wired():
    """Rank 0's E-direction credit store must be rank 1's credit inbox."""
    engine, topo = _engine()
    ring_cfg = resolve_algorithm_config(
        load_ccl_config(), name="ring_allreduce_tcm"
    )
    ring_cfg["world_size"] = 8
    install_ipcq(engine, topo.spec, ring_cfg)

    components = engine._components
    sender = components["sip0.cube0.pe0.pe_ipcq"]  # rank 0 (pe0)
    receiver = components["sip0.cube0.pe1.pe_ipcq"]  # rank 1 (pe1)

    # Rank 0 sending E goes to rank 1, so rank 0's peer_credit_store on
    # the E direction has to be the very same object as rank 1's
    # credit_inbox (identity check, not equality).
    east = sender.queue_pairs["E"]
    assert east["peer_credit_store"] is receiver.credit_inbox
# ── ADR-0025 D1: reverse_direction opposite-preference ───────────────
def test_reverse_direction_opposite_preference_2rank_ring():
    """ADR-0025 D1: opposite-direction preference in a 2-rank ring.

    With only two ranks in a bidirectional ring, E and W both lead to the
    same peer. reverse_direction therefore has to choose the OPPOSITE
    label (W for E, E for W) so that rx_base lands on the semantically
    correct slot.

    Example: when rank 0 sends via E to rank 1, the target must be rank 1's
    W-rx buffer rather than its E-rx buffer, since rank 1's kernel recv(W)
    reads from its W-rx.
    """
    engine, topo = _engine()
    ring_cfg = resolve_algorithm_config(
        load_ccl_config(), name="ring_allreduce_tcm"
    )
    ring_cfg["world_size"] = 2
    install_ipcq(engine, topo.spec, ring_cfg)

    components = engine._components
    rank0_ipcq = components["sip0.cube0.pe0.pe_ipcq"]
    rank1_ipcq = components["sip0.cube0.pe1.pe_ipcq"]

    # Rank 1's own receive bases, one per direction.
    rank1_e_rx = rank1_ipcq.queue_pairs["E"]["my_rx_base_pa"]
    rank1_w_rx = rank1_ipcq.queue_pairs["W"]["my_rx_base_pa"]

    qp0_e = rank0_ipcq.queue_pairs["E"]
    qp0_w = rank0_ipcq.queue_pairs["W"]

    # Rank 0's E entry must point at rank 1's W-rx (the opposite label),
    # NOT at rank 1's E-rx.
    assert qp0_e["peer"].rx_base_pa == rank1_w_rx, (
        f"expected rank 0's E peer.rx_base_pa == rank 1's W-rx ({rank1_w_rx:#x}), "
        f"got {qp0_e['peer'].rx_base_pa:#x} (matches E-rx: {rank1_e_rx:#x}) — "
        f"reverse_direction picked same-label instead of opposite"
    )

    # Symmetric case: rank 0's W entry must point at rank 1's E-rx.
    assert qp0_w["peer"].rx_base_pa == rank1_e_rx
def test_reverse_direction_opposite_preference_4rank_ring_sanity():
    """ADR-0025 D1 sanity check on a ring with ws>=3.

    Here E and W lead to distinct peers, so opposite-preference gives the
    same answer as the old dict-order first-match. The test is expected to
    PASS both under the current code and under the post-fix code.
    """
    engine, topo = _engine()
    ring_cfg = resolve_algorithm_config(
        load_ccl_config(), name="ring_allreduce_tcm"
    )
    ring_cfg["world_size"] = 4
    install_ipcq(engine, topo.spec, ring_cfg)

    components = engine._components
    rank0_ipcq = components["sip0.cube0.pe0.pe_ipcq"]
    rank1_ipcq = components["sip0.cube0.pe1.pe_ipcq"]
    rank3_ipcq = components["sip0.cube0.pe3.pe_ipcq"]

    # Rank 0 going E reaches rank 1, so the target is rank 1's W-rx.
    east = rank0_ipcq.queue_pairs["E"]
    assert (
        east["peer"].rx_base_pa
        == rank1_ipcq.queue_pairs["W"]["my_rx_base_pa"]
    )

    # Rank 0 going W reaches rank 3 (the last rank in the ring), so the
    # target is rank 3's E-rx.
    west = rank0_ipcq.queue_pairs["W"]
    assert (
        west["peer"].rx_base_pa
        == rank3_ipcq.queue_pairs["E"]["my_rx_base_pa"]
    )