Intercube allreduce: pe0 cube-mesh reduce + multi-SIP ring/torus/mesh
New intercube allreduce kernel replacing the old flat ring algorithms.

Reduces across the 4x4 cube mesh within each SIP (pe0-only, same-lane), then performs the inter-SIP exchange on the root cube, then broadcasts the result back. Supports the ring_1d, torus_2d, and mesh_2d_no_wrap SIP topologies, driven by topology.yaml. Integrated with dist.init_process_group / dist.all_reduce.

New files:
- src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel)
- src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip)
- tests/test_allreduce_multidevice.py (config-driven, 3 topologies)
- tests/test_distributed_intercube_allreduce.py (full distributed path)
- tests/test_intercube_sfr_config.py (SFR wiring verification)

Modified:
- distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip
- topologies.py: added torus_2d, mesh_2d_no_wrap
- install.py: global_E/W/N/S in _OPPOSITE_DIR
- topology.yaml: added system.sips.topology
- ccl.yaml: single intercube_allreduce algorithm
- benches/ccl_allreduce.py: row_wise cube-mesh tensor layout

Removed the old flat-ring algorithms and their tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from kernbench.ccl.install import (
|
||||
install_ipcq,
|
||||
linear_rank_to_pe,
|
||||
load_ccl_config,
|
||||
resolve_algorithm_config,
|
||||
@@ -26,28 +25,14 @@ def test_resolve_algorithm_config_default():
|
||||
cfg = load_ccl_config()
|
||||
merged = resolve_algorithm_config(cfg)
|
||||
assert merged["algorithm"] == cfg["defaults"]["algorithm"]
|
||||
# ccl.yaml no longer carries defaults.world_size — backend derives
|
||||
# it from topology.yaml at install time. Just check the field is
|
||||
# absent or, if present, at least 1 (verified per-test where install_ipcq is called).
|
||||
assert "world_size" not in merged or merged["world_size"] >= 1
|
||||
|
||||
|
||||
def test_resolve_algorithm_config_override():
    """Resolving a named algorithm layers its overrides on top of the defaults."""
    base_cfg = load_ccl_config()
    resolved = resolve_algorithm_config(base_cfg, name="ring_allreduce_hbm")

    # Fields supplied by the named algorithm entry win.
    assert resolved["algorithm"] == "ring_allreduce_hbm"
    assert resolved["buffer_kind"] == "hbm"

    # Anything the algorithm does not override still comes from defaults.
    assert resolved["n_slots"] == base_cfg["defaults"]["n_slots"]
|
||||
|
||||
|
||||
def test_linear_rank_to_pe():
|
||||
engine, topo = _engine()
|
||||
spec = topo.spec
|
||||
# Cube 0 of SIP 0
|
||||
assert linear_rank_to_pe(0, spec) == (0, 0, 0)
|
||||
assert linear_rank_to_pe(7, spec) == (0, 0, 7)
|
||||
# Should not exceed total PE count
|
||||
pes_per_sip = (
|
||||
spec["sip"]["cube_mesh"]["w"] * spec["sip"]["cube_mesh"]["h"]
|
||||
* spec["cube"]["pe_layout"]["pe_per_corner"]
|
||||
@@ -56,105 +41,3 @@ def test_linear_rank_to_pe():
|
||||
sips = spec["system"]["sips"]["count"]
|
||||
total = sips * pes_per_sip
|
||||
assert total >= 8
|
||||
|
||||
|
||||
def test_install_ipcq_neighbors_correct():
    """An 8-rank install gives every rank E/W neighbours and wires rank 0's peers."""
    engine, topo = _engine()
    algo_cfg = resolve_algorithm_config(load_ccl_config(), name="ring_allreduce_tcm")
    # Pin the install to a single-cube, 8-rank ring for the checks below.
    algo_cfg["world_size"] = 8
    plan = install_ipcq(engine, topo.spec, algo_cfg)

    assert plan["world_size"] == 8
    assert plan["buffer_kind"] == "tcm"

    # Every rank's neighbour entry must carry both ring directions.
    for _rank, neighbors in plan["neighbor_table"].items():
        for direction in ("E", "W"):
            assert direction in neighbors

    # Look at what was actually installed on rank 0's PE_IPCQ.
    rank0_ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"]
    east = rank0_ipcq.queue_pairs["E"]
    west = rank0_ipcq.queue_pairs["W"]
    assert east["peer"].pe == 1  # E neighbour of rank 0 is rank 1
    assert west["peer"].pe == 7  # W neighbour of rank 0 is rank 7
    # The two directions must not share an rx base address.
    assert east["my_rx_base_pa"] != west["my_rx_base_pa"]
|
||||
|
||||
|
||||
def test_install_ipcq_credit_stores_wired():
    """Rank 0's E-direction peer credit store is rank 1's credit inbox."""
    engine, topo = _engine()
    algo_cfg = resolve_algorithm_config(load_ccl_config(), name="ring_allreduce_tcm")
    algo_cfg["world_size"] = 8
    install_ipcq(engine, topo.spec, algo_cfg)

    components = engine._components
    sender = components["sip0.cube0.pe0.pe_ipcq"]    # rank 0 (pe0)
    receiver = components["sip0.cube0.pe1.pe_ipcq"]  # rank 1 (pe1)

    # Rank 0 sending E reaches rank 1, so the credit store rank 0 holds for
    # that direction must be the very same object as rank 1's credit_inbox.
    east = sender.queue_pairs["E"]
    assert east["peer_credit_store"] is receiver.credit_inbox
|
||||
|
||||
|
||||
# ── ADR-0025 D1: reverse_direction opposite-preference ───────────────
|
||||
|
||||
|
||||
def test_reverse_direction_opposite_preference_2rank_ring():
    """ADR-0025 D1: opposite-direction preference on a 2-rank ring.

    With only two ranks in a bidirectional ring, E and W both lead to the
    same peer, so reverse_direction has to choose the OPPOSITE label (W for
    E, E for W) for rx_base to land in the semantically-correct slot.

    Example: when rank 0 sends via E to rank 1, the target must be rank 1's
    W-rx buffer rather than its E-rx buffer, since rank 1's kernel recv(W)
    reads from its W-rx.
    """
    engine, topo = _engine()
    algo_cfg = resolve_algorithm_config(load_ccl_config(), name="ring_allreduce_tcm")
    algo_cfg["world_size"] = 2
    install_ipcq(engine, topo.spec, algo_cfg)

    components = engine._components
    ipcq0 = components["sip0.cube0.pe0.pe_ipcq"]
    ipcq1 = components["sip0.cube0.pe1.pe_ipcq"]

    # Rank 1's own receive-buffer bases, one per direction.
    rank1_e_rx = ipcq1.queue_pairs["E"]["my_rx_base_pa"]
    rank1_w_rx = ipcq1.queue_pairs["W"]["my_rx_base_pa"]

    qp0_e = ipcq0.queue_pairs["E"]
    qp0_w = ipcq0.queue_pairs["W"]

    # E from rank 0 must land in rank 1's W-rx (the opposite label), not E-rx.
    assert qp0_e["peer"].rx_base_pa == rank1_w_rx, (
        f"expected rank 0's E peer.rx_base_pa == rank 1's W-rx ({rank1_w_rx:#x}), "
        f"got {qp0_e['peer'].rx_base_pa:#x} (matches E-rx: {rank1_e_rx:#x}) — "
        f"reverse_direction picked same-label instead of opposite"
    )
    # Symmetrically, W from rank 0 must land in rank 1's E-rx.
    assert qp0_w["peer"].rx_base_pa == rank1_e_rx
|
||||
|
||||
|
||||
def test_reverse_direction_opposite_preference_4rank_ring_sanity():
    """ADR-0025 D1 sanity check on a 4-rank ring.

    For world_size >= 3 the E and W peers are distinct, so preferring the
    opposite direction yields the same result as the old dict-order
    first-match. This test is expected to pass both before and after the
    opposite-preference fix.
    """
    engine, topo = _engine()
    algo_cfg = resolve_algorithm_config(load_ccl_config(), name="ring_allreduce_tcm")
    algo_cfg["world_size"] = 4
    install_ipcq(engine, topo.spec, algo_cfg)

    components = engine._components
    ipcq_r0 = components["sip0.cube0.pe0.pe_ipcq"]
    ipcq_r1 = components["sip0.cube0.pe1.pe_ipcq"]
    ipcq_r3 = components["sip0.cube0.pe3.pe_ipcq"]

    # E from rank 0 reaches rank 1 and must target rank 1's W-rx.
    east = ipcq_r0.queue_pairs["E"]
    assert east["peer"].rx_base_pa == ipcq_r1.queue_pairs["W"]["my_rx_base_pa"]

    # W from rank 0 wraps to rank 3 (last in the ring) and must target its E-rx.
    west = ipcq_r0.queue_pairs["W"]
    assert west["peer"].rx_base_pa == ipcq_r3.queue_pairs["E"]["my_rx_base_pa"]
|
||||
|
||||
Reference in New Issue
Block a user