ADR-0032 + intra_* opposite directions in IPCQ install

Add intra_N/S/E/W to install.py _OPPOSITE_DIR table so the intra-cube
PE-to-PE namespace is symmetrical with intercube N/S/E/W. ADR-0032
documents the intercube allreduce algorithm (supersedes ADR-0029).
Refresh ADR-0024/0025/0029 cross-refs and update
test_intercube_sfr_config.py to cover the new intra_* mappings. Drop
the obsolete test_ccl_round_robin_recv.py (replaced by intercube tests).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-27 16:43:01 -07:00
parent 81cc32c46b
commit 1c33afec55
7 changed files with 363 additions and 111 deletions
+86 -60
View File
@@ -1,8 +1,9 @@
"""Tests for configure_sfr_intercube_multisip neighbor table wiring.
Verifies that IPCQ neighbor tables are correctly installed for
intercube (pe0, 4×4 mesh N/S/E/W) + inter-SIP (pe0, all cubes,
global_E/global_W) communication.
Verifies full IPCQ hardware wiring (independent of DPPolicy):
- intra-cube (2×4 PE grid) → intra_N/S/E/W
- intercube same-lane → N/S/E/W
- inter-SIP same-(cube, pe) → global_N/S/E/W
"""
from __future__ import annotations
@@ -16,6 +17,7 @@ from kernbench.topology.builder import resolve_topology
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
N_CUBES = 16
PES_PER_CUBE = 8
def _engine_and_spec():
@@ -36,78 +38,102 @@ class TestConfigureSfrNeighborTables:
plan = configure_sfr_intercube_multisip(engine, spec, cfg)
n_sips = int(spec["system"]["sips"]["count"])
assert plan["world_size"] == n_sips * N_CUBES
assert len(plan["rank_to_pe"]) == n_sips * N_CUBES
for pe_idx, (sip, cube, pe) in enumerate(plan["rank_to_pe"]):
assert pe == 0, f"pe_idx {pe_idx}: pe must be 0, got {pe}"
expected = n_sips * N_CUBES * PES_PER_CUBE
assert plan["world_size"] == expected
assert len(plan["rank_to_pe"]) == expected
def test_corner_cube0_has_E_and_S_only(self):
"""Cube 0 (row=0, col=0) is NW corner: only E and S neighbors."""
# ── Intra-cube (intra_N/S/E/W) ────────────────────────────────
def test_pe0_intra_cube_has_intra_E_and_intra_S(self):
"""pe0 is NW of the 2×4 PE grid: intra_E=pe1, intra_S=pe4."""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"]
qp = ipcq.queue_pairs
assert "E" in qp, "cube 0 must have E neighbor"
assert "S" in qp, "cube 0 must have S neighbor"
assert "W" not in qp, "cube 0 (col=0) must NOT have W neighbor"
assert "N" not in qp, "cube 0 (row=0) must NOT have N neighbor"
qp = engine._components["sip0.cube0.pe0.pe_ipcq"].queue_pairs
assert "intra_E" in qp
assert qp["intra_E"]["peer"].pe == 1
assert "intra_S" in qp
assert qp["intra_S"]["peer"].pe == 4
assert "intra_W" not in qp
assert "intra_N" not in qp
def test_pe5_intra_cube_has_all_four(self):
"""pe5 (row=1, col=1 in 2×4 grid) has all 4 intra directions.
Intra neighbors: intra_N=pe1, intra_E=pe6, intra_W=pe4,
intra_S not present (row=1 is bottom row).
"""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
qp = engine._components["sip0.cube0.pe5.pe_ipcq"].queue_pairs
assert qp["intra_N"]["peer"].pe == 1
assert qp["intra_E"]["peer"].pe == 6
assert qp["intra_W"]["peer"].pe == 4
assert "intra_S" not in qp # bottom row
# ── Intercube same-lane (N/S/E/W) ─────────────────────────────
def test_corner_cube0_pe0_has_intercube_E_and_S(self):
"""Cube 0 (NW mesh corner): intercube E→cube1, S→cube4."""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
qp = engine._components["sip0.cube0.pe0.pe_ipcq"].queue_pairs
assert qp["E"]["peer"].cube == 1
assert qp["E"]["peer"].pe == 0 # same-lane
assert qp["S"]["peer"].cube == 4
assert qp["S"]["peer"].pe == 0
assert "W" not in qp, "cube 0 has no west neighbor"
assert "N" not in qp, "cube 0 has no north neighbor"
def test_interior_cube5_has_all_four(self):
"""Cube 5 (row=1, col=1) is interior: N/S/E/W all present."""
def test_interior_cube5_pe3_has_all_four_intercube_same_lane(self):
"""Cube 5 interior, pe3: intercube N/S/E/W all present, same-lane."""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
ipcq = engine._components["sip0.cube5.pe0.pe_ipcq"]
qp = ipcq.queue_pairs
assert qp["N"]["peer"].cube == 1
assert qp["S"]["peer"].cube == 9
assert qp["E"]["peer"].cube == 6
assert qp["W"]["peer"].cube == 4
qp = engine._components["sip0.cube5.pe3.pe_ipcq"].queue_pairs
for d, expected_cube in [("N", 1), ("S", 9), ("E", 6), ("W", 4)]:
assert qp[d]["peer"].cube == expected_cube
assert qp[d]["peer"].pe == 3 # same-lane
def test_root_cube15_has_inter_sip(self):
"""Cube 15 (root, SE corner) has N, W + global_E/global_W."""
def test_all_pes_have_intercube_wiring(self):
"""Every PE on every interior cube has intercube same-lane wiring."""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
ipcq0 = engine._components["sip0.cube15.pe0.pe_ipcq"]
qp0 = ipcq0.queue_pairs
assert "N" in qp0
assert "W" in qp0
assert "E" not in qp0, "cube 15 (col=3) must NOT have E"
assert "S" not in qp0, "cube 15 (row=3) must NOT have S"
assert "global_E" in qp0, "root cube must have global_E"
assert "global_W" in qp0, "root cube must have global_W"
assert qp0["global_E"]["peer"].sip == 1
assert qp0["global_E"]["peer"].cube == 15
ipcq1 = engine._components["sip1.cube15.pe0.pe_ipcq"]
qp1 = ipcq1.queue_pairs
assert qp1["global_E"]["peer"].sip == 0
assert qp1["global_E"]["peer"].cube == 15
def test_all_cubes_have_inter_sip(self):
"""ALL cubes (not just root) are wired for inter-SIP."""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
root_cube = int(cfg.get("root_cube", N_CUBES - 1))
for cube_id in range(N_CUBES):
ipcq = engine._components[f"sip0.cube{cube_id}.pe0.pe_ipcq"]
qp = ipcq.queue_pairs
assert "global_E" in qp, (
f"sip0.cube{cube_id}.pe0 missing global_E"
)
assert "global_W" in qp, (
f"sip0.cube{cube_id}.pe0 missing global_W"
)
if cube_id == root_cube:
assert qp["global_E"]["peer"].sip != 0, (
f"root cube {root_cube} global_E must point to another SIP"
# Interior cube 5: every PE should have N/S/E/W same-lane.
for pe in range(PES_PER_CUBE):
qp = engine._components[f"sip0.cube5.pe{pe}.pe_ipcq"].queue_pairs
for d in ("N", "S", "E", "W"):
assert d in qp, f"sip0.cube5.pe{pe} missing intercube {d}"
assert qp[d]["peer"].pe == pe, (
f"sip0.cube5.pe{pe} {d} not same-lane"
)
# ── Inter-SIP (global_*) ──────────────────────────────────────
def test_every_pe_on_every_cube_has_inter_sip(self):
"""All PEs on all cubes wired for inter-SIP via global_*."""
engine, spec = _engine_and_spec()
cfg = _merged_cfg()
configure_sfr_intercube_multisip(engine, spec, cfg)
for cube_id in range(N_CUBES):
for pe in range(PES_PER_CUBE):
qp = engine._components[
f"sip0.cube{cube_id}.pe{pe}.pe_ipcq"
].queue_pairs
assert "global_E" in qp, (
f"sip0.cube{cube_id}.pe{pe} missing global_E"
)
assert "global_W" in qp
# Peer must be same (cube, pe) on another SIP.
assert qp["global_E"]["peer"].sip == 1
assert qp["global_E"]["peer"].cube == cube_id
assert qp["global_E"]["peer"].pe == pe