ff7d727ddd
Rename the intercube all-reduce identity to lrab_hierarchical_allreduce (module, config key, distributed test) so the name reflects both levels it implements: LRAB intra-SIP (local reduce to center root + broadcast) and the hierarchical inter-SIP topology exchange (ring/torus/mesh). ADR-0032 slug kept as the stable decision id; pure rename, no logic change. Also in this batch: - ADR-0032 (EN+KO): document the shipped center-root bidirectional reduce (doc was stale corner-root); annotate ccl.yaml root_cube as a placeholder. - Rename allreduce + pe2pe latency plots to descriptive, title-matching filenames and retitle the in-plot headings; drop overview/overview_log. - Point the PPTX image refs at the new plot names. Doc + derived-artifact + rename only; no simulation behavior changed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
140 lines
5.6 KiB
Python
140 lines
5.6 KiB
Python
"""Tests for configure_sfr_intercube_multisip neighbor table wiring.

Verifies full IPCQ hardware wiring (independent of DPPolicy):
  - intra-cube (2×4 PE grid)      → intra_N/S/E/W
  - intercube same-lane           → N/S/E/W
  - inter-SIP same-(cube, pe)     → global_N/S/E/W
"""
|
||
from __future__ import annotations
|
||
|
||
from pathlib import Path
|
||
|
||
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
|
||
from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
|
||
from kernbench.sim_engine.engine import GraphEngine
|
||
from kernbench.topology.builder import resolve_topology
|
||
|
||
# Topology description consumed by every test in this module; it sits one
# directory above the folder that contains this file.
TOPOLOGY_PATH = Path(__file__).parent.parent.joinpath("topology.yaml")

# Geometry the assertions below assume for each SIP: cubes per SIP, and PEs
# per cube (the 2×4 PE grid mentioned in the module docstring).
N_CUBES, PES_PER_CUBE = 16, 8
|
||
|
||
|
||
def _engine_and_spec():
    """Build a fresh engine from ``TOPOLOGY_PATH``.

    Returns:
        A ``(engine, spec)`` pair: a ``GraphEngine`` constructed with
        ``enable_data=True`` over the resolved topology object, and that
        topology object's ``spec``.
    """
    topology = resolve_topology(str(TOPOLOGY_PATH)).topology_obj
    return GraphEngine(topology, enable_data=True), topology.spec
|
||
|
||
|
||
def _merged_cfg():
    """Load the CCL config and resolve it for ``lrab_hierarchical_allreduce``.

    Returns:
        Whatever ``resolve_algorithm_config`` yields for that algorithm name;
        it is passed straight to ``configure_sfr_intercube_multisip``.
    """
    return resolve_algorithm_config(
        load_ccl_config(), name="lrab_hierarchical_allreduce"
    )
|
||
|
||
|
||
class TestConfigureSfrNeighborTables:
    """Checks the queue-pair tables written by configure_sfr_intercube_multisip.

    Each test builds its own engine, runs the wiring once, then inspects the
    ``queue_pairs`` mapping of selected ``pe_ipcq`` components.  Three key
    families are covered:

    - ``intra_N/S/E/W``  : neighbors inside one cube's PE grid
    - ``N/S/E/W``        : the same PE lane on a neighboring cube
    - ``global_E/W``     : the same (cube, pe) on another SIP

    The expected peer ids used below are consistent with row-major numbering
    (PEs: 2 rows × 4 columns; cubes: 4 columns) — NOTE(review): confirm
    against the topology file if the layout ever changes.
    """

    # ── Shared setup helpers (leading underscore: not collected by pytest) ──

    @staticmethod
    def _wire():
        """Create a fresh engine, run the wiring, return ``(engine, spec, plan)``.

        Centralizes the setup every test needs so that each test body only
        contains its assertions.
        """
        engine, spec = _engine_and_spec()
        plan = configure_sfr_intercube_multisip(engine, spec, _merged_cfg())
        return engine, spec, plan

    @staticmethod
    def _queue_pairs(engine, cube, pe, sip=0):
        """Return the ``queue_pairs`` mapping of one PE's ``pe_ipcq`` component."""
        return engine._components[f"sip{sip}.cube{cube}.pe{pe}.pe_ipcq"].queue_pairs

    # ── Plan-level sanity ─────────────────────────────────────────

    def test_world_size_and_rank_to_pe(self):
        """World size and rank table both equal sips × cubes × PEs-per-cube."""
        _, spec, plan = self._wire()

        n_sips = int(spec["system"]["sips"]["count"])
        expected = n_sips * N_CUBES * PES_PER_CUBE
        assert plan["world_size"] == expected
        assert len(plan["rank_to_pe"]) == expected

    # ── Intra-cube (intra_N/S/E/W) ────────────────────────────────

    def test_pe0_intra_cube_has_intra_E_and_intra_S(self):
        """pe0 is NW of the 2×4 PE grid: intra_E=pe1, intra_S=pe4."""
        engine, _, _ = self._wire()

        qp = self._queue_pairs(engine, cube=0, pe=0)
        assert "intra_E" in qp
        assert qp["intra_E"]["peer"].pe == 1
        assert "intra_S" in qp
        assert qp["intra_S"]["peer"].pe == 4
        assert "intra_W" not in qp
        assert "intra_N" not in qp

    def test_pe5_intra_cube_has_all_four(self):
        """pe5 (row=1, col=1 in 2×4 grid) has every intra link its position allows.

        Despite the test name, that is three links, not four:
        intra_N=pe1, intra_E=pe6, intra_W=pe4.  intra_S must be absent
        because row=1 is the bottom row of the grid.
        """
        engine, _, _ = self._wire()

        qp = self._queue_pairs(engine, cube=0, pe=5)
        assert qp["intra_N"]["peer"].pe == 1
        assert qp["intra_E"]["peer"].pe == 6
        assert qp["intra_W"]["peer"].pe == 4
        assert "intra_S" not in qp  # bottom row

    # ── Intercube same-lane (N/S/E/W) ─────────────────────────────

    def test_corner_cube0_pe0_has_intercube_E_and_S(self):
        """Cube 0 (NW mesh corner): intercube E→cube1, S→cube4."""
        engine, _, _ = self._wire()

        qp = self._queue_pairs(engine, cube=0, pe=0)
        assert qp["E"]["peer"].cube == 1
        assert qp["E"]["peer"].pe == 0  # same-lane
        assert qp["S"]["peer"].cube == 4
        assert qp["S"]["peer"].pe == 0
        assert "W" not in qp, "cube 0 has no west neighbor"
        assert "N" not in qp, "cube 0 has no north neighbor"

    def test_interior_cube5_pe3_has_all_four_intercube_same_lane(self):
        """Cube 5 interior, pe3: intercube N/S/E/W all present, same-lane."""
        engine, _, _ = self._wire()

        qp = self._queue_pairs(engine, cube=5, pe=3)
        for d, expected_cube in [("N", 1), ("S", 9), ("E", 6), ("W", 4)]:
            peer = qp[d]["peer"]
            # Messages name the direction so a failure inside the loop is
            # attributable without re-running under a debugger.
            assert peer.cube == expected_cube, (
                f"sip0.cube5.pe3 {d} peer cube is {peer.cube}, "
                f"expected {expected_cube}"
            )
            assert peer.pe == 3, f"sip0.cube5.pe3 {d} not same-lane"  # same-lane

    def test_all_pes_have_intercube_wiring(self):
        """Every PE lane of interior cube 5 has all four intercube links.

        Only cube 5 is sampled as the representative interior cube; other
        interior cubes are not inspected by this test.
        """
        engine, _, _ = self._wire()

        # Interior cube 5: every PE should have N/S/E/W same-lane.
        for pe in range(PES_PER_CUBE):
            qp = self._queue_pairs(engine, cube=5, pe=pe)
            for d in ("N", "S", "E", "W"):
                assert d in qp, f"sip0.cube5.pe{pe} missing intercube {d}"
                assert qp[d]["peer"].pe == pe, (
                    f"sip0.cube5.pe{pe} {d} not same-lane"
                )

    # ── Inter-SIP (global_*) ──────────────────────────────────────

    def test_every_pe_on_every_cube_has_inter_sip(self):
        """Every PE of every cube on sip0 is wired for inter-SIP via global_*.

        Checks that both global_E and global_W exist, and that the global_E
        peer is the same (cube, pe) on SIP 1.  Only sip0's components are
        inspected.
        """
        engine, _, _ = self._wire()

        for cube_id in range(N_CUBES):
            for pe in range(PES_PER_CUBE):
                where = f"sip0.cube{cube_id}.pe{pe}"
                qp = self._queue_pairs(engine, cube=cube_id, pe=pe)
                assert "global_E" in qp, (
                    f"sip0.cube{cube_id}.pe{pe} missing global_E"
                )
                assert "global_W" in qp, f"{where} missing global_W"
                # Peer must be same (cube, pe) on another SIP.
                peer = qp["global_E"]["peer"]
                assert peer.sip == 1, f"{where} global_E peer sip is {peer.sip}"
                assert peer.cube == cube_id, (
                    f"{where} global_E peer cube is {peer.cube}"
                )
                assert peer.pe == pe, f"{where} global_E peer pe is {peer.pe}"
|