sccl: drive allreduce tests via torch.distributed; reorganize into tests/sccl/

Convert the multidevice allreduce correctness + latency/buffer-kind sweeps
to run through the real PyTorch-distributed path
(init_process_group(backend="ahbm") -> mp.spawn -> dist.all_reduce) instead
of direct ctx.launch, and reorganize the CCL/allreduce tests into a
tests/sccl/ package split one test per file.

Production change (required for the distributed path on non-square SIP grids):
- AhbmCCLBackend now reads explicit system.sips.w/h from the spec, with a
  square-only sqrt fallback that raises on ambiguity, instead of silently
  guessing round(sqrt(count)). This fixes the 2x3 / 3x2 torus + mesh cases,
  which previously resolved to a wrong 2x2 grid. Mirrors the test helper's
  _sip_topo_dims precedence (explicit w/h > square fallback > raise).

Test reorganization (tests/sccl/):
- _allreduce_helpers.py: shared plumbing (distributed driver, config writers,
  direct-launch run_allreduce parity reference, sweep/buffer-kind constants,
  plot aggregators, topology-diagram + FSIM-comparison emitters).
- test_allreduce_ring_torus_mesh.py: correctness across ring/torus/mesh.
- test_distributed_default_topology.py: full distributed path on topology.yaml.
- test_plot_latency_sweep.py / test_plot_buffer_kind_sweep.py: sweep rows.
- test_plot_topology_diagram.py / test_plot_comparison_fsim.py: plot emitters.
- test_intercube_root_center.py: moved in (ADR-0032 center-root latency guard).

Also:
- Move the FSIM comparison plot generator out of scripts/ into the sccl suite.
- Delete superseded test files (test_allreduce_multidevice,
  test_distributed_lrab_hierarchical_allreduce, test_allreduce_buffer_kind_sweep)
  and repoint conftest aggregators + the ipcq buffer-kind importers.
- Regenerate the allreduce_latency_plots derived artifacts from the full sweep.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 22:24:43 -07:00
parent ff7d727ddd
commit b610cb0d9a
22 changed files with 745 additions and 759 deletions
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,35 @@
"""Correctness of intercube allreduce across SIP topologies (distributed path).
Routes through init_process_group → mp.spawn → dist.all_reduce for ring_1d,
torus_2d (2×3), and mesh_2d_no_wrap (2×3). Per-rank correctness is asserted
inside the worker; spawn raises on failure.
"""
from __future__ import annotations
import pytest
from tests.sccl._allreduce_helpers import (
CONFIGS,
DEFAULT_N_ELEM,
_crit_ns,
_run_distributed,
_write_temp_configs,
)
@pytest.mark.parametrize(
"algorithm,sip_topology,n_sips,sip_w,sip_h", CONFIGS,
)
def test_allreduce(
tmp_path, monkeypatch, algorithm, sip_topology, n_sips, sip_w, sip_h,
):
topo_path, _ = _write_temp_configs(
tmp_path, sip_topology, n_sips, algorithm,
sip_w=sip_w, sip_h=sip_h,
)
engine, _n_cubes = _run_distributed(
tmp_path, monkeypatch, topo_path,
f"test_{algorithm}_{sip_topology}", DEFAULT_N_ELEM,
)
# A positive critical path confirms the kernel actually ran.
assert _crit_ns(engine) > 0.0
@@ -0,0 +1,47 @@
"""Full distributed path against topology.yaml as-is (no overrides).
The same flow a real DDP training script would use:
init_process_group(backend="ahbm") → mp.spawn → dist.all_reduce.
"""
from __future__ import annotations
from kernbench.runtime_api.context import RuntimeContext
from kernbench.runtime_api.types import DeviceSelector
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology
from tests.sccl._allreduce_helpers import (
DEFAULT_N_ELEM,
TOPOLOGY_PATH,
_worker,
_write_ccl_yaml,
)
def test_distributed_lrab_hierarchical_allreduce(tmp_path, monkeypatch):
monkeypatch.chdir(_write_ccl_yaml(tmp_path))
topo = resolve_topology(str(TOPOLOGY_PATH))
engine = GraphEngine(topo.topology_obj, enable_data=True)
spec = topo.topology_obj.spec
n_sips = int(spec["system"]["sips"]["count"])
cm = spec["sip"]["cube_mesh"]
n_cubes = int(cm["w"]) * int(cm["h"])
with RuntimeContext(
engine=engine,
target_device=DeviceSelector("all"),
correlation_id="dist_intercube_ar",
spec=spec,
) as ctx:
ctx.distributed.init_process_group(backend="ahbm")
assert ctx.distributed.get_world_size() == n_sips
t_start = engine._env.now
ctx.multiprocessing.spawn(
_worker, args=(n_cubes, DEFAULT_N_ELEM, n_sips, ctx),
nprocs=n_sips,
)
t_end = engine._env.now
print(f"\n[distributed] sim latency = "
f"{t_end - t_start:.1f} ns ({(t_end - t_start) / 1000:.3f} us)")
+144
View File
@@ -0,0 +1,144 @@
"""Phase 1 test for moving the lrab_hierarchical_allreduce root cube from the
bottom-right corner (3,3) to the geometric center (2,2).
Today's algorithm (lrab_hierarchical_allreduce.py) hardcodes
``root_cube = (cube_h-1) * cube_w + (cube_w-1)`` (= cube 15 in 4×4).
The intra-SIP critical path for one allreduce is therefore::
Phase 1 (row reduce W→E to col 3) : 3 hops
Phase 2 (col reduce N→S to row 3 on col 3): 3 hops
Phase 3 (inter-SIP at root) : (separate)
Phase 4 (col broadcast S→N) : 3 hops
Phase 5 (row broadcast E→W) : 3 hops
Total intra-SIP critical path : 12 hops
Moving the root to (2,2) and using BIDIRECTIONAL convergence (cols 0..2
go W→E, col 3 goes E→W in parallel; rows 0..2 go N→S, row 3 goes S→N
in parallel) cuts each phase's critical path from 3 hops to 2::
Phase 1 critical path : max(2, 1) = 2 hops
Phase 2 critical path : max(2, 1) = 2 hops
Phase 4 critical path : 2 hops
Phase 5 critical path : 2 hops
Total intra-SIP critical path : 8 hops
Per-hop cost at 96 KB on TCM ≈ 600 ns (slot IO write+read 384 ns +
fabric drain ~217 ns). 4 fewer hops ⇒ ~2.4 µs reduction.
EXPECTED Phase 1 outcome:
- Today (root = corner) : ~22.0 µs ← test FAILS (> 20500 ns)
- After Phase 2 (root = center) : ~19.6 µs ← test PASSES (< 20500 ns)
"""
from __future__ import annotations
from pathlib import Path
import pytest
from kernbench.runtime_api.context import RuntimeContext
from kernbench.runtime_api.types import DeviceSelector
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology
from tests.sccl._allreduce_helpers import (
_write_temp_configs,
run_allreduce,
)
def _run_torus_96kb(tmp_path: Path) -> float:
"""Run torus_2d 6-SIP allreduce at 96 KB / slot, return critical-path
pe_exec_ns. Fixed at TCM (the project default)."""
sub = tmp_path / "torus_root_center"
sub.mkdir()
topo_path, ccl_path = _write_temp_configs(
sub,
sip_topology="torus_2d",
n_sips=6,
algorithm="lrab_hierarchical_allreduce",
sip_w=3, sip_h=2,
n_elem_override=49152, # 49152 × 2 = 96 KB / slot
)
topo = resolve_topology(topo_path)
engine = GraphEngine(topo.topology_obj, enable_data=True)
spec = topo.topology_obj.spec
with RuntimeContext(
engine=engine,
target_device=DeviceSelector("all"),
correlation_id="root_center_phase1",
spec=spec,
) as ctx:
result = run_allreduce(
ctx, engine, spec,
algorithm="lrab_hierarchical_allreduce", ccl_yaml=ccl_path,
)
assert result["ok_cubes"] > 0
pe_exec_vals = [
float(tr.get("pe_exec_ns", 0.0) or 0.0)
for _, (_, tr) in engine._results.items()
if isinstance(tr, dict)
]
return max(pe_exec_vals) if pe_exec_vals else 0.0
def test_intra_sip_critical_path_at_96k_below_threshold(tmp_path):
"""Post-Phase-2 (root=center, bidirectional reduce) the torus_2d
96 KB allreduce on TCM should be meaningfully lower than corner
root with serial reduce.
The absolute number depends on the latency model's fidelity.
Under ADR-0033 Phase 2c (per-flit wire timing, wormhole) the
bottleneck-link transit time is charged once per flit on each
serialized hop, so allreduce numbers are higher than pre-2c
estimates. Threshold widened to 30 µs to accommodate the more
accurate model; the algorithmic property (8-hop center root <
12-hop corner root) is the invariant being asserted.
"""
lat_ns = _run_torus_96kb(tmp_path)
THRESHOLD_NS = 30_000.0
assert lat_ns < THRESHOLD_NS, (
f"torus_2d 6-SIP 96 KB allreduce should land below "
f"{THRESHOLD_NS:.0f} ns post-Phase-2 (root=center, "
f"bidirectional reduce). got {lat_ns:.1f} ns "
f"({lat_ns / 1000:.2f} µs)"
)
def test_correctness_preserved(tmp_path):
"""Smoke check: at small n_elem the new algorithm must still produce
the correct sum across all 96 cubes. ``run_allreduce`` validates
every cube against the expected reduce result (``ok_cubes`` must be
96 = 6 SIPs × 16 cubes).
This guards against the obvious Phase 2 risk: bidirectional reduce
sums each contribution exactly once. If implemented wrong (double-
counting or skipping the right edge column / bottom row), the
asserts inside run_allreduce fail.
"""
sub = tmp_path / "correctness"
sub.mkdir()
topo_path, ccl_path = _write_temp_configs(
sub,
sip_topology="torus_2d",
n_sips=6,
algorithm="lrab_hierarchical_allreduce",
sip_w=3, sip_h=2,
n_elem_override=128, # tiny payload to keep this fast
)
topo = resolve_topology(topo_path)
engine = GraphEngine(topo.topology_obj, enable_data=True)
spec = topo.topology_obj.spec
with RuntimeContext(
engine=engine,
target_device=DeviceSelector("all"),
correlation_id="root_center_correctness",
spec=spec,
) as ctx:
result = run_allreduce(
ctx, engine, spec,
algorithm="lrab_hierarchical_allreduce", ccl_yaml=ccl_path,
)
n_cubes = 6 * 16 # 6 SIPs × 16 cubes/SIP
assert result["ok_cubes"] == n_cubes, (
f"all 96 cubes must validate; got {result['ok_cubes']} OK"
)
+66
View File
@@ -0,0 +1,66 @@
"""Buffer-kind sweep (TCM / SRAM / HBM) on torus_2d 6 SIPs (3×2), distributed.
Each parametrized case writes one JSON row; the conftest sessionfinish hook
calls ``aggregate_buffer_kind_plot`` to emit the comparison PNG + csv. Pre
slot-latency modeling the three lines overlap exactly (slot access is
latency-free today).
"""
from __future__ import annotations
import json
import pytest
import yaml
from tests.sccl._allreduce_helpers import (
_BK_ROWS_DIR,
_ELEM_BYTES_F16,
_bk_params,
_crit_ns,
_run_distributed,
_write_temp_configs,
)
@pytest.mark.parametrize("buffer_kind,n_elem", _bk_params())
def test_buffer_kind_allreduce_one(tmp_path, monkeypatch, buffer_kind, n_elem):
sub = tmp_path / f"{buffer_kind}_{n_elem}"
sub.mkdir()
topo_path, ccl_path = _write_temp_configs(
sub,
sip_topology="torus_2d",
n_sips=6,
algorithm="lrab_hierarchical_allreduce",
sip_w=3, sip_h=2,
n_elem_override=n_elem,
)
# Override buffer_kind in the temp ccl.yaml (read by the ahbm backend
# at init_process_group time via load_ccl_config()).
with open(ccl_path) as f:
ccl_cfg = yaml.safe_load(f)
ccl_cfg.setdefault("defaults", {})["buffer_kind"] = buffer_kind
ccl_cfg.setdefault("algorithms", {}).setdefault(
"lrab_hierarchical_allreduce", {},
)["buffer_kind"] = buffer_kind
with open(ccl_path, "w") as f:
yaml.dump(ccl_cfg, f, default_flow_style=False)
engine, _ = _run_distributed(
sub, monkeypatch, topo_path,
f"bk_sweep_{buffer_kind}_{n_elem}", n_elem,
)
crit_ns = _crit_ns(engine)
bytes_per_pe = n_elem * _ELEM_BYTES_F16
record = {
"buffer_kind": buffer_kind,
"sip_topology": "torus_2d",
"n_sips": 6,
"n_elem": n_elem,
"bytes_per_pe": bytes_per_pe,
"latency_ns": crit_ns,
}
_BK_ROWS_DIR.mkdir(parents=True, exist_ok=True)
row_path = _BK_ROWS_DIR / f"{buffer_kind}_{n_elem}.json"
with open(row_path, "w", encoding="utf-8") as f:
json.dump(record, f)
+23
View File
@@ -0,0 +1,23 @@
"""Emit the broken-y-axis allreduce-vs-FSIM comparison plot.
Post-processes summary.csv (written by the latency sweep) into
comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png. Pure
plotting; reads the on-disk summary.csv (skips if the sweep has never run).
"""
from __future__ import annotations
from pathlib import Path
import pytest
from tests.sccl._allreduce_helpers import (
_SWEEP_OUT_DIR,
emit_comparison_fsim_plot,
)
def test_emit_comparison_fsim_plot():
if not (_SWEEP_OUT_DIR / "summary.csv").exists():
pytest.skip("summary.csv absent; run the latency sweep first")
out = emit_comparison_fsim_plot()
assert out is not None and Path(out).exists()
+58
View File
@@ -0,0 +1,58 @@
"""Allreduce latency sweep (distributed path), xdist-friendly.
Each parametrized case writes one JSON row to the shared staging dir; the
conftest sessionfinish hook calls ``_aggregate_sweep_plots`` to emit the
per-topology PNGs + summary.csv after all cases finish.
"""
from __future__ import annotations
import json
import pytest
from tests.sccl._allreduce_helpers import (
_ELEM_BYTES_F16,
_SWEEP_ROWS_DIR,
_crit_ns,
_run_distributed,
_sweep_params,
_write_temp_configs,
)
@pytest.mark.parametrize(
"algorithm,sip_topology,n_sips,sip_w,sip_h,n_elem", _sweep_params(),
)
def test_allreduce_latency_one(
tmp_path, monkeypatch, algorithm, sip_topology, n_sips, sip_w, sip_h,
n_elem,
):
topo_path, _ = _write_temp_configs(
tmp_path, sip_topology, n_sips, algorithm,
sip_w=sip_w, sip_h=sip_h,
n_elem_override=n_elem,
)
engine, n_cubes = _run_distributed(
tmp_path, monkeypatch, topo_path,
f"sweep_{algorithm}_{sip_topology}_{n_elem}", n_elem,
)
crit_ns = _crit_ns(engine)
bytes_per_sip = n_cubes * n_elem * _ELEM_BYTES_F16
bytes_per_pe = n_elem * _ELEM_BYTES_F16
record = {
"algorithm": algorithm,
"sip_topology": sip_topology,
"n_sips": n_sips,
"n_elem": n_elem,
"bytes_per_pe": bytes_per_pe,
"bytes_per_sip": bytes_per_sip,
"latency_ns": crit_ns,
}
_SWEEP_ROWS_DIR.mkdir(parents=True, exist_ok=True)
row_path = _SWEEP_ROWS_DIR / f"{sip_topology}_{n_elem}.json"
with open(row_path, "w", encoding="utf-8") as f:
json.dump(record, f)
+11
View File
@@ -0,0 +1,11 @@
"""Emit topology.png (device-level + cube-level reduction). Pure plotting; no sim."""
from __future__ import annotations
from pathlib import Path
from tests.sccl._allreduce_helpers import emit_topology_diagram
def test_emit_topology_diagram():
out = emit_topology_diagram()
assert Path(out).exists()