1d8b9401e5
New intercube allreduce kernel replacing the old flat ring algorithms. Reduces across the 4x4 cube mesh within each SIP (pe0-only, same-lane), then inter-SIP exchange on root cube, then broadcast back. Supports ring_1d, torus_2d, and mesh_2d_no_wrap SIP topologies driven by topology.yaml. Integrated with dist.init_process_group / dist.all_reduce. New files: - src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel) - src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip) - tests/test_allreduce_multidevice.py (config-driven, 3 topologies) - tests/test_distributed_intercube_allreduce.py (full distributed path) - tests/test_intercube_sfr_config.py (SFR wiring verification) Modified: - distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip - topologies.py: added torus_2d, mesh_2d_no_wrap - install.py: global_E/W/N/S in _OPPOSITE_DIR - topology.yaml: added system.sips.topology - ccl.yaml: single intercube_allreduce algorithm - benches/ccl_allreduce.py: row_wise cube-mesh tensor layout Removed old flat-ring algorithms and their tests. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
44 lines
1.3 KiB
Python
44 lines
1.3 KiB
Python
"""Tests for CCL backend install (ADR-0023 D10/D11)."""
|
|
from __future__ import annotations
|
|
|
|
from kernbench.ccl.install import (
|
|
linear_rank_to_pe,
|
|
load_ccl_config,
|
|
resolve_algorithm_config,
|
|
)
|
|
from kernbench.sim_engine.engine import GraphEngine
|
|
from kernbench.topology.builder import resolve_topology
|
|
|
|
|
|
def _engine():
    """Build a data-enabled ``GraphEngine`` from ``topology.yaml``.

    Returns:
        A ``(engine, topology)`` pair, where ``topology`` is the
        ``topology_obj`` of the resolved topology and ``engine`` is a
        ``GraphEngine`` constructed over it with ``enable_data=True``.
    """
    resolved = resolve_topology("topology.yaml")
    topology = resolved.topology_obj
    graph_engine = GraphEngine(topology, enable_data=True)
    return graph_engine, topology
|
|
|
|
|
|
def test_load_ccl_config():
    """The loaded CCL config exposes both required top-level sections."""
    config = load_ccl_config()
    for section in ("defaults", "algorithms"):
        assert section in config
|
|
|
|
|
|
def test_resolve_algorithm_config_default():
    """Resolving with no overrides yields the default algorithm.

    Also checks that, when the merged config carries a ``world_size``
    entry, it is at least 1.
    """
    config = load_ccl_config()
    resolved = resolve_algorithm_config(config)

    expected_algorithm = config["defaults"]["algorithm"]
    assert resolved["algorithm"] == expected_algorithm

    # ``world_size`` is optional; only validate it when present.
    if "world_size" in resolved:
        assert resolved["world_size"] >= 1
|
|
|
|
|
|
def test_linear_rank_to_pe():
    """Check ``linear_rank_to_pe`` on low ranks and sanity-check system size.

    Ranks 0 and 7 must map to ``(0, 0, 0)`` and ``(0, 0, 7)`` respectively,
    and the PE count derived from the topology spec must be at least 8.
    """
    # Only the topology is needed here; the engine itself is not used.
    _, topology = _engine()
    spec = topology.spec

    rank_cases = (
        (0, (0, 0, 0)),
        (7, (0, 0, 7)),
    )
    for rank, expected in rank_cases:
        assert linear_rank_to_pe(rank, spec) == expected

    # Total PEs = SIP count * cubes per SIP * PEs per cube.
    cube_mesh = spec["sip"]["cube_mesh"]
    cubes_per_sip = cube_mesh["w"] * cube_mesh["h"]
    pe_layout = spec["cube"]["pe_layout"]
    pes_per_sip = (
        cubes_per_sip
        * pe_layout["pe_per_corner"]
        * len(pe_layout["corners"])
    )
    sip_count = spec["system"]["sips"]["count"]
    total_pes = sip_count * pes_per_sip
    assert total_pes >= 8
|