CCL allreduce: rename to lrab_hierarchical_allreduce + descriptive plots
Rename the intercube all-reduce identity to lrab_hierarchical_allreduce (module, config key, distributed test) so the name reflects both levels it implements: LRAB intra-SIP (local reduce to center root + broadcast) and the hierarchical inter-SIP topology exchange (ring/torus/mesh). ADR-0032 slug kept as the stable decision id; pure rename, no logic change. Also in this batch: - ADR-0032 (EN+KO): document the shipped center-root bidirectional reduce (doc was stale corner-root); annotate ccl.yaml root_cube as a placeholder. - Rename allreduce + pe2pe latency plots to descriptive, title-matching filenames and retitle the in-plot headings; drop overview/overview_log. - Point the PPTX image refs at the new plot names. Doc + derived-artifact + rename only; no simulation behavior changed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,119 @@
|
||||
"""End-to-end distributed test for intercube allreduce.
|
||||
|
||||
Exercises the full process-group path:
|
||||
dist.init_process_group(backend="ahbm")
|
||||
→ mp.spawn(nprocs=n_sips)
|
||||
→ each worker: set_device → allocate → fill → dist.all_reduce → verify
|
||||
|
||||
This is the same flow a real DDP training script would use.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
|
||||
N_CUBES = 16
|
||||
N_ELEM = 8
|
||||
|
||||
|
||||
def _write_ccl_yaml(tmp_path) -> str:
|
||||
body = textwrap.dedent("""\
|
||||
defaults:
|
||||
algorithm: lrab_hierarchical_allreduce
|
||||
buffer_kind: tcm
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
lrab_hierarchical_allreduce:
|
||||
module: kernbench.ccl.algorithms.lrab_hierarchical_allreduce
|
||||
topology: none
|
||||
buffer_kind: tcm
|
||||
n_elem: 8
|
||||
root_cube: 15
|
||||
""")
|
||||
(tmp_path / "ccl.yaml").write_text(body)
|
||||
return str(tmp_path)
|
||||
|
||||
|
||||
def _worker(rank: int, n_sips: int, torch) -> None:
|
||||
"""Per-SIP worker: allocate, fill, all_reduce, verify."""
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
torch.ahbm.set_device(rank)
|
||||
|
||||
dp = DPPolicy(
|
||||
cube="row_wise", pe="replicate",
|
||||
num_pes=1, num_cubes=N_CUBES,
|
||||
)
|
||||
tensor = torch.zeros(
|
||||
(N_CUBES, N_ELEM), dtype="f16", dp=dp,
|
||||
name=f"sip{rank}",
|
||||
)
|
||||
|
||||
init_arr = np.full((N_CUBES, N_ELEM), float(rank + 1), dtype=np.float16)
|
||||
tensor.copy_(torch.from_numpy(init_arr))
|
||||
|
||||
print(f"[SIP {rank}] input cube0[:4] = {tensor.numpy()[0][:4].tolist()}")
|
||||
|
||||
torch.distributed.all_reduce(tensor, op="sum")
|
||||
|
||||
arr = tensor.numpy()
|
||||
expected = float(N_CUBES * sum(range(1, n_sips + 1)))
|
||||
|
||||
print(f"[SIP {rank}] output cube0[:4] = {arr[0][:4].tolist()}")
|
||||
print(f"[SIP {rank}] output cube15[:4] = {arr[15][:4].tolist()}")
|
||||
|
||||
for cube_id in range(N_CUBES):
|
||||
assert np.allclose(arr[cube_id], expected, rtol=1e-1, atol=1e-1), (
|
||||
f"SIP{rank} cube {cube_id}: "
|
||||
f"got {arr[cube_id][:4]}, expected {expected}"
|
||||
)
|
||||
|
||||
if rank == 0:
|
||||
print(f"\n lrab_hierarchical_allreduce (ws={n_sips}): "
|
||||
f"{n_sips * N_CUBES} OK")
|
||||
|
||||
|
||||
def test_distributed_lrab_hierarchical_allreduce(tmp_path, monkeypatch):
|
||||
"""Full distributed path: init_process_group → mp.spawn → all_reduce."""
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
monkeypatch.chdir(_write_ccl_yaml(tmp_path))
|
||||
|
||||
topo = resolve_topology(str(TOPOLOGY_PATH))
|
||||
engine = GraphEngine(topo.topology_obj, enable_data=True)
|
||||
spec = topo.topology_obj.spec
|
||||
n_sips = int(spec["system"]["sips"]["count"])
|
||||
|
||||
with RuntimeContext(
|
||||
engine=engine,
|
||||
target_device=DeviceSelector("all"),
|
||||
correlation_id="dist_intercube_ar",
|
||||
spec=spec,
|
||||
) as ctx:
|
||||
ctx.distributed.init_process_group(backend="ahbm")
|
||||
|
||||
assert ctx.distributed.get_world_size() == n_sips
|
||||
|
||||
t_start = engine._env.now
|
||||
|
||||
ctx.multiprocessing.spawn(
|
||||
_worker, args=(n_sips, ctx), nprocs=n_sips,
|
||||
)
|
||||
|
||||
t_end = engine._env.now
|
||||
print(f"\n[distributed] sim latency = "
|
||||
f"{t_end - t_start:.1f} ns ({(t_end - t_start) / 1000:.3f} us)")
|
||||
Reference in New Issue
Block a user