ff7d727ddd
Rename the intercube all-reduce identity to lrab_hierarchical_allreduce (module, config key, distributed test) so the name reflects both levels it implements: LRAB intra-SIP (local reduce to center root + broadcast) and the hierarchical inter-SIP topology exchange (ring/torus/mesh). ADR-0032 slug kept as the stable decision id; pure rename, no logic change. Also in this batch: - ADR-0032 (EN+KO): document the shipped center-root bidirectional reduce (doc was stale corner-root); annotate ccl.yaml root_cube as a placeholder. - Rename allreduce + pe2pe latency plots to descriptive, title-matching filenames and retitle the in-plot headings; drop overview/overview_log. - Point the PPTX image refs at the new plot names. Doc + derived-artifact + rename only; no simulation behavior changed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
200 lines
6.1 KiB
Python
200 lines
6.1 KiB
Python
"""Phase 1 buffer-kind allreduce sweep — torus_2d 6 SIPs.
|
|
|
|
Parametrized over (buffer_kind, n_elem). Each case runs the standard
|
|
config-driven allreduce app and writes a JSON row to a shared staging
|
|
dir; the conftest sessionfinish hook (added in Phase 1) aggregates
|
|
rows into ``docs/diagrams/allreduce_latency_plots/
|
|
AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.png``.
|
|
|
|
Pre-Phase-2: the three buffer-kind lines overlap exactly because slot
|
|
access is latency-free today. Post-Phase-2 they spread out (tcm
|
|
fastest, hbm slowest).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from kernbench.runtime_api.context import RuntimeContext
|
|
from kernbench.runtime_api.types import DeviceSelector
|
|
from kernbench.sim_engine.engine import GraphEngine
|
|
from kernbench.topology.builder import resolve_topology
|
|
|
|
# Reuse the allreduce app helpers.
|
|
from tests.test_allreduce_multidevice import (
|
|
_write_temp_configs,
|
|
run_allreduce,
|
|
)
|
|
|
|
|
|
# Sweep axes: every buffer kind is crossed with every element count.
_BUFFER_KINDS = [
    "tcm",
    "sram",
    "hbm",
]
# With 2-byte elements this spans 256 B → 64 KB per slot.
_N_ELEM_GRID = [
    128,
    1024,
    8192,
    32768,
]
# Bytes per element; multiplied by n_elem to get bytes_per_pe.
_ELEM_BYTES_F16 = 2

# Output directory for the aggregated artifacts, located relative to the
# directory two levels above this file.
_OUT_DIR = Path(__file__).parent.parent.joinpath(
    "docs", "diagrams", "allreduce_latency_plots",
)
# Staging directory holding one JSON row per sweep case.
_ROWS_DIR = _OUT_DIR.joinpath("_buffer_kind_rows")
# Descriptive output stem (shared by the .png and .csv).
_OUT_STEM = "AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM"
|
|
|
|
|
|
def _bk_params():
    """Build the pytest parameter list for the buffer-kind sweep.

    Returns one ``pytest.param(buffer_kind, n_elem)`` for every combination
    of ``_BUFFER_KINDS`` and ``_N_ELEM_GRID``. The buffer kind varies slowest,
    and each case gets the id ``"<buffer_kind>-n_elem<n_elem>"``.
    """
    return [
        pytest.param(kind, count, id=f"{kind}-n_elem{count}")
        for kind in _BUFFER_KINDS
        for count in _N_ELEM_GRID
    ]
|
|
|
|
|
|
@pytest.mark.parametrize("buffer_kind,n_elem", _bk_params())
def test_buffer_kind_allreduce_one(tmp_path, buffer_kind, n_elem):
    """Run one (buffer_kind, n_elem) point of the sweep and stage its row.

    Steps:
      1. Generate temporary topology / ccl configs for a 6-SIP (3x2)
         ``torus_2d`` setup with ``n_elem`` elements.
      2. Rewrite the temporary ccl YAML so both the ``defaults`` section and
         the algorithm's own section carry the requested ``buffer_kind``.
      3. Run the allreduce inside a ``RuntimeContext`` and require
         ``result["ok_cubes"] > 0``.
      4. Take the largest ``pe_exec_ns`` among the engine's dict-valued
         results as the latency and write it, with the case parameters, to
         ``_ROWS_DIR/<buffer_kind>_<n_elem>.json``.

    Every case writes its own row file, so xdist can run cases in parallel.
    """
    algo = "lrab_hierarchical_allreduce"
    case_tag = f"{buffer_kind}_{n_elem}"

    work_dir = tmp_path / case_tag
    work_dir.mkdir()
    topo_path, ccl_path = _write_temp_configs(
        work_dir,
        sip_topology="torus_2d",
        n_sips=6,
        algorithm=algo,
        sip_w=3,
        sip_h=2,
        n_elem_override=n_elem,
    )

    # Override buffer_kind in the temp ccl.yaml: once under "defaults" and
    # once under the algorithm-specific section.
    with open(ccl_path) as src:
        ccl_cfg = yaml.safe_load(src)
    defaults_section = ccl_cfg.setdefault("defaults", {})
    defaults_section["buffer_kind"] = buffer_kind
    algo_section = ccl_cfg.setdefault("algorithms", {}).setdefault(algo, {})
    algo_section["buffer_kind"] = buffer_kind
    with open(ccl_path, "w") as dst:
        yaml.dump(ccl_cfg, dst, default_flow_style=False)

    topo = resolve_topology(topo_path)
    engine = GraphEngine(topo.topology_obj, enable_data=True)
    spec = topo.topology_obj.spec

    runtime = RuntimeContext(
        engine=engine,
        target_device=DeviceSelector("all"),
        correlation_id=f"bk_sweep_{case_tag}",
        spec=spec,
    )
    with runtime as ctx:
        result = run_allreduce(
            ctx, engine, spec, algorithm=algo, ccl_yaml=ccl_path,
        )
        assert result["ok_cubes"] > 0

    # Latency for this case = the largest pe_exec_ns over all dict-valued
    # traces; missing or falsy values count as 0.0, and so does "no traces".
    crit_ns = max(
        (
            float(trace.get("pe_exec_ns", 0.0) or 0.0)
            for _key, (_head, trace) in engine._results.items()
            if isinstance(trace, dict)
        ),
        default=0.0,
    )

    record = dict(
        buffer_kind=buffer_kind,
        sip_topology="torus_2d",
        n_sips=6,
        n_elem=n_elem,
        bytes_per_pe=n_elem * _ELEM_BYTES_F16,
        latency_ns=crit_ns,
    )
    _ROWS_DIR.mkdir(parents=True, exist_ok=True)
    row_path = _ROWS_DIR / f"{case_tag}.json"
    row_path.write_text(json.dumps(record), encoding="utf-8")
|
|
|
|
|
|
def _fmt_bytes_tick(x, _pos):
    """Format a byte count as a short tick label (B / KB / MB, 1024-based)."""
    if x <= 0:
        return "0"
    if x >= 1024 * 1024:
        return f"{x / (1024 * 1024):.0f} MB"
    if x >= 1024:
        return f"{x / 1024:.0f} KB"
    return f"{x:.0f} B"


def _read_row_records(row_files):
    """Load one JSON record per staged row file, in the given order."""
    records = []
    for path in row_files:
        with open(path, encoding="utf-8") as f:
            records.append(json.load(f))
    return records


def _write_rows_csv(records, csv_path):
    """Write the records to *csv_path*, sorted by (buffer_kind, bytes_per_pe)."""
    import csv

    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "buffer_kind", "sip_topology", "n_sips", "n_elem",
            "bytes_per_pe", "latency_ns",
        ])
        writer.writeheader()
        for row in sorted(
            records,
            key=lambda r: (r["buffer_kind"], r["bytes_per_pe"]),
        ):
            writer.writerow(row)


def _render_latency_plot(records, png_path):
    """Plot latency vs. bytes per PE, one line per buffer kind, to *png_path*."""
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter

    colors = {"tcm": "tab:blue", "sram": "tab:orange", "hbm": "tab:red"}
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        for bk in ["tcm", "sram", "hbm"]:
            rs = sorted(
                [r for r in records if r["buffer_kind"] == bk],
                key=lambda r: r["bytes_per_pe"],
            )
            if not rs:
                # No rows for this buffer kind: leave it out of the plot.
                continue
            ax.plot(
                [r["bytes_per_pe"] for r in rs],
                [r["latency_ns"] for r in rs],
                marker="o", lw=2.0,
                color=colors[bk], label=f"buffer_kind = {bk}",
            )
        ax.set_xscale("log", base=2)
        ax.set_xlabel("Bytes per PE (log scale)")
        ax.set_ylabel("Time (ns)")
        ax.set_title(
            "AllReduce_LRAB_2Dtorus_6SiP(2x3) — IPCQ memory (SRAM, TCM, HBM)"
        )
        ax.grid(True, alpha=0.3)
        ax.legend()
        ax.xaxis.set_major_formatter(FuncFormatter(_fmt_bytes_tick))
        fig.tight_layout()
        fig.savefig(png_path, dpi=130)
    finally:
        # Always release the figure, including when plotting or saving fails;
        # otherwise it stays registered with pyplot.
        plt.close(fig)


def _discard_row_files(row_files):
    """Best-effort removal of the consumed row files and the staging dir."""
    for path in row_files:
        try:
            path.unlink()
        except OSError:
            # Deliberately ignored: a leftover row file is not an error.
            pass
    try:
        # Only succeeds when the directory is empty.
        _ROWS_DIR.rmdir()
    except OSError:
        pass


def aggregate_buffer_kind_plot() -> bool:
    """Read per-config rows and emit the descriptive .png + .csv (_OUT_STEM).

    Collects every ``*.json`` row staged in ``_ROWS_DIR``, writes them as a
    CSV sorted by (buffer_kind, bytes_per_pe), renders the latency plot, then
    removes the consumed row files and (if empty) the staging directory.

    Called from conftest.pytest_sessionfinish (controller-only).

    Returns:
        True if rows were aggregated; False when the staging directory does
        not exist or contains no ``*.json`` rows (nothing is written then).
    """
    if not _ROWS_DIR.exists():
        return False
    row_files = sorted(_ROWS_DIR.glob("*.json"))
    if not row_files:
        return False

    records = _read_row_records(row_files)

    # Import matplotlib lazily (only when there are rows to plot) and before
    # any artifact is written, so a missing dependency fails up front.
    import matplotlib.pyplot  # noqa: F401

    _OUT_DIR.mkdir(parents=True, exist_ok=True)
    _write_rows_csv(records, _OUT_DIR / f"{_OUT_STEM}.csv")

    png_path = _OUT_DIR / f"{_OUT_STEM}.png"
    _render_latency_plot(records, png_path)

    # Rows are consumed only after both artifacts were written successfully.
    _discard_row_files(row_files)

    print(f"\nWrote {png_path} from {len(records)} rows")
    return True
|