CCL allreduce: rename to lrab_hierarchical_allreduce + descriptive plots
Rename the intercube all-reduce identity to lrab_hierarchical_allreduce
(module, config key, distributed test) so the name reflects both levels it
implements: LRAB intra-SIP (local reduce to center root + broadcast) and the
hierarchical inter-SIP topology exchange (ring/torus/mesh). The ADR-0032 slug
is kept as the stable decision id; this is a pure rename with no logic change.

Also in this batch:
- ADR-0032 (EN+KO): document the shipped center-root bidirectional reduce
  (the doc was stale and still described the corner root); annotate the
  ccl.yaml root_cube as a placeholder.
- Rename the allreduce + pe2pe latency plots to descriptive, title-matching
  filenames and retitle the in-plot headings; drop overview/overview_log.
- Point the PPTX image refs at the new plot names.

Doc + derived-artifact + rename only; no simulation behavior changed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,8 @@
|
||||
Parametrized over (buffer_kind, n_elem). Each case runs the standard
|
||||
config-driven allreduce app and writes a JSON row to a shared staging
|
||||
dir; the conftest sessionfinish hook (added in Phase 1) aggregates
|
||||
rows into ``docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.png``.
|
||||
rows into ``docs/diagrams/allreduce_latency_plots/
|
||||
AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.png``.
|
||||
|
||||
Pre-Phase-2: the three buffer-kind lines overlap exactly because slot
|
||||
access is latency-free today. Post-Phase-2 they spread out (tcm
|
||||
@@ -36,6 +37,8 @@ _ELEM_BYTES_F16 = 2
|
||||
_OUT_DIR = (Path(__file__).parent.parent / "docs" / "diagrams"
|
||||
/ "allreduce_latency_plots")
|
||||
_ROWS_DIR = _OUT_DIR / "_buffer_kind_rows"
|
||||
# Descriptive output stem (shared by the .png and .csv).
|
||||
_OUT_STEM = "AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM"
|
||||
|
||||
|
||||
def _bk_params():
|
||||
@@ -55,7 +58,7 @@ def test_buffer_kind_allreduce_one(tmp_path, buffer_kind, n_elem):
|
||||
sub,
|
||||
sip_topology="torus_2d",
|
||||
n_sips=6,
|
||||
algorithm="intercube_allreduce",
|
||||
algorithm="lrab_hierarchical_allreduce",
|
||||
sip_w=3, sip_h=2,
|
||||
n_elem_override=n_elem,
|
||||
)
|
||||
@@ -64,7 +67,7 @@ def test_buffer_kind_allreduce_one(tmp_path, buffer_kind, n_elem):
|
||||
ccl_cfg = yaml.safe_load(f)
|
||||
ccl_cfg.setdefault("defaults", {})["buffer_kind"] = buffer_kind
|
||||
ccl_cfg.setdefault("algorithms", {}).setdefault(
|
||||
"intercube_allreduce", {},
|
||||
"lrab_hierarchical_allreduce", {},
|
||||
)["buffer_kind"] = buffer_kind
|
||||
with open(ccl_path, "w") as f:
|
||||
yaml.dump(ccl_cfg, f, default_flow_style=False)
|
||||
@@ -81,7 +84,7 @@ def test_buffer_kind_allreduce_one(tmp_path, buffer_kind, n_elem):
|
||||
) as ctx:
|
||||
result = run_allreduce(
|
||||
ctx, engine, spec,
|
||||
algorithm="intercube_allreduce", ccl_yaml=ccl_path,
|
||||
algorithm="lrab_hierarchical_allreduce", ccl_yaml=ccl_path,
|
||||
)
|
||||
assert result["ok_cubes"] > 0
|
||||
|
||||
@@ -108,7 +111,7 @@ def test_buffer_kind_allreduce_one(tmp_path, buffer_kind, n_elem):
|
||||
|
||||
|
||||
def aggregate_buffer_kind_plot() -> bool:
|
||||
"""Read per-config rows and emit buffer_kind_sweep.png + CSV.
|
||||
"""Read per-config rows and emit the descriptive .png + .csv (_OUT_STEM).
|
||||
|
||||
Called from conftest.pytest_sessionfinish (controller-only).
|
||||
Returns True if rows were aggregated.
|
||||
@@ -141,7 +144,7 @@ def aggregate_buffer_kind_plot() -> bool:
|
||||
_bytes_fmt = FuncFormatter(_fmt_bytes)
|
||||
|
||||
_OUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
with open(_OUT_DIR / "buffer_kind_sweep.csv", "w",
|
||||
with open(_OUT_DIR / f"{_OUT_STEM}.csv", "w",
|
||||
newline="", encoding="utf-8") as f:
|
||||
w = csv.DictWriter(f, fieldnames=[
|
||||
"buffer_kind", "sip_topology", "n_sips", "n_elem",
|
||||
@@ -172,13 +175,13 @@ def aggregate_buffer_kind_plot() -> bool:
|
||||
ax.set_xlabel("Bytes per PE (log scale)")
|
||||
ax.set_ylabel("Time (ns)")
|
||||
ax.set_title(
|
||||
"Allreduce torus_2d (6 SIPs, 3×2) — IPCQ slot memory tier"
|
||||
"AllReduce_LRAB_2Dtorus_6SiP(2x3) — IPCQ memory (SRAM, TCM, HBM)"
|
||||
)
|
||||
ax.grid(True, alpha=0.3)
|
||||
ax.legend()
|
||||
ax.xaxis.set_major_formatter(_bytes_fmt)
|
||||
fig.tight_layout()
|
||||
fig.savefig(_OUT_DIR / "buffer_kind_sweep.png", dpi=130)
|
||||
fig.savefig(_OUT_DIR / f"{_OUT_STEM}.png", dpi=130)
|
||||
plt.close(fig)
|
||||
|
||||
for p in row_files:
|
||||
@@ -191,6 +194,6 @@ def aggregate_buffer_kind_plot() -> bool:
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
print(f"\nWrote {_OUT_DIR / 'buffer_kind_sweep.png'} "
|
||||
print(f"\nWrote {_OUT_DIR / f'{_OUT_STEM}.png'} "
|
||||
f"from {len(records)} rows")
|
||||
return True
|
||||
|
||||
@@ -189,15 +189,15 @@ TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
|
||||
CONFIGS = [
|
||||
pytest.param(
|
||||
"intercube_allreduce", "ring_1d", 6, None, None,
|
||||
"lrab_hierarchical_allreduce", "ring_1d", 6, None, None,
|
||||
id="ring_6sip",
|
||||
),
|
||||
pytest.param(
|
||||
"intercube_allreduce", "torus_2d", 6, 2, 3,
|
||||
"lrab_hierarchical_allreduce", "torus_2d", 6, 2, 3,
|
||||
id="torus_6sip_2x3",
|
||||
),
|
||||
pytest.param(
|
||||
"intercube_allreduce", "mesh_2d_no_wrap", 6, 2, 3,
|
||||
"lrab_hierarchical_allreduce", "mesh_2d_no_wrap", 6, 2, 3,
|
||||
id="mesh_6sip_2x3",
|
||||
),
|
||||
]
|
||||
@@ -280,9 +280,9 @@ _SWEEP_N_ELEM = [
|
||||
_ELEM_BYTES_F16 = 2
|
||||
|
||||
_SWEEP_TOPOLOGIES = [
|
||||
("intercube_allreduce", "ring_1d", 6, None, None),
|
||||
("intercube_allreduce", "torus_2d", 6, 2, 3),
|
||||
("intercube_allreduce", "mesh_2d_no_wrap", 6, 2, 3),
|
||||
("lrab_hierarchical_allreduce", "ring_1d", 6, None, None),
|
||||
("lrab_hierarchical_allreduce", "torus_2d", 6, 2, 3),
|
||||
("lrab_hierarchical_allreduce", "mesh_2d_no_wrap", 6, 2, 3),
|
||||
]
|
||||
|
||||
# Shared on-disk staging dir for parametrized sweep rows. Each
|
||||
@@ -440,10 +440,22 @@ def _aggregate_sweep_plots() -> bool:
|
||||
continue
|
||||
xs = [r["bytes_per_pe"] for r in rs]
|
||||
ys = [r["latency_ns"] for r in rs]
|
||||
title = (
|
||||
f"Allreduce latency — {topo_name} "
|
||||
f"(n_sips={rs[0]['n_sips']})"
|
||||
_per_topo_titles = {
|
||||
"ring_1d": "AllReduce_LRAB_Ring1D_6SiP(1x6)",
|
||||
"torus_2d": "AllReduce_LRAB_2Dtorus_6SiP(2x3)",
|
||||
"mesh_2d_no_wrap": "AllReduce_LRAB_2DMesh_6SiP(2x3)",
|
||||
}
|
||||
# Descriptive output filenames (parens → underscores for
|
||||
# markdown/URL safety; topo key stays the summary.csv value).
|
||||
_per_topo_files = {
|
||||
"ring_1d": "AllReduce_LRAB_Ring1D_6SiP_1x6",
|
||||
"torus_2d": "AllReduce_LRAB_2Dtorus_6SiP_2x3",
|
||||
"mesh_2d_no_wrap": "AllReduce_LRAB_2DMesh_6SiP_2x3",
|
||||
}
|
||||
title = _per_topo_titles.get(
|
||||
topo_name, f"Allreduce latency — {topo_name}"
|
||||
)
|
||||
out_stem = _per_topo_files.get(topo_name, topo_name)
|
||||
fig, ax = plt.subplots(figsize=(8, 5))
|
||||
ax.plot(xs, ys, marker="o", color="tab:blue")
|
||||
ax.set_xscale("log", base=2)
|
||||
@@ -453,75 +465,14 @@ def _aggregate_sweep_plots() -> bool:
|
||||
ax.grid(True, alpha=0.3)
|
||||
ax.xaxis.set_major_formatter(_bytes_fmt)
|
||||
fig.tight_layout()
|
||||
fig.savefig(_SWEEP_OUT_DIR / f"{topo_name}.png", dpi=120)
|
||||
fig.savefig(_SWEEP_OUT_DIR / f"{out_stem}.png", dpi=120)
|
||||
plt.close(fig)
|
||||
|
||||
colors = {"ring_1d": "tab:blue", "torus_2d": "tab:orange",
|
||||
"mesh_2d_no_wrap": "tab:green"}
|
||||
|
||||
# ── Hand-derived theoretical model for torus_2d (6 SIPs) ──
|
||||
# Critical-path analysis (per packet, packet = 128 B at NoC):
|
||||
# local intra-SIP reduce + broadcast = 8 hops × 57 ns = 456 ns
|
||||
# global X-direction reduce = 5 UCIe + 1 UAL = 445 ns
|
||||
# global Y-direction reduce = 5 UCIe + 1 UAL = 445 ns
|
||||
# per-packet startup latency = 456 + 445 + 445 = 1346 ns
|
||||
# Packet count is PER CUBE (8 PEs/cube cooperate on the cube tile).
|
||||
# At 6144 packets/cube the pipelined total is 8741 ns, so the
|
||||
# bottleneck-stage interval τ = (8741 − 1346) / (6144 − 1) ≈ 1.204 ns.
|
||||
# T_theoretical(N) = 1346 + (N − 1) × τ
|
||||
# where N = ceil((bytes_per_pe × 8) / 128) = ceil(bytes_per_pe / 16)
|
||||
NOC_PACKET_BYTES = 128
|
||||
PES_PER_CUBE = 8
|
||||
T_STARTUP_NS = 1346.0
|
||||
TAU_NS = (8741.0 - 1346.0) / (6144 - 1) # ≈ 1.2038 ns/packet
|
||||
|
||||
def _theoretical_torus_2d_ns(bytes_per_pe: int) -> float:
|
||||
bytes_per_cube = int(bytes_per_pe) * PES_PER_CUBE
|
||||
n_packets = max(1, -(-bytes_per_cube // NOC_PACKET_BYTES)) # ceil
|
||||
return T_STARTUP_NS + (n_packets - 1) * TAU_NS
|
||||
|
||||
fig, ax = plt.subplots(figsize=(9, 6))
|
||||
for topo_name in topologies:
|
||||
rs = sorted(
|
||||
[r for r in records if r["sip_topology"] == topo_name],
|
||||
key=lambda r: r["bytes_per_pe"],
|
||||
)
|
||||
if not rs:
|
||||
continue
|
||||
ax.plot(
|
||||
[r["bytes_per_pe"] for r in rs],
|
||||
[r["latency_ns"] for r in rs],
|
||||
marker="o",
|
||||
label=f"{topo_name} (n_sips={rs[0]['n_sips']})",
|
||||
color=colors.get(topo_name),
|
||||
)
|
||||
|
||||
# Theoretical torus_2d curve across all payload sizes.
|
||||
torus_rs = sorted(
|
||||
[r for r in records if r["sip_topology"] == "torus_2d"],
|
||||
key=lambda r: r["bytes_per_pe"],
|
||||
)
|
||||
if torus_rs:
|
||||
xs_th = [r["bytes_per_pe"] for r in torus_rs]
|
||||
ys_th = [_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs]
|
||||
ax.plot(
|
||||
xs_th, ys_th,
|
||||
color="tab:red", linestyle="--", linewidth=1.6, marker="x",
|
||||
label="theoretical torus_2d (6 SIPs)",
|
||||
)
|
||||
|
||||
ax.set_xscale("log", base=2)
|
||||
ax.set_xlabel("Bytes per PE (log scale)")
|
||||
ax.set_ylabel("Time (ns)")
|
||||
ax.set_title("Multi-device allreduce latency by topology")
|
||||
ax.grid(True, alpha=0.3)
|
||||
ax.set_xlim(left=min(r["bytes_per_pe"] for r in records) / 2,
|
||||
right=max(r["bytes_per_pe"] for r in records) * 1.5)
|
||||
ax.legend()
|
||||
ax.xaxis.set_major_formatter(_bytes_fmt)
|
||||
fig.tight_layout()
|
||||
fig.savefig(_SWEEP_OUT_DIR / "overview.png", dpi=120)
|
||||
plt.close(fig)
|
||||
# Combined overview.png is no longer emitted — the broken-y-axis
|
||||
# comparison (scripts/emit_overview_with_external_ref.py →
|
||||
# comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png)
|
||||
# supersedes it. Per-topology plots above and summary.csv are still
|
||||
# produced.
|
||||
|
||||
# Cleanup row staging dir so a partial future run doesn't pick up
|
||||
# stale rows.
|
||||
@@ -535,7 +486,7 @@ def _aggregate_sweep_plots() -> bool:
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
print(f"\nWrote {_SWEEP_OUT_DIR / 'overview.png'} "
|
||||
print(f"\nWrote per-topology plots + summary.csv to {_SWEEP_OUT_DIR} "
|
||||
f"from {len(records)} rows")
|
||||
return True
|
||||
|
||||
|
||||
+5
-5
@@ -25,7 +25,7 @@ N_ELEM = 8
|
||||
def _write_ccl_yaml(tmp_path) -> str:
|
||||
body = textwrap.dedent("""\
|
||||
defaults:
|
||||
algorithm: intercube_allreduce
|
||||
algorithm: lrab_hierarchical_allreduce
|
||||
buffer_kind: tcm
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
@@ -34,8 +34,8 @@ def _write_ccl_yaml(tmp_path) -> str:
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
intercube_allreduce:
|
||||
module: kernbench.ccl.algorithms.intercube_allreduce
|
||||
lrab_hierarchical_allreduce:
|
||||
module: kernbench.ccl.algorithms.lrab_hierarchical_allreduce
|
||||
topology: none
|
||||
buffer_kind: tcm
|
||||
n_elem: 8
|
||||
@@ -80,11 +80,11 @@ def _worker(rank: int, n_sips: int, torch) -> None:
|
||||
)
|
||||
|
||||
if rank == 0:
|
||||
print(f"\n intercube_allreduce (ws={n_sips}): "
|
||||
print(f"\n lrab_hierarchical_allreduce (ws={n_sips}): "
|
||||
f"{n_sips * N_CUBES} OK")
|
||||
|
||||
|
||||
def test_distributed_intercube_allreduce(tmp_path, monkeypatch):
|
||||
def test_distributed_lrab_hierarchical_allreduce(tmp_path, monkeypatch):
|
||||
"""Full distributed path: init_process_group → mp.spawn → all_reduce."""
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
@@ -1,7 +1,7 @@
|
||||
"""Phase 1 test for moving the intercube_allreduce root cube from the
|
||||
"""Phase 1 test for moving the lrab_hierarchical_allreduce root cube from the
|
||||
bottom-right corner (3,3) to the geometric center (2,2).
|
||||
|
||||
Today's algorithm (intercube_allreduce.py) hardcodes
|
||||
Today's algorithm (lrab_hierarchical_allreduce.py) hardcodes
|
||||
``root_cube = (cube_h-1) * cube_w + (cube_w-1)`` (= cube 15 in 4×4).
|
||||
The intra-SIP critical path for one allreduce is therefore::
|
||||
|
||||
@@ -55,7 +55,7 @@ def _run_torus_96kb(tmp_path: Path) -> float:
|
||||
sub,
|
||||
sip_topology="torus_2d",
|
||||
n_sips=6,
|
||||
algorithm="intercube_allreduce",
|
||||
algorithm="lrab_hierarchical_allreduce",
|
||||
sip_w=3, sip_h=2,
|
||||
n_elem_override=49152, # 49152 × 2 = 96 KB / slot
|
||||
)
|
||||
@@ -70,7 +70,7 @@ def _run_torus_96kb(tmp_path: Path) -> float:
|
||||
) as ctx:
|
||||
result = run_allreduce(
|
||||
ctx, engine, spec,
|
||||
algorithm="intercube_allreduce", ccl_yaml=ccl_path,
|
||||
algorithm="lrab_hierarchical_allreduce", ccl_yaml=ccl_path,
|
||||
)
|
||||
assert result["ok_cubes"] > 0
|
||||
pe_exec_vals = [
|
||||
@@ -121,7 +121,7 @@ def test_correctness_preserved(tmp_path):
|
||||
sub,
|
||||
sip_topology="torus_2d",
|
||||
n_sips=6,
|
||||
algorithm="intercube_allreduce",
|
||||
algorithm="lrab_hierarchical_allreduce",
|
||||
sip_w=3, sip_h=2,
|
||||
n_elem_override=128, # tiny payload to keep this fast
|
||||
)
|
||||
@@ -136,7 +136,7 @@ def test_correctness_preserved(tmp_path):
|
||||
) as ctx:
|
||||
result = run_allreduce(
|
||||
ctx, engine, spec,
|
||||
algorithm="intercube_allreduce", ccl_yaml=ccl_path,
|
||||
algorithm="lrab_hierarchical_allreduce", ccl_yaml=ccl_path,
|
||||
)
|
||||
n_cubes = 6 * 16 # 6 SIPs × 16 cubes/SIP
|
||||
assert result["ok_cubes"] == n_cubes, (
|
||||
|
||||
@@ -28,7 +28,7 @@ def _engine_and_spec():
|
||||
|
||||
def _merged_cfg():
|
||||
cfg = load_ccl_config()
|
||||
return resolve_algorithm_config(cfg, name="intercube_allreduce")
|
||||
return resolve_algorithm_config(cfg, name="lrab_hierarchical_allreduce")
|
||||
|
||||
|
||||
class TestConfigureSfrNeighborTables:
|
||||
|
||||
@@ -81,7 +81,7 @@ def _run_torus_allreduce(
|
||||
sub,
|
||||
sip_topology="torus_2d",
|
||||
n_sips=6,
|
||||
algorithm="intercube_allreduce",
|
||||
algorithm="lrab_hierarchical_allreduce",
|
||||
sip_w=3, sip_h=2,
|
||||
n_elem_override=n_elem,
|
||||
)
|
||||
@@ -92,7 +92,7 @@ def _run_torus_allreduce(
|
||||
ccl_cfg = yaml.safe_load(f)
|
||||
ccl_cfg.setdefault("defaults", {})["buffer_kind"] = buffer_kind
|
||||
ccl_cfg.setdefault("algorithms", {}).setdefault(
|
||||
"intercube_allreduce", {},
|
||||
"lrab_hierarchical_allreduce", {},
|
||||
)["buffer_kind"] = buffer_kind
|
||||
with open(ccl_path, "w") as f:
|
||||
yaml.dump(ccl_cfg, f, default_flow_style=False)
|
||||
@@ -109,7 +109,7 @@ def _run_torus_allreduce(
|
||||
) as ctx:
|
||||
result = run_allreduce(
|
||||
ctx, engine, spec,
|
||||
algorithm="intercube_allreduce", ccl_yaml=ccl_path,
|
||||
algorithm="lrab_hierarchical_allreduce", ccl_yaml=ccl_path,
|
||||
)
|
||||
assert result["ok_cubes"] > 0, "allreduce did not validate"
|
||||
|
||||
|
||||
@@ -68,7 +68,7 @@ def _run_allreduce_with_buffer_kind(
|
||||
sub,
|
||||
sip_topology="torus_2d",
|
||||
n_sips=6,
|
||||
algorithm="intercube_allreduce",
|
||||
algorithm="lrab_hierarchical_allreduce",
|
||||
sip_w=3, sip_h=2,
|
||||
n_elem_override=n_elem,
|
||||
)
|
||||
@@ -77,7 +77,7 @@ def _run_allreduce_with_buffer_kind(
|
||||
ccl_cfg = yaml.safe_load(f)
|
||||
ccl_cfg.setdefault("defaults", {})["buffer_kind"] = buffer_kind
|
||||
ccl_cfg.setdefault("algorithms", {}).setdefault(
|
||||
"intercube_allreduce", {},
|
||||
"lrab_hierarchical_allreduce", {},
|
||||
)["buffer_kind"] = buffer_kind
|
||||
with open(ccl_path, "w") as f:
|
||||
yaml.dump(ccl_cfg, f, default_flow_style=False)
|
||||
@@ -94,7 +94,7 @@ def _run_allreduce_with_buffer_kind(
|
||||
) as ctx:
|
||||
result = run_allreduce(
|
||||
ctx, engine, spec,
|
||||
algorithm="intercube_allreduce", ccl_yaml=ccl_path,
|
||||
algorithm="lrab_hierarchical_allreduce", ccl_yaml=ccl_path,
|
||||
)
|
||||
assert result["ok_cubes"] > 0, "allreduce did not validate"
|
||||
|
||||
|
||||
@@ -472,7 +472,7 @@ def _run_ipcq():
|
||||
dst_sip, dst_cube, dst_pe = DST
|
||||
|
||||
cfg = load_ccl_config()
|
||||
merged = resolve_algorithm_config(cfg, name="intercube_allreduce")
|
||||
merged = resolve_algorithm_config(cfg, name="lrab_hierarchical_allreduce")
|
||||
merged["slot_size"] = max(int(merged.get("slot_size", 4096)), NBYTES)
|
||||
|
||||
with RuntimeContext(
|
||||
|
||||
@@ -56,13 +56,17 @@ class Hop:
|
||||
|
||||
|
||||
HOPS = [
|
||||
Hop("h1_intra_horizontal", "Intra-cube horizontal (pe0 to pe1)",
|
||||
Hop("latency_intracube_PE0_to_PE1_horizontal",
|
||||
"Intra-cube PE-to-PE latency: PE0 → PE1 (horizontal)",
|
||||
(0, 0, 0), (0, 0, 1), "intra_E", "intra_W", True),
|
||||
Hop("h2_intra_vertical", "Intra-cube vertical (pe0 to pe4)",
|
||||
Hop("latency_intracube_PE0_to_PE4_vertical",
|
||||
"Intra-cube PE-to-PE latency: PE0 → PE4 (vertical)",
|
||||
(0, 0, 0), (0, 0, 4), "intra_S", "intra_N", True),
|
||||
Hop("h3_inter_cube_horizontal", "Inter-cube horizontal (cube0 to cube1)",
|
||||
Hop("latency_intercube_C0PE0_to_C1PE0_horizontal",
|
||||
"Inter-cube PE-to-PE latency: Cube0.PE0 → Cube1.PE0 (horizontal)",
|
||||
(0, 0, 0), (0, 1, 0), "E", "W", True),
|
||||
Hop("h4_inter_cube_vertical", "Inter-cube vertical (cube0 to cube4)",
|
||||
Hop("latency_intercube_C0PE0_to_C4PE0_vertical",
|
||||
"Inter-cube PE-to-PE latency: Cube0.PE0 → Cube4.PE0 (vertical)",
|
||||
(0, 0, 0), (0, 4, 0), "S", "N", True),
|
||||
]
|
||||
|
||||
@@ -80,7 +84,7 @@ def _measure_ipcq(hop: Hop, nbytes: int) -> float:
|
||||
engine, spec = _make_engine()
|
||||
|
||||
cfg = load_ccl_config()
|
||||
merged = resolve_algorithm_config(cfg, name="intercube_allreduce")
|
||||
merged = resolve_algorithm_config(cfg, name="lrab_hierarchical_allreduce")
|
||||
merged["slot_size"] = max(int(merged.get("slot_size", 4096)), nbytes)
|
||||
|
||||
n_elem = nbytes // ELEM_BYTES
|
||||
|
||||
Reference in New Issue
Block a user