bcf941dcee
Test matrix restructure: - 256-rank full-system ring runs only ONCE (marked pytest.mark.slow) instead of 7× across matrix + perf tests. Cross-SIP routing is verified by the single run; buffer variants (tcm/hbm/sram) are tested at 8-rank where they finish in <0.5s. - Performance tests use 8-rank instead of 256-rank. - `pytest -m "not slow"` completes in ~2.5min (local dev). - Full suite including slow: ~6min (CI). DataExecutor optimization: - Remove ThreadPoolExecutor from DataExecutor.run(). Same-t_start groups are almost always size 1, so the thread pool creation and dispatch overhead dominated. Simple sequential loop is faster. - Skip dma_read ops at the loop level (they are always no-ops in Phase 2 but were dispatched through _execute_op → _execute_memory). - Remove redundant CLI Phase 2 re-execution: engine._flush_data_phase already replays during engine.wait(); the CLI now only prints the diagnostic summary without re-running DataExecutor. 502 tests pass. Wall time: 25m30s → 5m43s (full), 2m28s (no slow). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
88 lines
2.8 KiB
Python
88 lines
2.8 KiB
Python
"""CCL performance validation tests (ADR-0023 D13 T5).

Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench.

Uses an 8-rank (single-cube) ring for all buffer variants — the latency
model is topology-aware, so ``buffer_kind`` differences are visible even at
small scale. Full-system (256-rank) cross-SIP latency is covered by the
slow test ``test_ccl_allreduce_matrix[ring_full_system]``.
"""
|
|
from __future__ import annotations
|
|
|
|
import importlib
|
|
import os
|
|
|
|
import pytest
|
|
|
|
from kernbench.runtime_api.bench_runner import run_bench
|
|
from kernbench.runtime_api.types import resolve_device
|
|
from kernbench.sim_engine.engine import GraphEngine
|
|
from kernbench.topology.builder import resolve_topology
|
|
|
|
|
|
def _engine_factory(topology, device):
    """Build the ``GraphEngine`` handed to ``run_bench`` by these tests.

    ``topology`` may be a wrapper exposing a ``topology_obj`` attribute or the
    topology object itself; whichever is found is passed to the engine, with
    ``enable_data=True``. ``device`` is accepted but not used here.
    """
    # Unwrap when a ``topology_obj`` attribute exists, otherwise use as-is.
    topo_obj = getattr(topology, "topology_obj", topology)
    return GraphEngine(topo_obj, enable_data=True)
|
|
|
|
|
|
def _run_8rank(algorithm: str, buffer_kind: str = "tcm") -> float:
    """Run the unified all-reduce bench once with a temporary ``ccl.yaml``.

    A ``ccl.yaml`` overlay (``world_size: 8``, ``topology: ring_1d``) is
    written into a fresh temporary directory, which becomes the current
    working directory while the bench runs; the previous working directory
    is always restored afterwards.

    Args:
        algorithm: Name written both as ``defaults.algorithm`` and as the
            single entry under ``algorithms`` in the overlay.
        buffer_kind: Value written to both ``buffer_kind`` keys.

    Returns:
        ``total_ns`` of the last trace whose ``phase`` is ``"kernel"``,
        as a float (``0.0`` when that trace has no ``total_ns`` key).

    Raises:
        AssertionError: if the run did not complete or no kernel trace
            was produced.
    """
    import tempfile

    # NOTE(review): the YAML nesting below is the conventional layout for
    # these keys — confirm against the ccl.yaml schema the bench expects.
    overlay_yaml = f"""\
defaults:
  algorithm: {algorithm}
  buffer_kind: {buffer_kind}
  backpressure: sleep
  n_slots: 4
  slot_size: 4096
  vc_chunk_size: 256
  ipcq_credit_size_bytes: 16

algorithms:
  {algorithm}:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: {buffer_kind}
    world_size: 8
    n_elem: 32
"""
    # Resolved to an absolute path before chdir so it stays valid afterwards.
    tests_dir = os.path.dirname(__file__)
    project_root = os.path.abspath(os.path.join(tests_dir, ".."))

    with tempfile.TemporaryDirectory() as workdir:
        overlay_path = os.path.join(workdir, "ccl.yaml")
        with open(overlay_path, "w") as fh:
            fh.write(overlay_yaml)

        saved_cwd = os.getcwd()
        # Presumably the bench picks up ccl.yaml from the current directory.
        os.chdir(workdir)
        try:
            topology_path = os.path.join(project_root, "topology.yaml")
            topo = resolve_topology(topology_path)
            bench_mod = importlib.import_module("benches.ccl_allreduce")
            result = run_bench(
                topology=topo,
                bench_fn=bench_mod.run,
                device=resolve_device("all"),
                engine_factory=_engine_factory,
            )
        finally:
            os.chdir(saved_cwd)

    assert result.completion.ok, f"{algorithm} did not complete"

    # Keep only kernel-phase traces; the last one carries the reported time.
    kernel_traces = [
        trace for trace in (result.traces or [])
        if trace.get("phase") == "kernel"
    ]
    assert kernel_traces, f"{algorithm} produced no kernel trace"
    return float(kernel_traces[-1].get("total_ns", 0.0))
|
|
|
|
|
|
@pytest.mark.parametrize("buffer_kind", ["tcm", "hbm", "sram"])
def test_ccl_latency_positive(buffer_kind):
    """Each buffer kind yields a strictly positive simulated latency.

    The algorithm name passed to the 8-rank run is
    ``ring_allreduce_<buffer_kind>``.
    """
    simulated_ns = _run_8rank(f"ring_allreduce_{buffer_kind}", buffer_kind)
    assert simulated_ns > 0
|
|
|
|
|
|
def test_ccl_latency_under_reasonable_bound():
    """The 8-rank TCM ring all-reduce must simulate in under 1 ms.

    Uses the same overlay as the other tests (``n_elem: 32``); the bound is
    expressed in simulated nanoseconds.
    """
    one_ms_in_ns = 1_000_000
    simulated_ns = _run_8rank("ring_allreduce_tcm", "tcm")
    assert simulated_ns < one_ms_in_ns  # < 1 ms simulated
|