Files
kernbench2/tests/test_ccl_performance.py
T
ywkang bcf941dcee Speed up regression: 25min → 6min (test matrix + DataExecutor cleanup)
Test matrix restructure:
- 256-rank full-system ring runs only ONCE (marked pytest.mark.slow)
  instead of 7× across matrix + perf tests. Cross-SIP routing is
  verified by the single run; buffer variants (tcm/hbm/sram) are
  tested at 8-rank where they finish in <0.5s.
- Performance tests use 8-rank instead of 256-rank.
- `pytest -m "not slow"` completes in ~2.5min (local dev).
- Full suite including slow: ~6min (CI).

DataExecutor optimization:
- Remove ThreadPoolExecutor from DataExecutor.run(). Same-t_start
  groups are almost always size 1, so the thread pool creation and
  dispatch overhead dominated. Simple sequential loop is faster.
- Skip dma_read ops at the loop level (they are always no-ops in
  Phase 2 but were dispatched through _execute_op → _execute_memory).
- Remove redundant CLI Phase 2 re-execution: engine._flush_data_phase
  already replays during engine.wait(); the CLI now only prints the
  diagnostic summary without re-running DataExecutor.

502 tests pass. Wall time: 25m30s → 5m43s (full), 2m28s (no slow).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 20:52:07 -07:00

88 lines
2.8 KiB
Python

"""CCL performance validation tests (ADR-0023 D13 T5).
Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench.
Uses 8-rank (single cube) for all buffer variants — the latency model
is topology-aware, so buffer_kind differences are visible even at small
scale. Full-system (256-rank) cross-SIP latency is covered by the
``test_ccl_allreduce_matrix[ring_full_system]`` slow test.
"""
from __future__ import annotations
import importlib
import os
import pytest
from kernbench.runtime_api.bench_runner import run_bench
from kernbench.runtime_api.types import resolve_device
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology
def _engine_factory(topology, device):
    """Build a data-enabled ``GraphEngine`` for the given topology.

    ``topology`` may either expose the underlying object through a
    ``topology_obj`` attribute or be that object itself; in the latter case
    it is passed to the engine unchanged.

    ``device`` is accepted but not used here.
    """
    topo_obj = getattr(topology, "topology_obj", topology)
    return GraphEngine(topo_obj, enable_data=True)
def _run_8rank(algorithm: str, buffer_kind: str = "tcm") -> float:
    """Run an 8-rank ring via the unified bench with a tmp ``ccl.yaml`` overlay.

    A throwaway ``ccl.yaml`` is written into a temporary directory and the
    process working directory is switched to that directory while the bench
    runs (presumably the bench reads the overlay from the cwd -- confirm
    against ``benches.ccl_allreduce``). The previous working directory is
    always restored, even when the bench raises.

    Args:
        algorithm: Name written to ``defaults.algorithm`` and used as the key
            of the single entry under ``algorithms``.
        buffer_kind: Value written to both ``buffer_kind`` fields of the
            overlay.

    Returns:
        The ``total_ns`` value of the last trace whose ``phase`` is
        ``"kernel"``, converted to ``float`` (``0.0`` if that trace has no
        ``total_ns`` key).

    Raises:
        AssertionError: If the run does not report completion, or if no trace
            with ``phase == "kernel"`` is produced.
    """
    import tempfile

    # The nested sections must be indented: written flat, ``defaults`` and
    # ``algorithms`` would be empty mappings and ``buffer_kind`` would appear
    # twice at the top level.
    # NOTE(review): nesting reconstructed from the key order -- confirm it
    # matches the ccl.yaml schema the bench expects.
    body = f"""\
defaults:
  algorithm: {algorithm}
  buffer_kind: {buffer_kind}
  backpressure: sleep
  n_slots: 4
  slot_size: 4096
  vc_chunk_size: 256
  ipcq_credit_size_bytes: 16
algorithms:
  {algorithm}:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: {buffer_kind}
    world_size: 8
    n_elem: 32
"""
    # topology.yaml is resolved relative to the parent of this file's directory.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))

    with tempfile.TemporaryDirectory() as tmp:
        # Explicit encoding so the overlay does not depend on the locale default.
        with open(os.path.join(tmp, "ccl.yaml"), "w", encoding="utf-8") as f:
            f.write(body)

        old_cwd = os.getcwd()
        os.chdir(tmp)
        try:
            topo = resolve_topology(os.path.join(project_root, "topology.yaml"))
            bench_mod = importlib.import_module("benches.ccl_allreduce")
            result = run_bench(
                topology=topo,
                bench_fn=bench_mod.run,
                device=resolve_device("all"),
                engine_factory=_engine_factory,
            )
        finally:
            # Leave the temp dir before TemporaryDirectory tries to remove it.
            os.chdir(old_cwd)

    assert result.completion.ok, f"{algorithm} did not complete"

    # Only the final kernel-phase trace is reported.
    kernel_traces = [
        tr for tr in (result.traces or []) if tr.get("phase") == "kernel"
    ]
    assert kernel_traces, f"{algorithm} produced no kernel trace"
    return float(kernel_traces[-1].get("total_ns", 0.0))
@pytest.mark.parametrize("buffer_kind", ["tcm", "hbm", "sram"])
def test_ccl_latency_positive(buffer_kind):
    """Each buffer kind must yield a strictly positive simulated latency.

    The algorithm name passed to the 8-rank runner is
    ``ring_allreduce_<buffer_kind>``.
    """
    latency_ns = _run_8rank(f"ring_allreduce_{buffer_kind}", buffer_kind)
    assert latency_ns > 0
def test_ccl_latency_under_reasonable_bound():
    """The 8-rank TCM ring all-reduce must finish in under 1 ms simulated.

    NOTE(review): the original docstring describes the workload as
    "tile=32 f16"; the 32 presumably refers to the runner's ``n_elem`` and
    the element type is not set here -- confirm against the bench.
    """
    one_ms_in_ns = 1_000_000
    latency_ns = _run_8rank("ring_allreduce_tcm", "tcm")
    assert latency_ns < one_ms_in_ns  # < 1 ms simulated