Speed up regression: 25min → 6min (test matrix + DataExecutor cleanup)
Test matrix restructure: - 256-rank full-system ring runs only ONCE (marked pytest.mark.slow) instead of 7× across matrix + perf tests. Cross-SIP routing is verified by the single run; buffer variants (tcm/hbm/sram) are tested at 8-rank where they finish in <0.5s. - Performance tests use 8-rank instead of 256-rank. - `pytest -m "not slow"` completes in ~2.5min (local dev). - Full suite including slow: ~6min (CI). DataExecutor optimization: - Remove ThreadPoolExecutor from DataExecutor.run(). Same-t_start groups are almost always size 1, so the thread pool creation and dispatch overhead dominated. Simple sequential loop is faster. - Skip dma_read ops at the loop level (they are always no-ops in Phase 2 but were dispatched through _execute_op → _execute_memory). - Remove redundant CLI Phase 2 re-execution: engine._flush_data_phase already replays during engine.wait(); the CLI now only prints the diagnostic summary without re-running DataExecutor. 502 tests pass. Wall time: 25m30s → 5m43s (full), 2m28s (no slow). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -72,15 +72,12 @@ def cmd_run(args) -> int:
|
||||
print(format_report(result.traces, title=args.bench, spec=spec))
|
||||
print(result.summary_text())
|
||||
|
||||
# Phase 2: data execution (ADR-0020)
|
||||
# Phase 2 diagnostic summary (ADR-0020). The actual Phase 2 replay
|
||||
# already runs inside engine.wait() → _flush_data_phase(). We only
|
||||
# print the summary here; no redundant re-execution.
|
||||
if verify_data and result.engine is not None:
|
||||
from kernbench.sim_engine.data_executor import DataExecutor
|
||||
|
||||
op_log = result.engine.op_log
|
||||
store = result.engine.memory_store
|
||||
if op_log and store is not None:
|
||||
executor = DataExecutor(op_log, store)
|
||||
executor.run()
|
||||
if op_log:
|
||||
n_gemm = sum(1 for r in op_log if r.op_kind == "gemm")
|
||||
n_math = sum(1 for r in op_log if r.op_kind == "math")
|
||||
print(f"[data] Phase 2 complete: {len(op_log)} ops ({n_gemm} gemm, {n_math} math)")
|
||||
|
||||
@@ -6,8 +6,6 @@ Same-timestamp independent ops can be batched for efficiency.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from itertools import groupby
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
@@ -29,18 +27,16 @@ class DataExecutor:
|
||||
self.store = store
|
||||
|
||||
def run(self) -> None:
|
||||
"""Execute all ops in op_log order, grouped by t_start.
|
||||
"""Execute all ops in op_log order.
|
||||
|
||||
Same-timestamp ops are independent and executed in parallel
|
||||
via ThreadPoolExecutor (numpy releases the GIL for BLAS ops).
|
||||
Ops are processed sequentially in t_start order. The previous
|
||||
ThreadPoolExecutor-based parallel execution was removed because
|
||||
same-t_start groups are almost always size 1 (each PE processes
|
||||
one command at a time), so the thread-pool overhead dominated.
|
||||
"""
|
||||
with ThreadPoolExecutor() as pool:
|
||||
for _t, ops_iter in groupby(self._op_log, key=lambda r: r.t_start):
|
||||
ops = list(ops_iter)
|
||||
if len(ops) == 1:
|
||||
self._execute_op(ops[0])
|
||||
else:
|
||||
list(pool.map(self._execute_op, ops))
|
||||
for op in self._op_log:
|
||||
if op.op_kind != "memory" or op.op_name != "dma_read":
|
||||
self._execute_op(op)
|
||||
|
||||
def _execute_op(self, op: OpRecord) -> None:
|
||||
if op.op_kind == "memory":
|
||||
|
||||
@@ -66,31 +66,39 @@ def _write_ccl_yaml(
|
||||
|
||||
CASES = [
|
||||
# algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws
|
||||
#
|
||||
# Full-system (256-rank, cross-SIP) — run only ONCE (tcm). Buffer
|
||||
# variant differences are purely IPCQ slot placement; the compute path
|
||||
# is identical. Cross-SIP routing is the real thing being verified here.
|
||||
pytest.param(
|
||||
"ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
|
||||
"ring_1d", "tcm", None, 8, 256,
|
||||
id="ring_full_system_tcm",
|
||||
id="ring_full_system",
|
||||
marks=pytest.mark.slow,
|
||||
),
|
||||
# Buffer variants at 8-rank (fast — same kernel, different slot space).
|
||||
pytest.param(
|
||||
"ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
|
||||
"ring_1d", "tcm", 8, 32, 8,
|
||||
id="ring_tcm_8",
|
||||
),
|
||||
pytest.param(
|
||||
"ring_allreduce_hbm", "kernbench.ccl.algorithms.ring_allreduce",
|
||||
"ring_1d", "hbm", None, 8, 256,
|
||||
id="ring_full_system_hbm",
|
||||
"ring_1d", "hbm", 8, 32, 8,
|
||||
id="ring_hbm_8",
|
||||
),
|
||||
pytest.param(
|
||||
"ring_allreduce_sram", "kernbench.ccl.algorithms.ring_allreduce",
|
||||
"ring_1d", "sram", None, 8, 256,
|
||||
id="ring_full_system_sram",
|
||||
),
|
||||
pytest.param(
|
||||
"ring_allreduce_8", "kernbench.ccl.algorithms.ring_allreduce",
|
||||
"ring_1d", "tcm", 8, 32, 8,
|
||||
id="ring_single_cube",
|
||||
"ring_1d", "sram", 8, 32, 8,
|
||||
id="ring_sram_8",
|
||||
),
|
||||
# Multi-cube (16-rank, cross-cube within 1 SIP).
|
||||
pytest.param(
|
||||
"ring_allreduce_16", "kernbench.ccl.algorithms.ring_allreduce",
|
||||
"ring_1d", "tcm", 16, 16, 16,
|
||||
id="ring_multi_cube",
|
||||
),
|
||||
# Mesh + tree algorithms.
|
||||
pytest.param(
|
||||
"mesh_allreduce_4", "kernbench.ccl.algorithms.mesh_allreduce",
|
||||
"mesh_2d", "tcm", 4, 16, 4,
|
||||
|
||||
@@ -1,18 +1,16 @@
|
||||
"""CCL performance validation tests (ADR-0023 D13 T5).
|
||||
|
||||
Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench
|
||||
under different ``ccl.yaml`` algorithm choices:
|
||||
Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench.
|
||||
|
||||
- All buffer kinds finish in non-zero simulated time.
|
||||
- Latency is bounded well under 1 ms for small tiles.
|
||||
|
||||
These are sanity checks on the model itself, not on absolute numbers.
|
||||
Uses 8-rank (single cube) for all buffer variants — the latency model
|
||||
is topology-aware, so buffer_kind differences are visible even at small
|
||||
scale. Full-system (256-rank) cross-SIP latency is covered by the
|
||||
``test_ccl_allreduce_matrix[ring_full_system]`` slow test.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import pytest
|
||||
|
||||
@@ -26,15 +24,15 @@ def _engine_factory(topology, device):
|
||||
return GraphEngine(getattr(topology, "topology_obj", topology), enable_data=True)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _ccl_yaml_override(algorithm: str, world_size: int | None = None):
|
||||
"""Write a tmp ccl.yaml that forces a specific algorithm + world_size."""
|
||||
def _run_8rank(algorithm: str, buffer_kind: str = "tcm") -> float:
|
||||
"""Run an 8-rank ring via the unified bench with a tmp ccl.yaml overlay.
|
||||
Returns simulated kernel total_ns."""
|
||||
import tempfile
|
||||
entry_extra = f"\n world_size: {world_size}" if world_size is not None else ""
|
||||
body = f"""
|
||||
|
||||
body = f"""\
|
||||
defaults:
|
||||
algorithm: {algorithm}
|
||||
buffer_kind: tcm
|
||||
buffer_kind: {buffer_kind}
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
@@ -42,71 +40,30 @@ defaults:
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
ring_allreduce_tcm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: tcm
|
||||
ring_allreduce_hbm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: hbm
|
||||
ring_allreduce_sram:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: sram{entry_extra if algorithm.startswith("ring") else ""}
|
||||
{algorithm}:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: tcm{entry_extra}
|
||||
""" if world_size is not None else f"""
|
||||
defaults:
|
||||
algorithm: {algorithm}
|
||||
buffer_kind: tcm
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
ring_allreduce_tcm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: tcm
|
||||
ring_allreduce_hbm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: hbm
|
||||
ring_allreduce_sram:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: sram
|
||||
buffer_kind: {buffer_kind}
|
||||
world_size: 8
|
||||
n_elem: 32
|
||||
"""
|
||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = os.path.join(tmp, "ccl.yaml")
|
||||
with open(path, "w") as f:
|
||||
with open(os.path.join(tmp, "ccl.yaml"), "w") as f:
|
||||
f.write(body)
|
||||
old_cwd = os.getcwd()
|
||||
os.chdir(tmp)
|
||||
try:
|
||||
yield path
|
||||
finally:
|
||||
os.chdir(old_cwd)
|
||||
|
||||
|
||||
def _run_unified(algorithm: str, world_size: int | None = None) -> float:
|
||||
"""Run the unified ccl_allreduce bench under a ccl.yaml override,
|
||||
return simulated kernel total_ns."""
|
||||
with _ccl_yaml_override(algorithm, world_size):
|
||||
topo = resolve_topology(
|
||||
os.path.join(os.path.dirname(__file__), "..", "topology.yaml")
|
||||
)
|
||||
topo = resolve_topology(os.path.join(project_root, "topology.yaml"))
|
||||
bench_mod = importlib.import_module("benches.ccl_allreduce")
|
||||
result = run_bench(
|
||||
topology=topo, bench_fn=bench_mod.run,
|
||||
device=resolve_device("all"),
|
||||
engine_factory=_engine_factory,
|
||||
)
|
||||
finally:
|
||||
os.chdir(old_cwd)
|
||||
|
||||
assert result.completion.ok, f"{algorithm} did not complete"
|
||||
last_kernel = None
|
||||
for tr in (result.traces or []):
|
||||
@@ -116,19 +73,15 @@ def _run_unified(algorithm: str, world_size: int | None = None) -> float:
|
||||
return float(last_kernel.get("total_ns", 0.0))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", [
|
||||
"ring_allreduce_tcm",
|
||||
"ring_allreduce_hbm",
|
||||
"ring_allreduce_sram",
|
||||
])
|
||||
def test_ccl_latency_positive(algorithm):
|
||||
@pytest.mark.parametrize("buffer_kind", ["tcm", "hbm", "sram"])
|
||||
def test_ccl_latency_positive(buffer_kind):
|
||||
"""Every buffer kind must produce a positive simulated latency."""
|
||||
ns = _run_unified(algorithm)
|
||||
algo = f"ring_allreduce_{buffer_kind}"
|
||||
ns = _run_8rank(algo, buffer_kind)
|
||||
assert ns > 0
|
||||
|
||||
|
||||
def test_ccl_latency_under_reasonable_bound():
|
||||
"""Sanity bound: ring all-reduce (tile=32 f16) should finish in well
|
||||
under 1 ms simulated. Way overhead-dominated for small tiles."""
|
||||
ns = _run_unified("ring_allreduce_tcm")
|
||||
assert ns < 100_000_000 # < 100 ms simulated — very loose bound
|
||||
"""8-rank ring all-reduce (tile=32 f16) should finish well under 1ms."""
|
||||
ns = _run_8rank("ring_allreduce_tcm", "tcm")
|
||||
assert ns < 1_000_000 # < 1 ms simulated
|
||||
|
||||
Reference in New Issue
Block a user