Speed up regression: 25min → 6min (test matrix + DataExecutor cleanup)

Test matrix restructure:
- 256-rank full-system ring runs only ONCE (marked pytest.mark.slow)
  instead of 7× across matrix + perf tests. Cross-SIP routing is
  verified by the single run; buffer variants (tcm/hbm/sram) are
  tested at 8-rank where they finish in <0.5s.
- Performance tests use 8-rank instead of 256-rank.
- `pytest -m "not slow"` completes in ~2.5min (local dev).
- Full suite including slow: ~6min (CI).

DataExecutor optimization:
- Remove ThreadPoolExecutor from DataExecutor.run(). Same-t_start
  groups are almost always size 1, so thread-pool creation and
  dispatch overhead dominated. A simple sequential loop is faster.
- Skip dma_read ops at the loop level (they are always no-ops in
  Phase 2 but were dispatched through _execute_op → _execute_memory).
- Remove redundant CLI Phase 2 re-execution: engine._flush_data_phase
  already replays during engine.wait(); the CLI now only prints the
  diagnostic summary without re-running DataExecutor.

502 tests pass. Wall time: 25m30s → 5m43s (full), 2m28s (no slow).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-12 20:52:07 -07:00
parent 998cc85762
commit bcf941dcee
4 changed files with 60 additions and 106 deletions
+4 -7
View File
@@ -72,15 +72,12 @@ def cmd_run(args) -> int:
print(format_report(result.traces, title=args.bench, spec=spec))
print(result.summary_text())
# Phase 2: data execution (ADR-0020)
# Phase 2 diagnostic summary (ADR-0020). The actual Phase 2 replay
# already runs inside engine.wait() → _flush_data_phase(). We only
# print the summary here; no redundant re-execution.
if verify_data and result.engine is not None:
from kernbench.sim_engine.data_executor import DataExecutor
op_log = result.engine.op_log
store = result.engine.memory_store
if op_log and store is not None:
executor = DataExecutor(op_log, store)
executor.run()
if op_log:
n_gemm = sum(1 for r in op_log if r.op_kind == "gemm")
n_math = sum(1 for r in op_log if r.op_kind == "math")
print(f"[data] Phase 2 complete: {len(op_log)} ops ({n_gemm} gemm, {n_math} math)")
+8 -12
View File
@@ -6,8 +6,6 @@ Same-timestamp independent ops can be batched for efficiency.
"""
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor
from itertools import groupby
from typing import Any
import numpy as np
@@ -29,18 +27,16 @@ class DataExecutor:
self.store = store
def run(self) -> None:
"""Execute all ops in op_log order, grouped by t_start.
"""Execute all ops in op_log order.
Same-timestamp ops are independent and executed in parallel
via ThreadPoolExecutor (numpy releases the GIL for BLAS ops).
Ops are processed sequentially in t_start order. The previous
ThreadPoolExecutor-based parallel execution was removed because
same-t_start groups are almost always size 1 (each PE processes
one command at a time), so the thread-pool overhead dominated.
"""
with ThreadPoolExecutor() as pool:
for _t, ops_iter in groupby(self._op_log, key=lambda r: r.t_start):
ops = list(ops_iter)
if len(ops) == 1:
self._execute_op(ops[0])
else:
list(pool.map(self._execute_op, ops))
for op in self._op_log:
if op.op_kind != "memory" or op.op_name != "dma_read":
self._execute_op(op)
def _execute_op(self, op: OpRecord) -> None:
if op.op_kind == "memory":
+18 -10
View File
@@ -66,31 +66,39 @@ def _write_ccl_yaml(
CASES = [
# algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws
#
# Full-system (256-rank, cross-SIP) — run only ONCE (tcm). Buffer
# variant differences are purely IPCQ slot placement; the compute path
# is identical. Cross-SIP routing is the real thing being verified here.
pytest.param(
"ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "tcm", None, 8, 256,
id="ring_full_system_tcm",
id="ring_full_system",
marks=pytest.mark.slow,
),
# Buffer variants at 8-rank (fast — same kernel, different slot space).
pytest.param(
"ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "tcm", 8, 32, 8,
id="ring_tcm_8",
),
pytest.param(
"ring_allreduce_hbm", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "hbm", None, 8, 256,
id="ring_full_system_hbm",
"ring_1d", "hbm", 8, 32, 8,
id="ring_hbm_8",
),
pytest.param(
"ring_allreduce_sram", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "sram", None, 8, 256,
id="ring_full_system_sram",
),
pytest.param(
"ring_allreduce_8", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "tcm", 8, 32, 8,
id="ring_single_cube",
"ring_1d", "sram", 8, 32, 8,
id="ring_sram_8",
),
# Multi-cube (16-rank, cross-cube within 1 SIP).
pytest.param(
"ring_allreduce_16", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "tcm", 16, 16, 16,
id="ring_multi_cube",
),
# Mesh + tree algorithms.
pytest.param(
"mesh_allreduce_4", "kernbench.ccl.algorithms.mesh_allreduce",
"mesh_2d", "tcm", 4, 16, 4,
+30 -77
View File
@@ -1,18 +1,16 @@
"""CCL performance validation tests (ADR-0023 D13 T5).
Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench
under different ``ccl.yaml`` algorithm choices:
Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench.
- All buffer kinds finish in non-zero simulated time.
- Latency is bounded well under 1 ms for small tiles.
These are sanity checks on the model itself, not on absolute numbers.
Uses 8-rank (single cube) for all buffer variants — the latency model
is topology-aware, so buffer_kind differences are visible even at small
scale. Full-system (256-rank) cross-SIP latency is covered by the
``test_ccl_allreduce_matrix[ring_full_system]`` slow test.
"""
from __future__ import annotations
import importlib
import os
from contextlib import contextmanager
import pytest
@@ -26,15 +24,15 @@ def _engine_factory(topology, device):
return GraphEngine(getattr(topology, "topology_obj", topology), enable_data=True)
@contextmanager
def _ccl_yaml_override(algorithm: str, world_size: int | None = None):
"""Write a tmp ccl.yaml that forces a specific algorithm + world_size."""
def _run_8rank(algorithm: str, buffer_kind: str = "tcm") -> float:
"""Run an 8-rank ring via the unified bench with a tmp ccl.yaml overlay.
Returns simulated kernel total_ns."""
import tempfile
entry_extra = f"\n world_size: {world_size}" if world_size is not None else ""
body = f"""
body = f"""\
defaults:
algorithm: {algorithm}
buffer_kind: tcm
buffer_kind: {buffer_kind}
backpressure: sleep
n_slots: 4
slot_size: 4096
@@ -42,71 +40,30 @@ defaults:
ipcq_credit_size_bytes: 16
algorithms:
ring_allreduce_tcm:
module: kernbench.ccl.algorithms.ring_allreduce
topology: ring_1d
buffer_kind: tcm
ring_allreduce_hbm:
module: kernbench.ccl.algorithms.ring_allreduce
topology: ring_1d
buffer_kind: hbm
ring_allreduce_sram:
module: kernbench.ccl.algorithms.ring_allreduce
topology: ring_1d
buffer_kind: sram{entry_extra if algorithm.startswith("ring") else ""}
{algorithm}:
module: kernbench.ccl.algorithms.ring_allreduce
topology: ring_1d
buffer_kind: tcm{entry_extra}
""" if world_size is not None else f"""
defaults:
algorithm: {algorithm}
buffer_kind: tcm
backpressure: sleep
n_slots: 4
slot_size: 4096
vc_chunk_size: 256
ipcq_credit_size_bytes: 16
algorithms:
ring_allreduce_tcm:
module: kernbench.ccl.algorithms.ring_allreduce
topology: ring_1d
buffer_kind: tcm
ring_allreduce_hbm:
module: kernbench.ccl.algorithms.ring_allreduce
topology: ring_1d
buffer_kind: hbm
ring_allreduce_sram:
module: kernbench.ccl.algorithms.ring_allreduce
topology: ring_1d
buffer_kind: sram
buffer_kind: {buffer_kind}
world_size: 8
n_elem: 32
"""
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
with tempfile.TemporaryDirectory() as tmp:
path = os.path.join(tmp, "ccl.yaml")
with open(path, "w") as f:
with open(os.path.join(tmp, "ccl.yaml"), "w") as f:
f.write(body)
old_cwd = os.getcwd()
os.chdir(tmp)
try:
yield path
topo = resolve_topology(os.path.join(project_root, "topology.yaml"))
bench_mod = importlib.import_module("benches.ccl_allreduce")
result = run_bench(
topology=topo, bench_fn=bench_mod.run,
device=resolve_device("all"),
engine_factory=_engine_factory,
)
finally:
os.chdir(old_cwd)
def _run_unified(algorithm: str, world_size: int | None = None) -> float:
"""Run the unified ccl_allreduce bench under a ccl.yaml override,
return simulated kernel total_ns."""
with _ccl_yaml_override(algorithm, world_size):
topo = resolve_topology(
os.path.join(os.path.dirname(__file__), "..", "topology.yaml")
)
bench_mod = importlib.import_module("benches.ccl_allreduce")
result = run_bench(
topology=topo, bench_fn=bench_mod.run,
device=resolve_device("all"),
engine_factory=_engine_factory,
)
assert result.completion.ok, f"{algorithm} did not complete"
last_kernel = None
for tr in (result.traces or []):
@@ -116,19 +73,15 @@ def _run_unified(algorithm: str, world_size: int | None = None) -> float:
return float(last_kernel.get("total_ns", 0.0))
@pytest.mark.parametrize("algorithm", [
"ring_allreduce_tcm",
"ring_allreduce_hbm",
"ring_allreduce_sram",
])
def test_ccl_latency_positive(algorithm):
@pytest.mark.parametrize("buffer_kind", ["tcm", "hbm", "sram"])
def test_ccl_latency_positive(buffer_kind):
"""Every buffer kind must produce a positive simulated latency."""
ns = _run_unified(algorithm)
algo = f"ring_allreduce_{buffer_kind}"
ns = _run_8rank(algo, buffer_kind)
assert ns > 0
def test_ccl_latency_under_reasonable_bound():
"""Sanity bound: ring all-reduce (tile=32 f16) should finish in well
under 1 ms simulated. Way overhead-dominated for small tiles."""
ns = _run_unified("ring_allreduce_tcm")
assert ns < 100_000_000 # < 100 ms simulated — very loose bound
"""8-rank ring all-reduce (tile=32 f16) should finish well under 1ms."""
ns = _run_8rank("ring_allreduce_tcm", "tcm")
assert ns < 1_000_000 # < 1 ms simulated