Speed up regression: 25min → 6min (test matrix + DataExecutor cleanup)

Test matrix restructure:
- 256-rank full-system ring runs only ONCE (marked pytest.mark.slow) instead of 7× across matrix + perf tests. Cross-SIP routing is verified by the single run; buffer variants (tcm/hbm/sram) are tested at 8-rank where they finish in <0.5s.
- Performance tests use 8-rank instead of 256-rank.
- `pytest -m "not slow"` completes in ~2.5min (local dev).
- Full suite including slow: ~6min (CI).

DataExecutor optimization:
- Remove ThreadPoolExecutor from DataExecutor.run(). Same-t_start groups are almost always size 1, so the thread pool creation and dispatch overhead dominated. Simple sequential loop is faster.
- Skip dma_read ops at the loop level (they are always no-ops in Phase 2 but were dispatched through _execute_op → _execute_memory).
- Remove redundant CLI Phase 2 re-execution: engine._flush_data_phase already replays during engine.wait(); the CLI now only prints the diagnostic summary without re-running DataExecutor.

502 tests pass. Wall time: 25m30s → 5m43s (full), 2m28s (no slow).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 20:52:07 -07:00
parent 998cc85762
commit bcf941dcee
4 changed files with 60 additions and 106 deletions
@@ -72,15 +72,12 @@ def cmd_run(args) -> int:
        print(format_report(result.traces, title=args.bench, spec=spec))
    print(result.summary_text())

-    # Phase 2: data execution (ADR-0020)
+    # Phase 2 diagnostic summary (ADR-0020). The actual Phase 2 replay
+    # already runs inside engine.wait() → _flush_data_phase(). We only
+    # print the summary here; no redundant re-execution.
    if verify_data and result.engine is not None:
-        from kernbench.sim_engine.data_executor import DataExecutor
-
        op_log = result.engine.op_log
-        store = result.engine.memory_store
-        if op_log and store is not None:
-            executor = DataExecutor(op_log, store)
-            executor.run()
+        if op_log:
            n_gemm = sum(1 for r in op_log if r.op_kind == "gemm")
            n_math = sum(1 for r in op_log if r.op_kind == "math")
            print(f"[data] Phase 2 complete: {len(op_log)} ops ({n_gemm} gemm, {n_math} math)")
@@ -6,8 +6,6 @@ Same-timestamp independent ops can be batched for efficiency.
 """
 from __future__ import annotations

-from concurrent.futures import ThreadPoolExecutor
-from itertools import groupby
 from typing import Any

 import numpy as np
@@ -29,18 +27,16 @@ class DataExecutor:
        self.store = store

    def run(self) -> None:
-        """Execute all ops in op_log order, grouped by t_start.
+        """Execute all ops in op_log order.

-        Same-timestamp ops are independent and executed in parallel
-        via ThreadPoolExecutor (numpy releases the GIL for BLAS ops).
+        Ops run sequentially in t_start order; dma_read memory ops are
+        skipped (no-ops in Phase 2). The earlier ThreadPoolExecutor path was
+        removed: same-t_start groups are almost always size 1 (each PE
+        processes one command at a time), so thread-pool overhead dominated.
        """
-        with ThreadPoolExecutor() as pool:
-            for _t, ops_iter in groupby(self._op_log, key=lambda r: r.t_start):
-                ops = list(ops_iter)
-                if len(ops) == 1:
-                    self._execute_op(ops[0])
-                else:
-                    list(pool.map(self._execute_op, ops))
+        for op in self._op_log:
+            if op.op_kind != "memory" or op.op_name != "dma_read":
+                self._execute_op(op)

    def _execute_op(self, op: OpRecord) -> None:
        if op.op_kind == "memory":
@@ -66,31 +66,39 @@ def _write_ccl_yaml(

 CASES = [
    # algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws
+    #
+    # Full-system (256-rank, cross-SIP) — run only ONCE (tcm). Buffer
+    # variant differences are purely IPCQ slot placement; the compute path
+    # is identical. Cross-SIP routing is the real thing being verified here.
    pytest.param(
        "ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
        "ring_1d", "tcm", None, 8, 256,
-        id="ring_full_system_tcm",
+        id="ring_full_system",
+        marks=pytest.mark.slow,
+    ),
+    # Buffer variants at 8-rank (fast — same kernel, different slot space).
+    pytest.param(
+        "ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
+        "ring_1d", "tcm", 8, 32, 8,
+        id="ring_tcm_8",
    ),
    pytest.param(
        "ring_allreduce_hbm", "kernbench.ccl.algorithms.ring_allreduce",
-        "ring_1d", "hbm", None, 8, 256,
-        id="ring_full_system_hbm",
+        "ring_1d", "hbm", 8, 32, 8,
+        id="ring_hbm_8",
    ),
    pytest.param(
        "ring_allreduce_sram", "kernbench.ccl.algorithms.ring_allreduce",
-        "ring_1d", "sram", None, 8, 256,
-        id="ring_full_system_sram",
-    ),
-    pytest.param(
-        "ring_allreduce_8", "kernbench.ccl.algorithms.ring_allreduce",
-        "ring_1d", "tcm", 8, 32, 8,
-        id="ring_single_cube",
+        "ring_1d", "sram", 8, 32, 8,
+        id="ring_sram_8",
    ),
+    # Multi-cube (16-rank, cross-cube within 1 SIP).
    pytest.param(
        "ring_allreduce_16", "kernbench.ccl.algorithms.ring_allreduce",
        "ring_1d", "tcm", 16, 16, 16,
        id="ring_multi_cube",
    ),
+    # Mesh + tree algorithms.
    pytest.param(
        "mesh_allreduce_4", "kernbench.ccl.algorithms.mesh_allreduce",
        "mesh_2d", "tcm", 4, 16, 4,
@@ -1,18 +1,16 @@
 """CCL performance validation tests (ADR-0023 D13 T5).

-Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench
-under different ``ccl.yaml`` algorithm choices:
+Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench.

-  - All buffer kinds finish in non-zero simulated time.
-  - Latency is bounded well under 1 ms for small tiles.
-
-These are sanity checks on the model itself, not on absolute numbers.
+Uses 8-rank (single cube) for all buffer variants — the latency model
+is topology-aware, so buffer_kind differences are visible even at small
+scale. Full-system (256-rank) cross-SIP latency is covered by the
+``test_ccl_allreduce_matrix[ring_full_system]`` slow test.
 """
 from __future__ import annotations

 import importlib
 import os
-from contextlib import contextmanager

 import pytest

@@ -26,15 +24,15 @@ def _engine_factory(topology, device):
    return GraphEngine(getattr(topology, "topology_obj", topology), enable_data=True)


-@contextmanager
-def _ccl_yaml_override(algorithm: str, world_size: int | None = None):
-    """Write a tmp ccl.yaml that forces a specific algorithm + world_size."""
+def _run_8rank(algorithm: str, buffer_kind: str = "tcm") -> float:
+    """Run an 8-rank ring via the unified bench with a tmp ccl.yaml overlay.
+    Returns simulated kernel total_ns."""
    import tempfile
-    entry_extra = f"\n    world_size: {world_size}" if world_size is not None else ""
-    body = f"""
+
+    body = f"""\
 defaults:
  algorithm: {algorithm}
-  buffer_kind: tcm
+  buffer_kind: {buffer_kind}
  backpressure: sleep
  n_slots: 4
  slot_size: 4096
@@ -42,71 +40,30 @@ defaults:
  ipcq_credit_size_bytes: 16

 algorithms:
-  ring_allreduce_tcm:
-    module: kernbench.ccl.algorithms.ring_allreduce
-    topology: ring_1d
-    buffer_kind: tcm
-  ring_allreduce_hbm:
-    module: kernbench.ccl.algorithms.ring_allreduce
-    topology: ring_1d
-    buffer_kind: hbm
-  ring_allreduce_sram:
-    module: kernbench.ccl.algorithms.ring_allreduce
-    topology: ring_1d
-    buffer_kind: sram{entry_extra if algorithm.startswith("ring") else ""}
  {algorithm}:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
-    buffer_kind: tcm{entry_extra}
-""" if world_size is not None else f"""
-defaults:
-  algorithm: {algorithm}
-  buffer_kind: tcm
-  backpressure: sleep
-  n_slots: 4
-  slot_size: 4096
-  vc_chunk_size: 256
-  ipcq_credit_size_bytes: 16
-
-algorithms:
-  ring_allreduce_tcm:
-    module: kernbench.ccl.algorithms.ring_allreduce
-    topology: ring_1d
-    buffer_kind: tcm
-  ring_allreduce_hbm:
-    module: kernbench.ccl.algorithms.ring_allreduce
-    topology: ring_1d
-    buffer_kind: hbm
-  ring_allreduce_sram:
-    module: kernbench.ccl.algorithms.ring_allreduce
-    topology: ring_1d
-    buffer_kind: sram
+    buffer_kind: {buffer_kind}
+    world_size: 8
+    n_elem: 32
 """
+    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    with tempfile.TemporaryDirectory() as tmp:
-        path = os.path.join(tmp, "ccl.yaml")
-        with open(path, "w") as f:
+        with open(os.path.join(tmp, "ccl.yaml"), "w") as f:
            f.write(body)
        old_cwd = os.getcwd()
        os.chdir(tmp)
        try:
-            yield path
-        finally:
-            os.chdir(old_cwd)
-
-
-def _run_unified(algorithm: str, world_size: int | None = None) -> float:
-    """Run the unified ccl_allreduce bench under a ccl.yaml override,
-    return simulated kernel total_ns."""
-    with _ccl_yaml_override(algorithm, world_size):
-        topo = resolve_topology(
-            os.path.join(os.path.dirname(__file__), "..", "topology.yaml")
-        )
+            topo = resolve_topology(os.path.join(project_root, "topology.yaml"))
            bench_mod = importlib.import_module("benches.ccl_allreduce")
            result = run_bench(
                topology=topo, bench_fn=bench_mod.run,
                device=resolve_device("all"),
                engine_factory=_engine_factory,
            )
+        finally:
+            os.chdir(old_cwd)
+
    assert result.completion.ok, f"{algorithm} did not complete"
    last_kernel = None
    for tr in (result.traces or []):
@@ -116,19 +73,15 @@ def _run_unified(algorithm: str, world_size: int | None = None) -> float:
    return float(last_kernel.get("total_ns", 0.0))


-@pytest.mark.parametrize("algorithm", [
-    "ring_allreduce_tcm",
-    "ring_allreduce_hbm",
-    "ring_allreduce_sram",
-])
-def test_ccl_latency_positive(algorithm):
+@pytest.mark.parametrize("buffer_kind", ["tcm", "hbm", "sram"])
+def test_ccl_latency_positive(buffer_kind):
    """Every buffer kind must produce a positive simulated latency."""
-    ns = _run_unified(algorithm)
+    algo = f"ring_allreduce_{buffer_kind}"
+    ns = _run_8rank(algo, buffer_kind)
    assert ns > 0


 def test_ccl_latency_under_reasonable_bound():
-    """Sanity bound: ring all-reduce (tile=32 f16) should finish in well
-    under 1 ms simulated. Way overhead-dominated for small tiles."""
-    ns = _run_unified("ring_allreduce_tcm")
-    assert ns < 100_000_000  # < 100 ms simulated — very loose bound
+    """8-rank ring all-reduce (tile=32 f16) should finish well under 1ms."""
+    ns = _run_8rank("ring_allreduce_tcm", "tcm")
+    assert ns < 1_000_000  # < 1 ms simulated