From 372c987995f428f2ceeb48ddc5ba9908df3087c7 Mon Sep 17 00:00:00 2001 From: Yangwook Kang Date: Sun, 12 Apr 2026 21:06:41 -0700 Subject: [PATCH] Reduce test time to 12s: shrink GEMM dims + enable pytest-xdist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GEMM dimension reduction: - qkv_gemm.py: M,K,N = 128,256,128 → 32,64,32 (64 tiles → 1 tile). - qkv_gemm_multi_pe.py: same reduction. - Tests verify pipeline correctness, not large-matrix throughput. - Per-test time: 18s → 1.7s. 6 tests total: 108s → 10s. pytest-xdist parallel execution: - Add pytest-xdist to dev dependencies. - pyproject.toml addopts: -n auto (use all CPU cores), -m "not slow". - Default `pytest` runs 501 tests in ~12s (previously 148s). - Full suite including slow: `pytest -m ""` → 3m24s (previously 5m43s). pytest.mark.slow: - Registered in pyproject.toml markers section. - 256-rank full-system test is the only slow-marked test. - Run with: pytest -m "" (CI) or pytest (local dev, skips slow). 502 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) --- benches/qkv_gemm.py | 4 +++- benches/qkv_gemm_multi_pe.py | 4 +++- pyproject.toml | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/benches/qkv_gemm.py b/benches/qkv_gemm.py index 492c632..a8ad8f2 100644 --- a/benches/qkv_gemm.py +++ b/benches/qkv_gemm.py @@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait() from kernbench.policy.placement.dp import DPPolicy # GEMM dimensions: (M, K) x (K, N) → (M, N) -M, K, N = 128, 256, 128 +# Small dims (1 tile) for fast regression. The test verifies the full +# host→PE pipeline, not large-matrix throughput. +M, K, N = 32, 64, 32 DTYPE = "f16" diff --git a/benches/qkv_gemm_multi_pe.py b/benches/qkv_gemm_multi_pe.py index e8e8649..2aced00 100644 --- a/benches/qkv_gemm_multi_pe.py +++ b/benches/qkv_gemm_multi_pe.py @@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait() from kernbench.policy.placement.dp import DPPolicy # GEMM dimensions: (M, K) x (K, N) -> (M, N) -M, K, N = 128, 256, 128 +# Small dims (1 tile) for fast regression. The test verifies the multi-PE +# fan-out pipeline, not large-matrix throughput. +M, K, N = 32, 64, 32 DTYPE = "f16" diff --git a/pyproject.toml b/pyproject.toml index 9f762ee..ef6ba8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ include = ["kernbench*", "benches*"] [project.optional-dependencies] dev = [ "pytest>=7", + "pytest-xdist>=3.0", "ruff>=0.4.0", ] @@ -31,4 +32,5 @@ select = ["E", "F", "I", "B", "UP"] ignore = ["F401"] [tool.pytest.ini_options] -addopts = ["--disable-warnings"] +addopts = ["--disable-warnings", "-n", "auto", "-m", "not slow"] +markers = ["slow: 256-rank full-system tests (~3min each, run with: pytest -m '')"]