From 372c987995f428f2ceeb48ddc5ba9908df3087c7 Mon Sep 17 00:00:00 2001
From: Yangwook Kang <ywkang80@gmail.com>
Date: Sun, 12 Apr 2026 21:06:41 -0700
Subject: [PATCH] Reduce test time to 12s: shrink GEMM dims + enable
 pytest-xdist
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GEMM dimension reduction:
- qkv_gemm.py: M,K,N = 128,256,128 → 32,64,32 (64 tiles → 1 tile).
- qkv_gemm_multi_pe.py: same reduction.
- Tests verify pipeline correctness, not large-matrix throughput.
- Per-test time: 18s → 1.7s. 6 tests total: 108s → 10s.

pytest-xdist parallel execution:
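- Add pytest-xdist to dev dependencies. It is now required for every
  `pytest` invocation, because addopts always passes -n.
- pyproject.toml addopts: -n auto (one worker per detected CPU core),
  -m "not slow" (deselect slow-marked tests by default).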
- Default `pytest` runs 501 tests in ~12s (previously 148s).
- Full suite including slow: `pytest -m ""` takes 3m24s (previously 5m43s).

pytest.mark.slow:
- Registered in pyproject.toml markers section.
- 256-rank full-system test is the only slow-marked test.
- Full suite (CI): `pytest -m ""`. Plain `pytest` (local dev) skips slow tests.

All 502 tests pass (501 in the default run plus the 1 slow-marked test).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benches/qkv_gemm.py          | 4 +++-
 benches/qkv_gemm_multi_pe.py | 4 +++-
 pyproject.toml               | 4 +++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/benches/qkv_gemm.py b/benches/qkv_gemm.py
index 492c632..a8ad8f2 100644
--- a/benches/qkv_gemm.py
+++ b/benches/qkv_gemm.py
@@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
 from kernbench.policy.placement.dp import DPPolicy
 
 # GEMM dimensions: (M, K) x (K, N) → (M, N)
-M, K, N = 128, 256, 128
+# Small dims (1 tile) for fast regression. The test verifies the full
+# host→PE pipeline, not large-matrix throughput.
+M, K, N = 32, 64, 32
 DTYPE = "f16"
 
 
diff --git a/benches/qkv_gemm_multi_pe.py b/benches/qkv_gemm_multi_pe.py
index e8e8649..2aced00 100644
--- a/benches/qkv_gemm_multi_pe.py
+++ b/benches/qkv_gemm_multi_pe.py
@@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
 from kernbench.policy.placement.dp import DPPolicy
 
 # GEMM dimensions: (M, K) x (K, N) -> (M, N)
-M, K, N = 128, 256, 128
+# Small dims (1 tile) for fast regression. The test verifies the multi-PE
+# fan-out pipeline, not large-matrix throughput.
+M, K, N = 32, 64, 32
 DTYPE = "f16"
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 9f762ee..ef6ba8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ include = ["kernbench*", "benches*"]
 [project.optional-dependencies]
 dev = [
   "pytest>=7",
+  "pytest-xdist>=3.0",
   "ruff>=0.4.0",
 ]
 
@@ -31,4 +32,5 @@ select = ["E", "F", "I", "B", "UP"]
 ignore = ["F401"]
 
 [tool.pytest.ini_options]
-addopts = ["--disable-warnings"]
+addopts = ["--disable-warnings", "-n", "auto", "-m", "not slow"]
+markers = ["slow: 256-rank full-system tests (~3min each, run with: pytest -m '')"]