diff --git a/benches/qkv_gemm.py b/benches/qkv_gemm.py
index 492c632..a8ad8f2 100644
--- a/benches/qkv_gemm.py
+++ b/benches/qkv_gemm.py
@@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
 from kernbench.policy.placement.dp import DPPolicy
 
 # GEMM dimensions: (M, K) x (K, N) → (M, N)
-M, K, N = 128, 256, 128
+# Small dims (1 tile) keep regression runs fast. The test verifies the full
+# host→PE pipeline, not large-matrix throughput.
+M, K, N = 32, 64, 32
 DTYPE = "f16"
 
 
diff --git a/benches/qkv_gemm_multi_pe.py b/benches/qkv_gemm_multi_pe.py
index e8e8649..2aced00 100644
--- a/benches/qkv_gemm_multi_pe.py
+++ b/benches/qkv_gemm_multi_pe.py
@@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
 from kernbench.policy.placement.dp import DPPolicy
 
 # GEMM dimensions: (M, K) x (K, N) -> (M, N)
-M, K, N = 128, 256, 128
+# Small dims (1 tile) keep regression runs fast. The test verifies the multi-PE
+# fan-out pipeline, not large-matrix throughput.
+M, K, N = 32, 64, 32
 DTYPE = "f16"
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 9f762ee..ef6ba8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ include = ["kernbench*", "benches*"]
 [project.optional-dependencies]
 dev = [
   "pytest>=7",
+  "pytest-xdist>=3.0",
   "ruff>=0.4.0",
 ]
 
@@ -31,4 +32,5 @@ select = ["E", "F", "I", "B", "UP"]
 ignore = ["F401"]
 
 [tool.pytest.ini_options]
-addopts = ["--disable-warnings"]
+addopts = ["--disable-warnings", "-n", "auto", "-m", "not slow"]
+markers = ["slow: 256-rank full-system tests (~3min each; deselected by default, include with: pytest -m '' or run only these with: pytest -m slow)"]