diff --git a/benches/qkv_gemm.py b/benches/qkv_gemm.py index 492c632..a8ad8f2 100644 --- a/benches/qkv_gemm.py +++ b/benches/qkv_gemm.py @@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait() from kernbench.policy.placement.dp import DPPolicy # GEMM dimensions: (M, K) x (K, N) → (M, N) -M, K, N = 128, 256, 128 +# Small dims (1 tile) for fast regression. The test verifies the full +# host→PE pipeline, not large-matrix throughput. +M, K, N = 32, 64, 32 DTYPE = "f16" diff --git a/benches/qkv_gemm_multi_pe.py b/benches/qkv_gemm_multi_pe.py index e8e8649..2aced00 100644 --- a/benches/qkv_gemm_multi_pe.py +++ b/benches/qkv_gemm_multi_pe.py @@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait() from kernbench.policy.placement.dp import DPPolicy # GEMM dimensions: (M, K) x (K, N) -> (M, N) -M, K, N = 128, 256, 128 +# Small dims (1 tile) for fast regression. The test verifies the multi-PE +# fan-out pipeline, not large-matrix throughput. +M, K, N = 32, 64, 32 DTYPE = "f16" diff --git a/pyproject.toml b/pyproject.toml index 9f762ee..ef6ba8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ include = ["kernbench*", "benches*"] [project.optional-dependencies] dev = [ "pytest>=7", + "pytest-xdist>=3.0", "ruff>=0.4.0", ] @@ -31,4 +32,5 @@ select = ["E", "F", "I", "B", "UP"] ignore = ["F401"] [tool.pytest.ini_options] -addopts = ["--disable-warnings"] +addopts = ["--disable-warnings", "-n", "auto", "-m", "not slow"] +markers = ["slow: 256-rank full-system tests (~3min each, run with: pytest -m '')"]