fca24feac5
- intercube_allreduce: add single-cube fast path that skips intra-SIP mesh reduce and goes directly to inter-SIP exchange. Fixes IPCQ deadlock when TP launches kernel on one cube per SIP. - distributed.py: derive effective cube dims from tensor shard placement instead of hardcoding topology mesh size. - pyproject.toml: add matplotlib>=3.7 to dependencies. - pe_dma.py (prior commit): add MMU translation in pipeline DMA path. 577 passed, 0 failed (was 529 passed, 10 failed). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
37 lines
890 B
TOML
37 lines
890 B
TOML
# Build configuration: packages required to build the project and the
# backend object that build front-ends invoke.
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
|
|
|
|
# Core package metadata.
[project]
name = "kernbench"
version = "0.1.0"
requires-python = ">=3.10"
# Runtime requirements, one requirement string per line.
dependencies = [
    "pytest",
    "simpy",
    "pyyaml",
    "fastapi>=0.110",
    "uvicorn[standard]>=0.29",
    "websockets>=12",
    "numpy>=1.24",
    "greenlet>=3.0",
    "matplotlib>=3.7",
]
|
|
|
|
# Console entry points: the `kernbench` command resolves to the `main`
# object in the `kernbench.cli.main` module.
[project.scripts]
kernbench = "kernbench.cli.main:main"
|
|
|
|
# Automatic package discovery for setuptools.
[tool.setuptools.packages.find]
# Directories scanned for packages: `src/` and the repository root.
where = ["src", "."]
# Only package names matching these glob patterns are picked up.
include = ["kernbench*", "benches*"]
|
|
|
|
# Optional extras; `dev` groups the test and lint tooling.
[project.optional-dependencies]
dev = [
    "pytest>=7",
    "pytest-xdist>=3.0",
    "ruff>=0.4.0",
]
|
|
|
|
# Ruff: general settings.
[tool.ruff]
line-length = 100
# Same minimum Python version as `requires-python` in [project].
target-version = "py310"
# Report violations only; do not rewrite files automatically.
fix = false
|
|
|
|
# Ruff: lint rule selection.
[tool.ruff.lint]
select = [
    "E",   # pycodestyle errors
    "F",   # Pyflakes
    "I",   # isort (import ordering)
    "B",   # flake8-bugbear
    "UP",  # pyupgrade
]
# F401 = unused import.
ignore = ["F401"]
|
|
|
|
# Pytest defaults applied to every invocation.
[tool.pytest.ini_options]
# Each flag is grouped with its argument on one line:
#   -n auto       parallel run; this option comes from pytest-xdist
#                 (listed in the `dev` extra)
#   -m "not slow" deselect tests carrying the `slow` marker below
addopts = [
    "--disable-warnings",
    "-n", "auto",
    "-m", "not slow",
]
markers = ["slow: 256-rank full-system tests (~3min each, run with: pytest -m '')"]
|