ADR-0023 D9.7: IPCQ slot-memory latency model (TCM/SRAM/HBM)
Charge per-tier bandwidth + setup overhead at IPCQ slot WRITE
(receiver inbound DMA, in pe_dma._handle_ipcq_inbound) and slot
READ (recv consume, in pe_ipcq._handle_recv). Tier table
(common/ipcq_types.py):
tcm : 512 GB/s, 0 ns
sram : 128 GB/s, 2 ns
hbm : 32 GB/s, 6 ns
Before this change, slot read/write was free regardless of
buffer_kind, making memory-tier choice invisible in simulated
latency. After the change, swapping buffer_kind in ccl.yaml
produces measurable per-tier separation in allreduce latency.
Tests:
test_ipcq_buffer_kind_latency.py — three micro-tests asserting
tcm < sram < hbm ordering, payload-scaling, and that
buffer_kind sensitivity grows with payload (credit-only path
stays fabric-bound).
test_allreduce_buffer_kind_sweep.py — 12-config parametrized
sweep emitting buffer_kind_sweep.png (3 lines, torus_2d).
conftest sessionfinish hook generalised to dispatch multiple
sweep aggregators (allreduce + buffer-kind).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+21
-17
@@ -27,23 +27,27 @@ def pytest_sessionfinish(session, exitstatus):
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
mod_path = Path(__file__).parent / "test_allreduce_multidevice.py"
|
||||
if not mod_path.exists():
|
||||
return
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
"_test_allreduce_multidevice_for_aggregate", mod_path,
|
||||
)
|
||||
if spec is None or spec.loader is None:
|
||||
return
|
||||
mod = importlib.util.module_from_spec(spec)
|
||||
sys.modules[spec.name] = mod
|
||||
try:
|
||||
spec.loader.exec_module(mod)
|
||||
agg = getattr(mod, "_aggregate_sweep_plots", None)
|
||||
if agg is not None:
|
||||
agg()
|
||||
except Exception as e:
|
||||
print(f"[conftest] sweep aggregation failed: {e}")
|
||||
def _exec(name: str, attr: str) -> None:
    """Load a sibling test module by file name and call one of its hooks.

    The module is looked up in the directory containing this file. A
    missing file, an unloadable spec, or a missing ``attr`` are silently
    ignored; any exception raised while executing the module or calling
    the hook is reported on stdout and swallowed (best-effort behaviour).

    Args:
        name: File name of the module to load, e.g. ``"test_foo.py"``.
        attr: Name of the zero-argument callable to invoke in that module.
    """
    mod_path = Path(__file__).parent / name
    if not mod_path.exists():
        return

    # Load under a private alias ("_<stem>_for_aggregate") rather than
    # the module's own name.
    spec = importlib.util.spec_from_file_location(
        f"_{name.removesuffix('.py')}_for_aggregate", mod_path,
    )
    if spec is None or spec.loader is None:
        return

    mod = importlib.util.module_from_spec(spec)
    # Register before executing, as the importlib recipe for loading a
    # source file directly does.
    sys.modules[spec.name] = mod
    try:
        spec.loader.exec_module(mod)
    except Exception as e:
        # Do not leave a half-initialised module behind in sys.modules;
        # the regular import machinery removes it on failure as well.
        sys.modules.pop(spec.name, None)
        print(f"[conftest] aggregator {attr}() in {name} failed: {e}")
        return

    try:
        fn = getattr(mod, attr, None)
        if fn is not None:
            fn()
    except Exception as e:
        print(f"[conftest] aggregator {attr}() in {name} failed: {e}")
|
||||
|
||||
_exec("test_allreduce_multidevice.py", "_aggregate_sweep_plots")
|
||||
_exec("test_allreduce_buffer_kind_sweep.py", "aggregate_buffer_kind_plot")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
|
||||
Reference in New Issue
Block a user