ADR-0026: DPPolicy intra-device only + ShardSpec structural coords
DPPolicy no longer carries a cross-SIP axis. SIP-level placement is solely controlled by torch.ahbm.set_device(rank) (ADR-0024); DPPolicy itself describes only the cube × PE layout within one SIP.

ShardSpec switches to structural (sip, cube, pe) coordinates; the flat pe_index field/property is fully removed — silent drift between global-flat and SIP-local interpretations was a foot-gun flagged by ADR-0024 D11.

Breaking API (explicit TypeError / AttributeError):
- DPPolicy(sip=...) / DPPolicy(num_sips=...) -> TypeError
- ShardSpec.pe_index -> AttributeError
- ShardSpec(pe_index=...) -> TypeError
- resolve_dp_policy now takes target_sip= (required), no num_sips.

Downstream migration:
- PE allocator dict keyed by (sip, cube, pe) tuples, in both _ensure_allocators and _free_tensor. deploy_tensor uses tuple lookup.
- _create_tensor passes target_sip=current_sip; post-hoc pe_index shifting removed entirely.
- launch._compute_local_shape drops the dp.sip branch.
- Internal resolvers (column_wise / row_wise / replicate / tiled_*) return _LocalPeShard (cube-local identifier) instead of ShardSpec — resolve_dp_policy lifts them to full structural coords.

Tests:
- New tests/test_adr0026_dppolicy_intra_device.py (12 tests) pins the contract end-to-end.
- test_sip_parallel.py rewritten: SIP composition now modeled as two resolve_dp_policy(target_sip=...) calls (ADR-0024 launcher style).
- Call-site migration: test_tensor, test_va_integration, test_va_offset, test_runtime_api_tensor, test_tl_recv_async, test_ccl_* and benches gemm_single_pe, gpt3_qkv, va_offset_verify, ccl_allreduce (legacy branch) all use intra-device DPPolicy and structural ShardSpec.

Result: 523 passed, 1 strict xfail (ring_default_ws — unchanged ADR-0024 Phase B blocker; architectural fix deferred to ADR-0027).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+23
-17
@@ -2,11 +2,13 @@ import pytest
|
||||
|
||||
from kernbench.policy.address.allocator import AddressConfig, AllocationError, PEMemAllocator
|
||||
from kernbench.policy.placement.dp import (
|
||||
DPPolicy,
|
||||
ShardSpec,
|
||||
column_wise,
|
||||
tiled_column_major,
|
||||
replicate,
|
||||
resolve_dp_policy,
|
||||
row_wise,
|
||||
tiled_column_major,
|
||||
tiled_row_major,
|
||||
)
|
||||
from kernbench.runtime_api.kernel import (
|
||||
@@ -40,9 +42,9 @@ _CFG = AddressConfig(
|
||||
)
|
||||
|
||||
|
||||
def _make_allocators(num_pe: int = 8) -> dict[int, PEMemAllocator]:
|
||||
def _make_allocators(num_pe: int = 8) -> dict[tuple[int, int, int], PEMemAllocator]:
|
||||
return {
|
||||
i: PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG)
|
||||
(0, 0, i): PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG)
|
||||
for i in range(num_pe)
|
||||
}
|
||||
|
||||
@@ -133,7 +135,7 @@ def test_column_wise_placement():
|
||||
assert len(shards) == 8
|
||||
expected_nbytes = 1024 * 64 * 2 # 128 KB
|
||||
for i, s in enumerate(shards):
|
||||
assert s.pe_index == i
|
||||
assert s.local_pe == i
|
||||
assert s.nbytes == expected_nbytes
|
||||
# offsets are contiguous
|
||||
assert shards[0].offset_bytes == 0
|
||||
@@ -151,7 +153,7 @@ def test_row_wise_placement():
|
||||
assert len(shards) == 8
|
||||
expected_nbytes = 128 * 512 * 2 # 128 KB
|
||||
for i, s in enumerate(shards):
|
||||
assert s.pe_index == i
|
||||
assert s.local_pe == i
|
||||
assert s.nbytes == expected_nbytes
|
||||
assert shards[0].offset_bytes == 0
|
||||
assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
|
||||
@@ -166,7 +168,7 @@ def test_replicate_placement():
|
||||
assert len(shards) == 8
|
||||
full_nbytes = 1024 * 512 * 2 # 1 MB
|
||||
for i, s in enumerate(shards):
|
||||
assert s.pe_index == i
|
||||
assert s.local_pe == i
|
||||
assert s.nbytes == full_nbytes
|
||||
assert s.offset_bytes == 0 # each is a full copy
|
||||
|
||||
@@ -188,10 +190,10 @@ def test_tiled_column_major():
|
||||
# tile (m=0,k=0) → PE0, tile (m=0,k=1) → PE1, ..., (m=0,k=3) → PE3
|
||||
# tile (m=1,k=0) → PE4, tile (m=1,k=1) → PE5, ..., (m=1,k=3) → PE7
|
||||
# tile (m=2,k=0) → PE0, ...
|
||||
assert shards[0].pe_index == 0
|
||||
assert shards[1].pe_index == 1
|
||||
assert shards[7].pe_index == 7
|
||||
assert shards[8].pe_index == 0 # wraps around
|
||||
assert shards[0].local_pe == 0
|
||||
assert shards[1].local_pe == 1
|
||||
assert shards[7].local_pe == 7
|
||||
assert shards[8].local_pe == 0 # wraps around
|
||||
# total coverage
|
||||
assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
|
||||
|
||||
@@ -212,10 +214,10 @@ def test_tiled_row_major():
|
||||
# tile (m=0,k=0) → PE0, tile (m=1,k=0) → PE1, ..., (m=3,k=0) → PE3
|
||||
# tile (m=0,k=1) → PE4, tile (m=1,k=1) → PE5, ..., (m=3,k=1) → PE7
|
||||
# tile (m=0,k=2) → PE0, ...
|
||||
assert shards[0].pe_index == 0
|
||||
assert shards[1].pe_index == 1
|
||||
assert shards[7].pe_index == 7
|
||||
assert shards[8].pe_index == 0 # wraps around
|
||||
assert shards[0].local_pe == 0
|
||||
assert shards[1].local_pe == 1
|
||||
assert shards[7].local_pe == 7
|
||||
assert shards[8].local_pe == 0 # wraps around
|
||||
# total coverage
|
||||
assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
|
||||
|
||||
@@ -226,7 +228,11 @@ def test_tiled_row_major():
|
||||
def test_deploy_tensor_hbm():
|
||||
"""Deploy with column_wise placement → TensorHandle with valid PA shards."""
|
||||
allocs = _make_allocators()
|
||||
placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
|
||||
placement = resolve_dp_policy(
|
||||
DPPolicy(cube="replicate", pe="column_wise"),
|
||||
shape=(1024, 512), itemsize=2,
|
||||
num_pe=8, num_cubes=1, target_sip=0,
|
||||
)
|
||||
th = deploy_tensor(
|
||||
name="W",
|
||||
shape=(1024, 512),
|
||||
@@ -253,7 +259,7 @@ def test_deploy_tensor_hbm():
|
||||
def test_deploy_tensor_tcm():
|
||||
"""Deploy with TCM → uses pe_tcm_addr allocation."""
|
||||
allocs = _make_allocators()
|
||||
placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=256)]
|
||||
placement = [ShardSpec(sip=0, cube=0, pe=0, offset_bytes=0, nbytes=256)]
|
||||
th = deploy_tensor(
|
||||
name="small",
|
||||
shape=(128,),
|
||||
@@ -271,7 +277,7 @@ def test_deploy_tensor_overflow():
|
||||
"""Allocation exceeding PE HBM capacity raises AllocationError."""
|
||||
allocs = _make_allocators()
|
||||
# 6 GB per PE slice, try to allocate 7 GB
|
||||
big_shard = ShardSpec(pe_index=0, offset_bytes=0, nbytes=7 * _GB)
|
||||
big_shard = ShardSpec(sip=0, cube=0, pe=0, offset_bytes=0, nbytes=7 * _GB)
|
||||
with pytest.raises(AllocationError):
|
||||
deploy_tensor(
|
||||
name="toobig",
|
||||
|
||||
Reference in New Issue
Block a user