"""Tests for SIP-level tensor parallelism — ADR-0026 structural model.

DPPolicy no longer carries a ``sip`` axis (ADR-0026 D1). SIP placement is
now expressed structurally: each call to ``resolve_dp_policy(target_sip=N)``
emits shards pinned to SIP N. Multi-SIP parallelism is composed by calling
the resolver once per SIP (typically driven by the ADR-0024 launcher, one
worker greenlet per rank, each worker using ``torch.ahbm.set_device(rank)``).

Covered here:
  SP1. ``target_sip`` stamps every shard.
  SP2. Two-SIP placement: union of two resolver calls covers the whole
       tensor K-axis when the combined bench treats them as column-split.
  SP3. Same for row-wise.
  SP4. Cube + PE sharding within a SIP remains correct across SIPs.
  SP5. PE_CPU num_programs contract (unchanged by ADR-0026).
"""
from __future__ import annotations

from kernbench.policy.placement.dp import DPPolicy, resolve_dp_policy


# ── SP1. target_sip stamps shards ────────────────────────────────────

def test_target_sip_stamps_all_shards():
    """Every shard carries the requested SIP id, a PE index in [0, 8), and cube 0."""
    dp = DPPolicy(cube="replicate", pe="column_wise")
    shards = resolve_dp_policy(
        dp,
        shape=(128, 256),
        itemsize=2,
        num_pe=8,
        num_cubes=1,
        target_sip=3,
    )
    assert all(s.sip == 3 for s in shards)
    assert all(0 <= s.pe < 8 for s in shards)
    assert all(s.cube == 0 for s in shards)


# ── SP2. column-wise placement composed across two SIPs ─────────────

def test_compose_two_sips_column_wise_covers_tensor():
    """Bench splits K-axis across 2 SIPs by calling resolve twice and
    giving each SIP half of the tensor (half-shape + offset). Shards
    from both SIPs together cover the whole K axis."""
    full_shape = (128, 256)
    itemsize = 2
    # Per-SIP half-shape (K split across SIPs).
    half_shape = (128, 128)

    dp = DPPolicy(cube="replicate", pe="column_wise")
    shards_sip0 = resolve_dp_policy(
        dp,
        shape=half_shape,
        itemsize=itemsize,
        num_pe=8,
        num_cubes=1,
        target_sip=0,
    )
    shards_sip1 = resolve_dp_policy(
        dp,
        shape=half_shape,
        itemsize=itemsize,
        num_pe=8,
        num_cubes=1,
        target_sip=1,
    )

    # Coverage is checked by byte count: both halves must add up to the
    # full tensor.
    total_bytes = full_shape[0] * full_shape[1] * itemsize
    sip0_bytes = sum(s.nbytes for s in shards_sip0)
    sip1_bytes = sum(s.nbytes for s in shards_sip1)
    assert sip0_bytes + sip1_bytes == total_bytes
    assert all(s.sip == 0 for s in shards_sip0)
    assert all(s.sip == 1 for s in shards_sip1)


# ── SP3. row-wise placement composed across two SIPs ────────────────

def test_compose_two_sips_row_wise_covers_tensor():
    """Bench splits M-axis across 2 SIPs by calling resolve twice with a
    per-SIP half-shape. The byte counts of both SIPs' shards add up to
    the full tensor, and every shard is stamped with its own SIP."""
    full_shape = (128, 256)
    itemsize = 2
    half_shape = (64, 256)  # per-SIP half of M

    # NOTE(review): the row-wise split here is the SIP-level one (M is
    # halved above); the intra-SIP PE policy is still "column_wise".
    # Presumably intentional — confirm against the ADR-0026 test plan.
    dp = DPPolicy(cube="replicate", pe="column_wise")
    shards_sip0 = resolve_dp_policy(
        dp,
        shape=half_shape,
        itemsize=itemsize,
        num_pe=8,
        num_cubes=1,
        target_sip=0,
    )
    shards_sip1 = resolve_dp_policy(
        dp,
        shape=half_shape,
        itemsize=itemsize,
        num_pe=8,
        num_cubes=1,
        target_sip=1,
    )

    total_bytes = full_shape[0] * full_shape[1] * itemsize
    sip0_bytes = sum(s.nbytes for s in shards_sip0)
    sip1_bytes = sum(s.nbytes for s in shards_sip1)
    assert sip0_bytes + sip1_bytes == total_bytes
    # Same SIP-stamp checks as the column-wise (SP2) composition test.
    assert all(s.sip == 0 for s in shards_sip0)
    assert all(s.sip == 1 for s in shards_sip1)


# ── SP4. cube × PE sharding is independent per SIP ──────────────────

def test_cube_pe_sharding_independent_per_sip():
    """Intra-SIP cube + PE layout matches across SIPs; only sip field differs."""
    dp = DPPolicy(cube="column_wise", pe="column_wise")
    s0 = resolve_dp_policy(
        dp,
        shape=(128, 256),
        itemsize=2,
        num_pe=4,
        num_cubes=2,
        target_sip=0,
    )
    s1 = resolve_dp_policy(
        dp,
        shape=(128, 256),
        itemsize=2,
        num_pe=4,
        num_cubes=2,
        target_sip=1,
    )
    # One shard per (cube, PE) pair on each SIP.
    assert len(s0) == len(s1) == 2 * 4
    for a, b in zip(s0, s1):
        assert a.sip == 0 and b.sip == 1
        assert (a.cube, a.pe, a.offset_bytes, a.nbytes) == (
            b.cube, b.pe, b.offset_bytes, b.nbytes
        )


# ── SP5. PE_CPU num_programs (contract unchanged) ───────────────────

def test_pe_cpu_sets_num_programs():
    """TLContext reports num_programs from its initializer — used by
    PE_CPU when it launches a kernel on behalf of its shards."""
    from kernbench.triton_emu.tl_context import TLContext

    tl = TLContext(pe_id=3, num_programs=8)
    assert tl.program_id(0) == 3
    assert tl.num_programs(0) == 8