Add SIP-level tensor parallelism, component registry YAML, VA offset verification

- DPPolicy: 3-level (sip/cube/pe), unified naming (column_wise/row_wise)
- PE_CPU: auto num_programs from cube shard count
- context.launch(): per-SIP KernelLaunchMsg with local va_base + auto local shape
- deploy_tensor: removed the mmus param; installing MMU mappings is now solely the context's responsibility
- ComponentRegistry: YAML-based lazy loading (components.yaml); renamed impls → builtin
- VA offset bench + tests: 2D/1D, standard Triton kernel pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-26 01:13:17 -07:00
parent 08812eda58
commit 63669f82cb
35 changed files with 813 additions and 219 deletions
+5 -19
View File
@@ -88,7 +88,6 @@ def test_deploy_tensor_assigns_va_base():
"""deploy_tensor with VA allocator assigns va_base to TensorHandle."""
allocs = _make_allocators()
va_alloc = _make_va_allocator()
mmus = _make_mmus()
placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
th = deploy_tensor(
@@ -98,7 +97,6 @@ def test_deploy_tensor_assigns_va_base():
placement=placement,
allocators=allocs,
va_allocator=va_alloc,
mmus=mmus,
)
assert th.va_base is not None
@@ -109,7 +107,6 @@ def test_deploy_tensor_va_covers_all_shards():
"""VA allocation covers the entire tensor; each shard is at va_base + offset."""
allocs = _make_allocators()
va_alloc = _make_va_allocator()
mmus = _make_mmus()
placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
th = deploy_tensor(
@@ -119,41 +116,32 @@ def test_deploy_tensor_va_covers_all_shards():
placement=placement,
allocators=allocs,
va_allocator=va_alloc,
mmus=mmus,
)
# Each shard's VA is derivable: va_base + offset_bytes
for s in th.shards:
shard_va = th.va_base + s.offset_bytes
assert shard_va > 0
def test_deploy_tensor_registers_mmu_mappings():
"""deploy_tensor registers VA→PA mappings in all PE MMUs."""
def test_deploy_tensor_does_not_install_mmu_mappings():
"""deploy_tensor does NOT install MMU mappings — that's context's job."""
allocs = _make_allocators()
va_alloc = _make_va_allocator()
mmus = _make_mmus()
placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
th = deploy_tensor(
deploy_tensor(
name="W",
shape=(1024, 512),
dtype="fp16",
placement=placement,
allocators=allocs,
va_allocator=va_alloc,
mmus=mmus,
)
# Every MMU should have entries (broadcast)
# No MMU should have any entries (mappings come from fabric MmuMapMsg)
for mmu in mmus.values():
assert mmu.num_entries > 0
# Each shard's derived VA should translate to its PA in every MMU
for mmu in mmus.values():
for s in th.shards:
shard_va = th.va_base + s.offset_bytes
assert mmu.translate(shard_va) == s.pa
assert mmu.num_entries == 0
# ── T12. Tensor.va property ──────────────────────────────────────────
@@ -165,7 +153,6 @@ def test_tensor_va_property():
allocs = _make_allocators(1)
va_alloc = _make_va_allocator()
mmus = _make_mmus(1)
placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=4096)]
t = Tensor(shape=(2048,), dtype="f16", name="test")
@@ -176,7 +163,6 @@ def test_tensor_va_property():
placement=placement,
allocators=allocs,
va_allocator=va_alloc,
mmus=mmus,
)
assert t.va > 0
assert t.va == t._handle.va_base