Add SIP-level tensor parallelism, component registry YAML, VA offset verification

- DPPolicy: 3-level (sip/cube/pe), unified naming (column_wise/row_wise)
- PE_CPU: auto num_programs from cube shard count
- context.launch(): per-SIP KernelLaunchMsg with local va_base + auto local shape
- deploy_tensor: removed mmus param, MMU mapping is context-only responsibility
- ComponentRegistry: YAML-based lazy loading (components.yaml), impls→builtin rename
- VA offset bench + tests: 2D/1D, standard Triton kernel pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 01:13:17 -07:00
parent 08812eda58
commit 63669f82cb
35 changed files with 813 additions and 219 deletions
@@ -88,7 +88,6 @@ def test_deploy_tensor_assigns_va_base():
    """deploy_tensor with VA allocator assigns va_base to TensorHandle."""
    allocs = _make_allocators()
    va_alloc = _make_va_allocator()
-    mmus = _make_mmus()
    placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)

    th = deploy_tensor(
@@ -98,7 +97,6 @@ def test_deploy_tensor_assigns_va_base():
        placement=placement,
        allocators=allocs,
        va_allocator=va_alloc,
-        mmus=mmus,
    )

    assert th.va_base is not None
@@ -109,7 +107,6 @@ def test_deploy_tensor_va_covers_all_shards():
    """VA allocation covers the entire tensor; each shard is at va_base + offset."""
    allocs = _make_allocators()
    va_alloc = _make_va_allocator()
-    mmus = _make_mmus()
    placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)

    th = deploy_tensor(
@@ -119,41 +116,32 @@ def test_deploy_tensor_va_covers_all_shards():
        placement=placement,
        allocators=allocs,
        va_allocator=va_alloc,
-        mmus=mmus,
    )

-    # Each shard's VA is derivable: va_base + offset_bytes
    for s in th.shards:
        shard_va = th.va_base + s.offset_bytes
        assert shard_va > 0


-def test_deploy_tensor_registers_mmu_mappings():
-    """deploy_tensor registers VA→PA mappings in all PE MMUs."""
+def test_deploy_tensor_does_not_install_mmu_mappings():
+    """deploy_tensor does NOT install MMU mappings — that's context's job."""
    allocs = _make_allocators()
    va_alloc = _make_va_allocator()
    mmus = _make_mmus()
    placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)

-    th = deploy_tensor(
+    deploy_tensor(
        name="W",
        shape=(1024, 512),
        dtype="fp16",
        placement=placement,
        allocators=allocs,
        va_allocator=va_alloc,
-        mmus=mmus,
    )

-    # Every MMU should have entries (broadcast)
+    # No MMU should have any entries (mappings come from fabric MmuMapMsg)
    for mmu in mmus.values():
-        assert mmu.num_entries > 0
-
-    # Each shard's derived VA should translate to its PA in every MMU
-    for mmu in mmus.values():
-        for s in th.shards:
-            shard_va = th.va_base + s.offset_bytes
-            assert mmu.translate(shard_va) == s.pa
+        assert mmu.num_entries == 0


 # ── T12. Tensor.va property ──────────────────────────────────────────
@@ -165,7 +153,6 @@ def test_tensor_va_property():

    allocs = _make_allocators(1)
    va_alloc = _make_va_allocator()
-    mmus = _make_mmus(1)
    placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=4096)]

    t = Tensor(shape=(2048,), dtype="f16", name="test")
@@ -176,7 +163,6 @@ def test_tensor_va_property():
        placement=placement,
        allocators=allocs,
        va_allocator=va_alloc,
-        mmus=mmus,
    )
    assert t.va > 0
    assert t.va == t._handle.va_base