Add virtual memory support: PE_MMU, VA allocator, fabric MmuMapMsg

Implement the VA/MMU layer (ADR-0011 Phase 1), enabling Triton kernels to use contiguous virtual addresses on sharded tensors.

Key changes:
- PE_MMU component: hybrid inbox (MmuMapMsg) + sync translate() for PE_DMA
- VirtualAllocator + PEMemAllocator: free-list with coalescing
- MmuMapMsg/MmuUnmapMsg fabric path with SIP-level routing
- DPPolicy-based mapping: replicate=local, sharded=broadcast
- Tensor lifecycle: del + weakref cleanup, context manager
- Rename: TensorHandle.pa→addr, DmaReadCmd.src_pa→src_addr, ctx→torch

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 00:01:47 -07:00
parent 62fb01ae18
commit 08812eda58
34 changed files with 2131 additions and 139 deletions
@@ -17,31 +17,32 @@ def test_full_graph_node_count():
    g = _graph()
    # 1 switch
    # + 2 SIPs × (1 IO × (3 comps + 4 io_ucie + 16 io_conn)
-    #            + 16 cubes × (cube_comps + 8 PEs × 6 pe_comps))
+    #            + 16 cubes × (cube_comps + 8 PEs × 7 pe_comps))
    #   IO: pcie_ep + io_cpu + io_noc + 4 io_ucie + 4*4 io_conn = 23
    #   cube_comps: 9 (noc, m_cpu, sram, 2 bridge, 4 ucie)
    #             + 16 ucie_conn (4 ports × 4 connections)
    #             + 2 xbar_top/bot
    #             + 8 hbm_slices = 35
-    # = 1 + 2*(23 + 16*(35+48)) = 1 + 2*(23+1328) = 1 + 2702 = 2703
-    assert len(g.nodes) == 2703
+    #   pe_comps: 7 (pe_cpu, pe_scheduler, pe_dma, pe_gemm, pe_math, pe_mmu, pe_tcm)
+    # = 1 + 2*(23 + 16*(35+56)) = 1 + 2*(23+1456) = 1 + 2958 = 2959
+    assert len(g.nodes) == 2959


 def test_full_graph_edge_count():
    g = _graph()
-    # Per cube: 184
+    # Per cube: 192
    #   PE-internal: 56
-    #   PE_DMA→noc: 8, noc→pe_dma: 8, noc→pe_cpu: 8, pe_cpu→noc: 8
+    #   PE_DMA→noc: 8, noc→pe_dma: 8, noc→pe_cpu: 8, pe_cpu→noc: 8, noc→pe_mmu: 8
    #   xbar_top→hbm{0..3}: 4+4=8, xbar_bot→hbm{4..7}: 4+4=8
    #   noc↔xbar_top: 2, noc↔xbar_bot: 2
    #   xbar_top↔bridge.left: 2, bridge.left↔xbar_bot: 2
    #   xbar_top↔bridge.right: 2, bridge.right↔xbar_bot: 2
    #   ucie: 64, m_cpu↔noc: 2, noc↔sram: 2
-    #   Total: 56+8+8+8+8+8+8+2+2+2+2+2+2+64+2+2 = 184
+    #   Total: 56+8+8+8+8+8+8+8+2+2+2+2+2+2+64+2+2 = 192
    # IO edges per SIP: 77
-    # Per SIP: 16*184 + 48 inter-cube + 77 IO = 3069
-    # Total: 2 * 3069 = 6138
-    assert len(g.edges) == 6138
+    # Per SIP: 16*192 + 48 inter-cube + 77 IO = 3197
+    # Total: 2 * 3197 = 6394
+    assert len(g.edges) == 6394


 # ── Full graph: specific nodes exist ─────────────────────────────────
@@ -267,7 +268,7 @@ def test_cube_view_pe_to_noc():
 def test_pe_view_has_all_components():
    v = _graph().pe_view
    assert set(v.nodes.keys()) == {
-        "pe_cpu", "pe_scheduler", "pe_dma", "pe_gemm", "pe_math", "pe_tcm"
+        "pe_cpu", "pe_scheduler", "pe_dma", "pe_gemm", "pe_math", "pe_mmu", "pe_tcm"
    }