Add virtual memory support: PE_MMU, VA allocator, fabric MmuMapMsg

Implement the VA/MMU layer (ADR-0011 Phase 1), enabling Triton kernels to use contiguous virtual addresses on sharded tensors.

Key changes:
- PE_MMU component: hybrid inbox (MmuMapMsg) + sync translate() for PE_DMA
- VirtualAllocator + PEMemAllocator: free-list with coalescing
- MmuMapMsg/MmuUnmapMsg fabric path with SIP-level routing
- DPPolicy-based mapping: replicate=local, sharded=broadcast
- Tensor lifecycle: del + weakref cleanup, context manager
- Rename: TensorHandle.pa→addr, DmaReadCmd.src_pa→src_addr, ctx→torch

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 00:01:47 -07:00
parent 62fb01ae18
commit 08812eda58
34 changed files with 2131 additions and 139 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import bisect
 from dataclasses import dataclass

 from kernbench.policy.address.phyaddr import PhysAddr
@@ -29,6 +30,63 @@ class AddressConfig:
        return self.tcm_bytes_per_pe - self.tcm_scheduler_reserved_bytes


+class _FreeList:
+    """Offset-based first-fit free-list allocator with coalescing.
+
+    Manages the range ``[0, capacity)`` as a list of free ``(offset, size)``
+    blocks kept sorted by offset. ``free`` merges the returned range with
+    touching neighbours so contiguous free space is always one block.
+    """
+
+    def __init__(self, capacity: int) -> None:
+        self._capacity = capacity
+        self._used = 0
+        # Sorted by offset; starts as a single block covering the whole range.
+        self._free: list[tuple[int, int]] = [(0, capacity)]  # (offset, size)
+
+    @property
+    def used(self) -> int:
+        """Number of bytes currently allocated."""
+        return self._used
+
+    @property
+    def total(self) -> int:
+        """Capacity this free-list was created with."""
+        return self._capacity
+
+    def alloc(self, nbytes: int) -> int:
+        """Allocate ``nbytes`` from the first block that fits; return its offset.
+
+        A zero-byte request returns the start of the first free block and
+        leaves the allocator state untouched.
+
+        Raises:
+            ValueError: if ``nbytes`` is negative.
+            AllocationError: if no single free block is large enough.
+        """
+        if nbytes < 0:
+            # A negative size would *grow* the free block and push ``used``
+            # below zero, silently corrupting the list.
+            raise ValueError(f"nbytes must be non-negative, got {nbytes}")
+        for i, (start, size) in enumerate(self._free):
+            if size < nbytes:
+                continue
+            if nbytes == 0:
+                return start
+            if size == nbytes:
+                self._free.pop(i)
+            else:
+                self._free[i] = (start + nbytes, size - nbytes)
+            self._used += nbytes
+            return start
+        raise AllocationError(
+            f"overflow: need {nbytes}, "
+            f"largest free block {max((s for _, s in self._free), default=0)}"
+        )
+
+    def free(self, offset: int, nbytes: int) -> None:
+        """Return ``[offset, offset + nbytes)`` to the free-list with coalescing.
+
+        A zero-byte free is a no-op.
+
+        Raises:
+            ValueError: if ``nbytes`` is negative, the range falls outside
+                ``[0, capacity)``, or it overlaps a block that is already
+                free (double free).
+        """
+        if nbytes < 0:
+            raise ValueError(f"nbytes must be non-negative, got {nbytes}")
+        if nbytes == 0:
+            # An empty (offset, 0) entry would sit between real blocks and
+            # stop them from ever being merged, fragmenting the list.
+            return
+
+        new_start = offset
+        new_end = offset + nbytes
+        if new_start < 0 or new_end > self._capacity:
+            raise ValueError(
+                f"free range [{new_start}, {new_end}) is outside "
+                f"[0, {self._capacity})"
+            )
+
+        # Index of the first free block starting at or after ``offset``; the
+        # 1-tuple sorts before any (offset, size) pair with the same offset.
+        idx = bisect.bisect_left(self._free, (offset,))
+
+        # Validate before mutating so a rejected call leaves state intact.
+        if idx > 0:
+            prev_start, prev_size = self._free[idx - 1]
+            if prev_start + prev_size > new_start:
+                raise ValueError(
+                    f"free range [{new_start}, {new_end}) overlaps free block "
+                    f"at {prev_start} (double free?)"
+                )
+        if idx < len(self._free) and self._free[idx][0] < new_end:
+            raise ValueError(
+                f"free range [{new_start}, {new_end}) overlaps free block "
+                f"at {self._free[idx][0]} (double free?)"
+            )
+
+        self._used -= nbytes
+
+        # Coalesce with previous block
+        if idx > 0:
+            prev_start, prev_size = self._free[idx - 1]
+            if prev_start + prev_size == new_start:
+                new_start = prev_start
+                idx -= 1
+                self._free.pop(idx)
+
+        # Coalesce with next block
+        if idx < len(self._free):
+            next_start, next_size = self._free[idx]
+            if new_end == next_start:
+                new_end = next_start + next_size
+                self._free.pop(idx)
+
+        self._free.insert(idx, (new_start, new_end - new_start))
+
+
 class PEMemAllocator:
    def __init__(
        self, rack_id: int, sip_id: int, cube_id: int, pe_id: int, cfg: AddressConfig,
@@ -38,39 +96,48 @@ class PEMemAllocator:
        self._cube_id = cube_id
        self._pe_id = pe_id
        self._cfg = cfg
-        self._hbm_cursor = 0
-        self._tcm_cursor = 0
+        self._hbm = _FreeList(cfg.hbm_slice_bytes)
+        self._tcm = _FreeList(cfg.tcm_allocatable_bytes)

    def alloc_hbm(self, nbytes: int) -> PhysAddr:
-        if self._hbm_cursor + nbytes > self._cfg.hbm_slice_bytes:
+        try:
+            offset = self._hbm.alloc(nbytes)
+        except AllocationError:
            raise AllocationError(
                f"HBM overflow: need {nbytes}, "
-                f"available {self._cfg.hbm_slice_bytes - self._hbm_cursor}"
+                f"available {self._cfg.hbm_slice_bytes - self._hbm.used}"
            )
-        pa = PhysAddr.pe_hbm_addr(
+        return PhysAddr.pe_hbm_addr(
            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
-            pe_id=self._pe_id, pe_local_hbm_offset=self._hbm_cursor,
+            pe_id=self._pe_id, pe_local_hbm_offset=offset,
            slice_size_bytes=self._cfg.hbm_slice_bytes,
        )
-        self._hbm_cursor += nbytes
-        return pa
+
+    def free_hbm(self, pa: PhysAddr, nbytes: int) -> None:
+        """Return ``nbytes`` of HBM starting at ``pa`` to this PE's HBM free-list.
+
+        The free-list works in PE-local offsets, so the start of this PE's
+        slice (``pe_id * hbm_slice_bytes``) is subtracted from
+        ``pa.hbm_offset`` before the range is released.
+        """
+        # NOTE(review): assumes pa.hbm_offset is laid out as
+        # pe_id * hbm_slice_bytes + local offset — confirm against
+        # PhysAddr.pe_hbm_addr.
+        slice_start = self._pe_id * self._cfg.hbm_slice_bytes
+        self._hbm.free(pa.hbm_offset - slice_start, nbytes)

    def alloc_tcm(self, nbytes: int) -> PhysAddr:
-        if self._tcm_cursor + nbytes > self._cfg.tcm_allocatable_bytes:
+        try:
+            offset = self._tcm.alloc(nbytes)
+        except AllocationError:
            raise AllocationError(
                f"TCM overflow: need {nbytes}, "
-                f"available {self._cfg.tcm_allocatable_bytes - self._tcm_cursor}"
+                f"available {self._cfg.tcm_allocatable_bytes - self._tcm.used}"
            )
-        pa = PhysAddr.pe_tcm_addr(
+        return PhysAddr.pe_tcm_addr(
            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
-            pe_id=self._pe_id, tcm_offset=self._tcm_cursor,
+            pe_id=self._pe_id, tcm_offset=offset,
        )
-        self._tcm_cursor += nbytes
-        return pa
+
+    def free_tcm(self, pa: PhysAddr, nbytes: int) -> None:
+        """Return ``nbytes`` of TCM starting at ``pa`` to this PE's TCM free-list.
+
+        ``pa.sub_offset`` is passed through unchanged as the free-list offset.
+        """
+        tcm_offset = pa.sub_offset
+        self._tcm.free(tcm_offset, nbytes)

    @property
    def hbm_used(self) -> int:
-        return self._hbm_cursor
+        return self._hbm.used

    @property
    def hbm_total(self) -> int:
@@ -78,7 +145,7 @@ class PEMemAllocator:

    @property
    def tcm_used(self) -> int:
-        return self._tcm_cursor
+        return self._tcm.used

    @property
    def tcm_total(self) -> int: