08812eda58
Implement VA/MMU layer (ADR-0011 Phase 1) enabling Triton kernels to use contiguous virtual addresses on sharded tensors. Key changes: - PE_MMU component: hybrid inbox (MmuMapMsg) + sync translate() for PE_DMA - VirtualAllocator + PEMemAllocator: free-list with coalescing - MmuMapMsg/MmuUnmapMsg fabric path with SIP-level routing - DPPolicy-based mapping: replicate=local, sharded=broadcast - Tensor lifecycle: del + weakref cleanup, context manager - Rename: TensorHandle.pa→addr, DmaReadCmd.src_pa→src_addr, ctx→torch Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
153 lines
4.6 KiB
Python
153 lines
4.6 KiB
Python
from __future__ import annotations
|
|
|
|
import bisect
|
|
from dataclasses import dataclass
|
|
|
|
from kernbench.policy.address.phyaddr import PhysAddr
|
|
|
|
|
|
class AllocationError(Exception):
    """Raised by the allocators in this module when a request cannot be satisfied."""
|
|
|
|
|
|
@dataclass(frozen=True)
class AddressConfig:
    """Immutable sizing parameters consumed by the memory allocators.

    The fields are plain integers; the two properties derive the per-slice
    HBM size and the allocatable portion of a PE's TCM from them.
    """

    # Topology counts.
    sip_count: int
    cubes_per_sip: int
    pes_per_cube: int
    # HBM sizing: total bytes per cube and the number of slices it is split into.
    hbm_bytes_per_cube: int
    hbm_slices_per_cube: int
    # TCM sizing: total bytes per PE and the part subtracted as scheduler-reserved.
    tcm_bytes_per_pe: int
    tcm_scheduler_reserved_bytes: int
    sram_bytes_per_cube: int

    @property
    def hbm_slice_bytes(self) -> int:
        """Return the size of one HBM slice (floor division of cube HBM by slice count)."""
        total_bytes = self.hbm_bytes_per_cube
        slice_count = self.hbm_slices_per_cube
        return total_bytes // slice_count

    @property
    def tcm_allocatable_bytes(self) -> int:
        """Return the TCM bytes per PE left after the scheduler reservation."""
        reserved = self.tcm_scheduler_reserved_bytes
        return self.tcm_bytes_per_pe - reserved
|
|
|
|
|
|
class _FreeList:
    """Offset-based free-list allocator with coalescing.

    Free space is tracked as a list of ``(offset, size)`` tuples kept sorted
    by offset. ``alloc`` is first-fit over that list; ``free`` merges the
    returned range with directly adjacent free blocks.
    """

    def __init__(self, capacity: int) -> None:
        """Create an allocator managing offsets ``[0, capacity)``."""
        self._capacity = capacity
        self._used = 0
        self._free: list[tuple[int, int]] = [(0, capacity)]  # (offset, size)

    @property
    def used(self) -> int:
        """Number of bytes currently allocated."""
        return self._used

    @property
    def total(self) -> int:
        """Capacity given at construction time."""
        return self._capacity

    def alloc(self, nbytes: int) -> int:
        """Allocate ``nbytes`` from the first free block that fits.

        Returns:
            The offset of the allocated range.

        Raises:
            ValueError: if ``nbytes`` is negative (a negative size would
                otherwise grow a free block past its real bounds).
            AllocationError: if no single free block is large enough.
        """
        if nbytes < 0:
            raise ValueError(f"nbytes must be non-negative, got {nbytes}")

        for i, (start, size) in enumerate(self._free):
            if size < nbytes:
                continue
            if size == nbytes:
                # Exact fit: the block disappears from the free-list.
                self._free.pop(i)
            else:
                # Carve the allocation off the front of the block.
                self._free[i] = (start + nbytes, size - nbytes)
            self._used += nbytes
            return start

        raise AllocationError(
            f"overflow: need {nbytes}, "
            f"largest free block {max((s for _, s in self._free), default=0)}"
        )

    def free(self, offset: int, nbytes: int) -> None:
        """Return a range to the free-list with coalescing.

        The range is validated before any state is changed, so a rejected
        call leaves the allocator untouched. Freeing zero bytes is a no-op
        (it must not insert an empty block into the free-list).

        Raises:
            ValueError: if ``nbytes`` is negative, the range lies outside
                ``[0, capacity)``, or it overlaps space that is already
                free (e.g. a double free).
        """
        if nbytes < 0:
            raise ValueError(f"nbytes must be non-negative, got {nbytes}")
        if nbytes == 0:
            return

        new_start = offset
        new_end = offset + nbytes
        if new_start < 0 or new_end > self._capacity:
            raise ValueError(
                f"free range [{new_start}, {new_end}) is outside "
                f"[0, {self._capacity})"
            )

        # (offset,) sorts before any (offset, size), so idx is the first free
        # block starting at or after `offset`; idx - 1 starts strictly before.
        idx = bisect.bisect_left(self._free, (offset,))
        prev_block = self._free[idx - 1] if idx > 0 else None
        next_block = self._free[idx] if idx < len(self._free) else None

        # Reject overlap with already-free space before touching any state.
        if prev_block is not None and prev_block[0] + prev_block[1] > new_start:
            raise ValueError(
                f"free range [{new_start}, {new_end}) overlaps free block "
                f"at offset {prev_block[0]} (double free?)"
            )
        if next_block is not None and next_block[0] < new_end:
            raise ValueError(
                f"free range [{new_start}, {new_end}) overlaps free block "
                f"at offset {next_block[0]} (double free?)"
            )

        self._used -= nbytes

        # Coalesce with previous block
        if prev_block is not None and prev_block[0] + prev_block[1] == new_start:
            new_start = prev_block[0]
            idx -= 1
            self._free.pop(idx)

        # Coalesce with next block (now at `idx` whether or not prev was popped)
        if next_block is not None and next_block[0] == new_end:
            new_end = next_block[0] + next_block[1]
            self._free.pop(idx)

        self._free.insert(idx, (new_start, new_end - new_start))
|
|
|
|
|
|
class PEMemAllocator:
    """Allocator for one PE: a free-list over its HBM slice and one over its TCM.

    Offsets handed out by the free-lists are local to the PE; the ``alloc_*``
    methods wrap them into ``PhysAddr`` values using the ids given at
    construction time.
    """

    def __init__(
        self, rack_id: int, sip_id: int, cube_id: int, pe_id: int, cfg: AddressConfig,
    ) -> None:
        """Record the PE's coordinates and size both free-lists from ``cfg``."""
        self._cfg = cfg
        self._rack_id, self._sip_id = rack_id, sip_id
        self._cube_id, self._pe_id = cube_id, pe_id
        self._hbm = _FreeList(cfg.hbm_slice_bytes)
        self._tcm = _FreeList(cfg.tcm_allocatable_bytes)

    def alloc_hbm(self, nbytes: int) -> PhysAddr:
        """Allocate ``nbytes`` of HBM and return its physical address.

        Raises:
            AllocationError: if the HBM free-list cannot satisfy the request.
        """
        try:
            local_offset = self._hbm.alloc(nbytes)
        except AllocationError:
            remaining = self._cfg.hbm_slice_bytes - self._hbm.used
            raise AllocationError(
                f"HBM overflow: need {nbytes}, "
                f"available {remaining}"
            )
        return PhysAddr.pe_hbm_addr(
            rack_id=self._rack_id,
            sip_id=self._sip_id,
            cube_id=self._cube_id,
            pe_id=self._pe_id,
            pe_local_hbm_offset=local_offset,
            slice_size_bytes=self._cfg.hbm_slice_bytes,
        )

    def free_hbm(self, pa: PhysAddr, nbytes: int) -> None:
        """Release ``nbytes`` of HBM starting at ``pa``."""
        # The PA's hbm_offset is reduced by this PE's slice base
        # (pe_id * slice size) to recover the PE-local offset.
        slice_base = self._pe_id * self._cfg.hbm_slice_bytes
        self._hbm.free(pa.hbm_offset - slice_base, nbytes)

    def alloc_tcm(self, nbytes: int) -> PhysAddr:
        """Allocate ``nbytes`` of TCM and return its physical address.

        Raises:
            AllocationError: if the TCM free-list cannot satisfy the request.
        """
        try:
            local_offset = self._tcm.alloc(nbytes)
        except AllocationError:
            remaining = self._cfg.tcm_allocatable_bytes - self._tcm.used
            raise AllocationError(
                f"TCM overflow: need {nbytes}, "
                f"available {remaining}"
            )
        return PhysAddr.pe_tcm_addr(
            rack_id=self._rack_id,
            sip_id=self._sip_id,
            cube_id=self._cube_id,
            pe_id=self._pe_id,
            tcm_offset=local_offset,
        )

    def free_tcm(self, pa: PhysAddr, nbytes: int) -> None:
        """Release ``nbytes`` of TCM; the PA's ``sub_offset`` is used as the offset."""
        tcm_offset = pa.sub_offset
        self._tcm.free(tcm_offset, nbytes)

    @property
    def hbm_used(self) -> int:
        """HBM bytes currently allocated."""
        return self._hbm.used

    @property
    def hbm_total(self) -> int:
        """Size of the HBM slice, taken from the config."""
        return self._cfg.hbm_slice_bytes

    @property
    def tcm_used(self) -> int:
        """TCM bytes currently allocated."""
        return self._tcm.used

    @property
    def tcm_total(self) -> int:
        """Allocatable TCM size, taken from the config."""
        return self._cfg.tcm_allocatable_bytes
|