commit - release 1

2026-03-18 11:47:48 -07:00
commit 6f43807900
109 changed files with 14909 additions and 0 deletions
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from kernbench.policy.address.phyaddr import PhysAddr
+
+
+class AllocationError(Exception):
+    """Raised when a memory allocation request cannot be satisfied."""
+
+
+@dataclass(frozen=True)
+class AddressConfig:
+    """Immutable sizing parameters from which per-PE memory budgets are derived.
+
+    No validation happens here: a zero ``hbm_slices_per_cube`` surfaces as
+    ``ZeroDivisionError`` when ``hbm_slice_bytes`` is read.
+    """
+
+    sip_count: int
+    cubes_per_sip: int
+    pes_per_cube: int
+    hbm_bytes_per_cube: int
+    hbm_slices_per_cube: int
+    tcm_bytes_per_pe: int
+    tcm_scheduler_reserved_bytes: int
+    sram_bytes_per_cube: int
+
+    @property
+    def hbm_slice_bytes(self) -> int:
+        """Bytes in one HBM slice (per-cube HBM floor-divided by slice count)."""
+        total_bytes = self.hbm_bytes_per_cube
+        slice_count = self.hbm_slices_per_cube
+        return total_bytes // slice_count
+
+    @property
+    def tcm_allocatable_bytes(self) -> int:
+        """Per-PE TCM bytes remaining after the scheduler reservation."""
+        reserved = self.tcm_scheduler_reserved_bytes
+        return self.tcm_bytes_per_pe - reserved
+
+
+class PEMemAllocator:
+    """Bump allocator for a single PE's HBM slice and TCM region.
+
+    Each allocation returns the address at the current cursor and then
+    advances the cursor by the requested size; nothing is ever freed.
+    Capacities come from ``cfg.hbm_slice_bytes`` and
+    ``cfg.tcm_allocatable_bytes``.
+    """
+
+    def __init__(
+        self, rack_id: int, sip_id: int, cube_id: int, pe_id: int, cfg: AddressConfig,
+    ) -> None:
+        self._rack_id = rack_id
+        self._sip_id = sip_id
+        self._cube_id = cube_id
+        self._pe_id = pe_id
+        self._cfg = cfg
+        self._hbm_cursor = 0
+        self._tcm_cursor = 0
+
+    @staticmethod
+    def _check_request(label: str, nbytes: int, used: int, capacity: int) -> None:
+        """Validate one request against the remaining capacity.
+
+        Raises:
+            AllocationError: if ``nbytes`` is negative or does not fit in
+                ``capacity - used``.
+        """
+        # A negative size would rewind the cursor and make later allocations
+        # overlap earlier ones, so reject it instead of silently accepting it.
+        if nbytes < 0:
+            raise AllocationError(
+                f"{label} allocation size must be non-negative: {nbytes}"
+            )
+        if used + nbytes > capacity:
+            raise AllocationError(
+                f"{label} overflow: need {nbytes}, "
+                f"available {capacity - used}"
+            )
+
+    def alloc_hbm(self, nbytes: int) -> PhysAddr:
+        """Reserve ``nbytes`` of this PE's HBM slice and return its start address.
+
+        Raises:
+            AllocationError: if ``nbytes`` is negative or exceeds the space left.
+
+        Errors raised by ``PhysAddr.pe_hbm_addr`` propagate unchanged, and the
+        cursor is only advanced after the address was built successfully.
+        """
+        self._check_request(
+            "HBM", nbytes, self._hbm_cursor, self._cfg.hbm_slice_bytes,
+        )
+        pa = PhysAddr.pe_hbm_addr(
+            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
+            pe_id=self._pe_id, pe_local_hbm_offset=self._hbm_cursor,
+            slice_size_bytes=self._cfg.hbm_slice_bytes,
+        )
+        self._hbm_cursor += nbytes
+        return pa
+
+    def alloc_tcm(self, nbytes: int) -> PhysAddr:
+        """Reserve ``nbytes`` of this PE's allocatable TCM and return its start address.
+
+        Raises:
+            AllocationError: if ``nbytes`` is negative or exceeds the space left.
+
+        Errors raised by ``PhysAddr.pe_tcm_addr`` propagate unchanged, and the
+        cursor is only advanced after the address was built successfully.
+        """
+        self._check_request(
+            "TCM", nbytes, self._tcm_cursor, self._cfg.tcm_allocatable_bytes,
+        )
+        pa = PhysAddr.pe_tcm_addr(
+            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
+            pe_id=self._pe_id, tcm_offset=self._tcm_cursor,
+        )
+        self._tcm_cursor += nbytes
+        return pa
+
+    @property
+    def hbm_used(self) -> int:
+        """HBM bytes handed out so far."""
+        return self._hbm_cursor
+
+    @property
+    def hbm_total(self) -> int:
+        """HBM capacity available to this PE (one slice)."""
+        return self._cfg.hbm_slice_bytes
+
+    @property
+    def tcm_used(self) -> int:
+        """TCM bytes handed out so far."""
+        return self._tcm_cursor
+
+    @property
+    def tcm_total(self) -> int:
+        """TCM capacity available to this PE (excludes the scheduler reservation)."""
+        return self._cfg.tcm_allocatable_bytes
@@ -0,0 +1,184 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import IntEnum
+from typing import Literal
+
+# Largest value representable in 51 bits; upper bound for encoded addresses.
+MAX_51 = (1 << 51) - 1
+
+
+class PhysAddrError(Exception):
+    """Raised for invalid physical-address fields or encodings."""
+
+
+def _chk_range(name: str, v: int, bits: int) -> None:
+    """Ensure ``v`` fits in an unsigned field that is ``bits`` bits wide.
+
+    Raises:
+        PhysAddrError: if ``v`` is negative or ``v >= 2**bits``; ``name`` is
+            used only to label the error message.
+    """
+    if 0 <= v < (1 << bits):
+        return
+    raise PhysAddrError(f"{name} out of range for {bits} bits: {v}")
+
+
+def _chk_max(name: str, v: int, maxv: int) -> None:
+    """Ensure ``v`` lies in the inclusive range ``0..maxv``.
+
+    Raises:
+        PhysAddrError: if ``v`` is negative or greater than ``maxv``; ``name``
+            is used only to label the error message.
+    """
+    if 0 <= v <= maxv:
+        return
+    raise PhysAddrError(f"{name} out of range (0..{maxv}): {v}")
+
+
+class UnitType(IntEnum):
+    """Unit selector codes used in the PE-resource window of an address."""
+
+    PE = 0    # per-PE resource (used for TCM addresses)
+    MCPU = 1  # management CPU
+    SRAM = 2  # cube-level SRAM
+
+
+@dataclass(frozen=True)
+class PhysAddr:
+    """Immutable 51-bit physical address.
+
+    Bit layout of the encoded value:
+      [50:47] rack_id       (4 bits)
+      [46:43] sip_id        (4 bits)
+      [42:38] sip_seg       (5 bits, holds the cube id)
+      [37:0]  local_offset  (38 bits => each segment is 256GB)
+
+    Bit 37 of local_offset selects the window: 1 = HBM window (128GB
+    reserved), 0 = PE resource window. Inside the PE resource window,
+    decode() reads unit_type from bits [36:34], pe_id from [33:30], ext from
+    bit 29 and sub_offset from [28:0].
+
+    Only the first four fields are packed by encode(); the remaining fields
+    are derived metadata filled in by decode() and the factory methods.
+    """
+
+    rack_id: int
+    sip_id: int
+    sip_seg: int
+    local_offset: int
+
+    kind: Literal["hbm", "pe_resource", "raw"] = "raw"
+    cube_id: int = 0
+    unit_type: UnitType = UnitType.PE
+    pe_id: int = 0
+    ext: int = 0
+    sub_offset: int = 0
+    hbm_offset: int = 0
+
+    # Unannotated on purpose: a plain class constant, not a dataclass field.
+    HBM_WINDOW_BYTES = 1 << 37  # 128GB
+
+    def encode(self) -> int:
+        """Pack rack_id, sip_id, sip_seg and local_offset into one integer.
+
+        Raises:
+            PhysAddrError: if any packed field does not fit its bit width.
+        """
+        widths = (
+            ("rack_id", 4),
+            ("sip_id", 4),
+            ("sip_seg", 5),
+            ("local_offset", 38),
+        )
+        for field_name, width in widths:
+            _chk_range(field_name, getattr(self, field_name), width)
+        packed = self.rack_id << 47
+        packed |= self.sip_id << 43
+        packed |= self.sip_seg << 38
+        packed |= self.local_offset
+        if not (0 <= packed <= MAX_51):
+            raise PhysAddrError("address exceeds 51-bit space")
+        return packed
+
+    @staticmethod
+    def decode(addr: int) -> PhysAddr:
+        """Split an encoded address into its fields and classify its window.
+
+        Raises:
+            PhysAddrError: if ``addr`` is outside the 51-bit range, or if a
+                PE-resource address carries an unknown unit-type code.
+        """
+        if not (0 <= addr <= MAX_51):
+            raise PhysAddrError("addr must be a 51-bit value")
+        segment = (addr >> 38) & 0x1F
+        local = addr & ((1 << 38) - 1)
+        # Fields shared by both window kinds; cube_id mirrors sip_seg.
+        shared = {
+            "rack_id": (addr >> 47) & 0xF,
+            "sip_id": (addr >> 43) & 0xF,
+            "sip_seg": segment,
+            "local_offset": local,
+            "cube_id": segment,
+        }
+        if (local >> 37) & 0x1:
+            # HBM window: the low 37 bits are the offset inside the window.
+            return PhysAddr(
+                kind="hbm",
+                hbm_offset=int(local & ((1 << 37) - 1)),
+                **shared,
+            )
+        # PE resource window
+        unit_code = int((local >> 34) & 0x7)
+        try:
+            unit = UnitType(unit_code)
+        except ValueError:
+            raise PhysAddrError(f"unknown unit_type: {unit_code}") from None
+        return PhysAddr(
+            kind="pe_resource",
+            unit_type=unit,
+            pe_id=int((local >> 30) & 0xF),
+            ext=int((local >> 29) & 0x1),
+            sub_offset=int(local & ((1 << 29) - 1)),
+            hbm_offset=0,
+            **shared,
+        )
+
+    @staticmethod
+    def hbm_addr(*, rack_id: int, sip_id: int, cube_id: int, hbm_offset: int) -> PhysAddr:
+        """Build an HBM-window address for ``cube_id`` at ``hbm_offset``.
+
+        rack_id and sip_id are not range-checked here (encode() checks them).
+
+        Raises:
+            PhysAddrError: if cube_id > 31 or hbm_offset needs more than 37 bits.
+        """
+        _chk_max("cube_id", cube_id, 31)
+        _chk_range("hbm_offset", hbm_offset, 37)
+        window_offset = int(hbm_offset)
+        return PhysAddr(
+            rack_id=rack_id,
+            sip_id=sip_id,
+            sip_seg=cube_id,
+            # Bit 37 set marks the HBM window.
+            local_offset=(1 << 37) | window_offset,
+            kind="hbm",
+            cube_id=cube_id,
+            hbm_offset=window_offset,
+        )
+
+    @staticmethod
+    def pe_hbm_addr(
+        *,
+        rack_id: int,
+        sip_id: int,
+        cube_id: int,
+        pe_id: int,
+        pe_local_hbm_offset: int,
+        slice_size_bytes: int,
+    ) -> PhysAddr:
+        """Build an HBM address from a PE id and an offset inside that PE's slice.
+
+        The window offset is ``pe_id * slice_size_bytes + pe_local_hbm_offset``.
+
+        Raises:
+            PhysAddrError: if cube_id or pe_id is out of range, the local
+                offset is outside ``[0, slice_size_bytes)``, or the resulting
+                offset falls outside the 128GB HBM window.
+        """
+        _chk_max("cube_id", cube_id, 31)
+        _chk_range("pe_id", pe_id, 4)
+        if not (0 <= pe_local_hbm_offset < slice_size_bytes):
+            raise PhysAddrError("pe_local_hbm_offset out of PE local slice range")
+        slice_base = int(pe_id) * int(slice_size_bytes)
+        window_offset = slice_base + int(pe_local_hbm_offset)
+        if not (0 <= window_offset < PhysAddr.HBM_WINDOW_BYTES):
+            raise PhysAddrError("HBM offset exceeds reserved 128GB window")
+        return PhysAddr.hbm_addr(
+            rack_id=rack_id,
+            sip_id=sip_id,
+            cube_id=cube_id,
+            hbm_offset=window_offset,
+        )
+
+    @staticmethod
+    def hbm_pe_id(hbm_offset: int, slice_size_bytes: int) -> int:
+        """Return the index of the slice that contains ``hbm_offset``."""
+        return hbm_offset // slice_size_bytes
+
+    @staticmethod
+    def cube_sram_addr(
+        *, rack_id: int, sip_id: int, cube_id: int, sram_offset: int,
+    ) -> PhysAddr:
+        """Build a PE-resource-window address targeting the cube SRAM.
+
+        Raises:
+            PhysAddrError: if cube_id > 31 or sram_offset needs more than 29 bits.
+        """
+        _chk_max("cube_id", cube_id, 31)
+        _chk_range("sram_offset", sram_offset, 29)
+        return PhysAddr(
+            rack_id=rack_id,
+            sip_id=sip_id,
+            sip_seg=cube_id,
+            local_offset=(UnitType.SRAM << 34) | sram_offset,
+            kind="pe_resource",
+            cube_id=cube_id,
+            unit_type=UnitType.SRAM,
+            sub_offset=sram_offset,
+        )
+
+    @staticmethod
+    def pe_tcm_addr(
+        *, rack_id: int, sip_id: int, cube_id: int, pe_id: int, tcm_offset: int,
+    ) -> PhysAddr:
+        """Build a PE-resource-window address inside a PE's TCM.
+
+        Raises:
+            PhysAddrError: if cube_id > 31, pe_id needs more than 4 bits, or
+                tcm_offset needs more than 29 bits.
+        """
+        _chk_max("cube_id", cube_id, 31)
+        _chk_range("pe_id", pe_id, 4)
+        _chk_range("tcm_offset", tcm_offset, 29)
+        return PhysAddr(
+            rack_id=rack_id,
+            sip_id=sip_id,
+            sip_seg=cube_id,
+            local_offset=(UnitType.PE << 34) | (pe_id << 30) | tcm_offset,
+            kind="pe_resource",
+            cube_id=cube_id,
+            unit_type=UnitType.PE,
+            pe_id=pe_id,
+            sub_offset=tcm_offset,
+        )
@@ -0,0 +1,174 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from math import ceil
+from typing import Literal
+
+
+@dataclass(frozen=True)
+class DPPolicy:
+    """Immutable two-level data-parallel policy.
+
+    ``cube`` selects how data is distributed across cubes and ``pe`` how it
+    is distributed across the PEs inside each cube. Both default to
+    ``"replicate"``.
+    """
+
+    # Cube-level distribution.
+    cube: Literal["replicate", "shard_m", "shard_k"] = "replicate"
+    # PE-level distribution within one cube.
+    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
+
+
+def resolve_dp_policy(
+    policy: DPPolicy,
+    *,
+    shape: tuple[int, int],
+    itemsize: int,
+    num_pe: int,
+    num_cubes: int = 1,
+) -> list[ShardSpec]:
+    """Expand a DPPolicy into the flat list of shards it describes.
+
+    With ``num_cubes <= 1`` only the pe-level policy is applied (the cube-level
+    policy is not consulted). Otherwise the cube-level policy picks a per-cube
+    shape and byte offset, the pe-level policy is resolved inside that shape,
+    and each resulting shard is re-indexed as ``cube_id * num_pe + pe_id``.
+
+    Sharded dimensions use floor division, so a remainder that does not divide
+    evenly across cubes is not assigned to any shard.
+
+    Raises:
+        ValueError: for an unknown pe-level policy, or (when ``num_cubes > 1``)
+            an unknown cube-level policy.
+    """
+    try:
+        split_within_cube = {
+            "replicate": replicate,
+            "column_wise": column_wise,
+            "row_wise": row_wise,
+        }[policy.pe]
+    except KeyError:
+        raise ValueError(f"Unknown pe-level policy: {policy.pe}") from None
+
+    if num_cubes <= 1:
+        return split_within_cube(shape=shape, itemsize=itemsize, num_pe=num_pe)
+
+    rows, cols = shape
+
+    # The per-cube shape and the byte stride between consecutive cubes do not
+    # depend on the cube index, so they are worked out once up front.
+    if policy.cube == "replicate":
+        per_cube_shape = (rows, cols)
+        cube_stride = 0
+    elif policy.cube == "shard_m":
+        rows_per_cube = rows // num_cubes
+        per_cube_shape = (rows_per_cube, cols)
+        cube_stride = rows_per_cube * cols * itemsize
+    elif policy.cube == "shard_k":
+        cols_per_cube = cols // num_cubes
+        per_cube_shape = (rows, cols_per_cube)
+        cube_stride = rows * cols_per_cube * itemsize
+    else:
+        raise ValueError(f"Unknown cube-level policy: {policy.cube}")
+
+    # Resolve the pe-level split once per cube, then shift the index and
+    # offset of every shard into that cube's range.
+    return [
+        ShardSpec(
+            pe_index=cube_id * num_pe + local.pe_index,
+            offset_bytes=cube_id * cube_stride + local.offset_bytes,
+            nbytes=local.nbytes,
+        )
+        for cube_id in range(num_cubes)
+        for local in split_within_cube(
+            shape=per_cube_shape, itemsize=itemsize, num_pe=num_pe,
+        )
+    ]
+
+
+@dataclass(frozen=True)
+class ShardSpec:
+    """Immutable description of one shard: its target PE and byte range."""
+
+    pe_index: int      # index of the PE that receives the shard
+    offset_bytes: int  # byte offset of the shard within the source buffer
+    nbytes: int        # shard size in bytes
+
+
+def column_wise(
+    *, shape: tuple[int, int], itemsize: int, num_pe: int,
+) -> list[ShardSpec]:
+    """Split the K axis into ``num_pe`` equal parts; each PE gets (M, K // P).
+
+    Shards are laid out back to back: shard ``i`` starts at ``i`` times the
+    shard size. K uses floor division, so a remainder is not assigned.
+    """
+    rows, cols = shape
+    cols_per_pe = cols // num_pe
+    shard_bytes = rows * cols_per_pe * itemsize
+    return [
+        ShardSpec(pe_index=pe, offset_bytes=pe * shard_bytes, nbytes=shard_bytes)
+        for pe in range(num_pe)
+    ]
+
+
+def row_wise(
+    *, shape: tuple[int, int], itemsize: int, num_pe: int,
+) -> list[ShardSpec]:
+    """Split the M axis into ``num_pe`` equal parts; each PE gets (M // P, K).
+
+    Shards are laid out back to back: shard ``i`` starts at ``i`` times the
+    shard size. M uses floor division, so a remainder is not assigned.
+    """
+    rows, cols = shape
+    rows_per_pe = rows // num_pe
+    shard_bytes = rows_per_pe * cols * itemsize
+    return [
+        ShardSpec(pe_index=pe, offset_bytes=pe * shard_bytes, nbytes=shard_bytes)
+        for pe in range(num_pe)
+    ]
+
+
+def replicate(
+    *, shape: tuple[int, int], itemsize: int, num_pe: int,
+) -> list[ShardSpec]:
+    """Give every PE a full (M, K) copy: offset 0, size M * K * itemsize."""
+    rows, cols = shape
+    copy_bytes = rows * cols * itemsize
+    shards: list[ShardSpec] = []
+    for pe in range(num_pe):
+        shards.append(ShardSpec(pe_index=pe, offset_bytes=0, nbytes=copy_bytes))
+    return shards
+
+
+def tiled_column_major(
+    *, shape: tuple[int, int], itemsize: int, num_pe: int,
+    tile_m: int, tile_k: int,
+) -> list[ShardSpec]:
+    """2D tiling, column-major order (K axis first), round-robin across PEs.
+
+    Tiles are visited with the M-tile index in the outer position and the
+    K-tile index in the inner position; the n-th visited tile goes to PE
+    ``n % num_pe``. ``offset_bytes`` is the byte offset of the tile's first
+    element in a buffer whose rows are ``K * itemsize`` bytes long.
+
+    NOTE: ``nbytes`` is always the full ``tile_m * tile_k * itemsize``, also
+    for edge tiles when a dimension is not a multiple of the tile size.
+    """
+    rows, cols = shape
+    m_tiles = ceil(rows / tile_m)
+    k_tiles = ceil(cols / tile_k)
+    bytes_per_tile = tile_m * tile_k * itemsize
+    bytes_per_row = cols * itemsize
+    visit_order = [(mi, ki) for mi in range(m_tiles) for ki in range(k_tiles)]
+    return [
+        ShardSpec(
+            pe_index=n % num_pe,
+            offset_bytes=(mi * tile_m * bytes_per_row) + (ki * tile_k * itemsize),
+            nbytes=bytes_per_tile,
+        )
+        for n, (mi, ki) in enumerate(visit_order)
+    ]
+
+
+def tiled_row_major(
+    *, shape: tuple[int, int], itemsize: int, num_pe: int,
+    tile_m: int, tile_k: int,
+) -> list[ShardSpec]:
+    """2D tiling, row-major order (M axis first), round-robin across PEs.
+
+    Tiles are visited with the K-tile index in the outer position and the
+    M-tile index in the inner position; the n-th visited tile goes to PE
+    ``n % num_pe``. ``offset_bytes`` is the byte offset of the tile's first
+    element in a buffer whose rows are ``K * itemsize`` bytes long.
+
+    NOTE: ``nbytes`` is always the full ``tile_m * tile_k * itemsize``, also
+    for edge tiles when a dimension is not a multiple of the tile size.
+    """
+    rows, cols = shape
+    m_tiles = ceil(rows / tile_m)
+    k_tiles = ceil(cols / tile_k)
+    bytes_per_tile = tile_m * tile_k * itemsize
+    bytes_per_row = cols * itemsize
+    visit_order = [(mi, ki) for ki in range(k_tiles) for mi in range(m_tiles)]
+    return [
+        ShardSpec(
+            pe_index=n % num_pe,
+            offset_bytes=(mi * tile_m * bytes_per_row) + (ki * tile_k * itemsize),
+            nbytes=bytes_per_tile,
+        )
+        for n, (mi, ki) in enumerate(visit_order)
+    ]
@@ -0,0 +1,184 @@
+from __future__ import annotations
+
+import heapq
+from collections import defaultdict
+
+from kernbench.policy.address.phyaddr import PhysAddr, UnitType
+from kernbench.topology.types import TopologyGraph
+
+
+class RoutingError(Exception):
+    """Raised when a node or a path cannot be found in the topology."""
+
+
+class AddressResolver:
+    """Map a PhysAddr to the id of its destination node in the compiled graph.
+
+    Named lookups (find_m_cpu, find_pcie_ep, ...) are provided as well, so
+    component implementations never have to build node_id strings themselves.
+    Keeping the naming convention in one place means a single change
+    propagates everywhere (ADR-0015 D4).
+    """
+
+    def __init__(self, graph: TopologyGraph) -> None:
+        self._node_ids = set(graph.nodes)
+        memory_map = graph.spec["cube"]["memory_map"]
+        # GB -> bytes, then floor-divide evenly across the slices.
+        hbm_bytes_per_cube = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
+        self._slice_size_bytes = hbm_bytes_per_cube // memory_map["hbm_slices_per_cube"]
+
+    def _require(self, node_id: str, label: str) -> str:
+        """Return ``node_id`` if the graph has it, else raise RoutingError."""
+        if node_id in self._node_ids:
+            return node_id
+        raise RoutingError(f"{label} not found: {node_id}")
+
+    @staticmethod
+    def _pe_resource_node(cube_prefix: str, addr: PhysAddr) -> str:
+        """Name the node addressed by a PE-resource-window address."""
+        unit = addr.unit_type
+        if unit == UnitType.PE:
+            return f"{cube_prefix}.pe{addr.pe_id}.pe_tcm"
+        if unit == UnitType.SRAM:
+            return f"{cube_prefix}.sram"
+        if unit == UnitType.MCPU:
+            return f"{cube_prefix}.m_cpu"
+        raise RoutingError(f"unsupported unit_type: {addr.unit_type}")
+
+    # ── Physical-address resolution ──────────────────────────────────
+
+    def resolve(self, addr: PhysAddr) -> str:
+        """Return the node id that ``addr`` targets.
+
+        HBM addresses map to the HBM controller slice containing the offset;
+        PE-resource addresses map to a PE's TCM, the cube SRAM or the M_CPU.
+
+        Raises:
+            RoutingError: for an unsupported address kind or unit type, or
+                when the resulting node is not part of the topology.
+        """
+        cube_prefix = f"sip{addr.sip_id}.cube{addr.cube_id}"
+        if addr.kind == "hbm":
+            slice_idx = PhysAddr.hbm_pe_id(addr.hbm_offset, self._slice_size_bytes)
+            target = f"{cube_prefix}.hbm_ctrl.slice{slice_idx}"
+        elif addr.kind == "pe_resource":
+            target = self._pe_resource_node(cube_prefix, addr)
+        else:
+            raise RoutingError(f"unsupported address kind: {addr.kind}")
+        if target in self._node_ids:
+            return target
+        raise RoutingError(f"node {target} not found in topology")
+
+    # ── Named node lookups ───────────────────────────────────────────
+
+    def find_m_cpu(self, sip: int, cube: int) -> str:
+        """Return the M_CPU node id of the given cube, or raise RoutingError."""
+        return self._require(f"sip{sip}.cube{cube}.m_cpu", "M_CPU")
+
+    def find_pcie_ep(self, sip: int, io_id: str = "io0") -> str:
+        """Return the PCIE_EP node id of the given IO block, or raise RoutingError."""
+        return self._require(f"sip{sip}.{io_id}.pcie_ep", "PCIE_EP")
+
+    def find_io_cpu(self, sip: int, io_id: str = "io0") -> str:
+        """Return the IO_CPU node id of the given IO block, or raise RoutingError."""
+        return self._require(f"sip{sip}.{io_id}.io_cpu", "IO_CPU")
+
+    def find_all_pcie_eps(self) -> list[str]:
+        """Return all PCIE_EP node ids across all SIPs, sorted."""
+        endpoints = [nid for nid in self._node_ids if nid.endswith(".pcie_ep")]
+        endpoints.sort()
+        return endpoints
+
+
+class PathRouter:
+    """Shortest-path routing from a source PE (or arbitrary node) to a destination.
+
+    Three adjacency maps are built from the graph's edges:
+      _adj          — every edge except kind "command" (PE DMA routing,
+                      find_path)
+      _adj_all      — every edge (component-to-component routing,
+                      find_node_path; needed because M_CPU↔NOC links are
+                      "command")
+      _adj_mcpu_dma — every edge except the kinds in _MCPU_DMA_EXCLUDE
+                      (find_mcpu_dma_path)
+
+    An edge's weight is its routing_weight_mm when that is set, otherwise its
+    distance_mm.
+    """
+
+    # Edge kinds left out of the M_CPU DMA adjacency so that DMA paths never
+    # pass through PE-internal pipeline nodes.
+    _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_xbar"}
+
+    def __init__(self, graph: TopologyGraph) -> None:
+        self._adj: dict[str, list[tuple[str, float]]] = defaultdict(list)
+        self._adj_all: dict[str, list[tuple[str, float]]] = defaultdict(list)
+        self._adj_mcpu_dma: dict[str, list[tuple[str, float]]] = defaultdict(list)
+        for edge in graph.edges:
+            if edge.routing_weight_mm is None:
+                weight = edge.distance_mm
+            else:
+                weight = edge.routing_weight_mm
+            hop = (edge.dst, weight)
+            self._adj_all[edge.src].append(hop)
+            if edge.kind != "command":
+                self._adj[edge.src].append(hop)
+            if edge.kind not in self._MCPU_DMA_EXCLUDE:
+                self._adj_mcpu_dma[edge.src].append(hop)
+
+    def find_path(self, src_pe: str, dst_node: str) -> list[str]:
+        """PE DMA routing: starts at ``<src_pe>.pe_dma``, excludes command edges."""
+        return self._run_dijkstra(self._adj, f"{src_pe}.pe_dma", dst_node)
+
+    def find_path_with_distance(self, src_pe: str, dst_node: str) -> tuple[list[str], float]:
+        """Like find_path, but also returns the accumulated path weight."""
+        return self._run_dijkstra_with_dist(self._adj, f"{src_pe}.pe_dma", dst_node)
+
+    def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_slice_id: str) -> list[str]:
+        """M_CPU DMA path that never uses PE-internal nodes (ADR-0015 D5).
+
+        Same cube: the fixed path [m_cpu, noc, xbar.pe_i, hbm_ctrl.slice_i],
+        built from the ids without consulting the graph.
+        Other cube: Dijkstra over _adj_mcpu_dma (pe_internal/pe_to_xbar
+        edges excluded).
+        """
+        # The first two dot-separated components identify the cube.
+        src_cube = ".".join(m_cpu_id.split(".")[:2])
+        dst_cube = ".".join(dst_hbm_slice_id.split(".")[:2])
+        if src_cube != dst_cube:
+            return self._run_dijkstra(self._adj_mcpu_dma, m_cpu_id, dst_hbm_slice_id)
+        slice_no = int(dst_hbm_slice_id.rsplit("slice", 1)[1])
+        return [
+            m_cpu_id,
+            f"{src_cube}.noc",
+            f"{src_cube}.xbar.pe{slice_no}",
+            dst_hbm_slice_id,
+        ]
+
+    def find_node_path(self, src: str, dst: str) -> list[str]:
+        """General routing between arbitrary nodes, command edges included.
+
+        Used by components (IoCpuComponent, MCpuComponent) whose routes pass
+        through M_CPU↔NOC command-kind links.
+        """
+        return self._run_dijkstra(self._adj_all, src, dst)
+
+    def _run_dijkstra(
+        self,
+        adj: dict[str, list[tuple[str, float]]],
+        start: str,
+        goal: str,
+    ) -> list[str]:
+        """Return only the node list of the shortest path from start to goal."""
+        return self._run_dijkstra_with_dist(adj, start, goal)[0]
+
+    def _run_dijkstra_with_dist(
+        self,
+        adj: dict[str, list[tuple[str, float]]],
+        start: str,
+        goal: str,
+    ) -> tuple[list[str], float]:
+        """Dijkstra over ``adj``; returns (path, total weight).
+
+        Raises:
+            RoutingError: if ``goal`` cannot be reached from ``start``.
+        """
+        if start == goal:
+            return [start], 0.0
+        unreached = float("inf")
+        cost_to: dict[str, float] = {start: 0.0}
+        came_from: dict[str, str] = {}
+        frontier: list[tuple[float, str]] = [(0.0, start)]
+        while frontier:
+            cost, node = heapq.heappop(frontier)
+            if node == goal:
+                # Walk the predecessor chain back to the start, then flip it.
+                hops = [goal]
+                while hops[-1] != start:
+                    hops.append(came_from[hops[-1]])
+                hops.reverse()
+                return hops, cost
+            if cost > cost_to.get(node, unreached):
+                # Stale heap entry: a cheaper route to this node was found.
+                continue
+            for nxt, weight in adj[node]:
+                candidate = cost + weight
+                if candidate < cost_to.get(nxt, unreached):
+                    cost_to[nxt] = candidate
+                    came_from[nxt] = node
+                    heapq.heappush(frontier, (candidate, nxt))
+        raise RoutingError(f"no path from {start} to {goal}")
+
+    # ── backward-compat shims (used by existing tests) ───────────────
+
+    def _dijkstra(self, start: str, goal: str) -> list[str]:
+        """Shim: shortest path over the non-command adjacency."""
+        return self._run_dijkstra(self._adj, start, goal)
+
+    def _dijkstra_with_dist(self, start: str, goal: str) -> tuple[list[str], float]:
+        """Shim: shortest path and weight over the non-command adjacency."""
+        return self._run_dijkstra_with_dist(self._adj, start, goal)