commit - release 1

This commit is contained in:
2026-03-18 11:47:48 -07:00
commit 6f43807900
109 changed files with 14909 additions and 0 deletions
+85
View File
@@ -0,0 +1,85 @@
from __future__ import annotations
from dataclasses import dataclass
from kernbench.policy.address.phyaddr import PhysAddr
class AllocationError(Exception):
    """Raised by PEMemAllocator when a request does not fit the remaining space."""
@dataclass(frozen=True)
class AddressConfig:
    """Immutable sizing parameters used to derive per-PE memory capacities.

    The two properties below derive the capacities from the raw fields; the
    remaining fields are carried as-is.
    """

    sip_count: int
    cubes_per_sip: int
    pes_per_cube: int
    hbm_bytes_per_cube: int
    hbm_slices_per_cube: int
    tcm_bytes_per_pe: int
    tcm_scheduler_reserved_bytes: int
    sram_bytes_per_cube: int

    @property
    def hbm_slice_bytes(self) -> int:
        """Size of one HBM slice: cube HBM bytes floor-divided by the slice count."""
        total_bytes = self.hbm_bytes_per_cube
        slice_count = self.hbm_slices_per_cube
        return total_bytes // slice_count

    @property
    def tcm_allocatable_bytes(self) -> int:
        """TCM bytes per PE left over once the scheduler reservation is removed."""
        reserved = self.tcm_scheduler_reserved_bytes
        return self.tcm_bytes_per_pe - reserved
class PEMemAllocator:
    """Bump allocator for a single PE's HBM slice and TCM region.

    Each allocation is placed at the current cursor of its region and the
    cursor then advances by the requested size.  The class offers no way to
    release memory.  Capacities come from ``cfg.hbm_slice_bytes`` and
    ``cfg.tcm_allocatable_bytes``.
    """

    def __init__(
        self, rack_id: int, sip_id: int, cube_id: int, pe_id: int, cfg: AddressConfig,
    ) -> None:
        """Bind the allocator to one PE, identified by rack/sip/cube/pe ids."""
        self._rack_id = rack_id
        self._sip_id = sip_id
        self._cube_id = cube_id
        self._pe_id = pe_id
        self._cfg = cfg
        # Next free byte offset within each region.
        self._hbm_cursor = 0
        self._tcm_cursor = 0

    @staticmethod
    def _check_request(region: str, nbytes: int, cursor: int, capacity: int) -> None:
        """Raise AllocationError unless ``nbytes`` is non-negative and fits."""
        # A negative size would pass the overflow test and then move the
        # cursor backwards, making later allocations overlap earlier ones.
        if nbytes < 0:
            raise AllocationError(
                f"{region} allocation size must be non-negative: {nbytes}"
            )
        if cursor + nbytes > capacity:
            raise AllocationError(
                f"{region} overflow: need {nbytes}, "
                f"available {capacity - cursor}"
            )

    def alloc_hbm(self, nbytes: int) -> PhysAddr:
        """Reserve ``nbytes`` of this PE's HBM slice and return its start address.

        Raises:
            AllocationError: if ``nbytes`` is negative or exceeds the space
                remaining in the slice.
        """
        self._check_request(
            "HBM", nbytes, self._hbm_cursor, self._cfg.hbm_slice_bytes
        )
        pa = PhysAddr.pe_hbm_addr(
            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
            pe_id=self._pe_id, pe_local_hbm_offset=self._hbm_cursor,
            slice_size_bytes=self._cfg.hbm_slice_bytes,
        )
        # Advance only after the address was built, so a failure above leaves
        # the allocator state untouched.
        self._hbm_cursor += nbytes
        return pa

    def alloc_tcm(self, nbytes: int) -> PhysAddr:
        """Reserve ``nbytes`` of this PE's allocatable TCM and return its start address.

        Raises:
            AllocationError: if ``nbytes`` is negative or exceeds the space
                remaining in the allocatable TCM region.
        """
        self._check_request(
            "TCM", nbytes, self._tcm_cursor, self._cfg.tcm_allocatable_bytes
        )
        pa = PhysAddr.pe_tcm_addr(
            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
            pe_id=self._pe_id, tcm_offset=self._tcm_cursor,
        )
        self._tcm_cursor += nbytes
        return pa

    @property
    def hbm_used(self) -> int:
        """Bytes of the HBM slice handed out so far."""
        return self._hbm_cursor

    @property
    def hbm_total(self) -> int:
        """Capacity of the HBM slice in bytes."""
        return self._cfg.hbm_slice_bytes

    @property
    def tcm_used(self) -> int:
        """Bytes of allocatable TCM handed out so far."""
        return self._tcm_cursor

    @property
    def tcm_total(self) -> int:
        """Capacity of the allocatable TCM region in bytes."""
        return self._cfg.tcm_allocatable_bytes
+184
View File
@@ -0,0 +1,184 @@
from __future__ import annotations
from dataclasses import dataclass
from enum import IntEnum
from typing import Literal
# Largest value that fits in the 51-bit physical address space.
MAX_51 = 2**51 - 1
class PhysAddrError(Exception):
    """Raised when a physical address or one of its fields is invalid."""
def _chk_range(name: str, v: int, bits: int) -> None:
if not (0 <= v < (1 << bits)):
raise PhysAddrError(f"{name} out of range for {bits} bits: {v}")
def _chk_max(name: str, v: int, maxv: int) -> None:
if not (0 <= v <= maxv):
raise PhysAddrError(f"{name} out of range (0..{maxv}): {v}")
class UnitType(IntEnum):
    """Unit selector held in bits [36:34] of a PE-resource local offset."""

    PE = 0
    MCPU = 1
    SRAM = 2
@dataclass(frozen=True)
class PhysAddr:
    """51-bit physical address value object.

    Encoded layout::

        [50:47] rack_id      (4 bits)
        [46:43] sip_id       (4 bits)
        [42:38] sip_seg      (5 bits)   carries the cube_id
        [37:0]  local_offset (38 bits)  each segment is 256GB

    Bit [37] of ``local_offset`` selects the window: 1 = HBM window (128GB
    reserved), 0 = PE resource window.  A PE-resource offset is further split
    into unit_type [36:34], pe_id [33:30], ext [29] and sub_offset [28:0].

    ``kind`` and the fields after it are decoded views of ``local_offset``;
    only the first four fields take part in ``encode``.
    """

    rack_id: int
    sip_id: int
    sip_seg: int
    local_offset: int
    kind: Literal["hbm", "pe_resource", "raw"] = "raw"
    cube_id: int = 0
    unit_type: UnitType = UnitType.PE
    pe_id: int = 0
    ext: int = 0
    sub_offset: int = 0
    hbm_offset: int = 0

    # Deliberately un-annotated so the dataclass treats it as a class
    # constant rather than an instance field.
    HBM_WINDOW_BYTES = 1 << 37  # 128GB

    def encode(self) -> int:
        """Pack rack_id, sip_id, sip_seg and local_offset into one integer.

        Raises:
            PhysAddrError: if any of the four fields exceeds its bit width.
        """
        for field_name, value, width in (
            ("rack_id", self.rack_id, 4),
            ("sip_id", self.sip_id, 4),
            ("sip_seg", self.sip_seg, 5),
            ("local_offset", self.local_offset, 38),
        ):
            _chk_range(field_name, value, width)
        packed = self.local_offset
        packed |= self.sip_seg << 38
        packed |= self.sip_id << 43
        packed |= self.rack_id << 47
        if 0 <= packed <= MAX_51:
            return packed
        raise PhysAddrError("address exceeds 51-bit space")

    @staticmethod
    def decode(addr: int) -> PhysAddr:
        """Split a 51-bit integer into a PhysAddr with its derived fields filled in.

        Returns an ``"hbm"`` address when the selector bit is set, otherwise a
        ``"pe_resource"`` address.

        Raises:
            PhysAddrError: if ``addr`` is outside the 51-bit range, or the
                unit-type bits do not name a known UnitType.
        """
        if not (0 <= addr <= MAX_51):
            raise PhysAddrError("addr must be a 51-bit value")

        local = addr & ((1 << 38) - 1)
        segment = (addr >> 38) & 0x1F
        # Fields shared by both window kinds; the segment doubles as cube_id.
        shared = {
            "rack_id": (addr >> 47) & 0xF,
            "sip_id": (addr >> 43) & 0xF,
            "sip_seg": segment,
            "local_offset": local,
            "cube_id": segment,
        }

        if (local >> 37) & 0x1 == 1:
            window_offset = int(local & ((1 << 37) - 1))
            return PhysAddr(kind="hbm", hbm_offset=window_offset, **shared)

        # PE resource window.
        unit_bits = int((local >> 34) & 0x7)
        try:
            unit = UnitType(unit_bits)
        except ValueError:
            raise PhysAddrError(f"unknown unit_type: {unit_bits}") from None
        return PhysAddr(
            kind="pe_resource",
            unit_type=unit,
            pe_id=int((local >> 30) & 0xF),
            ext=int((local >> 29) & 0x1),
            sub_offset=int(local & ((1 << 29) - 1)),
            hbm_offset=0,
            **shared,
        )

    @staticmethod
    def hbm_addr(*, rack_id: int, sip_id: int, cube_id: int, hbm_offset: int) -> PhysAddr:
        """Build an HBM-window address from an offset within a cube's HBM window.

        Raises:
            PhysAddrError: if ``cube_id`` is outside 0..31 or ``hbm_offset``
                does not fit in 37 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("hbm_offset", hbm_offset, 37)
        window_offset = int(hbm_offset)
        return PhysAddr(
            rack_id=rack_id,
            sip_id=sip_id,
            sip_seg=cube_id,
            # Selector bit 37 set marks the HBM window.
            local_offset=(1 << 37) | window_offset,
            kind="hbm",
            cube_id=cube_id,
            hbm_offset=window_offset,
        )

    @staticmethod
    def pe_hbm_addr(
        *,
        rack_id: int,
        sip_id: int,
        cube_id: int,
        pe_id: int,
        pe_local_hbm_offset: int,
        slice_size_bytes: int,
    ) -> PhysAddr:
        """Build an HBM address from an offset local to one PE's slice.

        The PE's slice starts at ``pe_id * slice_size_bytes`` within the cube's
        HBM window.

        Raises:
            PhysAddrError: if ``cube_id`` or ``pe_id`` is out of range, the
                local offset is outside the slice, or the resulting offset
                leaves the HBM window.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("pe_id", pe_id, 4)
        if not (0 <= pe_local_hbm_offset < slice_size_bytes):
            raise PhysAddrError("pe_local_hbm_offset out of PE local slice range")
        slice_base = int(pe_id) * int(slice_size_bytes)
        window_offset = slice_base + int(pe_local_hbm_offset)
        if not (0 <= window_offset < PhysAddr.HBM_WINDOW_BYTES):
            raise PhysAddrError("HBM offset exceeds reserved 128GB window")
        return PhysAddr.hbm_addr(
            rack_id=rack_id, sip_id=sip_id, cube_id=cube_id, hbm_offset=window_offset
        )

    @staticmethod
    def hbm_pe_id(hbm_offset: int, slice_size_bytes: int) -> int:
        """Return the index of the slice that contains ``hbm_offset``."""
        slice_index = hbm_offset // slice_size_bytes
        return slice_index

    @staticmethod
    def cube_sram_addr(
        *, rack_id: int, sip_id: int, cube_id: int, sram_offset: int,
    ) -> PhysAddr:
        """Build a PE-resource address that targets a cube's SRAM.

        Raises:
            PhysAddrError: if ``cube_id`` is outside 0..31 or ``sram_offset``
                does not fit in 29 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("sram_offset", sram_offset, 29)
        unit_field = UnitType.SRAM << 34
        return PhysAddr(
            rack_id=rack_id,
            sip_id=sip_id,
            sip_seg=cube_id,
            local_offset=unit_field | sram_offset,
            kind="pe_resource",
            cube_id=cube_id,
            unit_type=UnitType.SRAM,
            sub_offset=sram_offset,
        )

    @staticmethod
    def pe_tcm_addr(
        *, rack_id: int, sip_id: int, cube_id: int, pe_id: int, tcm_offset: int,
    ) -> PhysAddr:
        """Build a PE-resource address that targets one PE's TCM.

        Raises:
            PhysAddrError: if ``cube_id`` is outside 0..31, ``pe_id`` does not
                fit in 4 bits, or ``tcm_offset`` does not fit in 29 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("pe_id", pe_id, 4)
        _chk_range("tcm_offset", tcm_offset, 29)
        unit_field = UnitType.PE << 34
        pe_field = pe_id << 30
        return PhysAddr(
            rack_id=rack_id,
            sip_id=sip_id,
            sip_seg=cube_id,
            local_offset=unit_field | pe_field | tcm_offset,
            kind="pe_resource",
            cube_id=cube_id,
            unit_type=UnitType.PE,
            pe_id=pe_id,
            sub_offset=tcm_offset,
        )
+174
View File
@@ -0,0 +1,174 @@
from __future__ import annotations
from dataclasses import dataclass
from math import ceil
from typing import Literal
@dataclass(frozen=True)
class DPPolicy:
    """Immutable pair of data-parallel choices: one per cube, one per PE."""

    # How the tensor is distributed across cubes.
    cube: Literal["replicate", "shard_m", "shard_k"] = "replicate"
    # How each cube's portion is distributed across its PEs.
    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
def resolve_dp_policy(
    policy: DPPolicy,
    *,
    shape: tuple[int, int],
    itemsize: int,
    num_pe: int,
    num_cubes: int = 1,
) -> list[ShardSpec]:
    """Expand a DPPolicy into the concrete list of shards.

    With ``num_cubes <= 1`` only the pe-level policy is applied and the
    cube-level policy is not consulted.  Otherwise the cube-level policy
    picks each cube's sub-shape and base byte offset, the pe-level policy
    is resolved inside that sub-shape, and each shard's ``pe_index`` becomes
    the flat index ``cube_id * num_pe + pe_id``.

    Raises:
        ValueError: if the pe-level policy is unknown, or (when
            ``num_cubes > 1``) the cube-level policy is unknown.
    """
    pe_resolvers = {
        "replicate": replicate,
        "column_wise": column_wise,
        "row_wise": row_wise,
    }
    if policy.pe not in pe_resolvers:
        raise ValueError(f"Unknown pe-level policy: {policy.pe}")
    split_within_cube = pe_resolvers[policy.pe]

    if num_cubes <= 1:
        return split_within_cube(shape=shape, itemsize=itemsize, num_pe=num_pe)

    # Two-level resolution: cube-level → pe-level.  The per-cube shape and the
    # byte distance between consecutive cubes are the same for every cube.
    rows, cols = shape
    if policy.cube == "replicate":
        cube_shape = (rows, cols)
        cube_stride_bytes = 0
    elif policy.cube == "shard_m":
        rows_per_cube = rows // num_cubes
        cube_shape = (rows_per_cube, cols)
        cube_stride_bytes = rows_per_cube * cols * itemsize
    elif policy.cube == "shard_k":
        cols_per_cube = cols // num_cubes
        cube_shape = (rows, cols_per_cube)
        cube_stride_bytes = rows * cols_per_cube * itemsize
    else:
        raise ValueError(f"Unknown cube-level policy: {policy.cube}")

    flattened: list[ShardSpec] = []
    for cube_id in range(num_cubes):
        base_offset = cube_id * cube_stride_bytes
        first_flat_pe = cube_id * num_pe
        # Shift each cube-local shard to its flat PE index and absolute offset.
        flattened.extend(
            ShardSpec(
                pe_index=first_flat_pe + local.pe_index,
                offset_bytes=base_offset + local.offset_bytes,
                nbytes=local.nbytes,
            )
            for local in split_within_cube(
                shape=cube_shape, itemsize=itemsize, num_pe=num_pe
            )
        )
    return flattened
@dataclass(frozen=True)
class ShardSpec:
    """Immutable description of one shard: which PE gets which byte range."""

    pe_index: int  # target PE
    offset_bytes: int  # byte offset at which the shard starts
    nbytes: int  # shard size in bytes
def column_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[ShardSpec]:
    """Divide the K axis evenly over ``num_pe`` PEs; each PE gets (M, K // P).

    Shards are equally sized and laid out back to back, so PE ``i`` starts at
    ``i`` times the shard size.  Any remainder of ``K // num_pe`` is dropped.
    """
    rows, cols = shape
    shard_bytes = rows * (cols // num_pe) * itemsize
    return [
        ShardSpec(pe_index=pe, offset_bytes=pe * shard_bytes, nbytes=shard_bytes)
        for pe in range(num_pe)
    ]
def row_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[ShardSpec]:
    """Divide the M axis evenly over ``num_pe`` PEs; each PE gets (M // P, K).

    Shards are equally sized and laid out back to back, so PE ``i`` starts at
    ``i`` times the shard size.  Any remainder of ``M // num_pe`` is dropped.
    """
    rows, cols = shape
    shard_bytes = (rows // num_pe) * cols * itemsize
    return [
        ShardSpec(pe_index=pe, offset_bytes=pe * shard_bytes, nbytes=shard_bytes)
        for pe in range(num_pe)
    ]
def replicate(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[ShardSpec]:
    """Give every PE the whole (M, K) tensor, starting at offset 0."""
    rows, cols = shape
    tensor_bytes = rows * cols * itemsize
    copies: list[ShardSpec] = []
    for pe in range(num_pe):
        copies.append(ShardSpec(pe_index=pe, offset_bytes=0, nbytes=tensor_bytes))
    return copies
def tiled_column_major(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
    tile_m: int, tile_k: int,
) -> list[ShardSpec]:
    """2D tiling, column-major order (K axis first), round-robin across PEs.

    Tiles are visited with the K tile index varying fastest and handed to PEs
    in turn (``tile number % num_pe``).  Each shard's offset is the byte
    position of the tile's first element, using a row stride of
    ``K * itemsize``.
    """
    rows, cols = shape
    tiles_along_m = ceil(rows / tile_m)
    tiles_along_k = ceil(cols / tile_k)
    # NOTE(review): tile counts are rounded up, yet every shard reports the
    # full tile size, including edge tiles when the shape is not a multiple
    # of the tile size -- confirm that this padding is intended.
    tile_nbytes = tile_m * tile_k * itemsize
    row_stride = cols * itemsize
    visit_order = [
        (mi, ki) for mi in range(tiles_along_m) for ki in range(tiles_along_k)
    ]
    return [
        ShardSpec(
            pe_index=seq % num_pe,
            offset_bytes=(mi * tile_m * row_stride) + (ki * tile_k * itemsize),
            nbytes=tile_nbytes,
        )
        for seq, (mi, ki) in enumerate(visit_order)
    ]
def tiled_row_major(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
    tile_m: int, tile_k: int,
) -> list[ShardSpec]:
    """2D tiling, row-major order (M axis first), round-robin across PEs.

    Tiles are visited with the M tile index varying fastest and handed to PEs
    in turn (``tile number % num_pe``).  Each shard's offset is the byte
    position of the tile's first element, using a row stride of
    ``K * itemsize``.
    """
    rows, cols = shape
    tiles_along_m = ceil(rows / tile_m)
    tiles_along_k = ceil(cols / tile_k)
    # NOTE(review): tile counts are rounded up, yet every shard reports the
    # full tile size, including edge tiles when the shape is not a multiple
    # of the tile size -- confirm that this padding is intended.
    tile_nbytes = tile_m * tile_k * itemsize
    row_stride = cols * itemsize
    visit_order = [
        (mi, ki) for ki in range(tiles_along_k) for mi in range(tiles_along_m)
    ]
    return [
        ShardSpec(
            pe_index=seq % num_pe,
            offset_bytes=(mi * tile_m * row_stride) + (ki * tile_k * itemsize),
            nbytes=tile_nbytes,
        )
        for seq, (mi, ki) in enumerate(visit_order)
    ]
+184
View File
@@ -0,0 +1,184 @@
from __future__ import annotations
import heapq
from collections import defaultdict
from kernbench.policy.address.phyaddr import PhysAddr, UnitType
from kernbench.topology.types import TopologyGraph
class RoutingError(Exception):
    """Raised when an address or node cannot be resolved, or no path exists."""
class AddressResolver:
    """Map a PhysAddr to the id of its destination node in the compiled graph.

    Named lookups (find_m_cpu, find_pcie_ep, …) are offered as well, so that
    component implementations never build node_id strings themselves.
    Keeping the naming convention in one place means a single change
    propagates everywhere (ADR-0015 D4).
    """

    def __init__(self, graph: TopologyGraph) -> None:
        """Snapshot the graph's node ids and derive the HBM slice size in bytes."""
        self._node_ids = set(graph.nodes)
        memory_map = graph.spec["cube"]["memory_map"]
        cube_hbm_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
        self._slice_size_bytes = cube_hbm_bytes // memory_map["hbm_slices_per_cube"]

    def _require(self, node_id: str, label: str) -> str:
        """Return ``node_id`` if the topology contains it, else raise RoutingError."""
        if node_id in self._node_ids:
            return node_id
        raise RoutingError(f"{label} not found: {node_id}")

    # ── Physical-address resolution ──────────────────────────────────

    def resolve(self, addr: PhysAddr) -> str:
        """Return the node id that ``addr`` targets.

        HBM addresses map to the HBM controller slice containing the offset;
        PE-resource addresses map to the PE's TCM, the cube SRAM, or the cube
        M_CPU depending on the unit type.

        Raises:
            RoutingError: for an unsupported kind or unit type, or when the
                resulting node id is not part of the topology.
        """
        cube_prefix = f"sip{addr.sip_id}.cube{addr.cube_id}"

        if addr.kind == "hbm":
            slice_index = PhysAddr.hbm_pe_id(addr.hbm_offset, self._slice_size_bytes)
            target = f"{cube_prefix}.hbm_ctrl.slice{slice_index}"
        elif addr.kind == "pe_resource":
            unit = addr.unit_type
            if unit == UnitType.PE:
                target = f"{cube_prefix}.pe{addr.pe_id}.pe_tcm"
            elif unit == UnitType.SRAM:
                target = f"{cube_prefix}.sram"
            elif unit == UnitType.MCPU:
                target = f"{cube_prefix}.m_cpu"
            else:
                raise RoutingError(f"unsupported unit_type: {addr.unit_type}")
        else:
            raise RoutingError(f"unsupported address kind: {addr.kind}")

        if target in self._node_ids:
            return target
        raise RoutingError(f"node {target} not found in topology")

    # ── Named node lookups ───────────────────────────────────────────

    def find_m_cpu(self, sip: int, cube: int) -> str:
        """Return the M_CPU node id of the given cube, or raise RoutingError."""
        return self._require(f"sip{sip}.cube{cube}.m_cpu", "M_CPU")

    def find_pcie_ep(self, sip: int, io_id: str = "io0") -> str:
        """Return the PCIE_EP node id of the given IO block, or raise RoutingError."""
        return self._require(f"sip{sip}.{io_id}.pcie_ep", "PCIE_EP")

    def find_io_cpu(self, sip: int, io_id: str = "io0") -> str:
        """Return the IO_CPU node id of the given IO block, or raise RoutingError."""
        return self._require(f"sip{sip}.{io_id}.io_cpu", "IO_CPU")

    def find_all_pcie_eps(self) -> list[str]:
        """Return all PCIE_EP node ids across all SIPs, sorted."""
        endpoints = [nid for nid in self._node_ids if nid.endswith(".pcie_ep")]
        endpoints.sort()
        return endpoints
class PathRouter:
    """Find data-path from a source PE (or arbitrary node) to a destination node.

    Three adjacency maps are built from the graph's edges:
      _adj          — excludes command edges (used by PE DMA routing, find_path)
      _adj_all      — includes all edges (used by component-to-component routing,
                      find_node_path; required because M_CPU↔NOC links are "command")
      _adj_mcpu_dma — excludes the edge kinds in _MCPU_DMA_EXCLUDE (used by
                      find_mcpu_dma_path for cross-cube routing)

    The edge weight is ``routing_weight_mm`` when set, otherwise ``distance_mm``.
    """

    # Edge kinds excluded from M_CPU DMA adjacency: prevents routing through
    # PE-internal pipeline nodes when computing DMA paths.
    _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_xbar"}

    def __init__(self, graph: TopologyGraph) -> None:
        """Index ``graph.edges`` into the three adjacency maps."""
        self._adj: dict[str, list[tuple[str, float]]] = defaultdict(list)
        self._adj_all: dict[str, list[tuple[str, float]]] = defaultdict(list)
        self._adj_mcpu_dma: dict[str, list[tuple[str, float]]] = defaultdict(list)
        for e in graph.edges:
            w = e.routing_weight_mm if e.routing_weight_mm is not None else e.distance_mm
            hop = (e.dst, w)
            self._adj_all[e.src].append(hop)
            if e.kind != "command":
                self._adj[e.src].append(hop)
            # Command edges stay in the M_CPU DMA map: per the docstrings the
            # M_CPU↔NOC hop is a command link that this routing must cross.
            if e.kind not in self._MCPU_DMA_EXCLUDE:
                self._adj_mcpu_dma[e.src].append(hop)

    def find_path(self, src_pe: str, dst_node: str) -> list[str]:
        """PE DMA routing: starts at ``<src_pe>.pe_dma``, excludes command edges."""
        start = f"{src_pe}.pe_dma"
        return self._run_dijkstra(self._adj, start, dst_node)

    def find_path_with_distance(self, src_pe: str, dst_node: str) -> tuple[list[str], float]:
        """Like find_path, but also return the total weight of the path."""
        start = f"{src_pe}.pe_dma"
        return self._run_dijkstra_with_dist(self._adj, start, dst_node)

    def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_slice_id: str) -> list[str]:
        """M_CPU DMA path: never routes through PE-internal nodes (ADR-0015 D5).

        Same-cube: deterministic [m_cpu, noc, xbar.pe_i, hbm_ctrl.slice_i],
        built from the ids without consulting the graph.
        Cross-cube: Dijkstra via _adj_mcpu_dma (pe_internal/pe_to_xbar excluded)
        → routes through NOC → UCIe → target cube NOC → xbar → HBM.

        The cube of each id is taken from its first two dot-separated parts.
        In the same-cube case the slice index is parsed from the text after
        the last "slice" in ``dst_hbm_slice_id``; an id without a numeric
        slice suffix raises IndexError or ValueError.
        """
        m_cube = ".".join(m_cpu_id.split(".")[:2])
        d_cube = ".".join(dst_hbm_slice_id.split(".")[:2])
        if m_cube == d_cube:
            slice_idx = int(dst_hbm_slice_id.rsplit("slice", 1)[1])
            return [
                m_cpu_id,
                f"{m_cube}.noc",
                f"{m_cube}.xbar.pe{slice_idx}",
                dst_hbm_slice_id,
            ]
        return self._run_dijkstra(self._adj_mcpu_dma, m_cpu_id, dst_hbm_slice_id)

    def find_node_path(self, src: str, dst: str) -> list[str]:
        """General routing between arbitrary nodes, including command edges.

        Used by components (IoCpuComponent, MCpuComponent) that route through
        M_CPU↔NOC command-kind links.
        """
        return self._run_dijkstra(self._adj_all, src, dst)

    def _run_dijkstra(
        self,
        adj: dict[str, list[tuple[str, float]]],
        start: str,
        goal: str,
    ) -> list[str]:
        """Return only the node list of the shortest path from start to goal."""
        path, _ = self._run_dijkstra_with_dist(adj, start, goal)
        return path

    def _run_dijkstra_with_dist(
        self,
        adj: dict[str, list[tuple[str, float]]],
        start: str,
        goal: str,
    ) -> tuple[list[str], float]:
        """Dijkstra shortest path over ``adj``; returns (path, total weight).

        Raises:
            RoutingError: if ``goal`` cannot be reached from ``start``.
        """
        if start == goal:
            return [start], 0.0
        unreachable = float("inf")
        best: dict[str, float] = {start: 0.0}
        prev: dict[str, str] = {}
        heap: list[tuple[float, str]] = [(0.0, start)]
        while heap:
            d, node = heapq.heappop(heap)
            if node == goal:
                # Walk the predecessor chain back to the start, then flip it.
                path: list[str] = [goal]
                while path[-1] != start:
                    path.append(prev[path[-1]])
                path.reverse()
                return path, d
            if d > best.get(node, unreachable):
                continue  # stale heap entry; a shorter route was already settled
            # .get() rather than adj[node]: the maps are defaultdicts, and
            # indexing would insert an empty list for every node without
            # outgoing edges, mutating the adjacency map on each query.
            for neighbor, edge_dist in adj.get(node, ()):
                new_d = d + edge_dist
                if new_d < best.get(neighbor, unreachable):
                    best[neighbor] = new_d
                    prev[neighbor] = node
                    heapq.heappush(heap, (new_d, neighbor))
        raise RoutingError(f"no path from {start} to {goal}")

    # ── backward-compat shims (used by existing tests) ───────────────

    def _dijkstra(self, start: str, goal: str) -> list[str]:
        """Shortest path over the non-command adjacency map."""
        return self._run_dijkstra(self._adj, start, goal)

    def _dijkstra_with_dist(self, start: str, goal: str) -> tuple[list[str], float]:
        """Shortest path and its weight over the non-command adjacency map."""
        return self._run_dijkstra_with_dist(self._adj, start, goal)