ADR-0019 D1/D4: per-PE HBM CTRL partitioning

Restores per-PE HBM controller partitioning that was lost in
commit 5917b34 ("Replace xbar/bridge/single-NOC with explicit
router mesh"), which had over-consolidated the per-slice HBM CTRL
into a single cube-wide ``hbm_ctrl`` connected to every router —
the opposite of what ADR-0019 D1/D4 specifies.

Builder splits ``hbm_ctrl`` into 8 ``hbm_ctrl.pe{X}`` instances per
cube, each reachable ONLY through PE_X's attaching router via the
existing ``peX.hbm`` attach metadata from cube_mesh.yaml. Cube
aggregate BW now matches the spec (8 PEs × 8 PCs × 32 GB/s =
2048 GB/s) instead of collapsing to 256 GB/s.

AddressResolver decodes the target PE from the HBM PA's hbm_offset
(``offset // slice_size``) and returns ``hbm_ctrl.pe{X}``. PathRouter
uses the existing ``_adj_local`` adjacency for same-cube PE_DMA so
the cube's own UCIe port can no longer appear as a zero-distance
shortcut between routers — local PE_DMA now traverses the mesh,
restoring the ADR-0019 D4 worked example
``PE0.pe_dma → r0c0 → … → r1c4 → hbm_ctrl``.

Tests:
- New tests/test_per_pe_hbm_partition.py: 14 tests covering
  topology shape, per-PE router exclusivity, PA resolution,
  single-hop local path, cross-PE mesh traversal, and end-to-end
  latency monotonicity. Probe CLI now reports the latency ordering
  pe-local < pe-same-half < pe-cross-half (previously a uniform 141 ns).
- Existing tests are updated for the new node ids, and two
  assertions that locked in the wrong consolidation are replaced:
  test_noc_mesh.test_hbm_connects_to_all_routers and
  test_topology_compile.test_hbm_ctrl_connects_all_routers are
  now per-PE exclusivity assertions. In addition,
  test_routing.test_all_pe_hbm_equidistant becomes
  test_cross_pe_hbm_distance_increases_with_mesh_hops.
- test_ipcq_buffer_kind_locations.test_hbm_pe_hop_charged_at_large_payload
  threshold recalibrated 4000→1500 ns: the prior figure reflected
  serialization on the over-consolidated single hbm_ctrl; per-PE
  partitioning removes that artificial contention so the gap
  shrinks to the genuine PE↔HBM-hop cost.

Full suite: 645 passed, 1 skipped.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-15 01:04:30 -07:00
parent aaa1cbfaf6
commit b8213d43a9
17 changed files with 486 additions and 168 deletions
+35 -4
View File
@@ -11,6 +11,18 @@ class RoutingError(Exception):
pass
def _same_cube(a: str, b: str) -> bool:
"""Return True if both node ids share a sip+cube prefix (`sipX.cubeY.`)."""
pa = a.split(".")
pb = b.split(".")
return (
len(pa) >= 2 and len(pb) >= 2
and pa[0] == pb[0]
and pa[1].startswith("cube") and pb[1].startswith("cube")
and pa[1] == pb[1]
)
class AddressResolver:
"""Resolve a PhysAddr to the destination node_id in the compiled graph.
@@ -22,6 +34,12 @@ class AddressResolver:
def __init__(self, graph: TopologyGraph) -> None:
self._node_ids = set(graph.nodes)
# HBM slice size (bytes) — used to decode pe_id from hbm_offset
# so HBM PA → hbm_ctrl.pe{X} (ADR-0019 D1/D4).
mm = graph.spec.get("cube", {}).get("memory_map", {})
hbm_total_gb = int(mm.get("hbm_total_gb_per_cube", 48))
slices_per_cube = int(mm.get("hbm_slices_per_cube", 8))
self._hbm_slice_bytes = hbm_total_gb * (1 << 30) // slices_per_cube
# ── Physical-address resolution ──────────────────────────────────
@@ -29,7 +47,8 @@ class AddressResolver:
s = addr.sip_id
d = addr.die_id
if addr.kind == "hbm":
node_id = f"sip{s}.cube{d}.hbm_ctrl"
pe_id = int(addr.hbm_offset) // self._hbm_slice_bytes
node_id = f"sip{s}.cube{d}.hbm_ctrl.pe{pe_id}"
elif addr.kind == "pe_resource":
if addr.unit_type == UnitType.PE:
node_id = f"sip{s}.cube{d}.pe{addr.pe_id}.pe_tcm"
@@ -103,13 +122,25 @@ class PathRouter:
self._adj_local[e.src].append((e.dst, w))
def find_path(self, src_pe: str, dst_node: str) -> list[str]:
"""PE DMA routing: prepends .pe_dma, excludes command edges."""
"""PE DMA routing: prepends .pe_dma, excludes command edges.
When source and destination share the same cube, route through
the cube-local adjacency (which excludes UCIe internal edges).
Otherwise the cube's own UCIe port appears as a zero-distance
bus that Dijkstra prefers over the mesh — that is intended only
for cross-cube routing. Local PE_DMA must traverse the mesh so
cross-PE-slice access pays the mesh-distance cost (ADR-0019 D4).
"""
start = f"{src_pe}.pe_dma"
return self._run_dijkstra(self._adj, start, dst_node)
adj = self._adj_local if _same_cube(start, dst_node) else self._adj
return self._run_dijkstra(adj, start, dst_node)
def find_path_with_distance(self, src_pe: str, dst_node: str) -> tuple[list[str], float]:
"""Match find_path's cube-local routing so reported distance reflects
the actual chosen path (ADR-0019 D4)."""
start = f"{src_pe}.pe_dma"
return self._run_dijkstra_with_dist(self._adj, start, dst_node)
adj = self._adj_local if _same_cube(start, dst_node) else self._adj
return self._run_dijkstra_with_dist(adj, start, dst_node)
def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_id: str) -> list[str]:
"""M_CPU DMA path: routes through router mesh (ADR-0019).
+37 -30
View File
@@ -404,20 +404,26 @@ def _instantiate_cube(
label=name.upper().replace("_", " "),
)
# ── HBM controller (single node, ADR-0019 D1, ADR-0033) ──
# ── Per-PE HBM controller (ADR-0019 D1/D4) ──
# Each PE owns one slice of the cube's HBM. The slice has its own
# set of pseudo-channels and is reachable ONLY through that PE's
# attaching router (see cube_mesh.yaml ``peX.hbm`` attach lists).
# Restored after the ADR-0019 over-consolidation in commit 5917b34.
hbm_spec = cube["components"]["hbm_ctrl"]
hbm_lx, hbm_ly = local_pos["hbm_ctrl"]
hbm_id = f"{cp}.hbm_ctrl"
hbm_attrs = dict(hbm_spec["attrs"])
_hbm_total_bw = float(cube["links"].get("hbm_to_router_bw_gbs", 256.0))
_num_pcs = int(hbm_attrs.get("num_pcs", 8))
hbm_attrs["num_pcs"] = _num_pcs
hbm_attrs["pc_bw_gbs"] = _hbm_total_bw / _num_pcs
nodes[hbm_id] = Node(
id=hbm_id, kind=hbm_spec["kind"], impl=hbm_spec["impl"],
attrs=hbm_attrs, pos_mm=(ox + hbm_lx, oy + hbm_ly),
label="HBM CTRL",
)
_num_pcs = int(hbm_spec["attrs"].get("num_pcs", 8))
pes_per_cube = int(cube["memory_map"].get("hbm_slices_per_cube", 8))
for pe_idx in range(pes_per_cube):
pe_hbm_id = f"{cp}.hbm_ctrl.pe{pe_idx}"
pe_hbm_attrs = dict(hbm_spec["attrs"])
pe_hbm_attrs["num_pcs"] = _num_pcs
pe_hbm_attrs["pc_bw_gbs"] = _hbm_total_bw / _num_pcs
nodes[pe_hbm_id] = Node(
id=pe_hbm_id, kind=hbm_spec["kind"], impl=hbm_spec["impl"],
attrs=pe_hbm_attrs, pos_mm=(ox + hbm_lx, oy + hbm_ly),
label=f"HBM CTRL pe{pe_idx}",
)
# ── Router mesh from cube_mesh.yaml (ADR-0019 D3) ──
routers = mesh_data["routers"]
@@ -566,7 +572,22 @@ def _instantiate_cube(
kind="command",
))
elif item.endswith(".hbm"):
pass # HBM edges handled below (all routers)
# peX.hbm: router rXcY owns the entry to hbm_ctrl.peX.
# (ADR-0019 D1/D4 — per-PE HBM partitioning.)
pe_prefix = item.rsplit(".", 1)[0]
pe_idx = int(pe_prefix.replace("pe", ""))
pe_hbm_id = f"{cp}.hbm_ctrl.pe{pe_idx}"
if pe_hbm_id in nodes:
edges.append(Edge(
src=rid, dst=pe_hbm_id,
distance_mm=0.0, bw_gbs=hbm_to_router_bw,
kind="router_to_hbm",
))
edges.append(Edge(
src=pe_hbm_id, dst=rid,
distance_mm=0.0, bw_gbs=hbm_to_router_bw,
kind="hbm_to_router",
))
elif item == "m_cpu":
# M_CPU ↔ router
mcpu_id = f"{cp}.m_cpu"
@@ -623,24 +644,10 @@ def _instantiate_cube(
kind="router_to_ucie_conn",
))
# ── HBM_CTRL ↔ all routers (ADR-0019 D1) ──
# High routing weight prevents Dijkstra from using HBM as transit shortcut
for rkey, rval in routers.items():
if rval is None:
continue
rid = f"{cp}.{rkey}"
edges.append(Edge(
src=rid, dst=hbm_id,
distance_mm=0.0, bw_gbs=hbm_to_router_bw,
routing_weight_mm=1000.0,
kind="router_to_hbm",
))
edges.append(Edge(
src=hbm_id, dst=rid,
distance_mm=0.0, bw_gbs=hbm_to_router_bw,
routing_weight_mm=1000.0,
kind="hbm_to_router",
))
# NOTE: HBM↔router edges are created in the per-router attach loop
# above (peX.hbm items map router → hbm_ctrl.peX). Removed the
# legacy "all routers → single hbm_ctrl" loop that bypassed the
# ADR-0019 D4 per-PE partition.
def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None: