diff --git a/components.yaml b/components.yaml
index 8bf0f85..1e22d69 100644
--- a/components.yaml
+++ b/components.yaml
@@ -28,9 +28,6 @@ components:
switch_v1: kernbench.components.builtin.forwarding:TransitComponent
noc_v1: kernbench.components.builtin.forwarding:TransitComponent
ucie_v1: kernbench.components.builtin.forwarding:TransitComponent
- noc_2d_mesh_v1: kernbench.components.builtin.noc:TwoDMeshNocComponent
- xbar_v1: kernbench.components.builtin.xbar:PositionAwareXbarComponent
-
# IO / Host interface
pcie_ep_v1: kernbench.components.builtin.pcie_ep:PcieEpComponent
io_cpu_v1: kernbench.components.builtin.io_cpu:IoCpuComponent
diff --git a/docs/diagrams/cube_view.svg b/docs/diagrams/cube_view.svg
index 3743329..df6bac0 100644
--- a/docs/diagrams/cube_view.svg
+++ b/docs/diagrams/cube_view.svg
@@ -135,17 +135,14 @@
r0c0
PE0
-
+
r0c1
PE1
-
-
+
+
r0c2
-
- M_CPU
-
r0c3
@@ -156,20 +153,23 @@
r1c0
r1c1
-
+
r1c2
+
+ M_CPU
+
r1c3
r1c4
PE2
-
+
r1c5
PE3
-
+
r2c0
@@ -182,7 +182,7 @@
r3c0
SRAM
-
+
r3c1
@@ -193,12 +193,12 @@
r4c0
PE4
-
+
r4c1
PE5
-
+
r4c2
@@ -219,84 +219,84 @@
r5c4
PE6
-
+
r5c5
PE7
-
-
+
+
256GB/s
-
+
256GB/s
-
+
256GB/s
-
+
256GB/s
-
+
256GB/s
-
+
256GB/s
-
+
256GB/s
-
+
256GB/s
UCIe-W
c0
-
+
c1
-
+
c2
-
+
c3
-
+
UCIe-N
c0
-
+
c1
-
+
c2
-
+
c3
-
+
UCIe-E
c0
-
+
c1
-
+
c2
-
+
c3
-
+
UCIe-S
c0
-
+
c1
-
+
c2
-
+
c3
-
+
PE Router
diff --git a/src/kernbench/cli/probe.py b/src/kernbench/cli/probe.py
index c3e1403..1333c8c 100644
--- a/src/kernbench/cli/probe.py
+++ b/src/kernbench/cli/probe.py
@@ -116,7 +116,7 @@ def _fmt_util(eff: float, bn: float | None) -> str:
def _short_name(node_id: str) -> str:
- """Shorten node id: keep last 2 segments to avoid ambiguity (xbar.pe0 vs pe0)."""
+ """Shorten node id: keep last 2 segments to avoid ambiguity (router.pe0 vs pe0)."""
parts = node_id.split(".")
return ".".join(parts[-2:]) if len(parts) >= 2 else node_id
@@ -366,7 +366,7 @@ def run_probe(topology_path: str, case_filter: str | None = None) -> int:
# --- PE DMA Summary Table ---
print()
- print(f"=== PE DMA Latency (pe_dma -> xbar -> HBM, data={nbytes}B) ===")
+ print(f"=== PE DMA Latency (pe_dma -> router -> HBM, data={nbytes}B) ===")
print(f" {'Case':<26} {'Target':<28} {'Actual':>8}"
f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}")
diff --git a/src/kernbench/cli/report.py b/src/kernbench/cli/report.py
index 815501a..e3d619f 100644
--- a/src/kernbench/cli/report.py
+++ b/src/kernbench/cli/report.py
@@ -137,7 +137,7 @@ def _extract_peaks(spec: dict | None) -> tuple[float, float]:
gemm_attrs = comps.get("pe_gemm", {}).get("attrs", {})
peak_tflops = float(gemm_attrs.get("peak_tflops_f16", 0.0))
cube_links = cube.get("links", {})
- hbm_bw = float(cube_links.get("xbar_to_hbm_bw_gbs", 0.0))
+ hbm_bw = float(cube_links.get("hbm_to_router_bw_gbs", 0.0))
return peak_tflops, hbm_bw
diff --git a/src/kernbench/components/builtin/noc.py b/src/kernbench/components/builtin/noc.py
deleted file mode 100644
index 472b7e6..0000000
--- a/src/kernbench/components/builtin/noc.py
+++ /dev/null
@@ -1,224 +0,0 @@
-from __future__ import annotations
-
-from collections.abc import Generator
-from typing import TYPE_CHECKING, Any
-
-import simpy
-
-from kernbench.components.base import ComponentBase
-
-if TYPE_CHECKING:
- from kernbench.components.context import ComponentContext
- from kernbench.topology.types import Node
-
-
-class TwoDMeshNocComponent(ComponentBase):
- """2D mesh NOC modeled as a single smart node.
-
- Latency model:
- - Traversal latency = Manhattan distance between prev_hop and next_hop
- node positions, split into XY segments, traversed with pipeline.
- - overhead_ns (from node.attrs) is added once per traversal.
-
- Contention model:
- - Each directed XY segment is a simpy.Resource(capacity=1).
- - Pipeline: next segment's resource is requested before the current
- segment's timeout completes, so a free downstream segment is acquired
- immediately (wormhole-style cut-through).
- - Two transactions sharing a segment (same row or column band) contend.
-
- Concurrency:
- - _worker spawns an independent SimPy process per transaction, so the
- NOC is never serialized at the node level — only at segment resources.
- """
-
- def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
- super().__init__(node, ctx)
- self._env: simpy.Environment | None = None
- self._links: dict[tuple, simpy.Resource] = {}
- self._x_grid: list[float] = []
- self._y_grid: list[float] = []
-
- def start(self, env: simpy.Environment) -> None:
- self._env = env
- self._build_grid()
- super().start(env)
-
- def run(self, env: simpy.Environment, nbytes: int) -> Generator:
- yield env.timeout(0)
-
- # ── Grid construction ────────────────────────────────────────────
-
- def _build_grid(self) -> None:
- if not self.ctx:
- return
- mesh = self.ctx.spec.get("_mesh") if self.ctx.spec else None
- if mesh:
- self._build_grid_from_mesh(mesh)
- else:
- self._build_grid_from_positions()
-
- def _build_grid_from_mesh(self, mesh: dict) -> None:
- """Build XY grid from cube_mesh.yaml router positions (authoritative)."""
- origin_x, origin_y = self._cube_origin()
- xs: set[float] = set()
- ys: set[float] = set()
- for key, router in mesh.get("routers", {}).items():
- if router is not None:
- xs.add(round(origin_x + router["pos_mm"][0], 2))
- ys.add(round(origin_y + router["pos_mm"][1], 2))
- self._x_grid = sorted(xs)
- self._y_grid = sorted(ys)
-
- def _build_grid_from_positions(self) -> None:
- """Fallback: infer grid from all node positions in the cube."""
- cube_prefix = self.node.id.rsplit(".", 1)[0]
- xs: set[float] = set()
- ys: set[float] = set()
- for node_id, pos in self.ctx.positions.items():
- if node_id.startswith(cube_prefix + ".") and pos is not None:
- xs.add(round(pos[0], 2))
- ys.add(round(pos[1], 2))
- self._x_grid = sorted(xs)
- self._y_grid = sorted(ys)
-
- def _cube_origin(self) -> tuple[float, float]:
- """Compute absolute origin (top-left) of this cube from cube_id."""
- parts = self.node.id.split(".")
- cube_str = [p for p in parts if p.startswith("cube")][0]
- cube_id = int(cube_str[4:])
- spec = self.ctx.spec
- sip_spec = spec.get("sip", {})
- cube_spec = spec.get("cube", {})
- mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4)
- cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
- cube_h = cube_spec.get("geometry", {}).get("cube_mm", {}).get("h", 14.0)
- seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get(
- "distance_mm_across_seam", 1.0)
- col = cube_id % mesh_w
- row = cube_id // mesh_w
- return (col * (cube_w + seam), row * (cube_h + seam))
-
- def _get_link(self, key: tuple) -> simpy.Resource:
- if key not in self._links:
- assert self._env is not None
- self._links[key] = simpy.Resource(self._env, capacity=1)
- return self._links[key]
-
- # ── Worker ───────────────────────────────────────────────────────
-
- def _worker(self, env: simpy.Environment) -> Generator:
- while True:
- txn: Any = yield self._inbox.get()
- env.process(self._route(env, txn))
-
- def _route(self, env: simpy.Environment, txn: Any) -> Generator:
- prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
- next_hop = txn.next_hop
- overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
-
- links: list[tuple[tuple, float]] = []
- if prev_hop and next_hop and self.ctx:
- src_pos = self.ctx.positions.get(prev_hop)
- dst_pos = self.ctx.positions.get(next_hop)
- if src_pos and dst_pos:
- links = self._xy_links(src_pos, dst_pos)
-
- if links:
- yield from self._traverse(env, links, overhead_ns)
- else:
- yield env.timeout(overhead_ns)
-
- if next_hop:
- yield self.out_ports[next_hop].put(txn.advance())
- else:
- drain = getattr(txn, "drain_ns", 0.0)
- if drain > 0:
- yield env.timeout(drain)
- txn.done.succeed()
-
- # ── XY routing and pipelined link traversal ──────────────────────
-
- def _traverse(
- self,
- env: simpy.Environment,
- links: list[tuple[tuple, float]],
- overhead_ns: float,
- ) -> Generator:
- """Pipeline: request next segment before current timeout finishes."""
- ns_per_mm = self.ctx.ns_per_mm # type: ignore[union-attr]
-
- # Acquire first link
- first_key, _ = links[0]
- current_resource = self._get_link(first_key)
- current_req = current_resource.request()
- yield current_req
-
- for i, (_, dist_mm) in enumerate(links):
- # Request next link before current timeout (pipeline)
- if i + 1 < len(links):
- next_key, _ = links[i + 1]
- next_resource = self._get_link(next_key)
- next_req = next_resource.request()
-
- yield env.timeout(dist_mm * ns_per_mm + (overhead_ns if i == 0 else 0.0))
- current_resource.release(current_req)
-
- if i + 1 < len(links):
- yield next_req # usually already fulfilled (pipeline)
- current_resource = next_resource
- current_req = next_req
-
- def _xy_links(
- self,
- src: tuple[float, float],
- dst: tuple[float, float],
- ) -> list[tuple[tuple, float]]:
- """XY routing: horizontal segment first, then vertical.
-
- Returns list of (link_key, dist_mm) pairs, where link_key uniquely
- identifies a directed segment shared across concurrent transactions.
- """
- x0, y0 = src
- x1, y1 = dst
- links: list[tuple[tuple, float]] = []
-
- # Horizontal segment at y≈y0
- if abs(x0 - x1) > 1e-9:
- y_band = self._snap(y0, self._y_grid)
- for xa, xb in self._segments(x0, x1, self._x_grid):
- d = abs(xb - xa)
- if d > 1e-9:
- lo, hi = (xa, xb) if xa < xb else (xb, xa)
- dir_h = "E" if xb > xa else "W"
- links.append((("H", round(y_band, 2), round(lo, 2), round(hi, 2), dir_h), d))
-
- # Vertical segment at x≈x1
- if abs(y0 - y1) > 1e-9:
- x_band = self._snap(x1, self._x_grid)
- for ya, yb in self._segments(y0, y1, self._y_grid):
- d = abs(yb - ya)
- if d > 1e-9:
- lo, hi = (ya, yb) if ya < yb else (yb, ya)
- dir_v = "S" if yb > ya else "N"
- links.append((("V", round(x_band, 2), round(lo, 2), round(hi, 2), dir_v), d))
-
- return links
-
- @staticmethod
- def _snap(val: float, grid: list[float]) -> float:
- if not grid:
- return val
- return min(grid, key=lambda g: abs(g - val))
-
- @staticmethod
- def _segments(a: float, b: float, grid: list[float]) -> list[tuple[float, float]]:
- """Consecutive (p_i, p_{i+1}) pairs covering range [a, b] using grid waypoints."""
- if abs(a - b) < 1e-9:
- return []
- lo, hi = (a, b) if a < b else (b, a)
- pts = [lo] + [g for g in grid if lo + 1e-9 < g < hi - 1e-9] + [hi]
- pairs = [(pts[i], pts[i + 1]) for i in range(len(pts) - 1)]
- if a > b:
- pairs = [(p2, p1) for p1, p2 in reversed(pairs)]
- return pairs
diff --git a/src/kernbench/components/builtin/pe_dma.py b/src/kernbench/components/builtin/pe_dma.py
index 857456c..c8ee823 100644
--- a/src/kernbench/components/builtin/pe_dma.py
+++ b/src/kernbench/components/builtin/pe_dma.py
@@ -96,7 +96,7 @@ class PeDmaComponent(PeEngineBase):
request=sub_request, path=path, step=0,
nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
)
- # Send to next hop (path[0] is pe_dma itself, path[1] is xbar)
+ # Send to next hop (path[0] is pe_dma itself, path[1] is router)
if len(path) > 1:
yield self.out_ports[path[1]].put(sub_txn.advance())
# DMA channel released after issue
diff --git a/src/kernbench/components/builtin/xbar.py b/src/kernbench/components/builtin/xbar.py
deleted file mode 100644
index 1872f99..0000000
--- a/src/kernbench/components/builtin/xbar.py
+++ /dev/null
@@ -1,168 +0,0 @@
-"""Position-aware XBAR component.
-
-Models crossbar latency as base_overhead_ns + internal_distance * ns_per_mm,
-where internal_distance is the Manhattan distance between the entry port
-(PE router attachment) and exit port (HBM slice logical position) within
-the crossbar matrix.
-
-PE router positions come from cube_mesh.yaml (via ctx.spec["_mesh"]).
-HBM slice positions are uniformly distributed across the HBM physical width.
-"""
-from __future__ import annotations
-
-from collections.abc import Generator
-from typing import TYPE_CHECKING, Any
-
-import simpy
-
-from kernbench.components.base import ComponentBase
-
-if TYPE_CHECKING:
- from kernbench.components.context import ComponentContext
- from kernbench.topology.types import Node
-
-
-class PositionAwareXbarComponent(ComponentBase):
- """XBAR with position-dependent latency based on PE-to-slice distance.
-
- Latency = base_overhead_ns + |entry_port_x - exit_port_x| * ns_per_mm
-
- Entry/exit port X positions are determined from the transaction path:
- - PE_DMA nodes: router X from cube_mesh.yaml
- - HBM slices: uniformly distributed across HBM physical width
- - Bridge nodes: physical X from topology positions
- - NOC: resolved by scanning path for PE_DMA node
- """
-
- def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
- super().__init__(node, ctx)
- self._base_overhead_ns = float(node.attrs.get("overhead_ns", 0.0))
- self._pe_router_xs: dict[str, float] = {}
- self._slice_xs: dict[str, float] = {}
- self._bridge_xs: dict[str, float] = {}
- self._ns_per_mm: float = 0.0
-
- def start(self, env: simpy.Environment) -> None:
- self._build_position_map()
- super().start(env)
-
- def run(self, env: simpy.Environment, nbytes: int) -> Generator:
- yield env.timeout(self._base_overhead_ns)
-
- # ── Position map construction ─────────────────────────────────
-
- def _build_position_map(self) -> None:
- if not self.ctx or not self.ctx.spec:
- return
- mesh = self.ctx.spec.get("_mesh")
- if not mesh:
- return
-
- self._ns_per_mm = self.ctx.ns_per_mm
- cube_prefix = self.node.id.rsplit(".", 1)[0]
- xbar_name = self.node.id.rsplit(".", 1)[1]
- is_top = xbar_name == "xbar_top"
- xbar_key = "top" if is_top else "bottom"
-
- # PE router X positions from mesh attachments
- routers_list = mesh.get("xbar", {}).get(xbar_key, {}).get("routers", [])
- for router_id in routers_list:
- router_data = mesh["routers"].get(router_id)
- if router_data is None:
- continue
- router_x = router_data["pos_mm"][0]
- for attach in router_data.get("attach", []):
- if attach.endswith(".dma"):
- pe_name = attach.split(".")[0]
- pe_dma_id = f"{cube_prefix}.{pe_name}.pe_dma"
- self._pe_router_xs[pe_dma_id] = router_x
-
- # HBM slice X positions: uniformly distributed across HBM width
- cube_spec = self.ctx.spec.get("cube", {})
- cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
- hbm_w = cube_spec.get("geometry", {}).get("hbm_mm", {}).get("w", 9.0)
- n_slices = cube_spec.get("memory_map", {}).get("hbm_slices_per_cube", 8)
- half = n_slices // 2
- hbm_left = (cube_w - hbm_w) / 2
-
- if is_top:
- slice_range = range(half)
- else:
- slice_range = range(half, n_slices)
-
- n = len(list(slice_range))
- for i, sl in enumerate(slice_range):
- if n > 1:
- x = hbm_left + i * hbm_w / (n - 1)
- else:
- x = cube_w / 2
- self._slice_xs[f"{cube_prefix}.hbm_ctrl.slice{sl}"] = x
-
- # Bridge X positions from topology positions
- for node_id, pos in self.ctx.positions.items():
- if node_id.startswith(cube_prefix + ".bridge.") and pos is not None:
- origin_x = self._cube_origin_x()
- self._bridge_xs[node_id] = pos[0] - origin_x
-
- def _cube_origin_x(self) -> float:
- """Compute absolute X origin of this cube."""
- parts = self.node.id.split(".")
- cube_str = [p for p in parts if p.startswith("cube")][0]
- cube_id = int(cube_str[4:])
- spec = self.ctx.spec
- sip_spec = spec.get("sip", {})
- cube_spec = spec.get("cube", {})
- mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4)
- cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
- seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get(
- "distance_mm_across_seam", 1.0)
- col = cube_id % mesh_w
- return col * (cube_w + seam)
-
- # ── Worker override ───────────────────────────────────────────
-
- def _worker(self, env: simpy.Environment) -> Generator:
- while True:
- txn: Any = yield self._inbox.get()
- env.process(self._position_aware_forward(env, txn))
-
- def _position_aware_forward(
- self, env: simpy.Environment, txn: Any,
- ) -> Generator:
- prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
- next_hop = txn.next_hop
-
- overhead = self._base_overhead_ns
- if prev_hop and next_hop and self._ns_per_mm > 0:
- entry_x = self._get_port_x(prev_hop, txn.path)
- exit_x = self._get_port_x(next_hop, txn.path)
- if entry_x is not None and exit_x is not None:
- overhead = self._base_overhead_ns + abs(entry_x - exit_x) * self._ns_per_mm
-
- yield env.timeout(overhead)
-
- if next_hop:
- yield self.out_ports[next_hop].put(txn.advance())
- else:
- drain = getattr(txn, "drain_ns", 0.0)
- if drain > 0:
- yield env.timeout(drain)
- txn.done.succeed()
-
- def _get_port_x(self, node_id: str, path: list[str]) -> float | None:
- """Resolve the X position of an XBAR port from node context."""
- # Direct lookup: PE DMA
- if node_id in self._pe_router_xs:
- return self._pe_router_xs[node_id]
- # Direct lookup: HBM slice
- if node_id in self._slice_xs:
- return self._slice_xs[node_id]
- # Direct lookup: bridge
- if node_id in self._bridge_xs:
- return self._bridge_xs[node_id]
- # NOC: scan path for PE DMA node
- if "noc" in node_id:
- for p in path:
- if p in self._pe_router_xs:
- return self._pe_router_xs[p]
- return None
diff --git a/src/kernbench/policy/routing/router.py b/src/kernbench/policy/routing/router.py
index 81ed601..ce3d21e 100644
--- a/src/kernbench/policy/routing/router.py
+++ b/src/kernbench/policy/routing/router.py
@@ -81,7 +81,7 @@ class PathRouter:
# Edge kinds excluded from M_CPU DMA adjacency: prevents routing through
# PE-internal pipeline nodes when computing DMA paths.
- _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_xbar"}
+ _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_router"}
_UCIE_KINDS = {"ucie_internal", "ucie_conn_to_router", "router_to_ucie_conn",
"ucie_conn_to_noc", "noc_to_ucie_conn", "ucie_mesh",
@@ -124,9 +124,9 @@ class PathRouter:
return self._run_dijkstra(self._adj_all, m_cpu_id, dst_hbm_id)
def find_memory_path(self, src: str, dst: str) -> list[str]:
- """Direct memory path: pcie_ep → io_noc → cube → xbar → hbm_ctrl.
+ """Direct memory path: pcie_ep → io_noc → cube → router mesh → hbm_ctrl.
- Uses _adj_mcpu_dma which excludes pe_internal and pe_to_xbar edges,
+ Uses _adj_mcpu_dma which excludes pe_internal and pe_to_router edges,
preventing routing through PE pipeline nodes.
"""
return self._run_dijkstra(self._adj_mcpu_dma, src, dst)
diff --git a/src/kernbench/sim_engine/engine.py b/src/kernbench/sim_engine/engine.py
index 298e080..a16ce66 100644
--- a/src/kernbench/sim_engine/engine.py
+++ b/src/kernbench/sim_engine/engine.py
@@ -19,9 +19,9 @@ class GraphEngine:
"""simpy-based discrete-event simulation engine.
Request routing:
- MemoryWrite/Read: pcie_ep → io_noc → cube → xbar → hbm_ctrl (m_cpu bypass)
+ MemoryWrite/Read: pcie_ep → io_noc → cube → router mesh → hbm_ctrl (m_cpu bypass)
KernelLaunch: pcie_ep → io_noc → io_cpu → io_noc → cube → m_cpu → PE
- PeDmaMsg: pe_dma → xbar → hbm_ctrl (direct probe)
+ PeDmaMsg: pe_dma → router mesh → hbm_ctrl (direct probe)
Component implementations are DI-injectable via component_overrides (ADR-0007 D3).
"""
@@ -261,7 +261,7 @@ class GraphEngine:
done.succeed()
def _process_memory_direct(self, key: str, request: Any, done: simpy.Event):
- """Direct memory path: pcie_ep → io_noc → cube → xbar → hbm_ctrl.
+ """Direct memory path: pcie_ep → io_noc → cube → router mesh → hbm_ctrl.
MemoryWrite: data flows forward (nbytes on wires), drain at hbm_ctrl terminal.
MemoryRead: command flows forward (nbytes=0), hbm_ctrl sends data back on
diff --git a/src/kernbench/sim_engine/event_log.py b/src/kernbench/sim_engine/event_log.py
index 5d3c866..b053084 100644
--- a/src/kernbench/sim_engine/event_log.py
+++ b/src/kernbench/sim_engine/event_log.py
@@ -287,7 +287,7 @@ def _generate_probe_d2h(graph, edge_map) -> list[dict]:
def _generate_probe_pe_dma(graph, edge_map) -> list[dict]:
- """PE DMA probes: pe_dma → xbar → HBM."""
+ """PE DMA probes: pe_dma → router mesh → HBM."""
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.policy.routing.router import AddressResolver, PathRouter
diff --git a/src/kernbench/topology/visualizer.py b/src/kernbench/topology/visualizer.py
index fe6ac8c..53c0708 100644
--- a/src/kernbench/topology/visualizer.py
+++ b/src/kernbench/topology/visualizer.py
@@ -385,6 +385,55 @@ def _escape(text: str) -> str:
return text.replace("&", "&").replace("<", "<").replace(">", ">")
+# ── Connector helper ─────────────────────────────────────────────────
+
+
+def _connector_points(
+ rx: float, ry: float, cx: float, cy: float
+) -> str:
+ """Return SVG polyline points (zero-decimal "x,y" pairs) joining (rx, ry) to (cx, cy).
+
+ Straight 2-point line if |dx|+|dy| < 4, if ||dx|-|dy|| < 4 and |dx|+|dy| < 80,
+ or if the per-axis 45° stub offset (half the smaller of |dx|, |dy|) is below 2.
+ Otherwise 4 points: 45° stub, straight run on the dominant axis (ties: horizontal), 45° stub.
+ """
+ dx = cx - rx
+ dy = cy - ry
+ adx, ady = abs(dx), abs(dy)
+
+ # Straight line when the endpoints nearly coincide (|dx|+|dy| < 4), or when the
+ # link is short (< 80 total) and already close to a 45° diagonal (e.g. PE↔router)
+ if adx + ady < 4 or (abs(adx - ady) < 4 and adx + ady < 80):
+ return f"{rx:.0f},{ry:.0f} {cx:.0f},{cy:.0f}"
+
+ sx = 1 if dx >= 0 else -1
+ sy = 1 if dy >= 0 else -1
+
+ if adx >= ady:
+ # Horizontal-dominant: the two 45° stubs absorb dy, the middle run is horizontal
+ stub = ady / 2
+ if stub < 2: # nearly axis-aligned: stubs would be too small, draw straight
+ return f"{rx:.0f},{ry:.0f} {cx:.0f},{cy:.0f}"
+ r45x = rx + sx * stub
+ r45y = ry + sy * stub
+ c45x = cx - sx * stub
+ c45y = cy - sy * stub # equals r45y, so the middle run is horizontal
+ else:
+ # Vertical-dominant: the two 45° stubs absorb dx, the middle run is vertical
+ stub = adx / 2
+ if stub < 2: # nearly axis-aligned: stubs would be too small, draw straight
+ return f"{rx:.0f},{ry:.0f} {cx:.0f},{cy:.0f}"
+ r45x = rx + sx * stub
+ r45y = ry + sy * stub
+ c45x = cx - sx * stub
+ c45y = cy - sy * stub # c45x equals r45x, so the middle run is vertical
+
+ return (
+ f"{rx:.0f},{ry:.0f} {r45x:.0f},{r45y:.0f} "
+ f"{c45x:.0f},{c45y:.0f} {cx:.0f},{cy:.0f}"
+ )
+
+
# ── Cube-specific renderer ──────────────────────────────────────────
@@ -637,55 +686,39 @@ def _render_cube_view_svg(view: ViewGraph, spec: dict) -> str:
f'text-anchor="middle" font-family="monospace" font-size="{font_sz}" '
f'font-weight="bold" fill="{style["text"]}">{_escape(label)}'
)
- # Connector: router ─45°─ straight ─45°─ component
+ # Connector: rule-based (short → 45° line, long → 45°-straight-45°)
sc = style["stroke"]
- d = 12 # 45° stub length (px)
+
+ # Determine start (router edge) and end (component edge) points
+ bxc = bx + blk_w / 2 # component center x
if kind == "mcpu":
- # Router top → 45° NW stub → vertical → 45° into block bottom
- rx2, ry2 = px, py - r_size
- bxc, byc = bx + blk_w / 2, by + blk_h
- parts.append(
- f' '
- )
+ rx0, ry0 = px, py - r_size # router top
+ cx0, cy0 = bxc, by + blk_h # component bottom
elif kind == "sram":
- # Router bottom → 45° SW stub → vertical → 45° into block top
- rx2, ry2 = px, py + r_size
- bxc, byc = bx + blk_w / 2, by
+ rx0, ry0 = px, py + r_size # router bottom
+ cx0, cy0 = bxc, by # component top
+ elif is_top:
+ rx0, ry0 = px, py - r_size # router top
+ cx0, cy0 = bx + blk_w / 2 + offset_x, by + blk_h # component bottom
+ else:
+ rx0, ry0 = px, py + r_size # router bottom
+ cx0, cy0 = bx + blk_w / 2 + offset_x, by # component top
+
+ # PE/M_CPU/SRAM directly above/below router (same X):
+ # single diagonal line from router center to component right edge
+ if abs(cx0 - rx0) < 2 and abs(cy0 - ry0) > 4:
+ cx0 = bx + blk_w - 2
parts.append(
- f' '
+ f' '
)
else:
- # PE: vertical direction
- bxc = bx + blk_w / 2 + offset_x
- if is_top:
- rx2, ry2 = px, py - r_size # router top
- byc = by + blk_h # block bottom
- # 45° stub from router, vertical, 45° into block
- sx = bxc - px # horizontal shift direction
- sd = d if sx >= 0 else -d
- parts.append(
- f' '
- )
- else:
- rx2, ry2 = px, py + r_size # router bottom
- byc = by # block top
- sx = bxc - px
- sd = d if sx >= 0 else -d
- parts.append(
- f' '
- )
+ pts = _connector_points(rx0, ry0, cx0, cy0)
+ parts.append(
+ f' '
+ )
# (PE→HBM BW annotation drawn in the PE→HBM port group section above)
@@ -705,26 +738,13 @@ def _render_cube_view_svg(view: ViewGraph, spec: dict) -> str:
rpx, rpy = mm2px(rx, ry)
tgx, tgy = _pe_hbm_targets[pe_id]
r_edge_y = rpy + r_size if rpy < hbm_y else rpy - r_size
- # 45° stub from router → vertical → 45° into HBM port
- d = 12 # stub length
- sx = tgx - rpx
- sd = d if sx >= 0 else -d
- if rpy < hbm_y:
- parts.append(
- f' '
- )
- else:
- parts.append(
- f' '
- )
+ # Rule-based connector: router → HBM port group
+ pts = _connector_points(rpx, r_edge_y, tgx, tgy)
+ parts.append(
+ f' '
+ )
# BW annotation at midpoint
mx = (rpx + tgx) / 2 + 10
my = (r_edge_y + tgy) / 2
@@ -818,53 +838,27 @@ def _render_cube_view_svg(view: ViewGraph, spec: dict) -> str:
f'{conn}'
)
- # Connector: router ─45°stub─ straight ─45°stub─ UCIe port
+ # Connector: rule-based router → UCIe port
rpx, rpy = mm2px(crx, cry)
- d = 10
if direction == "N":
rx, ry = rpx, rpy - r_size
tx, ty = lx, cy_box + ch
- sx = tx - rx
- sd = d if sx >= 0 else -d
- parts.append(
- f' '
- )
elif direction == "S":
rx, ry = rpx, rpy + r_size
tx, ty = lx, cy_box
- sx = tx - rx
- sd = d if sx >= 0 else -d
- parts.append(
- f' '
- )
elif direction == "W":
rx, ry = rpx - r_size, rpy
tx, ty = cx + cw, cy_box + ch / 2
- sy = ty - ry
- sd = d if sy >= 0 else -d
- parts.append(
- f' '
- )
elif direction == "E":
rx, ry = rpx + r_size, rpy
tx, ty = cx, cy_box + ch / 2
- sy = ty - ry
- sd = d if sy >= 0 else -d
- parts.append(
- f' '
- )
+ else:
+ continue
+ pts = _connector_points(rx, ry, tx, ty)
+ parts.append(
+ f' '
+ )
# ── Legend ──
ly = h_px - 35
diff --git a/tests/test_component_registry.py b/tests/test_component_registry.py
index e2bf9b4..4e344a5 100644
--- a/tests/test_component_registry.py
+++ b/tests/test_component_registry.py
@@ -37,7 +37,7 @@ def _hbm_pa(pe_id: int = 0) -> int:
def _node(impl: str, overhead_ns: float = 0.0) -> Node:
- return Node(id="test", kind="xbar", impl=impl, attrs={"overhead_ns": overhead_ns}, pos_mm=None)
+ return Node(id="test", kind="noc_router", impl=impl, attrs={"overhead_ns": overhead_ns}, pos_mm=None)
# ── 1. unknown impl → error ──────────────────────────────────────────
@@ -55,7 +55,7 @@ def test_registry_unknown_impl_raises_error():
def test_transit_component_yields_overhead_ns():
"""TransitComponent.run() yields exactly node.attrs['overhead_ns'] ns."""
- node = _node("xbar_v1", overhead_ns=3.0)
+ node = _node("forwarding_v1", overhead_ns=3.0)
comp = TransitComponent(node)
env = simpy.Environment()
@@ -119,10 +119,9 @@ def test_engine_component_model_latency():
"""MemoryRead D2H latency for local cube0 (4096B).
Bypass path (m_cpu bypass): pcie_ep → io_noc → conn → io_ucie → cube_ucie
- → conn → noc → xbar_top → hbm_ctrl.slice0
+ → conn → router mesh → hbm_ctrl
- Path goes through xbar_top (overhead_ns=2.0) instead of per-PE xbar.
- Latency must be positive and reasonable.
+ Path goes through router mesh. Latency must be positive and reasonable.
"""
graph = _graph()
engine = GraphEngine(graph)
@@ -134,7 +133,6 @@ def test_engine_component_model_latency():
h = engine.submit(msg)
engine.wait(h)
_, trace = engine.get_completion(h)
- # Verify positive latency; exact value depends on path through xbar_top
assert trace["total_ns"] > 0
diff --git a/tests/test_noc_mesh.py b/tests/test_noc_mesh.py
index 110887b..92e8280 100644
--- a/tests/test_noc_mesh.py
+++ b/tests/test_noc_mesh.py
@@ -1,18 +1,15 @@
-"""Tests for #5+#6 CUBE NOC Router Mesh + Position-Aware XBAR.
-
-Phase 1 verification: all tests FAIL until Phase 2 implements production code.
+"""Tests for CUBE NOC Explicit Router Mesh (ADR-0019).
Key changes verified:
- - Single NOC node per cube with internal router mesh simulation
- - Auto-layout generates cube_mesh.yaml (6x6 grid for n_connections=4)
- - Position-aware XBAR (top/bottom) replaces per-PE xbar chaining
+ - Explicit router nodes per cube from cube_mesh.yaml (6×6 grid)
+ - Auto-layout generates cube_mesh.yaml with PE/UCIe/M_CPU/SRAM attachments
- Mesh file caching with source_hash change detection
- - Path routing: PE_DMA → NOC → XBAR_top/bot → HBM_CTRL
+ - Path routing: PE_DMA → router mesh → HBM_CTRL
-Latency invariant after refactor:
- Local HBM: PE_DMA → Router(overhead) → XBAR → HBM_CTRL
- Cross-row: PE_DMA → Router → mesh traverse → Router → XBAR → bridge → XBAR → HBM_CTRL
- Cross-cube: PE_DMA → Router → mesh → UCIe → ... → mesh → XBAR → HBM_CTRL
+Latency invariant:
+ Local HBM: PE_DMA → Router(overhead) → HBM_CTRL
+ Cross-row: PE_DMA → Router → mesh hops → Router → HBM_CTRL
+ Cross-cube: PE_DMA → Router → mesh → UCIe → ... → mesh → HBM_CTRL
"""
import pytest
diff --git a/tests/test_phase_a_components.py b/tests/test_phase_a_components.py
index 1e1dc2d..466cb73 100644
--- a/tests/test_phase_a_components.py
+++ b/tests/test_phase_a_components.py
@@ -24,7 +24,6 @@ from kernbench.components.builtin import (
IoCpuComponent,
MCpuComponent,
PcieEpComponent,
- PositionAwareXbarComponent,
SramComponent,
TransitComponent,
)
@@ -232,7 +231,6 @@ def test_m_cpu_terminal_no_ctx_completes():
("forwarding_v1", TransitComponent),
("noc_v1", TransitComponent),
("ucie_v1", TransitComponent),
- ("xbar_v1", PositionAwareXbarComponent),
("pcie_ep_v1", PcieEpComponent),
("io_cpu_v1", IoCpuComponent),
("m_cpu_v1", MCpuComponent),
diff --git a/tests/test_probe.py b/tests/test_probe.py
index 9f2597c..59e73fc 100644
--- a/tests/test_probe.py
+++ b/tests/test_probe.py
@@ -1,7 +1,7 @@
"""Tests for H2D writes and PE DMA probe latency invariants.
H2D tests use MemoryWriteMsg (pcie_ep → io_cpu → m_cpu → hbm_ctrl → response).
-PE DMA tests use PeDmaMsg (direct pe_dma → xbar → hbm_ctrl injection).
+PE DMA tests use PeDmaMsg (direct pe_dma → router mesh → hbm_ctrl injection).
"""
from pathlib import Path
@@ -118,7 +118,7 @@ def test_h2d_local_cube_cut_through():
"""H2D to local cube with cut-through should be < 50ns for 4096B.
Full command path: pcie_ep → io_cpu → ucie → noc → m_cpu
- DMA: m_cpu → noc → xbar → hbm_ctrl (drain once at terminal)
+ DMA: m_cpu → router mesh → hbm_ctrl (drain once at terminal)
Plus response path back.
With store-and-forward each hop would serialize; cut-through keeps it low.
"""
@@ -205,7 +205,7 @@ def test_pe_dma_local_bottleneck_hbm():
def test_pe_dma_same_half_bottleneck_hbm():
- """PE DMA pe0→slice1 (same half via xbar_top): bottleneck = HBM effective BW."""
+ """PE DMA pe0→pe1 HBM (same row via router mesh): bottleneck = HBM effective BW."""
bn = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=1)
expected = _hbm_effective_bw()
assert bn == expected, f"Same-half PE DMA bottleneck {bn}, expected {expected}"
diff --git a/tests/test_topology_compile.py b/tests/test_topology_compile.py
index cb2e8fe..c2934ed 100644
--- a/tests/test_topology_compile.py
+++ b/tests/test_topology_compile.py
@@ -158,9 +158,9 @@ def test_pe_dma_to_router():
def test_command_path_m_cpu_router_pe_cpu():
es = _edge_set(_graph())
cp = "sip0.cube0"
- # m_cpu <-> r0c2 (bidirectional command)
- assert (f"{cp}.m_cpu", f"{cp}.r0c2") in es
- assert (f"{cp}.r0c2", f"{cp}.m_cpu") in es
+ # m_cpu <-> r1c2 (bidirectional command)
+ assert (f"{cp}.m_cpu", f"{cp}.r1c2") in es
+ assert (f"{cp}.r1c2", f"{cp}.m_cpu") in es
# router -> pe_cpu for each PE (command kind)
assert (f"{cp}.r0c0", f"{cp}.pe0.pe_cpu") in es
assert (f"{cp}.r5c5", f"{cp}.pe7.pe_cpu") in es
@@ -416,8 +416,8 @@ def test_cube_view_hbm_router():
def test_cube_view_m_cpu_router():
- """Cube view: m_cpu connects to its router r0c2."""
+ """Cube view: m_cpu connects to its router r1c2."""
v = _graph().cube_view
ves = {(e.src, e.dst) for e in v.edges}
- assert ("m_cpu", "r0c2") in ves
- assert ("r0c2", "m_cpu") in ves
+ assert ("m_cpu", "r1c2") in ves
+ assert ("r1c2", "m_cpu") in ves
diff --git a/topology.yaml b/topology.yaml
index 81777db..52ed3ae 100644
--- a/topology.yaml
+++ b/topology.yaml
@@ -55,7 +55,7 @@ cube:
ucie_mm: { size: 2.0 }
pe_layout:
- corners: [NW, NE, SW, SE] # N corners → xbar top row; S corners → xbar bottom row
+ corners: [NW, NE, SW, SE] # N corners → top PE rows; S corners → bottom PE rows
pe_per_corner: 2 # total PEs per cube: 4 * 2 = 8
pe_template:
@@ -97,7 +97,7 @@ cube:
# Physical placement of non-PE components (mm coordinates)
placement:
- m_cpu: { pos_mm: [7.5, 2.0] } # top center area, near UCIe-N
+ m_cpu: { pos_mm: [7.5, 3.0] } # top center, below UCIe-N
sram: { pos_mm: [1.5, 9.0] } # left side, below HBM zone
ucie: