diff --git a/components.yaml b/components.yaml index 8bf0f85..1e22d69 100644 --- a/components.yaml +++ b/components.yaml @@ -28,9 +28,6 @@ components: switch_v1: kernbench.components.builtin.forwarding:TransitComponent noc_v1: kernbench.components.builtin.forwarding:TransitComponent ucie_v1: kernbench.components.builtin.forwarding:TransitComponent - noc_2d_mesh_v1: kernbench.components.builtin.noc:TwoDMeshNocComponent - xbar_v1: kernbench.components.builtin.xbar:PositionAwareXbarComponent - # IO / Host interface pcie_ep_v1: kernbench.components.builtin.pcie_ep:PcieEpComponent io_cpu_v1: kernbench.components.builtin.io_cpu:IoCpuComponent diff --git a/docs/diagrams/cube_view.svg b/docs/diagrams/cube_view.svg index 3743329..df6bac0 100644 --- a/docs/diagrams/cube_view.svg +++ b/docs/diagrams/cube_view.svg @@ -135,17 +135,14 @@ r0c0 PE0 - + r0c1 PE1 - - + + r0c2 - - M_CPU - r0c3 @@ -156,20 +153,23 @@ r1c0 r1c1 - + r1c2 + + M_CPU + r1c3 r1c4 PE2 - + r1c5 PE3 - + r2c0 @@ -182,7 +182,7 @@ r3c0 SRAM - + r3c1 @@ -193,12 +193,12 @@ r4c0 PE4 - + r4c1 PE5 - + r4c2 @@ -219,84 +219,84 @@ r5c4 PE6 - + r5c5 PE7 - - + + 256GB/s - + 256GB/s - + 256GB/s - + 256GB/s - + 256GB/s - + 256GB/s - + 256GB/s - + 256GB/s UCIe-W c0 - + c1 - + c2 - + c3 - + UCIe-N c0 - + c1 - + c2 - + c3 - + UCIe-E c0 - + c1 - + c2 - + c3 - + UCIe-S c0 - + c1 - + c2 - + c3 - + PE Router diff --git a/src/kernbench/cli/probe.py b/src/kernbench/cli/probe.py index c3e1403..1333c8c 100644 --- a/src/kernbench/cli/probe.py +++ b/src/kernbench/cli/probe.py @@ -116,7 +116,7 @@ def _fmt_util(eff: float, bn: float | None) -> str: def _short_name(node_id: str) -> str: - """Shorten node id: keep last 2 segments to avoid ambiguity (xbar.pe0 vs pe0).""" + """Shorten node id: keep last 2 segments to avoid ambiguity (router.pe0 vs pe0).""" parts = node_id.split(".") return ".".join(parts[-2:]) if len(parts) >= 2 else node_id @@ -366,7 +366,7 @@ def run_probe(topology_path: str, case_filter: str | None = None) -> int: # --- PE 
DMA Summary Table --- print() - print(f"=== PE DMA Latency (pe_dma -> xbar -> HBM, data={nbytes}B) ===") + print(f"=== PE DMA Latency (pe_dma -> router -> HBM, data={nbytes}B) ===") print(f" {'Case':<26} {'Target':<28} {'Actual':>8}" f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}" f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}") diff --git a/src/kernbench/cli/report.py b/src/kernbench/cli/report.py index 815501a..e3d619f 100644 --- a/src/kernbench/cli/report.py +++ b/src/kernbench/cli/report.py @@ -137,7 +137,7 @@ def _extract_peaks(spec: dict | None) -> tuple[float, float]: gemm_attrs = comps.get("pe_gemm", {}).get("attrs", {}) peak_tflops = float(gemm_attrs.get("peak_tflops_f16", 0.0)) cube_links = cube.get("links", {}) - hbm_bw = float(cube_links.get("xbar_to_hbm_bw_gbs", 0.0)) + hbm_bw = float(cube_links.get("hbm_to_router_bw_gbs", 0.0)) return peak_tflops, hbm_bw diff --git a/src/kernbench/components/builtin/noc.py b/src/kernbench/components/builtin/noc.py deleted file mode 100644 index 472b7e6..0000000 --- a/src/kernbench/components/builtin/noc.py +++ /dev/null @@ -1,224 +0,0 @@ -from __future__ import annotations - -from collections.abc import Generator -from typing import TYPE_CHECKING, Any - -import simpy - -from kernbench.components.base import ComponentBase - -if TYPE_CHECKING: - from kernbench.components.context import ComponentContext - from kernbench.topology.types import Node - - -class TwoDMeshNocComponent(ComponentBase): - """2D mesh NOC modeled as a single smart node. - - Latency model: - - Traversal latency = Manhattan distance between prev_hop and next_hop - node positions, split into XY segments, traversed with pipeline. - - overhead_ns (from node.attrs) is added once per traversal. - - Contention model: - - Each directed XY segment is a simpy.Resource(capacity=1). 
- - Pipeline: next segment's resource is requested before the current - segment's timeout completes, so a free downstream segment is acquired - immediately (wormhole-style cut-through). - - Two transactions sharing a segment (same row or column band) contend. - - Concurrency: - - _worker spawns an independent SimPy process per transaction, so the - NOC is never serialized at the node level — only at segment resources. - """ - - def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: - super().__init__(node, ctx) - self._env: simpy.Environment | None = None - self._links: dict[tuple, simpy.Resource] = {} - self._x_grid: list[float] = [] - self._y_grid: list[float] = [] - - def start(self, env: simpy.Environment) -> None: - self._env = env - self._build_grid() - super().start(env) - - def run(self, env: simpy.Environment, nbytes: int) -> Generator: - yield env.timeout(0) - - # ── Grid construction ──────────────────────────────────────────── - - def _build_grid(self) -> None: - if not self.ctx: - return - mesh = self.ctx.spec.get("_mesh") if self.ctx.spec else None - if mesh: - self._build_grid_from_mesh(mesh) - else: - self._build_grid_from_positions() - - def _build_grid_from_mesh(self, mesh: dict) -> None: - """Build XY grid from cube_mesh.yaml router positions (authoritative).""" - origin_x, origin_y = self._cube_origin() - xs: set[float] = set() - ys: set[float] = set() - for key, router in mesh.get("routers", {}).items(): - if router is not None: - xs.add(round(origin_x + router["pos_mm"][0], 2)) - ys.add(round(origin_y + router["pos_mm"][1], 2)) - self._x_grid = sorted(xs) - self._y_grid = sorted(ys) - - def _build_grid_from_positions(self) -> None: - """Fallback: infer grid from all node positions in the cube.""" - cube_prefix = self.node.id.rsplit(".", 1)[0] - xs: set[float] = set() - ys: set[float] = set() - for node_id, pos in self.ctx.positions.items(): - if node_id.startswith(cube_prefix + ".") and pos is not None: - 
xs.add(round(pos[0], 2)) - ys.add(round(pos[1], 2)) - self._x_grid = sorted(xs) - self._y_grid = sorted(ys) - - def _cube_origin(self) -> tuple[float, float]: - """Compute absolute origin (top-left) of this cube from cube_id.""" - parts = self.node.id.split(".") - cube_str = [p for p in parts if p.startswith("cube")][0] - cube_id = int(cube_str[4:]) - spec = self.ctx.spec - sip_spec = spec.get("sip", {}) - cube_spec = spec.get("cube", {}) - mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4) - cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0) - cube_h = cube_spec.get("geometry", {}).get("cube_mm", {}).get("h", 14.0) - seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get( - "distance_mm_across_seam", 1.0) - col = cube_id % mesh_w - row = cube_id // mesh_w - return (col * (cube_w + seam), row * (cube_h + seam)) - - def _get_link(self, key: tuple) -> simpy.Resource: - if key not in self._links: - assert self._env is not None - self._links[key] = simpy.Resource(self._env, capacity=1) - return self._links[key] - - # ── Worker ─────────────────────────────────────────────────────── - - def _worker(self, env: simpy.Environment) -> Generator: - while True: - txn: Any = yield self._inbox.get() - env.process(self._route(env, txn)) - - def _route(self, env: simpy.Environment, txn: Any) -> Generator: - prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None - next_hop = txn.next_hop - overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) - - links: list[tuple[tuple, float]] = [] - if prev_hop and next_hop and self.ctx: - src_pos = self.ctx.positions.get(prev_hop) - dst_pos = self.ctx.positions.get(next_hop) - if src_pos and dst_pos: - links = self._xy_links(src_pos, dst_pos) - - if links: - yield from self._traverse(env, links, overhead_ns) - else: - yield env.timeout(overhead_ns) - - if next_hop: - yield self.out_ports[next_hop].put(txn.advance()) - else: - drain = getattr(txn, "drain_ns", 0.0) - if drain > 0: - yield 
env.timeout(drain) - txn.done.succeed() - - # ── XY routing and pipelined link traversal ────────────────────── - - def _traverse( - self, - env: simpy.Environment, - links: list[tuple[tuple, float]], - overhead_ns: float, - ) -> Generator: - """Pipeline: request next segment before current timeout finishes.""" - ns_per_mm = self.ctx.ns_per_mm # type: ignore[union-attr] - - # Acquire first link - first_key, _ = links[0] - current_resource = self._get_link(first_key) - current_req = current_resource.request() - yield current_req - - for i, (_, dist_mm) in enumerate(links): - # Request next link before current timeout (pipeline) - if i + 1 < len(links): - next_key, _ = links[i + 1] - next_resource = self._get_link(next_key) - next_req = next_resource.request() - - yield env.timeout(dist_mm * ns_per_mm + (overhead_ns if i == 0 else 0.0)) - current_resource.release(current_req) - - if i + 1 < len(links): - yield next_req # usually already fulfilled (pipeline) - current_resource = next_resource - current_req = next_req - - def _xy_links( - self, - src: tuple[float, float], - dst: tuple[float, float], - ) -> list[tuple[tuple, float]]: - """XY routing: horizontal segment first, then vertical. - - Returns list of (link_key, dist_mm) pairs, where link_key uniquely - identifies a directed segment shared across concurrent transactions. 
- """ - x0, y0 = src - x1, y1 = dst - links: list[tuple[tuple, float]] = [] - - # Horizontal segment at y≈y0 - if abs(x0 - x1) > 1e-9: - y_band = self._snap(y0, self._y_grid) - for xa, xb in self._segments(x0, x1, self._x_grid): - d = abs(xb - xa) - if d > 1e-9: - lo, hi = (xa, xb) if xa < xb else (xb, xa) - dir_h = "E" if xb > xa else "W" - links.append((("H", round(y_band, 2), round(lo, 2), round(hi, 2), dir_h), d)) - - # Vertical segment at x≈x1 - if abs(y0 - y1) > 1e-9: - x_band = self._snap(x1, self._x_grid) - for ya, yb in self._segments(y0, y1, self._y_grid): - d = abs(yb - ya) - if d > 1e-9: - lo, hi = (ya, yb) if ya < yb else (yb, ya) - dir_v = "S" if yb > ya else "N" - links.append((("V", round(x_band, 2), round(lo, 2), round(hi, 2), dir_v), d)) - - return links - - @staticmethod - def _snap(val: float, grid: list[float]) -> float: - if not grid: - return val - return min(grid, key=lambda g: abs(g - val)) - - @staticmethod - def _segments(a: float, b: float, grid: list[float]) -> list[tuple[float, float]]: - """Consecutive (p_i, p_{i+1}) pairs covering range [a, b] using grid waypoints.""" - if abs(a - b) < 1e-9: - return [] - lo, hi = (a, b) if a < b else (b, a) - pts = [lo] + [g for g in grid if lo + 1e-9 < g < hi - 1e-9] + [hi] - pairs = [(pts[i], pts[i + 1]) for i in range(len(pts) - 1)] - if a > b: - pairs = [(p2, p1) for p1, p2 in reversed(pairs)] - return pairs diff --git a/src/kernbench/components/builtin/pe_dma.py b/src/kernbench/components/builtin/pe_dma.py index 857456c..c8ee823 100644 --- a/src/kernbench/components/builtin/pe_dma.py +++ b/src/kernbench/components/builtin/pe_dma.py @@ -96,7 +96,7 @@ class PeDmaComponent(PeEngineBase): request=sub_request, path=path, step=0, nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns, ) - # Send to next hop (path[0] is pe_dma itself, path[1] is xbar) + # Send to next hop (path[0] is pe_dma itself, path[1] is router) if len(path) > 1: yield self.out_ports[path[1]].put(sub_txn.advance()) # DMA channel 
released after issue diff --git a/src/kernbench/components/builtin/xbar.py b/src/kernbench/components/builtin/xbar.py deleted file mode 100644 index 1872f99..0000000 --- a/src/kernbench/components/builtin/xbar.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Position-aware XBAR component. - -Models crossbar latency as base_overhead_ns + internal_distance * ns_per_mm, -where internal_distance is the Manhattan distance between the entry port -(PE router attachment) and exit port (HBM slice logical position) within -the crossbar matrix. - -PE router positions come from cube_mesh.yaml (via ctx.spec["_mesh"]). -HBM slice positions are uniformly distributed across the HBM physical width. -""" -from __future__ import annotations - -from collections.abc import Generator -from typing import TYPE_CHECKING, Any - -import simpy - -from kernbench.components.base import ComponentBase - -if TYPE_CHECKING: - from kernbench.components.context import ComponentContext - from kernbench.topology.types import Node - - -class PositionAwareXbarComponent(ComponentBase): - """XBAR with position-dependent latency based on PE-to-slice distance. 
- - Latency = base_overhead_ns + |entry_port_x - exit_port_x| * ns_per_mm - - Entry/exit port X positions are determined from the transaction path: - - PE_DMA nodes: router X from cube_mesh.yaml - - HBM slices: uniformly distributed across HBM physical width - - Bridge nodes: physical X from topology positions - - NOC: resolved by scanning path for PE_DMA node - """ - - def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: - super().__init__(node, ctx) - self._base_overhead_ns = float(node.attrs.get("overhead_ns", 0.0)) - self._pe_router_xs: dict[str, float] = {} - self._slice_xs: dict[str, float] = {} - self._bridge_xs: dict[str, float] = {} - self._ns_per_mm: float = 0.0 - - def start(self, env: simpy.Environment) -> None: - self._build_position_map() - super().start(env) - - def run(self, env: simpy.Environment, nbytes: int) -> Generator: - yield env.timeout(self._base_overhead_ns) - - # ── Position map construction ───────────────────────────────── - - def _build_position_map(self) -> None: - if not self.ctx or not self.ctx.spec: - return - mesh = self.ctx.spec.get("_mesh") - if not mesh: - return - - self._ns_per_mm = self.ctx.ns_per_mm - cube_prefix = self.node.id.rsplit(".", 1)[0] - xbar_name = self.node.id.rsplit(".", 1)[1] - is_top = xbar_name == "xbar_top" - xbar_key = "top" if is_top else "bottom" - - # PE router X positions from mesh attachments - routers_list = mesh.get("xbar", {}).get(xbar_key, {}).get("routers", []) - for router_id in routers_list: - router_data = mesh["routers"].get(router_id) - if router_data is None: - continue - router_x = router_data["pos_mm"][0] - for attach in router_data.get("attach", []): - if attach.endswith(".dma"): - pe_name = attach.split(".")[0] - pe_dma_id = f"{cube_prefix}.{pe_name}.pe_dma" - self._pe_router_xs[pe_dma_id] = router_x - - # HBM slice X positions: uniformly distributed across HBM width - cube_spec = self.ctx.spec.get("cube", {}) - cube_w = cube_spec.get("geometry", 
{}).get("cube_mm", {}).get("w", 17.0) - hbm_w = cube_spec.get("geometry", {}).get("hbm_mm", {}).get("w", 9.0) - n_slices = cube_spec.get("memory_map", {}).get("hbm_slices_per_cube", 8) - half = n_slices // 2 - hbm_left = (cube_w - hbm_w) / 2 - - if is_top: - slice_range = range(half) - else: - slice_range = range(half, n_slices) - - n = len(list(slice_range)) - for i, sl in enumerate(slice_range): - if n > 1: - x = hbm_left + i * hbm_w / (n - 1) - else: - x = cube_w / 2 - self._slice_xs[f"{cube_prefix}.hbm_ctrl.slice{sl}"] = x - - # Bridge X positions from topology positions - for node_id, pos in self.ctx.positions.items(): - if node_id.startswith(cube_prefix + ".bridge.") and pos is not None: - origin_x = self._cube_origin_x() - self._bridge_xs[node_id] = pos[0] - origin_x - - def _cube_origin_x(self) -> float: - """Compute absolute X origin of this cube.""" - parts = self.node.id.split(".") - cube_str = [p for p in parts if p.startswith("cube")][0] - cube_id = int(cube_str[4:]) - spec = self.ctx.spec - sip_spec = spec.get("sip", {}) - cube_spec = spec.get("cube", {}) - mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4) - cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0) - seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get( - "distance_mm_across_seam", 1.0) - col = cube_id % mesh_w - return col * (cube_w + seam) - - # ── Worker override ─────────────────────────────────────────── - - def _worker(self, env: simpy.Environment) -> Generator: - while True: - txn: Any = yield self._inbox.get() - env.process(self._position_aware_forward(env, txn)) - - def _position_aware_forward( - self, env: simpy.Environment, txn: Any, - ) -> Generator: - prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None - next_hop = txn.next_hop - - overhead = self._base_overhead_ns - if prev_hop and next_hop and self._ns_per_mm > 0: - entry_x = self._get_port_x(prev_hop, txn.path) - exit_x = self._get_port_x(next_hop, txn.path) - if entry_x is not None 
and exit_x is not None: - overhead = self._base_overhead_ns + abs(entry_x - exit_x) * self._ns_per_mm - - yield env.timeout(overhead) - - if next_hop: - yield self.out_ports[next_hop].put(txn.advance()) - else: - drain = getattr(txn, "drain_ns", 0.0) - if drain > 0: - yield env.timeout(drain) - txn.done.succeed() - - def _get_port_x(self, node_id: str, path: list[str]) -> float | None: - """Resolve the X position of an XBAR port from node context.""" - # Direct lookup: PE DMA - if node_id in self._pe_router_xs: - return self._pe_router_xs[node_id] - # Direct lookup: HBM slice - if node_id in self._slice_xs: - return self._slice_xs[node_id] - # Direct lookup: bridge - if node_id in self._bridge_xs: - return self._bridge_xs[node_id] - # NOC: scan path for PE DMA node - if "noc" in node_id: - for p in path: - if p in self._pe_router_xs: - return self._pe_router_xs[p] - return None diff --git a/src/kernbench/policy/routing/router.py b/src/kernbench/policy/routing/router.py index 81ed601..ce3d21e 100644 --- a/src/kernbench/policy/routing/router.py +++ b/src/kernbench/policy/routing/router.py @@ -81,7 +81,7 @@ class PathRouter: # Edge kinds excluded from M_CPU DMA adjacency: prevents routing through # PE-internal pipeline nodes when computing DMA paths. - _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_xbar"} + _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_router"} _UCIE_KINDS = {"ucie_internal", "ucie_conn_to_router", "router_to_ucie_conn", "ucie_conn_to_noc", "noc_to_ucie_conn", "ucie_mesh", @@ -124,9 +124,9 @@ class PathRouter: return self._run_dijkstra(self._adj_all, m_cpu_id, dst_hbm_id) def find_memory_path(self, src: str, dst: str) -> list[str]: - """Direct memory path: pcie_ep → io_noc → cube → xbar → hbm_ctrl. + """Direct memory path: pcie_ep → io_noc → cube → router mesh → hbm_ctrl. - Uses _adj_mcpu_dma which excludes pe_internal and pe_to_xbar edges, + Uses _adj_mcpu_dma which excludes pe_internal and pe_to_router edges, preventing routing through PE pipeline nodes. 
""" return self._run_dijkstra(self._adj_mcpu_dma, src, dst) diff --git a/src/kernbench/sim_engine/engine.py b/src/kernbench/sim_engine/engine.py index 298e080..a16ce66 100644 --- a/src/kernbench/sim_engine/engine.py +++ b/src/kernbench/sim_engine/engine.py @@ -19,9 +19,9 @@ class GraphEngine: """simpy-based discrete-event simulation engine. Request routing: - MemoryWrite/Read: pcie_ep → io_noc → cube → xbar → hbm_ctrl (m_cpu bypass) + MemoryWrite/Read: pcie_ep → io_noc → cube → router mesh → hbm_ctrl (m_cpu bypass) KernelLaunch: pcie_ep → io_noc → io_cpu → io_noc → cube → m_cpu → PE - PeDmaMsg: pe_dma → xbar → hbm_ctrl (direct probe) + PeDmaMsg: pe_dma → router mesh → hbm_ctrl (direct probe) Component implementations are DI-injectable via component_overrides (ADR-0007 D3). """ @@ -261,7 +261,7 @@ class GraphEngine: done.succeed() def _process_memory_direct(self, key: str, request: Any, done: simpy.Event): - """Direct memory path: pcie_ep → io_noc → cube → xbar → hbm_ctrl. + """Direct memory path: pcie_ep → io_noc → cube → router mesh → hbm_ctrl. MemoryWrite: data flows forward (nbytes on wires), drain at hbm_ctrl terminal. 
MemoryRead: command flows forward (nbytes=0), hbm_ctrl sends data back on diff --git a/src/kernbench/sim_engine/event_log.py b/src/kernbench/sim_engine/event_log.py index 5d3c866..b053084 100644 --- a/src/kernbench/sim_engine/event_log.py +++ b/src/kernbench/sim_engine/event_log.py @@ -287,7 +287,7 @@ def _generate_probe_d2h(graph, edge_map) -> list[dict]: def _generate_probe_pe_dma(graph, edge_map) -> list[dict]: - """PE DMA probes: pe_dma → xbar → HBM.""" + """PE DMA probes: pe_dma → router mesh → HBM.""" from kernbench.policy.address.phyaddr import PhysAddr from kernbench.policy.routing.router import AddressResolver, PathRouter diff --git a/src/kernbench/topology/visualizer.py b/src/kernbench/topology/visualizer.py index fe6ac8c..53c0708 100644 --- a/src/kernbench/topology/visualizer.py +++ b/src/kernbench/topology/visualizer.py @@ -385,6 +385,55 @@ def _escape(text: str) -> str: return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;") +# ── Connector helper ───────────────────────────────────────────────── + + +def _connector_points( + rx: float, ry: float, cx: float, cy: float +) -> str: + """Return SVG polyline points for a rule-based connector. + + Horizontal-dominant (|dx| >= |dy|): 45° → horizontal straight → 45°. + Vertical-dominant (|dy| > |dx|): 45° → vertical straight → 45°. + Near-equal or tiny distance: single straight line. + """ + dx = cx - rx + dy = cy - ry + adx, ady = abs(dx), abs(dy) + + # Trivial distance → single line + # Near-45° diagonal for short distances only (e.g.
PE↔router) + if adx + ady < 4 or (abs(adx - ady) < 4 and adx + ady < 80): + return f"{rx:.0f},{ry:.0f} {cx:.0f},{cy:.0f}" + + sx = 1 if dx >= 0 else -1 + sy = 1 if dy >= 0 else -1 + + if adx >= ady: + # Horizontal-dominant: stubs handle vertical, straight is horizontal + stub = ady / 2 + if stub < 2: + return f"{rx:.0f},{ry:.0f} {cx:.0f},{cy:.0f}" + r45x = rx + sx * stub + r45y = ry + sy * stub + c45x = cx - sx * stub + c45y = cy - sy * stub # r45y == c45y (horizontal) + else: + # Vertical-dominant: stubs handle horizontal, straight is vertical + stub = adx / 2 + if stub < 2: + return f"{rx:.0f},{ry:.0f} {cx:.0f},{cy:.0f}" + r45x = rx + sx * stub + r45y = ry + sy * stub + c45x = cx - sx * stub + c45y = cy - sy * stub # r45x == c45x (vertical) + + return ( + f"{rx:.0f},{ry:.0f} {r45x:.0f},{r45y:.0f} " + f"{c45x:.0f},{c45y:.0f} {cx:.0f},{cy:.0f}" + ) + + # ── Cube-specific renderer ────────────────────────────────────────── @@ -637,55 +686,39 @@ def _render_cube_view_svg(view: ViewGraph, spec: dict) -> str: f'text-anchor="middle" font-family="monospace" font-size="{font_sz}" ' f'font-weight="bold" fill="{style["text"]}">{_escape(label)}' ) - # Connector: router ─45°─ straight ─45°─ component + # Connector: rule-based (short → 45° line, long → 45°-straight-45°) sc = style["stroke"] - d = 12 # 45° stub length (px) + + # Determine start (router edge) and end (component edge) points + bxc = bx + blk_w / 2 # component center x if kind == "mcpu": - # Router top → 45° NW stub → vertical → 45° into block bottom - rx2, ry2 = px, py - r_size - bxc, byc = bx + blk_w / 2, by + blk_h - parts.append( - f' ' - ) + rx0, ry0 = px, py - r_size # router top + cx0, cy0 = bxc, by + blk_h # component bottom elif kind == "sram": - # Router bottom → 45° SW stub → vertical → 45° into block top - rx2, ry2 = px, py + r_size - bxc, byc = bx + blk_w / 2, by + rx0, ry0 = px, py + r_size # router bottom + cx0, cy0 = bxc, by # component top + elif is_top: + rx0, ry0 = px, py - r_size # router top + 
cx0, cy0 = bx + blk_w / 2 + offset_x, by + blk_h # component bottom + else: + rx0, ry0 = px, py + r_size # router bottom + cx0, cy0 = bx + blk_w / 2 + offset_x, by # component top + + # PE/M_CPU/SRAM directly above/below router (same X): + # single diagonal line from router center to component right edge + if abs(cx0 - rx0) < 2 and abs(cy0 - ry0) > 4: + cx0 = bx + blk_w - 2 parts.append( - f' ' + f' ' ) else: - # PE: vertical direction - bxc = bx + blk_w / 2 + offset_x - if is_top: - rx2, ry2 = px, py - r_size # router top - byc = by + blk_h # block bottom - # 45° stub from router, vertical, 45° into block - sx = bxc - px # horizontal shift direction - sd = d if sx >= 0 else -d - parts.append( - f' ' - ) - else: - rx2, ry2 = px, py + r_size # router bottom - byc = by # block top - sx = bxc - px - sd = d if sx >= 0 else -d - parts.append( - f' ' - ) + pts = _connector_points(rx0, ry0, cx0, cy0) + parts.append( + f' ' + ) # (PE→HBM BW annotation drawn in the PE→HBM port group section above) @@ -705,26 +738,13 @@ def _render_cube_view_svg(view: ViewGraph, spec: dict) -> str: rpx, rpy = mm2px(rx, ry) tgx, tgy = _pe_hbm_targets[pe_id] r_edge_y = rpy + r_size if rpy < hbm_y else rpy - r_size - # 45° stub from router → vertical → 45° into HBM port - d = 12 # stub length - sx = tgx - rpx - sd = d if sx >= 0 else -d - if rpy < hbm_y: - parts.append( - f' ' - ) - else: - parts.append( - f' ' - ) + # Rule-based connector: router → HBM port group + pts = _connector_points(rpx, r_edge_y, tgx, tgy) + parts.append( + f' ' + ) # BW annotation at midpoint mx = (rpx + tgx) / 2 + 10 my = (r_edge_y + tgy) / 2 @@ -818,53 +838,27 @@ def _render_cube_view_svg(view: ViewGraph, spec: dict) -> str: f'{conn}' ) - # Connector: router ─45°stub─ straight ─45°stub─ UCIe port + # Connector: rule-based router → UCIe port rpx, rpy = mm2px(crx, cry) - d = 10 if direction == "N": rx, ry = rpx, rpy - r_size tx, ty = lx, cy_box + ch - sx = tx - rx - sd = d if sx >= 0 else -d - parts.append( - f' ' - ) 
elif direction == "S": rx, ry = rpx, rpy + r_size tx, ty = lx, cy_box - sx = tx - rx - sd = d if sx >= 0 else -d - parts.append( - f' ' - ) elif direction == "W": rx, ry = rpx - r_size, rpy tx, ty = cx + cw, cy_box + ch / 2 - sy = ty - ry - sd = d if sy >= 0 else -d - parts.append( - f' ' - ) elif direction == "E": rx, ry = rpx + r_size, rpy tx, ty = cx, cy_box + ch / 2 - sy = ty - ry - sd = d if sy >= 0 else -d - parts.append( - f' ' - ) + else: + continue + pts = _connector_points(rx, ry, tx, ty) + parts.append( + f' ' + ) # ── Legend ── ly = h_px - 35 diff --git a/tests/test_component_registry.py b/tests/test_component_registry.py index e2bf9b4..4e344a5 100644 --- a/tests/test_component_registry.py +++ b/tests/test_component_registry.py @@ -37,7 +37,7 @@ def _hbm_pa(pe_id: int = 0) -> int: def _node(impl: str, overhead_ns: float = 0.0) -> Node: - return Node(id="test", kind="xbar", impl=impl, attrs={"overhead_ns": overhead_ns}, pos_mm=None) + return Node(id="test", kind="noc_router", impl=impl, attrs={"overhead_ns": overhead_ns}, pos_mm=None) # ── 1. unknown impl → error ────────────────────────────────────────── @@ -55,7 +55,7 @@ def test_registry_unknown_impl_raises_error(): def test_transit_component_yields_overhead_ns(): """TransitComponent.run() yields exactly node.attrs['overhead_ns'] ns.""" - node = _node("xbar_v1", overhead_ns=3.0) + node = _node("forwarding_v1", overhead_ns=3.0) comp = TransitComponent(node) env = simpy.Environment() @@ -119,10 +119,9 @@ def test_engine_component_model_latency(): """MemoryRead D2H latency for local cube0 (4096B). Bypass path (m_cpu bypass): pcie_ep → io_noc → conn → io_ucie → cube_ucie - → conn → noc → xbar_top → hbm_ctrl.slice0 + → conn → router mesh → hbm_ctrl - Path goes through xbar_top (overhead_ns=2.0) instead of per-PE xbar. - Latency must be positive and reasonable. + Path goes through router mesh. Latency must be positive and reasonable. 
""" graph = _graph() engine = GraphEngine(graph) @@ -134,7 +133,6 @@ def test_engine_component_model_latency(): h = engine.submit(msg) engine.wait(h) _, trace = engine.get_completion(h) - # Verify positive latency; exact value depends on path through xbar_top assert trace["total_ns"] > 0 diff --git a/tests/test_noc_mesh.py b/tests/test_noc_mesh.py index 110887b..92e8280 100644 --- a/tests/test_noc_mesh.py +++ b/tests/test_noc_mesh.py @@ -1,18 +1,15 @@ -"""Tests for #5+#6 CUBE NOC Router Mesh + Position-Aware XBAR. - -Phase 1 verification: all tests FAIL until Phase 2 implements production code. +"""Tests for CUBE NOC Explicit Router Mesh (ADR-0019). Key changes verified: - - Single NOC node per cube with internal router mesh simulation - - Auto-layout generates cube_mesh.yaml (6x6 grid for n_connections=4) - - Position-aware XBAR (top/bottom) replaces per-PE xbar chaining + - Explicit router nodes per cube from cube_mesh.yaml (6×6 grid) + - Auto-layout generates cube_mesh.yaml with PE/UCIe/M_CPU/SRAM attachments - Mesh file caching with source_hash change detection - - Path routing: PE_DMA → NOC → XBAR_top/bot → HBM_CTRL + - Path routing: PE_DMA → router mesh → HBM_CTRL -Latency invariant after refactor: - Local HBM: PE_DMA → Router(overhead) → XBAR → HBM_CTRL - Cross-row: PE_DMA → Router → mesh traverse → Router → XBAR → bridge → XBAR → HBM_CTRL - Cross-cube: PE_DMA → Router → mesh → UCIe → ... → mesh → XBAR → HBM_CTRL +Latency invariant: + Local HBM: PE_DMA → Router(overhead) → HBM_CTRL + Cross-row: PE_DMA → Router → mesh hops → Router → HBM_CTRL + Cross-cube: PE_DMA → Router → mesh → UCIe → ... 
→ mesh → HBM_CTRL """ import pytest diff --git a/tests/test_phase_a_components.py b/tests/test_phase_a_components.py index 1e1dc2d..466cb73 100644 --- a/tests/test_phase_a_components.py +++ b/tests/test_phase_a_components.py @@ -24,7 +24,6 @@ from kernbench.components.builtin import ( IoCpuComponent, MCpuComponent, PcieEpComponent, - PositionAwareXbarComponent, SramComponent, TransitComponent, ) @@ -232,7 +231,6 @@ def test_m_cpu_terminal_no_ctx_completes(): ("forwarding_v1", TransitComponent), ("noc_v1", TransitComponent), ("ucie_v1", TransitComponent), - ("xbar_v1", PositionAwareXbarComponent), ("pcie_ep_v1", PcieEpComponent), ("io_cpu_v1", IoCpuComponent), ("m_cpu_v1", MCpuComponent), diff --git a/tests/test_probe.py b/tests/test_probe.py index 9f2597c..59e73fc 100644 --- a/tests/test_probe.py +++ b/tests/test_probe.py @@ -1,7 +1,7 @@ """Tests for H2D writes and PE DMA probe latency invariants. H2D tests use MemoryWriteMsg (pcie_ep → io_cpu → m_cpu → hbm_ctrl → response). -PE DMA tests use PeDmaMsg (direct pe_dma → xbar → hbm_ctrl injection). +PE DMA tests use PeDmaMsg (direct pe_dma → router mesh → hbm_ctrl injection). """ from pathlib import Path @@ -118,7 +118,7 @@ def test_h2d_local_cube_cut_through(): """H2D to local cube with cut-through should be < 50ns for 4096B. Full command path: pcie_ep → io_cpu → ucie → noc → m_cpu - DMA: m_cpu → noc → xbar → hbm_ctrl (drain once at terminal) + DMA: m_cpu → router mesh → hbm_ctrl (drain once at terminal) Plus response path back. With store-and-forward each hop would serialize; cut-through keeps it low. 
""" @@ -205,7 +205,7 @@ def test_pe_dma_local_bottleneck_hbm(): def test_pe_dma_same_half_bottleneck_hbm(): - """PE DMA pe0→slice1 (same half via xbar_top): bottleneck = HBM effective BW.""" + """PE DMA pe0→pe1 HBM (same row via router mesh): bottleneck = HBM effective BW.""" bn = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=1) expected = _hbm_effective_bw() assert bn == expected, f"Same-half PE DMA bottleneck {bn}, expected {expected}" diff --git a/tests/test_topology_compile.py b/tests/test_topology_compile.py index cb2e8fe..c2934ed 100644 --- a/tests/test_topology_compile.py +++ b/tests/test_topology_compile.py @@ -158,9 +158,9 @@ def test_pe_dma_to_router(): def test_command_path_m_cpu_router_pe_cpu(): es = _edge_set(_graph()) cp = "sip0.cube0" - # m_cpu <-> r0c2 (bidirectional command) - assert (f"{cp}.m_cpu", f"{cp}.r0c2") in es - assert (f"{cp}.r0c2", f"{cp}.m_cpu") in es + # m_cpu <-> r1c2 (bidirectional command) + assert (f"{cp}.m_cpu", f"{cp}.r1c2") in es + assert (f"{cp}.r1c2", f"{cp}.m_cpu") in es # router -> pe_cpu for each PE (command kind) assert (f"{cp}.r0c0", f"{cp}.pe0.pe_cpu") in es assert (f"{cp}.r5c5", f"{cp}.pe7.pe_cpu") in es @@ -416,8 +416,8 @@ def test_cube_view_hbm_router(): def test_cube_view_m_cpu_router(): - """Cube view: m_cpu connects to its router r0c2.""" + """Cube view: m_cpu connects to its router r1c2.""" v = _graph().cube_view ves = {(e.src, e.dst) for e in v.edges} - assert ("m_cpu", "r0c2") in ves - assert ("r0c2", "m_cpu") in ves + assert ("m_cpu", "r1c2") in ves + assert ("r1c2", "m_cpu") in ves diff --git a/topology.yaml b/topology.yaml index 81777db..52ed3ae 100644 --- a/topology.yaml +++ b/topology.yaml @@ -55,7 +55,7 @@ cube: ucie_mm: { size: 2.0 } pe_layout: - corners: [NW, NE, SW, SE] # N corners → xbar top row; S corners → xbar bottom row + corners: [NW, NE, SW, SE] # N corners → top PE rows; S corners → bottom PE rows pe_per_corner: 2 # total PEs per cube: 4 * 2 = 8 pe_template: @@ -97,7 +97,7 @@ cube: # Physical 
placement of non-PE components (mm coordinates) placement: - m_cpu: { pos_mm: [7.5, 2.0] } # top center area, near UCIe-N + m_cpu: { pos_mm: [7.5, 3.0] } # top center, below UCIe-N sram: { pos_mm: [1.5, 9.0] } # left side, below HBM zone ucie: