Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep

- Probe CLI: restructured output (tables first, routes below), per-hop timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 01:16:18 -07:00
parent 6f43807900
commit d75da439c6
24 changed files with 3456 additions and 501 deletions
@@ -10,7 +10,7 @@ from pathlib import Path

 from kernbench.policy.address.phyaddr import PhysAddr
 from kernbench.policy.routing.router import AddressResolver, PathRouter
-from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
+from kernbench.runtime_api.kernel import MemoryReadMsg, MemoryWriteMsg, PeDmaMsg
 from kernbench.sim_engine.engine import GraphEngine
 from kernbench.topology.builder import load_topology
 from kernbench.topology.types import TopologyGraph
@@ -54,6 +54,46 @@ def _formula_breakdown(
    return wire_ns, overhead_ns, drain_ns, wire_ns + overhead_ns + drain_ns


+def _hop_timestamps(
+    path: list[str], nbytes: int, edge_map: dict, graph: TopologyGraph,
+) -> list[tuple[str, float, str]]:
+    """Return per-hop timestamps: [(node_short, cumulative_ns, annotation), ...].
+
+    Annotations mark bottleneck edges and significant overhead nodes.
+    """
+    ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
+    # Find bottleneck BW for annotation
+    bws = [e.bw_gbs for i in range(len(path) - 1)
+           if (e := edge_map.get((path[i], path[i + 1]))) and e.bw_gbs]
+    bn_bw = min(bws) if bws else None
+
+    cumulative = 0.0
+    result: list[tuple[str, float, str]] = []
+    result.append((_short_name(path[0]), 0.0, ""))
+
+    for i in range(len(path) - 1):
+        e = edge_map.get((path[i], path[i + 1]))
+        ann = ""
+        if e:
+            cumulative += e.distance_mm * ns_per_mm
+            if bn_bw is not None and e.bw_gbs and e.bw_gbs == bn_bw:
+                ann = f"<BN:{e.bw_gbs:.0f}GB/s>"
+        node = graph.nodes.get(path[i + 1])
+        if node:
+            ovhd = float(node.attrs.get("overhead_ns", 0.0))
+            cumulative += ovhd
+            if ovhd > 0 and not ann:
+                ann = f"+{ovhd:.1f}ns"
+        result.append((_short_name(path[i + 1]), cumulative, ann))
+
+    # Add drain at terminal
+    if bn_bw and nbytes > 0:
+        cumulative += nbytes / bn_bw
+        result[-1] = (result[-1][0], cumulative, result[-1][2] + f" drain:{nbytes/bn_bw:.1f}ns")
+
+    return result
+
+
 def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
    """Per-request bottleneck: single request uses one connection."""
    bws: list[float] = []
@@ -85,6 +125,41 @@ def _short_path(path: list[str]) -> str:
    return " -> ".join(_short_name(n) for n in path)


+def _print_hop_trace(timestamps: list[tuple[str, float, str]], indent: str = "    ") -> None:
+    """Print per-hop timestamp trace."""
+    for node, t_ns, ann in timestamps:
+        ann_str = f"  {ann}" if ann else ""
+        print(f"{indent}{t_ns:>8.2f}ns  {node}{ann_str}")
+
+
+SWEEP_SIZES = [4096, 16384, 65536, 262144, 1048576]
+SWEEP_LABELS = ["4KB", "16KB", "64KB", "256KB", "1MB"]
+
+
+def _sweep_util(ovhd_ns: float, wire_ns: float, bn_bw: float | None, sizes: list[int] = SWEEP_SIZES) -> list[float]:
+    """Compute utilization % for each data size using formula model."""
+    if bn_bw is None or bn_bw <= 0:
+        return [0.0] * len(sizes)
+    result = []
+    for nb in sizes:
+        drain = nb / bn_bw
+        total = ovhd_ns + wire_ns + drain
+        eff = nb / total if total > 0 else 0.0
+        result.append(eff / bn_bw * 100)
+    return result
+
+
+def _print_sweep_table(case_names: list[str], sweep_data: list[list[float]]) -> None:
+    """Print compact BW saturation table."""
+    hdr = f"  {'Case':<26}" + "".join(f" {l:>7}" for l in SWEEP_LABELS)
+    print(f"\n  BW Saturation (Util% by data size):")
+    print(hdr)
+    print("  " + "-" * (26 + 8 * len(SWEEP_LABELS)))
+    for name, utils in zip(case_names, sweep_data):
+        cols = "".join(f" {u:>6.1f}%" for u in utils)
+        print(f"  {name:<26}{cols}")
+
+
 # -- Probe runner -----------------------------------------------------


@@ -96,25 +171,18 @@ def run_probe(topology_path: str, case_filter: str | None = None) -> int:
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

-    nbytes = 4096
+    nbytes = 32768
    show_all = case_filter is None or case_filter == "all"

-    # === H2D Write ===
+    # === Collect H2D results ===
    h2d_cases = [
        ("h2d-1hop", 0, 1),
        ("h2d-2hop", 4, 2),
        ("h2d-3hop", 8, 3),
        ("h2d-4hop", 12, 4),
    ]
-    h2d_results: list[tuple[str, int, float, float, float | None]] = []
-    h2d_paths: list[tuple[str, list[str], list[str], list[str]]] = []
-
-    print()
-    print("=== H2D Write Latency (IO->HBM, varying hop count) ===")
-    print(f"  {'Case':<14} {'Target':<16} {'Hops':>4}   {'Actual':>8}"
-          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
-          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
-    print("  " + "-" * 115)
+    h2d_results: list[tuple[str, int, float, float, float | None, float, float, float, float, float]] = []
+    h2d_route_data: list[tuple[str, list[str], list[str], list[str], list[str]]] = []

    for name, cube, hops in h2d_cases:
        if not show_all and case_filter != name:
@@ -144,52 +212,67 @@ def run_probe(topology_path: str, case_filter: str | None = None) -> int:
        full_path = leg1 + leg2[1:] + leg3[1:]
        bn_bw = _bottleneck_bw(full_path, edge_map)

-        # Forward path breakdown only (response path is implicit in actual_ns)
        fwd_path = leg1 + leg2[1:] + leg3[1:]
        wire, ovhd, drain, formula = _formula_breakdown(fwd_path, nbytes, edge_map, graph)

        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0

-        h2d_results.append((name, hops, total_ns, eff_bw, bn_bw))
-        h2d_paths.append((name, leg1, leg2, leg3))
-        print(f"  {name:<14} cube{cube}.pe0{'':<8} {hops:>4}   {total_ns:>8.2f}"
-              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
-              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")
+        h2d_results.append((name, hops, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct))
+        h2d_route_data.append((name, leg1, leg2, leg3, fwd_path))

-    if len(h2d_results) >= 2:
-        lats = [r[2] for r in h2d_results]
-        mono = all(lats[i] < lats[i + 1] for i in range(len(lats) - 1))
-        sym = "[v]" if mono else "[x]"
-        print(f"  {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")
-
-    if h2d_paths:
-        print()
-        print("  Route Details:")
-        print(f"  {'Case':<14} {'Leg':>4}  Path")
-        print("  " + "-" * 80)
-        for name, leg1, leg2, leg3 in h2d_paths:
-            print(f"  {name:<14} {'L1':>4}  {_short_path(leg1)}")
-            print(f"  {'':<14} {'L2':>4}  {_short_path(leg2)}")
-            print(f"  {'':<14} {'L3':>4}  {_short_path(leg3)}")
-
-    # === PE DMA → HBM (direct PE-level injection) ===
-    # (name, sip, src_cube, src_pe, dst_cube, dst_pe)
-    pe_cases = [
-        ("pe-local-hbm", 0, 0, 0, 0, 0),        # pe0 → slice0 (local, 256 GB/s)
-        ("pe-same-half-hbm", 0, 0, 0, 0, 1),     # pe0 → slice1 (xbar chain, 128 GB/s)
-        ("pe-cross-half-hbm", 0, 0, 0, 0, 4),    # pe0 → slice4 (xbar chain, 128 GB/s)
-        ("pe-cross-cube-hbm", 0, 0, 0, 1, 0),    # cube0.pe0 → cube1.slice0 (NOC, 128 GB/s)
+    # === Collect D2H Read results ===
+    d2h_cases = [
+        ("d2h-1hop", 0, 1),
+        ("d2h-2hop", 4, 2),
+        ("d2h-3hop", 8, 3),
+        ("d2h-4hop", 12, 4),
    ]
-    pe_results: list[tuple[str, float, float, float | None]] = []
-    pe_paths: list[tuple[str, list[str]]] = []
+    d2h_results: list[tuple[str, int, float, float, float | None, float, float, float, float, float]] = []
+    d2h_route_data: list[tuple[str, list[str], list[str], list[str], list[str]]] = []

-    print()
-    print("=== PE DMA Latency (pe_dma -> xbar -> HBM, direct injection) ===")
-    print(f"  {'Case':<22} {'Target':<28} {'Actual':>8}"
-          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
-          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
-    print("  " + "-" * 120)
+    for name, cube, hops in d2h_cases:
+        if not show_all and case_filter != name:
+            continue
+        engine = GraphEngine(graph)
+        pa = _hbm_pa(sip=0, cube=cube, pe_id=0, spec=spec)
+        msg = MemoryReadMsg(
+            correlation_id="probe", request_id=name,
+            src_sip=0, src_cube=cube, src_pe=0,
+            src_pa=pa, nbytes=nbytes,
+        )
+        h = engine.submit(msg)
+        engine.wait(h)
+        _, trace = engine.get_completion(h)
+        total_ns = trace["total_ns"]
+        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
+
+        pa_obj = PhysAddr.decode(pa)
+        dst_node = resolver.resolve(pa_obj)
+
+        pcie_ep = resolver.find_pcie_ep(0)
+        fwd_path = router.find_memory_path(pcie_ep, dst_node)
+        rev_path = list(reversed(fwd_path))
+        bn_bw = _bottleneck_bw(fwd_path, edge_map)
+
+        wire, ovhd, drain, formula = _formula_breakdown(fwd_path, nbytes, edge_map, graph)
+
+        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
+        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0
+
+        d2h_results.append((name, hops, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct))
+        d2h_route_data.append((name, fwd_path, rev_path, [], fwd_path))
+
+    # === Collect PE DMA results ===
+    pe_cases = [
+        ("pe-local-hbm", 0, 0, 0, 0, 0),
+        ("pe-same-half-hbm", 0, 0, 0, 0, 1),
+        ("pe-cross-half-hbm", 0, 0, 0, 0, 4),
+        ("pe-cross-cube-hbm-best", 0, 0, 0, 1, 0),   # adjacent cube
+        ("pe-cross-cube-hbm-worst", 0, 0, 0, 15, 0),  # diagonal far cube
+    ]
+    pe_results: list[tuple[str, float, float, float | None, float, float, float, float, float]] = []
+    pe_route_data: list[tuple[str, list[str], str]] = []

    for name, sip, src_cube, src_pe, dst_cube, dst_pe in pe_cases:
        if not show_all and case_filter != name:
@@ -219,26 +302,146 @@ def run_probe(topology_path: str, case_filter: str | None = None) -> int:
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0

        target_str = f"c{src_cube}.pe{src_pe}->c{dst_cube}.slice{dst_pe}"
-        pe_results.append((name, total_ns, eff_bw, bn_bw))
-        pe_paths.append((name, dma_path))
-        print(f"  {name:<22} {target_str:<28} {total_ns:>8.2f}"
+        pe_results.append((name, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct))
+        pe_route_data.append((name, dma_path, target_str))
+
+    # ================================================================
+    # OUTPUT: Summary tables first, then route details
+    # ================================================================
+
+    # --- H2D Summary Table ---
+    print()
+    print(f"=== H2D Write Latency (IO->HBM, data={nbytes}B) ===")
+    print(f"  {'Case':<14} {'Target':<16} {'Hops':>4}   {'Actual':>8}"
+          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
+          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
+    print("  " + "-" * 115)
+
+    for i, (name, hops, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct) in enumerate(h2d_results):
+        cube = h2d_cases[i][1] if i < len(h2d_cases) else 0
+        print(f"  {name:<14} cube{cube}.pe0{'':<8} {hops:>4}   {total_ns:>8.2f}"
+              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
+              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")
+
+    if len(h2d_results) >= 2:
+        lats = [r[2] for r in h2d_results]
+        mono = all(lats[i] < lats[i + 1] for i in range(len(lats) - 1))
+        sym = "[v]" if mono else "[x]"
+        print(f"  {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")
+
+    if h2d_results:
+        h2d_sweep = [_sweep_util(r[5], r[7], r[4]) for r in h2d_results]
+        _print_sweep_table([r[0] for r in h2d_results], h2d_sweep)
+
+    # --- D2H Summary Table ---
+    print()
+    print(f"=== D2H Read Latency (HBM->IO, data={nbytes}B) ===")
+    print(f"  {'Case':<14} {'Source':<16} {'Hops':>4}   {'Actual':>8}"
+          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
+          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
+    print("  " + "-" * 115)
+
+    for i, (name, hops, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct) in enumerate(d2h_results):
+        cube = d2h_cases[i][1] if i < len(d2h_cases) else 0
+        print(f"  {name:<14} cube{cube}.pe0{'':<8} {hops:>4}   {total_ns:>8.2f}"
+              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
+              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")
+
+    if len(d2h_results) >= 2:
+        lats = [r[2] for r in d2h_results]
+        mono = all(lats[i] < lats[i + 1] for i in range(len(lats) - 1))
+        sym = "[v]" if mono else "[x]"
+        print(f"  {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")
+
+    if d2h_results:
+        # D2H fixed cost = actual_total - drain (includes fwd+rev overhead)
+        d2h_sweep = [_sweep_util(r[2] - r[6], 0.0, r[4]) for r in d2h_results]
+        _print_sweep_table([r[0] for r in d2h_results], d2h_sweep)
+
+    # H2D vs D2H comparison
+    if h2d_results and d2h_results and len(h2d_results) == len(d2h_results):
+        all_gte = all(d2h_results[i][2] >= h2d_results[i][2] for i in range(len(h2d_results)))
+        sym = "[v]" if all_gte else "[x]"
+        print(f"  {sym} D2H >= H2D (reverse data path): {'PASS' if all_gte else 'FAIL'}")
+
+    # --- PE DMA Summary Table ---
+    print()
+    print(f"=== PE DMA Latency (pe_dma -> xbar -> HBM, data={nbytes}B) ===")
+    print(f"  {'Case':<26} {'Target':<28} {'Actual':>8}"
+          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
+          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
+    print("  " + "-" * 124)
+
+    for name, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct in pe_results:
+        target_str = [t for n, _, t in pe_route_data if n == name]
+        t_str = target_str[0] if target_str else ""
+        print(f"  {name:<26} {t_str:<28} {total_ns:>8.2f}"
              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")

    if len(pe_results) >= 2:
        local = [r for r in pe_results if "local" in r[0]]
-        chain = [r for r in pe_results if "local" not in r[0]]
-        if local and chain:
+        remote = [r for r in pe_results if "local" not in r[0]]
+        if local and remote:
            print(f"  * Local BN: {_fmt_bw(local[0][3])} GB/s, "
-                  f"Chain/NOC BN: {_fmt_bw(chain[0][3])} GB/s")
+                  f"Remote BN: {_fmt_bw(remote[0][3])} GB/s")
+        best = [r for r in pe_results if "best" in r[0]]
+        worst = [r for r in pe_results if "worst" in r[0]]
+        if best and worst:
+            sym = "[v]" if best[0][1] < worst[0][1] else "[x]"
+            print(f"  {sym} Cross-cube best < worst: {'PASS' if best[0][1] < worst[0][1] else 'FAIL'}"
+                  f"  ({best[0][1]:.2f}ns < {worst[0][1]:.2f}ns)")

-    if pe_paths:
+    if pe_results:
+        pe_sweep = [_sweep_util(r[4], r[6], r[3]) for r in pe_results]
+        _print_sweep_table([r[0] for r in pe_results], pe_sweep)
+
+    # ================================================================
+    # ROUTE DETAILS (grouped below all tables)
+    # ================================================================
+    print()
+    print("=" * 60)
+    print("  ROUTE DETAILS (per-hop timestamps)")
+    print("=" * 60)
+
+    # --- H2D Routes ---
+    if h2d_route_data:
        print()
-        print("  Route Details:")
-        print(f"  {'Case':<22}  Path")
-        print("  " + "-" * 80)
-        for name, dma_path in pe_paths:
-            print(f"  {name:<22}  {_short_path(dma_path)}")
+        print("  --- H2D Write Routes ---")
+        for name, leg1, leg2, leg3, fwd_path in h2d_route_data:
+            timestamps = _hop_timestamps(fwd_path, nbytes, edge_map, graph)
+            print(f"\n  [{name}]")
+            print(f"    Leg1: {_short_path(leg1)}")
+            print(f"    Leg2: {_short_path(leg2)}")
+            print(f"    Leg3: {_short_path(leg3)}")
+            print(f"    Per-hop trace:")
+            _print_hop_trace(timestamps, indent="      ")
+
+    # --- D2H Routes ---
+    if d2h_route_data:
+        print()
+        print("  --- D2H Read Routes ---")
+        for name, fwd_path, rev_path, _, _ in d2h_route_data:
+            timestamps_fwd = _hop_timestamps(fwd_path, 0, edge_map, graph)
+            timestamps_rev = _hop_timestamps(rev_path, nbytes, edge_map, graph)
+            print(f"\n  [{name}]")
+            print(f"    Fwd (cmd):  {_short_path(fwd_path)}")
+            print(f"    Rev (data): {_short_path(rev_path)}")
+            print(f"    Forward cmd trace (no data):")
+            _print_hop_trace(timestamps_fwd, indent="      ")
+            print(f"    Reverse data trace:")
+            _print_hop_trace(timestamps_rev, indent="      ")
+
+    # --- PE DMA Routes ---
+    if pe_route_data:
+        print()
+        print("  --- PE DMA Routes ---")
+        for name, dma_path, target_str in pe_route_data:
+            timestamps = _hop_timestamps(dma_path, nbytes, edge_map, graph)
+            print(f"\n  [{name}] {target_str}")
+            print(f"    Path: {_short_path(dma_path)}")
+            print(f"    Per-hop trace:")
+            _print_hop_trace(timestamps, indent="      ")

    print()
    return 0
@@ -18,13 +18,14 @@ from kernbench.components.impls.pe_math import PeMathComponent
 from kernbench.components.impls.pe_scheduler import PeSchedulerComponent
 from kernbench.components.impls.pe_tcm import PeTcmComponent
 from kernbench.components.impls.sram import SramComponent
+from kernbench.components.impls.xbar import PositionAwareXbarComponent

 ComponentRegistry.register("forwarding_v1", TransitComponent)
 ComponentRegistry.register("switch_v1", TransitComponent)
 ComponentRegistry.register("noc_v1", TransitComponent)
 ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent)
 ComponentRegistry.register("ucie_v1", TransitComponent)
-ComponentRegistry.register("xbar_v1", TransitComponent)
+ComponentRegistry.register("xbar_v1", PositionAwareXbarComponent)
 ComponentRegistry.register("pcie_ep_v1", PcieEpComponent)
 ComponentRegistry.register("io_cpu_v1", IoCpuComponent)
 ComponentRegistry.register("m_cpu_v1", MCpuComponent)
@@ -50,5 +51,6 @@ __all__ = [
    "PeTcmComponent",
    "TransitComponent",
    "TwoDMeshNocComponent",
+    "PositionAwareXbarComponent",
    "SramComponent",
 ]
@@ -69,17 +69,37 @@ class HbmCtrlComponent(ComponentBase):
        yield from self._send_response(env, txn)

    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
-        """Create ResponseMsg and send on reverse path back to originator.
+        """Route completion based on path type.

-        PeDmaMsg is a direct probe with no IO_CPU/M_CPU aggregation in the path,
-        so we succeed txn.done directly instead of sending a response Transaction.
+        - PeDmaMsg: succeed done directly (probe).
+        - Bypass path (no m_cpu): MemoryWrite succeeds done; MemoryRead sends
+          data back on reverse path with original done event.
+        - M_CPU DMA path: send ResponseMsg for m_cpu/io_cpu aggregation.
        """
-        from kernbench.runtime_api.kernel import PeDmaMsg
+        from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg

        if isinstance(txn.request, PeDmaMsg):
            txn.done.succeed()
            return

+        # Bypass path: no m_cpu in the transaction path
+        is_bypass = not any("m_cpu" in n for n in txn.path)
+        if is_bypass:
+            if isinstance(txn.request, MemoryReadMsg):
+                # D2H: send data back on reverse path to pcie_ep
+                reverse_path = list(reversed(txn.path))
+                if len(reverse_path) >= 2:
+                    resp_txn = Transaction(
+                        request=txn.request, path=reverse_path, step=0,
+                        nbytes=txn.request.nbytes, done=txn.done,
+                    )
+                    yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+                    return
+            # MemoryWrite bypass or short path: done
+            txn.done.succeed()
+            return
+
+        # M_CPU DMA path: send ResponseMsg for aggregation
        reverse_path = list(reversed(txn.path))
        if len(reverse_path) >= 2 and self.ctx:
            from kernbench.runtime_api.kernel import ResponseMsg
@@ -52,6 +52,26 @@ class TwoDMeshNocComponent(ComponentBase):
    def _build_grid(self) -> None:
        if not self.ctx:
            return
+        mesh = self.ctx.spec.get("_mesh") if self.ctx.spec else None
+        if mesh:
+            self._build_grid_from_mesh(mesh)
+        else:
+            self._build_grid_from_positions()
+
+    def _build_grid_from_mesh(self, mesh: dict) -> None:
+        """Build XY grid from cube_mesh.yaml router positions (authoritative)."""
+        origin_x, origin_y = self._cube_origin()
+        xs: set[float] = set()
+        ys: set[float] = set()
+        for key, router in mesh.get("routers", {}).items():
+            if router is not None:
+                xs.add(round(origin_x + router["pos_mm"][0], 2))
+                ys.add(round(origin_y + router["pos_mm"][1], 2))
+        self._x_grid = sorted(xs)
+        self._y_grid = sorted(ys)
+
+    def _build_grid_from_positions(self) -> None:
+        """Fallback: infer grid from all node positions in the cube."""
        cube_prefix = self.node.id.rsplit(".", 1)[0]
        xs: set[float] = set()
        ys: set[float] = set()
@@ -62,6 +82,23 @@ class TwoDMeshNocComponent(ComponentBase):
        self._x_grid = sorted(xs)
        self._y_grid = sorted(ys)

+    def _cube_origin(self) -> tuple[float, float]:
+        """Compute absolute origin (top-left) of this cube from cube_id."""
+        parts = self.node.id.split(".")
+        cube_str = [p for p in parts if p.startswith("cube")][0]
+        cube_id = int(cube_str[4:])
+        spec = self.ctx.spec
+        sip_spec = spec.get("sip", {})
+        cube_spec = spec.get("cube", {})
+        mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4)
+        cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
+        cube_h = cube_spec.get("geometry", {}).get("cube_mm", {}).get("h", 14.0)
+        seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get(
+            "distance_mm_across_seam", 1.0)
+        col = cube_id % mesh_w
+        row = cube_id // mesh_w
+        return (col * (cube_w + seam), row * (cube_h + seam))
+
    def _get_link(self, key: tuple) -> simpy.Resource:
        if key not in self._links:
            assert self._env is not None
@@ -0,0 +1,168 @@
+"""Position-aware XBAR component.
+
+Models crossbar latency as base_overhead_ns + internal_distance * ns_per_mm,
+where internal_distance is the Manhattan distance between the entry port
+(PE router attachment) and exit port (HBM slice logical position) within
+the crossbar matrix.
+
+PE router positions come from cube_mesh.yaml (via ctx.spec["_mesh"]).
+HBM slice positions are uniformly distributed across the HBM physical width.
+"""
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PositionAwareXbarComponent(ComponentBase):
+    """XBAR with position-dependent latency based on PE-to-slice distance.
+
+    Latency = base_overhead_ns + |entry_port_x - exit_port_x| * ns_per_mm
+
+    Entry/exit port X positions are determined from the transaction path:
+      - PE_DMA nodes: router X from cube_mesh.yaml
+      - HBM slices: uniformly distributed across HBM physical width
+      - Bridge nodes: physical X from topology positions
+      - NOC: resolved by scanning path for PE_DMA node
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._base_overhead_ns = float(node.attrs.get("overhead_ns", 0.0))
+        self._pe_router_xs: dict[str, float] = {}
+        self._slice_xs: dict[str, float] = {}
+        self._bridge_xs: dict[str, float] = {}
+        self._ns_per_mm: float = 0.0
+
+    def start(self, env: simpy.Environment) -> None:
+        self._build_position_map()
+        super().start(env)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        yield env.timeout(self._base_overhead_ns)
+
+    # ── Position map construction ─────────────────────────────────
+
+    def _build_position_map(self) -> None:
+        if not self.ctx or not self.ctx.spec:
+            return
+        mesh = self.ctx.spec.get("_mesh")
+        if not mesh:
+            return
+
+        self._ns_per_mm = self.ctx.ns_per_mm
+        cube_prefix = self.node.id.rsplit(".", 1)[0]
+        xbar_name = self.node.id.rsplit(".", 1)[1]
+        is_top = xbar_name == "xbar_top"
+        xbar_key = "top" if is_top else "bottom"
+
+        # PE router X positions from mesh attachments
+        routers_list = mesh.get("xbar", {}).get(xbar_key, {}).get("routers", [])
+        for router_id in routers_list:
+            router_data = mesh["routers"].get(router_id)
+            if router_data is None:
+                continue
+            router_x = router_data["pos_mm"][0]
+            for attach in router_data.get("attach", []):
+                if attach.endswith(".dma"):
+                    pe_name = attach.split(".")[0]
+                    pe_dma_id = f"{cube_prefix}.{pe_name}.pe_dma"
+                    self._pe_router_xs[pe_dma_id] = router_x
+
+        # HBM slice X positions: uniformly distributed across HBM width
+        cube_spec = self.ctx.spec.get("cube", {})
+        cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
+        hbm_w = cube_spec.get("geometry", {}).get("hbm_mm", {}).get("w", 9.0)
+        n_slices = cube_spec.get("memory_map", {}).get("hbm_slices_per_cube", 8)
+        half = n_slices // 2
+        hbm_left = (cube_w - hbm_w) / 2
+
+        if is_top:
+            slice_range = range(half)
+        else:
+            slice_range = range(half, n_slices)
+
+        n = len(list(slice_range))
+        for i, sl in enumerate(slice_range):
+            if n > 1:
+                x = hbm_left + i * hbm_w / (n - 1)
+            else:
+                x = cube_w / 2
+            self._slice_xs[f"{cube_prefix}.hbm_ctrl.slice{sl}"] = x
+
+        # Bridge X positions from topology positions
+        for node_id, pos in self.ctx.positions.items():
+            if node_id.startswith(cube_prefix + ".bridge.") and pos is not None:
+                origin_x = self._cube_origin_x()
+                self._bridge_xs[node_id] = pos[0] - origin_x
+
+    def _cube_origin_x(self) -> float:
+        """Compute absolute X origin of this cube."""
+        parts = self.node.id.split(".")
+        cube_str = [p for p in parts if p.startswith("cube")][0]
+        cube_id = int(cube_str[4:])
+        spec = self.ctx.spec
+        sip_spec = spec.get("sip", {})
+        cube_spec = spec.get("cube", {})
+        mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4)
+        cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
+        seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get(
+            "distance_mm_across_seam", 1.0)
+        col = cube_id % mesh_w
+        return col * (cube_w + seam)
+
+    # ── Worker override ───────────────────────────────────────────
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        while True:
+            txn: Any = yield self._inbox.get()
+            env.process(self._position_aware_forward(env, txn))
+
+    def _position_aware_forward(
+        self, env: simpy.Environment, txn: Any,
+    ) -> Generator:
+        prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
+        next_hop = txn.next_hop
+
+        overhead = self._base_overhead_ns
+        if prev_hop and next_hop and self._ns_per_mm > 0:
+            entry_x = self._get_port_x(prev_hop, txn.path)
+            exit_x = self._get_port_x(next_hop, txn.path)
+            if entry_x is not None and exit_x is not None:
+                overhead = self._base_overhead_ns + abs(entry_x - exit_x) * self._ns_per_mm
+
+        yield env.timeout(overhead)
+
+        if next_hop:
+            yield self.out_ports[next_hop].put(txn.advance())
+        else:
+            drain = getattr(txn, "drain_ns", 0.0)
+            if drain > 0:
+                yield env.timeout(drain)
+            txn.done.succeed()
+
+    def _get_port_x(self, node_id: str, path: list[str]) -> float | None:
+        """Resolve the X position of an XBAR port from node context."""
+        # Direct lookup: PE DMA
+        if node_id in self._pe_router_xs:
+            return self._pe_router_xs[node_id]
+        # Direct lookup: HBM slice
+        if node_id in self._slice_xs:
+            return self._slice_xs[node_id]
+        # Direct lookup: bridge
+        if node_id in self._bridge_xs:
+            return self._bridge_xs[node_id]
+        # NOC: scan path for PE DMA node
+        if "noc" in node_id:
+            for p in path:
+                if p in self._pe_router_xs:
+                    return self._pe_router_xs[p]
+        return None
@@ -110,7 +110,7 @@ class PathRouter:
    def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_slice_id: str) -> list[str]:
        """M_CPU DMA path: never routes through PE-internal nodes (ADR-0015 D5).

-        Same-cube: deterministic [m_cpu, noc, xbar.pe_i, hbm_ctrl.slice_i].
+        Same-cube: deterministic [m_cpu, noc, xbar_top/bot, hbm_ctrl.slice_i].
        Cross-cube: Dijkstra via _adj_mcpu_dma (pe_internal/pe_to_xbar excluded)
                    → routes through NOC → UCIe → target cube NOC → xbar → HBM.
        """
@@ -118,14 +118,23 @@ class PathRouter:
        d_cube = ".".join(dst_hbm_slice_id.split(".")[:2])
        if m_cube == d_cube:
            slice_idx = int(dst_hbm_slice_id.rsplit("slice", 1)[1])
+            xbar = "xbar_top" if slice_idx < 4 else "xbar_bot"
            return [
                m_cpu_id,
                f"{m_cube}.noc",
-                f"{m_cube}.xbar.pe{slice_idx}",
+                f"{m_cube}.{xbar}",
                dst_hbm_slice_id,
            ]
        return self._run_dijkstra(self._adj_mcpu_dma, m_cpu_id, dst_hbm_slice_id)

+    def find_memory_path(self, src: str, dst: str) -> list[str]:
+        """Direct memory path: pcie_ep → io_noc → cube → xbar → hbm_ctrl.
+
+        Uses _adj_mcpu_dma which excludes pe_internal and pe_to_xbar edges,
+        preventing routing through PE pipeline nodes.
+        """
+        return self._run_dijkstra(self._adj_mcpu_dma, src, dst)
+
    def find_node_path(self, src: str, dst: str) -> list[str]:
        """General routing between arbitrary nodes, including command edges.

@@ -18,11 +18,10 @@ from kernbench.topology.types import Edge, TopologyGraph
 class GraphEngine:
    """simpy-based discrete-event simulation engine.

-    Phase B: engine injects a Transaction into the PCIE_EP host queue for
-    each request. Components handle their own routing:
-      Path 1: PCIE_EP → IO_CPU   (engine-computed path, pre-loaded in Transaction)
-      Path 2: IO_CPU → M_CPU     (IO_CPU dispatches, fire-and-forget callback)
-      Path 3: M_CPU.DMA → HBM   (M_CPU dispatches, fire-and-forget callback)
+    Request routing:
+      MemoryWrite/Read: pcie_ep → io_noc → cube → xbar → hbm_ctrl (m_cpu bypass)
+      KernelLaunch:     pcie_ep → io_noc → io_cpu → io_noc → cube → m_cpu → PE
+      PeDmaMsg:         pe_dma → xbar → hbm_ctrl (direct probe)

    Component implementations are DI-injectable via component_overrides (ADR-0007 D3).
    """
@@ -68,18 +67,20 @@ class GraphEngine:
            src_comp.out_ports[e.dst] = store
            dst_comp.in_ports[e.src] = store

-        # Wire processes: propagation delay per edge (ADR-0015 D2)
-        # Cut-through (wormhole) model: wires apply propagation only.
-        # Serialization (drain) is computed per-path and applied once at the terminal.
+        # Wire processes: propagation delay + BW occupancy per edge (ADR-0015 D2)
+        # Cut-through (wormhole) model: wires apply propagation delay per hop.
+        # BW occupancy (available_at) tracks when each directed link becomes free
+        # for the next transaction, modeling back-to-back serialization contention.
        for e in graph.edges:
            src_comp = self._components.get(e.src)
            dst_comp = self._components.get(e.dst)
            if src_comp is None or dst_comp is None:
                continue
            prop_ns = e.distance_mm * self._ns_per_mm
+            bw_gbs = e.bw_gbs or 0.0
            self._env.process(
                self._wire(src_comp.out_ports[e.dst], dst_comp.in_ports[e.src],
-                           prop_ns)
+                           prop_ns, bw_gbs)
            )

        # Attach host queues to PCIE_EP in_ports before start() (ADR-0015 D3)
@@ -125,14 +126,33 @@ class GraphEngine:
        out_port: simpy.Store,
        in_port: simpy.Store,
        prop_ns: float,
+        bw_gbs: float = 0.0,
    ):
-        """SimPy process: relay messages with propagation delay only.
+        """SimPy process: relay messages with propagation delay and BW occupancy.

-        Cut-through (wormhole) model: serialization (drain) is computed per-path
-        and applied once at the terminal component, not at every wire hop.
+        Each directed edge maintains an ``available_at`` timestamp tracking when
+        the link becomes free for the next transaction.  When a transaction of
+        ``nbytes`` uses a link with ``bw_gbs``, the link is occupied for
+        ``nbytes / bw_gbs`` ns.  The *next* transaction on the same directed
+        link must wait until ``available_at`` passes (back-to-back serialization).
+
+        The *current* transaction is NOT delayed by its own occupancy — only by
+        a prior transaction's occupancy that has not yet cleared.  This avoids
+        double-drain: terminal drain_ns handles single-transaction serialization,
+        while available_at handles inter-transaction BW contention.
        """
+        available_at = 0.0
        while True:
            msg = yield out_port.get()
+            # BW occupancy: wait for link to become free, then mark busy
+            if bw_gbs > 0:
+                nbytes = getattr(msg, "nbytes", 0)
+                if nbytes > 0:
+                    wait = available_at - self._env.now
+                    if wait > 0:
+                        yield self._env.timeout(wait)
+                    available_at = self._env.now + (nbytes / bw_gbs)
+            # Propagation delay
            if prop_ns > 0:
                yield self._env.timeout(prop_ns)
            yield in_port.put(msg)
@@ -142,6 +162,10 @@ class GraphEngine:
            yield from self._process_pe_dma(key, request, done)
            return

+        if isinstance(request, (MemoryWriteMsg, MemoryReadMsg)):
+            yield from self._process_memory_direct(key, request, done)
+            return
+
        entries = self._entry_points(request)
        if not entries:
            self._results[key] = (
@@ -200,6 +224,44 @@ class GraphEngine:
        )
        done.succeed()

+    def _process_memory_direct(self, key: str, request: Any, done: simpy.Event):
+        """Run one MemoryWrite/MemoryRead request over the direct HBM route.
+
+        Route: pcie_ep → io_noc → cube → xbar → hbm_ctrl.
+
+        MemoryWrite: the payload travels forward, so the transaction carries
+                     ``request.nbytes`` on the wires and drains at the
+                     hbm_ctrl terminal.
+        MemoryRead:  only the command travels forward (``nbytes=0``); hbm_ctrl
+                     sends the data back on the reverse path with
+                     ``nbytes=request.nbytes``.
+
+        Stores ``(Completion, stats)`` under ``key`` in ``self._results`` and
+        then triggers ``done``.
+        """
+        is_write = isinstance(request, MemoryWriteMsg)
+        # Writes are addressed by their destination, reads by their source.
+        if is_write:
+            sip, pa_val = request.dst_sip, request.dst_pa
+        else:
+            sip, pa_val = request.src_sip, request.src_pa
+
+        pcie_ep_id = self._resolver.find_pcie_ep(sip)
+        hbm_node = self._resolver.resolve(PhysAddr.decode(pa_val))
+        path = self._router.find_memory_path(pcie_ep_id, hbm_node)
+        # Drain is sized by the full payload in both directions, even though
+        # a read's forward command carries zero bytes on the wires.
+        drain_ns = self._path_drain_ns(path, request.nbytes)
+
+        start_ns = self._env.now
+        txn_done = self._env.event()
+        wire_nbytes = request.nbytes if is_write else 0
+        txn = Transaction(
+            request=request, path=path, step=0, nbytes=wire_nbytes,
+            done=txn_done, drain_ns=drain_ns,
+        )
+
+        # Inject at the PCIe endpoint's host queue, then block until the
+        # transaction's completion event fires.
+        yield self._host_queues[pcie_ep_id].put(txn)
+        yield txn_done
+
+        stats = {"total_ns": self._env.now - start_ns, "nbytes": request.nbytes}
+        self._results[key] = (Completion(ok=True), stats)
+        done.succeed()
+
    def _process_pe_dma(self, key: str, request: PeDmaMsg, done: simpy.Event):
        """Inject a Transaction directly at PE_DMA for PE→HBM latency measurement."""
        pe_prefix = f"sip{request.src_sip}.cube{request.src_cube}.pe{request.src_pe}"
@@ -260,25 +322,8 @@ class GraphEngine:
    def _entry_points(self, request: Any) -> list[tuple[str, str, int]]:
        """Return list of (pcie_ep_id, io_cpu_id, nbytes) per target SIP.

-        For Memory{Write,Read}: single SIP entry.
-        For KernelLaunchMsg: one entry per distinct SIP in tensor shards.
+        Only handles KernelLaunchMsg. MemoryWrite/Read use _process_memory_direct.
        """
-        if isinstance(request, MemoryWriteMsg):
-            sip = request.dst_sip
-            return [(
-                self._resolver.find_pcie_ep(sip),
-                self._resolver.find_io_cpu(sip),
-                request.nbytes,
-            )]
-
-        if isinstance(request, MemoryReadMsg):
-            sip = request.src_sip
-            return [(
-                self._resolver.find_pcie_ep(sip),
-                self._resolver.find_io_cpu(sip),
-                request.nbytes,
-            )]
-
        if isinstance(request, KernelLaunchMsg):
            seen: set[int] = set()
            entries: list[tuple[str, str, int]] = []
@@ -5,11 +5,13 @@ TopologyGraph with nodes, edges, and representative view projections.
 """
 from __future__ import annotations

+import math
 from pathlib import Path
 from typing import Any

 import yaml

+from .mesh_gen import ensure_mesh_file
 from .types import Edge, Node, TopologyGraph, TopologyHandle, ViewGraph


@@ -42,6 +44,10 @@ def load_topology(path: Path) -> TopologyGraph:
    """Load topology spec from file and compile into a topology graph."""
    spec = _read_spec(path)
    _validate_spec(spec)
+    # Generate cube_mesh.yaml alongside the topology file
+    mesh_path = path.parent / "cube_mesh.yaml"
+    mesh_data = ensure_mesh_file(spec["cube"], mesh_path)
+    spec["_mesh"] = mesh_data
    return _compile_graph(spec)


@@ -110,7 +116,7 @@ def _compile_graph(spec: dict) -> TopologyGraph:
                cid = row * mesh_w + col
                cp = f"{sp}.cube{cid}"
                origin = (col * stride_x, row * stride_y)
-                _instantiate_cube(nodes, edges, cp, cube_spec, origin)
+                _instantiate_cube(nodes, edges, cp, cube_spec, origin, spec["_mesh"])

        # Inter-cube UCIe mesh
        _add_inter_cube_edges(edges, sp, mesh_w, mesh_h, sip_spec)
@@ -148,9 +154,9 @@ def _cube_local_positions(cube_w: float, cube_h: float) -> dict[str, tuple[float
        "ucie-W": (uw, cy),
        "ucie-E": (cube_w - uw, cy),
        "m_cpu": (cube_w - 2.5, cy - 1.5),
-        "xbar.top": (cx, 3.5),       # Y reference for top-half xbar.pe nodes
+        "xbar_top": (cx, 3.5),
        "hbm_ctrl": (cx - 2.0, cy),
-        "xbar.bottom": (cx, cube_h - 3.5),  # Y reference for bottom-half xbar.pe nodes
+        "xbar_bot": (cx, cube_h - 3.5),
        "bridge.left": (2.5, cy + 2.0),
        "bridge.right": (cube_w - 2.5, cy + 2.0),
        "noc": (cx + 2.0, cy),
@@ -195,10 +201,11 @@ def _instantiate_io_chiplets(
    mesh_h: int,
    seam: float,
 ) -> None:
-    """Add IO chiplet nodes and internal pcie_ep → io_cpu edges."""
+    """Add IO chiplet nodes: pcie_ep, io_cpu, io_noc, io_ucie PHYs, conn nodes."""
    io_spec = sip_spec["iochiplet"]
    comp = io_spec["components"]
    links = io_spec["links"]
+    ucie_cfg = io_spec.get("ucie", {})
    mesh_total_w = mesh_w * cube_w + (mesh_w - 1) * seam
    mesh_total_h = mesh_h * cube_h + (mesh_h - 1) * seam

@@ -208,9 +215,9 @@ def _instantiate_io_chiplets(
        side = inst["place"]["side"]
        cx = mesh_total_w / 2
        if side == "N":
-            pcie_y, cpu_y = -5.0, -3.0
+            pcie_y, cpu_y, noc_y = -5.0, -3.0, -4.0
        else:
-            pcie_y, cpu_y = mesh_total_h + 5.0, mesh_total_h + 3.0
+            pcie_y, cpu_y, noc_y = mesh_total_h + 5.0, mesh_total_h + 3.0, mesh_total_h + 4.0

        # pcie_ep
        ep = comp["pcie_ep"]
@@ -228,13 +235,114 @@ def _instantiate_io_chiplets(
            attrs=cpu["attrs"], pos_mm=(cx, cpu_y), label="IO CPU",
        )

-        # Internal edge
+        # io_noc (central switch inside IOChiplet)
+        noc = comp["io_noc"]
+        noc_id = f"{prefix}.noc"
+        nodes[noc_id] = Node(
+            id=noc_id, kind=noc["kind"], impl=noc["impl"],
+            attrs=noc["attrs"], pos_mm=(cx, noc_y), label="IO NOC",
+        )
+
+        # pcie_ep ↔ io_noc (bidirectional)
        edges.append(Edge(
-            src=ep_id, dst=cpu_id,
-            distance_mm=links["pcie_ep_to_io_cpu_mm"],
-            bw_gbs=links["pcie_ep_to_io_cpu_bw_gbs"],
+            src=ep_id, dst=noc_id,
+            distance_mm=links["pcie_ep_to_noc_mm"],
+            bw_gbs=links["pcie_ep_to_noc_bw_gbs"],
            kind="io_internal",
        ))
+        edges.append(Edge(
+            src=noc_id, dst=ep_id,
+            distance_mm=links["pcie_ep_to_noc_mm"],
+            bw_gbs=links["pcie_ep_to_noc_bw_gbs"],
+            kind="io_internal",
+        ))
+
+        # io_cpu ↔ io_noc (bidirectional)
+        edges.append(Edge(
+            src=cpu_id, dst=noc_id,
+            distance_mm=links["io_cpu_to_noc_mm"],
+            bw_gbs=links["io_cpu_to_noc_bw_gbs"],
+            kind="io_internal",
+        ))
+        edges.append(Edge(
+            src=noc_id, dst=cpu_id,
+            distance_mm=links["io_cpu_to_noc_mm"],
+            bw_gbs=links["io_cpu_to_noc_bw_gbs"],
+            kind="io_internal",
+        ))
+
+        # io_ucie PHY nodes + conn nodes per PHY
+        io_ucie_ns = float(ucie_cfg.get("overhead_ns", 1.0))
+        io_n_conn = int(ucie_cfg.get("n_connections", 4))
+        io_conn_bw = float(ucie_cfg.get("per_connection_bw_gbs", 128.0))
+        io_noc_to_ucie_mm = float(ucie_cfg.get("noc_to_ucie_mm", 0.5))
+
+        for phy in inst["ucie"]["phys"]:
+            phy_id = f"{prefix}.ucie-{phy}"
+            nodes[phy_id] = Node(
+                id=phy_id, kind="io_ucie", impl="ucie_v1",
+                attrs={"overhead_ns": io_ucie_ns},
+                pos_mm=(cx, noc_y), label=f"IO UCIe-{phy}",
+            )
+
+            for ci in range(io_n_conn):
+                conn_id = f"{phy_id}.conn{ci}"
+                nodes[conn_id] = Node(
+                    id=conn_id, kind="io_ucie_conn", impl="ucie_v1",
+                    attrs={"overhead_ns": 0.0},
+                    pos_mm=(cx, noc_y), label=f"IO UCIe-{phy} C{ci}",
+                )
+                # io_noc ↔ conn (per-connection BW)
+                edges.append(Edge(
+                    src=noc_id, dst=conn_id,
+                    distance_mm=io_noc_to_ucie_mm,
+                    bw_gbs=io_conn_bw,
+                    kind="io_noc_to_conn",
+                ))
+                edges.append(Edge(
+                    src=conn_id, dst=noc_id,
+                    distance_mm=io_noc_to_ucie_mm,
+                    bw_gbs=io_conn_bw,
+                    kind="conn_to_io_noc",
+                ))
+                # conn ↔ io_ucie (internal, no BW limit)
+                edges.append(Edge(
+                    src=conn_id, dst=phy_id,
+                    distance_mm=0.0, kind="io_ucie_internal",
+                ))
+                edges.append(Edge(
+                    src=phy_id, dst=conn_id,
+                    distance_mm=0.0, kind="io_ucie_internal",
+                ))
+
+
+# ── PE-to-router distance ─────────────────────────────────────────
+
+
+def _compute_pe_noc_distances(
+    mesh_data: dict,
+    corner_pos: dict[str, list[tuple[float, float]]],
+    corners: list[str],
+    pe_per_corner: int,
+) -> dict[int, float]:
+    """Compute per-PE Euclidean distance from physical position to assigned router.
+
+    PEs are numbered in traversal order: corners in the order given, then
+    ``pe_per_corner`` slots within each corner.  PE ``i`` is assigned to the
+    first router in ``mesh_data["routers"]`` whose ``attach`` list names
+    ``pe{i}.dma``.
+
+    Args:
+        mesh_data: Mesh description; ``mesh_data["routers"]`` maps router keys
+            to ``None`` or to a dict with ``pos_mm`` (x, y) and an optional
+            ``attach`` list of attachment names.
+        corner_pos: PE (x, y) positions per corner name.
+        corners: Corner names in PE-numbering order.
+        pe_per_corner: Number of PEs taken from each corner.
+
+    Returns:
+        ``{pe_idx: distance}`` in the same units as ``pos_mm``, rounded to two
+        decimals; ``0.0`` for a PE that no router attaches.
+    """
+    # Index attachment name -> router once instead of rescanning every router
+    # for every PE.  setdefault keeps the first router that lists a name, which
+    # matches a first-match scan in dict order.
+    router_by_attach: dict[str, dict] = {}
+    for router in mesh_data["routers"].values():
+        if router is None:
+            continue
+        for attached in router.get("attach", []):
+            router_by_attach.setdefault(attached, router)
+
+    distances: dict[int, float] = {}
+    pe_idx = 0
+    for corner in corners:
+        for ci in range(pe_per_corner):
+            pe_cx, pe_cy = corner_pos[corner][ci]
+            router = router_by_attach.get(f"pe{pe_idx}.dma")
+            if router is None:
+                # No router claims this PE: fall back to zero distance.
+                distances[pe_idx] = 0.0
+            else:
+                rx, ry = router["pos_mm"]
+                distances[pe_idx] = round(math.hypot(pe_cx - rx, pe_cy - ry), 2)
+            pe_idx += 1
+    return distances


 # ── Instantiation: cube + PEs ───────────────────────────────────────
@@ -246,18 +354,26 @@ def _instantiate_cube(
    cp: str,
    cube: dict,
    origin: tuple[float, float],
+    mesh_data: dict,
 ) -> None:
-    """Add all cube-internal nodes and edges, including PE instances."""
+    """Add all cube-internal nodes and edges, including PE instances.
+
+    Topology: PE_DMA → NOC → xbar_top/bot → HBM_CTRL.
+    No per-PE xbar nodes; position-aware XBAR top/bottom replaces chaining.
+    """
    cube_w = cube["geometry"]["cube_mm"]["w"]
    cube_h = cube["geometry"]["cube_mm"]["h"]
    ox, oy = origin
    local_pos = _cube_local_positions(cube_w, cube_h)
    clinks = cube["links"]
    n_slices = cube["memory_map"]["hbm_slices_per_cube"]
+    half = n_slices // 2

-    # ── UCIe ports ──
-    ucie_ns = cube["ucie"]["overhead_ns"]
-    for port in cube["ucie"]["ports"]:
+    # ── UCIe ports + connection nodes ──
+    ucie_cfg = cube["ucie"]
+    ucie_ns = ucie_cfg["overhead_ns"]
+    ucie_n_conn = ucie_cfg.get("n_connections", 1)
+    for port in ucie_cfg["ports"]:
        pid = f"{cp}.ucie-{port}"
        lx, ly = local_pos[f"ucie-{port}"]
        nodes[pid] = Node(
@@ -265,6 +381,14 @@ def _instantiate_cube(
            attrs={"overhead_ns": ucie_ns}, pos_mm=(ox + lx, oy + ly),
            label=f"UCIe-{port}",
        )
+        for ci in range(ucie_n_conn):
+            conn_id = f"{cp}.ucie-{port}.conn{ci}"
+            nodes[conn_id] = Node(
+                id=conn_id, kind="ucie_conn", impl="ucie_v1",
+                attrs={"overhead_ns": 0.0},
+                pos_mm=(ox + lx, oy + ly),
+                label=f"UCIe-{port} C{ci}",
+            )

    # ── Named components: noc, m_cpu, sram ──
    for name in ("noc", "m_cpu", "sram"):
@@ -277,7 +401,19 @@ def _instantiate_cube(
            label=name.upper().replace("_", " "),
        )

-    # ── HBM controller slices (one per PE) ──
+    # ── xbar_top and xbar_bot (position-aware XBAR) ──
+    xbar_spec = cube["components"]["xbar"]
+    for xbar_name, xbar_cfg in [("xbar_top", xbar_spec["top"]),
+                                 ("xbar_bot", xbar_spec["bottom"])]:
+        nid = f"{cp}.{xbar_name}"
+        lx, ly = local_pos[xbar_name]
+        nodes[nid] = Node(
+            id=nid, kind=xbar_cfg["kind"], impl=xbar_cfg["impl"],
+            attrs=xbar_cfg["attrs"], pos_mm=(ox + lx, oy + ly),
+            label=xbar_name.upper().replace("_", " "),
+        )
+
+    # ── HBM controller slices ──
    hbm_spec = cube["components"]["hbm_ctrl"]
    hbm_lx, hbm_ly = local_pos["hbm_ctrl"]
    for sl in range(n_slices):
@@ -289,7 +425,7 @@ def _instantiate_cube(
        )

    # ── Bridges ──
-    for br in cube["components"]["xbar"]["bridges"]:
+    for br in xbar_spec["bridges"]:
        bname = br["id"]
        nid = f"{cp}.bridge.{bname}"
        lx, ly = local_pos[f"bridge.{bname}"]
@@ -299,34 +435,22 @@ def _instantiate_cube(
            label=f"Bridge {bname.upper()}",
        )

-    # ── PE instances + per-PE xbar entry nodes ──
+    # ── PE instances (no per-PE xbar nodes) ──
    corners = cube["pe_layout"]["corners"]
    pe_per_corner = cube["pe_layout"]["pe_per_corner"]
    corner_pos = _corner_pe_positions(cube_w, cube_h)
    pe_tmpl = cube["pe_template"]
    pe_links = pe_tmpl["links"]
-
-    xbar_pe_spec = cube["components"]["xbar"]["pe"]
-    xbar_top_y = local_pos["xbar.top"][1]
-    xbar_bot_y = local_pos["xbar.bottom"][1]
+    pe_noc_distances = _compute_pe_noc_distances(
+        mesh_data, corner_pos, corners, pe_per_corner,
+    )

    pe_idx = 0
    for corner in corners:
-        is_top = corner in ("NW", "NE")
-        xbar_y = xbar_top_y if is_top else xbar_bot_y
-        mm_key = "pe_to_xbar_row_n_mm" if is_top else "pe_to_xbar_row_s_mm"
        for ci in range(pe_per_corner):
            pp = f"{cp}.pe{pe_idx}"
            pe_cx, pe_cy = corner_pos[corner][ci]

-            # Per-PE xbar entry node
-            xbar_nid = f"{cp}.xbar.pe{pe_idx}"
-            nodes[xbar_nid] = Node(
-                id=xbar_nid, kind=xbar_pe_spec["kind"], impl=xbar_pe_spec["impl"],
-                attrs=xbar_pe_spec["attrs"], pos_mm=(ox + pe_cx, oy + xbar_y),
-                label=f"XBAR PE{pe_idx}",
-            )
-
            # PE template components
            for comp_name, comp_spec in pe_tmpl["components"].items():
                cid = f"{pp}.{comp_name}"
@@ -341,18 +465,10 @@ def _instantiate_cube(
            # PE-internal edges
            _add_pe_internal_edges(edges, pp, pe_links)

-            # PE_DMA → xbar.pe_i (HBM data path)
-            edges.append(Edge(
-                src=f"{pp}.pe_dma", dst=xbar_nid,
-                distance_mm=clinks[mm_key],
-                bw_gbs=clinks["pe_to_xbar_bw_gbs"],
-                kind="pe_to_xbar",
-            ))
-
-            # PE_DMA → noc (non-HBM data path: SRAM, inter-cube, etc.)
+            # PE_DMA → noc (distance auto-computed from PE physical position)
            edges.append(Edge(
                src=f"{pp}.pe_dma", dst=f"{cp}.noc",
-                distance_mm=clinks["pe_dma_to_noc_mm"],
+                distance_mm=pe_noc_distances.get(pe_idx, 0.0),
                bw_gbs=clinks["pe_dma_to_noc_bw_gbs"],
                kind="pe_to_noc",
            ))
@@ -366,97 +482,96 @@ def _instantiate_cube(

            pe_idx += 1

-    # ── Cube fabric edges ──
-
-    # xbar.pe_i ↔ hbm_ctrl.slice_i (local Y-path, bidirectional for response)
-    for i in range(n_slices):
+    # ── xbar_top/bot → HBM slices ──
+    hbm_eff = float(hbm_spec.get("attrs", {}).get("efficiency", 1.0))
+    hbm_bw = clinks["xbar_to_hbm_bw_gbs"] * hbm_eff
+    for i in range(half):
        edges.append(Edge(
-            src=f"{cp}.xbar.pe{i}", dst=f"{cp}.hbm_ctrl.slice{i}",
+            src=f"{cp}.xbar_top", dst=f"{cp}.hbm_ctrl.slice{i}",
            distance_mm=clinks["xbar_to_hbm_mm"],
-            bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
+            bw_gbs=hbm_bw,
            kind="xbar_to_hbm",
        ))
        edges.append(Edge(
-            src=f"{cp}.hbm_ctrl.slice{i}", dst=f"{cp}.xbar.pe{i}",
+            src=f"{cp}.hbm_ctrl.slice{i}", dst=f"{cp}.xbar_top",
            distance_mm=clinks["xbar_to_hbm_mm"],
-            bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
+            bw_gbs=hbm_bw,
+            kind="hbm_to_xbar",
+        ))
+    for i in range(half, n_slices):
+        edges.append(Edge(
+            src=f"{cp}.xbar_bot", dst=f"{cp}.hbm_ctrl.slice{i}",
+            distance_mm=clinks["xbar_to_hbm_mm"],
+            bw_gbs=hbm_bw,
+            kind="xbar_to_hbm",
+        ))
+        edges.append(Edge(
+            src=f"{cp}.hbm_ctrl.slice{i}", dst=f"{cp}.xbar_bot",
+            distance_mm=clinks["xbar_to_hbm_mm"],
+            bw_gbs=hbm_bw,
            kind="hbm_to_xbar",
        ))

-    # xbar chain: pe0↔pe1↔pe2↔pe3 (top), pe4↔pe5↔pe6↔pe7 (bottom)
-    half = n_slices // 2
-    for half_start in (0, half):
-        for i in range(half_start, half_start + half - 1):
-            intra = ((i - half_start) % pe_per_corner) != (pe_per_corner - 1)
-            x_dist = clinks["xbar_chain_intra_corner_mm"] if intra else clinks["xbar_chain_inter_corner_mm"]
-            for a, b in [(i, i + 1), (i + 1, i)]:
-                edges.append(Edge(
-                    src=f"{cp}.xbar.pe{a}", dst=f"{cp}.xbar.pe{b}",
-                    distance_mm=x_dist,
-                    bw_gbs=clinks["xbar_x_bw_gbs"],
-                    kind="xbar_chain",
-                ))
+    # ── NOC ↔ xbar_top/bot ──
+    # xbar_top: primary (low routing weight), xbar_bot: secondary (high routing weight
+    # steers Dijkstra through xbar_top→bridge→xbar_bot for cross-half access)
+    noc_xbar_bw = clinks.get("noc_to_xbar_bw_gbs", 256.0)
+    noc_xbar_mm = clinks.get("noc_to_xbar_mm", 0.0)
+    for xbar_name, rw in [("xbar_top", None), ("xbar_bot", 100.0)]:
+        edges.append(Edge(
+            src=f"{cp}.noc", dst=f"{cp}.{xbar_name}",
+            distance_mm=noc_xbar_mm, bw_gbs=noc_xbar_bw,
+            routing_weight_mm=rw, kind="noc_to_xbar",
+        ))
+        edges.append(Edge(
+            src=f"{cp}.{xbar_name}", dst=f"{cp}.noc",
+            distance_mm=noc_xbar_mm, bw_gbs=noc_xbar_bw,
+            routing_weight_mm=rw, kind="xbar_to_noc",
+        ))

-    # bridge connections: pe0↔bridge.left↔pe4, pe3↔bridge.right↔pe7
-    for bname, pe_top, pe_bot in [("left", 0, half), ("right", half - 1, n_slices - 1)]:
+    # ── Bridge connections: xbar_top ↔ bridge ↔ xbar_bot ──
+    bridge_mm = clinks.get("xbar_to_bridge_mm", 3.0)
+    bridge_bw = clinks.get("xbar_to_bridge_bw_gbs", 128.0)
+    for bname in ("left", "right"):
        br_node = f"{cp}.bridge.{bname}"
-        for pe_i, br_mm_key in [(pe_top, "xbar_row_n_to_bridge_mm"),
-                                 (pe_bot, "xbar_row_s_to_bridge_mm")]:
-            xbar_node = f"{cp}.xbar.pe{pe_i}"
+        for xbar_name in ("xbar_top", "xbar_bot"):
            edges.append(Edge(
-                src=xbar_node, dst=br_node,
-                distance_mm=clinks[br_mm_key],
-                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
+                src=f"{cp}.{xbar_name}", dst=br_node,
+                distance_mm=bridge_mm, bw_gbs=bridge_bw,
                kind="xbar_to_bridge",
            ))
            edges.append(Edge(
-                src=br_node, dst=xbar_node,
-                distance_mm=clinks[br_mm_key],
-                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
+                src=br_node, dst=f"{cp}.{xbar_name}",
+                distance_mm=bridge_mm, bw_gbs=bridge_bw,
                kind="bridge_to_xbar",
            ))

-    # ucie ↔ noc (UCIe-NOC boundary; per_connection_bw_gbs = 128 GB/s, n_connections = 4)
-    _noc_ucie = clinks["noc_to_ucie"]
-    for port in cube["ucie"]["ports"]:
-        edges.append(Edge(
-            src=f"{cp}.ucie-{port}", dst=f"{cp}.noc",
-            distance_mm=0.0,
-            bw_gbs=_noc_ucie["per_connection_bw_gbs"],
-            n_connections=_noc_ucie["n_connections"],
-            kind="ucie_to_noc",
-        ))
+    # ── UCIe ↔ conn ↔ NOC ──
+    ucie_conn_bw = ucie_cfg.get("per_connection_bw_gbs", 128.0)
+    for port in ucie_cfg["ports"]:
+        ucie_id = f"{cp}.ucie-{port}"
+        for ci in range(ucie_n_conn):
+            conn_id = f"{cp}.ucie-{port}.conn{ci}"
+            edges.append(Edge(
+                src=ucie_id, dst=conn_id,
+                distance_mm=0.0, kind="ucie_internal",
+            ))
+            edges.append(Edge(
+                src=conn_id, dst=ucie_id,
+                distance_mm=0.0, kind="ucie_internal",
+            ))
+            edges.append(Edge(
+                src=conn_id, dst=f"{cp}.noc",
+                distance_mm=0.0, bw_gbs=ucie_conn_bw,
+                kind="ucie_conn_to_noc",
+            ))
+            edges.append(Edge(
+                src=f"{cp}.noc", dst=conn_id,
+                distance_mm=0.0, bw_gbs=ucie_conn_bw,
+                kind="noc_to_ucie_conn",
+            ))

-    for port in cube["ucie"]["ports"]:
-        edges.append(Edge(
-            src=f"{cp}.noc", dst=f"{cp}.ucie-{port}",
-            distance_mm=0.0,
-            bw_gbs=_noc_ucie["per_connection_bw_gbs"],
-            n_connections=_noc_ucie["n_connections"],
-            kind="noc_to_ucie",
-        ))
-
-    # noc ↔ xbar.pe{i}: wire delay is 0 (NOC traversal latency computed by TwoDMeshNocComponent);
-    # routing_weight_mm=50.0 steers PE DMA Dijkstra away from this path (prefer direct pe_dma→xbar)
-    _noc_xbar = clinks.get("noc_to_xbar", {})
-    _noc_xbar_bw = _noc_xbar.get("per_connection_bw_gbs")
-    for i in range(n_slices):
-        edges.append(Edge(
-            src=f"{cp}.noc", dst=f"{cp}.xbar.pe{i}",
-            distance_mm=0.0,
-            bw_gbs=_noc_xbar_bw,
-            routing_weight_mm=50.0,
-            kind="noc_to_xbar",
-        ))
-        edges.append(Edge(
-            src=f"{cp}.xbar.pe{i}", dst=f"{cp}.noc",
-            distance_mm=0.0,
-            bw_gbs=_noc_xbar_bw,
-            routing_weight_mm=50.0,
-            kind="xbar_to_noc",
-        ))
-
-    # m_cpu ↔ noc (command dispatch, both directions)
+    # ── m_cpu ↔ noc (command dispatch) ──
    edges.append(Edge(
        src=f"{cp}.m_cpu", dst=f"{cp}.noc",
        distance_mm=clinks["m_cpu_to_noc_mm"],
@@ -468,7 +583,7 @@ def _instantiate_cube(
        kind="command",
    ))

-    # noc ↔ sram (shared SRAM access; per_connection_bw_gbs = 128 GB/s, n_connections = 4)
+    # ── noc ↔ sram ──
    _noc_sram = clinks["noc_to_sram"]
    edges.append(Edge(
        src=f"{cp}.noc", dst=f"{cp}.sram",
@@ -550,28 +665,27 @@ def _add_inter_cube_edges(
 def _add_io_to_cube_edges(
    edges: list[Edge], sp: str, sip_spec: dict, mesh_w: int,
 ) -> None:
-    """Add IO chiplet io_cpu ↔ cube UCIe edges (bidirectional for response)."""
-    io_links = sip_spec["iochiplet"]["links"]
-    io_to_ucie_mm = io_links["io_cpu_to_ucie_mm"]
-    io_to_ucie_bw = io_links["io_cpu_to_ucie_bw_gbs"]
+    """Add IO chiplet io_ucie ↔ cube UCIe edges (bidirectional)."""
    for inst in sip_spec["iochiplet"]["instances"]:
        iid = inst["id"]
-        io_cpu_id = f"{sp}.{iid}.io_cpu"
+        phy_bw = float(inst["ucie"]["phy_bw_gbs"])
        for port in inst["cube_ports"]:
            cube_col, cube_row = port["cube"]["xy"]
            cube_id = cube_row * mesh_w + cube_col
            cube_side = port["cube_side"]
-            ucie_id = f"{sp}.cube{cube_id}.ucie-{cube_side}"
+            phy = port["phy"]
+            io_ucie_id = f"{sp}.{iid}.ucie-{phy}"
+            cube_ucie_id = f"{sp}.cube{cube_id}.ucie-{cube_side}"
            edges.append(Edge(
-                src=io_cpu_id, dst=ucie_id,
-                distance_mm=io_to_ucie_mm + port["distance_mm"],
-                bw_gbs=io_to_ucie_bw,
+                src=io_ucie_id, dst=cube_ucie_id,
+                distance_mm=port["distance_mm"],
+                bw_gbs=phy_bw,
                kind="io_to_cube",
            ))
            edges.append(Edge(
-                src=ucie_id, dst=io_cpu_id,
-                distance_mm=io_to_ucie_mm + port["distance_mm"],
-                bw_gbs=io_to_ucie_bw,
+                src=cube_ucie_id, dst=io_ucie_id,
+                distance_mm=port["distance_mm"],
+                bw_gbs=phy_bw,
                kind="cube_to_io",
            ))

@@ -704,11 +818,13 @@ def _build_sip_view(spec: dict) -> ViewGraph:
                ))

    # IO chiplets
-    io_links = sip_spec["iochiplet"]["links"]
+    io_ucie_cfg = sip_spec["iochiplet"].get("ucie", {})
+    io_noc_to_ucie_mm = float(io_ucie_cfg.get("noc_to_ucie_mm", 0.5))
    for inst in sip_spec["iochiplet"]["instances"]:
        iid = inst["id"]
        side = inst["place"]["side"]
        iy = 2.0 if side == "N" else canvas_h - 2.0
+        phy_bw = float(inst["ucie"]["phy_bw_gbs"])
        nodes[iid] = Node(
            id=iid, kind="iochiplet", impl="",
            attrs={}, pos_mm=(mesh_total_w / 2, iy), label=f"IO {iid}",
@@ -718,8 +834,8 @@ def _build_sip_view(spec: dict) -> ViewGraph:
            cube_id = cube_row * mesh_w + cube_col
            view_edges.append(Edge(
                src=iid, dst=f"cube{cube_id}",
-                distance_mm=io_links["io_cpu_to_ucie_mm"] + port["distance_mm"],
-                bw_gbs=io_links["io_cpu_to_ucie_bw_gbs"],
+                distance_mm=io_noc_to_ucie_mm + port["distance_mm"],
+                bw_gbs=phy_bw,
                kind="io_to_cube",
            ))

@@ -737,31 +853,52 @@ def _build_cube_view(spec: dict) -> ViewGraph:
    local_pos = _cube_local_positions(cube_w, cube_h)
    clinks = cube["links"]
    n_slices = cube["memory_map"]["hbm_slices_per_cube"]
+    half = n_slices // 2

    nodes: dict[str, Node] = {}
    view_edges: list[Edge] = []

-    # UCIe ports
-    for port in cube["ucie"]["ports"]:
+    # UCIe ports + connection nodes
+    ucie_cfg = cube["ucie"]
+    ucie_n_conn = ucie_cfg.get("n_connections", 1)
+    for port in ucie_cfg["ports"]:
        pid = f"ucie-{port}"
        lx, ly = local_pos[pid]
        nodes[pid] = Node(
            id=pid, kind="ucie_port", impl="ucie_v1",
            attrs={}, pos_mm=(lx, ly), label=f"UCIe-{port}",
        )
+        for ci in range(ucie_n_conn):
+            conn_id = f"ucie-{port}.conn{ci}"
+            nodes[conn_id] = Node(
+                id=conn_id, kind="ucie_conn", impl="ucie_v1",
+                attrs={"overhead_ns": 0.0}, pos_mm=(lx, ly),
+                label=f"UCIe-{port} C{ci}",
+            )

    # Named components (hbm_ctrl as single representative node in view)
    for name in ("noc", "m_cpu", "hbm_ctrl", "sram"):
        c = cube["components"][name]
-        lx, ly = local_pos[name]
+        lx, ly = local_pos.get(name, local_pos.get("hbm_ctrl"))
        nodes[name] = Node(
            id=name, kind=c["kind"], impl=c["impl"],
            attrs=c["attrs"], pos_mm=(lx, ly),
            label=name.upper().replace("_", " "),
        )

+    # xbar_top, xbar_bot
+    xbar_spec = cube["components"]["xbar"]
+    for xbar_name, xbar_cfg in [("xbar_top", xbar_spec["top"]),
+                                 ("xbar_bot", xbar_spec["bottom"])]:
+        lx, ly = local_pos[xbar_name]
+        nodes[xbar_name] = Node(
+            id=xbar_name, kind=xbar_cfg["kind"], impl=xbar_cfg["impl"],
+            attrs=xbar_cfg["attrs"], pos_mm=(lx, ly),
+            label=xbar_name.upper().replace("_", " "),
+        )
+
    # Bridges
-    for br in cube["components"]["xbar"]["bridges"]:
+    for br in xbar_spec["bridges"]:
        bname = br["id"]
        bid = f"bridge.{bname}"
        lx, ly = local_pos[bid]
@@ -771,46 +908,29 @@ def _build_cube_view(spec: dict) -> ViewGraph:
            label=f"Bridge {bname.upper()}",
        )

-    # PEs as opaque blocks + per-PE xbar entry nodes
+    # PEs as opaque blocks (no per-PE xbar nodes)
    corners = cube["pe_layout"]["corners"]
    pe_per_corner = cube["pe_layout"]["pe_per_corner"]
    corner_pos = _corner_pe_positions(cube_w, cube_h)
-    xbar_pe_spec = cube["components"]["xbar"]["pe"]
-    xbar_top_y = local_pos["xbar.top"][1]
-    xbar_bot_y = local_pos["xbar.bottom"][1]
+    mesh_data = spec.get("_mesh", {})
+    pe_noc_distances = _compute_pe_noc_distances(
+        mesh_data, corner_pos, corners, pe_per_corner,
+    ) if mesh_data else {}

    pe_idx = 0
    for corner in corners:
-        is_top = corner in ("NW", "NE")
-        xbar_y = xbar_top_y if is_top else xbar_bot_y
-        mm_key = "pe_to_xbar_row_n_mm" if is_top else "pe_to_xbar_row_s_mm"
        for ci in range(pe_per_corner):
            pid = f"pe{pe_idx}"
-            xbar_id = f"xbar.pe{pe_idx}"
            px, py = corner_pos[corner][ci]
-
            nodes[pid] = Node(
                id=pid, kind="pe", impl="",
                attrs={"corner": corner}, pos_mm=(px, py),
                label=f"PE{pe_idx}",
            )
-            nodes[xbar_id] = Node(
-                id=xbar_id, kind=xbar_pe_spec["kind"], impl=xbar_pe_spec["impl"],
-                attrs=xbar_pe_spec["attrs"], pos_mm=(px, xbar_y),
-                label=f"XBAR PE{pe_idx}",
-            )
-
-            # PE → xbar.pe_i (HBM data path)
-            view_edges.append(Edge(
-                src=pid, dst=xbar_id,
-                distance_mm=clinks[mm_key],
-                bw_gbs=clinks["pe_to_xbar_bw_gbs"],
-                kind="pe_to_xbar",
-            ))
-            # PE → noc (non-HBM data path)
+            # PE → noc (distance auto-computed from PE physical position)
            view_edges.append(Edge(
                src=pid, dst="noc",
-                distance_mm=clinks["pe_dma_to_noc_mm"],
+                distance_mm=pe_noc_distances.get(pe_idx, 0.0),
                bw_gbs=clinks["pe_dma_to_noc_bw_gbs"],
                kind="pe_to_noc",
            ))
@@ -822,60 +942,76 @@ def _build_cube_view(spec: dict) -> ViewGraph:
            ))
            pe_idx += 1

-    # Cube fabric edges
-    # xbar.pe_i → hbm_ctrl (single representative node in view)
-    for i in range(n_slices):
+    # xbar_top/bot → hbm_ctrl
+    view_edges.append(Edge(
+        src="xbar_top", dst="hbm_ctrl",
+        distance_mm=clinks["xbar_to_hbm_mm"],
+        bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
+        kind="xbar_to_hbm",
+    ))
+    view_edges.append(Edge(
+        src="xbar_bot", dst="hbm_ctrl",
+        distance_mm=clinks["xbar_to_hbm_mm"],
+        bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
+        kind="xbar_to_hbm",
+    ))
+
+    # noc ↔ xbar_top/bot
+    noc_xbar_bw = clinks.get("noc_to_xbar_bw_gbs", 256.0)
+    noc_xbar_mm = clinks.get("noc_to_xbar_mm", 0.0)
+    for xbar_name in ("xbar_top", "xbar_bot"):
        view_edges.append(Edge(
-            src=f"xbar.pe{i}", dst="hbm_ctrl",
-            distance_mm=clinks["xbar_to_hbm_mm"],
-            bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
-            kind="xbar_to_hbm",
+            src="noc", dst=xbar_name,
+            distance_mm=noc_xbar_mm, bw_gbs=noc_xbar_bw,
+            kind="noc_to_xbar",
+        ))
+        view_edges.append(Edge(
+            src=xbar_name, dst="noc",
+            distance_mm=noc_xbar_mm, bw_gbs=noc_xbar_bw,
+            kind="xbar_to_noc",
        ))

-    # xbar chain
-    half = n_slices // 2
-    for half_start in (0, half):
-        for i in range(half_start, half_start + half - 1):
-            intra = ((i - half_start) % pe_per_corner) != (pe_per_corner - 1)
-            x_dist = clinks["xbar_chain_intra_corner_mm"] if intra else clinks["xbar_chain_inter_corner_mm"]
-            for a, b in [(i, i + 1), (i + 1, i)]:
-                view_edges.append(Edge(
-                    src=f"xbar.pe{a}", dst=f"xbar.pe{b}",
-                    distance_mm=x_dist,
-                    bw_gbs=clinks["xbar_x_bw_gbs"],
-                    kind="xbar_chain",
-                ))
-
-    # bridge connections
-    for bname, pe_top, pe_bot in [("left", 0, half), ("right", half - 1, n_slices - 1)]:
+    # bridge connections: xbar_top ↔ bridge ↔ xbar_bot
+    bridge_mm = clinks.get("xbar_to_bridge_mm", 3.0)
+    bridge_bw = clinks.get("xbar_to_bridge_bw_gbs", 128.0)
+    for bname in ("left", "right"):
        br_id = f"bridge.{bname}"
-        for pe_i, br_mm_key in [(pe_top, "xbar_row_n_to_bridge_mm"),
-                                 (pe_bot, "xbar_row_s_to_bridge_mm")]:
-            xbar_id = f"xbar.pe{pe_i}"
+        for xbar_name in ("xbar_top", "xbar_bot"):
            view_edges.append(Edge(
-                src=xbar_id, dst=br_id,
-                distance_mm=clinks[br_mm_key],
-                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
+                src=xbar_name, dst=br_id,
+                distance_mm=bridge_mm, bw_gbs=bridge_bw,
                kind="xbar_to_bridge",
            ))
            view_edges.append(Edge(
-                src=br_id, dst=xbar_id,
-                distance_mm=clinks[br_mm_key],
-                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
+                src=br_id, dst=xbar_name,
+                distance_mm=bridge_mm, bw_gbs=bridge_bw,
                kind="bridge_to_xbar",
            ))

-    _noc_ucie_v = clinks["noc_to_ucie"]
-    for port in cube["ucie"]["ports"]:
-        view_edges.append(Edge(
-            src="noc", dst=f"ucie-{port}",
-            distance_mm=0.0,
-            bw_gbs=_noc_ucie_v["per_connection_bw_gbs"],
-            n_connections=_noc_ucie_v["n_connections"],
-            kind="noc_to_ucie",
-        ))
+    ucie_conn_bw_v = ucie_cfg.get("per_connection_bw_gbs", 128.0)
+    for port in ucie_cfg["ports"]:
+        for ci in range(ucie_n_conn):
+            conn_id = f"ucie-{port}.conn{ci}"
+            view_edges.append(Edge(
+                src="noc", dst=conn_id,
+                distance_mm=0.0, bw_gbs=ucie_conn_bw_v,
+                kind="noc_to_ucie_conn",
+            ))
+            view_edges.append(Edge(
+                src=conn_id, dst=f"ucie-{port}",
+                distance_mm=0.0, kind="ucie_internal",
+            ))
+            view_edges.append(Edge(
+                src=f"ucie-{port}", dst=conn_id,
+                distance_mm=0.0, kind="ucie_internal",
+            ))
+            view_edges.append(Edge(
+                src=conn_id, dst="noc",
+                distance_mm=0.0, bw_gbs=ucie_conn_bw_v,
+                kind="ucie_conn_to_noc",
+            ))

-    # m_cpu ↔ noc (command dispatch, both directions)
+    # m_cpu ↔ noc
    view_edges.append(Edge(
        src="m_cpu", dst="noc",
        distance_mm=clinks["m_cpu_to_noc_mm"],
@@ -887,7 +1023,7 @@ def _build_cube_view(spec: dict) -> ViewGraph:
        kind="command",
    ))

-    # noc ↔ sram (shared SRAM access, bidirectional)
+    # noc ↔ sram
    _noc_sram_v = clinks["noc_to_sram"]
    view_edges.append(Edge(
        src="noc", dst="sram",
@@ -0,0 +1,284 @@
+"""Auto-layout mesh generation for CUBE NOC router mesh.
+
+Generates cube_mesh.yaml describing the internal router grid, PE/UCIe/XBAR
+attachments, and HBM exclusion zone.  The file is cached with a source_hash
+so it is only regenerated when relevant topology parameters change.
+
+Algorithm (final, per Phase 1 design iteration):
+  cols          = physical_cols  (PE x-positions + relay cols for max_spacing)
+  rows_per_half = ceil(n_connections / 2)
+  total_rows    = rows_per_half * 2 + 2   (+ 2 HBM rows)
+  PEs: 1 corner per row when rows available (PEs of a corner share that row), corners at fixed positions
+  Hot path: min_connections = max(n_connections, 2)
+"""
+from __future__ import annotations
+
+import hashlib
+import math
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+
+# ── Public API ────────────────────────────────────────────────────────
+
+
+def ensure_mesh_file(cube_spec: dict, mesh_path: Path) -> dict:
+    """Return the cube mesh description, regenerating the cache file if stale.
+
+    The mesh is cached at ``mesh_path`` together with a ``source_hash`` of the
+    topology parameters it was generated from.  The cached mesh is reused only
+    when the file parses to a mapping whose ``source_hash`` equals the hash of
+    ``cube_spec``; otherwise a fresh mesh is generated and written back.
+
+    Args:
+        cube_spec: Cube topology spec handed to the hash and mesh generator.
+        mesh_path: Location of the cached YAML mesh file.
+
+    Returns:
+        The parsed (cached) or freshly generated mesh dict.
+    """
+    source_hash = _compute_source_hash(cube_spec)
+
+    if mesh_path.exists():
+        try:
+            cached = yaml.safe_load(mesh_path.read_text(encoding="utf-8"))
+        except (yaml.YAMLError, UnicodeDecodeError):
+            # The file is only a derived cache: a truncated or hand-mangled
+            # copy must not abort the caller, so fall through and rebuild it.
+            cached = None
+        # Anything that is not a mapping (empty file, list, scalar) is stale.
+        if isinstance(cached, dict) and cached.get("source_hash") == source_hash:
+            return cached
+
+    mesh = _generate_mesh(cube_spec, source_hash)
+    mesh_path.write_text(
+        yaml.dump(mesh, default_flow_style=False, sort_keys=False),
+        encoding="utf-8",
+    )
+    return mesh
+
+
+# ── Hash ──────────────────────────────────────────────────────────────
+
+
+def _compute_source_hash(cube_spec: dict) -> str:
+    """Return a short fingerprint of the spec fields that shape the mesh.
+
+    Only ``geometry``, ``pe_layout`` and ``ucie.n_connections`` take part, so
+    edits elsewhere in the spec leave the hash untouched.  The result is the
+    first 16 hex digits of the SHA-256 of their key-sorted YAML dump.
+    """
+    layout_inputs = dict(
+        geometry=cube_spec["geometry"],
+        pe_layout=cube_spec["pe_layout"],
+        ucie_n_connections=cube_spec["ucie"]["n_connections"],
+    )
+    serialized = yaml.dump(layout_inputs, sort_keys=True).encode()
+    digest = hashlib.sha256(serialized).hexdigest()
+    return digest[:16]
+
+
+# ── Layout helpers ────────────────────────────────────────────────────
+
+
+def _corner_pe_positions(
+    cube_w: float, cube_h: float
+) -> dict[str, list[tuple[float, float]]]:
+    """Return the two PE centre points of each cube corner.
+
+    Coordinates are relative to the cube origin.  PEs sit 1.5 and 4.5 in from
+    the nearest vertical edge and 1.5 in from the nearest horizontal edge;
+    each corner's list is ordered by increasing x.
+    """
+    inset_near, inset_far = 1.5, 4.5
+    west_xs = (inset_near, inset_far)
+    east_xs = (cube_w - inset_far, cube_w - inset_near)
+    north_y = inset_near
+    south_y = cube_h - inset_near
+    return {
+        "NW": [(x, north_y) for x in west_xs],
+        "NE": [(x, north_y) for x in east_xs],
+        "SW": [(x, south_y) for x in west_xs],
+        "SE": [(x, south_y) for x in east_xs],
+    }
+
+
+def _compute_col_positions(cube_w: float, pe_positions: dict) -> list[float]:
+    """Return ascending grid-column x positions.
+
+    Every distinct PE x coordinate becomes a column.  Where two neighbouring
+    PE columns are more than 3 (plus a 0.01 tolerance) apart, relay columns
+    are inserted every 3 units, rounded to one decimal, as long as the relay
+    stays more than 0.5 short of the next PE column.  ``cube_w`` is accepted
+    but not read.
+    """
+    max_spacing = 3.0
+    distinct_xs = sorted({x for pts in pe_positions.values() for x, _y in pts})
+
+    columns: list[float] = []
+    for target in distinct_xs:
+        # The first column has no predecessor, so no gap needs bridging.
+        while columns and target - columns[-1] > max_spacing + 0.01:
+            relay = columns[-1] + max_spacing
+            if relay >= target - 0.5:
+                # A relay this close to the PE column would be redundant.
+                break
+            columns.append(round(relay, 1))
+        columns.append(target)
+    return columns
+
+
+def _compute_row_positions(
+    cube_h: float, n_connections: int, pe_positions: dict
+) -> tuple[list[float], int]:
+    """Return grid-row y positions and the number of PE rows per half.
+
+    The result is ``(y_positions, rows_per_half)`` with rows ordered as
+    [top PE rows] [HBM top] [HBM bottom] [bottom PE rows].  At least two
+    connections are assumed, so ``rows_per_half`` is never below 1.
+    ``pe_positions`` is accepted but not read.
+
+    With more than one row per half, the rows are spread evenly between the
+    outer PE row and the adjacent HBM row *inclusive*, so the innermost PE
+    row shares its y value with that HBM row.
+    """
+    rows_per_half = math.ceil(max(n_connections, 2) / 2)
+
+    y_top_pe = 1.5
+    y_hbm_top = cube_h / 2 - 1.5
+    y_hbm_bot = cube_h / 2 + 1.5
+    y_bot_pe = cube_h - 1.5
+
+    if rows_per_half == 1:
+        # A single row per half sits on the PE line itself (not rounded).
+        upper = [y_top_pe]
+        lower = [y_bot_pe]
+    else:
+        intervals = rows_per_half - 1
+        upper_step = (y_hbm_top - y_top_pe) / intervals
+        lower_step = (y_bot_pe - y_hbm_bot) / intervals
+        upper = [
+            round(y_top_pe + k * upper_step, 1) for k in range(rows_per_half)
+        ]
+        lower = [
+            round(y_hbm_bot + k * lower_step, 1) for k in range(rows_per_half)
+        ]
+
+    hbm = [round(y_hbm_top, 1), round(y_hbm_bot, 1)]
+    return upper + hbm + lower, rows_per_half
+
+
+# ── Mesh generation ──────────────────────────────────────────────────
+
+
+def _generate_mesh(cube_spec: dict, source_hash: str) -> dict:
+    """Build the mesh description dict for one cube.
+
+    Lays out a router grid (columns from PE x positions, rows from the UCIe
+    connection count), marks the central HBM cells as excluded, and records
+    which endpoints attach to each remaining router.
+
+    Args:
+        cube_spec: Cube topology spec.  Reads ``geometry.cube_mm.w/h``,
+            ``pe_layout.corners``, ``pe_layout.pe_per_corner`` and
+            ``ucie.n_connections``.
+        source_hash: Stored verbatim under ``"source_hash"`` in the result.
+
+    Returns:
+        A dict with keys ``"source_hash"``, ``"mesh"`` (row/column counts),
+        ``"routers"`` (``r{row}c{col}`` -> ``None`` for excluded cells, else
+        ``{"pos_mm", "attach"}``) and ``"xbar"`` (sorted, de-duplicated router
+        keys that hold top-half / bottom-half PEs).
+    """
+    geom = cube_spec["geometry"]
+    cube_w = geom["cube_mm"]["w"]
+    cube_h = geom["cube_mm"]["h"]
+    pe_layout = cube_spec["pe_layout"]
+    corners = pe_layout["corners"]
+    pe_per_corner = pe_layout["pe_per_corner"]
+    n_connections = cube_spec["ucie"]["n_connections"]
+
+    pe_positions = _corner_pe_positions(cube_w, cube_h)
+    col_xs = _compute_col_positions(cube_w, pe_positions)
+    row_ys, rows_per_half = _compute_row_positions(
+        cube_h, n_connections, pe_positions
+    )
+    n_rows = len(row_ys)
+    n_cols = len(col_xs)
+
+    # HBM exclusion zone: center rows, center cols
+    hbm_row_start = rows_per_half      # first HBM row index
+    hbm_row_end = rows_per_half + 1    # last HBM row index (inclusive)
+    hbm_col_start = n_cols // 2 - 1    # center-left col
+    hbm_col_end = n_cols // 2          # center-right col
+
+    # Build routers dict
+    # Every grid cell gets a key; cells inside the HBM zone map to None so
+    # later lookups can tell "excluded" apart from "present but empty".
+    routers: dict[str, Any] = {}
+    for r in range(n_rows):
+        for c in range(n_cols):
+            key = f"r{r}c{c}"
+            if (hbm_row_start <= r <= hbm_row_end
+                    and hbm_col_start <= c <= hbm_col_end):
+                routers[key] = None  # HBM excluded
+            else:
+                routers[key] = {
+                    "pos_mm": [col_xs[c], row_ys[r]],
+                    "attach": [],
+                }
+
+    # PE assignment: map each PE to a router based on corner and position.
+    # All PEs in the same corner share one row. Corner order determines row:
+    #   Top half:  NW → row 0, NE → row 1
+    #   Bottom half: SW → row 4, SE → row 5  (for rows_per_half=2)
+    # PE indices are handed out in `corners` order, then by position in corner.
+    pe_idx = 0
+    top_pe_routers: list[str] = []
+    bot_pe_routers: list[str] = []
+
+    top_corners = [c for c in corners if c in ("NW", "NE")]
+    bot_corners = [c for c in corners if c in ("SW", "SE")]
+
+    for corner in corners:
+        is_top = corner in ("NW", "NE")
+
+        if is_top:
+            corner_idx = top_corners.index(corner)
+            # Clamp to the last top PE row when there are fewer rows than corners.
+            row = corner_idx if corner_idx < rows_per_half else rows_per_half - 1
+        else:
+            corner_idx = bot_corners.index(corner)
+            bot_start = hbm_row_end + 1
+            # Clamp to the last grid row when there are fewer rows than corners.
+            row = bot_start + corner_idx if (bot_start + corner_idx) < n_rows else n_rows - 1
+
+        # NOTE(review): indexes pe_positions[corner][ci]; assumes the helper
+        # returns at least pe_per_corner entries per corner — confirm.
+        for ci in range(pe_per_corner):
+            pe_x, _pe_y = pe_positions[corner][ci]
+            # Nearest column by x distance (first match wins on ties).
+            col = min(range(n_cols), key=lambda c: abs(col_xs[c] - pe_x))
+
+            key = f"r{row}c{col}"
+            router = routers[key]
+            if router is not None:
+                router["attach"].append(f"pe{pe_idx}.dma")
+                router["attach"].append(f"pe{pe_idx}.cpu")
+                if is_top:
+                    top_pe_routers.append(key)
+                else:
+                    bot_pe_routers.append(key)
+
+            # The index advances even if the PE landed on an excluded cell.
+            pe_idx += 1
+
+    # M_CPU and SRAM attachments (HBM row, leftmost available)
+    # Only column 0 is tried; if that cell is excluded nothing is attached.
+    mcpu_key = f"r{hbm_row_start}c0"
+    if routers.get(mcpu_key) is not None:
+        routers[mcpu_key]["attach"].append("m_cpu")
+
+    sram_key = f"r{hbm_row_end}c0"
+    if routers.get(sram_key) is not None:
+        routers[sram_key]["attach"].append("sram")
+
+    # UCIe PE rows: top-half rows + bottom-half rows (1 per PE row)
+    ucie_pe_rows = []
+    for r in range(rows_per_half):
+        ucie_pe_rows.append(r)
+    for r in range(rows_per_half):
+        ucie_pe_rows.append(hbm_row_end + 1 + r)
+
+    # UCIe-E distribution: 1 per PE row, rightmost column
+    for i, row in enumerate(ucie_pe_rows):
+        key = f"r{row}c{n_cols - 1}"
+        router = routers.get(key)
+        if router is not None:
+            router["attach"].append(f"ucie_e.c{i}")
+
+    # UCIe-W distribution: 1 per PE row, leftmost column (mirror of E)
+    for i, row in enumerate(ucie_pe_rows):
+        key = f"r{row}c0"
+        router = routers.get(key)
+        if router is not None:
+            router["attach"].append(f"ucie_w.c{i}")
+
+    # UCIe PE columns: left-half + right-half PE columns (for N/S distribution)
+    # NOTE(review): columns equal to hbm_col_start / hbm_col_end are filtered
+    # out here although rows 0 and n_rows-1 lie outside the HBM rows, so N/S
+    # can end up with fewer connections than E/W — confirm this is intended.
+    pe_xs = set()
+    for positions in pe_positions.values():
+        for x, _y in positions:
+            pe_xs.add(x)
+    left_pe_cols = sorted(c for c in range(n_cols)
+                          if col_xs[c] in pe_xs and c < hbm_col_start)
+    right_pe_cols = sorted(c for c in range(n_cols)
+                           if col_xs[c] in pe_xs and c > hbm_col_end)
+    n_ucie = len(ucie_pe_rows)
+    half_n = n_ucie // 2
+    ucie_pe_cols = left_pe_cols[:half_n] + right_pe_cols[:n_ucie - half_n]
+
+    # UCIe-N distribution: PE columns on top row (row 0)
+    for i, col in enumerate(ucie_pe_cols):
+        key = f"r0c{col}"
+        router = routers.get(key)
+        if router is not None:
+            router["attach"].append(f"ucie_n.c{i}")
+
+    # UCIe-S distribution: PE columns on bottom row (row n_rows-1)
+    for i, col in enumerate(ucie_pe_cols):
+        key = f"r{n_rows - 1}c{col}"
+        router = routers.get(key)
+        if router is not None:
+            router["attach"].append(f"ucie_s.c{i}")
+
+    return {
+        "source_hash": source_hash,
+        "mesh": {
+            "rows": n_rows,
+            "cols": n_cols,
+        },
+        "routers": routers,
+        "xbar": {
+            "top": {"routers": sorted(set(top_pe_routers))},
+            "bottom": {"routers": sorted(set(bot_pe_routers))},
+        },
+    }