Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep

- Probe CLI: restructured output (tables first, routes below), per-hop timestamps, split cross-cube into best/worst cases, D2H read section - UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix cross-cube-best < cross-half latency inversion - HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing effective BW from 256 to 204.8 GB/s - Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases - Probe default data size: 4KB -> 32KB for more realistic measurements - IOChiplet NOC + D2H topology and tests - NOC mesh, xbar, BW occupancy components and tests - Cube mesh visualization diagram 278 tests pass. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 01:16:18 -07:00
parent 6f43807900
commit d75da439c6
24 changed files with 3456 additions and 501 deletions
@@ -10,7 +10,7 @@ from pathlib import Path

 from kernbench.policy.address.phyaddr import PhysAddr
 from kernbench.policy.routing.router import AddressResolver, PathRouter
-from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
+from kernbench.runtime_api.kernel import MemoryReadMsg, MemoryWriteMsg, PeDmaMsg
 from kernbench.sim_engine.engine import GraphEngine
 from kernbench.topology.builder import load_topology
 from kernbench.topology.types import TopologyGraph
@@ -54,6 +54,46 @@ def _formula_breakdown(
    return wire_ns, overhead_ns, drain_ns, wire_ns + overhead_ns + drain_ns


+def _hop_timestamps(
+    path: list[str], nbytes: int, edge_map: dict, graph: TopologyGraph,
+) -> list[tuple[str, float, str]]:
+    """Return per-hop timestamps: [(node_short, cumulative_ns, annotation), ...].
+
+    Annotations mark bottleneck edges and significant overhead nodes.
+    """
+    ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
+    # Find bottleneck BW for annotation
+    bws = [e.bw_gbs for i in range(len(path) - 1)
+           if (e := edge_map.get((path[i], path[i + 1]))) and e.bw_gbs]
+    bn_bw = min(bws) if bws else None
+
+    cumulative = 0.0
+    result: list[tuple[str, float, str]] = []
+    result.append((_short_name(path[0]), 0.0, ""))
+
+    for i in range(len(path) - 1):
+        e = edge_map.get((path[i], path[i + 1]))
+        ann = ""
+        if e:
+            cumulative += e.distance_mm * ns_per_mm
+            if bn_bw is not None and e.bw_gbs and e.bw_gbs == bn_bw:
+                ann = f"<BN:{e.bw_gbs:.0f}GB/s>"
+        node = graph.nodes.get(path[i + 1])
+        if node:
+            ovhd = float(node.attrs.get("overhead_ns", 0.0))
+            cumulative += ovhd
+            if ovhd > 0 and not ann:
+                ann = f"+{ovhd:.1f}ns"
+        result.append((_short_name(path[i + 1]), cumulative, ann))
+
+    # Add drain at terminal
+    if bn_bw and nbytes > 0:
+        cumulative += nbytes / bn_bw
+        result[-1] = (result[-1][0], cumulative, result[-1][2] + f" drain:{nbytes/bn_bw:.1f}ns")
+
+    return result
+
+
 def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
    """Per-request bottleneck: single request uses one connection."""
    bws: list[float] = []
@@ -85,6 +125,41 @@ def _short_path(path: list[str]) -> str:
    return " -> ".join(_short_name(n) for n in path)


+def _print_hop_trace(timestamps: list[tuple[str, float, str]], indent: str = "    ") -> None:
+    """Print per-hop timestamp trace."""
+    for node, t_ns, ann in timestamps:
+        ann_str = f"  {ann}" if ann else ""
+        print(f"{indent}{t_ns:>8.2f}ns  {node}{ann_str}")
+
+
+SWEEP_SIZES = [4096, 16384, 65536, 262144, 1048576]
+SWEEP_LABELS = ["4KB", "16KB", "64KB", "256KB", "1MB"]
+
+
+def _sweep_util(ovhd_ns: float, wire_ns: float, bn_bw: float | None, sizes: list[int] = SWEEP_SIZES) -> list[float]:
+    """Compute utilization % for each data size using formula model."""
+    if bn_bw is None or bn_bw <= 0:
+        return [0.0] * len(sizes)
+    result = []
+    for nb in sizes:
+        drain = nb / bn_bw
+        total = ovhd_ns + wire_ns + drain
+        eff = nb / total if total > 0 else 0.0
+        result.append(eff / bn_bw * 100)
+    return result
+
+
+def _print_sweep_table(case_names: list[str], sweep_data: list[list[float]]) -> None:
+    """Print compact BW saturation table."""
+    hdr = f"  {'Case':<26}" + "".join(f" {l:>7}" for l in SWEEP_LABELS)
+    print(f"\n  BW Saturation (Util% by data size):")
+    print(hdr)
+    print("  " + "-" * (26 + 8 * len(SWEEP_LABELS)))
+    for name, utils in zip(case_names, sweep_data):
+        cols = "".join(f" {u:>6.1f}%" for u in utils)
+        print(f"  {name:<26}{cols}")
+
+
 # -- Probe runner -----------------------------------------------------


@@ -96,25 +171,18 @@ def run_probe(topology_path: str, case_filter: str | None = None) -> int:
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

-    nbytes = 4096
+    nbytes = 32768
    show_all = case_filter is None or case_filter == "all"

-    # === H2D Write ===
+    # === Collect H2D results ===
    h2d_cases = [
        ("h2d-1hop", 0, 1),
        ("h2d-2hop", 4, 2),
        ("h2d-3hop", 8, 3),
        ("h2d-4hop", 12, 4),
    ]
-    h2d_results: list[tuple[str, int, float, float, float | None]] = []
-    h2d_paths: list[tuple[str, list[str], list[str], list[str]]] = []
-
-    print()
-    print("=== H2D Write Latency (IO->HBM, varying hop count) ===")
-    print(f"  {'Case':<14} {'Target':<16} {'Hops':>4}   {'Actual':>8}"
-          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
-          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
-    print("  " + "-" * 115)
+    h2d_results: list[tuple[str, int, float, float, float | None, float, float, float, float, float]] = []
+    h2d_route_data: list[tuple[str, list[str], list[str], list[str], list[str]]] = []

    for name, cube, hops in h2d_cases:
        if not show_all and case_filter != name:
@@ -144,52 +212,67 @@ def run_probe(topology_path: str, case_filter: str | None = None) -> int:
        full_path = leg1 + leg2[1:] + leg3[1:]
        bn_bw = _bottleneck_bw(full_path, edge_map)

-        # Forward path breakdown only (response path is implicit in actual_ns)
        fwd_path = leg1 + leg2[1:] + leg3[1:]
        wire, ovhd, drain, formula = _formula_breakdown(fwd_path, nbytes, edge_map, graph)

        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0

-        h2d_results.append((name, hops, total_ns, eff_bw, bn_bw))
-        h2d_paths.append((name, leg1, leg2, leg3))
-        print(f"  {name:<14} cube{cube}.pe0{'':<8} {hops:>4}   {total_ns:>8.2f}"
-              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
-              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")
+        h2d_results.append((name, hops, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct))
+        h2d_route_data.append((name, leg1, leg2, leg3, fwd_path))

-    if len(h2d_results) >= 2:
-        lats = [r[2] for r in h2d_results]
-        mono = all(lats[i] < lats[i + 1] for i in range(len(lats) - 1))
-        sym = "[v]" if mono else "[x]"
-        print(f"  {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")
-
-    if h2d_paths:
-        print()
-        print("  Route Details:")
-        print(f"  {'Case':<14} {'Leg':>4}  Path")
-        print("  " + "-" * 80)
-        for name, leg1, leg2, leg3 in h2d_paths:
-            print(f"  {name:<14} {'L1':>4}  {_short_path(leg1)}")
-            print(f"  {'':<14} {'L2':>4}  {_short_path(leg2)}")
-            print(f"  {'':<14} {'L3':>4}  {_short_path(leg3)}")
-
-    # === PE DMA → HBM (direct PE-level injection) ===
-    # (name, sip, src_cube, src_pe, dst_cube, dst_pe)
-    pe_cases = [
-        ("pe-local-hbm", 0, 0, 0, 0, 0),        # pe0 → slice0 (local, 256 GB/s)
-        ("pe-same-half-hbm", 0, 0, 0, 0, 1),     # pe0 → slice1 (xbar chain, 128 GB/s)
-        ("pe-cross-half-hbm", 0, 0, 0, 0, 4),    # pe0 → slice4 (xbar chain, 128 GB/s)
-        ("pe-cross-cube-hbm", 0, 0, 0, 1, 0),    # cube0.pe0 → cube1.slice0 (NOC, 128 GB/s)
+    # === Collect D2H Read results ===
+    d2h_cases = [
+        ("d2h-1hop", 0, 1),
+        ("d2h-2hop", 4, 2),
+        ("d2h-3hop", 8, 3),
+        ("d2h-4hop", 12, 4),
    ]
-    pe_results: list[tuple[str, float, float, float | None]] = []
-    pe_paths: list[tuple[str, list[str]]] = []
+    d2h_results: list[tuple[str, int, float, float, float | None, float, float, float, float, float]] = []
+    d2h_route_data: list[tuple[str, list[str], list[str], list[str], list[str]]] = []

-    print()
-    print("=== PE DMA Latency (pe_dma -> xbar -> HBM, direct injection) ===")
-    print(f"  {'Case':<22} {'Target':<28} {'Actual':>8}"
-          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
-          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
-    print("  " + "-" * 120)
+    for name, cube, hops in d2h_cases:
+        if not show_all and case_filter != name:
+            continue
+        engine = GraphEngine(graph)
+        pa = _hbm_pa(sip=0, cube=cube, pe_id=0, spec=spec)
+        msg = MemoryReadMsg(
+            correlation_id="probe", request_id=name,
+            src_sip=0, src_cube=cube, src_pe=0,
+            src_pa=pa, nbytes=nbytes,
+        )
+        h = engine.submit(msg)
+        engine.wait(h)
+        _, trace = engine.get_completion(h)
+        total_ns = trace["total_ns"]
+        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
+
+        pa_obj = PhysAddr.decode(pa)
+        dst_node = resolver.resolve(pa_obj)
+
+        pcie_ep = resolver.find_pcie_ep(0)
+        fwd_path = router.find_memory_path(pcie_ep, dst_node)
+        rev_path = list(reversed(fwd_path))
+        bn_bw = _bottleneck_bw(fwd_path, edge_map)
+
+        wire, ovhd, drain, formula = _formula_breakdown(fwd_path, nbytes, edge_map, graph)
+
+        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
+        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0
+
+        d2h_results.append((name, hops, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct))
+        d2h_route_data.append((name, fwd_path, rev_path, [], fwd_path))
+
+    # === Collect PE DMA results ===
+    pe_cases = [
+        ("pe-local-hbm", 0, 0, 0, 0, 0),
+        ("pe-same-half-hbm", 0, 0, 0, 0, 1),
+        ("pe-cross-half-hbm", 0, 0, 0, 0, 4),
+        ("pe-cross-cube-hbm-best", 0, 0, 0, 1, 0),   # adjacent cube
+        ("pe-cross-cube-hbm-worst", 0, 0, 0, 15, 0),  # diagonal far cube
+    ]
+    pe_results: list[tuple[str, float, float, float | None, float, float, float, float, float]] = []
+    pe_route_data: list[tuple[str, list[str], str]] = []

    for name, sip, src_cube, src_pe, dst_cube, dst_pe in pe_cases:
        if not show_all and case_filter != name:
@@ -219,26 +302,146 @@ def run_probe(topology_path: str, case_filter: str | None = None) -> int:
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0

        target_str = f"c{src_cube}.pe{src_pe}->c{dst_cube}.slice{dst_pe}"
-        pe_results.append((name, total_ns, eff_bw, bn_bw))
-        pe_paths.append((name, dma_path))
-        print(f"  {name:<22} {target_str:<28} {total_ns:>8.2f}"
+        pe_results.append((name, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct))
+        pe_route_data.append((name, dma_path, target_str))
+
+    # ================================================================
+    # OUTPUT: Summary tables first, then route details
+    # ================================================================
+
+    # --- H2D Summary Table ---
+    print()
+    print(f"=== H2D Write Latency (IO->HBM, data={nbytes}B) ===")
+    print(f"  {'Case':<14} {'Target':<16} {'Hops':>4}   {'Actual':>8}"
+          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
+          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
+    print("  " + "-" * 115)
+
+    for i, (name, hops, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct) in enumerate(h2d_results):
+        cube = h2d_cases[i][1] if i < len(h2d_cases) else 0
+        print(f"  {name:<14} cube{cube}.pe0{'':<8} {hops:>4}   {total_ns:>8.2f}"
+              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
+              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")
+
+    if len(h2d_results) >= 2:
+        lats = [r[2] for r in h2d_results]
+        mono = all(lats[i] < lats[i + 1] for i in range(len(lats) - 1))
+        sym = "[v]" if mono else "[x]"
+        print(f"  {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")
+
+    if h2d_results:
+        h2d_sweep = [_sweep_util(r[5], r[7], r[4]) for r in h2d_results]
+        _print_sweep_table([r[0] for r in h2d_results], h2d_sweep)
+
+    # --- D2H Summary Table ---
+    print()
+    print(f"=== D2H Read Latency (HBM->IO, data={nbytes}B) ===")
+    print(f"  {'Case':<14} {'Source':<16} {'Hops':>4}   {'Actual':>8}"
+          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
+          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
+    print("  " + "-" * 115)
+
+    for i, (name, hops, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct) in enumerate(d2h_results):
+        cube = d2h_cases[i][1] if i < len(d2h_cases) else 0
+        print(f"  {name:<14} cube{cube}.pe0{'':<8} {hops:>4}   {total_ns:>8.2f}"
+              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
+              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")
+
+    if len(d2h_results) >= 2:
+        lats = [r[2] for r in d2h_results]
+        mono = all(lats[i] < lats[i + 1] for i in range(len(lats) - 1))
+        sym = "[v]" if mono else "[x]"
+        print(f"  {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")
+
+    if d2h_results:
+        # D2H fixed cost = actual_total - drain (includes fwd+rev overhead)
+        d2h_sweep = [_sweep_util(r[2] - r[6], 0.0, r[4]) for r in d2h_results]
+        _print_sweep_table([r[0] for r in d2h_results], d2h_sweep)
+
+    # H2D vs D2H comparison
+    if h2d_results and d2h_results and len(h2d_results) == len(d2h_results):
+        all_gte = all(d2h_results[i][2] >= h2d_results[i][2] for i in range(len(h2d_results)))
+        sym = "[v]" if all_gte else "[x]"
+        print(f"  {sym} D2H >= H2D (reverse data path): {'PASS' if all_gte else 'FAIL'}")
+
+    # --- PE DMA Summary Table ---
+    print()
+    print(f"=== PE DMA Latency (pe_dma -> xbar -> HBM, data={nbytes}B) ===")
+    print(f"  {'Case':<26} {'Target':<28} {'Actual':>8}"
+          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
+          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
+    print("  " + "-" * 124)
+
+    for name, total_ns, eff_bw, bn_bw, ovhd, drain, wire, ovhd_pct, drain_pct in pe_results:
+        target_str = [t for n, _, t in pe_route_data if n == name]
+        t_str = target_str[0] if target_str else ""
+        print(f"  {name:<26} {t_str:<28} {total_ns:>8.2f}"
              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")

    if len(pe_results) >= 2:
        local = [r for r in pe_results if "local" in r[0]]
-        chain = [r for r in pe_results if "local" not in r[0]]
-        if local and chain:
+        remote = [r for r in pe_results if "local" not in r[0]]
+        if local and remote:
            print(f"  * Local BN: {_fmt_bw(local[0][3])} GB/s, "
-                  f"Chain/NOC BN: {_fmt_bw(chain[0][3])} GB/s")
+                  f"Remote BN: {_fmt_bw(remote[0][3])} GB/s")
+        best = [r for r in pe_results if "best" in r[0]]
+        worst = [r for r in pe_results if "worst" in r[0]]
+        if best and worst:
+            sym = "[v]" if best[0][1] < worst[0][1] else "[x]"
+            print(f"  {sym} Cross-cube best < worst: {'PASS' if best[0][1] < worst[0][1] else 'FAIL'}"
+                  f"  ({best[0][1]:.2f}ns < {worst[0][1]:.2f}ns)")

-    if pe_paths:
+    if pe_results:
+        pe_sweep = [_sweep_util(r[4], r[6], r[3]) for r in pe_results]
+        _print_sweep_table([r[0] for r in pe_results], pe_sweep)
+
+    # ================================================================
+    # ROUTE DETAILS (grouped below all tables)
+    # ================================================================
+    print()
+    print("=" * 60)
+    print("  ROUTE DETAILS (per-hop timestamps)")
+    print("=" * 60)
+
+    # --- H2D Routes ---
+    if h2d_route_data:
        print()
-        print("  Route Details:")
-        print(f"  {'Case':<22}  Path")
-        print("  " + "-" * 80)
-        for name, dma_path in pe_paths:
-            print(f"  {name:<22}  {_short_path(dma_path)}")
+        print("  --- H2D Write Routes ---")
+        for name, leg1, leg2, leg3, fwd_path in h2d_route_data:
+            timestamps = _hop_timestamps(fwd_path, nbytes, edge_map, graph)
+            print(f"\n  [{name}]")
+            print(f"    Leg1: {_short_path(leg1)}")
+            print(f"    Leg2: {_short_path(leg2)}")
+            print(f"    Leg3: {_short_path(leg3)}")
+            print(f"    Per-hop trace:")
+            _print_hop_trace(timestamps, indent="      ")
+
+    # --- D2H Routes ---
+    if d2h_route_data:
+        print()
+        print("  --- D2H Read Routes ---")
+        for name, fwd_path, rev_path, _, _ in d2h_route_data:
+            timestamps_fwd = _hop_timestamps(fwd_path, 0, edge_map, graph)
+            timestamps_rev = _hop_timestamps(rev_path, nbytes, edge_map, graph)
+            print(f"\n  [{name}]")
+            print(f"    Fwd (cmd):  {_short_path(fwd_path)}")
+            print(f"    Rev (data): {_short_path(rev_path)}")
+            print(f"    Forward cmd trace (no data):")
+            _print_hop_trace(timestamps_fwd, indent="      ")
+            print(f"    Reverse data trace:")
+            _print_hop_trace(timestamps_rev, indent="      ")
+
+    # --- PE DMA Routes ---
+    if pe_route_data:
+        print()
+        print("  --- PE DMA Routes ---")
+        for name, dma_path, target_str in pe_route_data:
+            timestamps = _hop_timestamps(dma_path, nbytes, edge_map, graph)
+            print(f"\n  [{name}] {target_str}")
+            print(f"    Path: {_short_path(dma_path)}")
+            print(f"    Per-hop trace:")
+            _print_hop_trace(timestamps, indent="      ")

    print()
    return 0