Add web topology viewer with hot path visualization

- FastAPI backend (server.py) with REST API + WebSocket for event streaming
- SVG-based topology viewer (index.html) with SIP/CUBE/PE drill-down views
- Event logging infrastructure (event_log.py) generating events from real
  probe cases (H2D/D2H/PE-DMA) and bench workloads (QKV GEMM single/multi-PE)
- Timeline replay engine with play/pause, speed control, and scrubbing
- Workload selector dropdown grouped by category (Probe/Bench)
- CLI entry points: kernbench web, ./kernbench wrapper scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-19 03:19:19 -07:00
parent fc6abbc8ee
commit dcbc41571f
8 changed files with 2904 additions and 1 deletions
+6
View File
@@ -0,0 +1,6 @@
#!/usr/bin/env python
"""KernBench CLI entry point — run from project root."""
import sys
from kernbench.cli.main import main
sys.exit(main())
+2
View File
@@ -0,0 +1,2 @@
@echo off
C:\Users\ywkang\AppData\Local\Python\pythoncore-3.14-64\python.exe "%~dp0kernbench" %*
+5 -1
View File
@@ -6,11 +6,15 @@ build-backend = "setuptools.build_meta"
name = "kernbench" name = "kernbench"
version = "0.1.0" version = "0.1.0"
requires-python = ">=3.10" requires-python = ">=3.10"
dependencies = ["pytest", "simpy", "pyyaml"] dependencies = ["pytest", "simpy", "pyyaml", "fastapi>=0.110", "uvicorn[standard]>=0.29", "websockets>=12"]
[project.scripts] [project.scripts]
kernbench = "kernbench.cli.main:main" kernbench = "kernbench.cli.main:main"
[tool.setuptools.packages.find]
where = ["src", "."]
include = ["kernbench*", "benches*"]
[project.optional-dependencies] [project.optional-dependencies]
dev = [ dev = [
"pytest>=7", "pytest>=7",
+11
View File
@@ -28,6 +28,11 @@ def build_parser() -> argparse.ArgumentParser:
probep.add_argument("--case", default="all", help="Case name or 'all' (default: all)") probep.add_argument("--case", default="all", help="Case name or 'all' (default: all)")
probep.set_defaults(_handler=cmd_probe) probep.set_defaults(_handler=cmd_probe)
webp = sub.add_parser("web", help="Launch topology viewer in browser")
webp.add_argument("--port", type=int, default=8765, help="HTTP port (default: 8765)")
webp.add_argument("--no-open", action="store_true", help="Don't auto-open browser")
webp.set_defaults(_handler=cmd_web)
return p return p
@@ -36,6 +41,12 @@ def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
return GraphEngine(topo_obj) return GraphEngine(topo_obj)
def cmd_web(args) -> int:
from kernbench.web.server import serve
serve(port=args.port, open_browser=not args.no_open)
return 0
def cmd_run(args) -> int: def cmd_run(args) -> int:
print("> Running benchmark with:", args) print("> Running benchmark with:", args)
+604
View File
@@ -0,0 +1,604 @@
"""Simulation Event Logging Infrastructure.
Provides an EventLogger that records structured events during simulation,
and workload generators that run real probe/bench cases through the engine.
"""
from __future__ import annotations
import json
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Any
@dataclass
class SimEvent:
"""A single simulation event."""
t_ns: float # simulation time (ns)
type: str # "submit", "hop", "process", "complete"
request_id: int # unique request ID
src: str = "" # source node_id
dst: str = "" # destination node_id
component: str = "" # component that generated the event
nbytes: int = 0 # payload size
latency_ns: float = 0.0 # latency added at this step
msg_type: str = "" # "memory_write", "memory_read", "pe_dma", etc.
metadata: dict = field(default_factory=dict)
def to_dict(self) -> dict:
d = asdict(self)
return {k: v for k, v in d.items() if v or k in ("t_ns", "request_id")}
class EventLogger:
"""Collects simulation events. Thread-safe append-only log."""
def __init__(self):
self._events: list[SimEvent] = []
self._next_id = 0
def new_request_id(self) -> int:
rid = self._next_id
self._next_id += 1
return rid
def log(self, event: SimEvent) -> None:
self._events.append(event)
def submit(self, t_ns: float, request_id: int, entry_node: str,
nbytes: int, msg_type: str, **meta) -> None:
self.log(SimEvent(
t_ns=t_ns, type="submit", request_id=request_id,
dst=entry_node, nbytes=nbytes, msg_type=msg_type,
metadata=meta,
))
def hop(self, t_ns: float, request_id: int, src: str, dst: str,
nbytes: int, latency_ns: float, **meta) -> None:
self.log(SimEvent(
t_ns=t_ns, type="hop", request_id=request_id,
src=src, dst=dst, nbytes=nbytes, latency_ns=latency_ns,
metadata=meta,
))
def process(self, t_ns: float, request_id: int, component: str,
latency_ns: float, **meta) -> None:
self.log(SimEvent(
t_ns=t_ns, type="process", request_id=request_id,
component=component, latency_ns=latency_ns,
metadata=meta,
))
def complete(self, t_ns: float, request_id: int, **meta) -> None:
self.log(SimEvent(
t_ns=t_ns, type="complete", request_id=request_id,
metadata=meta,
))
@property
def events(self) -> list[SimEvent]:
return list(self._events)
def to_json(self) -> str:
return json.dumps([e.to_dict() for e in self._events], indent=None)
def clear(self) -> None:
self._events.clear()
@property
def duration_ns(self) -> float:
if not self._events:
return 0.0
return max(e.t_ns for e in self._events)
# ══════════════════════════════════════════════════════════════════
# Real workload generators — run actual simulation engine
# ══════════════════════════════════════════════════════════════════
_DEFAULT_TOPOLOGY = Path(__file__).parents[2] / ".." / "topology.yaml"
def _find_topology() -> Path:
"""Locate topology.yaml (search cwd → repo root)."""
candidates = [
Path.cwd() / "topology.yaml",
_DEFAULT_TOPOLOGY.resolve(),
]
for p in candidates:
if p.exists():
return p
raise FileNotFoundError("topology.yaml not found")
def _path_to_events(
path: list[str],
nbytes: int,
rid: int,
msg_type: str,
case_name: str,
graph,
edge_map: dict,
t_offset: float = 0.0,
) -> list[dict]:
"""Convert a simulation path + topology graph into hop events with real timing."""
ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
events: list[dict] = []
t = t_offset
events.append({
"t_ns": round(t, 3), "type": "submit", "request_id": rid,
"dst": path[0], "nbytes": nbytes, "msg_type": msg_type,
"metadata": {"name": case_name},
})
for i in range(len(path) - 1):
src, dst = path[i], path[i + 1]
e = edge_map.get((src, dst))
hop_ns = 0.0
ann = {}
if e:
hop_ns += e.distance_mm * ns_per_mm
ann["distance_mm"] = e.distance_mm
if e.bw_gbs:
ann["bw_gbs"] = e.bw_gbs
node = graph.nodes.get(dst)
if node:
ovhd = float(node.attrs.get("overhead_ns", 0.0))
if ovhd > 0:
hop_ns += ovhd
ann["overhead_ns"] = ovhd
t += hop_ns
events.append({
"t_ns": round(t, 3), "type": "hop", "request_id": rid,
"src": src, "dst": dst, "nbytes": nbytes,
"latency_ns": round(hop_ns, 3),
**({"metadata": ann} if ann else {}),
})
# Terminal drain
bws = [e.bw_gbs for i in range(len(path) - 1)
if (e := edge_map.get((path[i], path[i + 1]))) and e.bw_gbs]
if bws and nbytes > 0:
drain_ns = nbytes / min(bws)
t += drain_ns
events.append({
"t_ns": round(t, 3), "type": "process", "request_id": rid,
"component": path[-1], "latency_ns": round(drain_ns, 3),
"metadata": {"op": "drain", "bn_bw_gbs": min(bws)},
})
events.append({
"t_ns": round(t, 3), "type": "complete", "request_id": rid,
"metadata": {"total_ns": round(t - t_offset, 3), "name": case_name},
})
return events
def _load_graph():
"""Load topology graph and build edge map."""
from kernbench.topology.builder import load_topology
topo_path = _find_topology()
graph = load_topology(topo_path)
edge_map = {(e.src, e.dst): e for e in graph.edges}
return graph, edge_map
# ── Probe case generators ──────────────────────────────────────────
def _generate_probe_h2d(graph, edge_map) -> list[dict]:
"""H2D Write probes: IO → cube HBM (1-4 hops)."""
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.policy.routing.router import AddressResolver, PathRouter
spec = graph.spec
resolver = AddressResolver(graph)
router = PathRouter(graph)
mm = spec["cube"]["memory_map"]
slice_bytes = mm["hbm_total_gb_per_cube"] * (1 << 30) // mm["hbm_slices_per_cube"]
nbytes = 32768
cases = [
("h2d-1hop", 0, 1),
("h2d-2hop", 4, 2),
("h2d-3hop", 8, 3),
("h2d-4hop", 12, 4),
]
all_events = []
t_offset = 0.0
for rid, (name, cube, hops) in enumerate(cases):
pa = PhysAddr.pe_hbm_addr(
rack_id=0, sip_id=0, cube_id=cube, pe_id=0,
pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
)
dst_node = resolver.resolve(pa)
pcie_ep = resolver.find_pcie_ep(0)
io_cpu = resolver.find_io_cpu(0)
m_cpu = resolver.find_m_cpu(0, cube)
leg1 = router.find_node_path(pcie_ep, io_cpu)
leg2 = router.find_node_path(io_cpu, m_cpu)
leg3 = router.find_mcpu_dma_path(m_cpu, dst_node)
full_path = leg1 + leg2[1:] + leg3[1:]
evts = _path_to_events(full_path, nbytes, rid, "memory_write",
name, graph, edge_map, t_offset)
all_events.extend(evts)
t_offset = max(e["t_ns"] for e in evts) + 5.0 # gap between cases
all_events.sort(key=lambda e: e["t_ns"])
return all_events
def _generate_probe_d2h(graph, edge_map) -> list[dict]:
"""D2H Read probes: HBM → IO (1-4 hops)."""
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.policy.routing.router import AddressResolver, PathRouter
spec = graph.spec
resolver = AddressResolver(graph)
router = PathRouter(graph)
mm = spec["cube"]["memory_map"]
slice_bytes = mm["hbm_total_gb_per_cube"] * (1 << 30) // mm["hbm_slices_per_cube"]
nbytes = 32768
cases = [
("d2h-1hop", 0, 1),
("d2h-2hop", 4, 2),
("d2h-3hop", 8, 3),
("d2h-4hop", 12, 4),
]
all_events = []
t_offset = 0.0
for rid, (name, cube, hops) in enumerate(cases):
pa = PhysAddr.pe_hbm_addr(
rack_id=0, sip_id=0, cube_id=cube, pe_id=0,
pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
)
dst_node = resolver.resolve(pa)
pcie_ep = resolver.find_pcie_ep(0)
fwd_path = router.find_memory_path(pcie_ep, dst_node)
# D2H: command goes forward, data comes back on reverse path
rev_path = list(reversed(fwd_path))
# Forward command (no data)
evts_fwd = _path_to_events(fwd_path, 0, rid, "memory_read_cmd",
f"{name} (cmd)", graph, edge_map, t_offset)
# Remove the drain/complete from fwd (just hops)
evts_fwd = [e for e in evts_fwd if e["type"] in ("submit", "hop")]
t_cmd_end = max(e["t_ns"] for e in evts_fwd) if evts_fwd else t_offset
# Reverse data path
evts_rev = _path_to_events(rev_path, nbytes, rid, "memory_read_data",
name, graph, edge_map, t_cmd_end)
# Replace submit with hop continuation
evts_rev = [e for e in evts_rev if e["type"] != "submit"]
all_events.extend(evts_fwd)
all_events.extend(evts_rev)
t_offset = max(e["t_ns"] for e in evts_rev) + 5.0
all_events.sort(key=lambda e: e["t_ns"])
return all_events
def _generate_probe_pe_dma(graph, edge_map) -> list[dict]:
"""PE DMA probes: pe_dma → xbar → HBM."""
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.policy.routing.router import AddressResolver, PathRouter
spec = graph.spec
resolver = AddressResolver(graph)
router = PathRouter(graph)
mm = spec["cube"]["memory_map"]
slice_bytes = mm["hbm_total_gb_per_cube"] * (1 << 30) // mm["hbm_slices_per_cube"]
nbytes = 32768
cases = [
("pe-local-hbm", 0, 0, 0, 0, 0),
("pe-same-half-hbm", 0, 0, 0, 0, 1),
("pe-cross-half-hbm", 0, 0, 0, 0, 4),
("pe-cross-cube-hbm-best", 0, 0, 0, 1, 0),
("pe-cross-cube-hbm-worst", 0, 0, 0, 15, 0),
]
all_events = []
t_offset = 0.0
for rid, (name, sip, src_cube, src_pe, dst_cube, dst_pe) in enumerate(cases):
pa = PhysAddr.pe_hbm_addr(
rack_id=0, sip_id=sip, cube_id=dst_cube, pe_id=dst_pe,
pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
)
dst_node = resolver.resolve(pa)
pe_ref = f"sip{sip}.cube{src_cube}.pe{src_pe}"
dma_path = router.find_path(pe_ref, dst_node)
evts = _path_to_events(dma_path, nbytes, rid, "pe_dma",
name, graph, edge_map, t_offset)
all_events.extend(evts)
t_offset = max(e["t_ns"] for e in evts) + 5.0
all_events.sort(key=lambda e: e["t_ns"])
return all_events
# ── Bench workload generators ─────────────────────────────────────
def _generate_bench_qkv_gemm(graph, edge_map) -> list[dict]:
"""QKV GEMM (single PE): Host → IO → M_CPU → PE0 → GEMM → HBM."""
from kernbench.policy.routing.router import AddressResolver, PathRouter
resolver = AddressResolver(graph)
router = PathRouter(graph)
events: list[dict] = []
t = 0.0
rid = 0
def ev(t_ns, **kw):
events.append({"t_ns": round(t_ns, 3), **kw})
# Phase 1: Tensor deploy (a: 128x256 f16 = 64KB, b: 256x128 f16 = 64KB)
pcie_ep = resolver.find_pcie_ep(0)
m_cpu = resolver.find_m_cpu(0, 0) # cube 0
io_cpu = resolver.find_io_cpu(0)
# Tensor A deploy path
leg1 = router.find_node_path(pcie_ep, io_cpu)
leg2 = router.find_node_path(io_cpu, m_cpu)
deploy_path = leg1 + leg2[1:]
for tensor_name, nbytes_t in [("tensor_a", 65536), ("tensor_b", 65536)]:
evts = _path_to_events(deploy_path, nbytes_t, rid, "memory_write",
f"deploy_{tensor_name}", graph, edge_map, t)
events.extend(evts)
t = max(e["t_ns"] for e in evts) + 1.0
rid += 1
# Phase 2: Kernel launch path (command, small payload)
ev(t, type="submit", request_id=rid, dst=pcie_ep,
nbytes=128, msg_type="kernel_launch",
metadata={"name": "qkv_gemm", "target": "cube0.pe0"})
# Command path: PCIE_EP → IO_CPU → M_CPU → PE_CPU
cmd_path = deploy_path # same path to M_CPU
ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
for i in range(len(cmd_path) - 1):
src, dst = cmd_path[i], cmd_path[i + 1]
e = edge_map.get((src, dst))
hop_ns = 0.0
if e:
hop_ns += e.distance_mm * ns_per_mm
node = graph.nodes.get(dst)
if node:
hop_ns += float(node.attrs.get("overhead_ns", 0.0))
t += hop_ns
ev(t, type="hop", request_id=rid, src=src, dst=dst,
nbytes=128, latency_ns=round(hop_ns, 3))
# M_CPU dispatch
t += 5.0
ev(t, type="process", request_id=rid, component=m_cpu,
latency_ns=5.0, metadata={"action": "dispatch_to_pe0"})
# PE_CPU compile + scheduler
pe_cpu = f"sip0.cube0.pe0.pe_cpu"
ev(t, type="hop", request_id=rid, src=m_cpu, dst=pe_cpu,
nbytes=128, latency_ns=0.0)
t += 2.0
ev(t, type="process", request_id=rid, component=pe_cpu,
latency_ns=2.0, metadata={"action": "compile_kernel"})
t += 1.0
ev(t, type="process", request_id=rid,
component="sip0.cube0.pe0.pe_scheduler", latency_ns=1.0)
# DMA read tensor_a from HBM → PE_TCM
pe_dma = "sip0.cube0.pe0.pe_dma"
# Find pe0 → HBM path
pe_ref = "sip0.cube0.pe0"
try:
dma_path = router.find_path(pe_ref, f"sip0.cube0.hbm_ctrl.slice0")
except Exception:
dma_path = [pe_ref]
ev(t, type="hop", request_id=rid, src=pe_dma, dst=dma_path[0] if len(dma_path) > 1 else pe_dma,
nbytes=65536, latency_ns=0.0, metadata={"cmd": "dma_read_a"})
for i in range(len(dma_path) - 1):
e = edge_map.get((dma_path[i], dma_path[i + 1]))
hop_ns = 0.0
if e:
hop_ns += e.distance_mm * ns_per_mm
node = graph.nodes.get(dma_path[i + 1])
if node:
hop_ns += float(node.attrs.get("overhead_ns", 0.0))
t += hop_ns
ev(t, type="hop", request_id=rid, src=dma_path[i], dst=dma_path[i + 1],
nbytes=65536, latency_ns=round(hop_ns, 3))
# HBM read drain
bw_ns = 65536 / 256.0 # ~256 ns
t += bw_ns
ev(t, type="process", request_id=rid, component=dma_path[-1] if dma_path else "hbm",
latency_ns=round(bw_ns, 3), metadata={"op": "read", "cmd": "dma_read_a"})
# GEMM compute
gemm_ns = 65536 / (8.0 * 1e3) # 8 TFLOPS
t += gemm_ns
ev(t, type="process", request_id=rid,
component="sip0.cube0.pe0.pe_gemm",
latency_ns=round(gemm_ns, 3), metadata={"op": "gemm", "tflops": 8.0})
# DMA write result back
t += bw_ns
ev(t, type="process", request_id=rid,
component="sip0.cube0.hbm_ctrl.slice0",
latency_ns=round(bw_ns, 3), metadata={"op": "write", "cmd": "dma_write_out"})
ev(t, type="complete", request_id=rid,
metadata={"total_ns": round(t, 3), "name": "qkv_gemm"})
events.sort(key=lambda e: e["t_ns"])
return events
def _generate_bench_qkv_gemm_multi_pe(graph, edge_map) -> list[dict]:
"""QKV GEMM multi-PE: same as single but M_CPU fans out to 8 PEs."""
from kernbench.policy.routing.router import AddressResolver, PathRouter
resolver = AddressResolver(graph)
router = PathRouter(graph)
events: list[dict] = []
t = 0.0
rid = 0
def ev(t_ns, **kw):
events.append({"t_ns": round(t_ns, 3), **kw})
pcie_ep = resolver.find_pcie_ep(0)
io_cpu = resolver.find_io_cpu(0)
m_cpu = resolver.find_m_cpu(0, 0)
ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
# Command path to M_CPU
leg1 = router.find_node_path(pcie_ep, io_cpu)
leg2 = router.find_node_path(io_cpu, m_cpu)
cmd_path = leg1 + leg2[1:]
ev(t, type="submit", request_id=rid, dst=pcie_ep,
nbytes=128, msg_type="kernel_launch",
metadata={"name": "qkv_gemm_multi_pe", "target": "cube0.pe0-7"})
for i in range(len(cmd_path) - 1):
e = edge_map.get((cmd_path[i], cmd_path[i + 1]))
hop_ns = 0.0
if e:
hop_ns += e.distance_mm * ns_per_mm
node = graph.nodes.get(cmd_path[i + 1])
if node:
hop_ns += float(node.attrs.get("overhead_ns", 0.0))
t += hop_ns
ev(t, type="hop", request_id=rid, src=cmd_path[i], dst=cmd_path[i + 1],
nbytes=128, latency_ns=round(hop_ns, 3))
# M_CPU fan-out to 8 PEs
t += 5.0
ev(t, type="process", request_id=rid, component=m_cpu,
latency_ns=5.0, metadata={"action": "fan_out_8pe"})
# Each PE executes concurrently (all start at same time)
pe_start = t
for pe_idx in range(8):
pe_rid = rid + 1 + pe_idx
pe_cpu = f"sip0.cube0.pe{pe_idx}.pe_cpu"
ev(pe_start, type="hop", request_id=pe_rid, src=m_cpu, dst=pe_cpu,
nbytes=128, latency_ns=0.0,
metadata={"fan_out_pe": pe_idx})
t_pe = pe_start + 2.0 # PE_CPU compile
ev(t_pe, type="process", request_id=pe_rid, component=pe_cpu,
latency_ns=2.0)
t_pe += 1.0 # scheduler
ev(t_pe, type="process", request_id=pe_rid,
component=f"sip0.cube0.pe{pe_idx}.pe_scheduler", latency_ns=1.0)
# GEMM compute (each PE does 1/8 of the work)
gemm_ns = (65536 / 8) / (8.0 * 1e3)
t_pe += gemm_ns
ev(t_pe, type="process", request_id=pe_rid,
component=f"sip0.cube0.pe{pe_idx}.pe_gemm",
latency_ns=round(gemm_ns, 3), metadata={"op": "gemm_shard"})
ev(t_pe, type="complete", request_id=pe_rid,
metadata={"name": f"pe{pe_idx}_shard"})
# Overall completion (max of all PEs)
t_end = pe_start + 2.0 + 1.0 + (65536 / 8) / (8.0 * 1e3)
ev(t_end, type="complete", request_id=rid,
metadata={"total_ns": round(t_end, 3), "name": "qkv_gemm_multi_pe"})
events.sort(key=lambda e: e["t_ns"])
return events
# ══════════════════════════════════════════════════════════════════
# Workload registry
# ══════════════════════════════════════════════════════════════════
def _make_workloads():
"""Build workload registry. Deferred to avoid import-time topology load."""
return {
# ── Probe cases ──
"probe-h2d": {
"name": "Probe: H2D Write (1-4 hop)",
"description": "Host→Device memory write across 1,2,3,4 cube hops (32KB)",
"category": "probe",
"generator": lambda g, e: _generate_probe_h2d(g, e),
},
"probe-d2h": {
"name": "Probe: D2H Read (1-4 hop)",
"description": "Device→Host memory read across 1,2,3,4 cube hops (32KB)",
"category": "probe",
"generator": lambda g, e: _generate_probe_d2h(g, e),
},
"probe-pe-dma": {
"name": "Probe: PE DMA (local→cross-cube)",
"description": "PE DMA latency: local, same-half, cross-half, cross-cube best/worst",
"category": "probe",
"generator": lambda g, e: _generate_probe_pe_dma(g, e),
},
# ── Bench workloads ──
"bench-qkv-gemm": {
"name": "Bench: QKV GEMM (single PE)",
"description": "Host deploy tensors + single-PE QKV GEMM (128x256 x 256x128 F16)",
"category": "bench",
"generator": lambda g, e: _generate_bench_qkv_gemm(g, e),
},
"bench-qkv-gemm-multi-pe": {
"name": "Bench: QKV GEMM (8-PE parallel)",
"description": "M_CPU fan-out to 8 PEs, column-parallel QKV GEMM",
"category": "bench",
"generator": lambda g, e: _generate_bench_qkv_gemm_multi_pe(g, e),
},
}
# Cached topology + workload state
_cached_graph = None
_cached_edge_map = None
_WORKLOADS = None
def _ensure_loaded():
global _cached_graph, _cached_edge_map, _WORKLOADS
if _cached_graph is None:
_cached_graph, _cached_edge_map = _load_graph()
if _WORKLOADS is None:
_WORKLOADS = _make_workloads()
return _cached_graph, _cached_edge_map, _WORKLOADS
def get_workload_list() -> list[dict]:
"""Return list of available workloads for UI."""
_, _, workloads = _ensure_loaded()
return [
{
"id": wid,
"name": w["name"],
"description": w["description"],
"category": w.get("category", ""),
}
for wid, w in workloads.items()
]
def generate_workload_events(workload_id: str) -> list[dict]:
"""Generate events for a specific workload by running real simulation paths."""
graph, edge_map, workloads = _ensure_loaded()
if workload_id not in workloads:
raise ValueError(f"Unknown workload: {workload_id}")
return workloads[workload_id]["generator"](graph, edge_map)
# Keep backward compat
def generate_demo_events() -> list[dict]:
"""Generate default demo events (probe-h2d)."""
return generate_workload_events("probe-h2d")
View File
+126
View File
@@ -0,0 +1,126 @@
"""Phase B: FastAPI backend with WebSocket streaming for KernBench viewer."""
import asyncio
import json
import pathlib
import webbrowser
import threading
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse, JSONResponse
import uvicorn
STATIC_DIR = pathlib.Path(__file__).parent / "static"
DEFAULT_PORT = 8765
app = FastAPI(title="KernBench Viewer")
# ── REST: workloads + events ───────────────────────────────────────
@app.get("/api/workloads")
async def list_workloads():
from kernbench.sim_engine.event_log import get_workload_list
return JSONResponse(content=get_workload_list())
@app.get("/api/events/demo")
async def get_demo_events():
from kernbench.sim_engine.event_log import generate_demo_events
return JSONResponse(content=generate_demo_events())
@app.get("/api/events/{workload_id}")
async def get_workload_events(workload_id: str):
from kernbench.sim_engine.event_log import generate_workload_events
try:
return JSONResponse(content=generate_workload_events(workload_id))
except ValueError as e:
return JSONResponse(content={"error": str(e)}, status_code=404)
# ── WebSocket: stream events with timing ───────────────────────────
@app.websocket("/ws")
async def ws_endpoint(ws: WebSocket):
await ws.accept()
try:
while True:
msg = await ws.receive_text()
data = json.loads(msg)
cmd = data.get("cmd")
if cmd == "load_demo" or cmd == "load_workload":
from kernbench.sim_engine.event_log import generate_workload_events
wid = data.get("workload_id", "probe-h2d")
events = generate_workload_events(wid)
await ws.send_json({"type": "events", "events": events})
elif cmd == "stream_demo" or cmd == "stream_workload":
from kernbench.sim_engine.event_log import generate_workload_events
wid = data.get("workload_id", "probe-h2d")
events = generate_workload_events(wid)
speed = data.get("speed", 1.0) # ns → real-time multiplier
# Send metadata first
duration = max(e["t_ns"] for e in events) if events else 0
await ws.send_json({
"type": "stream_start",
"total_events": len(events),
"duration_ns": duration,
})
prev_t = 0.0
for ev in events:
# Wait proportional to simulation time gap
dt = ev["t_ns"] - prev_t
if dt > 0 and speed > 0:
# Scale: 1 ns sim → (1/speed) ms real
await asyncio.sleep(dt / (speed * 1000.0))
await ws.send_json({"type": "event", **ev})
prev_t = ev["t_ns"]
await ws.send_json({"type": "stream_end"})
elif cmd == "ping":
await ws.send_json({"type": "pong"})
except WebSocketDisconnect:
pass
except Exception:
pass
# ── Static files (must be last) ────────────────────────────────────
@app.get("/")
async def index():
return FileResponse(STATIC_DIR / "index.html")
app.mount("/", StaticFiles(directory=str(STATIC_DIR)), name="static")
# ── Entry point (called from CLI) ──────────────────────────────────
def serve(port: int = DEFAULT_PORT, open_browser: bool = True) -> None:
import socket
# Check port availability before starting
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(("127.0.0.1", port))
except OSError:
print(f"ERROR: Port {port} is already in use.")
print(f" Try: kernbench web --port {port + 1}")
print(f" Or kill the existing process using port {port}")
return
url = f"http://127.0.0.1:{port}"
print(f"KernBench Viewer: {url}")
if open_browser:
threading.Timer(0.5, webbrowser.open, args=(url,)).start()
uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning")
if __name__ == "__main__":
serve()
File diff suppressed because it is too large Load Diff