commit - release 1

This commit is contained in:
2026-03-18 11:47:48 -07:00
commit 6f43807900
109 changed files with 14909 additions and 0 deletions
+64
View File
@@ -0,0 +1,64 @@
import argparse
import sys
from benches.loader import resolve_bench
from kernbench.cli.probe import cmd_probe
from kernbench.cli.report import format_report
from kernbench.common.types import SimEngine
from kernbench.runtime_api.bench_runner import run_bench
from kernbench.runtime_api.types import DeviceSelector, resolve_device
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology
def build_parser() -> argparse.ArgumentParser:
    """Build the ``kernbench`` command-line parser.

    Exactly one sub-command is required:

    * ``run``   -- requires ``--topology`` and ``--bench``; ``--device`` is
      optional and defaults to ``None``.
    * ``probe`` -- requires ``--topology``; ``--case`` defaults to ``"all"``.

    Each sub-parser stores its handler callable under the ``_handler``
    attribute of the parsed namespace.
    """
    parser = argparse.ArgumentParser(prog="kernbench")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # kernbench run
    run_cmd = commands.add_parser("run", help="Run a benchmark")
    for required_flag in ("--topology", "--bench"):
        run_cmd.add_argument(required_flag, required=True)
    run_cmd.add_argument(
        "--device",
        default=None,
        help="Target device: 'all' or 'sip:<N>' (default: all)",
    )
    run_cmd.set_defaults(_handler=cmd_run)

    # kernbench probe
    probe_cmd = commands.add_parser(
        "probe",
        help="Probe latency and BW for predefined traffic patterns",
    )
    probe_cmd.add_argument("--topology", required=True)
    probe_cmd.add_argument(
        "--case",
        default="all",
        help="Case name or 'all' (default: all)",
    )
    probe_cmd.set_defaults(_handler=cmd_probe)

    return parser
def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
    """Create a ``GraphEngine`` for *topology*.

    When *topology* exposes a ``topology_obj`` attribute, that attribute is
    handed to the engine; otherwise *topology* itself is used. ``device`` is
    accepted but not used by this factory.
    """
    try:
        graph = topology.topology_obj
    except AttributeError:
        graph = topology
    return GraphEngine(graph)
def cmd_run(args) -> int:
    """Handle ``kernbench run``.

    Resolves the topology, benchmark and device named in *args*, runs the
    benchmark through ``run_bench``, prints the formatted trace report (only
    when the result carries traces) and then the result summary.

    Returns:
        0 when ``result.completion.ok`` is truthy, otherwise 1.
    """
    print("> Running benchmark with:", args)

    topology = resolve_topology(args.topology)
    bench_fn = resolve_bench(args.bench)
    target = resolve_device(args.device)
    outcome = run_bench(
        topology=topology,
        bench_fn=bench_fn,
        device=target,
        engine_factory=engine_factory,
    )

    # The spec is read from the wrapped topology object when there is one;
    # it is None when no ``spec`` attribute is available.
    graph = getattr(topology, "topology_obj", topology)
    spec = getattr(graph, "spec", None)

    if outcome.traces:
        report = format_report(outcome.traces, title=args.bench, spec=spec)
        print(report)
    print(outcome.summary_text())

    if outcome.completion.ok:
        return 0
    return 1
def main(argv=None) -> int:
    """Parse *argv* and dispatch to the selected sub-command.

    Args:
        argv: Argument list passed to ``parse_args``; ``None`` lets argparse
            read the process arguments.

    Returns:
        The handler's return value converted with ``int``.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace._handler
    return int(handler(namespace))
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
+248
View File
@@ -0,0 +1,248 @@
"""kernbench probe: latency and BW verification utility.
Runs predefined traffic patterns through the simulation engine and reports
latency, effective bandwidth, bottleneck bandwidth, and utilization for each
case. Validates monotonicity invariants across hop counts and access types.
"""
from __future__ import annotations
from pathlib import Path
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.policy.routing.router import AddressResolver, PathRouter
from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology
from kernbench.topology.types import TopologyGraph
# -- Helpers ----------------------------------------------------------
def _hbm_pa(sip: int, cube: int, pe_id: int, spec: dict) -> int:
    """Return an encoded HBM physical address for the given PE.

    The slice size is ``hbm_total_gb_per_cube * 2**30`` floor-divided by
    ``hbm_slices_per_cube``, both read from ``spec["cube"]["memory_map"]``.
    The address is built with ``PhysAddr.pe_hbm_addr`` using rack 0 and a
    PE-local HBM offset of ``0x1000``, then returned via ``encode()``.
    """
    memory_map = spec["cube"]["memory_map"]
    cube_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
    bytes_per_slice = cube_bytes // memory_map["hbm_slices_per_cube"]
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=bytes_per_slice,
    )
    return address.encode()
def _build_edge_map(graph: TopologyGraph) -> dict[tuple[str, str], object]:
    """Index the graph's edges by their ``(src, dst)`` pair.

    When several edges share the same endpoints, the last one in
    ``graph.edges`` is the one kept.
    """
    by_endpoints: dict[tuple[str, str], object] = {}
    for edge in graph.edges:
        by_endpoints[(edge.src, edge.dst)] = edge
    return by_endpoints
def _formula_breakdown(
    path: list[str], nbytes: int, edge_map: dict, graph: TopologyGraph,
) -> tuple[float, float, float, float]:
    """Return (wire_ns, overhead_ns, drain_ns, formula_ns) for a path.

    * ``wire_ns``     -- sum of ``distance_mm * ns_per_mm`` over the
      consecutive hops that exist in *edge_map*; ``ns_per_mm`` comes from
      ``graph.spec["system"]`` and defaults to 0.01.
    * ``overhead_ns`` -- sum of the ``overhead_ns`` attribute (default 0.0)
      of every path node found in ``graph.nodes``.
    * ``drain_ns``    -- ``nbytes`` divided by the smallest truthy ``bw_gbs``
      among the path's edges, or 0.0 when no edge has one.
    * ``formula_ns``  -- the sum of the three terms above.
    """
    ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)

    # One lookup per hop; hops missing from the map come back as None.
    hop_edges = [edge_map.get(hop) for hop in zip(path, path[1:])]

    wire_ns = 0.0
    for edge in hop_edges:
        if edge:
            wire_ns += edge.distance_mm * ns_per_mm

    overhead_ns = 0.0
    for node in map(graph.nodes.get, path):
        if node:
            overhead_ns += float(node.attrs.get("overhead_ns", 0.0))

    link_bws = [edge.bw_gbs for edge in hop_edges if edge and edge.bw_gbs]
    drain_ns = nbytes / min(link_bws) if link_bws else 0.0

    formula_ns = wire_ns + overhead_ns + drain_ns
    return wire_ns, overhead_ns, drain_ns, formula_ns
def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
    """Return the smallest truthy ``bw_gbs`` along *path*, or ``None``.

    Per-request bottleneck: single request uses one connection. Hops that
    are missing from *edge_map*, or whose ``bw_gbs`` is falsy, are ignored.
    """
    link_bws: list[float] = [
        edge.bw_gbs
        for hop in zip(path, path[1:])
        if (edge := edge_map.get(hop)) and edge.bw_gbs
    ]
    if not link_bws:
        return None
    return min(link_bws)
def _fmt_bw(bw: float | None) -> str:
    """Render a bandwidth with one decimal place, or ``"-"`` for ``None``."""
    if bw is None:
        return "-"
    return format(bw, ".1f")
def _fmt_util(eff: float, bn: float | None) -> str:
    """Render ``eff / bn`` as a percentage with one decimal place.

    Returns ``"-"`` when *bn* is ``None`` or not positive, since no
    utilization can be derived from it.
    """
    no_bottleneck = bn is None or bn <= 0
    return "-" if no_bottleneck else f"{eff / bn * 100:.1f}%"
def _short_name(node_id: str) -> str:
    """Shorten a dotted node id to its last two segments.

    Two segments are kept to avoid ambiguity (xbar.pe0 vs pe0). Ids with
    fewer than two segments come back unchanged.
    """
    # Splitting from the right at most twice leaves the two trailing
    # segments as the last two items, whatever precedes them.
    tail = node_id.rsplit(".", 2)[-2:]
    return ".".join(tail)
def _short_path(path: list[str]) -> str:
    """Join the shortened names of *path*'s nodes with ``" -> "``."""
    return " -> ".join(map(_short_name, path))
# -- Probe runner -----------------------------------------------------
# Payload size, in bytes, used for every probe request.
_PROBE_NBYTES = 4096

# (case name, target cube, expected hop count)
_H2D_CASES = (
    ("h2d-1hop", 0, 1),
    ("h2d-2hop", 4, 2),
    ("h2d-3hop", 8, 3),
    ("h2d-4hop", 12, 4),
)

# (case name, sip, src_cube, src_pe, dst_cube, dst_pe)
_PE_DMA_CASES = (
    ("pe-local-hbm", 0, 0, 0, 0, 0),       # pe0 → slice0 (local, 256 GB/s)
    ("pe-same-half-hbm", 0, 0, 0, 0, 1),   # pe0 → slice1 (xbar chain, 128 GB/s)
    ("pe-cross-half-hbm", 0, 0, 0, 0, 4),  # pe0 → slice4 (xbar chain, 128 GB/s)
    ("pe-cross-cube-hbm", 0, 0, 0, 1, 0),  # cube0.pe0 → cube1.slice0 (NOC, 128 GB/s)
)


def _run_to_completion(engine: GraphEngine, msg: object) -> float:
    """Submit *msg*, wait for it, and return ``total_ns`` from its trace."""
    handle = engine.submit(msg)
    engine.wait(handle)
    _, trace = engine.get_completion(handle)
    return trace["total_ns"]


def _pct_of(part_ns: float, total_ns: float) -> float:
    """Return *part_ns* as a percentage of *total_ns* (0.0 if total <= 0)."""
    return part_ns / total_ns * 100 if total_ns > 0 else 0.0


def _probe_h2d(
    graph: TopologyGraph,
    edge_map: dict,
    resolver: AddressResolver,
    router: PathRouter,
    cases: list,
    nbytes: int,
) -> None:
    """Run the H2D write cases; print the table, monotonic check and routes."""
    latencies: list[float] = []
    routes: list[tuple[str, list[str], list[str], list[str]]] = []

    print()
    print("=== H2D Write Latency (IO->HBM, varying hop count) ===")
    print(f" {'Case':<14} {'Target':<16} {'Hops':>4} {'Actual':>8}"
          f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}")
    print(" " + "-" * 115)

    for name, cube, hops in cases:
        # A new engine per case, created before the message is built.
        engine = GraphEngine(graph)
        pa = _hbm_pa(sip=0, cube=cube, pe_id=0, spec=graph.spec)
        msg = MemoryWriteMsg(
            correlation_id="probe", request_id=name,
            dst_sip=0, dst_cube=cube, dst_pe=0,
            dst_pa=pa, nbytes=nbytes, pattern="zero",
        )
        total_ns = _run_to_completion(engine, msg)
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0

        dst_node = resolver.resolve(PhysAddr.decode(pa))
        pcie_ep = resolver.find_pcie_ep(0)
        io_cpu = resolver.find_io_cpu(0)
        m_cpu = resolver.find_m_cpu(0, cube)
        leg1 = router.find_node_path(pcie_ep, io_cpu)
        leg2 = router.find_node_path(io_cpu, m_cpu)
        leg3 = router.find_mcpu_dma_path(m_cpu, dst_node)

        # Forward path only (the response path is implicit in the measured
        # latency). Each later leg starts on the previous leg's last node,
        # so that shared node is dropped when concatenating.
        fwd_path = leg1 + leg2[1:] + leg3[1:]
        bn_bw = _bottleneck_bw(fwd_path, edge_map)
        wire, ovhd, drain, _ = _formula_breakdown(fwd_path, nbytes, edge_map, graph)
        ovhd_pct = _pct_of(ovhd, total_ns)
        drain_pct = _pct_of(drain, total_ns)

        latencies.append(total_ns)
        routes.append((name, leg1, leg2, leg3))
        print(f" {name:<14} cube{cube}.pe0{'':<8} {hops:>4} {total_ns:>8.2f}"
              f" {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
              f" {eff_bw:>8.2f} {_fmt_bw(bn_bw):>8} {_fmt_util(eff_bw, bn_bw):>6}")

    if len(latencies) >= 2:
        # Latency must strictly increase from each case to the next.
        mono = all(a < b for a, b in zip(latencies, latencies[1:]))
        sym = "[v]" if mono else "[x]"
        print(f" {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")

    if routes:
        print()
        print(" Route Details:")
        print(f" {'Case':<14} {'Leg':>4} Path")
        print(" " + "-" * 80)
        for name, leg1, leg2, leg3 in routes:
            print(f" {name:<14} {'L1':>4} {_short_path(leg1)}")
            print(f" {'':<14} {'L2':>4} {_short_path(leg2)}")
            print(f" {'':<14} {'L3':>4} {_short_path(leg3)}")


def _probe_pe_dma(
    graph: TopologyGraph,
    edge_map: dict,
    resolver: AddressResolver,
    router: PathRouter,
    cases: list,
    nbytes: int,
) -> None:
    """Run the PE DMA cases; print the table, bottleneck note and routes."""
    bottlenecks: list[tuple[str, float | None]] = []
    routes: list[tuple[str, list[str]]] = []

    print()
    print("=== PE DMA Latency (pe_dma -> xbar -> HBM, direct injection) ===")
    print(f" {'Case':<22} {'Target':<28} {'Actual':>8}"
          f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}")
    print(" " + "-" * 120)

    for name, sip, src_cube, src_pe, dst_cube, dst_pe in cases:
        engine = GraphEngine(graph)
        dst_pa = _hbm_pa(sip=sip, cube=dst_cube, pe_id=dst_pe, spec=graph.spec)
        msg = PeDmaMsg(
            correlation_id="probe", request_id=name,
            src_sip=sip, src_cube=src_cube, src_pe=src_pe,
            dst_pa=dst_pa, nbytes=nbytes,
        )
        total_ns = _run_to_completion(engine, msg)
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0

        pe_ref = f"sip{sip}.cube{src_cube}.pe{src_pe}"
        dst_node = resolver.resolve(PhysAddr.decode(dst_pa))
        dma_path = router.find_path(pe_ref, dst_node)
        bn_bw = _bottleneck_bw(dma_path, edge_map)
        wire, ovhd, drain, _ = _formula_breakdown(dma_path, nbytes, edge_map, graph)
        ovhd_pct = _pct_of(ovhd, total_ns)
        drain_pct = _pct_of(drain, total_ns)
        target_str = f"c{src_cube}.pe{src_pe}->c{dst_cube}.slice{dst_pe}"

        bottlenecks.append((name, bn_bw))
        routes.append((name, dma_path))
        print(f" {name:<22} {target_str:<28} {total_ns:>8.2f}"
              f" {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
              f" {eff_bw:>8.2f} {_fmt_bw(bn_bw):>8} {_fmt_util(eff_bw, bn_bw):>6}")

    # Compare the first "local" case with the first non-local one; this needs
    # at least one of each, which also implies two or more results.
    local = [bw for case, bw in bottlenecks if "local" in case]
    chain = [bw for case, bw in bottlenecks if "local" not in case]
    if local and chain:
        print(f" * Local BN: {_fmt_bw(local[0])} GB/s, "
              f"Chain/NOC BN: {_fmt_bw(chain[0])} GB/s")

    if routes:
        print()
        print(" Route Details:")
        print(f" {'Case':<22} Path")
        print(" " + "-" * 80)
        for name, dma_path in routes:
            print(f" {name:<22} {_short_path(dma_path)}")


def run_probe(topology_path: str, case_filter: str | None = None) -> int:
    """Run the predefined probe cases against a topology and print results.

    Args:
        topology_path: Path of the topology to load; ``~`` is expanded and
            the path is resolved before loading.
        case_filter: ``None`` or ``"all"`` runs every case; any other value
            runs only the case with exactly that name.

    Returns:
        0 once the tables have been printed, or 2 when *case_filter* does not
        name any known case (nothing is loaded or simulated in that case).
    """
    import sys

    show_all = case_filter is None or case_filter == "all"
    h2d_cases = [c for c in _H2D_CASES if show_all or c[0] == case_filter]
    pe_cases = [c for c in _PE_DMA_CASES if show_all or c[0] == case_filter]

    # A mistyped case name used to produce two empty tables and exit 0.
    if not h2d_cases and not pe_cases:
        known = ", ".join(c[0] for c in _H2D_CASES + _PE_DMA_CASES)
        print(
            f"Unknown probe case {case_filter!r}; expected 'all' or one of: {known}",
            file=sys.stderr,
        )
        return 2

    path = Path(topology_path).expanduser().resolve()
    graph = load_topology(path)
    edge_map = _build_edge_map(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    _probe_h2d(graph, edge_map, resolver, router, h2d_cases, _PROBE_NBYTES)
    _probe_pe_dma(graph, edge_map, resolver, router, pe_cases, _PROBE_NBYTES)
    print()
    return 0
def cmd_probe(args) -> int:
    """Handle ``kernbench probe`` by delegating to ``run_probe``.

    The case filter falls back to ``"all"`` when *args* has no ``case``
    attribute.
    """
    topology_path = args.topology
    selected_case = getattr(args, "case", "all")
    return run_probe(topology_path, selected_case)
+175
View File
@@ -0,0 +1,175 @@
"""Performance report formatter for bench results."""
from __future__ import annotations
# Bit width for each accepted dtype spelling.
_DTYPE_BITS: dict[str, int] = {
    # 16-bit float spellings
    "f16": 16,
    "fp16": 16,
    "float16": 16,
    "bf16": 16,
    # 32-bit float spellings
    "f32": 32,
    "fp32": 32,
    "float32": 32,
    # integer spellings
    "i8": 8,
    "int8": 8,
    "i16": 16,
    "int16": 16,
    "i32": 32,
    "int32": 32,
}
# Column header of the command table; its length also sets the title width.
_CMD_HEADER = (f"{'Cmd':<10} {'Name':<12} {'SIP':>4} {'Cube':>5} {'PE':>4} {'Bytes':>10} "
               f"{'Lat(ns)':>10} {'Xfer(ns)':>10} {'Proc(ns)':>10} "
               f"{'BW(GB/s)':>10} {'MinBW':>10} {'Util%':>7}")


def _pe_sort_key(pe_id: object) -> tuple:
    """Order numeric PE ids first (numerically), then the rest by text.

    A plain ``sorted`` raises ``TypeError`` as soon as ids of different
    types (for example ``0`` and ``"all"``) appear in the same report.
    """
    if isinstance(pe_id, (int, float)):
        return (0, pe_id, "")
    return (1, 0, str(pe_id))


def _command_rows(deploy_entries: list[dict]) -> list[str]:
    """Return the command table: blank line, header, rule, one row per entry."""
    rows = ["", _CMD_HEADER, "-" * len(_CMD_HEADER)]
    for e in deploy_entries:
        lat = e.get("total_ns", 0.0)
        nb = e.get("nbytes", 0)
        sip = e.get("sip", "-")
        pe = e.get("pe", "-")
        cube = e.get("cube", "-")
        cmd = e.get("phase", "deploy")
        xfer_ns = e.get("xfer_ns", 0.0)
        # Without a transfer time, the processing share cannot be split out.
        proc_ns = lat - xfer_ns if xfer_ns > 0 else 0.0
        bw = nb / lat if lat > 0 else 0.0
        min_bw = nb / xfer_ns if xfer_ns > 0 else 0.0
        util = (xfer_ns / lat * 100) if lat > 0 and xfer_ns > 0 else 0.0
        rows.append(
            f"{cmd:<10} {e.get('name', '?'):<12} {str(sip):>4} {str(cube):>5} {str(pe):>4} {nb:>10} "
            f"{lat:>10.1f} {xfer_ns:>10.1f} {proc_ns:>10.1f} "
            f"{bw:>10.1f} {min_bw:>10.1f} {util:>6.1f}%"
        )
    return rows


def _kernel_rows(
    kernel_entries: list[dict], peak_tflops_f16: float, num_pes: int,
) -> list[str]:
    """Return the kernel table: blank line, header, rule, one row per kernel."""
    k_hdr = (f"{'Phase':<10} {'Name':<12} {'PE':>4} {'E2E(ns)':>10} "
             f"{'PE(ns)':>10} {'DMA(ns)':>10} {'Comp(ns)':>10} "
             f"{'Bound':<8} {'TFLOPS':>8} {'Peak':>8} {'Util%':>7}")
    rows = ["", k_hdr, "-" * len(k_hdr)]
    for e in kernel_entries:
        e2e_ns = e.get("total_ns", 0.0)
        pe_ns = e.get("pe_exec_ns", e2e_ns)
        dma_ns = e.get("dma_ns", 0.0)
        compute_ns = e.get("compute_ns", 0.0)
        target_pe = e.get("target_pe", "-")
        scalars = e.get("scalars", [])
        pe_str = "all" if target_pe == "all" else str(target_pe)
        n_active = num_pes if target_pe == "all" else 1

        # Bound indicator based on measured DMA vs compute time.
        if dma_ns > 0 or compute_ns > 0:
            bound = "memory" if dma_ns >= compute_ns else "compute"
        else:
            bound = "-"

        achieved = _calc_tflops(scalars, pe_ns)
        peak_total = peak_tflops_f16 * n_active
        util = (achieved / peak_total * 100) if peak_total > 0 else 0.0
        rows.append(
            f"{'kernel':<10} {e.get('name', '?'):<12} {pe_str:>4} {e2e_ns:>10.1f} "
            f"{pe_ns:>10.1f} {dma_ns:>10.1f} {compute_ns:>10.1f} "
            f"{bound:<8} {achieved:>8.3f} {peak_total:>8.1f} {util:>6.1f}%"
        )
    return rows


def _per_pe_rows(
    deploy_entries: list[dict],
    kernel_entries: list[dict],
    peak_tflops_f16: float,
    peak_hbm_bw_gbs: float,
) -> list[str]:
    """Return the per-PE summary, or ``[]`` unless more than one PE deployed."""
    pe_deploy = _per_pe_deploy(deploy_entries)
    if len(pe_deploy) <= 1:
        return []

    pe_title = (f"-- Per-PE Summary (peak: {peak_tflops_f16:.1f} TFLOPS/PE, "
                f"{peak_hbm_bw_gbs:.0f} GB/s HBM BW) ")
    pe_hdr = (f"{'PE':>4} {'Deploy(ns)':>10} {'BW(GB/s)':>10} {'BW Util':>8} "
              f"{'Kernel(ns)':>10} {'TFLOPS':>8} {'Util':>7}")
    pe_width = max(len(pe_title), len(pe_hdr))
    rows = [
        "",
        pe_title + "-" * max(0, pe_width - len(pe_title)),
        pe_hdr,
        "-" * pe_width,
    ]

    # Kernel time is summed over all kernel entries; the scalar arguments are
    # taken from the first kernel entry only. The achieved total is then
    # divided evenly across the PEs that received a deploy.
    k_ns = sum(e.get("pe_exec_ns", e.get("total_ns", 0.0)) for e in kernel_entries)
    k_scalars = kernel_entries[0].get("scalars", []) if kernel_entries else []
    n_active = len(pe_deploy)
    total_achieved = _calc_tflops(k_scalars, k_ns)
    per_pe_tflops = total_achieved / n_active if n_active > 0 else 0.0
    pe_util = (per_pe_tflops / peak_tflops_f16 * 100) if peak_tflops_f16 > 0 else 0.0

    for pe_id in sorted(pe_deploy, key=_pe_sort_key):
        d_ns, d_bytes = pe_deploy[pe_id]
        d_bw = d_bytes / d_ns if d_ns > 0 else 0.0
        d_util = (d_bw / peak_hbm_bw_gbs * 100) if peak_hbm_bw_gbs > 0 else 0.0
        rows.append(
            f"{pe_id:>4} {d_ns:>10.1f} {d_bw:>10.1f} {d_util:>7.1f}% "
            f"{k_ns:>10.1f} {per_pe_tflops:>8.3f} {pe_util:>6.1f}%"
        )
    return rows


def format_report(
    traces: list[dict],
    title: str = "Benchmark",
    spec: dict | None = None,
) -> str:
    """Format collected traces into a human-readable performance report.

    Traces whose ``phase`` is ``"kernel"`` go to the kernel table; all others
    go to the command table. A per-PE summary is added when deploy traces
    cover more than one PE. Sections with no entries are omitted.

    Args:
        traces: Trace dicts to report on.
        title: Text placed in the report's title line.
        spec: Topology spec dict for peak TFLOPS / BW extraction.

    Returns:
        The report as one newline-joined string ending with a newline.
    """
    peak_tflops_f16, peak_hbm_bw_gbs = _extract_peaks(spec)
    num_pes = _count_pes(spec)

    deploy_entries = [t for t in traces if t.get("phase") != "kernel"]
    kernel_entries = [t for t in traces if t.get("phase") == "kernel"]

    # The title line is padded with dashes to the command-table width.
    title_line = f"-- {title} Performance Report "
    lines: list[str] = [title_line + "-" * max(0, len(_CMD_HEADER) - len(title_line))]

    if deploy_entries:
        lines.extend(_command_rows(deploy_entries))
    if kernel_entries:
        lines.extend(_kernel_rows(kernel_entries, peak_tflops_f16, num_pes))
    lines.extend(
        _per_pe_rows(deploy_entries, kernel_entries, peak_tflops_f16, peak_hbm_bw_gbs)
    )

    lines.append("")
    return "\n".join(lines)
def _extract_peaks(spec: dict | None) -> tuple[float, float]:
    """Extract peak TFLOPS (f16) and HBM BW (GB/s) from spec.

    Peak TFLOPS is read from
    ``cube.pe_template.components.pe_gemm.attrs.peak_tflops_f16`` and HBM
    bandwidth from ``cube.links.xbar_to_hbm_bw_gbs``. Any missing level
    yields 0.0; a ``None`` spec yields ``(0.0, 0.0)``.
    """
    if spec is None:
        return 0.0, 0.0

    cube = spec.get("cube", {})
    gemm_attrs = (
        cube.get("pe_template", {})
        .get("components", {})
        .get("pe_gemm", {})
        .get("attrs", {})
    )
    peak_tflops = float(gemm_attrs.get("peak_tflops_f16", 0.0))
    hbm_bw = float(cube.get("links", {}).get("xbar_to_hbm_bw_gbs", 0.0))
    return peak_tflops, hbm_bw
def _count_pes(spec: dict | None) -> int:
    """Return ``pe_per_corner`` times the number of corners in the layout.

    Values come from ``spec["cube"]["pe_layout"]``; ``pe_per_corner``
    defaults to 2 and ``corners`` to four entries. A ``None`` spec gives 8.
    """
    if spec is None:
        return 8

    layout = spec.get("cube", {}).get("pe_layout", {})
    pes_each = layout.get("pe_per_corner", 2)
    corner_names = layout.get("corners", ["NW", "NE", "SW", "SE"])
    return pes_each * len(corner_names)
def _calc_tflops(scalars: list, latency_ns: float) -> float:
    """Calculate achieved TFLOPS from scalar args [M, K, N] and latency.

    The operation count is ``2 * M * K * N``, using the first three scalars.
    Returns 0.0 when fewer than three scalars are given or when the latency
    is not positive.
    """
    has_dims = len(scalars) >= 3
    if not has_dims or latency_ns <= 0:
        return 0.0

    m, k, n = scalars[:3]
    seconds = latency_ns * 1e-9
    return 2.0 * m * k * n / seconds / 1e12
def _per_pe_deploy(deploy_entries: list[dict]) -> dict[int, tuple[float, int]]:
    """Aggregate deploy latency and bytes per PE.

    Entries are grouped by their ``pe`` value (0 when absent); for each PE
    the result holds ``(sum of total_ns, sum of nbytes)``, with missing
    fields counted as 0.
    """
    totals: dict[int, tuple[float, int]] = {}
    for entry in deploy_entries:
        pe = entry.get("pe", 0)
        lat = entry.get("total_ns", 0.0)
        nb = entry.get("nbytes", 0)
        seen = totals.get(pe)
        if seen is None:
            totals[pe] = (lat, nb)
        else:
            totals[pe] = (seen[0] + lat, seen[1] + nb)
    return totals