commit - release 1
This commit is contained in:
@@ -0,0 +1,64 @@
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from benches.loader import resolve_bench
|
||||
from kernbench.cli.probe import cmd_probe
|
||||
from kernbench.cli.report import format_report
|
||||
from kernbench.common.types import SimEngine
|
||||
from kernbench.runtime_api.bench_runner import run_bench
|
||||
from kernbench.runtime_api.types import DeviceSelector, resolve_device
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the ``kernbench`` command-line parser.

    Two sub-commands are registered; each stores its handler callable in the
    ``_handler`` attribute of the parsed namespace:

    * ``run``   -- handled by ``cmd_run``; requires ``--topology`` and
      ``--bench`` and accepts an optional ``--device``.
    * ``probe`` -- handled by ``cmd_probe``; requires ``--topology`` and
      accepts an optional ``--case`` (default ``"all"``).

    Returns:
        The configured top-level parser. A sub-command is mandatory.
    """
    parser = argparse.ArgumentParser(prog="kernbench")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # kernbench run
    run_cmd = commands.add_parser("run", help="Run a benchmark")
    for required_flag in ("--topology", "--bench"):
        run_cmd.add_argument(required_flag, required=True)
    run_cmd.add_argument(
        "--device",
        default=None,
        help="Target device: 'all' or 'sip:<N>' (default: all)",
    )
    run_cmd.set_defaults(_handler=cmd_run)

    # kernbench probe
    probe_cmd = commands.add_parser(
        "probe",
        help="Probe latency and BW for predefined traffic patterns",
    )
    probe_cmd.add_argument("--topology", required=True)
    probe_cmd.add_argument(
        "--case",
        default="all",
        help="Case name or 'all' (default: all)",
    )
    probe_cmd.set_defaults(_handler=cmd_probe)

    return parser
|
||||
|
||||
|
||||
def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
    """Create a ``GraphEngine`` for the given topology.

    Args:
        topology: Either an object exposing the graph through a
            ``topology_obj`` attribute, or the graph itself.
        device: Part of the factory signature; not used by this
            implementation.

    Returns:
        A ``GraphEngine`` built on the unwrapped topology object.
    """
    # Unwrap when a wrapper is passed; otherwise use the argument as-is.
    graph = getattr(topology, "topology_obj", topology)
    engine = GraphEngine(graph)
    return engine
|
||||
|
||||
|
||||
def cmd_run(args) -> int:
    """Handle ``kernbench run``: execute one benchmark and print its results.

    Resolves the topology, benchmark and device named on the command line,
    runs the benchmark, prints the formatted performance report when traces
    were collected, and then prints the result summary.

    Args:
        args: Parsed namespace providing ``topology``, ``bench`` and ``device``.

    Returns:
        0 when ``result.completion.ok`` is truthy, otherwise 1.
    """
    print("> Running benchmark with:", args)

    topology = resolve_topology(args.topology)
    outcome = run_bench(
        topology=topology,
        bench_fn=resolve_bench(args.bench),
        device=resolve_device(args.device),
        engine_factory=engine_factory,
    )

    # The report formatter reads peak figures from the topology spec, if any.
    graph = getattr(topology, "topology_obj", topology)
    topo_spec = getattr(graph, "spec", None)
    if outcome.traces:
        report = format_report(outcome.traces, title=args.bench, spec=topo_spec)
        print(report)
    print(outcome.summary_text())

    if outcome.completion.ok:
        return 0
    return 1
|
||||
|
||||
|
||||
def main(argv=None) -> int:
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read
            ``sys.argv``.

    Returns:
        The handler's return value converted to ``int``.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace._handler  # installed via set_defaults() per sub-command
    return int(handler(namespace))
|
||||
|
||||
|
||||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|
||||
@@ -0,0 +1,248 @@
|
||||
"""kernbench probe: latency and BW verification utility.
|
||||
|
||||
Runs predefined traffic patterns through the simulation engine and reports
|
||||
latency, effective bandwidth, bottleneck bandwidth, and utilization for each
|
||||
case. Validates monotonicity invariants across hop counts and access types.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from kernbench.policy.address.phyaddr import PhysAddr
|
||||
from kernbench.policy.routing.router import AddressResolver, PathRouter
|
||||
from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import load_topology
|
||||
from kernbench.topology.types import TopologyGraph
|
||||
|
||||
|
||||
# -- Helpers ----------------------------------------------------------
|
||||
|
||||
|
||||
def _hbm_pa(sip: int, cube: int, pe_id: int, spec: dict) -> int:
    """Return the encoded physical address of a probe target in a PE's HBM.

    The slice size is taken from ``spec["cube"]["memory_map"]``: the total
    HBM per cube (given in GB) converted to bytes and floor-divided by the
    number of slices per cube. The address always uses rack 0 and a fixed
    offset of ``0x1000`` within the PE's HBM.

    Args:
        sip: SIP id of the target.
        cube: Cube id of the target.
        pe_id: PE id whose HBM is addressed.
        spec: Topology spec dict.

    Returns:
        The value of ``PhysAddr.encode()`` for the constructed address.
    """
    memory_map = spec["cube"]["memory_map"]
    cube_hbm_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
    slice_size = cube_hbm_bytes // memory_map["hbm_slices_per_cube"]
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=slice_size,
    )
    return address.encode()
|
||||
|
||||
|
||||
def _build_edge_map(graph: TopologyGraph) -> dict[tuple[str, str], object]:
    """Index the graph's edges by their ``(src, dst)`` node-id pair.

    If several edges share the same pair, the last one in ``graph.edges``
    is the one kept.
    """
    by_endpoints: dict[tuple[str, str], object] = {}
    for edge in graph.edges:
        by_endpoints[(edge.src, edge.dst)] = edge
    return by_endpoints
|
||||
|
||||
|
||||
def _formula_breakdown(
    path: list[str], nbytes: int, edge_map: dict, graph: TopologyGraph,
) -> tuple[float, float, float, float]:
    """Return ``(wire_ns, overhead_ns, drain_ns, formula_ns)`` for a path.

    * ``wire_ns``     -- sum of ``distance_mm * ns_per_mm`` over the hops that
      have an entry in ``edge_map``; ``ns_per_mm`` comes from
      ``graph.spec["system"]["ns_per_mm"]`` and defaults to 0.01.
    * ``overhead_ns`` -- sum of the ``overhead_ns`` attribute (default 0.0) of
      every path node found in ``graph.nodes``.
    * ``drain_ns``    -- ``nbytes`` divided by the smallest truthy ``bw_gbs``
      along the path, or 0.0 when no hop carries a bandwidth.
    * ``formula_ns``  -- the sum of the three components.

    Hops missing from ``edge_map`` and nodes missing from ``graph.nodes`` are
    skipped rather than treated as errors.
    """
    ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)

    # Walk consecutive node pairs once, collecting both wire delay and the
    # per-hop bandwidths needed for the drain term.
    wire_ns = 0.0
    hop_bws = []
    for hop in zip(path, path[1:]):
        edge = edge_map.get(hop)
        if not edge:
            continue
        wire_ns += edge.distance_mm * ns_per_mm
        if edge.bw_gbs:
            hop_bws.append(edge.bw_gbs)

    overhead_ns = 0.0
    for node_id in path:
        node = graph.nodes.get(node_id)
        if node:
            overhead_ns += float(node.attrs.get("overhead_ns", 0.0))

    # The slowest hop bounds how fast the payload can drain.
    drain_ns = nbytes / min(hop_bws) if hop_bws else 0.0
    return wire_ns, overhead_ns, drain_ns, wire_ns + overhead_ns + drain_ns
|
||||
|
||||
|
||||
def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
    """Per-request bottleneck: single request uses one connection.

    Returns the smallest truthy ``bw_gbs`` among the hops of ``path`` that
    have an entry in ``edge_map``, or ``None`` when no hop carries a
    bandwidth (including paths with fewer than two nodes).
    """
    hop_bws = [
        edge.bw_gbs
        for hop in zip(path, path[1:])
        if (edge := edge_map.get(hop)) and edge.bw_gbs
    ]
    return min(hop_bws, default=None)
|
||||
|
||||
|
||||
|
||||
def _fmt_bw(bw: float | None) -> str:
    """Render a bandwidth with one decimal place, or ``"-"`` when it is ``None``."""
    if bw is None:
        return "-"
    return format(bw, ".1f")
|
||||
|
||||
|
||||
def _fmt_util(eff: float, bn: float | None) -> str:
    """Render ``eff / bn`` as a percentage with one decimal place.

    Returns ``"-"`` when the bottleneck bandwidth ``bn`` is ``None`` or not
    positive, since no utilisation can be computed in that case.
    """
    unavailable = bn is None or bn <= 0
    return "-" if unavailable else f"{eff / bn * 100:.1f}%"
|
||||
|
||||
|
||||
def _short_name(node_id: str) -> str:
    """Shorten a dotted node id to its last two segments.

    Two segments are kept (rather than one) to avoid ambiguity, e.g.
    ``xbar.pe0`` vs ``pe0``. Ids with fewer than two segments are returned
    unchanged.
    """
    # Slicing the last two segments also covers the single-segment case.
    tail = node_id.split(".")[-2:]
    return ".".join(tail)
|
||||
|
||||
|
||||
def _short_path(path: list[str]) -> str:
    """Render a node path as shortened node names joined by ``" -> "``."""
    return " -> ".join(map(_short_name, path))
|
||||
|
||||
|
||||
# -- Probe runner -----------------------------------------------------
|
||||
|
||||
|
||||
def run_probe(topology_path: str, case_filter: str | None = None) -> int:
    """Run the predefined probe cases against a topology and print the results.

    Two groups of cases are run, each request on a fresh ``GraphEngine``:

    * H2D writes of 4096 bytes to PE 0 of cubes 0, 4, 8 and 12 on SIP 0.
      When at least two cases ran, their latencies are checked for a strict
      increase in table order and PASS/FAIL is printed.
    * PE DMA transfers of 4096 bytes from ``sip0.cube0.pe0`` to several HBM
      targets.

    For every case the measured ``total_ns`` is printed next to a formula
    breakdown (overhead, drain, wire) of the forward path, the effective
    bandwidth (``nbytes / total_ns``), the path's bottleneck bandwidth and
    the resulting utilisation, followed by the route that was taken.

    Args:
        topology_path: Path of the topology file; ``~`` is expanded.
        case_filter: Name of a single case to run. ``None`` or ``"all"``
            runs every case.

    Returns:
        0 once the selected cases have been run and reported (the PASS/FAIL
        outcome of the monotonicity check does not affect it); 2 when
        ``case_filter`` does not name a known case.
    """
    import sys  # local import: only the unknown-case diagnostic needs it

    nbytes = 4096
    show_all = case_filter is None or case_filter == "all"

    # (name, target cube, hop count shown in the report)
    h2d_cases = [
        ("h2d-1hop", 0, 1),
        ("h2d-2hop", 4, 2),
        ("h2d-3hop", 8, 3),
        ("h2d-4hop", 12, 4),
    ]
    # (name, sip, src_cube, src_pe, dst_cube, dst_pe)
    pe_cases = [
        ("pe-local-hbm", 0, 0, 0, 0, 0),       # pe0 → slice0 (local, 256 GB/s)
        ("pe-same-half-hbm", 0, 0, 0, 0, 1),   # pe0 → slice1 (xbar chain, 128 GB/s)
        ("pe-cross-half-hbm", 0, 0, 0, 0, 4),  # pe0 → slice4 (xbar chain, 128 GB/s)
        ("pe-cross-cube-hbm", 0, 0, 0, 1, 0),  # cube0.pe0 → cube1.slice0 (NOC, 128 GB/s)
    ]

    # Reject a mistyped case name up front instead of printing empty tables
    # and reporting success.
    if not show_all:
        known = [case[0] for case in h2d_cases] + [case[0] for case in pe_cases]
        if case_filter not in known:
            print(
                f"Unknown probe case {case_filter!r}; "
                f"expected 'all' or one of: {', '.join(known)}",
                file=sys.stderr,
            )
            return 2

    path = Path(topology_path).expanduser().resolve()
    graph = load_topology(path)
    edge_map = _build_edge_map(graph)
    spec = graph.spec
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    # === H2D Write ===
    h2d_results: list[tuple[str, int, float, float, float | None]] = []
    h2d_paths: list[tuple[str, list[str], list[str], list[str]]] = []

    print()
    print("=== H2D Write Latency (IO->HBM, varying hop count) ===")
    print(f" {'Case':<14} {'Target':<16} {'Hops':>4} {'Actual':>8}"
          f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}")
    print(" " + "-" * 115)

    for name, cube, hops in h2d_cases:
        if not show_all and case_filter != name:
            continue
        engine = GraphEngine(graph)
        pa = _hbm_pa(sip=0, cube=cube, pe_id=0, spec=spec)
        msg = MemoryWriteMsg(
            correlation_id="probe", request_id=name,
            dst_sip=0, dst_cube=cube, dst_pe=0,
            dst_pa=pa, nbytes=nbytes, pattern="zero",
        )
        h = engine.submit(msg)
        engine.wait(h)
        _, trace = engine.get_completion(h)
        total_ns = trace["total_ns"]
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0

        pa_obj = PhysAddr.decode(pa)
        dst_node = resolver.resolve(pa_obj)

        pcie_ep = resolver.find_pcie_ep(0)
        io_cpu = resolver.find_io_cpu(0)
        m_cpu = resolver.find_m_cpu(0, cube)
        leg1 = router.find_node_path(pcie_ep, io_cpu)
        leg2 = router.find_node_path(io_cpu, m_cpu)
        leg3 = router.find_mcpu_dma_path(m_cpu, dst_node)
        # Each leg starts where the previous one ended; drop the shared node.
        full_path = leg1 + leg2[1:] + leg3[1:]
        bn_bw = _bottleneck_bw(full_path, edge_map)

        # Forward path breakdown only (response path is implicit in actual_ns)
        wire, ovhd, drain, _ = _formula_breakdown(full_path, nbytes, edge_map, graph)

        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0

        h2d_results.append((name, hops, total_ns, eff_bw, bn_bw))
        h2d_paths.append((name, leg1, leg2, leg3))
        print(f" {name:<14} cube{cube}.pe0{'':<8} {hops:>4} {total_ns:>8.2f}"
              f" {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
              f" {eff_bw:>8.2f} {_fmt_bw(bn_bw):>8} {_fmt_util(eff_bw, bn_bw):>6}")

    if len(h2d_results) >= 2:
        lats = [r[2] for r in h2d_results]
        mono = all(prev < nxt for prev, nxt in zip(lats, lats[1:]))
        sym = "[v]" if mono else "[x]"
        print(f" {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")

    if h2d_paths:
        print()
        print(" Route Details:")
        print(f" {'Case':<14} {'Leg':>4} Path")
        print(" " + "-" * 80)
        for name, leg1, leg2, leg3 in h2d_paths:
            print(f" {name:<14} {'L1':>4} {_short_path(leg1)}")
            print(f" {'':<14} {'L2':>4} {_short_path(leg2)}")
            print(f" {'':<14} {'L3':>4} {_short_path(leg3)}")

    # === PE DMA → HBM (direct PE-level injection) ===
    pe_results: list[tuple[str, float, float, float | None]] = []
    pe_paths: list[tuple[str, list[str]]] = []

    print()
    print("=== PE DMA Latency (pe_dma -> xbar -> HBM, direct injection) ===")
    print(f" {'Case':<22} {'Target':<28} {'Actual':>8}"
          f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}")
    print(" " + "-" * 120)

    for name, sip, src_cube, src_pe, dst_cube, dst_pe in pe_cases:
        if not show_all and case_filter != name:
            continue
        engine = GraphEngine(graph)
        dst_pa = _hbm_pa(sip=sip, cube=dst_cube, pe_id=dst_pe, spec=spec)
        msg = PeDmaMsg(
            correlation_id="probe", request_id=name,
            src_sip=sip, src_cube=src_cube, src_pe=src_pe,
            dst_pa=dst_pa, nbytes=nbytes,
        )
        h = engine.submit(msg)
        engine.wait(h)
        _, trace = engine.get_completion(h)
        total_ns = trace["total_ns"]
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0

        pe_ref = f"sip{sip}.cube{src_cube}.pe{src_pe}"
        pa_obj = PhysAddr.decode(dst_pa)
        dst_node = resolver.resolve(pa_obj)
        dma_path = router.find_path(pe_ref, dst_node)
        bn_bw = _bottleneck_bw(dma_path, edge_map)

        wire, ovhd, drain, _ = _formula_breakdown(dma_path, nbytes, edge_map, graph)

        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0

        target_str = f"c{src_cube}.pe{src_pe}->c{dst_cube}.slice{dst_pe}"
        pe_results.append((name, total_ns, eff_bw, bn_bw))
        pe_paths.append((name, dma_path))
        print(f" {name:<22} {target_str:<28} {total_ns:>8.2f}"
              f" {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
              f" {eff_bw:>8.2f} {_fmt_bw(bn_bw):>8} {_fmt_util(eff_bw, bn_bw):>6}")

    if len(pe_results) >= 2:
        # Contrast the first "local" case with the first non-local one.
        local = [r for r in pe_results if "local" in r[0]]
        chain = [r for r in pe_results if "local" not in r[0]]
        if local and chain:
            print(f" * Local BN: {_fmt_bw(local[0][3])} GB/s, "
                  f"Chain/NOC BN: {_fmt_bw(chain[0][3])} GB/s")

    if pe_paths:
        print()
        print(" Route Details:")
        print(f" {'Case':<22} Path")
        print(" " + "-" * 80)
        for name, dma_path in pe_paths:
            print(f" {name:<22} {_short_path(dma_path)}")

    print()
    return 0
|
||||
|
||||
|
||||
def cmd_probe(args) -> int:
    """Handle ``kernbench probe`` by delegating to ``run_probe``.

    Args:
        args: Parsed namespace providing ``topology`` and, optionally,
            ``case`` (treated as ``"all"`` when the attribute is absent).

    Returns:
        The return value of ``run_probe``.
    """
    selected_case = getattr(args, "case", "all")
    return run_probe(args.topology, selected_case)
|
||||
@@ -0,0 +1,175 @@
|
||||
"""Performance report formatter for bench results."""
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
# Bit width of each supported dtype name, including its spelling aliases.
_DTYPE_BITS: dict[str, int] = {
    # 16-bit floating point
    "f16": 16,
    "fp16": 16,
    "float16": 16,
    "bf16": 16,
    # 32-bit floating point
    "f32": 32,
    "fp32": 32,
    "float32": 32,
    # integers
    "i8": 8,
    "int8": 8,
    "i16": 16,
    "int16": 16,
    "i32": 32,
    "int32": 32,
}
|
||||
|
||||
|
||||
def format_report(
    traces: list[dict],
    title: str = "Benchmark",
    spec: dict | None = None,
) -> str:
    """Format collected traces into a human-readable performance report.

    Traces whose ``phase`` is ``"kernel"`` are listed in the kernel table;
    every other trace is listed in the command table. A per-PE summary is
    appended when the non-kernel traces cover more than one PE.

    Args:
        traces: Trace dicts. Missing keys fall back to defaults, so partial
            traces are tolerated.
        title: Name shown in the report's title line.
        spec: topology spec dict for peak TFLOPS / BW extraction. With
            ``None`` the peaks are 0 and utilisation figures are reported
            as 0.

    Returns:
        The report as a single newline-joined string.
    """
    peak_tflops_f16, peak_hbm_bw_gbs = _extract_peaks(spec)
    num_pes = _count_pes(spec)

    deploy_entries = [t for t in traces if t.get("phase") != "kernel"]
    kernel_entries = [t for t in traces if t.get("phase") == "kernel"]

    # Built once: this header is both the command table's column row and the
    # width every title separator is padded to.
    cmd_hdr = (f"{'Cmd':<10} {'Name':<12} {'SIP':>4} {'Cube':>5} {'PE':>4} {'Bytes':>10} "
               f"{'Lat(ns)':>10} {'Xfer(ns)':>10} {'Proc(ns)':>10} "
               f"{'BW(GB/s)':>10} {'MinBW':>10} {'Util%':>7}")
    report_width = len(cmd_hdr)

    lines: list[str] = []

    # ── Title ──
    title_line = f"-- {title} Performance Report "
    lines.append(title_line + "-" * max(0, report_width - len(title_line)))

    # ── Command summary ──
    if deploy_entries:
        lines.append("")
        lines.append(cmd_hdr)
        lines.append("-" * report_width)
        for e in deploy_entries:
            lat = e.get("total_ns", 0.0)
            nb = e.get("nbytes", 0)
            sip = e.get("sip", "-")
            pe = e.get("pe", "-")
            cube = e.get("cube", "-")
            cmd = e.get("phase", "deploy")
            xfer_ns = e.get("xfer_ns", 0.0)
            # Without a transfer time the latency cannot be split, so the
            # derived columns are reported as 0.
            proc_ns = lat - xfer_ns if xfer_ns > 0 else 0.0
            bw = nb / lat if lat > 0 else 0.0
            min_bw = nb / xfer_ns if xfer_ns > 0 else 0.0
            util = (xfer_ns / lat * 100) if lat > 0 and xfer_ns > 0 else 0.0
            lines.append(
                f"{cmd:<10} {e.get('name', '?'):<12} {str(sip):>4} {str(cube):>5} {str(pe):>4} {nb:>10} "
                f"{lat:>10.1f} {xfer_ns:>10.1f} {proc_ns:>10.1f} "
                f"{bw:>10.1f} {min_bw:>10.1f} {util:>6.1f}%"
            )

    # ── Kernel summary ──
    if kernel_entries:
        lines.append("")
        k_hdr = (f"{'Phase':<10} {'Name':<12} {'PE':>4} {'E2E(ns)':>10} "
                 f"{'PE(ns)':>10} {'DMA(ns)':>10} {'Comp(ns)':>10} "
                 f"{'Bound':<8} {'TFLOPS':>8} {'Peak':>8} {'Util%':>7}")
        lines.append(k_hdr)
        lines.append("-" * len(k_hdr))
        for e in kernel_entries:
            e2e_ns = e.get("total_ns", 0.0)
            pe_ns = e.get("pe_exec_ns", e2e_ns)
            dma_ns = e.get("dma_ns", 0.0)
            compute_ns = e.get("compute_ns", 0.0)
            target_pe = e.get("target_pe", "-")
            scalars = e.get("scalars", [])
            pe_str = "all" if target_pe == "all" else str(target_pe)
            # A kernel aimed at "all" is measured against every PE's peak.
            n_active = num_pes if target_pe == "all" else 1

            # Bound indicator based on measured DMA vs compute time
            if dma_ns > 0 or compute_ns > 0:
                bound = "memory" if dma_ns >= compute_ns else "compute"
            else:
                bound = "-"

            achieved = _calc_tflops(scalars, pe_ns)
            peak_total = peak_tflops_f16 * n_active
            util = (achieved / peak_total * 100) if peak_total > 0 else 0.0
            lines.append(
                f"{'kernel':<10} {e.get('name', '?'):<12} {pe_str:>4} {e2e_ns:>10.1f} "
                f"{pe_ns:>10.1f} {dma_ns:>10.1f} {compute_ns:>10.1f} "
                f"{bound:<8} {achieved:>8.3f} {peak_total:>8.1f} {util:>6.1f}%"
            )

    # ── Per-PE summary ──
    pe_deploy = _per_pe_deploy(deploy_entries)
    if len(pe_deploy) > 1:
        lines.append("")
        pe_title = (f"-- Per-PE Summary (peak: {peak_tflops_f16:.1f} TFLOPS/PE, "
                    f"{peak_hbm_bw_gbs:.0f} GB/s HBM BW) ")
        pe_hdr = (f"{'PE':>4} {'Deploy(ns)':>10} {'BW(GB/s)':>10} {'BW Util':>8} "
                  f"{'Kernel(ns)':>10} {'TFLOPS':>8} {'Util':>7}")
        pe_width = max(len(pe_title), len(pe_hdr))
        lines.append(pe_title + "-" * max(0, pe_width - len(pe_title)))
        lines.append(pe_hdr)
        lines.append("-" * pe_width)

        # Kernel time is summed over all kernel traces; the scalars of the
        # first kernel trace are used for the FLOP count, and the result is
        # split evenly across the PEs that received deploys.
        k_ns = sum(e.get("pe_exec_ns", e.get("total_ns", 0.0)) for e in kernel_entries)
        k_scalars = kernel_entries[0].get("scalars", []) if kernel_entries else []
        n_active = len(pe_deploy)
        total_achieved = _calc_tflops(k_scalars, k_ns)
        per_pe_tflops = total_achieved / n_active if n_active > 0 else 0.0
        pe_util = (per_pe_tflops / peak_tflops_f16 * 100) if peak_tflops_f16 > 0 else 0.0

        for pe_id in sorted(pe_deploy):
            d_ns, d_bytes = pe_deploy[pe_id]
            d_bw = d_bytes / d_ns if d_ns > 0 else 0.0
            d_util = (d_bw / peak_hbm_bw_gbs * 100) if peak_hbm_bw_gbs > 0 else 0.0
            lines.append(
                f"{pe_id:>4} {d_ns:>10.1f} {d_bw:>10.1f} {d_util:>7.1f}% "
                f"{k_ns:>10.1f} {per_pe_tflops:>8.3f} {pe_util:>6.1f}%"
            )
    # Trailing empty entry so the joined report ends with a newline.
    lines.append("")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def _extract_peaks(spec: dict | None) -> tuple[float, float]:
    """Extract peak TFLOPS (f16) and HBM BW (GB/s) from spec.

    The TFLOPS figure is read from
    ``cube.pe_template.components.pe_gemm.attrs.peak_tflops_f16`` and the
    bandwidth from ``cube.links.xbar_to_hbm_bw_gbs``. Any missing level, or
    ``spec`` being ``None``, yields 0.0 for the affected value.
    """
    if spec is None:
        return 0.0, 0.0

    cube_spec = spec.get("cube", {})
    gemm_attrs = (
        cube_spec.get("pe_template", {})
        .get("components", {})
        .get("pe_gemm", {})
        .get("attrs", {})
    )
    link_attrs = cube_spec.get("links", {})
    return (
        float(gemm_attrs.get("peak_tflops_f16", 0.0)),
        float(link_attrs.get("xbar_to_hbm_bw_gbs", 0.0)),
    )
|
||||
|
||||
|
||||
def _count_pes(spec: dict | None) -> int:
    """Return the number of PEs per cube described by ``spec``.

    Computed as ``pe_per_corner * len(corners)`` from ``cube.pe_layout``,
    defaulting to 2 PEs on each of the four corners NW/NE/SW/SE. Without a
    spec the result is 8.
    """
    if spec is None:
        return 8

    layout = spec.get("cube", {}).get("pe_layout", {})
    pes_each = layout.get("pe_per_corner", 2)
    corner_names = layout.get("corners", ["NW", "NE", "SW", "SE"])
    return pes_each * len(corner_names)
|
||||
|
||||
|
||||
def _calc_tflops(scalars: list, latency_ns: float) -> float:
    """Calculate achieved TFLOPS from scalar args [M, K, N] and latency.

    The operation count is taken as ``2 * M * K * N`` using the first three
    scalars. Returns 0.0 when fewer than three scalars are given or the
    latency is not positive.
    """
    too_few_scalars = len(scalars) < 3
    if too_few_scalars or latency_ns <= 0:
        return 0.0

    m, k, n = scalars[:3]
    seconds = latency_ns * 1e-9
    return 2.0 * m * k * n / seconds / 1e12
|
||||
|
||||
|
||||
def _per_pe_deploy(deploy_entries: list[dict]) -> dict[int, tuple[float, int]]:
    """Aggregate deploy latency and bytes per PE.

    Entries are grouped by their ``pe`` key (0 when absent); ``total_ns`` and
    ``nbytes`` (0 when absent) are summed within each group.

    Returns:
        Mapping of PE id to ``(summed latency in ns, summed bytes)``.
    """
    totals: dict[int, tuple[float, int]] = {}
    for entry in deploy_entries:
        pe_key = entry.get("pe", 0)
        latency = entry.get("total_ns", 0.0)
        size = entry.get("nbytes", 0)
        try:
            prev_ns, prev_bytes = totals[pe_key]
        except KeyError:
            # First entry seen for this PE.
            totals[pe_key] = (latency, size)
        else:
            totals[pe_key] = (prev_ns + latency, prev_bytes + size)
    return totals
|
||||
Reference in New Issue
Block a user