eb792e6212
- Delete xbar.py and noc.py (TwoDMeshNocComponent) — unused since router mesh - Remove xbar_v1/noc_2d_mesh_v1 from components.yaml - Fix pe_to_xbar → pe_to_router in routing exclusion set - Fix xbar_to_hbm_bw_gbs → hbm_to_router_bw_gbs in report.py - Update all docstrings/comments referencing xbar/bridge → router mesh - Cube-view connectors: rule-based _connector_points helper - PE↔router: single diagonal line (not chevron) - UCIe N/S: 45°→horizontal→45° - UCIe E/W: 45°→vertical→45° - HBM ports: 45°→horizontal→45° Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
176 lines
7.0 KiB
Python
176 lines
7.0 KiB
Python
"""Performance report formatter for bench results."""
|
|
from __future__ import annotations
|
|
|
|
|
|
_DTYPE_BITS: dict[str, int] = {
|
|
"f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
|
|
"f32": 32, "fp32": 32, "float32": 32,
|
|
"i8": 8, "int8": 8, "i16": 16, "int16": 16, "i32": 32, "int32": 32,
|
|
}
|
|
|
|
|
|
def format_report(
    traces: list[dict],
    title: str = "Benchmark",
    spec: dict | None = None,
) -> str:
    """Render collected trace entries as a plain-text performance report.

    The report starts with a banner line and then contains up to three
    tables, each introduced by a blank line:

    * a command table with one row per trace whose ``"phase"`` is not
      ``"kernel"``;
    * a kernel table with one row per trace whose ``"phase"`` is ``"kernel"``;
    * a per-PE table, emitted only when the non-kernel traces cover more
      than one PE.

    Args:
        traces: Trace dicts. Every key is optional; absent keys fall back to
            the defaults used in the section helpers.
        title: Text embedded in the banner line.
        spec: Topology spec dict handed to ``_extract_peaks`` and
            ``_count_pes`` to obtain peak TFLOPS / HBM bandwidth and the PE
            count.

    Returns:
        All report lines joined with ``"\\n"``.
    """
    peak_tflops_f16, peak_hbm_bw_gbs = _extract_peaks(spec)
    num_pes = _count_pes(spec)

    banner = f"-- {title} Performance Report "

    # Split the traces once: kernel launches versus everything else.
    deploy_entries: list[dict] = []
    kernel_entries: list[dict] = []
    for trace in traces:
        bucket = kernel_entries if trace.get("phase") == "kernel" else deploy_entries
        bucket.append(trace)

    # The command-table header also fixes the width of the banner rule, so it
    # is built even when there are no command rows to print.
    command_header = (
        f"{'Cmd':<10} {'Name':<12} {'SIP':>4} {'Cube':>5} {'PE':>4} {'Bytes':>10} "
        f"{'Lat(ns)':>10} {'Xfer(ns)':>10} {'Proc(ns)':>10} "
        f"{'BW(GB/s)':>10} {'MinBW':>10} {'Util%':>7}"
    )
    report: list[str] = [banner.ljust(len(command_header), "-")]

    if deploy_entries:
        report.extend(_report_command_rows(deploy_entries, command_header))

    if kernel_entries:
        report.extend(_report_kernel_rows(kernel_entries, peak_tflops_f16, num_pes))

    per_pe = _per_pe_deploy(deploy_entries)
    if len(per_pe) > 1:
        report.extend(
            _report_per_pe_rows(per_pe, kernel_entries, peak_tflops_f16, peak_hbm_bw_gbs)
        )

    return "\n".join(report)


def _report_command_rows(entries: list[dict], header: str) -> list[str]:
    """Return the command table: blank line, header, rule, one row per entry."""
    rows = ["", header, "-" * len(header)]
    for entry in entries:
        total_ns = entry.get("total_ns", 0.0)
        nbytes = entry.get("nbytes", 0)
        xfer_ns = entry.get("xfer_ns", 0.0)
        phase = entry.get("phase", "deploy")
        sip = entry.get("sip", "-")
        cube = entry.get("cube", "-")
        pe = entry.get("pe", "-")

        has_xfer = xfer_ns > 0
        has_latency = total_ns > 0

        # Without a recorded transfer time the latency cannot be split, so
        # the processing share is reported as zero.
        proc_ns = total_ns - xfer_ns if has_xfer else 0.0
        # bytes per nanosecond is numerically GB/s.
        bw = nbytes / total_ns if has_latency else 0.0
        min_bw = nbytes / xfer_ns if has_xfer else 0.0
        util = xfer_ns / total_ns * 100 if has_latency and has_xfer else 0.0

        rows.append(
            f"{phase:<10} {entry.get('name', '?'):<12} {str(sip):>4} {str(cube):>5} {str(pe):>4} {nbytes:>10} "
            f"{total_ns:>10.1f} {xfer_ns:>10.1f} {proc_ns:>10.1f} "
            f"{bw:>10.1f} {min_bw:>10.1f} {util:>6.1f}%"
        )
    return rows


def _report_kernel_rows(
    entries: list[dict],
    peak_tflops_f16: float,
    num_pes: int,
) -> list[str]:
    """Return the kernel table: blank line, header, rule, one row per kernel."""
    header = (
        f"{'Phase':<10} {'Name':<12} {'PE':>4} {'E2E(ns)':>10} "
        f"{'PE(ns)':>10} {'DMA(ns)':>10} {'Comp(ns)':>10} "
        f"{'Bound':<8} {'TFLOPS':>8} {'Peak':>8} {'Util%':>7}"
    )
    rows = ["", header, "-" * len(header)]
    for entry in entries:
        e2e_ns = entry.get("total_ns", 0.0)
        # PE execution time falls back to the end-to-end time when absent.
        pe_ns = entry.get("pe_exec_ns", e2e_ns)
        dma_ns = entry.get("dma_ns", 0.0)
        compute_ns = entry.get("compute_ns", 0.0)
        target_pe = entry.get("target_pe", "-")

        # A kernel targeting "all" is measured against the peak of every PE;
        # any other target counts as a single PE.
        if target_pe == "all":
            pe_label, active_pes = "all", num_pes
        else:
            pe_label, active_pes = str(target_pe), 1

        # Bound indicator from measured DMA vs compute time; ties count as
        # memory-bound, and "-" means neither time was recorded.
        if not (dma_ns > 0 or compute_ns > 0):
            bound = "-"
        elif dma_ns >= compute_ns:
            bound = "memory"
        else:
            bound = "compute"

        achieved = _calc_tflops(entry.get("scalars", []), pe_ns)
        peak_total = peak_tflops_f16 * active_pes
        util = achieved / peak_total * 100 if peak_total > 0 else 0.0

        rows.append(
            f"{'kernel':<10} {entry.get('name', '?'):<12} {pe_label:>4} {e2e_ns:>10.1f} "
            f"{pe_ns:>10.1f} {dma_ns:>10.1f} {compute_ns:>10.1f} "
            f"{bound:<8} {achieved:>8.3f} {peak_total:>8.1f} {util:>6.1f}%"
        )
    return rows


def _report_per_pe_rows(
    per_pe: dict,
    kernel_entries: list[dict],
    peak_tflops_f16: float,
    peak_hbm_bw_gbs: float,
) -> list[str]:
    """Return the per-PE table, including its leading and trailing blank line.

    ``per_pe`` maps a PE id to ``(deploy latency in ns, deployed bytes)``.
    The caller only invokes this when ``per_pe`` has more than one key.
    """
    section_title = (
        f"-- Per-PE Summary (peak: {peak_tflops_f16:.1f} TFLOPS/PE, "
        f"{peak_hbm_bw_gbs:.0f} GB/s HBM BW) "
    )
    header = (
        f"{'PE':>4} {'Deploy(ns)':>10} {'BW(GB/s)':>10} {'BW Util':>8} "
        f"{'Kernel(ns)':>10} {'TFLOPS':>8} {'Util':>7}"
    )
    width = max(len(section_title), len(header))
    rows = ["", section_title.ljust(width, "-"), header, "-" * width]

    # Kernel figures are identical on every row: total kernel time, and the
    # achieved TFLOPS split evenly across the PEs that received deploys.
    # NOTE(review): the time is summed over all kernel entries but the
    # scalars come from the first entry only - confirm this is intended when
    # a run contains several kernels.
    kernel_ns = sum(
        k.get("pe_exec_ns", k.get("total_ns", 0.0)) for k in kernel_entries
    )
    first_scalars = kernel_entries[0].get("scalars", []) if kernel_entries else []
    # len(per_pe) is at least 2 here, so the division is always defined.
    per_pe_tflops = _calc_tflops(first_scalars, kernel_ns) / len(per_pe)
    if peak_tflops_f16 > 0:
        pe_util = per_pe_tflops / peak_tflops_f16 * 100
    else:
        pe_util = 0.0

    for pe_id in sorted(per_pe):
        deploy_ns, deploy_bytes = per_pe[pe_id]
        deploy_bw = deploy_bytes / deploy_ns if deploy_ns > 0 else 0.0
        bw_util = deploy_bw / peak_hbm_bw_gbs * 100 if peak_hbm_bw_gbs > 0 else 0.0
        rows.append(
            f"{pe_id:>4} {deploy_ns:>10.1f} {deploy_bw:>10.1f} {bw_util:>7.1f}% "
            f"{kernel_ns:>10.1f} {per_pe_tflops:>8.3f} {pe_util:>6.1f}%"
        )

    rows.append("")
    return rows
|
|
|
|
|
|
def _extract_peaks(spec: dict | None) -> tuple[float, float]:
|
|
"""Extract peak TFLOPS (f16) and HBM BW (GB/s) from spec."""
|
|
if spec is None:
|
|
return 0.0, 0.0
|
|
cube = spec.get("cube", {})
|
|
pe_template = cube.get("pe_template", {})
|
|
comps = pe_template.get("components", {})
|
|
gemm_attrs = comps.get("pe_gemm", {}).get("attrs", {})
|
|
peak_tflops = float(gemm_attrs.get("peak_tflops_f16", 0.0))
|
|
cube_links = cube.get("links", {})
|
|
hbm_bw = float(cube_links.get("hbm_to_router_bw_gbs", 0.0))
|
|
return peak_tflops, hbm_bw
|
|
|
|
|
|
def _count_pes(spec: dict | None) -> int:
|
|
if spec is None:
|
|
return 8
|
|
cube = spec.get("cube", {})
|
|
layout = cube.get("pe_layout", {})
|
|
per_corner = layout.get("pe_per_corner", 2)
|
|
corners = len(layout.get("corners", ["NW", "NE", "SW", "SE"]))
|
|
return per_corner * corners
|
|
|
|
|
|
def _calc_tflops(scalars: list, latency_ns: float) -> float:
|
|
"""Calculate achieved TFLOPS from scalar args [M, K, N] and latency."""
|
|
if len(scalars) < 3 or latency_ns <= 0:
|
|
return 0.0
|
|
m, k, n = scalars[0], scalars[1], scalars[2]
|
|
flops = 2.0 * m * k * n
|
|
return flops / (latency_ns * 1e-9) / 1e12
|
|
|
|
|
|
def _per_pe_deploy(deploy_entries: list[dict]) -> dict[int, tuple[float, int]]:
|
|
"""Aggregate deploy latency and bytes per PE."""
|
|
result: dict[int, tuple[float, int]] = {}
|
|
for e in deploy_entries:
|
|
pe = e.get("pe", 0)
|
|
lat = e.get("total_ns", 0.0)
|
|
nb = e.get("nbytes", 0)
|
|
if pe in result:
|
|
old_ns, old_bytes = result[pe]
|
|
result[pe] = (old_ns + lat, old_bytes + nb)
|
|
else:
|
|
result[pe] = (lat, nb)
|
|
return result
|