Honest measured pipeline efficiency: two timing fixes
Two related issues caused measured pipeline efficiency to look worse than the simulator's actual behavior: 1. DMA timing recorded too early. The op-log start timestamp for a DMA op was recorded when the request entered the queue, and the DMA channel was released as soon as the request was issued. Back-to-back DMAs therefore appeared to grab the channel simultaneously, with per-op duration drifting upward as queue depth grew — an artifact, not real cost. Fix: defer the start timestamp until after the channel is acquired, and hold the channel through the full HBM round-trip until the response returns. Per-op duration is now constant and equal to the actual transfer interval; serialization is visible as queue wait, not as inflated service time. 2. Sweep timing window folded in pre-composite work. The PE timing window spanned every PE engine record, which included the upfront pinned-operand DMA issued before the composite GEMM begins. For large-K shapes that one-shot load can be nearly half of the window, conflating operand-staging cost with composite-pipeline behavior. Fix: add a second window scoped to the composite pipeline by filtering op_log records to those tagged with a tile-pipeline stage; the legacy operand-load path is untagged and naturally excluded. For the 32x3072x32 load_ref shape, the window drops from 1765 ns to 992 ns and measured efficiency lines up with the steady-state DMA-bound stage limit instead of being penalized for the one-time load. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -114,23 +114,19 @@ SLIDES = [
|
||||
"render": "hbm_topology",
|
||||
},
|
||||
{
|
||||
"title": "14. GEMM Utilization + Useful Pipeline Efficiency (load_ref)",
|
||||
"render": "mac_utilization",
|
||||
"title": "14. Formula vs Measured Useful Eff (load_ref) — sim validates model",
|
||||
"render": "mac_utilization_formula_vs_measured",
|
||||
},
|
||||
{
|
||||
"title": "15. GEMM Utilization + Useful Pipeline Efficiency (ref_ref — both A & B via DMA_R)",
|
||||
"render": "mac_utilization_ref_ref",
|
||||
},
|
||||
{
|
||||
"title": "16. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)",
|
||||
"title": "15. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)",
|
||||
"render": "pipeline_eff_walkthrough",
|
||||
},
|
||||
{
|
||||
"title": "17. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)",
|
||||
"title": "16. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)",
|
||||
"render": "pipeline_eff_walkthrough_largeK",
|
||||
},
|
||||
{
|
||||
"title": "18. Useful Pipelined Efficiency (ideal pipeline × GEMM util)",
|
||||
"title": "17. Useful Pipelined Efficiency (ideal pipeline × GEMM util)",
|
||||
"render": "tflops_table",
|
||||
},
|
||||
]
|
||||
@@ -1715,12 +1711,12 @@ def _render_mac_utilization(slide):
|
||||
"Useful eff": "Useful eff %",
|
||||
}
|
||||
|
||||
_textbox(slide, 0.4, 1.0, 12.6, 0.70,
|
||||
f"GEMM util = useful FLOPs ÷ (tile FLOPs × tile count) — pure "
|
||||
f"shape-vs-tile metric. "
|
||||
f"Useful eff = (N_tiles × T_stage × GEMM_util) ÷ wall — "
|
||||
f"useful FLOPs delivered as a fraction of peak over the "
|
||||
f"ideal-pipelined wall (head + K-loop + inter-(m,n) DMA_W).",
|
||||
_textbox(slide, 0.4, 1.0, 12.6, 0.80,
|
||||
"FORMULA-generated (analytical ideal-pipeline model — not "
|
||||
"simulator data). GEMM util = useful FLOPs ÷ (tile FLOPs × "
|
||||
"tile count). Useful eff = (N_tiles × T_stage × GEMM_util) "
|
||||
"÷ wall, where wall = head + K-loop + inter-(m,n) DMA_W. "
|
||||
"Slide 16 overlays this against measured pe_window_ns.",
|
||||
size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
|
||||
|
||||
_draw_native_bar_chart(
|
||||
@@ -1819,11 +1815,12 @@ def _render_mac_utilization_ref_ref(slide):
|
||||
"Useful eff": "Useful eff % (ref_ref)",
|
||||
}
|
||||
|
||||
_textbox(slide, 0.4, 1.0, 12.6, 0.75,
|
||||
"ref_ref: scheduler issues DMA_R for BOTH A and B every tile. "
|
||||
"Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH and GEMM stay "
|
||||
"at 16 ns. Pipeline cycle is DMA-bound → useful eff caps near "
|
||||
"50 % × GEMM_util, regardless of K-loop length.",
|
||||
_textbox(slide, 0.4, 1.0, 12.6, 0.85,
|
||||
"FORMULA-generated (ideal-pipeline model, ref_ref variant — "
|
||||
"not simulator data). Scheduler issues DMA_R for BOTH A and B "
|
||||
"every tile. Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH "
|
||||
"and GEMM stay at 16 ns. Pipeline cycle is DMA-bound → useful "
|
||||
"eff caps near 50 % × GEMM_util, regardless of K-loop length.",
|
||||
size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
|
||||
|
||||
_draw_native_bar_chart(
|
||||
@@ -1846,6 +1843,136 @@ def _render_mac_utilization_ref_ref(slide):
|
||||
)
|
||||
|
||||
|
||||
def _render_mac_utilization_formula_vs_measured(slide):
    """Overlay slide: formula useful_eff vs measured useful_eff (load_ref).

    Four bar series are drawn per load_ref shape:

    GEMM util (formula)  = useful_FLOPs / (tile_FLOPs × tile_count_expected) × 100
    GEMM util (measured) = useful_FLOPs / (tile_FLOPs × GEMM record_count) × 100
        record_count is read from the row's ``stages["GEMM"]`` entry and
        falls back to tile_count_expected when that entry is absent or zero.
    Formula eff  = (N_tiles × T_stage × GEMM_util) / wall_formula × 100
        wall_formula = head + N_tiles·T_stage + inter·DMA_W
    Measured eff = (useful_FLOPs / composite_window_ns) / peak_FLOPs_per_ns × 100
        composite_window_ns is a per-row field of the sweep data; per the
        on-slide text it covers only the tl.composite pipeline and excludes
        the upfront tl.load.

    Agreement validates the analytical model against the simulator.

    Args:
        slide: The slide object handed to ``_textbox`` and
            ``_draw_native_bar_chart``; it is only passed through here.

    Returns:
        None. Output is drawn onto ``slide``. When the sweep data has no
        rows, or no ``load_ref`` rows, only a red notice is drawn.
    """
    data = _load_sweep_data()
    rows = data["rows"]
    if not rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return
    tile = data["tile_sizes"]
    TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"]
    tile_flops = 2 * TILE_M * TILE_K * TILE_N

    # Analytical-model constants (hard-coded here, not read from sweep data).
    HBM_GBS = 256.0   # divides a byte count to give ns below, i.e. bytes/ns
    bpe = 2           # presumably bytes per element — TODO confirm dtype
    T_STAGE = 16.0    # per-stage time in ns
    D_STAGES = 3      # presumably pipeline depth; head = (D_STAGES - 1) stages
    head_ns = (D_STAGES - 1) * T_STAGE
    dma_w_per_pair_ns = (TILE_M * TILE_N * bpe) / HBM_GBS
    peak_per_ns = tile_flops / T_STAGE  # MAC peak throughput, flops/ns

    # Keep only load_ref rows, keyed by shape. A later row with the same
    # (M, K, N) replaces an earlier one but keeps its original position.
    by_shape: dict[tuple, dict] = {
        (r["M"], r["K"], r["N"]): r
        for r in rows
        if r["variant"] == "load_ref"
    }
    if not by_shape:
        # Sweep data exists but holds no load_ref rows: say so explicitly
        # instead of drawing an empty chart.
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No load_ref rows in sweep data. "
                 "Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return
    shapes = list(by_shape)

    shape_labels = [_shape_label(by_shape[k]) for k in shapes]
    flagged = [_under_tile(k[0], k[1], k[2], TILE_M, TILE_K, TILE_N)
               for k in shapes]
    tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes]

    gemm_util_formula: list[float] = []
    gemm_util_measured: list[float] = []
    formula_eff: list[float] = []
    measured_eff: list[float] = []
    for k in shapes:
        r = by_shape[k]
        M, K, N = r["M"], r["K"], r["N"]
        useful = 2 * M * K * N
        tiles = r["tile_count_expected"]
        # Guarded like the measured branch below so a zero tile count
        # yields 0 % instead of raising ZeroDivisionError.
        gu_formula = useful / (tile_flops * tiles) if tiles > 0 else 0.0
        gemm_util_formula.append(gu_formula * 100)

        # Measured GEMM util uses the GEMM stage record count from op_log
        # (i.e. the actual number of GEMM tiles the simulator ran).
        gemm_record_count = (
            r.get("stages", {}).get("GEMM", {}).get("record_count", 0)
            or tiles  # fallback if stages dict missing for older sweep data
        )
        gu_measured = (
            useful / (tile_flops * gemm_record_count)
            if gemm_record_count > 0 else 0.0
        )
        gemm_util_measured.append(gu_measured * 100)

        # Formula wall = pipeline head + one stage per tile + one DMA_W
        # between consecutive (m, n) output tiles.
        m_tiles = (M + TILE_M - 1) // TILE_M
        n_tiles = (N + TILE_N - 1) // TILE_N
        n_mn = m_tiles * n_tiles
        compute_total = tiles * T_STAGE
        inter_dma_w = max(0, n_mn - 1) * dma_w_per_pair_ns
        wall_formula = head_ns + compute_total + inter_dma_w
        feff = (
            (compute_total * gu_formula / wall_formula) * 100
            if wall_formula > 0 else 0.0
        )
        formula_eff.append(feff)

        # Measured efficiency is relative to the composite-only window;
        # a missing/zero window is plotted as 0 %.
        comp_window_ns = r.get("composite_window_ns", 0.0) or 0.0
        if comp_window_ns > 0:
            meff = (useful / comp_window_ns / peak_per_ns) * 100
        else:
            meff = 0.0
        measured_eff.append(meff)

    series = {
        "GEMM util F": gemm_util_formula,
        "GEMM util M": gemm_util_measured,
        "Formula eff": formula_eff,
        "Measured eff": measured_eff,
    }
    colors_map = {
        "GEMM util F": COL_FS,  # emerald (formula ceiling)
        "GEMM util M": RGBColor(0x6E, 0xE7, 0xB7),  # mint (measured ceiling)
        "Formula eff": RGBColor(0xF5, 0x9E, 0x0B),  # amber (formula eff)
        "Measured eff": COL_DMA,  # blue (measured eff)
    }
    display_map = {
        "GEMM util F": "GEMM util % (formula)",
        "GEMM util M": "GEMM util % (measured, op_log)",
        "Formula eff": "Formula useful eff %",
        "Measured eff": "Measured useful eff %",
    }

    _textbox(slide, 0.4, 1.0, 12.6, 1.10,
             "Four series per shape:\n"
             " • GEMM util (formula, green): useful_FLOPs ÷ (tile_FLOPs × "
             "tile_count_expected) — structural shape-vs-tile ceiling.\n"
             " • GEMM util (measured, mint): useful_FLOPs ÷ (tile_FLOPs × "
             "GEMM_record_count) — uses the actual GEMM ops the simulator "
             "ran. Should equal formula → validates plan execution.\n"
             " • Formula useful eff (amber): GEMM_util × ideal pipeline efficiency.\n"
             " • Measured useful eff (blue): (useful_FLOPs ÷ "
             "composite_window_ns) ÷ peak — composite_window_ns covers "
             "only the tl.composite pipeline (excludes upfront tl.load).",
             size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)

    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=2.25, plot_w=10.0, plot_h=3.85,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=colors_map,
        display_map=display_map,
        wall_clocks=None,
        y_label="%",
        legend_x=11.4, legend_w=1.85,
        foot_note=("Reading: GEMM util F = GEMM util M → simulator ran the "
                   "expected tile plan. Measured eff uses composite-only "
                   "window (excludes upfront tl.load), so it isolates "
                   "pipeline efficiency. Variant: load_ref."),
        threshold_line=100.0,
    )
|
||||
|
||||
|
||||
def _render_tflops_table(slide):
|
||||
"""Ideal pipelined pipe_eff: assumes non-blocking tl.load + multi-channel HBM.
|
||||
|
||||
@@ -2282,8 +2409,9 @@ _RENDERERS.update({
|
||||
"variant_comparison": _render_variant_comparison,
|
||||
"hbm_topology": _render_hbm_topology,
|
||||
"per_op_dma": _render_per_op_dma,
|
||||
"mac_utilization": _render_mac_utilization,
|
||||
"mac_utilization_ref_ref": _render_mac_utilization_ref_ref,
|
||||
"mac_utilization": _render_mac_utilization,
|
||||
"mac_utilization_ref_ref": _render_mac_utilization_ref_ref,
|
||||
"mac_utilization_formula_vs_measured": _render_mac_utilization_formula_vs_measured,
|
||||
"tflops_table": _render_tflops_table,
|
||||
"pipeline_eff_walkthrough": _render_pipeline_eff_walkthrough,
|
||||
"pipeline_eff_walkthrough_largeK": _render_pipeline_eff_walkthrough_largeK,
|
||||
|
||||
Reference in New Issue
Block a user