Files
kernbench2/scripts/build_overview_slides.py
T
mukesh ff7d727ddd CCL allreduce: rename to lrab_hierarchical_allreduce + descriptive plots
Rename the intercube all-reduce identity to lrab_hierarchical_allreduce
(module, config key, distributed test) so the name reflects both levels
it implements: LRAB intra-SIP (local reduce to center root + broadcast)
and the hierarchical inter-SIP topology exchange (ring/torus/mesh).
ADR-0032 slug kept as the stable decision id; pure rename, no logic change.

Also in this batch:
- ADR-0032 (EN+KO): document the shipped center-root bidirectional reduce
  (doc was stale corner-root); annotate ccl.yaml root_cube as a placeholder.
- Rename allreduce + pe2pe latency plots to descriptive, title-matching
  filenames and retitle the in-plot headings; drop overview/overview_log.
- Point the PPTX image refs at the new plot names.

Doc + derived-artifact + rename only; no simulation behavior changed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 20:50:48 -07:00

2455 lines
98 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Generate a multi-slide PPTX summarizing the kernbench2 model.
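Slides (numbering follows the SLIDES table below):
1. Overall architecture — how PEs are connected (cube_mesh_view)
2. Model correctness — DMA vs P2P latency (pe2pe overview)
3. PE-to-PE IPCQ communication (ipcq_two_pe_dma)
4. 6-device allreduce — model vs theoretical vs FSIM (comparison_…_fsim)
5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (…_with_TCM_SRAM_HBM)
6. PE_accelerator data path (composite GEMM pipeline structure)
7. PE_SCHEDULER — plan generation and tile dispatch
8. matmul(32, 128, 32) — composite GEMM execution sequence
9. matmul(32, 128, 128) — pipeline scaling and HBM contention
10. Tiling walkthrough 32×128×32 — K-loop only, no inter-(m,n) flush
11. Tiling walkthrough 32×128×128 — K-loop and inter-(m,n) flushes
12. GEMM sweep — stage wall-clock (load_ref)
13. Why DMA isn't local — cube-shared HBM path
14. Formula vs measured useful efficiency (load_ref)
15. Pipeline efficiency walkthrough — 32×128×128 (with inter flushes)
16. Pipeline efficiency walkthrough — 32×3072×32 (large K, no flushes)
17. Useful pipelined efficiency (ideal pipeline × GEMM util)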
This is a derived-artifact generator — no production code touched.
"""
from __future__ import annotations
from pathlib import Path
from PIL import Image
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.enum.shapes import MSO_SHAPE
from pptx.enum.text import PP_ALIGN
from pptx.util import Emu, Inches, Pt
# Directory two levels above this script (the parent of the scripts folder).
ROOT = Path(__file__).resolve().parent.parent
# Diagram assets live under docs/diagrams; the generated deck is written there too.
DIAG = ROOT.joinpath("docs", "diagrams")
OUT = DIAG.joinpath("kernbench2_overview.pptx")
# Slide canvas in inches: 16:9 widescreen — 13.333 × 7.5 in.
SLIDE_W_IN, SLIDE_H_IN = 13.333, 7.5
# Ordered slide table. Every entry has a "title"; picture slides add an
# "image" path plus "bullets", while shape-drawn slides add a "render" key.
# NOTE(review): the code that consumes this table is outside this view —
# presumably each "render" value selects one of the _render_* functions; confirm.
SLIDES = [
    {
        "title": "1. CUBE Architecture: NOC Router Mesh + PE Connectivity",
        "image": DIAG / "cube_mesh_view.png",
        "bullets": [
            "Each CUBE holds an 8-PE NOC mesh wired through routers (R0..R7)",
            "Every PE has IO_CPU, M_CPU, PE_CPU + IPCQ engine + DMA engine",
            "Inter-cube traffic exits via UCIe/UAL ports; SIPs stitch into ring/torus/mesh",
            "Foundation for every latency, IPCQ, and allreduce experiment that follows",
        ],
    },
    {
        "title": "2. Model Correctness: DMA vs P2P Latency Sweep",
        # NOTE(review): a recent change reportedly renamed the pe2pe latency
        # plots and dropped overview/overview_log — confirm this file still exists.
        "image": DIAG / "pe2pe_latency_plots" / "overview.png",
        "bullets": [
            "Sweeps payload size across PE-to-PE paths and compares to DMA",
            "Confirms the simulator reproduces the expected DMA/P2P crossover",
            "Acts as the per-hop ground truth that feeds collective-level models",
        ],
    },
    {
        "title": "3. IPCQ: How Two PEs Communicate (DMA + Slot Memory)",
        "image": DIAG / "ipcq_diagram_plots" / "ipcq_two_pe_dma.png",
        "bullets": [
            "Sender pushes payload through PE_DMA → fabric → receiver IPCQ slot",
            "Slot memory (TCM/SRAM/HBM) charges a write on arrival, a read on consume",
            "Credit return rides the fabric path back (16 B packet, no slot-IO)",
            "This is the building block the multi-device allreduce composes",
        ],
    },
    {
        "title": "4. 6-Device Allreduce: Model vs Theoretical vs External Simulator",
        "image": DIAG / "allreduce_latency_plots" / "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png",
        "bullets": [
            "Three SIP topologies (ring / torus / mesh) swept 16 B → 96 KB per PE",
            "Dashed red curve: hand-derived theoretical model for torus_2d (6 SIPs)",
            "Top panel (broken y-axis): single-device reduce on ext-sim ≈ 366 µs",
            # NOTE(review): ~1722 µs is larger than the 366 µs quoted in the
            # previous bullet, which contradicts "~17× faster" — check value/unit.
            "Our 6-device collective lands at ~1722 µs — ~17× faster than ext-sim baseline",
        ],
    },
    {
        "title": "5. IPCQ Slot Memory: TCM vs SRAM vs HBM",
        "image": DIAG / "allreduce_latency_plots" / "AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.png",
        "bullets": [
            "Same allreduce with slot memory swapped: TCM (per-PE local) / SRAM / HBM (cube-shared, behind router link)",
            "Cost = NoC drain + slot-IO + PE↔bank hop; only TCM skips the bank hop",
            "Topology link BWs set the order: SRAM bank link 128 GB/s is the narrowest in the system, HBM 256 GB/s",
            "At 64 KB / PE: TCM 12.0 µs < HBM 21.4 µs < SRAM 24.3 µs — SRAM is slowest because of its narrow bank link",
        ],
    },
    # Entries below carry no image: they name a shape-drawn slide via "render".
    {
        "title": "6. PE_accelerator Data Path: Composite GEMM Pipeline",
        "render": "pipeline_structure",
    },
    {
        "title": "7. PE_SCHEDULER: Plan Generation & Tile Dispatch",
        "render": "scheduler",
    },
    {
        "title": "8. matmul(32, 128, 32) — Composite GEMM Execution Sequence",
        "render": "sequence_32x128x32",
    },
    {
        "title": "9. matmul(32, 128, 128) — Pipeline Scaling & HBM Contention",
        "render": "sequence_32x128x128",
    },
    {
        "title": "10. Tiling Walkthrough: 32×128×32 — K-loop Only, No Inter-(m,n) Flush",
        "render": "tiling_32x128x32",
    },
    {
        "title": "11. Tiling Walkthrough: 32×128×128 — K-loop & Inter-(m,n) Flushes",
        "render": "tiling_32x128x128",
    },
    {
        "title": "12. GEMM Sweep — Stage Wall-Clock (load_ref)",
        "render": "stage_breakdown_load_ref",
    },
    {
        "title": "13. Why DMA Isn't Local: Cube-Shared HBM Path",
        "render": "hbm_topology",
    },
    {
        "title": "14. Formula vs Measured Useful Eff (load_ref) — sim validates model",
        "render": "mac_utilization_formula_vs_measured",
    },
    {
        "title": "15. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)",
        "render": "pipeline_eff_walkthrough",
    },
    {
        "title": "16. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)",
        "render": "pipeline_eff_walkthrough_largeK",
    },
    {
        "title": "17. Useful Pipelined Efficiency (ideal pipeline × GEMM util)",
        "render": "tflops_table",
    },
]
# ── Palette for the shape-drawn slides ─────────────────────────────────────
# Text colours.
COL_TEXT_DARK = RGBColor(0x1E, 0x29, 0x3B)
COL_TEXT_LIGHT = RGBColor(0xFF, 0xFF, 0xFF)
COL_MUTED = RGBColor(0x47, 0x55, 0x69)
COL_RED = RGBColor(0xDC, 0x26, 0x26)
# Neutral fill/stroke pair (used for the PE_CPU box on the scheduler slide).
COL_STORAGE = RGBColor(0xE2, 0xE8, 0xF0)
COL_STORAGE_STROKE = RGBColor(0x47, 0x55, 0x69)
# Engine colours: DMA, fetch/store, GEMM.
COL_DMA = RGBColor(0x3B, 0x82, 0xF6)  # blue
COL_FS = RGBColor(0x10, 0xB9, 0x81)  # emerald
COL_GEMM = RGBColor(0xF5, 0x9E, 0x0B)  # amber
# Memory-level band backgrounds (HBM / TCM / RegFile).
COL_HBM_BG = RGBColor(0xDB, 0xEA, 0xFE)
COL_TCM_BG = RGBColor(0xD1, 0xFA, 0xE5)
COL_REG_BG = RGBColor(0xFE, 0xF3, 0xC7)
# Band borders: HBM and TCM reuse the DMA and fetch/store engine colours.
COL_HBM_BORDER = COL_DMA
COL_TCM_BORDER = COL_FS
COL_REG_BORDER = RGBColor(0xD9, 0x77, 0x06)
def _add_title(slide, text):
    """Add the bold 26 pt slide heading and return its textbox.

    The box sits 0.4 in from the left edge and 0.25 in from the top, spans
    the slide width minus 0.8 in, and is 0.7 in tall. All four text-frame
    margins are zeroed.
    """
    title_box = slide.shapes.add_textbox(
        Inches(0.4), Inches(0.25), Inches(SLIDE_W_IN - 0.8), Inches(0.7),
    )
    frame = title_box.text_frame
    for side in ("margin_left", "margin_right", "margin_top", "margin_bottom"):
        setattr(frame, side, Emu(0))
    heading = frame.paragraphs[0].add_run()
    heading.text = text
    heading_font = heading.font
    heading_font.size = Pt(26)
    heading_font.bold = True
    heading_font.color.rgb = RGBColor(0x10, 0x2A, 0x55)
    return title_box
def _add_image_centered(slide, img_path, *, left_in, top_in, max_w_in, max_h_in):
    """Insert ``img_path`` scaled to fit, centred inside a bounding box.

    The box is given in inches by its top-left corner (``left_in``,
    ``top_in``) and maximum size (``max_w_in``, ``max_h_in``). One scale
    factor is applied to both dimensions, so the aspect ratio is preserved.
    """
    # Only the pixel dimensions are needed; close the file straight away.
    with Image.open(img_path) as picture:
        px_w, px_h = picture.size
    box_w = Inches(max_w_in)
    box_h = Inches(max_h_in)
    # The tighter of the two constraints decides the scale (EMU per pixel).
    fit = min(box_w / px_w, box_h / px_h)
    out_w, out_h = int(px_w * fit), int(px_h * fit)
    # Split the leftover space evenly on both sides.
    x_off = Inches(left_in) + (box_w - out_w) // 2
    y_off = Inches(top_in) + (box_h - out_h) // 2
    slide.shapes.add_picture(str(img_path), x_off, y_off, width=out_w, height=out_h)
def _add_bullets(slide, bullets, *, left_in, top_in, width_in, height_in):
    """Add a word-wrapped textbox with one 15 pt paragraph per bullet string.

    Position and size are in inches. The first bullet reuses the paragraph
    the text frame already has; each later bullet appends a new one.
    """
    bullet_box = slide.shapes.add_textbox(
        Inches(left_in), Inches(top_in), Inches(width_in), Inches(height_in),
    )
    frame = bullet_box.text_frame
    frame.word_wrap = True
    for position, line in enumerate(bullets):
        if position:
            para = frame.add_paragraph()
        else:
            para = frame.paragraphs[0]
        para.level = 0
        entry = para.add_run()
        # NOTE(review): the prefix literal looks empty in this view; it may hold
        # an invisible or bullet character, so it is kept exactly as found.
        entry.text = "" + line
        entry.font.size = Pt(15)
        entry.font.color.rgb = RGBColor(0x22, 0x22, 0x22)
        para.space_after = Pt(6)
def _add_footer(slide, idx, total):
    """Write a small grey "idx / total" counter near the bottom-right corner."""
    footer_left = Inches(SLIDE_W_IN - 1.2)
    footer_top = Inches(SLIDE_H_IN - 0.45)
    footer = slide.shapes.add_textbox(
        footer_left, footer_top, Inches(1.0), Inches(0.3),
    )
    counter = footer.text_frame.paragraphs[0].add_run()
    counter.text = f"{idx} / {total}"
    counter.font.size = Pt(10)
    counter.font.color.rgb = RGBColor(0x88, 0x88, 0x88)
# ── Shape-drawing primitives for the diagram slides ────────────────────────
def _txt(box, text, size=11, bold=False, color=COL_TEXT_DARK,
         align=PP_ALIGN.CENTER):
    """Fill ``box``'s text frame with ``text``, one paragraph per line.

    ``text`` is split on newlines. Every line becomes a Consolas run with
    the given point ``size``, ``bold`` flag and ``color``, in a paragraph
    aligned per ``align``. The frame is word-wrapped with small fixed insets.
    """
    frame = box.text_frame
    frame.word_wrap = True
    frame.margin_left = Emu(18000)
    frame.margin_right = Emu(18000)
    frame.margin_top = Emu(9000)
    frame.margin_bottom = Emu(9000)
    for n, piece in enumerate(text.split("\n")):
        # The frame always starts with one paragraph; reuse it for line 0.
        para = frame.add_paragraph() if n else frame.paragraphs[0]
        para.alignment = align
        para.text = ""
        styled = para.add_run()
        styled.text = piece
        styled_font = styled.font
        styled_font.size = Pt(size)
        styled_font.bold = bold
        styled_font.name = "Consolas"
        styled_font.color.rgb = color
def _textbox(slide, x, y, w, h, text, **kw):
    """Create a textbox at (x, y) of size (w, h) inches and fill it via ``_txt``.

    Extra keyword arguments are forwarded to ``_txt``. Returns the textbox.
    """
    origin_x, origin_y = Inches(x), Inches(y)
    extent_w, extent_h = Inches(w), Inches(h)
    label_box = slide.shapes.add_textbox(origin_x, origin_y, extent_w, extent_h)
    _txt(label_box, text, **kw)
    return label_box
def _rrect(slide, x, y, w, h, fill, stroke, text="", **kw):
    """Draw a solid-filled rounded rectangle with a 1.3 pt outline.

    Geometry is in inches. A non-empty ``text`` is written into the shape
    through ``_txt``, with ``kw`` forwarded. Returns the shape.
    """
    shape = slide.shapes.add_shape(
        MSO_SHAPE.ROUNDED_RECTANGLE,
        Inches(x), Inches(y), Inches(w), Inches(h),
    )
    shape.fill.solid()
    shape.fill.fore_color.rgb = fill
    outline = shape.line
    outline.color.rgb = stroke
    outline.width = Pt(1.3)
    if not text:
        return shape
    _txt(shape, text, **kw)
    return shape
def _rect_band(slide, x, y, w, h, fill, stroke):
    """Draw a plain solid-filled rectangle with a 1.0 pt outline; return it.

    Geometry is in inches. Unlike ``_rrect`` the corners are square and no
    text is written.
    """
    band = slide.shapes.add_shape(
        MSO_SHAPE.RECTANGLE, Inches(x), Inches(y), Inches(w), Inches(h),
    )
    band.fill.solid()
    band.fill.fore_color.rgb = fill
    border = band.line
    border.color.rgb = stroke
    border.width = Pt(1.0)
    return band
def _arrow(slide, x1, y1, x2, y2, color=COL_MUTED, width_pt=1.5):
    """Draw a connector from (x1, y1) to (x2, y2) inches with a triangle end.

    The arrowhead is added by editing the line XML directly: an
    ``a:tailEnd`` element is looked up (or created) and set to a medium
    triangle. Returns the connector shape.
    """
    # NOTE(review): connector type is the raw integer 2; in python-pptx's
    # MSO_CONNECTOR enum that is believed to be ELBOW — confirm it is intended.
    connector = slide.shapes.add_connector(
        2, Inches(x1), Inches(y1), Inches(x2), Inches(y2),
    )
    stroke = connector.line
    stroke.color.rgb = color
    stroke.width = Pt(width_pt)
    from pptx.oxml.ns import qn
    from lxml import etree
    line_el = connector.line._get_or_add_ln()
    tail_tag = qn("a:tailEnd")
    arrow_head = line_el.find(tail_tag)
    if arrow_head is None:
        arrow_head = etree.SubElement(line_el, tail_tag)
    for attr, value in (("type", "triangle"), ("w", "med"), ("len", "med")):
        arrow_head.set(attr, value)
    return connector
def _vline_dashed(slide, x, y1, y2, color, width_pt=2.0):
    """Draw a dashed vertical line at ``x`` running from ``y1`` to ``y2``.

    Args:
        slide: python-pptx slide to draw on.
        x: horizontal position of the line, in inches.
        y1: start of the line, in inches from the top of the slide.
        y2: end of the line, in inches from the top of the slide.
        color: RGB colour applied to the line.
        width_pt: stroke width in points.

    Returns:
        The connector shape that was added.
    """
    # Function-scope imports, matching how this file pulls in rarely-used
    # pptx helpers elsewhere.
    from pptx.enum.dml import MSO_LINE
    from pptx.enum.shapes import MSO_CONNECTOR

    conn = slide.shapes.add_connector(
        MSO_CONNECTOR.STRAIGHT, Inches(x), Inches(y1), Inches(x), Inches(y2),
    )
    conn.line.color.rgb = color
    conn.line.width = Pt(width_pt)
    # The public dash_style property writes the same <a:prstDash val="dash"/>
    # element the previous hand-built lxml code produced, without reaching
    # into the private _get_or_add_ln() helper.
    conn.line.dash_style = MSO_LINE.DASH
    return conn
# ── Slide 6: Pipeline structure ────────────────────────────────────────────
def _render_pipeline_structure(slide):
    """Vertical hardware datapath.

    HBM at top → DMA_in → GEMM Unit (FETCH + RegFile inside) → DMA_out
    (off-page to HBM, not drawn). TCM is on the right, vertical, acting
    as the staging buffer that DMA_in writes into, FETCH reads from, and
    that the GEMM unit STOREs back into before DMA_out drains it.

    Args:
        slide: python-pptx slide to draw on. All coordinates are in inches.
    """
    # Read-side wires reuse the DMA blue; write-side wires get their own orange.
    READ_COLOR = COL_DMA
    WRITE_COLOR = RGBColor(0xEA, 0x58, 0x0C)
    _textbox(slide, 0.4, 1.0, 12.6, 0.5,
             "Vertical flow. HBM → DMA_in → TCM → GEMM Unit (FETCH + "
             "RegFile) → TCM → DMA_out → HBM. TCM sits on the side as the "
             "staging buffer between the DMA engines and the GEMM unit.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)
    # ── Center column: HBM → DMA_in → GEMM Unit → DMA_out ────────────
    col_cx = 3.5
    box_w = 3.0
    box_x = col_cx - box_w / 2
    box_h = 0.50
    y_hbm = 1.65
    y_dma_in = 2.55
    y_gemm = 3.45
    gemm_h = 2.40
    y_dma_out = y_gemm + gemm_h + 0.30
    # End point of the off-page arrow that leaves DMA_out.
    out_arr_end_y = y_dma_out + box_h + 0.40
    # HBM
    _rrect(slide, box_x, y_hbm, box_w, box_h,
           COL_HBM_BG, COL_HBM_BORDER,
           "HBM (off-chip, 256 GB/s)",
           size=11, bold=True, color=COL_HBM_BORDER)
    # DMA_in
    _rrect(slide, box_x, y_dma_in, box_w, box_h,
           COL_DMA, COL_TEXT_DARK,
           "DMA_in",
           size=13, bold=True, color=COL_TEXT_LIGHT)
    # GEMM Unit container (slightly wider than DMA boxes)
    gemm_x = box_x - 0.7
    gemm_w = box_w + 1.4
    _rrect(slide, gemm_x, y_gemm, gemm_w, gemm_h,
           RGBColor(0xFF, 0xFB, 0xEB),
           COL_GEMM, "", size=10, color=COL_GEMM)
    _textbox(slide, gemm_x + 0.20, y_gemm + 0.08, gemm_w - 0.4, 0.32,
             "GEMM Unit",
             size=14, bold=True, color=COL_GEMM, align=PP_ALIGN.LEFT)
    # Inside GEMM Unit: FETCH (top), RegFile (below), MAC label (bottom)
    sub_h = 0.50
    inner_x = gemm_x + 0.5
    inner_w = gemm_w - 1.0
    fetch_y = y_gemm + 0.55
    reg_y = fetch_y + sub_h + 0.30
    _rrect(slide, inner_x, fetch_y, inner_w, sub_h,
           COL_FS, COL_TEXT_DARK,
           "FETCH unit",
           size=11, bold=True, color=COL_TEXT_LIGHT)
    _rrect(slide, inner_x, reg_y, inner_w, sub_h,
           COL_REG_BG, COL_REG_BORDER,
           "RegFile (A, B, C accumulator)",
           size=11, bold=True, color=COL_REG_BORDER)
    # MAC annotation (text-only) — implicit MAC operation on RegFile
    mac_label_y = reg_y + sub_h + 0.05
    _textbox(slide, inner_x, mac_label_y, inner_w, 0.32,
             "↻ MAC accumulate (32 × 64 × 32 array)",
             size=11, bold=True, color=COL_GEMM, align=PP_ALIGN.CENTER)
    # FETCH → RegFile (internal, vertical down)
    inner_cx = inner_x + inner_w / 2
    _arrow(slide, inner_cx, fetch_y + sub_h + 0.02,
           inner_cx, reg_y - 0.02,
           color=COL_FS, width_pt=1.8)
    # DMA_out (below GEMM Unit)
    _rrect(slide, box_x, y_dma_out, box_w, box_h,
           WRITE_COLOR, COL_TEXT_DARK,
           "DMA_out",
           size=13, bold=True, color=COL_TEXT_LIGHT)
    # ── TCM on the right (tall vertical) ─────────────────────────────
    # The TCM box spans from just above DMA_in to just below DMA_out.
    tcm_x = 9.4
    tcm_w = 2.6
    tcm_y = y_dma_in - 0.05
    tcm_h = (y_dma_out + box_h + 0.05) - tcm_y
    _rrect(slide, tcm_x, tcm_y, tcm_w, tcm_h,
           COL_TCM_BG, COL_TCM_BORDER, "",
           size=12, color=COL_TCM_BORDER)
    _textbox(slide, tcm_x + 0.1, tcm_y + tcm_h / 2 - 0.6,
             tcm_w - 0.2, 1.2,
             "TCM\n\n(PE-local SRAM,\n512 GB/s)",
             size=14, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.CENTER)
    # ── Wires ───────────────────────────────────────────────────────
    # Arrow endpoints are inset by 0.02 in so heads do not touch box edges.
    # HBM → DMA_in (vertical down)
    _arrow(slide, col_cx, y_hbm + box_h + 0.02, col_cx, y_dma_in - 0.02,
           color=READ_COLOR, width_pt=2.5)
    _textbox(slide, col_cx + 0.15, (y_hbm + box_h + y_dma_in) / 2 - 0.10,
             1.5, 0.22, "DMA_R", size=10, bold=True,
             color=READ_COLOR, align=PP_ALIGN.LEFT)
    # DMA_in → TCM (horizontal right, at DMA_in y)
    dma_in_cy = y_dma_in + box_h / 2
    _arrow(slide, box_x + box_w + 0.02, dma_in_cy,
           tcm_x - 0.02, dma_in_cy,
           color=READ_COLOR, width_pt=2.5)
    mid_x = (box_x + box_w + tcm_x) / 2
    _textbox(slide, mid_x - 1.4, dma_in_cy - 0.32,
             2.8, 0.22, "store to TCM",
             size=10, bold=True, color=READ_COLOR, align=PP_ALIGN.CENTER)
    # TCM → FETCH (horizontal left, into FETCH inside GEMM Unit)
    fetch_right = inner_x + inner_w
    fetch_cy = fetch_y + sub_h / 2
    _arrow(slide, tcm_x - 0.02, fetch_cy,
           fetch_right + 0.02, fetch_cy,
           color=COL_FS, width_pt=2.5)
    _textbox(slide, (fetch_right + tcm_x) / 2 - 1.4, fetch_cy - 0.32,
             2.8, 0.22, "FETCH (TCM → Reg)",
             size=10, bold=True, color=COL_FS, align=PP_ALIGN.CENTER)
    # GEMM (RegFile) → TCM (STORE, horizontal right exit at RegFile y)
    reg_right = inner_x + inner_w
    store_cy = reg_y + sub_h / 2
    _arrow(slide, reg_right + 0.02, store_cy,
           tcm_x - 0.02, store_cy,
           color=WRITE_COLOR, width_pt=2.5)
    _textbox(slide, (reg_right + tcm_x) / 2 - 1.6, store_cy + 0.08,
             3.2, 0.22, "STORE (Reg → TCM, after last K)",
             size=10, bold=True, color=WRITE_COLOR, align=PP_ALIGN.CENTER)
    # TCM → DMA_out (horizontal left at DMA_out y)
    dma_out_cy = y_dma_out + box_h / 2
    _arrow(slide, tcm_x - 0.02, dma_out_cy,
           box_x + box_w + 0.02, dma_out_cy,
           color=WRITE_COLOR, width_pt=2.5)
    _textbox(slide, mid_x - 1.4, dma_out_cy - 0.32,
             2.8, 0.22, "read from TCM",
             size=10, bold=True, color=WRITE_COLOR, align=PP_ALIGN.CENTER)
    # DMA_out → HBM (off-page; just an arrow + label, no HBM box at bottom)
    _arrow(slide, col_cx, y_dma_out + box_h + 0.02,
           col_cx, out_arr_end_y - 0.02,
           color=WRITE_COLOR, width_pt=2.5)
    _textbox(slide, col_cx + 0.15, out_arr_end_y - 0.30,
             3.0, 0.22, "DMA_W → HBM",
             size=10, bold=True, color=WRITE_COLOR, align=PP_ALIGN.LEFT)
# ── Slide 7: PE_SCHEDULER ──────────────────────────────────────────────────
def _render_scheduler(slide):
    """PE_SCHEDULER: sole command dispatcher inside a PE.

    Two paths: simple cmd → direct engine dispatch; CompositeCmd →
    generate_plan + FIFO feed → TileToken self-routes through stages.

    Args:
        slide: python-pptx slide to draw on. All coordinates are in inches.
    """
    # Slide-local colours: purple for the scheduler, grey for pe_math.
    SCHED_FILL = RGBColor(0xF3, 0xE8, 0xFF)
    SCHED_BORDER = RGBColor(0x7C, 0x3A, 0xED)
    PE_MATH_COL = RGBColor(0x9C, 0xA3, 0xAF)
    _textbox(slide, 0.4, 1.0, 12.6, 0.5,
             "Sole dispatcher inside a PE. CompositeCmd is expanded into a "
             "TilePlan and fed tile-by-tile in FIFO order; each TileToken "
             "self-routes through the pipeline stages.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)
    # SCHEDULER container (compact — left side of slide)
    sched_x, sched_y = 0.6, 2.45
    sched_w, sched_h = 4.6, 2.4
    # PE_CPU box (centered above scheduler)
    pe_cpu_w, pe_cpu_h = 2.0, 0.50
    pe_cpu_x = sched_x + sched_w / 2 - pe_cpu_w / 2
    pe_cpu_y = 1.65
    _rrect(slide, pe_cpu_x, pe_cpu_y, pe_cpu_w, pe_cpu_h,
           COL_STORAGE, COL_STORAGE_STROKE,
           "PE_CPU",
           size=12, bold=True, color=COL_TEXT_DARK)
    _rrect(slide, sched_x, sched_y, sched_w, sched_h,
           SCHED_FILL, SCHED_BORDER, "",
           size=10, color=SCHED_BORDER)
    _textbox(slide, sched_x + 0.15, sched_y + 0.08, sched_w - 0.30, 0.32,
             "PE_SCHEDULER",
             size=13, bold=True, color=SCHED_BORDER, align=PP_ALIGN.LEFT)
    # CompositeCmd description — text only, no inner box
    text_x = sched_x + 0.25
    text_y = sched_y + 0.55
    text_w = sched_w - 0.50
    _textbox(slide, text_x, text_y, text_w, 0.30,
             "CompositeCmd → generate plan",
             size=12, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, text_x + 0.10, text_y + 0.40, text_w - 0.15,
             sched_h - 1.05,
             "generate_plan(M, K, N)\n"
             " → ⌈M/32⌉ × ⌈K/64⌉ × ⌈N/32⌉ tiles\n"
             " each tile:\n"
             " [DMA_R, FETCH, GEMM,\n"
             " STORE, DMA_W] stages",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
    # Engines (compact, right of scheduler — still on the LEFT half of slide)
    engines_x = sched_x + sched_w + 0.40
    engines_w = 2.4
    engines_y = sched_y
    engine_h = 0.45
    engine_gap = 0.18
    # (label, fill colour, text colour) for each engine box, top to bottom.
    engines = [
        ("pe_dma", COL_DMA, COL_TEXT_LIGHT),
        ("pe_fetch_store", COL_FS, COL_TEXT_LIGHT),
        ("pe_gemm", COL_GEMM, COL_TEXT_DARK),
        ("pe_math", PE_MATH_COL, COL_TEXT_LIGHT),
    ]
    for i, (name, fill, tcol) in enumerate(engines):
        y = engines_y + i * (engine_h + engine_gap)
        _rrect(slide, engines_x, y, engines_w, engine_h,
               fill, COL_TEXT_DARK,
               name,
               size=12, bold=True, color=tcol)
    # TileToken / PipelineContext annotation under the engine stack
    # last_y is the bottom edge of the last engine box (no trailing gap).
    last_y = engines_y + len(engines) * (engine_h + engine_gap) - engine_gap
    note_y = last_y + 0.15
    _textbox(slide, engines_x, note_y, engines_w, 0.28,
             "↻ TileToken.advance()",
             size=10, bold=True, color=SCHED_BORDER, align=PP_ALIGN.CENTER)
    _textbox(slide, engines_x, note_y + 0.28, engines_w, 0.28,
             "PipelineContext counts tiles",
             size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)
    # PE_CPU → SCHEDULER arrow
    _arrow(slide, pe_cpu_x + pe_cpu_w / 2, pe_cpu_y + pe_cpu_h + 0.02,
           pe_cpu_x + pe_cpu_w / 2, sched_y - 0.02,
           color=COL_TEXT_DARK, width_pt=2.5)
    _textbox(slide, pe_cpu_x + pe_cpu_w + 0.10,
             (pe_cpu_y + pe_cpu_h + sched_y) / 2 - 0.12,
             3.0, 0.22, "PeInternalTxn(cmd)",
             size=11, bold=True, color=COL_MUTED, align=PP_ALIGN.LEFT)
    # SCHEDULER → engines arrows (one per engine, colour-coded)
    # All arrows fan out from the vertical centre of the scheduler's right edge.
    sched_right = sched_x + sched_w
    sched_cy = sched_y + sched_h / 2
    for i, (name, fill, _) in enumerate(engines):
        engine_cy = engines_y + i * (engine_h + engine_gap) + engine_h / 2
        _arrow(slide, sched_right + 0.02, sched_cy,
               engines_x - 0.02, engine_cy,
               color=fill, width_pt=1.5)
    # Bottom note (full width)
    _textbox(slide, 0.4, 6.10, 12.6, 0.85,
             "Key invariants: (1) FIFO across commands via the single "
             "feeder process — no inter-command tile interleaving. "
             "(2) TileToken carries its own plan; each engine reads "
             "token.current_stage, advances stage_idx, and forwards to "
             "the next stage's component. (3) PipelineContext."
             "complete_tile() fires done_event on the last tile, "
             "unblocking PE_CPU.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
# ── Slides 8 & 9: Sequence diagrams ────────────────────────────────────────
# Op kinds and their visual placement.
# Per-op-kind drawing attributes consumed by _draw_sequence_panel:
#   "fill" — box and arrow colour;
#   "band" — where the op box sits ("between_hbm_tcm", "between_tcm_reg",
#            or "regfile" for a box centred on the RegFile band);
#   "dir"  — arrow direction ("down", "up", or None for no arrows).
_KIND_INFO = {
    "dma_read": {"fill": COL_DMA, "band": "between_hbm_tcm", "dir": "down"},
    "fetch": {"fill": COL_FS, "band": "between_tcm_reg", "dir": "down"},
    "gemm": {"fill": COL_GEMM, "band": "regfile", "dir": None},
    "store": {"fill": COL_FS, "band": "between_tcm_reg", "dir": "up"},
    "dma_write": {"fill": COL_DMA, "band": "between_hbm_tcm", "dir": "up"},
}
def _draw_sequence_panel(slide, ops, *, x_left=0.6, x_right=12.9,
                         y_top=1.7, panel_h=4.4,
                         tile_boundary_after=None,
                         tile_labels=None,
                         contention_after=None):
    """Generic sequence-diagram drawer.

    Draws three horizontal memory bands (HBM, TCM, RegFile), one column per
    op with a labelled box and arrows between bands, and an engine legend
    below the panel. All coordinates are in inches.

    ops: list of (step, name, kind, bytes_str, ns_str); ``kind`` must be a
        key of _KIND_INFO
    x_left, x_right: horizontal extent of the bands
    y_top, panel_h: top of the HBM band and total panel height
    tile_boundary_after: index after which to draw a red dashed divider
    tile_labels: list[str] showing on either side of the boundary; when
        given, both element 0 and element 1 are read
    contention_after: index after which to draw an HBM-contention callout
    """
    n_ops = len(ops)
    band_x = x_left
    band_w = x_right - x_left
    band_h = 0.55
    # HBM at the top, TCM centred vertically, RegFile flush with the bottom.
    hbm_y = y_top
    tcm_y = y_top + panel_h * 0.5 - band_h / 2
    reg_y = y_top + panel_h - band_h
    # Memory bands
    _rect_band(slide, band_x, hbm_y, band_w, band_h, COL_HBM_BG, COL_HBM_BORDER)
    _rect_band(slide, band_x, tcm_y, band_w, band_h, COL_TCM_BG, COL_TCM_BORDER)
    _rect_band(slide, band_x, reg_y, band_w, band_h, COL_REG_BG, COL_REG_BORDER)
    _textbox(slide, band_x + 0.05, hbm_y, 0.8, band_h, "HBM",
             size=12, bold=True, color=COL_HBM_BORDER, align=PP_ALIGN.LEFT)
    _textbox(slide, band_x + 0.05, tcm_y, 0.8, band_h, "TCM",
             size=12, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.LEFT)
    _textbox(slide, band_x + 0.05, reg_y, 0.85, band_h, "RegFile",
             size=12, bold=True, color=COL_REG_BORDER, align=PP_ALIGN.LEFT)
    # Op columns span from after the level labels (~0.9 in margin) to right edge
    ops_left = band_x + 1.0
    ops_w = band_w - 1.1
    # max(..., 1) avoids dividing by zero when ops is empty.
    col_w = ops_w / max(n_ops, 1)
    op_box_w = col_w * 0.86
    op_box_h = 0.70
    hbm_bot = hbm_y + band_h
    tcm_top = tcm_y
    tcm_bot = tcm_y + band_h
    reg_top = reg_y
    # Vertical midpoints of the HBM↔TCM gap and the TCM↔RegFile gap.
    gap_ht = (hbm_bot + tcm_top) / 2
    gap_tr = (tcm_bot + reg_top) / 2
    for idx, (step, name, kind, byt, ns) in enumerate(ops):
        info = _KIND_INFO[kind]
        x_center = ops_left + idx * col_w + col_w / 2
        x = x_center - op_box_w / 2
        # Step number
        _textbox(slide, x, hbm_y - 0.35, op_box_w, 0.25,
                 f"#{step}", size=9, bold=True, color=COL_TEXT_DARK)
        # Centre the op box in its gap, or on the RegFile band for "regfile".
        if info["band"] == "between_hbm_tcm":
            y_box = gap_ht - op_box_h / 2
        elif info["band"] == "between_tcm_reg":
            y_box = gap_tr - op_box_h / 2
        else:
            y_box = reg_y + band_h / 2 - op_box_h / 2
        text_color = COL_TEXT_LIGHT if kind != "gemm" else COL_TEXT_DARK
        label = f"{name}\n{byt} {ns}"
        _rrect(slide, x, y_box, op_box_w, op_box_h,
               info["fill"], COL_TEXT_DARK, label,
               size=8, bold=True, color=text_color)
        # Arrows
        # Each directed op gets two arrows: source band → box, box → dest band.
        if info["dir"] == "down":
            src = hbm_bot if info["band"] == "between_hbm_tcm" else tcm_bot
            dst = tcm_top if info["band"] == "between_hbm_tcm" else reg_top
            _arrow(slide, x_center, src, x_center, y_box,
                   color=info["fill"], width_pt=1.6)
            _arrow(slide, x_center, y_box + op_box_h, x_center, dst,
                   color=info["fill"], width_pt=1.6)
        elif info["dir"] == "up":
            src = reg_top if info["band"] == "between_tcm_reg" else tcm_top
            dst = tcm_bot if info["band"] == "between_tcm_reg" else hbm_bot
            _arrow(slide, x_center, src, x_center, y_box + op_box_h,
                   color=info["fill"], width_pt=1.6)
            _arrow(slide, x_center, y_box, x_center, dst,
                   color=info["fill"], width_pt=1.6)
    # Tile boundary
    if tile_boundary_after is not None:
        # The divider sits on the left edge of column `tile_boundary_after`.
        bx = ops_left + tile_boundary_after * col_w
        _vline_dashed(slide, bx, hbm_y - 0.05, reg_y + band_h + 0.05,
                      COL_RED, width_pt=2.0)
        if tile_labels:
            # Both labels are indexed, so the list needs at least two entries.
            _textbox(slide, bx - 1.7, hbm_y - 0.65, 1.6, 0.3,
                     tile_labels[0], size=10, bold=True,
                     color=COL_RED, align=PP_ALIGN.RIGHT)
            _textbox(slide, bx + 0.1, hbm_y - 0.65, 1.9, 0.3,
                     tile_labels[1], size=10, bold=True,
                     color=COL_RED, align=PP_ALIGN.LEFT)
    # HBM contention callout
    if contention_after is not None:
        cx = ops_left + contention_after * col_w
        cy = hbm_y + band_h + 0.05
        callout_w = 2.6
        callout_h = 0.7
        # Centre the callout on cx, clamped 0.5 in inside both band edges.
        callout_x = max(band_x + 0.5, cx - callout_w / 2)
        callout_x = min(callout_x, band_x + band_w - callout_w - 0.5)
        _rrect(slide, callout_x, cy + 0.05, callout_w, callout_h,
               RGBColor(0xFE, 0xE2, 0xE2), COL_RED,
               "HBM contention:\nDMA_WRITE out + next DMA_READ A,B\ncompete for HBM BW",
               size=8, bold=True, color=COL_RED)
        # Pointer from the top of the callout into the middle of the HBM band.
        _arrow(slide, callout_x + callout_w / 2, cy + 0.05,
               cx, hbm_y + band_h * 0.5,
               color=COL_RED, width_pt=1.8)
    # Engine legend (below the panel)
    legend_y = reg_y + band_h + 0.6
    _rect_band(slide, band_x + 0.1, legend_y, 0.25, 0.22,
               COL_DMA, COL_TEXT_DARK)
    _textbox(slide, band_x + 0.4, legend_y - 0.05, 2.4, 0.35,
             "pe_dma (HBM↔TCM)", size=10, color=COL_TEXT_DARK,
             align=PP_ALIGN.LEFT)
    _rect_band(slide, band_x + 3.0, legend_y, 0.25, 0.22,
               COL_FS, COL_TEXT_DARK)
    _textbox(slide, band_x + 3.3, legend_y - 0.05, 3.0, 0.35,
             "pe_fetch_store (TCM↔RegFile)", size=10, color=COL_TEXT_DARK,
             align=PP_ALIGN.LEFT)
    _rect_band(slide, band_x + 6.6, legend_y, 0.25, 0.22,
               COL_GEMM, COL_TEXT_DARK)
    _textbox(slide, band_x + 6.9, legend_y - 0.05, 2.6, 0.35,
             "pe_gemm (MAC compute)", size=10, color=COL_TEXT_DARK,
             align=PP_ALIGN.LEFT)
def _draw_composite_setup_block(slide):
    """Draw the purple PLAN-GEN/FEED setup box to the left of the op columns.

    The box stands for the scheduler-side delay between PE_CPU issuing the
    CompositeCmd and the first DMA_R firing (plan generation, FIFO feeder
    enqueue, per-cmd scheduler overhead). A "scheduler" caption goes under
    it and an arrow leads from its right edge towards the first op column.
    """
    accent = RGBColor(0x7C, 0x3A, 0xED)
    tint = RGBColor(0xF3, 0xE8, 0xFF)
    left, top = 0.65, 2.45
    wide, tall = 0.90, 1.05
    _rrect(slide, left, top, wide, tall,
           tint, accent,
           "PLAN-GEN\n+ FEED\n(setup)",
           size=8, bold=True, color=accent)
    _textbox(slide, left, top + tall + 0.05, wide, 0.22,
             "scheduler",
             size=8, color=accent, align=PP_ALIGN.CENTER)
    # The arrow ends at a fixed x = 1.62, just past the default ops_left
    # (band_x + 1.0 = 1.6) used by _draw_sequence_panel.
    mid_y = top + tall / 2
    _arrow(slide, left + wide + 0.02, mid_y,
           1.62, mid_y,
           color=accent, width_pt=1.5)
def _render_sequence_32x128x32(slide):
    """Draw the execution-sequence slide for matmul(32, 128, 32).

    Lays out an intro paragraph, an 8-step op sequence via
    _draw_sequence_panel, the scheduler setup block, an "A pinned" note on
    the TCM band, and a closing remark. Coordinates are in inches.
    """
    _textbox(slide, 0.4, 1.05, 12.6, 0.65,
             "load_ref assumption: A (32×128) is pre-loaded into TCM via "
             "tl.load before the kernel starts; only B is DMA_R'd per "
             "tile. FETCH can start as soon as the first DMA_R(B) finishes "
             "— A is already in TCM. Scheduler tile = 32×64×32 → 1·1·2 = "
             "2 tiles. The PLAN-GEN/FEED block is the scheduler-side "
             "setup delay before the first DMA fires.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)
    # load_ref: no DMA_R for A; only B per tile.
    # Each tuple is (step, name, kind, bytes_str, ns_str).
    ops = [
        (1, "DMA_R B (tile 0)", "dma_read", "4 KB", "~16 ns"),
        (2, "FETCH (tile 0)", "fetch", "8 KB", "~16 ns"),
        (3, "DMA_R B (tile 1)", "dma_read", "4 KB", "~16 ns"),
        (4, "GEMM K=0 (accum)", "gemm", "", "~17 ns"),
        (5, "FETCH (tile 1)", "fetch", "8 KB", "~16 ns"),
        (6, "GEMM K=1 (last)", "gemm", "", "~17 ns"),
        (7, "STORE final", "store", "2 KB", "~4 ns"),
        (8, "DMA_W out", "dma_write", "2 KB", "~8 ns"),
    ]
    _draw_sequence_panel(slide, ops)
    _draw_composite_setup_block(slide)
    # "A pinned in TCM" annotation overlaid on the TCM band, left side
    _textbox(slide, 0.65, 3.92, 1.6, 0.22,
             "[ A pinned via tl.load ]",
             size=9, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, 6.55, 12.6, 0.45,
             "Pipeline is balanced (DMA, FETCH, GEMM all ~16 ns/tile) — "
             "DMA engine carries only B, so per-tile DMA cost halves vs "
             "ref_ref. Wall = setup + head_latency + N_tiles·T_stage + "
             "final STORE+DMA_W.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
def _render_sequence_32x128x128(slide):
    """Draw the execution-sequence slide for matmul(32, 128, 128).

    Shows the first 12 ops, with a dashed tile boundary and an
    HBM-contention callout after op index 9, then the scheduler setup
    block, an "A pinned" note, and two footnotes. Coordinates are in inches.
    """
    _textbox(slide, 0.4, 1.05, 12.6, 0.70,
             "load_ref assumption: A (32×128) is pre-loaded into TCM via "
             "tl.load before the kernel starts; only B is DMA_R'd per "
             "tile. FETCH starts as soon as the corresponding B arrives — "
             "A is already in TCM. Scheduler tile = 32×64×32 → 1·4·2 = 8 "
             "tiles. PLAN-GEN/FEED block = scheduler-side setup delay "
             "before the first DMA.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)
    # load_ref: only B per tile, A pre-loaded. DMA_R scattered across timeline.
    # Each tuple is (step, name, kind, bytes_str, ns_str).
    ops = [
        (1, "DMA_R B (0,0,0)", "dma_read", "4 KB", "~16 ns"),
        (2, "FETCH (0,0,0)", "fetch", "8 KB", "~16 ns"),
        (3, "DMA_R B (0,0,1)", "dma_read", "4 KB", "~16 ns"),
        (4, "GEMM K=0", "gemm", "", "~17 ns"),
        (5, "FETCH (0,0,1)", "fetch", "8 KB", "~16 ns"),
        (6, "DMA_R B (0,1,0)", "dma_read", "4 KB", "~16 ns"),
        (7, "GEMM K=1 last", "gemm", "", "~17 ns"),
        (8, "STORE out₀₀", "store", "2 KB", "~4 ns"),
        (9, "DMA_W out₀₀", "dma_write", "2 KB", "~8 ns"),
        (10, "FETCH (0,1,0)", "fetch", "8 KB", "~16 ns"),
        (11, "DMA_R B (0,1,1)", "dma_read", "4 KB", "~16 ns"),
        (12, "GEMM K=0 (0,1)", "gemm", "", "~17 ns"),
    ]
    # Index 9 places the divider and the callout right after step #9 (DMA_W).
    _draw_sequence_panel(
        slide, ops,
        tile_boundary_after=9,
        tile_labels=["── (m,n)=(0,0) full execution ──",
                     "── (0,1) starts ──"],
        contention_after=9,
    )
    _draw_composite_setup_block(slide)
    # "A pinned in TCM" annotation overlaid on the TCM band
    _textbox(slide, 0.65, 3.92, 1.6, 0.22,
             "[ A pinned via tl.load ]",
             size=9, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, 6.55, 12.6, 0.4,
             "HBM half-duplex caveat: real HBM channels can't read and write "
             "simultaneously. DMA_W out₀₀ competes for HBM bandwidth with "
             "DMA_R(B) of (0,1). Simulator currently models PE_DMA read / "
             "write as separate resources (full-duplex) — flag for revisit "
             "if half-duplex matters.",
             size=10, color=COL_RED, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, 6.95, 12.6, 0.3,
             "Pattern repeats for (m,n) = (0,1), (0,2), (0,3). DMA engine "
             "only carries B per tile, so the pipeline stays balanced — "
             "no DMA bottleneck like ref_ref.",
             size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)
# ── Slides 10 & 11: Tiling walkthroughs ────────────────────────────────────
def _draw_matrix_tiles(slide, *, x, y, n_rows, n_cols, tile_w, tile_h,
                       fill, border, label_prefix, label_fmt=None,
                       title=None, title_size=11):
    """Render an n_rows × n_cols grid of labelled tiles; return its far corner.

    (x, y) is the grid's top-left corner in inches and each tile measures
    tile_w × tile_h. When ``title`` is given it is centred just above the
    grid. A tile's label comes from ``label_fmt(r, c)`` if supplied;
    otherwise it is ``label_prefix`` followed by the column index (single-row
    grids) or by the row and column indices. Returns (right_x, bottom_y).
    """
    if title is not None:
        _textbox(slide, x, y - 0.27, n_cols * tile_w, 0.22,
                 title, size=title_size, bold=True,
                 color=COL_TEXT_DARK, align=PP_ALIGN.CENTER)

    single_row = n_rows == 1

    def _label_for(row, col):
        # Same precedence as before: custom formatter, then 1-row, then 2-d.
        if label_fmt is not None:
            return label_fmt(row, col)
        if single_row:
            return f"{label_prefix}{col}"
        return f"{label_prefix}{row}{col}"

    for row in range(n_rows):
        tile_top = y + row * tile_h
        for col in range(n_cols):
            _rrect(slide, x + col * tile_w, tile_top,
                   tile_w, tile_h,
                   fill, border, _label_for(row, col),
                   size=10 if single_row else 9,
                   bold=True, color=border)
    return x + n_cols * tile_w, y + n_rows * tile_h
def _render_tiling_32x128x32(slide):
    """32×128×32 — K-loop only, single (m,n) → NO inter-(m,n) flush.

    Visualises why a tall-thin K shape is friendly: the accumulator stays
    in RegFile across the entire K loop, and STORE + DMA_W fire ONCE at
    the very end. No inter-pair flush serialises HBM bandwidth.

    Args:
        slide: python-pptx slide to draw on. All coordinates are in inches.
    """
    _textbox(slide, 0.4, 1.0, 12.6, 0.55,
             "Scheduler tile = 32×64×32 → 1·2·1 = 2 tiles. Only ONE (m,n) "
             "output → the K-loop accumulates entirely in RegFile, and "
             "STORE + DMA_W fire just once at the very end. No inter-(m,n) "
             "flush — DMA never has to drain mid-compute.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)
    # ── Matrix formula C = A · B (centered on slide) ─────────────────
    tile_w = 0.60
    tile_h = 0.60
    matrix_top = 2.10
    # C (1 × 1) — single output tile
    c_x = 1.6
    c_y = matrix_top
    _draw_matrix_tiles(slide, x=c_x, y=c_y,
                       n_rows=1, n_cols=1,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=RGBColor(0xFE, 0xF3, 0xC7),
                       border=COL_REG_BORDER,
                       label_prefix="C", title="C (32 × 32)")
    # "="
    _textbox(slide, c_x + tile_w + 0.10, c_y - 0.05, 0.40, 0.70,
             "=", size=26, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)
    # A (1 × 2) — 2 K-tiles
    a_x = c_x + tile_w + 0.60
    a_y = matrix_top
    _draw_matrix_tiles(slide, x=a_x, y=a_y,
                       n_rows=1, n_cols=2,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=RGBColor(0xDB, 0xEA, 0xFE),
                       border=COL_HBM_BORDER,
                       label_prefix="A", title="A (32 × 128)")
    _textbox(slide, a_x, a_y + tile_h + 0.05, 2 * tile_w, 0.22,
             "← K (2 tiles) →",
             size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)
    # "·"
    _textbox(slide, a_x + 2 * tile_w + 0.10, c_y - 0.10, 0.30, 0.70,
             "·", size=28, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)
    # B (2 × 1) — 2 K-tiles, 1 N-tile
    b_x = a_x + 2 * tile_w + 0.55
    b_y = matrix_top - tile_h / 2  # center on row
    # label_fmt drops the column index so the tiles read B0 / B1.
    _draw_matrix_tiles(slide, x=b_x, y=b_y,
                       n_rows=2, n_cols=1,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=RGBColor(0xDB, 0xEA, 0xFE),
                       border=COL_HBM_BORDER,
                       label_prefix="B", title="B (128 × 32)",
                       label_fmt=lambda r, c: f"B{r}")
    # NOTE(review): the second line of this label is empty in this view —
    # a glyph (e.g. an arrow) may have been lost; confirm against the repo.
    _textbox(slide, b_x - 0.45, b_y + tile_h - 0.10, 0.40, 0.30,
             "K\n", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)
    # Formula
    _textbox(slide, 0.4, matrix_top + 2 * tile_h + 0.30, 12.6, 0.40,
             "C = A0·B0 + A1·B1 (K-loop, 2 iterations — "
             "accumulator stays in RegFile)",
             size=13, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)
    # ── Timeline: single (m,n) pair, NO flush ───────────────────────
    tl_y = matrix_top + 2 * tile_h + 1.10
    _textbox(slide, 0.4, tl_y - 0.30, 12.6, 0.22,
             "Execution timeline — 1 (m,n) output, 0 inter-(m,n) flushes:",
             size=12, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.LEFT)
    # The timeline bar is centred horizontally and split into K=0 and K=1
    # segments (45 % and 53 % of its width, with a 0.02 in gap between them).
    pair_w = 4.50
    pair_h = 0.85
    pair_x = (SLIDE_W_IN - pair_w) / 2
    k0_w = pair_w * 0.45
    k1_w = pair_w * 0.53
    _rrect(slide, pair_x, tl_y, k0_w, pair_h,
           RGBColor(0xD1, 0xFA, 0xE5), COL_TCM_BORDER,
           "K=0 (accumulate into RegFile)",
           size=11, bold=True, color=COL_TCM_BORDER)
    _rrect(slide, pair_x + k0_w + 0.02, tl_y, k1_w, pair_h,
           RGBColor(0xFE, 0xF3, 0xC7), COL_REG_BORDER,
           "K=1 last\nSTORE + DMA_W (final drain)",
           size=11, bold=True, color=COL_REG_BORDER)
    _textbox(slide, pair_x, tl_y + pair_h + 0.05, pair_w, 0.25,
             "(m,n)=(0,0) → C (single output tile)",
             size=11, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)
    # "no flush" callout to the right
    _textbox(slide, pair_x + pair_w + 0.20, tl_y + 0.15, 2.5, 0.55,
             "✓ No inter-(m,n) flush\n (only 1 output pair)",
             size=11, bold=True, color=COL_TCM_BORDER,
             align=PP_ALIGN.LEFT)
    # ── Bottom note ─────────────────────────────────────────────────
    _textbox(slide, 0.4, 6.30, 12.6, 0.70,
             "Why growing K helps: each (m,n) pair amortises its single "
             "STORE+DMA_W over K_tiles iterations of pure compute. With "
             "N=32 (one N-tile), there is no NEXT (m,n) pair, so no "
             "inter-pair flush at all. Pipeline efficiency is bottlenecked "
             "only by head latency and the final drain.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
def _render_tiling_32x128x128(slide):
    """Draw the 32×128×128 tiling slide: K-loop + 3 inter-(m,n) flushes.

    Top to bottom the slide shows an intro paragraph, the tiled matrix
    equation (C as 1×4 tiles, A as 1×2, B as 2×4), the per-output formula,
    an execution timeline of 4 (m,n) pairs separated by 3 flush boxes, and a
    closing note on why the flushes cost time.

    Args:
        slide: Slide to draw on; it is only handed to the module's drawing
            helpers (``_textbox``, ``_draw_matrix_tiles``, ``_rrect``).
    """
    _textbox(slide, 0.4, 1.0, 12.6, 0.55,
             "Scheduler tile = 32×64×32 → 1·2·4 = 8 tiles. A is split along "
             "K (2 tiles); B along K and N (2×4); C along N (4). For each "
             "(m,n) the K-loop accumulates in RegFile; STORE + DMA_W fire "
             "only on last K → 3 inter-(m,n) flushes between the 4 pairs.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # ── Matrix formula C = A · B (centered) ──────────────────────────
    tile_w = 0.55
    tile_h = 0.55
    matrix_top = 2.10

    # C (1 × 4)
    c_x = 1.50
    c_y = matrix_top
    _draw_matrix_tiles(slide, x=c_x, y=c_y,
                       n_rows=1, n_cols=4,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=RGBColor(0xFE, 0xF3, 0xC7),
                       border=COL_REG_BORDER,
                       label_prefix="C", title="C (32 × 128)")
    _textbox(slide, c_x, c_y + tile_h + 0.05, 4 * tile_w, 0.22,
             "← N (4 tiles) →",
             size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # "="
    eq_x = c_x + 4 * tile_w + 0.20
    _textbox(slide, eq_x, c_y - 0.05, 0.40, 0.70,
             "=", size=26, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # A (1 × 2)
    a_x = eq_x + 0.55
    a_y = matrix_top
    _draw_matrix_tiles(slide, x=a_x, y=a_y,
                       n_rows=1, n_cols=2,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=RGBColor(0xDB, 0xEA, 0xFE),
                       border=COL_HBM_BORDER,
                       label_prefix="A", title="A (32 × 128)")
    _textbox(slide, a_x, a_y + tile_h + 0.05, 2 * tile_w, 0.22,
             "← K (2 tiles) →",
             size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # "·"
    dot_x = a_x + 2 * tile_w + 0.15
    _textbox(slide, dot_x, c_y - 0.10, 0.30, 0.70,
             "·", size=28, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # B (2 × 4)
    b_x = dot_x + 0.45
    b_y = matrix_top - tile_h / 2  # center on the row of A and C
    _draw_matrix_tiles(slide, x=b_x, y=b_y,
                       n_rows=2, n_cols=4,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=RGBColor(0xDB, 0xEA, 0xFE),
                       border=COL_HBM_BORDER,
                       label_prefix="B", title="B (128 × 128)")
    _textbox(slide, b_x, b_y + 2 * tile_h + 0.05, 4 * tile_w, 0.22,
             "← N (4 tiles) →",
             size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)
    _textbox(slide, b_x - 0.45, b_y + tile_h - 0.10, 0.40, 0.30,
             "K\n", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # Formula
    _textbox(slide, 0.4, matrix_top + 2 * tile_h + 0.30, 12.6, 0.40,
             "Cn = A0·B0n + A1·B1n (K-loop, 2 iters per (m,n) — "
             "accumulator stays in RegFile)",
             size=13, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # ── Timeline: 4 (m,n) pairs + 3 inter-pair flushes ──────────────
    tl_y = matrix_top + 2 * tile_h + 1.10
    _textbox(slide, 0.4, tl_y - 0.30, 12.6, 0.22,
             "Execution timeline — 4 (m,n) outputs, 3 inter-(m,n) flushes:",
             size=12, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.LEFT)
    n_pairs = 4
    pair_w = 1.85
    pair_h = 0.75
    flush_w = 0.65
    total_w = n_pairs * pair_w + (n_pairs - 1) * flush_w
    start_x = (SLIDE_W_IN - total_w) / 2
    # Widths of the two K-iteration boxes are the same for every pair, so
    # they are computed once instead of on every loop pass.
    k0_w = pair_w * 0.45
    k1_w = pair_w * 0.53
    pair_x = start_x
    for n in range(n_pairs):
        _rrect(slide, pair_x, tl_y, k0_w, pair_h,
               RGBColor(0xD1, 0xFA, 0xE5), COL_TCM_BORDER,
               "K=0\n(accum)",
               size=9, bold=True, color=COL_TCM_BORDER)
        _rrect(slide, pair_x + k0_w + 0.02, tl_y, k1_w, pair_h,
               RGBColor(0xFE, 0xF3, 0xC7), COL_REG_BORDER,
               "K=1 last\nSTORE +\nDMA_W",
               size=9, bold=True, color=COL_REG_BORDER)
        _textbox(slide, pair_x, tl_y + pair_h + 0.05, pair_w, 0.22,
                 f"(0,{n}) → C{n}",
                 size=10, bold=True, color=COL_TEXT_DARK,
                 align=PP_ALIGN.CENTER)
        # A flush box sits between consecutive pairs only; nothing follows
        # the last pair, so the cursor is not advanced after it.
        if n < n_pairs - 1:
            flush_x = pair_x + pair_w + 0.05
            _rrect(slide, flush_x, tl_y, flush_w - 0.10, pair_h,
                   RGBColor(0xFE, 0xE2, 0xE2), COL_RED,
                   "FLUSH\n(DMA_W\n vs DMA_R)",
                   size=8, bold=True, color=COL_RED)
            pair_x = flush_x + flush_w - 0.05

    # ── Bottom note ────────────────────────────────────────────────
    # The minus signs are written as ASCII "-" so the arithmetic on the
    # slide reads correctly (N_tiles - 1, 1·4 - 1 = 3).
    _textbox(slide, 0.4, 6.30, 12.6, 0.70,
             "Why flushes hurt: at every (m,n) boundary, DMA_W of Cn "
             "competes with DMA_R of the next pair for the cube-shared "
             "HBM channel. Inter-flush count = (M_tiles · N_tiles - 1) — "
             "for 32×128×128 that's 1·4 - 1 = 3 flushes. Bigger N → more "
             "flushes; bigger K alone (with small N) → none.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
# Registry of natively drawn slides: renderer key → render function.
# NOTE(review): the code that looks these keys up is outside this section —
# presumably a slide spec selects a renderer by key; confirm before renaming.
_RENDERERS = {
    "pipeline_structure": _render_pipeline_structure,
    "scheduler": _render_scheduler,
    "sequence_32x128x32": _render_sequence_32x128x32,
    "sequence_32x128x128": _render_sequence_32x128x128,
    "tiling_32x128x32": _render_tiling_32x128x32,
    "tiling_32x128x128": _render_tiling_32x128x128,
}
# ── Bar-chart slides (read from docs/diagrams/gemm_sweep.json) ─────────────
import json
GEMM_SWEEP_JSON = DIAG / "gemm_sweep.json"
def _under_tile(M, K, N, tile_M, tile_K, tile_N):
return M < tile_M or K < tile_K or N < tile_N
# Stage keys, in bar order, that the stage-breakdown chart builds one series
# for; they are looked up under each sweep row's "stages" dict.
STAGE_KEYS = ["DMA_READ", "FETCH", "GEMM", "DMA_WRITE"]
# Legend text per stage key. "MATH" has an entry here and a colour below but
# is not listed in STAGE_KEYS, so no series is built for it from STAGE_KEYS.
STAGE_DISPLAY = {
    "DMA_READ": "DMA in",
    "FETCH": "Fetch",
    "GEMM": "GEMM",
    "MATH": "Math",
    "DMA_WRITE": "DMA out",
}
# Bar / legend-swatch fill colour per stage key.
STAGE_COLORS_PPTX = {
    "DMA_READ": RGBColor(0x3B, 0x82, 0xF6),
    "FETCH": RGBColor(0x10, 0xB9, 0x81),
    "GEMM": RGBColor(0xF5, 0x9E, 0x0B),
    "MATH": RGBColor(0xEF, 0x44, 0x44),
    "DMA_WRITE": RGBColor(0xA8, 0x55, 0xF7),
}
# Bar / legend-swatch fill colour per kernel variant; used by the charts
# whose series are keyed by variant name.
VARIANT_COLORS_PPTX = {
    "ref_ref": RGBColor(0x10, 0xB9, 0x81),
    "load_ref": RGBColor(0xF5, 0x9E, 0x0B),
    "load_load": RGBColor(0xEF, 0x44, 0x44),
}
def _shape_label(r: dict) -> str:
if r["M"] == r["K"] == r["N"]:
return f"M=K=N={r['M']}"
return f"M={r['M']}\nK={r['K']}\nN={r['N']}"
def _draw_native_bar_chart(slide, *, plot_x, plot_y, plot_w, plot_h,
                           shape_labels, flagged, tile_counts,
                           series, colors_map, display_map,
                           wall_clocks=None,
                           y_label="ns",
                           legend_x, legend_w,
                           foot_note=None,
                           threshold_line=None,
                           flagged_bar_color=None,
                           flagged_series_only=None):
    """Render a grouped bar chart natively in PPTX. Linear Y scale.

    One group of bars is drawn per entry of ``shape_labels`` and one bar per
    series inside each group. Shapes are added in drawing order (plot box,
    gridlines, threshold line, bars, dots, labels, legend, foot note), so the
    statement order below also decides which shape sits on top.

    Args:
        slide: Slide to draw on (its ``shapes`` collection is used directly).
        plot_x, plot_y, plot_w, plot_h: Plot box position and size; passed to
            ``Inches`` when shapes are created.
        shape_labels: One label per bar group; nothing is drawn when empty.
        flagged: Per-group flags; a flagged group gets a red bold label and
            an "under-tile" marker.
        tile_counts: Per-group tile counts printed under the label; pass a
            falsy value to omit them.
        series: dict[str -> list[float]] — series_name → value per shape.
            Values <= 0 produce no bar.
        colors_map: series_name → fill colour for bars and legend swatches.
        display_map: series_name → legend text (falls back to the name).
        wall_clocks: Optional per-group values drawn as a dot at the group
            centre, with an extra legend entry.
        y_label: Y-axis title.
        legend_x, legend_w: Legend left edge and width.
        foot_note: Optional text drawn below the chart.
        threshold_line: if set, draws a dashed horizontal reference line at
            this y-value, labelled "<value>% peak".
        flagged_bar_color: if set, overrides colors_map[sname] for flagged
            shapes.
        flagged_series_only: if set, the flag override applies only to this
            series.
    """
    n_shapes = len(shape_labels)
    if n_shapes == 0:
        return
    series_names = list(series.keys())
    n_series = len(series_names)

    # Y-axis range: linear, top = max value * 1.10 (include wall-clock).
    # Only positive values take part; the threshold is added as given.
    all_vals = [v for vals in series.values() for v in vals if v > 0]
    if wall_clocks:
        all_vals.extend([w for w in wall_clocks if w > 0])
    if threshold_line is not None:
        all_vals.append(threshold_line)
    y_max = max(all_vals) * 1.10 if all_vals else 1.0

    def y_of(v):
        # Map a data value to a slide y coordinate: 0 sits on the bottom
        # edge of the plot box, y_max on the top edge. Negatives clamp to 0.
        v = max(v, 0.0)
        return plot_y + plot_h * (1 - v / y_max)

    # Plot box background
    _rect_band(slide, plot_x, plot_y, plot_w, plot_h,
               RGBColor(0xFF, 0xFF, 0xFF), RGBColor(0xCB, 0xD5, 0xE1))

    # Y-axis ticks (6 levels)
    for i in range(6):
        v = y_max * i / 5
        y = y_of(v)
        # gridline; 1 is the connector-type code given to add_connector
        # (presumably the straight connector — confirm against python-pptx).
        s = slide.shapes.add_connector(1, Inches(plot_x), Inches(y),
                                       Inches(plot_x + plot_w), Inches(y))
        s.line.color.rgb = RGBColor(0xE2, 0xE8, 0xF0)
        s.line.width = Pt(0.5)
        # label
        _textbox(slide, plot_x - 0.85, y - 0.12, 0.75, 0.25,
                 f"{v:>8.0f}", size=9, color=COL_MUTED, align=PP_ALIGN.RIGHT)

    # Y-axis title
    _textbox(slide, plot_x - 0.85, plot_y + plot_h / 2 - 0.15, 0.75, 0.3,
             y_label, size=10, color=COL_TEXT_DARK)

    # Threshold reference line (dashed)
    if threshold_line is not None:
        ty = y_of(threshold_line)
        line = slide.shapes.add_connector(
            1, Inches(plot_x), Inches(ty),
            Inches(plot_x + plot_w), Inches(ty),
        )
        line.line.color.rgb = COL_TEXT_DARK
        line.line.width = Pt(1.5)
        # The dash style is written straight into the line's XML: find or
        # create the <a:prstDash> child and set its value to "dash".
        from pptx.oxml.ns import qn
        from lxml import etree
        ln = line.line._get_or_add_ln()
        pr = ln.find(qn("a:prstDash"))
        if pr is None:
            pr = etree.SubElement(ln, qn("a:prstDash"))
        pr.set("val", "dash")
        _textbox(slide, plot_x + plot_w - 0.7, ty - 0.30, 0.7, 0.25,
                 f"{threshold_line:.0f}% peak",
                 size=9, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.RIGHT)

    # Geometry per shape group: the groups together take plot_w / 1.4 of the
    # width; the remainder is split into n_shapes + 1 equal gaps.
    group_w = plot_w / (n_shapes * 1.4)
    bar_w = group_w / max(n_series, 1)
    gap = (plot_w - n_shapes * group_w) / (n_shapes + 1)
    y_base = plot_y + plot_h
    for i in range(n_shapes):
        x_group = plot_x + gap + i * (group_w + gap)
        cx = x_group + group_w / 2
        for j, sname in enumerate(series_names):
            v = series[sname][i]
            if v <= 0:
                # No bar for zero/negative values; the slot stays empty.
                continue
            bx = x_group + j * bar_w
            y_top = y_of(v)
            # The bar fills 85% of its slot, leaving a sliver between bars.
            s = slide.shapes.add_shape(
                MSO_SHAPE.RECTANGLE,
                Inches(bx), Inches(y_top),
                Inches(bar_w * 0.85), Inches(y_base - y_top),
            )
            s.fill.solid()
            # Recolour the bar only when an override colour was given, this
            # group is flagged, and the series is the targeted one (or no
            # target series was named).
            apply_flag = (flagged_bar_color is not None and flagged[i]
                          and (flagged_series_only is None
                               or sname == flagged_series_only))
            fill_color = flagged_bar_color if apply_flag else colors_map[sname]
            s.fill.fore_color.rgb = fill_color
            s.line.color.rgb = COL_TEXT_DARK
            s.line.width = Pt(0.4)
        # Wall-clock dot, centred on the group
        if wall_clocks and wall_clocks[i] > 0:
            wy = y_of(wall_clocks[i])
            r = 0.05
            d = slide.shapes.add_shape(
                MSO_SHAPE.OVAL,
                Inches(cx - r), Inches(wy - r),
                Inches(r * 2), Inches(r * 2),
            )
            d.fill.solid()
            d.fill.fore_color.rgb = COL_TEXT_DARK
            d.line.color.rgb = COL_TEXT_DARK
        # Shape label below the group (red and bold when flagged)
        lab_color = COL_RED if flagged[i] else COL_TEXT_DARK
        _textbox(slide, x_group, y_base + 0.05, group_w, 0.7,
                 shape_labels[i], size=9, bold=flagged[i], color=lab_color)
        if tile_counts:
            _textbox(slide, x_group, y_base + 0.65, group_w, 0.3,
                     f"{tile_counts[i]} tiles",
                     size=8, color=COL_MUTED)
        if flagged[i]:
            _textbox(slide, x_group, y_base + 0.92, group_w, 0.3,
                     "↑ under-tile",
                     size=8, bold=True, color=COL_RED)

    # Legend on the right; the heading is chosen by whether the series are
    # pipeline stages (a "DMA_READ" series is present) or variants.
    _textbox(slide, legend_x, plot_y, legend_w, 0.3,
             "Stages (per bar):" if "DMA_READ" in series_names
             else "Variants (per bar):",
             size=11, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    ly = plot_y + 0.4
    for sname in series_names:
        # color swatch
        sw = slide.shapes.add_shape(
            MSO_SHAPE.RECTANGLE,
            Inches(legend_x), Inches(ly), Inches(0.2), Inches(0.2),
        )
        sw.fill.solid()
        sw.fill.fore_color.rgb = colors_map[sname]
        sw.line.color.rgb = COL_TEXT_DARK
        sw.line.width = Pt(0.5)
        _textbox(slide, legend_x + 0.28, ly - 0.05, legend_w - 0.3, 0.3,
                 display_map.get(sname, sname),
                 size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
        ly += 0.28
    if wall_clocks:
        ly += 0.08
        d = slide.shapes.add_shape(
            MSO_SHAPE.OVAL,
            Inches(legend_x + 0.05), Inches(ly + 0.04),
            Inches(0.12), Inches(0.12),
        )
        d.fill.solid()
        d.fill.fore_color.rgb = COL_TEXT_DARK
        d.line.color.rgb = COL_TEXT_DARK
        _textbox(slide, legend_x + 0.28, ly - 0.05, legend_w - 0.3, 0.3,
                 "kernel wall-clock",
                 size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    # Extra legend entry for flagged (under-tile) bars
    if flagged_bar_color is not None and any(flagged):
        ly += 0.30
        sw = slide.shapes.add_shape(
            MSO_SHAPE.RECTANGLE,
            Inches(legend_x), Inches(ly), Inches(0.2), Inches(0.2),
        )
        sw.fill.solid()
        sw.fill.fore_color.rgb = flagged_bar_color
        sw.line.color.rgb = COL_TEXT_DARK
        sw.line.width = Pt(0.5)
        # Name the entry after the targeted series, else after the first key
        # of display_map, else the literal "value".
        flagged_label_target = flagged_series_only or (
            list(display_map.keys())[0] if display_map else "value"
        )
        flagged_label = (
            display_map.get(flagged_label_target, flagged_label_target)
            + " (under-tile)"
        )
        _textbox(slide, legend_x + 0.28, ly - 0.05, legend_w - 0.3, 0.3,
                 flagged_label,
                 size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)

    if foot_note:
        _textbox(slide, plot_x, y_base + 1.3, plot_w, 0.4,
                 foot_note, size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)
EXCLUDED_SHAPES = {(512, 512, 512)}
def _load_sweep_data() -> dict:
if not GEMM_SWEEP_JSON.exists():
return {"rows": []}
data = json.loads(GEMM_SWEEP_JSON.read_text())
data["rows"] = [
r for r in data.get("rows", [])
if (r["M"], r["K"], r["N"]) not in EXCLUDED_SHAPES
]
return data
def _render_stage_breakdown(slide, variant: str, *, per_tile: bool = False):
    """Stage breakdown for one variant. Linear Y. If per_tile, divide by tile count.

    Uses wall_ns (interval-union of records on each engine) — the honest
    engine-active time. Avoids the double-counting that occupancy_ns has
    when multiple ops overlap on a contended resource (HBM_CTRL queue).

    Args:
        slide: Slide to draw on.
        variant: Value matched against each sweep row's ``"variant"`` field.
        per_tile: When True each stage value is divided by the row's
            ``tile_count_expected`` (a missing or zero count is treated as 1).

    When no row matches the variant, a red hint to run the sweep script is
    drawn instead of the chart.
    """
    data = _load_sweep_data()
    rows = [r for r in data["rows"] if r.get("variant") == variant]
    if not rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 f"No sweep data found for variant '{variant}'. "
                 f"Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return
    tile = data["tile_sizes"]
    subtitle_unit = "Per-tile" if per_tile else "Per-stage"
    _textbox(slide, 0.4, 1.0, 12.6, 0.45,
             f"Variant: {variant} | {subtitle_unit} engine wall-clock "
             f"(linear) — DMA in / Fetch / GEMM / DMA out per shape. "
             f"Tile size {tile['M']}×{tile['K']}×{tile['N']}.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)
    shape_labels = [_shape_label(r) for r in rows]
    flagged = [_under_tile(r["M"], r["K"], r["N"],
                           tile["M"], tile["K"], tile["N"]) for r in rows]
    tile_counts = [r["tile_count_expected"] for r in rows]

    def _val(r, s):
        # wall_ns of stage ``s`` for row ``r``; 0.0 when the stage is absent.
        v = r.get("stages", {}).get(s, {}).get("wall_ns", 0.0)
        if per_tile:
            # Guard against a missing/zero tile count so the division is safe.
            tc = r.get("tile_count_expected", 0) or 1
            return v / tc
        return v

    series = {s: [_val(r, s) for r in rows] for s in STAGE_KEYS}
    # The interval in the second note is a difference, so it needs its minus
    # sign: "max t_end - min t_start".
    foot_note = (
        "Bars = engine wall-clock ÷ tile count (amortized per-tile cost). "
        "Falls with tile count as the pipeline fills."
        if per_tile else
        "Bars = engine wall-clock interval (max t_end - min t_start, "
        "merged overlaps). Strips queue-wait double-counting."
    )
    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=1.65, plot_w=10.0, plot_h=4.45,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=STAGE_COLORS_PPTX,
        display_map=STAGE_DISPLAY,
        wall_clocks=None,
        y_label="ns/tile" if per_tile else "ns",
        legend_x=11.4, legend_w=1.85,
        foot_note=foot_note,
    )
def _render_stage_breakdown_ref_ref(slide):
    """Draw the per-stage breakdown chart for the ``ref_ref`` variant."""
    _render_stage_breakdown(slide, variant="ref_ref")
def _render_stage_breakdown_load_ref(slide):
    """Draw the per-stage breakdown chart for the ``load_ref`` variant."""
    _render_stage_breakdown(slide, variant="load_ref")
def _render_stage_breakdown_load_load(slide):
    """Draw the per-stage breakdown chart for the ``load_load`` variant."""
    _render_stage_breakdown(slide, variant="load_load")
def _render_variant_comparison(slide):
    """Wall-clock per shape per variant (one bar per variant in each group).

    Rows are grouped by ``(M, K, N)`` in first-seen order. Each bar is the
    row's ``pe_window_ns``; a variant with no row for a shape contributes
    0.0, which the chart draws as no bar. When there are no rows a red hint
    to run the sweep script is drawn instead.

    Args:
        slide: Slide to draw on.
    """
    data = _load_sweep_data()
    rows = data["rows"]
    if not rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return
    tile = data["tile_sizes"]
    variants = data.get("variants", ["ref_ref", "load_ref", "load_load"])

    # Group by shape (preserve first-seen order).
    by_shape: dict = {}
    for r in rows:
        key = (r["M"], r["K"], r["N"])
        by_shape.setdefault(key, {})[r["variant"]] = r
    shapes = list(by_shape.keys())

    # Label and tile count are read from the first variant row recorded for
    # each shape.
    first_rows = [next(iter(by_shape[k].values())) for k in shapes]
    shape_labels = [_shape_label(r) for r in first_rows]
    flagged = [_under_tile(k[0], k[1], k[2], tile["M"], tile["K"], tile["N"])
               for k in shapes]
    tile_counts = [r["tile_count_expected"] for r in first_rows]
    series = {
        v: [(by_shape[k].get(v) or {}).get("pe_window_ns", 0.0) for k in shapes]
        for v in variants
    }
    display_map = {v: v for v in variants}
    _textbox(slide, 0.4, 1.0, 12.6, 0.4,
             "Kernel wall-clock per variant per shape (linear). "
             "ref_ref baseline, load_ref pins A, load_load pins both A and B.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)
    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=1.6, plot_w=10.0, plot_h=4.5,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=VARIANT_COLORS_PPTX,
        display_map=display_map,
        wall_clocks=None,
        y_label="wall ns",
        legend_x=11.4, legend_w=1.85,
        foot_note=("After Phase 2 fix (gated STORE/DMA_WRITE + pinned operand "
                   "skip): load_ref / load_load are faster than ref_ref."),
    )
def _render_hbm_topology(slide):
    """Draw the cube-shared HBM path: PEs → router mesh → HBM_CTRL → HBM.

    Layout, left to right: a cube frame holding a 4×2 grid of PE boxes (each
    with a TCM box beneath) and a router strip on its right edge, then the
    HBM_CTRL box with a "BOTTLENECK" caption, then the HBM banks box. Two
    takeaway paragraphs sit at the bottom.

    Args:
        slide: Slide to draw on.
    """
    _textbox(slide, 0.4, 1.0, 12.6, 0.55,
             "DMA reads cross the cube fabric — HBM_CTRL is one per cube "
             "(NOT per PE). All 8 PEs serialize at the controller's single "
             "channel resource. Even one active PE pays the round-trip on "
             "every K-tile miss.",
             size=13, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # Frame around the PEs, their TCMs and the router strip.
    frame_x, frame_y, frame_w, frame_h = 0.4, 1.75, 6.4, 4.45
    frame = slide.shapes.add_shape(
        MSO_SHAPE.RECTANGLE,
        Inches(frame_x), Inches(frame_y), Inches(frame_w), Inches(frame_h),
    )
    frame.fill.background()
    frame.line.color.rgb = COL_MUTED
    frame.line.width = Pt(1.5)
    _textbox(slide, frame_x + 0.1, frame_y + 0.04, 2.5, 0.3,
             "CUBE (8 PEs share HBM)", size=11, bold=True, color=COL_MUTED,
             align=PP_ALIGN.LEFT)

    # PE grid: 4 columns × 2 rows, TCM box under every PE.
    box_w = 1.05
    box_h = 0.62
    tcm_box_h = 0.42
    gap_x = 0.18
    gap_y = 0.30
    origin_x = frame_x + 0.25
    origin_y = frame_y + 0.50
    row_pitch = box_h + tcm_box_h + gap_y
    # (right edge x, vertical centre y) of the last PE in each row; the
    # arrows into the router strip start from these points.
    row_exits = []
    for pe_idx in range(8):
        grid_row, grid_col = divmod(pe_idx, 4)
        left = origin_x + grid_col * (box_w + gap_x)
        top = origin_y + grid_row * row_pitch
        _rrect(slide, left, top, box_w, box_h,
               RGBColor(0xFE, 0xF3, 0xC7), COL_REG_BORDER,
               f"PE{pe_idx}", size=11, bold=True, color=COL_TEXT_DARK)
        _rrect(slide, left, top + box_h + 0.05, box_w, tcm_box_h,
               COL_TCM_BG, COL_TCM_BORDER,
               "TCM (local)\n512 GB/s",
               size=8, color=COL_TCM_BORDER)
        if grid_col == 3:
            row_exits.append((left + box_w, top + box_h / 2))

    # Router strip along the right edge of the frame.
    strip_x = frame_x + frame_w - 0.85
    strip_y = frame_y + 0.50
    strip_w = 0.65
    strip_h = frame_h - 0.65
    _rrect(slide, strip_x, strip_y, strip_w, strip_h,
           RGBColor(0xDB, 0xEA, 0xFE), COL_DMA,
           "ROUTER\nMESH\n\n256 GB/s\nper link",
           size=10, bold=True, color=COL_DMA)

    # One arrow per grid row, from its last PE into the router strip.
    for exit_x, exit_y in row_exits:
        _arrow(slide, exit_x + 0.02, exit_y, strip_x, exit_y,
               color=COL_DMA, width_pt=1.4)

    # HBM_CTRL box just right of the frame, vertically centred on it.
    frame_mid_y = frame_y + frame_h / 2
    ctrl_x = frame_x + frame_w + 0.45
    ctrl_w = 1.85
    ctrl_h = 1.55
    ctrl_y = frame_mid_y - ctrl_h / 2
    _rrect(slide, ctrl_x, ctrl_y, ctrl_w, ctrl_h,
           RGBColor(0xFE, 0xE2, 0xE2), COL_RED,
           "HBM_CTRL\n(1 per cube)\n\nread channel\ncap = 1\n",
           size=11, bold=True, color=COL_RED)
    _textbox(slide, ctrl_x - 0.1, ctrl_y + ctrl_h + 0.05, ctrl_w + 0.2, 0.35,
             "BOTTLENECK", size=12, bold=True, color=COL_RED,
             align=PP_ALIGN.CENTER)

    # Thick arrow router strip → HBM_CTRL, with the link rate above it.
    strip_right = strip_x + strip_w
    _arrow(slide, strip_right, frame_mid_y,
           ctrl_x, frame_mid_y, color=COL_DMA, width_pt=3.0)
    _textbox(slide, strip_right + 0.02,
             frame_mid_y - 0.36, ctrl_x - strip_right - 0.05, 0.3,
             "256 GB/s", size=10, bold=True, color=COL_DMA,
             align=PP_ALIGN.CENTER)

    # HBM banks box on the far right, slightly taller than HBM_CTRL.
    banks_x = ctrl_x + ctrl_w + 0.45
    banks_w = 1.85
    banks_h = ctrl_h + 0.35
    banks_y = ctrl_y - 0.175
    _rrect(slide, banks_x, banks_y, banks_w, banks_h,
           COL_HBM_BG, COL_HBM_BORDER,
           "HBM BANKS\n(per-cube)\n\n256 GB/s\naggregated",
           size=11, bold=True, color=COL_HBM_BORDER)
    _arrow(slide, ctrl_x + ctrl_w, frame_mid_y,
           banks_x, frame_mid_y, color=COL_DMA, width_pt=3.0)

    # Two takeaway paragraphs side by side at the bottom.
    _textbox(slide, 0.4, 6.40, 6.3, 0.55,
             "TCM is per-PE local → fetch/store don't contend.\n"
             "HBM_CTRL is cube-shared → every DMA serializes on cap=1 channel.",
             size=11, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 6.9, 6.40, 6.2, 0.55,
             "Per-op DMA cost grows with #in-flight ops even on 1 PE.\n"
             "load_ref/load_load pin operands → 1 HBM trip instead of per K-tile.",
             size=11, color=COL_RED, align=PP_ALIGN.LEFT)
def _render_per_op_dma(slide):
    """Chart the amortized DMA_READ cost per op: wall_ns / record_count.

    For every shape and for the ``ref_ref`` and ``load_ref`` variants, the
    DMA_READ stage's ``wall_ns`` is divided by its ``record_count``. A row
    without DMA_READ records, or a shape missing a variant, yields 0.0.

    ``load_load`` is left out: per the original note its eager up-front DMAs
    sit outside the composite plan, so they are not recorded as DMA_READ.

    Args:
        slide: Slide to draw on.
    """
    sweep = _load_sweep_data()
    all_rows = sweep["rows"]
    if not all_rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return
    tile = sweep["tile_sizes"]
    plotted_variants = ["ref_ref", "load_ref"]

    # shape → {variant → row}, shapes kept in first-seen order.
    rows_by_shape: dict = {}
    for row in all_rows:
        shape_key = (row["M"], row["K"], row["N"])
        rows_by_shape.setdefault(shape_key, {})[row["variant"]] = row
    shapes = list(rows_by_shape)

    # Labels and tile counts come from the first row stored for each shape.
    lead_rows = [next(iter(rows_by_shape[s].values())) for s in shapes]
    shape_labels = [_shape_label(row) for row in lead_rows]
    flagged = [_under_tile(*s, tile["M"], tile["K"], tile["N"])
               for s in shapes]
    tile_counts = [row["tile_count_expected"] for row in lead_rows]

    def amortized_ns(row):
        # DMA window wall-clock spread evenly over the recorded DMA ops.
        dma = row.get("stages", {}).get("DMA_READ", {})
        n_records = dma.get("record_count", 0)
        window_ns = dma.get("wall_ns", 0.0)
        if not n_records:
            return 0.0
        return window_ns / n_records

    series = {}
    for variant in plotted_variants:
        series[variant] = [
            amortized_ns(rows_by_shape[s].get(variant) or {"stages": {}})
            for s in shapes
        ]
    display_map = {variant: variant for variant in plotted_variants}

    _textbox(slide, 0.4, 1.0, 12.6, 0.55,
             "Amortized per-op cost = (DMA window wall-clock) ÷ (#DMA ops). "
             "Strips out queue-wait double-counting: when N ops overlap, the "
             "window is N·drain_ns, so the average per op = drain_ns ≈ 16 ns "
             "(bandwidth-bound floor at 4 KB ÷ 256 GB/s).",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)
    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=1.7, plot_w=10.0, plot_h=4.4,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=VARIANT_COLORS_PPTX,
        display_map=display_map,
        wall_clocks=None,
        y_label="ns / op",
        legend_x=11.4, legend_w=1.85,
        foot_note=("Flat ~16-20 ns across shapes confirms the per-op "
                   "transfer is constant — what looked like growing per-op "
                   "cost on slide before was queue wait being absorbed."),
    )
def _render_mac_utilization(slide):
    """Chart GEMM util and formula-based useful efficiency (``load_ref`` rows).

    Both series are percentages computed from the shape and tile sizes only:

        GEMM util  = useful FLOPs / (tile FLOPs × tile count)
        Useful eff = (tiles × T_stage × GEMM util) / wall
        wall       = head + tiles × T_stage + (N_mn − 1) × DMA_W per pair

    where N_mn is the number of (m,n) output tiles and head is the
    (D − 1) × T_stage pipeline fill.

    Args:
        slide: Slide to draw on.
    """
    sweep = _load_sweep_data()
    all_rows = sweep["rows"]
    if not all_rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return

    tile = sweep["tile_sizes"]
    tile_m, tile_k, tile_n = tile["M"], tile["K"], tile["N"]
    flops_per_tile = 2 * tile_m * tile_k * tile_n

    # Constants of the analytical pipeline model.
    hbm_rate = 256.0
    bytes_per_elem = 2
    stage_ns = 16.0
    n_stages = 3
    fill_ns = (n_stages - 1) * stage_ns
    flush_ns_per_pair = (tile_m * tile_n * bytes_per_elem) / hbm_rate

    # One load_ref row per shape; a later duplicate replaces an earlier one.
    row_of = {
        (row["M"], row["K"], row["N"]): row
        for row in all_rows
        if row["variant"] == "load_ref"
    }
    shapes = list(row_of)
    shape_labels = [_shape_label(row_of[s]) for s in shapes]
    flagged = [_under_tile(*s, tile_m, tile_k, tile_n) for s in shapes]
    tile_counts = [row_of[s]["tile_count_expected"] for s in shapes]

    util_pcts = []
    eff_pcts = []
    for row in row_of.values():
        dim_m, dim_k, dim_n = row["M"], row["K"], row["N"]
        useful_flops = 2 * dim_m * dim_k * dim_n
        n_tiles_total = row["tile_count_expected"]
        util_pct = useful_flops / (flops_per_tile * n_tiles_total) * 100
        util_pcts.append(util_pct)

        # Number of (m,n) output tiles, each dimension rounded up.
        out_rows = (dim_m + tile_m - 1) // tile_m
        out_cols = (dim_n + tile_n - 1) // tile_n
        n_outputs = out_rows * out_cols

        busy_ns = n_tiles_total * stage_ns
        flush_ns = max(0, n_outputs - 1) * flush_ns_per_pair
        wall_ns = fill_ns + busy_ns + flush_ns
        if wall_ns > 0:
            eff_pct = (busy_ns * (util_pct / 100.0) / wall_ns) * 100
        else:
            eff_pct = 0.0
        eff_pcts.append(eff_pct)

    series = {"GEMM util": util_pcts, "Useful eff": eff_pcts}
    colors_map = {
        "GEMM util": COL_FS,  # emerald
        "Useful eff": RGBColor(0xF5, 0x9E, 0x0B),  # amber
    }
    display_map = {
        "GEMM util": "GEMM util %",
        "Useful eff": "Useful eff %",
    }

    _textbox(slide, 0.4, 1.0, 12.6, 0.80,
             "FORMULA-generated (analytical ideal-pipeline model — not "
             "simulator data). GEMM util = useful FLOPs ÷ (tile FLOPs × "
             "tile count). Useful eff = (N_tiles × T_stage × GEMM_util) "
             "÷ wall, where wall = head + K-loop + inter-(m,n) DMA_W. "
             "Slide 16 overlays this against measured pe_window_ns.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=1.75, plot_w=10.0, plot_h=4.35,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=colors_map,
        display_map=display_map,
        wall_clocks=None,
        y_label="%",
        legend_x=11.4, legend_w=1.85,
        foot_note=("GEMM util < 100% → shape mismatch (padded zeros). "
                   "Useful eff < GEMM util → pipeline overhead "
                   "(head + inter-(m,n) DMA_W) eats more of the wall."),
        threshold_line=100.0,
        flagged_bar_color=COL_RED,
        flagged_series_only="GEMM util",
    )
def _render_mac_utilization_ref_ref(slide):
    """Chart GEMM util and formula-based useful efficiency for ``ref_ref``.

    The model charges two DMA reads per tile (A and B), so the steady-state
    cycle is 2 × 16 = 32 ns per tile while each tile delivers only 16 ns of
    compute. Useful efficiency therefore stays near half of GEMM util:

        wall       = head + tiles × 32 ns + (N_mn − 1) × DMA_W per pair
        Useful eff = (tiles × 16 ns × GEMM util) / wall

    Args:
        slide: Slide to draw on.
    """
    sweep = _load_sweep_data()
    all_rows = sweep["rows"]
    if not all_rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return

    tile = sweep["tile_sizes"]
    tile_m, tile_k, tile_n = tile["M"], tile["K"], tile["N"]
    flops_per_tile = 2 * tile_m * tile_k * tile_n

    # Constants of the analytical pipeline model.
    hbm_rate = 256.0
    bytes_per_elem = 2
    compute_stage_ns = 16.0  # FETCH = GEMM = 16 ns/tile
    dma_stage_ns = 2 * 16.0  # 2 DMA_R per tile (A + B)
    cycle_ns = dma_stage_ns  # DMA-bound steady-state cycle
    n_stages = 3  # DMA, FETCH, GEMM
    fill_ns = (n_stages - 1) * compute_stage_ns  # pipeline fill = 32 ns
    flush_ns_per_pair = (tile_m * tile_n * bytes_per_elem) / hbm_rate

    # One ref_ref row per shape; a later duplicate replaces an earlier one.
    row_of = {
        (row["M"], row["K"], row["N"]): row
        for row in all_rows
        if row["variant"] == "ref_ref"
    }
    shapes = list(row_of)
    shape_labels = [_shape_label(row_of[s]) for s in shapes]
    flagged = [_under_tile(*s, tile_m, tile_k, tile_n) for s in shapes]
    tile_counts = [row_of[s]["tile_count_expected"] for s in shapes]

    util_pcts = []
    eff_pcts = []
    for row in row_of.values():
        dim_m, dim_k, dim_n = row["M"], row["K"], row["N"]
        useful_flops = 2 * dim_m * dim_k * dim_n
        n_tiles_total = row["tile_count_expected"]
        util_pct = useful_flops / (flops_per_tile * n_tiles_total) * 100
        util_pcts.append(util_pct)

        # Number of (m,n) output tiles, each dimension rounded up.
        out_rows = (dim_m + tile_m - 1) // tile_m
        out_cols = (dim_n + tile_n - 1) // tile_n
        n_outputs = out_rows * out_cols

        # Compute delivers 16 ns of MAC work per tile, but the pipeline
        # advances at the DMA-bound 32 ns cycle.
        busy_ns = n_tiles_total * compute_stage_ns
        steady_ns = n_tiles_total * cycle_ns
        flush_ns = max(0, n_outputs - 1) * flush_ns_per_pair
        wall_ns = fill_ns + steady_ns + flush_ns
        if wall_ns > 0:
            eff_pct = (busy_ns * (util_pct / 100.0) / wall_ns) * 100
        else:
            eff_pct = 0.0
        eff_pcts.append(eff_pct)

    series = {"GEMM util": util_pcts, "Useful eff": eff_pcts}
    colors_map = {
        "GEMM util": COL_FS,
        "Useful eff": RGBColor(0xF5, 0x9E, 0x0B),
    }
    display_map = {
        "GEMM util": "GEMM util %",
        "Useful eff": "Useful eff % (ref_ref)",
    }

    _textbox(slide, 0.4, 1.0, 12.6, 0.85,
             "FORMULA-generated (ideal-pipeline model, ref_ref variant — "
             "not simulator data). Scheduler issues DMA_R for BOTH A and B "
             "every tile. Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH "
             "and GEMM stay at 16 ns. Pipeline cycle is DMA-bound → useful "
             "eff caps near 50 % × GEMM_util, regardless of K-loop length.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=1.85, plot_w=10.0, plot_h=4.25,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=colors_map,
        display_map=display_map,
        wall_clocks=None,
        y_label="%",
        legend_x=11.4, legend_w=1.85,
        foot_note=("Compare to load_ref (prev slide): tl.load pins A in "
                   "TCM once, eliminating the per-tile A DMA_R → DMA stage "
                   "halves to 16 ns/tile → useful eff roughly doubles at "
                   "the same GEMM util."),
        threshold_line=100.0,
        flagged_bar_color=COL_RED,
        flagged_series_only="GEMM util",
    )
def _render_mac_utilization_formula_vs_measured(slide):
    """Overlay formula and measured utilization/efficiency (``load_ref`` rows).

    Four percentage series are drawn per shape:

        GEMM util (formula)  = useful FLOPs / (tile FLOPs × tile_count_expected)
        GEMM util (measured) = useful FLOPs / (tile FLOPs × GEMM record_count),
                               falling back to tile_count_expected when the
                               record count is missing or zero
        Formula eff          = (tiles × T_stage × GEMM util) / wall_formula,
                               wall_formula = head + tiles × T_stage
                                              + (N_mn − 1) × DMA_W per pair
        Measured eff         = (useful FLOPs / composite_window_ns) / peak,
                               peak = tile FLOPs / T_stage; 0 when the row
                               has no positive composite_window_ns

    Args:
        slide: Slide to draw on.
    """
    sweep = _load_sweep_data()
    all_rows = sweep["rows"]
    if not all_rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return

    tile = sweep["tile_sizes"]
    tile_m, tile_k, tile_n = tile["M"], tile["K"], tile["N"]
    flops_per_tile = 2 * tile_m * tile_k * tile_n

    # Constants of the analytical pipeline model.
    hbm_rate = 256.0
    bytes_per_elem = 2
    stage_ns = 16.0
    n_stages = 3
    fill_ns = (n_stages - 1) * stage_ns
    flush_ns_per_pair = (tile_m * tile_n * bytes_per_elem) / hbm_rate
    peak_flops_per_ns = flops_per_tile / stage_ns  # MAC peak throughput

    # One load_ref row per shape; a later duplicate replaces an earlier one.
    row_of = {
        (row["M"], row["K"], row["N"]): row
        for row in all_rows
        if row["variant"] == "load_ref"
    }
    shapes = list(row_of)
    shape_labels = [_shape_label(row_of[s]) for s in shapes]
    flagged = [_under_tile(*s, tile_m, tile_k, tile_n) for s in shapes]
    tile_counts = [row_of[s]["tile_count_expected"] for s in shapes]

    util_formula_pcts: list[float] = []
    util_measured_pcts: list[float] = []
    eff_formula_pcts: list[float] = []
    eff_measured_pcts: list[float] = []
    for row in row_of.values():
        dim_m, dim_k, dim_n = row["M"], row["K"], row["N"]
        useful_flops = 2 * dim_m * dim_k * dim_n
        n_tiles_total = row["tile_count_expected"]

        util_formula = useful_flops / (flops_per_tile * n_tiles_total)
        util_formula_pcts.append(util_formula * 100)

        # Measured util counts the GEMM records the row reports; rows without
        # that count fall back to the expected tile count.
        gemm_stage = row.get("stages", {}).get("GEMM", {})
        n_gemm_records = gemm_stage.get("record_count", 0) or n_tiles_total
        if n_gemm_records > 0:
            util_measured = useful_flops / (flops_per_tile * n_gemm_records)
        else:
            util_measured = 0.0
        util_measured_pcts.append(util_measured * 100)

        # Number of (m,n) output tiles, each dimension rounded up.
        out_rows = (dim_m + tile_m - 1) // tile_m
        out_cols = (dim_n + tile_n - 1) // tile_n
        n_outputs = out_rows * out_cols

        busy_ns = n_tiles_total * stage_ns
        flush_ns = max(0, n_outputs - 1) * flush_ns_per_pair
        wall_formula_ns = fill_ns + busy_ns + flush_ns
        if wall_formula_ns > 0:
            eff_formula = (busy_ns * util_formula / wall_formula_ns) * 100
        else:
            eff_formula = 0.0
        eff_formula_pcts.append(eff_formula)

        window_ns = row.get("composite_window_ns", 0.0) or 0.0
        if window_ns > 0:
            eff_measured = (useful_flops / window_ns / peak_flops_per_ns) * 100
        else:
            eff_measured = 0.0
        eff_measured_pcts.append(eff_measured)

    series = {
        "GEMM util F": util_formula_pcts,
        "GEMM util M": util_measured_pcts,
        "Formula eff": eff_formula_pcts,
        "Measured eff": eff_measured_pcts,
    }
    colors_map = {
        "GEMM util F": COL_FS,  # emerald (formula ceiling)
        "GEMM util M": RGBColor(0x6E, 0xE7, 0xB7),  # mint (measured ceiling)
        "Formula eff": RGBColor(0xF5, 0x9E, 0x0B),  # amber (formula eff)
        "Measured eff": COL_DMA,  # blue (measured eff)
    }
    display_map = {
        "GEMM util F": "GEMM util % (formula)",
        "GEMM util M": "GEMM util % (measured, op_log)",
        "Formula eff": "Formula useful eff %",
        "Measured eff": "Measured useful eff %",
    }

    _textbox(slide, 0.4, 1.0, 12.6, 1.10,
             "Four series per shape:\n"
             " • GEMM util (formula, green): useful_FLOPs ÷ (tile_FLOPs × "
             "tile_count_expected) — structural shape-vs-tile ceiling.\n"
             " • GEMM util (measured, mint): useful_FLOPs ÷ (tile_FLOPs × "
             "GEMM_record_count) — uses the actual GEMM ops the simulator "
             "ran. Should equal formula → validates plan execution.\n"
             " • Formula useful eff (amber): GEMM_util × ideal pipeline efficiency.\n"
             " • Measured useful eff (blue): (useful_FLOPs ÷ "
             "composite_window_ns) ÷ peak — composite_window_ns covers "
             "only the tl.composite pipeline (excludes upfront tl.load).",
             size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)
    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=2.25, plot_w=10.0, plot_h=3.85,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=colors_map,
        display_map=display_map,
        wall_clocks=None,
        y_label="%",
        legend_x=11.4, legend_w=1.85,
        foot_note=("Reading: GEMM util F = GEMM util M → simulator ran the "
                   "expected tile plan. Measured eff uses composite-only "
                   "window (excludes upfront tl.load), so it isolates "
                   "pipeline efficiency. Variant: load_ref."),
        threshold_line=100.0,
    )
def _render_tflops_table(slide):
    """Render the ideal-pipeline table: GEMM util % and useful eff % per shape.

    Ideal pipelined model: assumes non-blocking tl.load + multi-channel HBM.
    Three-stage pipeline (DMA_R → FETCH → GEMM), all stages bandwidth-balanced
    at T_stage = 16 ns/tile.  For every (M, K, N) found in the sweep data:

        wall       = head + N_tiles × T_stage + (N_mn − 1) × T_dma_w
        useful eff = (N_tiles × T_stage × GEMM_util) / wall

    where head = (D − 1) × T_stage is the pipeline fill and the final DMA_W
    flush is treated as tail and left out of the wall.

    The sweep is read through ``_load_sweep_data()``.  When it has no rows a
    single red hint is drawn instead of the table.

    Args:
        slide: slide object the text boxes and the table are added to.
    """
    data = _load_sweep_data()
    rows = data["rows"]
    if not rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return

    tile = data["tile_sizes"]
    TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"]
    HBM_GBS = 256.0     # bytes/ns
    bpe = 2             # f16
    T_STAGE = 16.0      # ns per pipeline stage (all stages BW-balanced)
    D_STAGES = 3        # DMA_R, FETCH, GEMM
    # Pipeline fill: (D - 1) × T_stage = 32 ns with the constants above.
    head_ns = (D_STAGES - 1) * T_STAGE
    # Time to write one output tile back (8 ns for a 32×32 f16 tile).
    dma_w_per_pair_ns = (TILE_M * TILE_N * bpe) / HBM_GBS

    # Group rows by shape; first-seen order of the shapes is the table order.
    by_shape: dict = {}
    for r in rows:
        by_shape.setdefault((r["M"], r["K"], r["N"]), {})[r["variant"]] = r
    shapes = list(by_shape)

    _textbox(slide, 0.4, 1.0, 12.6, 0.85,
             "Ideal pipelined model — assumes non-blocking tl.load + "
             "multi-channel HBM so DMA, FETCH and GEMM all run at "
             f"T_stage = {T_STAGE:.0f} ns/tile.\n"
             " wall = head_latency + N_tiles × T_stage + Σ inter-(m,n) DMA_W\n"
             f" head_latency = (D−1) × T_stage = {head_ns:.0f} ns "
             f"(pipeline fill, D = {D_STAGES} stages).\n"
             f" inter DMA_W = (N_mn − 1) × {dma_w_per_pair_ns:.0f} ns "
             "(final flush is tail — excluded).\n"
             " useful eff = (N_tiles × T_stage × GEMM_util) / wall — "
             "MAC time producing real output, not padded zeros.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)

    headers = ["Input shape", "GEMM util %", "Useful eff %"]
    n_rows = len(shapes) + 1        # +1 for the header row
    n_cols = len(headers)
    table_x = 2.5
    table_y = 2.10
    table_w = 8.4
    table_h = 4.30
    tbl_shape = slide.shapes.add_table(
        n_rows, n_cols,
        Inches(table_x), Inches(table_y), Inches(table_w), Inches(table_h),
    )
    tbl = tbl_shape.table
    widths_in = [3.0, 2.5, 2.9]     # sums to table_w
    for ci, w in enumerate(widths_in):
        tbl.columns[ci].width = Inches(w)

    def _set_cell(ci, ri, text, *, bold=False, color=COL_TEXT_DARK,
                  fill=None, size=12, align=PP_ALIGN.CENTER):
        """Write one styled, single-run cell at column ``ci``, row ``ri``."""
        cell = tbl.cell(ri, ci)
        if fill is not None:
            cell.fill.solid()
            cell.fill.fore_color.rgb = fill
        tf = cell.text_frame
        tf.margin_left = Emu(36000)
        tf.margin_right = Emu(36000)
        tf.margin_top = Emu(18000)
        tf.margin_bottom = Emu(18000)
        tf.word_wrap = True
        p = tf.paragraphs[0]
        p.alignment = align
        # Clear the paragraph first so the styled run is its only content.
        p.text = ""
        run = p.add_run()
        run.text = text
        run.font.size = Pt(size)
        run.font.bold = bold
        run.font.name = "Consolas"
        run.font.color.rgb = color

    for ci, h in enumerate(headers):
        _set_cell(ci, 0, h, bold=True, color=COL_TEXT_LIGHT,
                  fill=RGBColor(0x10, 0x2A, 0x55), size=12)

    tile_flops = 2 * TILE_M * TILE_K * TILE_N
    for ri, shape_key in enumerate(shapes, start=1):
        M, K, N = shape_key
        useful = 2 * M * K * N
        # tile_count_expected is taken from whichever variant row comes first.
        any_row = next(iter(by_shape[shape_key].values()))
        tiles = any_row["tile_count_expected"]
        padded_flops = tile_flops * tiles
        # Guard the ratio like the other efficiency ratios in this file: a
        # zero tile count would otherwise abort the whole deck build.
        gemm_util = (useful / padded_flops * 100
                     if padded_flops > 0 else 0.0)
        is_under = _under_tile(M, K, N, TILE_M, TILE_K, TILE_N)

        m_tiles = (M + TILE_M - 1) // TILE_M    # ceil(M / TILE_M)
        n_tiles = (N + TILE_N - 1) // TILE_N    # ceil(N / TILE_N)
        n_mn = m_tiles * n_tiles
        gemm_total_ns = tiles * T_STAGE
        # One DMA_W between consecutive (m,n) groups; the last one is tail.
        inter_dma_w_ns = max(0, n_mn - 1) * dma_w_per_pair_ns
        wall_ns = head_ns + gemm_total_ns + inter_dma_w_ns
        # Useful eff = pipeline efficiency × GEMM_util (shape-waste included).
        useful_eff = (
            (gemm_total_ns * (gemm_util / 100.0)) / wall_ns * 100
            if wall_ns > 0 else 0.0
        )

        row_fill = (RGBColor(0xFE, 0xF2, 0xF2) if is_under
                    else RGBColor(0xF8, 0xFA, 0xFC))
        text_color = COL_RED if is_under else COL_TEXT_DARK
        shape_txt = f"{M}×{K}×{N}" + (" (under-tile)" if is_under else "")
        _set_cell(0, ri, shape_txt, bold=is_under, color=text_color,
                  fill=row_fill, align=PP_ALIGN.LEFT)
        _set_cell(1, ri, f"{gemm_util:.1f} %", bold=is_under,
                  color=text_color, fill=row_fill)
        _set_cell(2, ri, f"{useful_eff:.1f} %", fill=row_fill,
                  color=COL_TEXT_DARK, bold=True)

    _textbox(slide, 0.4, 6.50, 12.6, 0.6,
             "What the model could achieve with non-blocking tl.load + "
             "multi-channel HBM (A streams tile-by-tile, no upfront serial). "
             "Today's simulator caps at ~50 % for tall-skinny K because tl.load "
             "is whole-operand blocking and the cube has a single HBM channel.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
def _render_pipeline_eff_walkthrough(slide):
    """Visualize how Useful eff is calculated for one example shape.

    Example: 32×128×128 (8 K-tiles in 4 (m,n) groups, head + 3 inter DMA_W).
    Three bands are drawn top to bottom: the 3-stage pipeline structure, a
    to-scale timeline of head / GEMM tiles / flushes (plus the excluded tail
    flush), and the formula evaluated numerically.

    Every number is hardcoded for this illustrative slide; nothing is read
    from the sweep data.

    Args:
        slide: slide object the shapes and text boxes are added to.
    """
    # Example parameters (kept hardcoded — this slide is illustrative)
    M, K, N = 32, 128, 128
    T_STAGE = 16
    D_STAGES = 3                                      # DMA_R, FETCH, GEMM
    K_TILES = 2                                       # K=128 → K_tiles = 2
    N_MN = 4                                          # M_tile × N_tile = 1 × 4
    TOTAL_TILES = K_TILES * N_MN                      # 8
    HEAD_NS = (D_STAGES - 1) * T_STAGE                # 32
    DMA_W_NS = 8                                      # 32×32×2 / 256 = 8 ns
    INTER_FLUSH_COUNT = N_MN - 1                      # 3 flushes (final excluded)
    GEMM_TOTAL = TOTAL_TILES * T_STAGE                # 128
    INTER_DMA_W_TOTAL = INTER_FLUSH_COUNT * DMA_W_NS  # 24
    WALL = HEAD_NS + GEMM_TOTAL + INTER_DMA_W_TOTAL   # 184
    USEFUL_EFF = GEMM_TOTAL / WALL * 100              # 69.6%

    # Colors
    C_HEAD = RGBColor(0x94, 0xA3, 0xB8)       # slate gray
    C_GEMM = RGBColor(0x10, 0xB9, 0x81)       # emerald (useful)
    C_FLUSH = RGBColor(0xF5, 0x9E, 0x0B)      # amber (inter-(m,n))
    C_TAIL = RGBColor(0xFC, 0xA5, 0xA5)       # light red (excluded)
    C_DMA_R = RGBColor(0x3B, 0x82, 0xF6)      # blue
    C_FETCH = RGBColor(0x10, 0xB9, 0x81)
    C_GEMM_BOX = RGBColor(0xF5, 0x9E, 0x0B)

    # The intro is built from M/K/N so it cannot drift from the example.
    _textbox(slide, 0.4, 1.0, 12.6, 0.5,
             f"Example: M={M}, K={K}, N={N} → {TOTAL_TILES} tiles in "
             f"{N_MN} output groups of {K_TILES} K-tiles each. T_stage = "
             f"{T_STAGE} ns/tile, pipeline depth D = {D_STAGES}.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # ── Top: 3-stage pipeline structure diagram ─────────────────────
    pd_y = 1.7
    pd_h = 0.55
    box_w = 1.4
    box_gap = 0.35
    pd_total_w = 3 * box_w + 2 * box_gap
    pd_x0 = (SLIDE_W_IN - pd_total_w) / 2     # center the row horizontally
    stages = [("DMA_R\nHBM → TCM", C_DMA_R, COL_TEXT_LIGHT),
              ("FETCH\nTCM → Reg", C_FETCH, COL_TEXT_LIGHT),
              ("GEMM\nMAC array", C_GEMM_BOX, COL_TEXT_DARK)]
    for i, (lbl, fill, tc) in enumerate(stages):
        x = pd_x0 + i * (box_w + box_gap)
        _rrect(slide, x, pd_y, box_w, pd_h, fill, COL_TEXT_DARK,
               lbl, size=10, bold=True, color=tc)
        if i < len(stages) - 1:
            # Connector to the next stage, inset slightly from both boxes.
            ax1 = x + box_w + 0.03
            ax2 = x + box_w + box_gap - 0.03
            _arrow(slide, ax1, pd_y + pd_h / 2, ax2, pd_y + pd_h / 2,
                   color=COL_MUTED, width_pt=1.8)
    _textbox(slide, pd_x0, pd_y + pd_h + 0.05, pd_total_w, 0.28,
             f"each stage = {T_STAGE} ns/tile → "
             f"head latency = (D−1) × T_stage = {HEAD_NS} ns",
             size=10, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # ── Middle: timeline of the full kernel ─────────────────────────
    # Layout: head + 4 K-loops separated by 3 DMA_W + 1 tail (excluded)
    tl_y = 3.4
    tl_h = 0.85
    tl_label_y = tl_y - 0.3
    margin = 0.8
    tl_total_ns = WALL + DMA_W_NS             # include tail visually
    tl_w_total = SLIDE_W_IN - 2 * margin
    in_per_ns = tl_w_total / tl_total_ns      # horizontal scale of the band
    text_y = tl_y + tl_h / 2 - 0.13           # labels sit mid-band
    cur_x = margin

    # Head
    w = HEAD_NS * in_per_ns
    _rect_band(slide, cur_x, tl_y, w, tl_h, C_HEAD, COL_TEXT_DARK)
    _textbox(slide, cur_x, text_y, w, 0.3,
             f"head\n{HEAD_NS} ns",
             size=9, bold=True, color=COL_TEXT_LIGHT)
    cur_x += w

    # K-loops + inter DMA_W
    tile_w = T_STAGE * in_per_ns
    flush_w = DMA_W_NS * in_per_ns
    for g in range(N_MN):
        group_start_x = cur_x
        # K-loop: K_TILES GEMM blocks
        for t in range(K_TILES):
            _rect_band(slide, cur_x, tl_y, tile_w, tl_h,
                       C_GEMM, COL_TEXT_DARK)
            tile_no = g * K_TILES + t + 1
            _textbox(slide, cur_x, text_y, tile_w, 0.3,
                     f"GEMM\nT{tile_no}",
                     size=9, bold=True, color=COL_TEXT_LIGHT)
            cur_x += tile_w
        # Mark (m,n) group label above
        _textbox(slide, group_start_x, tl_label_y,
                 cur_x - group_start_x, 0.25,
                 f"(m=0,n={g}) K-loop",
                 size=9, bold=True, color=COL_TEXT_DARK)
        if g < N_MN - 1:
            _rect_band(slide, cur_x, tl_y, flush_w, tl_h,
                       C_FLUSH, COL_TEXT_DARK)
            _textbox(slide, cur_x, text_y, flush_w, 0.3,
                     "flush", size=8, bold=True, color=COL_TEXT_DARK)
            cur_x += flush_w

    # Tail flush (excluded)
    _rect_band(slide, cur_x, tl_y, flush_w, tl_h, C_TAIL, COL_RED)
    _textbox(slide, cur_x, text_y, flush_w, 0.3,
             "tail", size=8, bold=True, color=COL_RED)
    _textbox(slide, cur_x - 0.3, tl_y + tl_h + 0.02, flush_w + 0.6, 0.28,
             "(excluded\nfrom wall)",
             size=8, bold=True, color=COL_RED)

    # Wall span indicator below the timeline: the same segment is drawn in
    # both directions so that each end gets an arrow.
    wall_end_x = margin + WALL * in_per_ns
    span_y = tl_y + tl_h + 0.55
    _arrow(slide, margin, span_y, wall_end_x, span_y,
           color=COL_TEXT_DARK, width_pt=1.5)
    _arrow(slide, wall_end_x, span_y, margin, span_y,
           color=COL_TEXT_DARK, width_pt=1.5)
    _textbox(slide, margin, tl_y + tl_h + 0.6,
             wall_end_x - margin, 0.3,
             f"wall = {WALL} ns",
             size=11, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER)

    # ── Bottom: numerical evaluation ───────────────────────────────
    formula_y = 5.7
    _textbox(slide, 0.4, formula_y, 12.6, 0.35,
             "wall = head + N_tiles × T_stage + (N_mn − 1) × T_dma_w = "
             f"{HEAD_NS} + {TOTAL_TILES}×{T_STAGE} + "
             f"{INTER_FLUSH_COUNT}×{DMA_W_NS} "
             f"= {WALL} ns",
             size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 0.35, 12.6, 0.35,
             "GEMM useful time = N_tiles × T_stage × GEMM_util = "
             f"{TOTAL_TILES}×{T_STAGE}×100 % = {GEMM_TOTAL} ns",
             size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 0.7, 12.6, 0.45,
             f"Useful efficiency = {GEMM_TOTAL} / {WALL} = {USEFUL_EFF:.1f} %",
             size=14, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 1.15, 12.6, 0.35,
             "Overhead = head (pipeline fill) + inter-(m,n) flushes. "
             "Bigger K (more amortization) and smaller N (fewer groups) "
             "both raise the efficiency.",
             size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)
def _render_pipeline_eff_walkthrough_largeK(slide):
    """Walkthrough for the tall-skinny case: M=N=32, K=3072.

    N_mn = 1 (one output group) → zero inter-(m,n) DMA_W flushes.
    K-loop has 48 tiles — compressed visually (show first 4 + ... + last 2),
    so the timeline is NOT to scale: the skipped tiles share one block that
    is as wide as six tiles.

    Every number is hardcoded for this illustrative slide; nothing is read
    from the sweep data.

    Args:
        slide: slide object the shapes and text boxes are added to.
    """
    # Example shape: M=32, K=3072, N=32 (hardcoded, illustrative).
    T_STAGE = 16
    D_STAGES = 3                        # DMA_R, FETCH, GEMM
    K_TILES = 48
    TOTAL_TILES = K_TILES               # N_mn = 1 → every tile is a K-tile
    HEAD_NS = (D_STAGES - 1) * T_STAGE
    DMA_W_NS = 8                        # only drawn as the excluded tail
    GEMM_TOTAL = TOTAL_TILES * T_STAGE
    INTER_DMA_W_TOTAL = 0               # (N_mn − 1) flushes with N_mn = 1
    WALL = HEAD_NS + GEMM_TOTAL + INTER_DMA_W_TOTAL
    USEFUL_EFF = GEMM_TOTAL / WALL * 100

    C_HEAD = RGBColor(0x94, 0xA3, 0xB8)
    C_GEMM = RGBColor(0x10, 0xB9, 0x81)
    C_TAIL = RGBColor(0xFC, 0xA5, 0xA5)
    C_DMA_R = RGBColor(0x3B, 0x82, 0xF6)
    C_FETCH = RGBColor(0x10, 0xB9, 0x81)
    C_GEMM_BOX = RGBColor(0xF5, 0x9E, 0x0B)

    _textbox(slide, 0.4, 1.0, 12.6, 0.5,
             f"Example: M=N=32, K=3072 → {TOTAL_TILES} tiles, "
             "N_mn = M_tiles × N_tiles = 1 × 1 = 1 → "
             "no inter-(m,n) DMA_W flushes. Long K-loop amortizes the head.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # 3-stage pipeline structure (same layout as the small-K walkthrough)
    pd_y = 1.7
    pd_h = 0.55
    box_w = 1.4
    box_gap = 0.35
    pd_total_w = 3 * box_w + 2 * box_gap
    pd_x0 = (SLIDE_W_IN - pd_total_w) / 2     # center the row horizontally
    stages = [("DMA_R\nHBM → TCM", C_DMA_R, COL_TEXT_LIGHT),
              ("FETCH\nTCM → Reg", C_FETCH, COL_TEXT_LIGHT),
              ("GEMM\nMAC array", C_GEMM_BOX, COL_TEXT_DARK)]
    for i, (lbl, fill, tc) in enumerate(stages):
        x = pd_x0 + i * (box_w + box_gap)
        _rrect(slide, x, pd_y, box_w, pd_h, fill, COL_TEXT_DARK,
               lbl, size=10, bold=True, color=tc)
        if i < len(stages) - 1:
            # Connector to the next stage, inset slightly from both boxes.
            ax1 = x + box_w + 0.03
            ax2 = x + box_w + box_gap - 0.03
            _arrow(slide, ax1, pd_y + pd_h / 2, ax2, pd_y + pd_h / 2,
                   color=COL_MUTED, width_pt=1.8)
    _textbox(slide, pd_x0, pd_y + pd_h + 0.05, pd_total_w, 0.28,
             f"each stage = {T_STAGE} ns/tile → "
             f"head latency = (D−1) × T_stage = {HEAD_NS} ns",
             size=10, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # Timeline — compressed (show 4 tiles + gap + 2 tiles)
    tl_y = 3.4
    tl_h = 0.85
    tl_label_y = tl_y - 0.3
    margin = 0.8
    visible_first = 4
    visible_last = 2
    skipped = TOTAL_TILES - visible_first - visible_last
    # Width budget: head + visible tiles + ellipsis block + tail
    ellipsis_ns_equiv = 6 * T_STAGE     # rendered width = 6 tiles worth
    drawn_wall_ns = (HEAD_NS + (visible_first + visible_last) * T_STAGE
                     + ellipsis_ns_equiv)
    tl_total_ns = drawn_wall_ns + DMA_W_NS
    tl_w_total = SLIDE_W_IN - 2 * margin
    in_per_ns = tl_w_total / tl_total_ns      # horizontal scale of the band
    text_y = tl_y + tl_h / 2 - 0.13           # labels sit mid-band
    tile_w = T_STAGE * in_per_ns
    cur_x = margin

    # Head
    w = HEAD_NS * in_per_ns
    _rect_band(slide, cur_x, tl_y, w, tl_h, C_HEAD, COL_TEXT_DARK)
    _textbox(slide, cur_x, text_y, w, 0.3,
             f"head\n{HEAD_NS} ns",
             size=9, bold=True, color=COL_TEXT_LIGHT)
    cur_x += w

    # The single (m,n) group spans everything between head and tail.
    kloop_start_x = cur_x

    # First few tiles
    for t in range(visible_first):
        _rect_band(slide, cur_x, tl_y, tile_w, tl_h, C_GEMM, COL_TEXT_DARK)
        _textbox(slide, cur_x, text_y, tile_w, 0.3,
                 f"GEMM\nT{t + 1}",
                 size=9, bold=True, color=COL_TEXT_LIGHT)
        cur_x += tile_w

    # Ellipsis block standing in for the tiles that are not drawn
    ew = ellipsis_ns_equiv * in_per_ns
    _rect_band(slide, cur_x, tl_y, ew, tl_h,
               RGBColor(0x86, 0xEF, 0xAC), COL_TEXT_DARK)
    _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.2, ew, 0.5,
             f". . . {skipped} more GEMM tiles . . .",
             size=10, bold=True, color=COL_TEXT_DARK)
    cur_x += ew

    # Last tiles, numbered so that the final one is T{TOTAL_TILES}
    for t in range(visible_last):
        _rect_band(slide, cur_x, tl_y, tile_w, tl_h, C_GEMM, COL_TEXT_DARK)
        _textbox(slide, cur_x, text_y, tile_w, 0.3,
                 f"GEMM\nT{TOTAL_TILES - visible_last + t + 1}",
                 size=9, bold=True, color=COL_TEXT_LIGHT)
        cur_x += tile_w

    # K-loop group label
    kloop_end_x = cur_x
    _textbox(slide, kloop_start_x, tl_label_y,
             kloop_end_x - kloop_start_x, 0.25,
             f"(m=0,n=0) K-loop — all {TOTAL_TILES} tiles in one group, "
             "NO inter flushes",
             size=10, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # Tail
    w = DMA_W_NS * in_per_ns
    _rect_band(slide, cur_x, tl_y, w, tl_h, C_TAIL, COL_RED)
    _textbox(slide, cur_x, text_y, w, 0.3,
             "tail", size=8, bold=True, color=COL_RED)
    _textbox(slide, cur_x - 0.3, tl_y + tl_h + 0.02, w + 0.6, 0.28,
             "(excluded\nfrom wall)",
             size=8, bold=True, color=COL_RED)

    # Wall arrow: covers the drawn (compressed) extent up to the tail. The
    # same segment is drawn in both directions so each end gets an arrow.
    wall_end_x = margin + drawn_wall_ns * in_per_ns
    span_y = tl_y + tl_h + 0.55
    _arrow(slide, margin, span_y, wall_end_x, span_y,
           color=COL_TEXT_DARK, width_pt=1.5)
    _arrow(slide, wall_end_x, span_y, margin, span_y,
           color=COL_TEXT_DARK, width_pt=1.5)
    _textbox(slide, margin, tl_y + tl_h + 0.6,
             wall_end_x - margin, 0.3,
             f"wall = {WALL} ns",
             size=11, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER)

    # Formula evaluation
    formula_y = 5.7
    _textbox(slide, 0.4, formula_y, 12.6, 0.35,
             "wall = head + N_tiles × T_stage + (N_mn − 1) × T_dma_w = "
             f"{HEAD_NS} + {TOTAL_TILES}×{T_STAGE} + 0 = {WALL} ns",
             size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 0.35, 12.6, 0.35,
             f"GEMM useful time = {TOTAL_TILES}×{T_STAGE}×100 % = "
             f"{GEMM_TOTAL} ns",
             size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 0.7, 12.6, 0.45,
             f"Useful efficiency = {GEMM_TOTAL} / {WALL} = {USEFUL_EFF:.1f} %",
             size=14, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    # The tile count comes from TOTAL_TILES so the note tracks the constants.
    _textbox(slide, 0.4, formula_y + 1.15, 12.6, 0.35,
             "Long K-loop, one output group → head amortized over "
             f"{TOTAL_TILES} GEMM "
             "tiles, no flush penalty. Approaches 100 % as K grows.",
             size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)
# Register this section's shape-drawn slide renderers; build() looks them up
# by the value of a slide config's "render" key.
_RENDERERS.update(
    stage_breakdown_ref_ref=_render_stage_breakdown_ref_ref,
    stage_breakdown_load_ref=_render_stage_breakdown_load_ref,
    stage_breakdown_load_load=_render_stage_breakdown_load_load,
    variant_comparison=_render_variant_comparison,
    hbm_topology=_render_hbm_topology,
    per_op_dma=_render_per_op_dma,
    mac_utilization=_render_mac_utilization,
    mac_utilization_ref_ref=_render_mac_utilization_ref_ref,
    mac_utilization_formula_vs_measured=(
        _render_mac_utilization_formula_vs_measured
    ),
    tflops_table=_render_tflops_table,
    pipeline_eff_walkthrough=_render_pipeline_eff_walkthrough,
    pipeline_eff_walkthrough_largeK=_render_pipeline_eff_walkthrough_largeK,
)
def build():
    """Assemble one slide per SLIDES entry and save the deck to OUT.

    Every slide gets its title and a footer (passed the 1-based slide index
    and the total slide count).  A config that has a "render" key is drawn by
    the matching _RENDERERS entry; any other config gets its image on the
    left and its bullets on the right.  OUT's parent directory is created if
    it is missing, and the output path is printed once the file is written.
    """
    deck = Presentation()
    deck.slide_width = Inches(SLIDE_W_IN)
    deck.slide_height = Inches(SLIDE_H_IN)
    layout = deck.slide_layouts[6]

    for page_no, spec in enumerate(SLIDES, start=1):
        page = deck.slides.add_slide(layout)
        _add_title(page, spec["title"])

        if "render" not in spec:
            # Default: image on the left (up to 8.3 in wide), bullets on
            # the right.
            _add_image_centered(
                page,
                spec["image"],
                left_in=0.3,
                top_in=1.05,
                max_w_in=8.3,
                max_h_in=5.9,
            )
            _add_bullets(
                page,
                spec["bullets"],
                left_in=8.8,
                top_in=1.2,
                width_in=4.3,
                height_in=5.7,
            )
        else:
            # Shape-drawn slide (sequence diagram / pipeline structure).
            draw = _RENDERERS[spec["render"]]
            draw(page)

        _add_footer(page, page_no, len(SLIDES))

    OUT.parent.mkdir(parents=True, exist_ok=True)
    deck.save(OUT)
    print(f"wrote {OUT}")
# Script entry point: build the deck only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    build()