"""Generate a multi-slide PPTX summarizing the kernbench2 model. Slides: 1. Overall architecture — how PEs are connected (cube_mesh_view) 2. Model correctness — DMA vs P2P latency (pe2pe overview) 3. PE-to-PE IPCQ communication (ipcq_two_pe_dma) 4. 6-device allreduce — model vs theoretical vs ext-sim (overview_broken) 5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (buffer_kind_sweep) 6. PE_accelerator data path (composite GEMM pipeline structure) 7. matmul(32, 128, 32) — composite GEMM execution sequence 8. matmul(32, 128, 128) — pipeline scaling and HBM contention This is a derived-artifact generator — no production code touched. """ from __future__ import annotations from pathlib import Path from PIL import Image from pptx import Presentation from pptx.dml.color import RGBColor from pptx.enum.shapes import MSO_SHAPE from pptx.enum.text import PP_ALIGN from pptx.util import Emu, Inches, Pt ROOT = Path(__file__).resolve().parent.parent DIAG = ROOT / "docs" / "diagrams" OUT = DIAG / "kernbench2_overview.pptx" # 16:9 widescreen — 13.333 × 7.5 in SLIDE_W_IN = 13.333 SLIDE_H_IN = 7.5 SLIDES = [ { "title": "1. CUBE Architecture: NOC Router Mesh + PE Connectivity", "image": DIAG / "cube_mesh_view.png", "bullets": [ "Each CUBE holds an 8-PE NOC mesh wired through routers (R0..R7)", "Every PE has IO_CPU, M_CPU, PE_CPU + IPCQ engine + DMA engine", "Inter-cube traffic exits via UCIe/UAL ports; SIPs stitch into ring/torus/mesh", "Foundation for every latency, IPCQ, and allreduce experiment that follows", ], }, { "title": "2. Model Correctness: DMA vs P2P Latency Sweep", "image": DIAG / "pe2pe_latency_plots" / "overview.png", "bullets": [ "Sweeps payload size across PE-to-PE paths and compares to DMA", "Confirms the simulator reproduces the expected DMA/P2P crossover", "Acts as the per-hop ground truth that feeds collective-level models", ], }, { "title": "3. IPCQ: How Two PEs Communicate (DMA + Slot Memory)", "image": DIAG / "ipcq_diagram_plots" / "ipcq_two_pe_dma.png", "bullets": [ "Sender pushes payload through PE_DMA → fabric → receiver IPCQ slot", "Slot memory (TCM/SRAM/HBM) charges a write on arrival, a read on consume", "Credit return rides the fabric path back (16 B packet, no slot-IO)", "This is the building block the multi-device allreduce composes", ], }, { "title": "4. 6-Device Allreduce: Model vs Theoretical vs External Simulator", "image": DIAG / "allreduce_latency_plots" / "overview_broken.png", "bullets": [ "Three SIP topologies (ring / torus / mesh) swept 16 B → 96 KB per PE", "Dashed red curve: hand-derived theoretical model for torus_2d (6 SIPs)", "Top panel (broken y-axis): single-device reduce on ext-sim ≈ 366 µs", "Our 6-device collective lands at ~17–22 µs — ~17× faster than ext-sim baseline", ], }, { "title": "5. IPCQ Slot Memory: TCM vs SRAM vs HBM", "image": DIAG / "allreduce_latency_plots" / "buffer_kind_sweep.png", "bullets": [ "Same allreduce with slot memory swapped: TCM (per-PE local) / SRAM / HBM (cube-shared, behind router link)", "Cost = NoC drain + slot-IO + PE↔bank hop; only TCM skips the bank hop", "Topology link BWs set the order: SRAM bank link 128 GB/s is the narrowest in the system, HBM 256 GB/s", "At 64 KB / PE: TCM 12.0 µs < HBM 21.4 µs < SRAM 24.3 µs — SRAM is slowest because of its narrow bank link", ], }, { "title": "6. PE_accelerator Data Path: Composite GEMM Pipeline", "render": "pipeline_structure", }, { "title": "7. PE_SCHEDULER: Plan Generation & Tile Dispatch", "render": "scheduler", }, { "title": "8. matmul(32, 128, 32) — Composite GEMM Execution Sequence", "render": "sequence_32x128x32", }, { "title": "9. matmul(32, 128, 128) — Pipeline Scaling & HBM Contention", "render": "sequence_32x128x128", }, { "title": "10. Tiling Walkthrough: 32×128×32 — K-loop Only, No Inter-(m,n) Flush", "render": "tiling_32x128x32", }, { "title": "11. Tiling Walkthrough: 32×128×128 — K-loop & Inter-(m,n) Flushes", "render": "tiling_32x128x128", }, { "title": "12. GEMM Sweep — Stage Wall-Clock (load_ref)", "render": "stage_breakdown_load_ref", }, { "title": "13. Why DMA Isn't Local: Cube-Shared HBM Path", "render": "hbm_topology", }, { "title": "14. Formula vs Measured Useful Eff (load_ref) — sim validates model", "render": "mac_utilization_formula_vs_measured", }, { "title": "15. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)", "render": "pipeline_eff_walkthrough", }, { "title": "16. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)", "render": "pipeline_eff_walkthrough_largeK", }, { "title": "17. Useful Pipelined Efficiency (ideal pipeline × GEMM util)", "render": "tflops_table", }, ] # ── Palette for the shape-drawn slides ───────────────────────────────────── COL_TEXT_DARK = RGBColor(0x1E, 0x29, 0x3B) COL_TEXT_LIGHT = RGBColor(0xFF, 0xFF, 0xFF) COL_MUTED = RGBColor(0x47, 0x55, 0x69) COL_RED = RGBColor(0xDC, 0x26, 0x26) COL_STORAGE = RGBColor(0xE2, 0xE8, 0xF0) COL_STORAGE_STROKE = RGBColor(0x47, 0x55, 0x69) COL_DMA = RGBColor(0x3B, 0x82, 0xF6) # blue COL_FS = RGBColor(0x10, 0xB9, 0x81) # emerald COL_GEMM = RGBColor(0xF5, 0x9E, 0x0B) # amber COL_HBM_BG = RGBColor(0xDB, 0xEA, 0xFE) COL_TCM_BG = RGBColor(0xD1, 0xFA, 0xE5) COL_REG_BG = RGBColor(0xFE, 0xF3, 0xC7) COL_HBM_BORDER = COL_DMA COL_TCM_BORDER = COL_FS COL_REG_BORDER = RGBColor(0xD9, 0x77, 0x06) def _add_title(slide, text): left = Inches(0.4) top = Inches(0.25) width = Inches(SLIDE_W_IN - 0.8) height = Inches(0.7) box = slide.shapes.add_textbox(left, top, width, height) tf = box.text_frame tf.margin_left = tf.margin_right = Emu(0) tf.margin_top = tf.margin_bottom = Emu(0) p = tf.paragraphs[0] run = p.add_run() run.text = text run.font.size = Pt(26) run.font.bold = True run.font.color.rgb = RGBColor(0x10, 0x2A, 0x55) return box def _add_image_centered(slide, img_path, *, left_in, top_in, max_w_in, max_h_in): with Image.open(img_path) as im: iw, ih = im.size max_w_emu = Inches(max_w_in) max_h_emu = Inches(max_h_in) scale = min(max_w_emu / iw, max_h_emu / ih) w = int(iw * scale) h = int(ih * scale) left = Inches(left_in) + (max_w_emu - w) // 2 top = Inches(top_in) + (max_h_emu - h) // 2 slide.shapes.add_picture(str(img_path), left, top, width=w, height=h) def _add_bullets(slide, bullets, *, left_in, top_in, width_in, height_in): box = slide.shapes.add_textbox( Inches(left_in), Inches(top_in), Inches(width_in), Inches(height_in), ) tf = box.text_frame tf.word_wrap = True for i, line in enumerate(bullets): p = tf.paragraphs[0] if i == 0 else tf.add_paragraph() p.level = 0 run = p.add_run() run.text = "• " + line run.font.size = Pt(15) run.font.color.rgb = RGBColor(0x22, 0x22, 0x22) p.space_after = Pt(6) def _add_footer(slide, idx, total): box = slide.shapes.add_textbox( Inches(SLIDE_W_IN - 1.2), Inches(SLIDE_H_IN - 0.45), Inches(1.0), Inches(0.3), ) p = box.text_frame.paragraphs[0] run = p.add_run() run.text = f"{idx} / {total}" run.font.size = Pt(10) run.font.color.rgb = RGBColor(0x88, 0x88, 0x88) # ── Shape-drawing primitives for the diagram slides ──────────────────────── def _txt(box, text, size=11, bold=False, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER): tf = box.text_frame tf.word_wrap = True tf.margin_left = Emu(18000) tf.margin_right = Emu(18000) tf.margin_top = Emu(9000) tf.margin_bottom = Emu(9000) lines = text.split("\n") for i, ln in enumerate(lines): p = tf.paragraphs[0] if i == 0 else tf.add_paragraph() p.alignment = align p.text = "" run = p.add_run() run.text = ln run.font.size = Pt(size) run.font.bold = bold run.font.name = "Consolas" run.font.color.rgb = color def _textbox(slide, x, y, w, h, text, **kw): tb = slide.shapes.add_textbox(Inches(x), Inches(y), Inches(w), Inches(h)) _txt(tb, text, **kw) return tb def _rrect(slide, x, y, w, h, fill, stroke, text="", **kw): s = slide.shapes.add_shape( MSO_SHAPE.ROUNDED_RECTANGLE, Inches(x), Inches(y), Inches(w), Inches(h), ) s.fill.solid() s.fill.fore_color.rgb = fill s.line.color.rgb = stroke s.line.width = Pt(1.3) if text: _txt(s, text, **kw) return s def _rect_band(slide, x, y, w, h, fill, stroke): s = slide.shapes.add_shape( MSO_SHAPE.RECTANGLE, Inches(x), Inches(y), Inches(w), Inches(h), ) s.fill.solid() s.fill.fore_color.rgb = fill s.line.color.rgb = stroke s.line.width = Pt(1.0) return s def _arrow(slide, x1, y1, x2, y2, color=COL_MUTED, width_pt=1.5): conn = slide.shapes.add_connector( 2, Inches(x1), Inches(y1), Inches(x2), Inches(y2), ) conn.line.color.rgb = color conn.line.width = Pt(width_pt) from pptx.oxml.ns import qn from lxml import etree ln = conn.line._get_or_add_ln() tail = ln.find(qn("a:tailEnd")) if tail is None: tail = etree.SubElement(ln, qn("a:tailEnd")) tail.set("type", "triangle") tail.set("w", "med") tail.set("len", "med") return conn def _vline_dashed(slide, x, y1, y2, color, width_pt=2.0): conn = slide.shapes.add_connector( 1, Inches(x), Inches(y1), Inches(x), Inches(y2), ) conn.line.color.rgb = color conn.line.width = Pt(width_pt) from pptx.oxml.ns import qn from lxml import etree ln = conn.line._get_or_add_ln() pr = ln.find(qn("a:prstDash")) if pr is None: pr = etree.SubElement(ln, qn("a:prstDash")) pr.set("val", "dash") return conn # ── Slide 6: Pipeline structure ──────────────────────────────────────────── def _render_pipeline_structure(slide): """Vertical hardware datapath. HBM at top → DMA_in → GEMM Unit (FETCH + RegFile inside) → DMA_out (off-page to HBM, not drawn). TCM is on the right, vertical, acting as the staging buffer that DMA_in writes into, FETCH reads from, and that the GEMM unit STOREs back into before DMA_out drains it. """ READ_COLOR = COL_DMA WRITE_COLOR = RGBColor(0xEA, 0x58, 0x0C) _textbox(slide, 0.4, 1.0, 12.6, 0.5, "Vertical flow. HBM → DMA_in → TCM → GEMM Unit (FETCH + " "RegFile) → TCM → DMA_out → HBM. TCM sits on the side as the " "staging buffer between the DMA engines and the GEMM unit.", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) # ── Center column: HBM → DMA_in → GEMM Unit → DMA_out ──────────── col_cx = 3.5 box_w = 3.0 box_x = col_cx - box_w / 2 box_h = 0.50 y_hbm = 1.65 y_dma_in = 2.55 y_gemm = 3.45 gemm_h = 2.40 y_dma_out = y_gemm + gemm_h + 0.30 out_arr_end_y = y_dma_out + box_h + 0.40 # HBM _rrect(slide, box_x, y_hbm, box_w, box_h, COL_HBM_BG, COL_HBM_BORDER, "HBM (off-chip, 256 GB/s)", size=11, bold=True, color=COL_HBM_BORDER) # DMA_in _rrect(slide, box_x, y_dma_in, box_w, box_h, COL_DMA, COL_TEXT_DARK, "DMA_in", size=13, bold=True, color=COL_TEXT_LIGHT) # GEMM Unit container (slightly wider than DMA boxes) gemm_x = box_x - 0.7 gemm_w = box_w + 1.4 _rrect(slide, gemm_x, y_gemm, gemm_w, gemm_h, RGBColor(0xFF, 0xFB, 0xEB), COL_GEMM, "", size=10, color=COL_GEMM) _textbox(slide, gemm_x + 0.20, y_gemm + 0.08, gemm_w - 0.4, 0.32, "GEMM Unit", size=14, bold=True, color=COL_GEMM, align=PP_ALIGN.LEFT) # Inside GEMM Unit: FETCH (top), RegFile (below), MAC label (bottom) sub_h = 0.50 inner_x = gemm_x + 0.5 inner_w = gemm_w - 1.0 fetch_y = y_gemm + 0.55 reg_y = fetch_y + sub_h + 0.30 _rrect(slide, inner_x, fetch_y, inner_w, sub_h, COL_FS, COL_TEXT_DARK, "FETCH unit", size=11, bold=True, color=COL_TEXT_LIGHT) _rrect(slide, inner_x, reg_y, inner_w, sub_h, COL_REG_BG, COL_REG_BORDER, "RegFile (A, B, C accumulator)", size=11, bold=True, color=COL_REG_BORDER) # MAC annotation (text-only) — implicit MAC operation on RegFile mac_label_y = reg_y + sub_h + 0.05 _textbox(slide, inner_x, mac_label_y, inner_w, 0.32, "↻ MAC accumulate (32 × 64 × 32 array)", size=11, bold=True, color=COL_GEMM, align=PP_ALIGN.CENTER) # FETCH → RegFile (internal, vertical down) inner_cx = inner_x + inner_w / 2 _arrow(slide, inner_cx, fetch_y + sub_h + 0.02, inner_cx, reg_y - 0.02, color=COL_FS, width_pt=1.8) # DMA_out (below GEMM Unit) _rrect(slide, box_x, y_dma_out, box_w, box_h, WRITE_COLOR, COL_TEXT_DARK, "DMA_out", size=13, bold=True, color=COL_TEXT_LIGHT) # ── TCM on the right (tall vertical) ───────────────────────────── tcm_x = 9.4 tcm_w = 2.6 tcm_y = y_dma_in - 0.05 tcm_h = (y_dma_out + box_h + 0.05) - tcm_y _rrect(slide, tcm_x, tcm_y, tcm_w, tcm_h, COL_TCM_BG, COL_TCM_BORDER, "", size=12, color=COL_TCM_BORDER) _textbox(slide, tcm_x + 0.1, tcm_y + tcm_h / 2 - 0.6, tcm_w - 0.2, 1.2, "TCM\n\n(PE-local SRAM,\n512 GB/s)", size=14, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.CENTER) # ── Wires ─────────────────────────────────────────────────────── # HBM → DMA_in (vertical down) _arrow(slide, col_cx, y_hbm + box_h + 0.02, col_cx, y_dma_in - 0.02, color=READ_COLOR, width_pt=2.5) _textbox(slide, col_cx + 0.15, (y_hbm + box_h + y_dma_in) / 2 - 0.10, 1.5, 0.22, "DMA_R", size=10, bold=True, color=READ_COLOR, align=PP_ALIGN.LEFT) # DMA_in → TCM (horizontal right, at DMA_in y) dma_in_cy = y_dma_in + box_h / 2 _arrow(slide, box_x + box_w + 0.02, dma_in_cy, tcm_x - 0.02, dma_in_cy, color=READ_COLOR, width_pt=2.5) mid_x = (box_x + box_w + tcm_x) / 2 _textbox(slide, mid_x - 1.4, dma_in_cy - 0.32, 2.8, 0.22, "store to TCM", size=10, bold=True, color=READ_COLOR, align=PP_ALIGN.CENTER) # TCM → FETCH (horizontal left, into FETCH inside GEMM Unit) fetch_right = inner_x + inner_w fetch_cy = fetch_y + sub_h / 2 _arrow(slide, tcm_x - 0.02, fetch_cy, fetch_right + 0.02, fetch_cy, color=COL_FS, width_pt=2.5) _textbox(slide, (fetch_right + tcm_x) / 2 - 1.4, fetch_cy - 0.32, 2.8, 0.22, "FETCH (TCM → Reg)", size=10, bold=True, color=COL_FS, align=PP_ALIGN.CENTER) # GEMM (RegFile) → TCM (STORE, horizontal right exit at RegFile y) reg_right = inner_x + inner_w store_cy = reg_y + sub_h / 2 _arrow(slide, reg_right + 0.02, store_cy, tcm_x - 0.02, store_cy, color=WRITE_COLOR, width_pt=2.5) _textbox(slide, (reg_right + tcm_x) / 2 - 1.6, store_cy + 0.08, 3.2, 0.22, "STORE (Reg → TCM, after last K)", size=10, bold=True, color=WRITE_COLOR, align=PP_ALIGN.CENTER) # TCM → DMA_out (horizontal left at DMA_out y) dma_out_cy = y_dma_out + box_h / 2 _arrow(slide, tcm_x - 0.02, dma_out_cy, box_x + box_w + 0.02, dma_out_cy, color=WRITE_COLOR, width_pt=2.5) _textbox(slide, mid_x - 1.4, dma_out_cy - 0.32, 2.8, 0.22, "read from TCM", size=10, bold=True, color=WRITE_COLOR, align=PP_ALIGN.CENTER) # DMA_out → HBM (off-page; just an arrow + label, no HBM box at bottom) _arrow(slide, col_cx, y_dma_out + box_h + 0.02, col_cx, out_arr_end_y - 0.02, color=WRITE_COLOR, width_pt=2.5) _textbox(slide, col_cx + 0.15, out_arr_end_y - 0.30, 3.0, 0.22, "DMA_W → HBM", size=10, bold=True, color=WRITE_COLOR, align=PP_ALIGN.LEFT) # ── Slide 7: PE_SCHEDULER ────────────────────────────────────────────────── def _render_scheduler(slide): """PE_SCHEDULER: sole command dispatcher inside a PE. Two paths: simple cmd → direct engine dispatch; CompositeCmd → generate_plan + FIFO feed → TileToken self-routes through stages. """ SCHED_FILL = RGBColor(0xF3, 0xE8, 0xFF) SCHED_BORDER = RGBColor(0x7C, 0x3A, 0xED) PE_MATH_COL = RGBColor(0x9C, 0xA3, 0xAF) _textbox(slide, 0.4, 1.0, 12.6, 0.5, "Sole dispatcher inside a PE. CompositeCmd is expanded into a " "TilePlan and fed tile-by-tile in FIFO order; each TileToken " "self-routes through the pipeline stages.", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) # SCHEDULER container (compact — left side of slide) sched_x, sched_y = 0.6, 2.45 sched_w, sched_h = 4.6, 2.4 # PE_CPU box (centered above scheduler) pe_cpu_w, pe_cpu_h = 2.0, 0.50 pe_cpu_x = sched_x + sched_w / 2 - pe_cpu_w / 2 pe_cpu_y = 1.65 _rrect(slide, pe_cpu_x, pe_cpu_y, pe_cpu_w, pe_cpu_h, COL_STORAGE, COL_STORAGE_STROKE, "PE_CPU", size=12, bold=True, color=COL_TEXT_DARK) _rrect(slide, sched_x, sched_y, sched_w, sched_h, SCHED_FILL, SCHED_BORDER, "", size=10, color=SCHED_BORDER) _textbox(slide, sched_x + 0.15, sched_y + 0.08, sched_w - 0.30, 0.32, "PE_SCHEDULER", size=13, bold=True, color=SCHED_BORDER, align=PP_ALIGN.LEFT) # CompositeCmd description — text only, no inner box text_x = sched_x + 0.25 text_y = sched_y + 0.55 text_w = sched_w - 0.50 _textbox(slide, text_x, text_y, text_w, 0.30, "CompositeCmd → generate plan", size=12, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) _textbox(slide, text_x + 0.10, text_y + 0.40, text_w - 0.15, sched_h - 1.05, "generate_plan(M, K, N)\n" " → ⌈M/32⌉ × ⌈K/64⌉ × ⌈N/32⌉ tiles\n" " each tile:\n" " [DMA_R, FETCH, GEMM,\n" " STORE, DMA_W] stages", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) # Engines (compact, right of scheduler — still on the LEFT half of slide) engines_x = sched_x + sched_w + 0.40 engines_w = 2.4 engines_y = sched_y engine_h = 0.45 engine_gap = 0.18 engines = [ ("pe_dma", COL_DMA, COL_TEXT_LIGHT), ("pe_fetch_store", COL_FS, COL_TEXT_LIGHT), ("pe_gemm", COL_GEMM, COL_TEXT_DARK), ("pe_math", PE_MATH_COL, COL_TEXT_LIGHT), ] for i, (name, fill, tcol) in enumerate(engines): y = engines_y + i * (engine_h + engine_gap) _rrect(slide, engines_x, y, engines_w, engine_h, fill, COL_TEXT_DARK, name, size=12, bold=True, color=tcol) # TileToken / PipelineContext annotation under the engine stack last_y = engines_y + len(engines) * (engine_h + engine_gap) - engine_gap note_y = last_y + 0.15 _textbox(slide, engines_x, note_y, engines_w, 0.28, "↻ TileToken.advance()", size=10, bold=True, color=SCHED_BORDER, align=PP_ALIGN.CENTER) _textbox(slide, engines_x, note_y + 0.28, engines_w, 0.28, "PipelineContext counts tiles", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER) # PE_CPU → SCHEDULER arrow _arrow(slide, pe_cpu_x + pe_cpu_w / 2, pe_cpu_y + pe_cpu_h + 0.02, pe_cpu_x + pe_cpu_w / 2, sched_y - 0.02, color=COL_TEXT_DARK, width_pt=2.5) _textbox(slide, pe_cpu_x + pe_cpu_w + 0.10, (pe_cpu_y + pe_cpu_h + sched_y) / 2 - 0.12, 3.0, 0.22, "PeInternalTxn(cmd)", size=11, bold=True, color=COL_MUTED, align=PP_ALIGN.LEFT) # SCHEDULER → engines arrows (one per engine, colour-coded) sched_right = sched_x + sched_w sched_cy = sched_y + sched_h / 2 for i, (name, fill, _) in enumerate(engines): engine_cy = engines_y + i * (engine_h + engine_gap) + engine_h / 2 _arrow(slide, sched_right + 0.02, sched_cy, engines_x - 0.02, engine_cy, color=fill, width_pt=1.5) # Bottom note (full width) _textbox(slide, 0.4, 6.10, 12.6, 0.85, "Key invariants: (1) FIFO across commands via the single " "feeder process — no inter-command tile interleaving. " "(2) TileToken carries its own plan; each engine reads " "token.current_stage, advances stage_idx, and forwards to " "the next stage's component. (3) PipelineContext." "complete_tile() fires done_event on the last tile, " "unblocking PE_CPU.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) # ── Slides 8 & 9: Sequence diagrams ──────────────────────────────────────── # Op kinds and their visual placement. _KIND_INFO = { "dma_read": {"fill": COL_DMA, "band": "between_hbm_tcm", "dir": "down"}, "fetch": {"fill": COL_FS, "band": "between_tcm_reg", "dir": "down"}, "gemm": {"fill": COL_GEMM, "band": "regfile", "dir": None}, "store": {"fill": COL_FS, "band": "between_tcm_reg", "dir": "up"}, "dma_write": {"fill": COL_DMA, "band": "between_hbm_tcm", "dir": "up"}, } def _draw_sequence_panel(slide, ops, *, x_left=0.6, x_right=12.9, y_top=1.7, panel_h=4.4, tile_boundary_after=None, tile_labels=None, contention_after=None): """Generic sequence-diagram drawer. ops: list of (step, name, kind, bytes_str, ns_str) tile_boundary_after: index after which to draw a red dashed divider tile_labels: list[str] showing on either side of the boundary contention_after: index after which to draw an HBM-contention callout """ n_ops = len(ops) band_x = x_left band_w = x_right - x_left band_h = 0.55 hbm_y = y_top tcm_y = y_top + panel_h * 0.5 - band_h / 2 reg_y = y_top + panel_h - band_h # Memory bands _rect_band(slide, band_x, hbm_y, band_w, band_h, COL_HBM_BG, COL_HBM_BORDER) _rect_band(slide, band_x, tcm_y, band_w, band_h, COL_TCM_BG, COL_TCM_BORDER) _rect_band(slide, band_x, reg_y, band_w, band_h, COL_REG_BG, COL_REG_BORDER) _textbox(slide, band_x + 0.05, hbm_y, 0.8, band_h, "HBM", size=12, bold=True, color=COL_HBM_BORDER, align=PP_ALIGN.LEFT) _textbox(slide, band_x + 0.05, tcm_y, 0.8, band_h, "TCM", size=12, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.LEFT) _textbox(slide, band_x + 0.05, reg_y, 0.85, band_h, "RegFile", size=12, bold=True, color=COL_REG_BORDER, align=PP_ALIGN.LEFT) # Op columns span from after the level labels (~0.9 in margin) to right edge ops_left = band_x + 1.0 ops_w = band_w - 1.1 col_w = ops_w / max(n_ops, 1) op_box_w = col_w * 0.86 op_box_h = 0.70 hbm_bot = hbm_y + band_h tcm_top = tcm_y tcm_bot = tcm_y + band_h reg_top = reg_y gap_ht = (hbm_bot + tcm_top) / 2 gap_tr = (tcm_bot + reg_top) / 2 for idx, (step, name, kind, byt, ns) in enumerate(ops): info = _KIND_INFO[kind] x_center = ops_left + idx * col_w + col_w / 2 x = x_center - op_box_w / 2 # Step number _textbox(slide, x, hbm_y - 0.35, op_box_w, 0.25, f"#{step}", size=9, bold=True, color=COL_TEXT_DARK) if info["band"] == "between_hbm_tcm": y_box = gap_ht - op_box_h / 2 elif info["band"] == "between_tcm_reg": y_box = gap_tr - op_box_h / 2 else: y_box = reg_y + band_h / 2 - op_box_h / 2 text_color = COL_TEXT_LIGHT if kind != "gemm" else COL_TEXT_DARK label = f"{name}\n{byt} {ns}" _rrect(slide, x, y_box, op_box_w, op_box_h, info["fill"], COL_TEXT_DARK, label, size=8, bold=True, color=text_color) # Arrows if info["dir"] == "down": src = hbm_bot if info["band"] == "between_hbm_tcm" else tcm_bot dst = tcm_top if info["band"] == "between_hbm_tcm" else reg_top _arrow(slide, x_center, src, x_center, y_box, color=info["fill"], width_pt=1.6) _arrow(slide, x_center, y_box + op_box_h, x_center, dst, color=info["fill"], width_pt=1.6) elif info["dir"] == "up": src = reg_top if info["band"] == "between_tcm_reg" else tcm_top dst = tcm_bot if info["band"] == "between_tcm_reg" else hbm_bot _arrow(slide, x_center, src, x_center, y_box + op_box_h, color=info["fill"], width_pt=1.6) _arrow(slide, x_center, y_box, x_center, dst, color=info["fill"], width_pt=1.6) # Tile boundary if tile_boundary_after is not None: bx = ops_left + tile_boundary_after * col_w _vline_dashed(slide, bx, hbm_y - 0.05, reg_y + band_h + 0.05, COL_RED, width_pt=2.0) if tile_labels: _textbox(slide, bx - 1.7, hbm_y - 0.65, 1.6, 0.3, tile_labels[0], size=10, bold=True, color=COL_RED, align=PP_ALIGN.RIGHT) _textbox(slide, bx + 0.1, hbm_y - 0.65, 1.9, 0.3, tile_labels[1], size=10, bold=True, color=COL_RED, align=PP_ALIGN.LEFT) # HBM contention callout if contention_after is not None: cx = ops_left + contention_after * col_w cy = hbm_y + band_h + 0.05 callout_w = 2.6 callout_h = 0.7 callout_x = max(band_x + 0.5, cx - callout_w / 2) callout_x = min(callout_x, band_x + band_w - callout_w - 0.5) _rrect(slide, callout_x, cy + 0.05, callout_w, callout_h, RGBColor(0xFE, 0xE2, 0xE2), COL_RED, "HBM contention:\nDMA_WRITE out + next DMA_READ A,B\ncompete for HBM BW", size=8, bold=True, color=COL_RED) _arrow(slide, callout_x + callout_w / 2, cy + 0.05, cx, hbm_y + band_h * 0.5, color=COL_RED, width_pt=1.8) # Engine legend (below the panel) legend_y = reg_y + band_h + 0.6 _rect_band(slide, band_x + 0.1, legend_y, 0.25, 0.22, COL_DMA, COL_TEXT_DARK) _textbox(slide, band_x + 0.4, legend_y - 0.05, 2.4, 0.35, "pe_dma (HBM↔TCM)", size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) _rect_band(slide, band_x + 3.0, legend_y, 0.25, 0.22, COL_FS, COL_TEXT_DARK) _textbox(slide, band_x + 3.3, legend_y - 0.05, 3.0, 0.35, "pe_fetch_store (TCM↔RegFile)", size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) _rect_band(slide, band_x + 6.6, legend_y, 0.25, 0.22, COL_GEMM, COL_TEXT_DARK) _textbox(slide, band_x + 6.9, legend_y - 0.05, 2.6, 0.35, "pe_gemm (MAC compute)", size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) def _draw_composite_setup_block(slide): """Draw the CompositeCmd setup block before the first op column. Represents the scheduler-side delay between PE_CPU issuing the CompositeCmd and the first DMA_R actually firing: plan generation, FIFO feeder enqueue, and the per-cmd scheduler overhead. """ SCHED_COLOR = RGBColor(0x7C, 0x3A, 0xED) SCHED_FILL = RGBColor(0xF3, 0xE8, 0xFF) sx, sy = 0.65, 2.45 sw, sh = 0.90, 1.05 _rrect(slide, sx, sy, sw, sh, SCHED_FILL, SCHED_COLOR, "PLAN-GEN\n+ FEED\n(setup)", size=8, bold=True, color=SCHED_COLOR) _textbox(slide, sx, sy + sh + 0.05, sw, 0.22, "scheduler", size=8, color=SCHED_COLOR, align=PP_ALIGN.CENTER) # Arrow into step 1's column (ops_left = band_x + 1.0 = 1.6 by default) _arrow(slide, sx + sw + 0.02, sy + sh / 2, 1.62, sy + sh / 2, color=SCHED_COLOR, width_pt=1.5) def _render_sequence_32x128x32(slide): _textbox(slide, 0.4, 1.05, 12.6, 0.65, "load_ref assumption: A (32×128) is pre-loaded into TCM via " "tl.load before the kernel starts; only B is DMA_R'd per " "tile. FETCH can start as soon as the first DMA_R(B) finishes " "— A is already in TCM. Scheduler tile = 32×64×32 → 1·1·2 = " "2 tiles. The PLAN-GEN/FEED block is the scheduler-side " "setup delay before the first DMA fires.", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) # load_ref: no DMA_R for A; only B per tile. ops = [ (1, "DMA_R B (tile 0)", "dma_read", "4 KB", "~16 ns"), (2, "FETCH (tile 0)", "fetch", "8 KB", "~16 ns"), (3, "DMA_R B (tile 1)", "dma_read", "4 KB", "~16 ns"), (4, "GEMM K=0 (accum)", "gemm", "—", "~17 ns"), (5, "FETCH (tile 1)", "fetch", "8 KB", "~16 ns"), (6, "GEMM K=1 (last)", "gemm", "—", "~17 ns"), (7, "STORE final", "store", "2 KB", "~4 ns"), (8, "DMA_W out", "dma_write", "2 KB", "~8 ns"), ] _draw_sequence_panel(slide, ops) _draw_composite_setup_block(slide) # "A pinned in TCM" annotation overlaid on the TCM band, left side _textbox(slide, 0.65, 3.92, 1.6, 0.22, "[ A pinned via tl.load ]", size=9, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.LEFT) _textbox(slide, 0.4, 6.55, 12.6, 0.45, "Pipeline is balanced (DMA, FETCH, GEMM all ~16 ns/tile) — " "DMA engine carries only B, so per-tile DMA cost halves vs " "ref_ref. Wall = setup + head_latency + N_tiles·T_stage + " "final STORE+DMA_W.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) def _render_sequence_32x128x128(slide): _textbox(slide, 0.4, 1.05, 12.6, 0.70, "load_ref assumption: A (32×128) is pre-loaded into TCM via " "tl.load before the kernel starts; only B is DMA_R'd per " "tile. FETCH starts as soon as the corresponding B arrives — " "A is already in TCM. Scheduler tile = 32×64×32 → 1·4·2 = 8 " "tiles. PLAN-GEN/FEED block = scheduler-side setup delay " "before the first DMA.", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) # load_ref: only B per tile, A pre-loaded. DMA_R scattered across timeline. ops = [ (1, "DMA_R B (0,0,0)", "dma_read", "4 KB", "~16 ns"), (2, "FETCH (0,0,0)", "fetch", "8 KB", "~16 ns"), (3, "DMA_R B (0,0,1)", "dma_read", "4 KB", "~16 ns"), (4, "GEMM K=0", "gemm", "—", "~17 ns"), (5, "FETCH (0,0,1)", "fetch", "8 KB", "~16 ns"), (6, "DMA_R B (0,1,0)", "dma_read", "4 KB", "~16 ns"), (7, "GEMM K=1 last", "gemm", "—", "~17 ns"), (8, "STORE out₀₀", "store", "2 KB", "~4 ns"), (9, "DMA_W out₀₀", "dma_write", "2 KB", "~8 ns"), (10, "FETCH (0,1,0)", "fetch", "8 KB", "~16 ns"), (11, "DMA_R B (0,1,1)", "dma_read", "4 KB", "~16 ns"), (12, "GEMM K=0 (0,1)", "gemm", "—", "~17 ns"), ] _draw_sequence_panel( slide, ops, tile_boundary_after=9, tile_labels=["── (m,n)=(0,0) full execution ──", "── (0,1) starts ──"], contention_after=9, ) _draw_composite_setup_block(slide) # "A pinned in TCM" annotation overlaid on the TCM band _textbox(slide, 0.65, 3.92, 1.6, 0.22, "[ A pinned via tl.load ]", size=9, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.LEFT) _textbox(slide, 0.4, 6.55, 12.6, 0.4, "HBM half-duplex caveat: real HBM channels can't read and write " "simultaneously. DMA_W out₀₀ competes for HBM bandwidth with " "DMA_R(B) of (0,1). Simulator currently models PE_DMA read / " "write as separate resources (full-duplex) — flag for revisit " "if half-duplex matters.", size=10, color=COL_RED, align=PP_ALIGN.LEFT) _textbox(slide, 0.4, 6.95, 12.6, 0.3, "Pattern repeats for (m,n) = (0,1), (0,2), (0,3). DMA engine " "only carries B per tile, so the pipeline stays balanced — " "no DMA bottleneck like ref_ref.", size=10, color=COL_MUTED, align=PP_ALIGN.LEFT) # ── Slides 10 & 11: Tiling walkthroughs ──────────────────────────────────── def _draw_matrix_tiles(slide, *, x, y, n_rows, n_cols, tile_w, tile_h, fill, border, label_prefix, label_fmt=None, title=None, title_size=11): """Draw a matrix as a grid of tiles. Returns (right_x, bottom_y). label_fmt(r, c) returns the per-tile label; if None defaults to f"{label_prefix}{r}{c}" for 2-d or f"{label_prefix}{c}" for 1-row. """ if title is not None: _textbox(slide, x, y - 0.27, n_cols * tile_w, 0.22, title, size=title_size, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) for r in range(n_rows): for c in range(n_cols): if label_fmt is not None: label = label_fmt(r, c) elif n_rows == 1: label = f"{label_prefix}{c}" else: label = f"{label_prefix}{r}{c}" _rrect(slide, x + c * tile_w, y + r * tile_h, tile_w, tile_h, fill, border, label, size=10 if n_rows == 1 else 9, bold=True, color=border) return x + n_cols * tile_w, y + n_rows * tile_h def _render_tiling_32x128x32(slide): """32×128×32 — K-loop only, single (m,n) → NO inter-(m,n) flush. Visualises why a tall-thin K shape is friendly: the accumulator stays in RegFile across the entire K loop, and STORE + DMA_W fire ONCE at the very end. No inter-pair flush serialises HBM bandwidth. """ _textbox(slide, 0.4, 1.0, 12.6, 0.55, "Scheduler tile = 32×64×32 → 1·2·1 = 2 tiles. Only ONE (m,n) " "output → the K-loop accumulates entirely in RegFile, and " "STORE + DMA_W fire just once at the very end. No inter-(m,n) " "flush — DMA never has to drain mid-compute.", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) # ── Matrix formula C = A · B (centered on slide) ───────────────── tile_w = 0.60 tile_h = 0.60 matrix_top = 2.10 # C (1 × 1) — single output tile c_x = 1.6 c_y = matrix_top _draw_matrix_tiles(slide, x=c_x, y=c_y, n_rows=1, n_cols=1, tile_w=tile_w, tile_h=tile_h, fill=RGBColor(0xFE, 0xF3, 0xC7), border=COL_REG_BORDER, label_prefix="C", title="C (32 × 32)") # "=" _textbox(slide, c_x + tile_w + 0.10, c_y - 0.05, 0.40, 0.70, "=", size=26, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) # A (1 × 2) — 2 K-tiles a_x = c_x + tile_w + 0.60 a_y = matrix_top _draw_matrix_tiles(slide, x=a_x, y=a_y, n_rows=1, n_cols=2, tile_w=tile_w, tile_h=tile_h, fill=RGBColor(0xDB, 0xEA, 0xFE), border=COL_HBM_BORDER, label_prefix="A", title="A (32 × 128)") _textbox(slide, a_x, a_y + tile_h + 0.05, 2 * tile_w, 0.22, "← K (2 tiles) →", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER) # "·" _textbox(slide, a_x + 2 * tile_w + 0.10, c_y - 0.10, 0.30, 0.70, "·", size=28, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) # B (2 × 1) — 2 K-tiles, 1 N-tile b_x = a_x + 2 * tile_w + 0.55 b_y = matrix_top - tile_h / 2 # center on row _draw_matrix_tiles(slide, x=b_x, y=b_y, n_rows=2, n_cols=1, tile_w=tile_w, tile_h=tile_h, fill=RGBColor(0xDB, 0xEA, 0xFE), border=COL_HBM_BORDER, label_prefix="B", title="B (128 × 32)", label_fmt=lambda r, c: f"B{r}") _textbox(slide, b_x - 0.45, b_y + tile_h - 0.10, 0.40, 0.30, "K\n↓", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER) # Formula _textbox(slide, 0.4, matrix_top + 2 * tile_h + 0.30, 12.6, 0.40, "C = A0·B0 + A1·B1 (K-loop, 2 iterations — " "accumulator stays in RegFile)", size=13, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) # ── Timeline: single (m,n) pair, NO flush ─────────────────────── tl_y = matrix_top + 2 * tile_h + 1.10 _textbox(slide, 0.4, tl_y - 0.30, 12.6, 0.22, "Execution timeline — 1 (m,n) output, 0 inter-(m,n) flushes:", size=12, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) pair_w = 4.50 pair_h = 0.85 pair_x = (SLIDE_W_IN - pair_w) / 2 k0_w = pair_w * 0.45 k1_w = pair_w * 0.53 _rrect(slide, pair_x, tl_y, k0_w, pair_h, RGBColor(0xD1, 0xFA, 0xE5), COL_TCM_BORDER, "K=0 (accumulate into RegFile)", size=11, bold=True, color=COL_TCM_BORDER) _rrect(slide, pair_x + k0_w + 0.02, tl_y, k1_w, pair_h, RGBColor(0xFE, 0xF3, 0xC7), COL_REG_BORDER, "K=1 last\nSTORE + DMA_W (final drain)", size=11, bold=True, color=COL_REG_BORDER) _textbox(slide, pair_x, tl_y + pair_h + 0.05, pair_w, 0.25, "(m,n)=(0,0) → C (single output tile)", size=11, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) # "no flush" callout to the right _textbox(slide, pair_x + pair_w + 0.20, tl_y + 0.15, 2.5, 0.55, "✓ No inter-(m,n) flush\n (only 1 output pair)", size=11, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.LEFT) # ── Bottom note ───────────────────────────────────────────────── _textbox(slide, 0.4, 6.30, 12.6, 0.70, "Why growing K helps: each (m,n) pair amortises its single " "STORE+DMA_W over K_tiles iterations of pure compute. With " "N=32 (one N-tile), there is no NEXT (m,n) pair, so no " "inter-pair flush at all. Pipeline efficiency is bottlenecked " "only by head latency and the final drain.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) def _render_tiling_32x128x128(slide): """32×128×128 — K-loop + 3 inter-(m,n) flushes (N_tiles=4 → 4 pairs).""" _textbox(slide, 0.4, 1.0, 12.6, 0.55, "Scheduler tile = 32×64×32 → 1·2·4 = 8 tiles. A is split along " "K (2 tiles); B along K and N (2×4); C along N (4). For each " "(m,n) the K-loop accumulates in RegFile; STORE + DMA_W fire " "only on last K → 3 inter-(m,n) flushes between the 4 pairs.", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) # ── Matrix formula C = A · B (centered) ────────────────────────── tile_w = 0.55 tile_h = 0.55 matrix_top = 2.10 # C (1 × 4) c_x = 1.50 c_y = matrix_top _draw_matrix_tiles(slide, x=c_x, y=c_y, n_rows=1, n_cols=4, tile_w=tile_w, tile_h=tile_h, fill=RGBColor(0xFE, 0xF3, 0xC7), border=COL_REG_BORDER, label_prefix="C", title="C (32 × 128)") _textbox(slide, c_x, c_y + tile_h + 0.05, 4 * tile_w, 0.22, "← N (4 tiles) →", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER) # "=" eq_x = c_x + 4 * tile_w + 0.20 _textbox(slide, eq_x, c_y - 0.05, 0.40, 0.70, "=", size=26, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) # A (1 × 2) a_x = eq_x + 0.55 a_y = matrix_top _draw_matrix_tiles(slide, x=a_x, y=a_y, n_rows=1, n_cols=2, tile_w=tile_w, tile_h=tile_h, fill=RGBColor(0xDB, 0xEA, 0xFE), border=COL_HBM_BORDER, label_prefix="A", title="A (32 × 128)") _textbox(slide, a_x, a_y + tile_h + 0.05, 2 * tile_w, 0.22, "← K (2 tiles) →", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER) # "·" dot_x = a_x + 2 * tile_w + 0.15 _textbox(slide, dot_x, c_y - 0.10, 0.30, 0.70, "·", size=28, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) # B (2 × 4) b_x = dot_x + 0.45 b_y = matrix_top - tile_h / 2 # center on the row of A and C _draw_matrix_tiles(slide, x=b_x, y=b_y, n_rows=2, n_cols=4, tile_w=tile_w, tile_h=tile_h, fill=RGBColor(0xDB, 0xEA, 0xFE), border=COL_HBM_BORDER, label_prefix="B", title="B (128 × 128)") _textbox(slide, b_x, b_y + 2 * tile_h + 0.05, 4 * tile_w, 0.22, "← N (4 tiles) →", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER) _textbox(slide, b_x - 0.45, b_y + tile_h - 0.10, 0.40, 0.30, "K\n↓", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER) # Formula _textbox(slide, 0.4, matrix_top + 2 * tile_h + 0.30, 12.6, 0.40, "Cn = A0·B0n + A1·B1n (K-loop, 2 iters per (m,n) — " "accumulator stays in RegFile)", size=13, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) # ── Timeline: 4 (m,n) pairs + 3 inter-pair flushes ────────────── tl_y = matrix_top + 2 * tile_h + 1.10 _textbox(slide, 0.4, tl_y - 0.30, 12.6, 0.22, "Execution timeline — 4 (m,n) outputs, 3 inter-(m,n) flushes:", size=12, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) pair_w = 1.85 pair_h = 0.75 flush_w = 0.65 total_w = 4 * pair_w + 3 * flush_w start_x = (SLIDE_W_IN - total_w) / 2 pair_x = start_x for n in range(4): k0_w = pair_w * 0.45 k1_w = pair_w * 0.53 _rrect(slide, pair_x, tl_y, k0_w, pair_h, RGBColor(0xD1, 0xFA, 0xE5), COL_TCM_BORDER, "K=0\n(accum)", size=9, bold=True, color=COL_TCM_BORDER) _rrect(slide, pair_x + k0_w + 0.02, tl_y, k1_w, pair_h, RGBColor(0xFE, 0xF3, 0xC7), COL_REG_BORDER, "K=1 last\nSTORE +\nDMA_W", size=9, bold=True, color=COL_REG_BORDER) _textbox(slide, pair_x, tl_y + pair_h + 0.05, pair_w, 0.22, f"(0,{n}) → C{n}", size=10, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) if n < 3: flush_x = pair_x + pair_w + 0.05 _rrect(slide, flush_x, tl_y, flush_w - 0.10, pair_h, RGBColor(0xFE, 0xE2, 0xE2), COL_RED, "FLUSH\n(DMA_W\n vs DMA_R)", size=8, bold=True, color=COL_RED) pair_x = flush_x + flush_w - 0.05 else: pair_x += pair_w + 0.05 # ── Bottom note ──────────────────────────────────────────────── _textbox(slide, 0.4, 6.30, 12.6, 0.70, "Why flushes hurt: at every (m,n) boundary, DMA_W of Cn " "competes with DMA_R of the next pair for the cube-shared " "HBM channel. Inter-flush count = (M_tiles · N_tiles − 1) — " "for 32×128×128 that's 1·4−1 = 3 flushes. Bigger N → more " "flushes; bigger K alone (with small N) → none.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) _RENDERERS = { "pipeline_structure": _render_pipeline_structure, "scheduler": _render_scheduler, "sequence_32x128x32": _render_sequence_32x128x32, "sequence_32x128x128": _render_sequence_32x128x128, "tiling_32x128x32": _render_tiling_32x128x32, "tiling_32x128x128": _render_tiling_32x128x128, } # ── Bar-chart slides (read from docs/diagrams/gemm_sweep.json) ───────────── import json GEMM_SWEEP_JSON = DIAG / "gemm_sweep.json" def _under_tile(M, K, N, tile_M, tile_K, tile_N): return M < tile_M or K < tile_K or N < tile_N STAGE_KEYS = ["DMA_READ", "FETCH", "GEMM", "DMA_WRITE"] STAGE_DISPLAY = { "DMA_READ": "DMA in", "FETCH": "Fetch", "GEMM": "GEMM", "MATH": "Math", "DMA_WRITE": "DMA out", } STAGE_COLORS_PPTX = { "DMA_READ": RGBColor(0x3B, 0x82, 0xF6), "FETCH": RGBColor(0x10, 0xB9, 0x81), "GEMM": RGBColor(0xF5, 0x9E, 0x0B), "MATH": RGBColor(0xEF, 0x44, 0x44), "DMA_WRITE": RGBColor(0xA8, 0x55, 0xF7), } VARIANT_COLORS_PPTX = { "ref_ref": RGBColor(0x10, 0xB9, 0x81), "load_ref": RGBColor(0xF5, 0x9E, 0x0B), "load_load": RGBColor(0xEF, 0x44, 0x44), } def _shape_label(r: dict) -> str: if r["M"] == r["K"] == r["N"]: return f"M=K=N={r['M']}" return f"M={r['M']}\nK={r['K']}\nN={r['N']}" def _draw_native_bar_chart(slide, *, plot_x, plot_y, plot_w, plot_h, shape_labels, flagged, tile_counts, series, colors_map, display_map, wall_clocks=None, y_label="ns", legend_x, legend_w, foot_note=None, threshold_line=None, flagged_bar_color=None, flagged_series_only=None): """Render a grouped bar chart natively in PPTX. Linear Y scale. series: dict[str -> list[float]] — series_name → value per shape. threshold_line: if set, draws a dashed horizontal reference line at this y-value. flagged_bar_color: if set, overrides colors_map[sname] for flagged shapes. flagged_series_only: if set, the flag override applies only to this series. """ n_shapes = len(shape_labels) if n_shapes == 0: return series_names = list(series.keys()) n_series = len(series_names) # Y-axis range: linear, top = max value * 1.10 (include wall-clock). all_vals = [v for vals in series.values() for v in vals if v > 0] if wall_clocks: all_vals.extend([w for w in wall_clocks if w > 0]) if threshold_line is not None: all_vals.append(threshold_line) y_max = max(all_vals) * 1.10 if all_vals else 1.0 def y_of(v): v = max(v, 0.0) return plot_y + plot_h * (1 - v / y_max) # Plot box background _rect_band(slide, plot_x, plot_y, plot_w, plot_h, RGBColor(0xFF, 0xFF, 0xFF), RGBColor(0xCB, 0xD5, 0xE1)) # Y-axis ticks (6 levels) for i in range(6): v = y_max * i / 5 y = y_of(v) # gridline s = slide.shapes.add_connector(1, Inches(plot_x), Inches(y), Inches(plot_x + plot_w), Inches(y)) s.line.color.rgb = RGBColor(0xE2, 0xE8, 0xF0) s.line.width = Pt(0.5) # label _textbox(slide, plot_x - 0.85, y - 0.12, 0.75, 0.25, f"{v:>8.0f}", size=9, color=COL_MUTED, align=PP_ALIGN.RIGHT) # Y-axis title _textbox(slide, plot_x - 0.85, plot_y + plot_h / 2 - 0.15, 0.75, 0.3, y_label, size=10, color=COL_TEXT_DARK) # Threshold reference line (dashed) if threshold_line is not None: ty = y_of(threshold_line) line = slide.shapes.add_connector( 1, Inches(plot_x), Inches(ty), Inches(plot_x + plot_w), Inches(ty), ) line.line.color.rgb = COL_TEXT_DARK line.line.width = Pt(1.5) from pptx.oxml.ns import qn from lxml import etree ln = line.line._get_or_add_ln() pr = ln.find(qn("a:prstDash")) if pr is None: pr = etree.SubElement(ln, qn("a:prstDash")) pr.set("val", "dash") _textbox(slide, plot_x + plot_w - 0.7, ty - 0.30, 0.7, 0.25, f"{threshold_line:.0f}% peak", size=9, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.RIGHT) # Geometry per shape group group_w = plot_w / (n_shapes * 1.4) bar_w = group_w / max(n_series, 1) gap = (plot_w - n_shapes * group_w) / (n_shapes + 1) y_base = plot_y + plot_h for i in range(n_shapes): x_group = plot_x + gap + i * (group_w + gap) cx = x_group + group_w / 2 for j, sname in enumerate(series_names): v = series[sname][i] if v <= 0: continue bx = x_group + j * bar_w y_top = y_of(v) s = slide.shapes.add_shape( MSO_SHAPE.RECTANGLE, Inches(bx), Inches(y_top), Inches(bar_w * 0.85), Inches(y_base - y_top), ) s.fill.solid() apply_flag = (flagged_bar_color is not None and flagged[i] and (flagged_series_only is None or sname == flagged_series_only)) fill_color = flagged_bar_color if apply_flag else colors_map[sname] s.fill.fore_color.rgb = fill_color s.line.color.rgb = COL_TEXT_DARK s.line.width = Pt(0.4) # Wall-clock dot if wall_clocks and wall_clocks[i] > 0: wy = y_of(wall_clocks[i]) r = 0.05 d = slide.shapes.add_shape( MSO_SHAPE.OVAL, Inches(cx - r), Inches(wy - r), Inches(r * 2), Inches(r * 2), ) d.fill.solid() d.fill.fore_color.rgb = COL_TEXT_DARK d.line.color.rgb = COL_TEXT_DARK # Shape label below the group lab_color = COL_RED if flagged[i] else COL_TEXT_DARK _textbox(slide, x_group, y_base + 0.05, group_w, 0.7, shape_labels[i], size=9, bold=flagged[i], color=lab_color) if tile_counts: _textbox(slide, x_group, y_base + 0.65, group_w, 0.3, f"{tile_counts[i]} tiles", size=8, color=COL_MUTED) if flagged[i]: _textbox(slide, x_group, y_base + 0.92, group_w, 0.3, "↑ under-tile", size=8, bold=True, color=COL_RED) # Legend on the right _textbox(slide, legend_x, plot_y, legend_w, 0.3, "Stages (per bar):" if "DMA_READ" in series_names else "Variants (per bar):", size=11, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) ly = plot_y + 0.4 for sname in series_names: # color swatch sw = slide.shapes.add_shape( MSO_SHAPE.RECTANGLE, Inches(legend_x), Inches(ly), Inches(0.2), Inches(0.2), ) sw.fill.solid() sw.fill.fore_color.rgb = colors_map[sname] sw.line.color.rgb = COL_TEXT_DARK sw.line.width = Pt(0.5) _textbox(slide, legend_x + 0.28, ly - 0.05, legend_w - 0.3, 0.3, display_map.get(sname, sname), size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) ly += 0.28 if wall_clocks: ly += 0.08 d = slide.shapes.add_shape( MSO_SHAPE.OVAL, Inches(legend_x + 0.05), Inches(ly + 0.04), Inches(0.12), Inches(0.12), ) d.fill.solid() d.fill.fore_color.rgb = COL_TEXT_DARK d.line.color.rgb = COL_TEXT_DARK _textbox(slide, legend_x + 0.28, ly - 0.05, legend_w - 0.3, 0.3, "kernel wall-clock", size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) # Extra legend entry for flagged (under-tile) bars if flagged_bar_color is not None and any(flagged): ly += 0.30 sw = slide.shapes.add_shape( MSO_SHAPE.RECTANGLE, Inches(legend_x), Inches(ly), Inches(0.2), Inches(0.2), ) sw.fill.solid() sw.fill.fore_color.rgb = flagged_bar_color sw.line.color.rgb = COL_TEXT_DARK sw.line.width = Pt(0.5) flagged_label_target = flagged_series_only or ( list(display_map.keys())[0] if display_map else "value" ) flagged_label = ( display_map.get(flagged_label_target, flagged_label_target) + " (under-tile)" ) _textbox(slide, legend_x + 0.28, ly - 0.05, legend_w - 0.3, 0.3, flagged_label, size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) if foot_note: _textbox(slide, plot_x, y_base + 1.3, plot_w, 0.4, foot_note, size=10, color=COL_MUTED, align=PP_ALIGN.LEFT) EXCLUDED_SHAPES = {(512, 512, 512)} def _load_sweep_data() -> dict: if not GEMM_SWEEP_JSON.exists(): return {"rows": []} data = json.loads(GEMM_SWEEP_JSON.read_text()) data["rows"] = [ r for r in data.get("rows", []) if (r["M"], r["K"], r["N"]) not in EXCLUDED_SHAPES ] return data def _render_stage_breakdown(slide, variant: str, *, per_tile: bool = False): """Stage breakdown for one variant. Linear Y. If per_tile, divide by tile count. Uses wall_ns (interval-union of records on each engine) — the honest engine-active time. Avoids the double-counting that occupancy_ns has when multiple ops overlap on a contended resource (HBM_CTRL queue). """ data = _load_sweep_data() rows = [r for r in data["rows"] if r.get("variant") == variant] if not rows: _textbox(slide, 0.4, 3.0, 12.6, 1.0, f"No sweep data found for variant '{variant}'. " f"Run scripts/gemm_sweep.py first.", size=14, color=COL_RED, align=PP_ALIGN.LEFT) return tile = data["tile_sizes"] subtitle_unit = "Per-tile" if per_tile else "Per-stage" _textbox(slide, 0.4, 1.0, 12.6, 0.45, f"Variant: {variant} | {subtitle_unit} engine wall-clock " f"(linear) — DMA in / Fetch / GEMM / DMA out per shape. " f"Tile size {tile['M']}×{tile['K']}×{tile['N']}.", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) shape_labels = [_shape_label(r) for r in rows] flagged = [_under_tile(r["M"], r["K"], r["N"], tile["M"], tile["K"], tile["N"]) for r in rows] tile_counts = [r["tile_count_expected"] for r in rows] def _val(r, s): v = r.get("stages", {}).get(s, {}).get("wall_ns", 0.0) if per_tile: tc = r.get("tile_count_expected", 0) or 1 return v / tc return v series = {s: [_val(r, s) for r in rows] for s in STAGE_KEYS} foot_note = ( "Bars = engine wall-clock ÷ tile count (amortized per-tile cost). " "Falls with tile count as the pipeline fills." if per_tile else "Bars = engine wall-clock interval (max t_end − min t_start, " "merged overlaps). Strips queue-wait double-counting." ) _draw_native_bar_chart( slide, plot_x=1.0, plot_y=1.65, plot_w=10.0, plot_h=4.45, shape_labels=shape_labels, flagged=flagged, tile_counts=tile_counts, series=series, colors_map=STAGE_COLORS_PPTX, display_map=STAGE_DISPLAY, wall_clocks=None, y_label="ns/tile" if per_tile else "ns", legend_x=11.4, legend_w=1.85, foot_note=foot_note, ) def _render_stage_breakdown_ref_ref(slide): _render_stage_breakdown(slide, "ref_ref") def _render_stage_breakdown_load_ref(slide): _render_stage_breakdown(slide, "load_ref") def _render_stage_breakdown_load_load(slide): _render_stage_breakdown(slide, "load_load") def _render_variant_comparison(slide): """Wall-clock per shape per variant (3 bars per shape).""" data = _load_sweep_data() rows = data["rows"] if not rows: _textbox(slide, 0.4, 3.0, 12.6, 1.0, "No sweep data. Run scripts/gemm_sweep.py first.", size=14, color=COL_RED, align=PP_ALIGN.LEFT) return tile = data["tile_sizes"] variants = data.get("variants", ["ref_ref", "load_ref", "load_load"]) # Group by shape (preserve first-seen order). by_shape: dict = {} for r in rows: key = (r["M"], r["K"], r["N"]) by_shape.setdefault(key, {})[r["variant"]] = r shapes = list(by_shape.keys()) sample = next(iter(by_shape.values()))[next(iter(by_shape[shapes[0]]))] sample_label = _shape_label(sample) _ = sample_label # silence unused warning shape_labels = [_shape_label(by_shape[k][next(iter(by_shape[k]))]) for k in shapes] flagged = [_under_tile(k[0], k[1], k[2], tile["M"], tile["K"], tile["N"]) for k in shapes] tile_counts = [by_shape[k][next(iter(by_shape[k]))]["tile_count_expected"] for k in shapes] series = { v: [(by_shape[k].get(v) or {}).get("pe_window_ns", 0.0) for k in shapes] for v in variants } display_map = {v: v for v in variants} _textbox(slide, 0.4, 1.0, 12.6, 0.4, "Kernel wall-clock per variant per shape (linear). " "ref_ref baseline, load_ref pins A, load_load pins both A and B.", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) _draw_native_bar_chart( slide, plot_x=1.0, plot_y=1.6, plot_w=10.0, plot_h=4.5, shape_labels=shape_labels, flagged=flagged, tile_counts=tile_counts, series=series, colors_map=VARIANT_COLORS_PPTX, display_map=display_map, wall_clocks=None, y_label="wall ns", legend_x=11.4, legend_w=1.85, foot_note=("After Phase 2 fix (gated STORE/DMA_WRITE + pinned operand " "skip): load_ref / load_load are faster than ref_ref."), ) def _render_hbm_topology(slide): """Show cube-shared HBM_CTRL path: PE → router → HBM_CTRL → HBM.""" _textbox(slide, 0.4, 1.0, 12.6, 0.55, "DMA reads cross the cube fabric — HBM_CTRL is one per cube " "(NOT per PE). All 8 PEs serialize at the controller's single " "channel resource. Even one active PE pays the round-trip on " "every K-tile miss.", size=13, color=COL_MUTED, align=PP_ALIGN.LEFT) # Cube outline (encloses 8 PEs + their TCMs + the router mesh) cube_x, cube_y, cube_w, cube_h = 0.4, 1.75, 6.4, 4.45 s = slide.shapes.add_shape( MSO_SHAPE.RECTANGLE, Inches(cube_x), Inches(cube_y), Inches(cube_w), Inches(cube_h), ) s.fill.background() s.line.color.rgb = COL_MUTED s.line.width = Pt(1.5) _textbox(slide, cube_x + 0.1, cube_y + 0.04, 2.5, 0.3, "CUBE (8 PEs share HBM)", size=11, bold=True, color=COL_MUTED, align=PP_ALIGN.LEFT) # 8 PEs in a 4-cols × 2-rows grid, each with TCM beneath pe_w = 1.05 pe_h = 0.62 tcm_h = 0.42 col_gap = 0.18 row_gap = 0.30 grid_x0 = cube_x + 0.25 grid_y0 = cube_y + 0.50 row_h = pe_h + tcm_h + row_gap pe_right_y = [] # y-center of each PE for the link arrows for i in range(8): row = i // 4 col = i % 4 px = grid_x0 + col * (pe_w + col_gap) py = grid_y0 + row * row_h _rrect(slide, px, py, pe_w, pe_h, RGBColor(0xFE, 0xF3, 0xC7), COL_REG_BORDER, f"PE{i}", size=11, bold=True, color=COL_TEXT_DARK) _rrect(slide, px, py + pe_h + 0.05, pe_w, tcm_h, COL_TCM_BG, COL_TCM_BORDER, "TCM (local)\n512 GB/s", size=8, color=COL_TCM_BORDER) if col == 3: pe_right_y.append((px + pe_w, py + pe_h / 2)) # Router mesh strip on the right edge of the cube router_x = cube_x + cube_w - 0.85 router_y = cube_y + 0.50 router_w = 0.65 router_h = cube_h - 0.65 _rrect(slide, router_x, router_y, router_w, router_h, RGBColor(0xDB, 0xEA, 0xFE), COL_DMA, "ROUTER\nMESH\n\n256 GB/s\nper link", size=10, bold=True, color=COL_DMA) # Arrows from each row's last PE → router strip for (rx, ry) in pe_right_y: _arrow(slide, rx + 0.02, ry, router_x, ry, color=COL_DMA, width_pt=1.4) # HBM_CTRL just outside the cube on the right ctrl_x = cube_x + cube_w + 0.45 ctrl_w = 1.85 ctrl_h = 1.55 ctrl_y = cube_y + cube_h / 2 - ctrl_h / 2 _rrect(slide, ctrl_x, ctrl_y, ctrl_w, ctrl_h, RGBColor(0xFE, 0xE2, 0xE2), COL_RED, "HBM_CTRL\n(1 per cube)\n\nread channel\ncap = 1\n", size=11, bold=True, color=COL_RED) # Bottleneck label _textbox(slide, ctrl_x - 0.1, ctrl_y + ctrl_h + 0.05, ctrl_w + 0.2, 0.35, "BOTTLENECK", size=12, bold=True, color=COL_RED, align=PP_ALIGN.CENTER) # Router strip → HBM_CTRL arrow link_y = cube_y + cube_h / 2 _arrow(slide, router_x + router_w, link_y, ctrl_x, link_y, color=COL_DMA, width_pt=3.0) _textbox(slide, router_x + router_w + 0.02, link_y - 0.36, ctrl_x - (router_x + router_w) - 0.05, 0.3, "256 GB/s", size=10, bold=True, color=COL_DMA, align=PP_ALIGN.CENTER) # HBM banks on the far right hbm_x = ctrl_x + ctrl_w + 0.45 hbm_w = 1.85 hbm_h = ctrl_h + 0.35 hbm_y = ctrl_y - 0.175 _rrect(slide, hbm_x, hbm_y, hbm_w, hbm_h, COL_HBM_BG, COL_HBM_BORDER, "HBM BANKS\n(per-cube)\n\n256 GB/s\naggregated", size=11, bold=True, color=COL_HBM_BORDER) _arrow(slide, ctrl_x + ctrl_w, link_y, hbm_x, link_y, color=COL_DMA, width_pt=3.0) # Side-by-side key takeaways at the bottom _textbox(slide, 0.4, 6.40, 6.3, 0.55, "TCM is per-PE local → fetch/store don't contend.\n" "HBM_CTRL is cube-shared → every DMA serializes on cap=1 channel.", size=11, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) _textbox(slide, 6.9, 6.40, 6.2, 0.55, "Per-op DMA cost grows with #in-flight ops even on 1 PE.\n" "load_ref/load_load pin operands → 1 HBM trip instead of per K-tile.", size=11, color=COL_RED, align=PP_ALIGN.LEFT) def _render_per_op_dma(slide): """Per-op DMA_READ cost = wall_ns / record_count. wall_ns is interval-union of all DMA_READ records ≈ (max t_end - min t_start) when ops overlap. Dividing by count gives the amortized per-op cost in the DMA window — converges to the bandwidth-bound floor. load_load is excluded — its eager up-front DMAs sit outside the composite plan so their stage_type isn't DMA_READ and they don't appear here. """ data = _load_sweep_data() rows = data["rows"] if not rows: _textbox(slide, 0.4, 3.0, 12.6, 1.0, "No sweep data. Run scripts/gemm_sweep.py first.", size=14, color=COL_RED, align=PP_ALIGN.LEFT) return tile = data["tile_sizes"] variants = ["ref_ref", "load_ref"] by_shape: dict = {} for r in rows: key = (r["M"], r["K"], r["N"]) by_shape.setdefault(key, {})[r["variant"]] = r shapes = list(by_shape.keys()) shape_labels = [_shape_label(by_shape[k][next(iter(by_shape[k]))]) for k in shapes] flagged = [_under_tile(k[0], k[1], k[2], tile["M"], tile["K"], tile["N"]) for k in shapes] tile_counts = [by_shape[k][next(iter(by_shape[k]))]["tile_count_expected"] for k in shapes] def _ns_per_op(r): s = r.get("stages", {}).get("DMA_READ", {}) cnt = s.get("record_count", 0) wall = s.get("wall_ns", 0.0) return (wall / cnt) if cnt else 0.0 series = { v: [_ns_per_op(by_shape[k].get(v) or {"stages": {}}) for k in shapes] for v in variants } display_map = {v: v for v in variants} _textbox(slide, 0.4, 1.0, 12.6, 0.55, "Amortized per-op cost = (DMA window wall-clock) ÷ (#DMA ops). " "Strips out queue-wait double-counting: when N ops overlap, the " "window is N·drain_ns, so the average per op = drain_ns ≈ 16 ns " "(bandwidth-bound floor at 4 KB ÷ 256 GB/s).", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) _draw_native_bar_chart( slide, plot_x=1.0, plot_y=1.7, plot_w=10.0, plot_h=4.4, shape_labels=shape_labels, flagged=flagged, tile_counts=tile_counts, series=series, colors_map=VARIANT_COLORS_PPTX, display_map=display_map, wall_clocks=None, y_label="ns / op", legend_x=11.4, legend_w=1.85, foot_note=("Flat ~16-20 ns across shapes confirms the per-op " "transfer is constant — what looked like growing per-op " "cost on slide before was queue wait being absorbed."), ) def _render_mac_utilization(slide): """GEMM util (shape fill) AND Useful pipeline eff (computed from formula). Useful eff = pipe_eff × GEMM_util, where pipe_eff = (N_tiles × T_stage) / (head + N_tiles × T_stage + inter DMA_W) """ data = _load_sweep_data() rows = data["rows"] if not rows: _textbox(slide, 0.4, 3.0, 12.6, 1.0, "No sweep data. Run scripts/gemm_sweep.py first.", size=14, color=COL_RED, align=PP_ALIGN.LEFT) return tile = data["tile_sizes"] TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"] tile_flops = 2 * TILE_M * TILE_K * TILE_N HBM_GBS = 256.0 bpe = 2 T_STAGE = 16.0 D_STAGES = 3 head_ns = (D_STAGES - 1) * T_STAGE dma_w_per_pair_ns = (TILE_M * TILE_N * bpe) / HBM_GBS by_shape: dict = {} for r in rows: if r["variant"] != "load_ref": continue by_shape[(r["M"], r["K"], r["N"])] = r shapes = list(by_shape.keys()) shape_labels = [_shape_label(by_shape[k]) for k in shapes] flagged = [_under_tile(k[0], k[1], k[2], TILE_M, TILE_K, TILE_N) for k in shapes] tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes] gemm_util = [] useful_eff = [] for k in shapes: r = by_shape[k] M, K, N = r["M"], r["K"], r["N"] useful = 2 * M * K * N tiles = r["tile_count_expected"] gu = useful / (tile_flops * tiles) * 100 gemm_util.append(gu) m_tiles = (M + TILE_M - 1) // TILE_M n_tiles = (N + TILE_N - 1) // TILE_N n_mn = m_tiles * n_tiles gemm_total = tiles * T_STAGE inter_dma_w = max(0, n_mn - 1) * dma_w_per_pair_ns wall = head_ns + gemm_total + inter_dma_w ueff = (gemm_total * (gu / 100.0) / wall) * 100 if wall > 0 else 0.0 useful_eff.append(ueff) series = { "GEMM util": gemm_util, "Useful eff": useful_eff, } colors_map = { "GEMM util": COL_FS, # emerald "Useful eff": RGBColor(0xF5, 0x9E, 0x0B), # amber } display_map = { "GEMM util": "GEMM util %", "Useful eff": "Useful eff %", } _textbox(slide, 0.4, 1.0, 12.6, 0.80, "FORMULA-generated (analytical ideal-pipeline model — not " "simulator data). GEMM util = useful FLOPs ÷ (tile FLOPs × " "tile count). Useful eff = (N_tiles × T_stage × GEMM_util) " "÷ wall, where wall = head + K-loop + inter-(m,n) DMA_W. " "Slide 16 overlays this against measured pe_window_ns.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) _draw_native_bar_chart( slide, plot_x=1.0, plot_y=1.75, plot_w=10.0, plot_h=4.35, shape_labels=shape_labels, flagged=flagged, tile_counts=tile_counts, series=series, colors_map=colors_map, display_map=display_map, wall_clocks=None, y_label="%", legend_x=11.4, legend_w=1.85, foot_note=("GEMM util < 100% → shape mismatch (padded zeros). " "Useful eff < GEMM util → pipeline overhead " "(head + inter-(m,n) DMA_W) eats more of the wall."), threshold_line=100.0, flagged_bar_color=COL_RED, flagged_series_only="GEMM util", ) def _render_mac_utilization_ref_ref(slide): """Same metric as slide 14 but for the ref_ref variant. In ref_ref both A and B are loaded from HBM by the scheduler — that's TWO back-to-back DMA_R per tile, so the DMA stage takes 2 × T_stage = 32 ns/tile while FETCH/GEMM are still 16 ns/tile. The pipeline is DMA-bound — steady-state cycle = 32 ns/tile — so useful pipeline efficiency caps near 50 % × GEMM_util. """ data = _load_sweep_data() rows = data["rows"] if not rows: _textbox(slide, 0.4, 3.0, 12.6, 1.0, "No sweep data. Run scripts/gemm_sweep.py first.", size=14, color=COL_RED, align=PP_ALIGN.LEFT) return tile = data["tile_sizes"] TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"] tile_flops = 2 * TILE_M * TILE_K * TILE_N HBM_GBS = 256.0 bpe = 2 T_STAGE_COMPUTE = 16.0 # FETCH = GEMM = 16 ns/tile T_STAGE_DMA_REF_REF = 2 * 16.0 # 2 DMA_R per tile (A + B) T_STAGE = T_STAGE_DMA_REF_REF # DMA-bound steady-state cycle D_STAGES = 3 # DMA, FETCH, GEMM head_ns = (D_STAGES - 1) * T_STAGE_COMPUTE # pipeline fill = 32 ns dma_w_per_pair_ns = (TILE_M * TILE_N * bpe) / HBM_GBS by_shape: dict = {} for r in rows: if r["variant"] != "ref_ref": continue by_shape[(r["M"], r["K"], r["N"])] = r shapes = list(by_shape.keys()) shape_labels = [_shape_label(by_shape[k]) for k in shapes] flagged = [_under_tile(k[0], k[1], k[2], TILE_M, TILE_K, TILE_N) for k in shapes] tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes] gemm_util = [] useful_eff = [] for k in shapes: r = by_shape[k] M, K, N = r["M"], r["K"], r["N"] useful = 2 * M * K * N tiles = r["tile_count_expected"] gu = useful / (tile_flops * tiles) * 100 gemm_util.append(gu) m_tiles = (M + TILE_M - 1) // TILE_M n_tiles = (N + TILE_N - 1) // TILE_N n_mn = m_tiles * n_tiles # Useful compute time: each tile delivers T_STAGE_COMPUTE worth # of MAC. Steady-state pipeline cycle is DMA-bound (32 ns/tile). compute_total = tiles * T_STAGE_COMPUTE wall_steady = tiles * T_STAGE inter_dma_w = max(0, n_mn - 1) * dma_w_per_pair_ns wall = head_ns + wall_steady + inter_dma_w ueff = (compute_total * (gu / 100.0) / wall) * 100 \ if wall > 0 else 0.0 useful_eff.append(ueff) series = { "GEMM util": gemm_util, "Useful eff": useful_eff, } colors_map = { "GEMM util": COL_FS, "Useful eff": RGBColor(0xF5, 0x9E, 0x0B), } display_map = { "GEMM util": "GEMM util %", "Useful eff": "Useful eff % (ref_ref)", } _textbox(slide, 0.4, 1.0, 12.6, 0.85, "FORMULA-generated (ideal-pipeline model, ref_ref variant — " "not simulator data). Scheduler issues DMA_R for BOTH A and B " "every tile. Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH " "and GEMM stay at 16 ns. Pipeline cycle is DMA-bound → useful " "eff caps near 50 % × GEMM_util, regardless of K-loop length.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) _draw_native_bar_chart( slide, plot_x=1.0, plot_y=1.85, plot_w=10.0, plot_h=4.25, shape_labels=shape_labels, flagged=flagged, tile_counts=tile_counts, series=series, colors_map=colors_map, display_map=display_map, wall_clocks=None, y_label="%", legend_x=11.4, legend_w=1.85, foot_note=("Compare to load_ref (prev slide): tl.load pins A in " "TCM once, eliminating the per-tile A DMA_R → DMA stage " "halves to 16 ns/tile → useful eff roughly doubles at " "the same GEMM util."), threshold_line=100.0, flagged_bar_color=COL_RED, flagged_series_only="GEMM util", ) def _render_mac_utilization_formula_vs_measured(slide): """Overlay slide: formula useful_eff vs measured useful_eff (load_ref). Formula = (N_tiles × T_stage × GEMM_util) / wall_formula × 100 wall_formula = head + N_tiles·T_stage + inter·DMA_W Measured = (useful_FLOPs / pe_window_ns) / peak_FLOPs_per_ns × 100 pe_window_ns from gemm_sweep.json (honest post Option B). Agreement validates the analytical model against the simulator. """ data = _load_sweep_data() rows = data["rows"] if not rows: _textbox(slide, 0.4, 3.0, 12.6, 1.0, "No sweep data. Run scripts/gemm_sweep.py first.", size=14, color=COL_RED, align=PP_ALIGN.LEFT) return tile = data["tile_sizes"] TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"] tile_flops = 2 * TILE_M * TILE_K * TILE_N HBM_GBS = 256.0 bpe = 2 T_STAGE = 16.0 D_STAGES = 3 head_ns = (D_STAGES - 1) * T_STAGE dma_w_per_pair_ns = (TILE_M * TILE_N * bpe) / HBM_GBS peak_per_ns = tile_flops / T_STAGE # MAC peak throughput, flops/ns by_shape: dict = {} for r in rows: if r["variant"] != "load_ref": continue by_shape[(r["M"], r["K"], r["N"])] = r shapes = list(by_shape.keys()) shape_labels = [_shape_label(by_shape[k]) for k in shapes] flagged = [_under_tile(k[0], k[1], k[2], TILE_M, TILE_K, TILE_N) for k in shapes] tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes] gemm_util_formula: list[float] = [] gemm_util_measured: list[float] = [] formula_eff: list[float] = [] measured_eff: list[float] = [] for k in shapes: r = by_shape[k] M, K, N = r["M"], r["K"], r["N"] useful = 2 * M * K * N tiles = r["tile_count_expected"] gu_formula = useful / (tile_flops * tiles) gemm_util_formula.append(gu_formula * 100) # Measured GEMM util uses the GEMM stage record count from op_log # (i.e. the actual number of GEMM tiles the simulator ran). gemm_record_count = ( r.get("stages", {}).get("GEMM", {}).get("record_count", 0) or tiles # fallback if stages dict missing for older sweep data ) gu_measured = useful / (tile_flops * gemm_record_count) \ if gemm_record_count > 0 else 0.0 gemm_util_measured.append(gu_measured * 100) m_tiles = (M + TILE_M - 1) // TILE_M n_tiles = (N + TILE_N - 1) // TILE_N n_mn = m_tiles * n_tiles compute_total = tiles * T_STAGE inter_dma_w = max(0, n_mn - 1) * dma_w_per_pair_ns wall_formula = head_ns + compute_total + inter_dma_w feff = (compute_total * gu_formula / wall_formula) * 100 \ if wall_formula > 0 else 0.0 formula_eff.append(feff) comp_window_ns = r.get("composite_window_ns", 0.0) or 0.0 if comp_window_ns > 0: meff = (useful / comp_window_ns / peak_per_ns) * 100 else: meff = 0.0 measured_eff.append(meff) series = { "GEMM util F": gemm_util_formula, "GEMM util M": gemm_util_measured, "Formula eff": formula_eff, "Measured eff": measured_eff, } colors_map = { "GEMM util F": COL_FS, # emerald (formula ceiling) "GEMM util M": RGBColor(0x6E, 0xE7, 0xB7), # mint (measured ceiling) "Formula eff": RGBColor(0xF5, 0x9E, 0x0B), # amber (formula eff) "Measured eff": COL_DMA, # blue (measured eff) } display_map = { "GEMM util F": "GEMM util % (formula)", "GEMM util M": "GEMM util % (measured, op_log)", "Formula eff": "Formula useful eff %", "Measured eff": "Measured useful eff %", } _textbox(slide, 0.4, 1.0, 12.6, 1.10, "Four series per shape:\n" " • GEMM util (formula, green): useful_FLOPs ÷ (tile_FLOPs × " "tile_count_expected) — structural shape-vs-tile ceiling.\n" " • GEMM util (measured, mint): useful_FLOPs ÷ (tile_FLOPs × " "GEMM_record_count) — uses the actual GEMM ops the simulator " "ran. Should equal formula → validates plan execution.\n" " • Formula useful eff (amber): GEMM_util × ideal pipeline efficiency.\n" " • Measured useful eff (blue): (useful_FLOPs ÷ " "composite_window_ns) ÷ peak — composite_window_ns covers " "only the tl.composite pipeline (excludes upfront tl.load).", size=10, color=COL_MUTED, align=PP_ALIGN.LEFT) _draw_native_bar_chart( slide, plot_x=1.0, plot_y=2.25, plot_w=10.0, plot_h=3.85, shape_labels=shape_labels, flagged=flagged, tile_counts=tile_counts, series=series, colors_map=colors_map, display_map=display_map, wall_clocks=None, y_label="%", legend_x=11.4, legend_w=1.85, foot_note=("Reading: GEMM util F = GEMM util M → simulator ran the " "expected tile plan. Measured eff uses composite-only " "window (excludes upfront tl.load), so it isolates " "pipeline efficiency. Variant: load_ref."), threshold_line=100.0, ) def _render_tflops_table(slide): """Ideal pipelined pipe_eff: assumes non-blocking tl.load + multi-channel HBM. Three-stage pipeline (DMA_R → FETCH → GEMM), all stages bandwidth-balanced at T_stage = 16 ns/tile. Wall = pipeline fill + steady-state K-loop + inter-(m,n) DMA_W (final flush excluded — tail, not in pipeline). """ data = _load_sweep_data() rows = data["rows"] if not rows: _textbox(slide, 0.4, 3.0, 12.6, 1.0, "No sweep data. Run scripts/gemm_sweep.py first.", size=14, color=COL_RED, align=PP_ALIGN.LEFT) return tile = data["tile_sizes"] TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"] HBM_GBS = 256.0 # bytes/ns bpe = 2 # f16 T_STAGE = 16.0 # ns per pipeline stage (all stages BW-balanced) D_STAGES = 3 # DMA_R, FETCH, GEMM head_ns = (D_STAGES - 1) * T_STAGE # pipeline fill: (D-1) × T_stage = 32 ns dma_w_per_pair_ns = (TILE_M * TILE_N * bpe) / HBM_GBS # = 8 ns by_shape: dict = {} for r in rows: by_shape.setdefault((r["M"], r["K"], r["N"]), {})[r["variant"]] = r shapes = list(by_shape.keys()) _textbox(slide, 0.4, 1.0, 12.6, 0.85, f"Ideal pipelined model — assumes non-blocking tl.load + " f"multi-channel HBM so DMA, FETCH and GEMM all run at " f"T_stage = {T_STAGE:.0f} ns/tile.\n" f" wall = head_latency + N_tiles × T_stage + Σ inter-(m,n) DMA_W\n" f" head_latency = (D−1) × T_stage = {head_ns:.0f} ns " f"(pipeline fill, D = {D_STAGES} stages).\n" f" inter DMA_W = (N_mn − 1) × {dma_w_per_pair_ns:.0f} ns " f"(final flush is tail — excluded).\n" f" useful eff = (N_tiles × T_stage × GEMM_util) / wall — " f"MAC time producing real output, not padded zeros.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) headers = ["Input shape", "GEMM util %", "Useful eff %"] n_rows = len(shapes) + 1 n_cols = len(headers) table_x = 2.5 table_y = 2.10 table_w = 8.4 table_h = 4.30 tbl_shape = slide.shapes.add_table( n_rows, n_cols, Inches(table_x), Inches(table_y), Inches(table_w), Inches(table_h), ) tbl = tbl_shape.table widths_in = [3.0, 2.5, 2.9] for ci, w in enumerate(widths_in): tbl.columns[ci].width = Inches(w) def _set_cell(ci, ri, text, *, bold=False, color=COL_TEXT_DARK, fill=None, size=12, align=PP_ALIGN.CENTER): cell = tbl.cell(ri, ci) if fill is not None: cell.fill.solid() cell.fill.fore_color.rgb = fill tf = cell.text_frame tf.margin_left = Emu(36000) tf.margin_right = Emu(36000) tf.margin_top = Emu(18000) tf.margin_bottom = Emu(18000) tf.word_wrap = True p = tf.paragraphs[0] p.alignment = align p.text = "" run = p.add_run() run.text = text run.font.size = Pt(size) run.font.bold = bold run.font.name = "Consolas" run.font.color.rgb = color for ci, h in enumerate(headers): _set_cell(ci, 0, h, bold=True, color=COL_TEXT_LIGHT, fill=RGBColor(0x10, 0x2A, 0x55), size=12) tile_flops = 2 * TILE_M * TILE_K * TILE_N for ri, k in enumerate(shapes, start=1): M, K, N = k useful = 2 * M * K * N any_row = next(iter(by_shape[k].values())) tiles = any_row["tile_count_expected"] gemm_util = useful / (tile_flops * tiles) * 100 is_under = _under_tile(M, K, N, TILE_M, TILE_K, TILE_N) m_tiles = (M + TILE_M - 1) // TILE_M n_tiles = (N + TILE_N - 1) // TILE_N n_mn = m_tiles * n_tiles gemm_total_ns = tiles * T_STAGE inter_dma_w_ns = max(0, n_mn - 1) * dma_w_per_pair_ns wall_ns = head_ns + gemm_total_ns + inter_dma_w_ns # Useful eff = pipe_eff × GEMM_util (shape-waste included). pipe_eff = ((gemm_total_ns * (gemm_util / 100.0)) / wall_ns) * 100 \ if wall_ns > 0 else 0.0 row_fill = (RGBColor(0xFE, 0xF2, 0xF2) if is_under else RGBColor(0xF8, 0xFA, 0xFC)) text_color = COL_RED if is_under else COL_TEXT_DARK shape_txt = f"{M}×{K}×{N}" + (" (under-tile)" if is_under else "") _set_cell(0, ri, shape_txt, bold=is_under, color=text_color, fill=row_fill, align=PP_ALIGN.LEFT) _set_cell(1, ri, f"{gemm_util:.1f} %", bold=is_under, color=text_color, fill=row_fill) _set_cell(2, ri, f"{pipe_eff:.1f} %", fill=row_fill, color=COL_TEXT_DARK, bold=True) _textbox(slide, 0.4, 6.50, 12.6, 0.6, "What the model could achieve with non-blocking tl.load + " "multi-channel HBM (A streams tile-by-tile, no upfront serial). " "Today's simulator caps at ~50 % for tall-skinny K because tl.load " "is whole-operand blocking and the cube has a single HBM channel.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) def _render_pipeline_eff_walkthrough(slide): """Visualize how Useful eff is calculated for one example shape. Example: 32×128×128 (8 K-tiles in 4 (m,n) groups, head + 3 inter DMA_W). Shows the 3-stage pipeline structure, the time-line of all GEMM tiles + overheads, and evaluates the formula numerically. """ # Example parameters (kept hardcoded — this slide is illustrative) M, K, N = 32, 128, 128 T_STAGE = 16 D_STAGES = 3 # DMA_R, FETCH, GEMM K_TILES = 2 # K=128 → K_tiles = 2 N_MN = 4 # M_tile × N_tile = 1 × 4 TOTAL_TILES = K_TILES * N_MN # 8 HEAD_NS = (D_STAGES - 1) * T_STAGE # 32 DMA_W_NS = 8 # 32×32×2 / 256 = 8 ns INTER_FLUSH_COUNT = N_MN - 1 # 3 flushes (final excluded) GEMM_TOTAL = TOTAL_TILES * T_STAGE # 128 INTER_DMA_W_TOTAL = INTER_FLUSH_COUNT * DMA_W_NS # 24 WALL = HEAD_NS + GEMM_TOTAL + INTER_DMA_W_TOTAL # 184 USEFUL_EFF = GEMM_TOTAL / WALL * 100 # 69.6% # Colors C_HEAD = RGBColor(0x94, 0xA3, 0xB8) # slate gray C_GEMM = RGBColor(0x10, 0xB9, 0x81) # emerald (useful) C_FLUSH = RGBColor(0xF5, 0x9E, 0x0B) # amber (inter-(m,n)) C_TAIL = RGBColor(0xFC, 0xA5, 0xA5) # light red (excluded) C_DMA_R = RGBColor(0x3B, 0x82, 0xF6) # blue C_FETCH = RGBColor(0x10, 0xB9, 0x81) C_GEMM_BOX = RGBColor(0xF5, 0x9E, 0x0B) _textbox(slide, 0.4, 1.0, 12.6, 0.5, f"Example: M=N=32, K=128, N-output=128 → {TOTAL_TILES} tiles in " f"{N_MN} output groups of {K_TILES} K-tiles each. T_stage = " f"{T_STAGE} ns/tile, pipeline depth D = {D_STAGES}.", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) # ── Top: 3-stage pipeline structure diagram ───────────────────── pd_y = 1.7 pd_h = 0.55 box_w = 1.4 box_gap = 0.35 pd_total_w = 3 * box_w + 2 * box_gap pd_x0 = (SLIDE_W_IN - pd_total_w) / 2 stages = [("DMA_R\nHBM → TCM", C_DMA_R, COL_TEXT_LIGHT), ("FETCH\nTCM → Reg", C_FETCH, COL_TEXT_LIGHT), ("GEMM\nMAC array", C_GEMM_BOX, COL_TEXT_DARK)] for i, (lbl, fill, tc) in enumerate(stages): x = pd_x0 + i * (box_w + box_gap) _rrect(slide, x, pd_y, box_w, pd_h, fill, COL_TEXT_DARK, lbl, size=10, bold=True, color=tc) if i < 2: ax1 = x + box_w + 0.03 ax2 = x + box_w + box_gap - 0.03 _arrow(slide, ax1, pd_y + pd_h / 2, ax2, pd_y + pd_h / 2, color=COL_MUTED, width_pt=1.8) _textbox(slide, pd_x0, pd_y + pd_h + 0.05, pd_total_w, 0.28, f"each stage = {T_STAGE} ns/tile → " f"head latency = (D−1) × T_stage = {HEAD_NS} ns", size=10, color=COL_MUTED, align=PP_ALIGN.CENTER) # ── Middle: timeline of the full kernel ───────────────────────── # Layout: head + 4 K-loops separated by 3 DMA_W + 1 tail (excluded) tl_y = 3.4 tl_h = 0.85 tl_label_y = tl_y - 0.3 tl_legend_y = tl_y + tl_h + 0.18 margin = 0.8 tl_total_ns = WALL + DMA_W_NS # include tail visually tl_w_total = SLIDE_W_IN - 2 * margin px_per_ns = tl_w_total / tl_total_ns cur_x = margin # Head w = HEAD_NS * px_per_ns _rect_band(slide, cur_x, tl_y, w, tl_h, C_HEAD, COL_TEXT_DARK) _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3, f"head\n{HEAD_NS} ns", size=9, bold=True, color=COL_TEXT_LIGHT) cur_x += w # K-loops + inter DMA_W for g in range(N_MN): # K-loop: K_TILES GEMM blocks for t in range(K_TILES): w = T_STAGE * px_per_ns _rect_band(slide, cur_x, tl_y, w, tl_h, C_GEMM, COL_TEXT_DARK) tile_no = g * K_TILES + t + 1 _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3, f"GEMM\nT{tile_no}", size=9, bold=True, color=COL_TEXT_LIGHT) cur_x += w # Mark (m,n) group label above group_start_x = cur_x - K_TILES * T_STAGE * px_per_ns group_w = K_TILES * T_STAGE * px_per_ns _textbox(slide, group_start_x, tl_label_y, group_w, 0.25, f"(m=0,n={g}) K-loop", size=9, bold=True, color=COL_TEXT_DARK) if g < N_MN - 1: w = DMA_W_NS * px_per_ns _rect_band(slide, cur_x, tl_y, w, tl_h, C_FLUSH, COL_TEXT_DARK) _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3, "flush", size=8, bold=True, color=COL_TEXT_DARK) cur_x += w # Tail flush (excluded) w = DMA_W_NS * px_per_ns _rect_band(slide, cur_x, tl_y, w, tl_h, C_TAIL, COL_RED) _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3, "tail", size=8, bold=True, color=COL_RED) _textbox(slide, cur_x - 0.3, tl_y + tl_h + 0.02, w + 0.6, 0.28, "(excluded\nfrom wall)", size=8, bold=True, color=COL_RED) # Wall span indicator below the timeline wall_end_x = margin + WALL * px_per_ns _arrow(slide, margin, tl_y + tl_h + 0.55, wall_end_x, tl_y + tl_h + 0.55, color=COL_TEXT_DARK, width_pt=1.5) _arrow(slide, wall_end_x, tl_y + tl_h + 0.55, margin, tl_y + tl_h + 0.55, color=COL_TEXT_DARK, width_pt=1.5) _textbox(slide, margin, tl_y + tl_h + 0.6, wall_end_x - margin, 0.3, f"wall = {WALL} ns", size=11, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) # ── Bottom: numerical evaluation ─────────────────────────────── formula_y = 5.7 _textbox(slide, 0.4, formula_y, 12.6, 0.35, f"wall = head + N_tiles × T_stage + (N_mn − 1) × T_dma_w = " f"{HEAD_NS} + {TOTAL_TILES}×{T_STAGE} + {INTER_FLUSH_COUNT}×{DMA_W_NS} " f"= {WALL} ns", size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) _textbox(slide, 0.4, formula_y + 0.35, 12.6, 0.35, f"GEMM useful time = N_tiles × T_stage × GEMM_util = " f"{TOTAL_TILES}×{T_STAGE}×100 % = {GEMM_TOTAL} ns", size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) _textbox(slide, 0.4, formula_y + 0.7, 12.6, 0.45, f"Useful efficiency = {GEMM_TOTAL} / {WALL} = {USEFUL_EFF:.1f} %", size=14, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) _textbox(slide, 0.4, formula_y + 1.15, 12.6, 0.35, "Overhead = head (pipeline fill) + inter-(m,n) flushes. " "Bigger K (more amortization) and smaller N (fewer groups) " "both raise the efficiency.", size=10, color=COL_MUTED, align=PP_ALIGN.LEFT) def _render_pipeline_eff_walkthrough_largeK(slide): """Walkthrough for the tall-skinny case: M=N=32, K=3072. N_mn = 1 (one output group) → zero inter-(m,n) DMA_W flushes. K-loop has 48 tiles — compressed visually (show first 4 + ... + last 2). """ M, K, N = 32, 3072, 32 T_STAGE = 16 D_STAGES = 3 K_TILES = 48 N_MN = 1 TOTAL_TILES = K_TILES HEAD_NS = (D_STAGES - 1) * T_STAGE DMA_W_NS = 8 GEMM_TOTAL = TOTAL_TILES * T_STAGE INTER_DMA_W_TOTAL = 0 WALL = HEAD_NS + GEMM_TOTAL + INTER_DMA_W_TOTAL USEFUL_EFF = GEMM_TOTAL / WALL * 100 C_HEAD = RGBColor(0x94, 0xA3, 0xB8) C_GEMM = RGBColor(0x10, 0xB9, 0x81) C_TAIL = RGBColor(0xFC, 0xA5, 0xA5) C_DMA_R = RGBColor(0x3B, 0x82, 0xF6) C_FETCH = RGBColor(0x10, 0xB9, 0x81) C_GEMM_BOX = RGBColor(0xF5, 0x9E, 0x0B) _textbox(slide, 0.4, 1.0, 12.6, 0.5, f"Example: M=N=32, K=3072 → {TOTAL_TILES} tiles, " f"N_mn = M_tiles × N_tiles = 1 × 1 = 1 → " f"no inter-(m,n) DMA_W flushes. Long K-loop amortizes the head.", size=12, color=COL_MUTED, align=PP_ALIGN.LEFT) # 3-stage pipeline structure (same as before) pd_y = 1.7 pd_h = 0.55 box_w = 1.4 box_gap = 0.35 pd_total_w = 3 * box_w + 2 * box_gap pd_x0 = (SLIDE_W_IN - pd_total_w) / 2 stages = [("DMA_R\nHBM → TCM", C_DMA_R, COL_TEXT_LIGHT), ("FETCH\nTCM → Reg", C_FETCH, COL_TEXT_LIGHT), ("GEMM\nMAC array", C_GEMM_BOX, COL_TEXT_DARK)] for i, (lbl, fill, tc) in enumerate(stages): x = pd_x0 + i * (box_w + box_gap) _rrect(slide, x, pd_y, box_w, pd_h, fill, COL_TEXT_DARK, lbl, size=10, bold=True, color=tc) if i < 2: ax1 = x + box_w + 0.03 ax2 = x + box_w + box_gap - 0.03 _arrow(slide, ax1, pd_y + pd_h / 2, ax2, pd_y + pd_h / 2, color=COL_MUTED, width_pt=1.8) _textbox(slide, pd_x0, pd_y + pd_h + 0.05, pd_total_w, 0.28, f"each stage = {T_STAGE} ns/tile → " f"head latency = (D−1) × T_stage = {HEAD_NS} ns", size=10, color=COL_MUTED, align=PP_ALIGN.CENTER) # Timeline — compressed (show 4 tiles + gap + 2 tiles) tl_y = 3.4 tl_h = 0.85 tl_label_y = tl_y - 0.3 margin = 0.8 visible_first = 4 visible_last = 2 skipped = TOTAL_TILES - visible_first - visible_last # Width budget: head + visible tiles + ellipsis block + tail ellipsis_ns_equiv = 6 * T_STAGE # rendered width = 6 tiles worth tl_total_ns = (HEAD_NS + (visible_first + visible_last) * T_STAGE + ellipsis_ns_equiv + DMA_W_NS) tl_w_total = SLIDE_W_IN - 2 * margin px_per_ns = tl_w_total / tl_total_ns cur_x = margin # Head w = HEAD_NS * px_per_ns _rect_band(slide, cur_x, tl_y, w, tl_h, C_HEAD, COL_TEXT_DARK) _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3, f"head\n{HEAD_NS} ns", size=9, bold=True, color=COL_TEXT_LIGHT) cur_x += w # Group label for the entire K-loop (one (m,n)) kloop_start_x = cur_x # First few tiles for t in range(visible_first): w = T_STAGE * px_per_ns _rect_band(slide, cur_x, tl_y, w, tl_h, C_GEMM, COL_TEXT_DARK) _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3, f"GEMM\nT{t + 1}", size=9, bold=True, color=COL_TEXT_LIGHT) cur_x += w # Ellipsis block ew = ellipsis_ns_equiv * px_per_ns _rect_band(slide, cur_x, tl_y, ew, tl_h, RGBColor(0x86, 0xEF, 0xAC), COL_TEXT_DARK) _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.2, ew, 0.5, f". . . {skipped} more GEMM tiles . . .", size=10, bold=True, color=COL_TEXT_DARK) cur_x += ew # Last tiles for t in range(visible_last): w = T_STAGE * px_per_ns _rect_band(slide, cur_x, tl_y, w, tl_h, C_GEMM, COL_TEXT_DARK) _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3, f"GEMM\nT{TOTAL_TILES - visible_last + t + 1}", size=9, bold=True, color=COL_TEXT_LIGHT) cur_x += w # K-loop group label kloop_end_x = cur_x _textbox(slide, kloop_start_x, tl_label_y, kloop_end_x - kloop_start_x, 0.25, f"(m=0,n=0) K-loop — all {TOTAL_TILES} tiles in one group, " f"NO inter flushes", size=10, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) # Tail w = DMA_W_NS * px_per_ns _rect_band(slide, cur_x, tl_y, w, tl_h, C_TAIL, COL_RED) _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3, "tail", size=8, bold=True, color=COL_RED) _textbox(slide, cur_x - 0.3, tl_y + tl_h + 0.02, w + 0.6, 0.28, "(excluded\nfrom wall)", size=8, bold=True, color=COL_RED) # Wall arrow wall_end_x = margin + (HEAD_NS + (visible_first + visible_last) * T_STAGE + ellipsis_ns_equiv) * px_per_ns _arrow(slide, margin, tl_y + tl_h + 0.55, wall_end_x, tl_y + tl_h + 0.55, color=COL_TEXT_DARK, width_pt=1.5) _arrow(slide, wall_end_x, tl_y + tl_h + 0.55, margin, tl_y + tl_h + 0.55, color=COL_TEXT_DARK, width_pt=1.5) _textbox(slide, margin, tl_y + tl_h + 0.6, wall_end_x - margin, 0.3, f"wall = {WALL} ns", size=11, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER) # Formula evaluation formula_y = 5.7 _textbox(slide, 0.4, formula_y, 12.6, 0.35, f"wall = head + N_tiles × T_stage + (N_mn − 1) × T_dma_w = " f"{HEAD_NS} + {TOTAL_TILES}×{T_STAGE} + 0 = {WALL} ns", size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) _textbox(slide, 0.4, formula_y + 0.35, 12.6, 0.35, f"GEMM useful time = {TOTAL_TILES}×{T_STAGE}×100 % = {GEMM_TOTAL} ns", size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) _textbox(slide, 0.4, formula_y + 0.7, 12.6, 0.45, f"Useful efficiency = {GEMM_TOTAL} / {WALL} = {USEFUL_EFF:.1f} %", size=14, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT) _textbox(slide, 0.4, formula_y + 1.15, 12.6, 0.35, "Long K-loop, one output group → head amortized over 48 GEMM " "tiles, no flush penalty. Approaches 100 % as K grows.", size=10, color=COL_MUTED, align=PP_ALIGN.LEFT) _RENDERERS.update({ "stage_breakdown_ref_ref": _render_stage_breakdown_ref_ref, "stage_breakdown_load_ref": _render_stage_breakdown_load_ref, "stage_breakdown_load_load": _render_stage_breakdown_load_load, "variant_comparison": _render_variant_comparison, "hbm_topology": _render_hbm_topology, "per_op_dma": _render_per_op_dma, "mac_utilization": _render_mac_utilization, "mac_utilization_ref_ref": _render_mac_utilization_ref_ref, "mac_utilization_formula_vs_measured": _render_mac_utilization_formula_vs_measured, "tflops_table": _render_tflops_table, "pipeline_eff_walkthrough": _render_pipeline_eff_walkthrough, "pipeline_eff_walkthrough_largeK": _render_pipeline_eff_walkthrough_largeK, }) def build(): prs = Presentation() prs.slide_width = Inches(SLIDE_W_IN) prs.slide_height = Inches(SLIDE_H_IN) blank = prs.slide_layouts[6] for i, cfg in enumerate(SLIDES, start=1): slide = prs.slides.add_slide(blank) _add_title(slide, cfg["title"]) if "render" in cfg: # Shape-drawn slide (sequence diagram / pipeline structure). _RENDERERS[cfg["render"]](slide) else: # Default: image on the left (8.4 in wide), bullets on the right. _add_image_centered( slide, cfg["image"], left_in=0.3, top_in=1.05, max_w_in=8.3, max_h_in=5.9, ) _add_bullets( slide, cfg["bullets"], left_in=8.8, top_in=1.2, width_in=4.3, height_in=5.7, ) _add_footer(slide, i, len(SLIDES)) OUT.parent.mkdir(parents=True, exist_ok=True) prs.save(OUT) print(f"wrote {OUT}") if __name__ == "__main__": build()