kernbench2/scripts/build_overview_slides.py

"""Generate a multi-slide PPTX summarizing the kernbench2 model.

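Slides:
  1. Overall architecture — how PEs are connected (cube_mesh_view)
  2. Model correctness — DMA vs P2P latency (pe2pe overview)
  3. PE-to-PE IPCQ communication (ipcq_two_pe_dma)
  4. 6-device allreduce — model vs theoretical vs ext-sim (overview_broken)
  5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (buffer_kind_sweep)
  6. PE_accelerator data path (composite GEMM pipeline structure)
  7. PE_SCHEDULER — plan generation and tile dispatch
  8. matmul(32, 128, 32) — composite GEMM execution sequence
  9. matmul(32, 128, 128) — pipeline scaling and HBM contention
  10. Tiling walkthrough 32×128×32 — K-loop only, no inter-(m,n) flush
  11. Tiling walkthrough 32×128×128 — K-loop and inter-(m,n) flushes
  12. GEMM sweep — stage wall-clock (load_ref)
  13. Why DMA isn't local — cube-shared HBM path
  14. GEMM utilization + useful pipeline efficiency (load_ref)
  15. GEMM utilization + useful pipeline efficiency (ref_ref)
  16. Pipeline efficiency walkthrough — 32×128×128 (with inter flushes)
  17. Pipeline efficiency walkthrough — 32×3072×32 (large K, no flushes)
  18. Useful pipelined efficiency (ideal pipeline × GEMM util)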

This is a derived-artifact generator — no production code touched.
"""
from __future__ import annotations

from pathlib import Path

from PIL import Image
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.enum.shapes import MSO_SHAPE
from pptx.enum.text import PP_ALIGN
from pptx.util import Emu, Inches, Pt

# Directory two levels above this script's resolved path (the parent of the
# directory that contains this file).
ROOT = Path(__file__).resolve().parent.parent
# Directory the slide images referenced in SLIDES are read from.
DIAG = ROOT / "docs" / "diagrams"
# Presumably the path the generated deck is saved to; the save call is not
# in this part of the file — TODO confirm.
OUT = DIAG / "kernbench2_overview.pptx"

# 16:9 widescreen — 13.333 × 7.5 in
# Slide size in inches; the layout helpers below take positions in inches too.
SLIDE_W_IN = 13.333
SLIDE_H_IN = 7.5

# Slide manifest, in deck order. Every entry has a "title"; beyond that an
# entry takes one of two shapes:
#   * "image" + "bullets" — a PNG path under DIAG plus a list of bullet strings
#   * "render"            — a string key for a shape-drawn slide; presumably
#                           mapped to the matching _render_* function by code
#                           outside this section (TODO confirm)
# Titles carry their own "N." prefix, so inserting or reordering entries means
# renumbering the titles by hand.
SLIDES = [
    {
        "title": "1. CUBE Architecture: NOC Router Mesh + PE Connectivity",
        "image": DIAG / "cube_mesh_view.png",
        "bullets": [
            "Each CUBE holds an 8-PE NOC mesh wired through routers (R0..R7)",
            "Every PE has IO_CPU, M_CPU, PE_CPU + IPCQ engine + DMA engine",
            "Inter-cube traffic exits via UCIe/UAL ports; SIPs stitch into ring/torus/mesh",
            "Foundation for every latency, IPCQ, and allreduce experiment that follows",
        ],
    },
    {
        "title": "2. Model Correctness: DMA vs P2P Latency Sweep",
        "image": DIAG / "pe2pe_latency_plots" / "overview.png",
        "bullets": [
            "Sweeps payload size across PE-to-PE paths and compares to DMA",
            "Confirms the simulator reproduces the expected DMA/P2P crossover",
            "Acts as the per-hop ground truth that feeds collective-level models",
        ],
    },
    {
        "title": "3. IPCQ: How Two PEs Communicate (DMA + Slot Memory)",
        "image": DIAG / "ipcq_diagram_plots" / "ipcq_two_pe_dma.png",
        "bullets": [
            "Sender pushes payload through PE_DMA → fabric → receiver IPCQ slot",
            "Slot memory (TCM/SRAM/HBM) charges a write on arrival, a read on consume",
            "Credit return rides the fabric path back (16 B packet, no slot-IO)",
            "This is the building block the multi-device allreduce composes",
        ],
    },
    {
        "title": "4. 6-Device Allreduce: Model vs Theoretical vs External Simulator",
        "image": DIAG / "allreduce_latency_plots" / "overview_broken.png",
        "bullets": [
            "Three SIP topologies (ring / torus / mesh) swept 16 B → 96 KB per PE",
            "Dashed red curve: hand-derived theoretical model for torus_2d (6 SIPs)",
            "Top panel (broken y-axis): single-device reduce on ext-sim ≈ 366 µs",
            "Our 6-device collective lands at ~17–22 µs — ~17× faster than ext-sim baseline",
        ],
    },
    {
        "title": "5. IPCQ Slot Memory: TCM vs SRAM vs HBM",
        "image": DIAG / "allreduce_latency_plots" / "buffer_kind_sweep.png",
        "bullets": [
            "Same allreduce with slot memory swapped: TCM (per-PE local) / SRAM / HBM (cube-shared, behind router link)",
            "Cost = NoC drain + slot-IO + PE↔bank hop; only TCM skips the bank hop",
            "Topology link BWs set the order: SRAM bank link 128 GB/s is the narrowest in the system, HBM 256 GB/s",
            "At 64 KB / PE: TCM 12.0 µs < HBM 21.4 µs < SRAM 24.3 µs — SRAM is slowest because of its narrow bank link",
        ],
    },
    {
        "title": "6. PE_accelerator Data Path: Composite GEMM Pipeline",
        "render": "pipeline_structure",
    },
    {
        "title": "7. PE_SCHEDULER: Plan Generation & Tile Dispatch",
        "render": "scheduler",
    },
    {
        "title": "8. matmul(32, 128, 32) — Composite GEMM Execution Sequence",
        "render": "sequence_32x128x32",
    },
    {
        "title": "9. matmul(32, 128, 128) — Pipeline Scaling & HBM Contention",
        "render": "sequence_32x128x128",
    },
    {
        "title": "10. Tiling Walkthrough: 32×128×32 — K-loop Only, No Inter-(m,n) Flush",
        "render": "tiling_32x128x32",
    },
    {
        "title": "11. Tiling Walkthrough: 32×128×128 — K-loop & Inter-(m,n) Flushes",
        "render": "tiling_32x128x128",
    },
    {
        "title": "12. GEMM Sweep — Stage Wall-Clock (load_ref)",
        "render": "stage_breakdown_load_ref",
    },
    {
        "title": "13. Why DMA Isn't Local: Cube-Shared HBM Path",
        "render": "hbm_topology",
    },
    {
        "title": "14. GEMM Utilization + Useful Pipeline Efficiency (load_ref)",
        "render": "mac_utilization",
    },
    {
        "title": "15. GEMM Utilization + Useful Pipeline Efficiency (ref_ref — both A & B via DMA_R)",
        "render": "mac_utilization_ref_ref",
    },
    {
        "title": "16. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)",
        "render": "pipeline_eff_walkthrough",
    },
    {
        "title": "17. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)",
        "render": "pipeline_eff_walkthrough_largeK",
    },
    {
        "title": "18. Useful Pipelined Efficiency (ideal pipeline × GEMM util)",
        "render": "tflops_table",
    },
]


# ── Palette for the shape-drawn slides ─────────────────────────────────────

# Text, neutral and accent colours.
COL_TEXT_DARK = RGBColor(0x1E, 0x29, 0x3B)
COL_TEXT_LIGHT = RGBColor(0xFF, 0xFF, 0xFF)
COL_MUTED = RGBColor(0x47, 0x55, 0x69)
COL_RED = RGBColor(0xDC, 0x26, 0x26)
COL_STORAGE = RGBColor(0xE2, 0xE8, 0xF0)
COL_STORAGE_STROKE = RGBColor(0x47, 0x55, 0x69)
# Engine fills: pe_dma, pe_fetch_store and pe_gemm shapes respectively.
COL_DMA = RGBColor(0x3B, 0x82, 0xF6)   # blue
COL_FS = RGBColor(0x10, 0xB9, 0x81)    # emerald
COL_GEMM = RGBColor(0xF5, 0x9E, 0x0B)  # amber
# Background tints for the HBM / TCM / RegFile memory shapes.
COL_HBM_BG = RGBColor(0xDB, 0xEA, 0xFE)
COL_TCM_BG = RGBColor(0xD1, 0xFA, 0xE5)
COL_REG_BG = RGBColor(0xFE, 0xF3, 0xC7)
# Borders: HBM and TCM reuse the DMA and fetch/store engine colours.
COL_HBM_BORDER = COL_DMA
COL_TCM_BORDER = COL_FS
COL_REG_BORDER = RGBColor(0xD9, 0x77, 0x06)


def _add_title(slide, text):
    """Add the slide heading as a bold 26 pt textbox and return that textbox.

    The box starts 0.4 in from the left and 0.25 in from the top, is 0.7 in
    tall, and ends 0.4 in short of the right slide edge. All four internal
    text-frame margins are set to zero.
    """
    title_box = slide.shapes.add_textbox(
        Inches(0.4), Inches(0.25), Inches(SLIDE_W_IN - 0.8), Inches(0.7),
    )
    frame = title_box.text_frame
    for side in ("margin_left", "margin_right", "margin_top", "margin_bottom"):
        setattr(frame, side, Emu(0))

    heading = frame.paragraphs[0].add_run()
    heading.text = text
    font = heading.font
    font.size = Pt(26)
    font.bold = True
    font.color.rgb = RGBColor(0x10, 0x2A, 0x55)
    return title_box


def _add_image_centered(slide, img_path, *, left_in, top_in, max_w_in, max_h_in):
    """Place an image scaled to fit, centred inside a bounding box.

    The bounding box has its top-left corner at (left_in, top_in) and measures
    max_w_in × max_h_in, all in inches. One uniform scale factor — the smaller
    of the width fit and the height fit — is applied to both pixel
    dimensions, so the aspect ratio is kept.
    """
    # Only the pixel size is needed; the file is closed again right away.
    with Image.open(img_path) as picture:
        px_w, px_h = picture.size

    box_w = Inches(max_w_in)
    box_h = Inches(max_h_in)
    factor = min(box_w / px_w, box_h / px_h)
    pic_w = int(px_w * factor)
    pic_h = int(px_h * factor)

    # Split the leftover space evenly on each axis.
    x = Inches(left_in) + (box_w - pic_w) // 2
    y = Inches(top_in) + (box_h - pic_h) // 2
    slide.shapes.add_picture(str(img_path), x, y, width=pic_w, height=pic_h)


def _add_bullets(slide, bullets, *, left_in, top_in, width_in, height_in):
    """Add a word-wrapped textbox holding one paragraph per bullet string.

    Each paragraph is a single 15 pt run prefixed with "• ", at indent level 0
    and followed by 6 pt of spacing. Position and size are in inches.
    """
    frame = slide.shapes.add_textbox(
        Inches(left_in), Inches(top_in), Inches(width_in), Inches(height_in),
    ).text_frame
    frame.word_wrap = True

    for position, bullet in enumerate(bullets):
        # The first bullet goes into paragraphs[0]; the rest are appended.
        if position:
            para = frame.add_paragraph()
        else:
            para = frame.paragraphs[0]
        para.level = 0
        entry = para.add_run()
        entry.text = "• " + bullet
        entry.font.size = Pt(15)
        entry.font.color.rgb = RGBColor(0x22, 0x22, 0x22)
        para.space_after = Pt(6)


def _add_footer(slide, idx, total):
    """Add a small grey "idx / total" label near the bottom-right corner."""
    footer = slide.shapes.add_textbox(
        Inches(SLIDE_W_IN - 1.2), Inches(SLIDE_H_IN - 0.45),
        Inches(1.0), Inches(0.3),
    )
    counter = footer.text_frame.paragraphs[0].add_run()
    counter.text = f"{idx} / {total}"
    counter.font.size = Pt(10)
    counter.font.color.rgb = RGBColor(0x88, 0x88, 0x88)


# ── Shape-drawing primitives for the diagram slides ────────────────────────

def _txt(box, text, size=11, bold=False, color=COL_TEXT_DARK,
         align=PP_ALIGN.CENTER):
    """Write ``text`` into the text frame of ``box``, one paragraph per line.

    ``text`` is split on "\\n". Each piece becomes a single Consolas run with
    the given point size, weight and colour, in a paragraph aligned per
    ``align``. The frame is set to word-wrap with small fixed margins.
    """
    frame = box.text_frame
    frame.word_wrap = True
    frame.margin_left = frame.margin_right = Emu(18000)
    frame.margin_top = frame.margin_bottom = Emu(9000)

    for row, content in enumerate(text.split("\n")):
        # The first line goes into paragraphs[0]; later lines are appended.
        para = frame.add_paragraph() if row else frame.paragraphs[0]
        para.alignment = align
        para.text = ""  # reset the paragraph before adding the styled run
        piece = para.add_run()
        piece.text = content
        font = piece.font
        font.size = Pt(size)
        font.bold = bold
        font.name = "Consolas"
        font.color.rgb = color


def _textbox(slide, x, y, w, h, text, **kw):
    """Add a textbox at (x, y) sized w × h inches, fill it via ``_txt``, return it.

    Extra keyword arguments are forwarded to ``_txt`` unchanged.
    """
    left, top, width, height = (Inches(v) for v in (x, y, w, h))
    shape = slide.shapes.add_textbox(left, top, width, height)
    _txt(shape, text, **kw)
    return shape


def _rrect(slide, x, y, w, h, fill, stroke, text="", **kw):
    """Add a rounded rectangle (inches) with a solid fill and 1.3 pt outline.

    When ``text`` is non-empty it is written into the shape with ``_txt``,
    which receives ``**kw``; with empty text ``**kw`` is ignored. Returns the
    shape.
    """
    shape = slide.shapes.add_shape(
        MSO_SHAPE.ROUNDED_RECTANGLE,
        *(Inches(v) for v in (x, y, w, h)),
    )
    shape.fill.solid()
    shape.fill.fore_color.rgb = fill
    outline = shape.line
    outline.color.rgb = stroke
    outline.width = Pt(1.3)
    if not text:
        return shape
    _txt(shape, text, **kw)
    return shape


def _rect_band(slide, x, y, w, h, fill, stroke):
    """Add a plain rectangle (inches) with a solid fill and 1.0 pt outline; return it."""
    left, top, width, height = (Inches(v) for v in (x, y, w, h))
    band = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, left, top, width, height)
    band.fill.solid()
    band.fill.fore_color.rgb = fill
    outline = band.line
    outline.color.rgb = stroke
    outline.width = Pt(1.0)
    return band


def _arrow(slide, x1, y1, x2, y2, color=COL_MUTED, width_pt=1.5):
    """Draw a connector from (x1, y1) to (x2, y2), in inches, with an arrowhead.

    The arrowhead is a medium-sized triangle set on the line's ``a:tailEnd``
    element, which is written straight into the line XML. Returns the
    connector.
    """
    from lxml import etree
    from pptx.oxml.ns import qn

    # NOTE(review): connector type 2 is believed to be the elbow type in
    # python-pptx's MSO_CONNECTOR_TYPE (1 being straight) — confirm intent.
    connector = slide.shapes.add_connector(
        2, Inches(x1), Inches(y1), Inches(x2), Inches(y2),
    )
    stroke = connector.line
    stroke.color.rgb = color
    stroke.width = Pt(width_pt)

    # Reuse an existing <a:tailEnd> if there is one, otherwise append it.
    line_xml = stroke._get_or_add_ln()
    tail_tag = qn("a:tailEnd")
    tail_end = line_xml.find(tail_tag)
    if tail_end is None:
        tail_end = etree.SubElement(line_xml, tail_tag)
    for attr, value in (("type", "triangle"), ("w", "med"), ("len", "med")):
        tail_end.set(attr, value)
    return connector


def _vline_dashed(slide, x, y1, y2, color, width_pt=2.0):
    """Draw a dashed vertical line at ``x`` from ``y1`` to ``y2`` (inches).

    Args:
        slide: Slide whose shape collection receives the connector.
        x: Horizontal position of the line, in inches.
        y1: Start of the line, in inches from the top.
        y2: End of the line, in inches from the top.
        color: Line colour (an ``RGBColor``).
        width_pt: Line width in points.

    Returns:
        The straight connector that was added.
    """
    # Imported locally, like the other XML/enum helpers in this file, so the
    # module-level import block stays untouched.
    from pptx.enum.dml import MSO_LINE
    from pptx.enum.shapes import MSO_CONNECTOR

    conn = slide.shapes.add_connector(
        MSO_CONNECTOR.STRAIGHT, Inches(x), Inches(y1), Inches(x), Inches(y2),
    )
    conn.line.color.rgb = color
    conn.line.width = Pt(width_pt)
    # Public LineFormat API: writes <a:prstDash val="dash"/> in schema order,
    # replacing the hand-built element on the private _get_or_add_ln() node.
    conn.line.dash_style = MSO_LINE.DASH
    return conn


# ── Slide 6: Pipeline structure ────────────────────────────────────────────

def _render_pipeline_structure(slide):
    """Vertical hardware datapath.

    HBM at top → DMA_in → GEMM Unit (FETCH + RegFile inside) → DMA_out
    (off-page to HBM, not drawn). TCM is on the right, vertical, acting
    as the staging buffer that DMA_in writes into, FETCH reads from, and
    that the GEMM unit STOREs back into before DMA_out drains it.

    Args:
        slide: Slide the shapes are added to; it is handed straight to the
            ``_textbox`` / ``_rrect`` / ``_arrow`` helpers.

    All positions and sizes are literal values in inches. Shapes are created
    in statement order (boxes first, wires last), which presumably also fixes
    their stacking order in the deck, so keep the sequence when editing.
    """
    # Read-side wires share the DMA blue; write-side wires get their own orange.
    READ_COLOR = COL_DMA
    WRITE_COLOR = RGBColor(0xEA, 0x58, 0x0C)

    _textbox(slide, 0.4, 1.0, 12.6, 0.5,
             "Vertical flow. HBM → DMA_in → TCM → GEMM Unit (FETCH + "
             "RegFile) → TCM → DMA_out → HBM. TCM sits on the side as the "
             "staging buffer between the DMA engines and the GEMM unit.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # ── Center column: HBM → DMA_in → GEMM Unit → DMA_out ────────────
    # col_cx is the column's centre line; every box in it is centred on it.
    col_cx = 3.5
    box_w = 3.0
    box_x = col_cx - box_w / 2
    box_h = 0.50

    # Top edges of the stacked boxes; DMA_out and the off-page arrow end are
    # derived from the GEMM container so they follow if its height changes.
    y_hbm     = 1.65
    y_dma_in  = 2.55
    y_gemm    = 3.45
    gemm_h    = 2.40
    y_dma_out = y_gemm + gemm_h + 0.30
    out_arr_end_y = y_dma_out + box_h + 0.40

    # HBM
    _rrect(slide, box_x, y_hbm, box_w, box_h,
           COL_HBM_BG, COL_HBM_BORDER,
           "HBM   (off-chip, 256 GB/s)",
           size=11, bold=True, color=COL_HBM_BORDER)

    # DMA_in
    _rrect(slide, box_x, y_dma_in, box_w, box_h,
           COL_DMA, COL_TEXT_DARK,
           "DMA_in",
           size=13, bold=True, color=COL_TEXT_LIGHT)

    # GEMM Unit container (slightly wider than DMA boxes)
    # Empty text means _rrect skips _txt, so size/color are unused here; the
    # container title is the separate left-aligned textbox that follows.
    gemm_x = box_x - 0.7
    gemm_w = box_w + 1.4
    _rrect(slide, gemm_x, y_gemm, gemm_w, gemm_h,
           RGBColor(0xFF, 0xFB, 0xEB),
           COL_GEMM, "", size=10, color=COL_GEMM)
    _textbox(slide, gemm_x + 0.20, y_gemm + 0.08, gemm_w - 0.4, 0.32,
             "GEMM Unit",
             size=14, bold=True, color=COL_GEMM, align=PP_ALIGN.LEFT)

    # Inside GEMM Unit: FETCH (top), RegFile (below), MAC label (bottom)
    sub_h = 0.50
    inner_x = gemm_x + 0.5
    inner_w = gemm_w - 1.0

    fetch_y = y_gemm + 0.55
    reg_y   = fetch_y + sub_h + 0.30

    _rrect(slide, inner_x, fetch_y, inner_w, sub_h,
           COL_FS, COL_TEXT_DARK,
           "FETCH unit",
           size=11, bold=True, color=COL_TEXT_LIGHT)
    _rrect(slide, inner_x, reg_y, inner_w, sub_h,
           COL_REG_BG, COL_REG_BORDER,
           "RegFile  (A, B, C accumulator)",
           size=11, bold=True, color=COL_REG_BORDER)

    # MAC annotation (text-only) — implicit MAC operation on RegFile
    mac_label_y = reg_y + sub_h + 0.05
    _textbox(slide, inner_x, mac_label_y, inner_w, 0.32,
             "↻  MAC accumulate   (32 × 64 × 32 array)",
             size=11, bold=True, color=COL_GEMM, align=PP_ALIGN.CENTER)

    # FETCH → RegFile (internal, vertical down)
    # The ±0.02 offsets keep the arrow ends just clear of the box edges.
    inner_cx = inner_x + inner_w / 2
    _arrow(slide, inner_cx, fetch_y + sub_h + 0.02,
           inner_cx, reg_y - 0.02,
           color=COL_FS, width_pt=1.8)

    # DMA_out (below GEMM Unit)
    _rrect(slide, box_x, y_dma_out, box_w, box_h,
           WRITE_COLOR, COL_TEXT_DARK,
           "DMA_out",
           size=13, bold=True, color=COL_TEXT_LIGHT)

    # ── TCM on the right (tall vertical) ─────────────────────────────
    # Spans from just above DMA_in to just below DMA_out so that every
    # horizontal wire drawn below can terminate on its left edge.
    tcm_x = 9.4
    tcm_w = 2.6
    tcm_y = y_dma_in - 0.05
    tcm_h = (y_dma_out + box_h + 0.05) - tcm_y
    _rrect(slide, tcm_x, tcm_y, tcm_w, tcm_h,
           COL_TCM_BG, COL_TCM_BORDER, "",
           size=12, color=COL_TCM_BORDER)
    # Label is a separate textbox centred vertically within the TCM box.
    _textbox(slide, tcm_x + 0.1, tcm_y + tcm_h / 2 - 0.6,
             tcm_w - 0.2, 1.2,
             "TCM\n\n(PE-local SRAM,\n512 GB/s)",
             size=14, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.CENTER)

    # ── Wires ───────────────────────────────────────────────────────
    # HBM → DMA_in (vertical down)
    _arrow(slide, col_cx, y_hbm + box_h + 0.02, col_cx, y_dma_in - 0.02,
           color=READ_COLOR, width_pt=2.5)
    _textbox(slide, col_cx + 0.15, (y_hbm + box_h + y_dma_in) / 2 - 0.10,
             1.5, 0.22, "DMA_R", size=10, bold=True,
             color=READ_COLOR, align=PP_ALIGN.LEFT)

    # DMA_in → TCM (horizontal right, at DMA_in y)
    dma_in_cy = y_dma_in + box_h / 2
    _arrow(slide, box_x + box_w + 0.02, dma_in_cy,
           tcm_x - 0.02, dma_in_cy,
           color=READ_COLOR, width_pt=2.5)
    # mid_x: midpoint between the DMA boxes' right edge and the TCM's left
    # edge; reused for the "read from TCM" label further down.
    mid_x = (box_x + box_w + tcm_x) / 2
    _textbox(slide, mid_x - 1.4, dma_in_cy - 0.32,
             2.8, 0.22, "store to TCM",
             size=10, bold=True, color=READ_COLOR, align=PP_ALIGN.CENTER)

    # TCM → FETCH (horizontal left, into FETCH inside GEMM Unit)
    fetch_right = inner_x + inner_w
    fetch_cy = fetch_y + sub_h / 2
    _arrow(slide, tcm_x - 0.02, fetch_cy,
           fetch_right + 0.02, fetch_cy,
           color=COL_FS, width_pt=2.5)
    _textbox(slide, (fetch_right + tcm_x) / 2 - 1.4, fetch_cy - 0.32,
             2.8, 0.22, "FETCH  (TCM → Reg)",
             size=10, bold=True, color=COL_FS, align=PP_ALIGN.CENTER)

    # GEMM (RegFile) → TCM (STORE, horizontal right exit at RegFile y)
    # reg_right equals fetch_right: both inner boxes share inner_x / inner_w.
    reg_right = inner_x + inner_w
    store_cy = reg_y + sub_h / 2
    _arrow(slide, reg_right + 0.02, store_cy,
           tcm_x - 0.02, store_cy,
           color=WRITE_COLOR, width_pt=2.5)
    _textbox(slide, (reg_right + tcm_x) / 2 - 1.6, store_cy + 0.08,
             3.2, 0.22, "STORE  (Reg → TCM, after last K)",
             size=10, bold=True, color=WRITE_COLOR, align=PP_ALIGN.CENTER)

    # TCM → DMA_out (horizontal left at DMA_out y)
    dma_out_cy = y_dma_out + box_h / 2
    _arrow(slide, tcm_x - 0.02, dma_out_cy,
           box_x + box_w + 0.02, dma_out_cy,
           color=WRITE_COLOR, width_pt=2.5)
    _textbox(slide, mid_x - 1.4, dma_out_cy - 0.32,
             2.8, 0.22, "read from TCM",
             size=10, bold=True, color=WRITE_COLOR, align=PP_ALIGN.CENTER)

    # DMA_out → HBM (off-page; just an arrow + label, no HBM box at bottom)
    _arrow(slide, col_cx, y_dma_out + box_h + 0.02,
           col_cx, out_arr_end_y - 0.02,
           color=WRITE_COLOR, width_pt=2.5)
    _textbox(slide, col_cx + 0.15, out_arr_end_y - 0.30,
             3.0, 0.22, "DMA_W → HBM",
             size=10, bold=True, color=WRITE_COLOR, align=PP_ALIGN.LEFT)


# ── Slide 7: PE_SCHEDULER ──────────────────────────────────────────────────

def _render_scheduler(slide):
    """Draw the PE_SCHEDULER slide.

    Layout, all in inches: a PE_CPU box centred above a purple PE_SCHEDULER
    container holding the CompositeCmd / generate_plan description, a stack
    of four engine boxes to the container's right with two annotation lines
    beneath it, one arrow from PE_CPU into the container, one colour-coded
    arrow from the container to each engine, and a full-width note of key
    invariants at the bottom.
    """
    purple_fill = RGBColor(0xF3, 0xE8, 0xFF)
    purple = RGBColor(0x7C, 0x3A, 0xED)
    math_grey = RGBColor(0x9C, 0xA3, 0xAF)

    _textbox(slide, 0.4, 1.0, 12.6, 0.5,
             "Sole dispatcher inside a PE. CompositeCmd is expanded into a "
             "TilePlan and fed tile-by-tile in FIFO order; each TileToken "
             "self-routes through the pipeline stages.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # Scheduler container geometry (left side of the slide).
    box_x, box_y = 0.6, 2.45
    box_w, box_h = 4.6, 2.4

    # PE_CPU sits centred above the container.
    cpu_w, cpu_h = 2.0, 0.50
    cpu_x = box_x + box_w / 2 - cpu_w / 2
    cpu_y = 1.65
    cpu_cx = cpu_x + cpu_w / 2
    _rrect(slide, cpu_x, cpu_y, cpu_w, cpu_h,
           COL_STORAGE, COL_STORAGE_STROKE, "PE_CPU",
           size=12, bold=True, color=COL_TEXT_DARK)

    # Container outline, then its title as a separate left-aligned textbox.
    _rrect(slide, box_x, box_y, box_w, box_h,
           purple_fill, purple, "",
           size=10, color=purple)
    _textbox(slide, box_x + 0.15, box_y + 0.08, box_w - 0.30, 0.32,
             "PE_SCHEDULER",
             size=13, bold=True, color=purple, align=PP_ALIGN.LEFT)

    # Plan description: plain text inside the container, no inner box.
    body_x = box_x + 0.25
    body_y = box_y + 0.55
    body_w = box_w - 0.50
    _textbox(slide, body_x, body_y, body_w, 0.30,
             "CompositeCmd → generate plan",
             size=12, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    plan_text = "\n".join([
        "generate_plan(M, K, N)",
        "  → ⌈M/32⌉ × ⌈K/64⌉ × ⌈N/32⌉ tiles",
        "  each tile:",
        "    [DMA_R, FETCH, GEMM,",
        "     STORE, DMA_W] stages",
    ])
    _textbox(slide, body_x + 0.10, body_y + 0.40, body_w - 0.15,
             box_h - 1.05, plan_text,
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # Engine stack to the right of the container.
    stack_x = box_x + box_w + 0.40
    stack_w = 2.4
    stack_y = box_y
    row_h = 0.45
    row_gap = 0.18
    engine_rows = [
        ("pe_dma",         COL_DMA,     COL_TEXT_LIGHT),
        ("pe_fetch_store", COL_FS,      COL_TEXT_LIGHT),
        ("pe_gemm",        COL_GEMM,    COL_TEXT_DARK),
        ("pe_math",        math_grey,   COL_TEXT_LIGHT),
    ]
    # Top edge of each engine box; shared by the boxes and the fan-out arrows.
    row_tops = [stack_y + i * (row_h + row_gap)
                for i in range(len(engine_rows))]
    for (label, fill, text_col), top in zip(engine_rows, row_tops):
        _rrect(slide, stack_x, top, stack_w, row_h,
               fill, COL_TEXT_DARK, label,
               size=12, bold=True, color=text_col)

    # Two annotation lines directly under the engine stack.
    stack_bottom = stack_y + len(engine_rows) * (row_h + row_gap) - row_gap
    note_y = stack_bottom + 0.15
    _textbox(slide, stack_x, note_y, stack_w, 0.28,
             "↻ TileToken.advance()",
             size=10, bold=True, color=purple, align=PP_ALIGN.CENTER)
    _textbox(slide, stack_x, note_y + 0.28, stack_w, 0.28,
             "PipelineContext counts tiles",
             size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # PE_CPU → scheduler arrow and its label.
    _arrow(slide, cpu_cx, cpu_y + cpu_h + 0.02,
           cpu_cx, box_y - 0.02,
           color=COL_TEXT_DARK, width_pt=2.5)
    _textbox(slide, cpu_x + cpu_w + 0.10,
             (cpu_y + cpu_h + box_y) / 2 - 0.12,
             3.0, 0.22, "PeInternalTxn(cmd)",
             size=11, bold=True, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # Fan-out: one arrow per engine from the container's right-edge midpoint.
    fan_x = box_x + box_w
    fan_y = box_y + box_h / 2
    for (_label, fill, _text_col), top in zip(engine_rows, row_tops):
        _arrow(slide, fan_x + 0.02, fan_y,
               stack_x - 0.02, top + row_h / 2,
               color=fill, width_pt=1.5)

    # Full-width note at the bottom of the slide.
    _textbox(slide, 0.4, 6.10, 12.6, 0.85,
             "Key invariants: (1) FIFO across commands via the single "
             "feeder process — no inter-command tile interleaving. "
             "(2) TileToken carries its own plan; each engine reads "
             "token.current_stage, advances stage_idx, and forwards to "
             "the next stage's component. (3) PipelineContext."
             "complete_tile() fires done_event on the last tile, "
             "unblocking PE_CPU.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)


# ── Slides 8 & 9: Sequence diagrams ────────────────────────────────────────

# Op kinds and their visual placement.
# Keyed by the `kind` string of each op tuple handed to _draw_sequence_panel:
#   "fill" — fill colour of the op box and of its arrows
#   "band" — where the box is drawn: "between_hbm_tcm" / "between_tcm_reg"
#            centre it in the gap between those two memory bands; any other
#            value (here "regfile") centres it on the RegFile band
#   "dir"  — "down" or "up" draws a pair of arrows through the box in that
#            direction; None draws no arrows
_KIND_INFO = {
    "dma_read":  {"fill": COL_DMA,  "band": "between_hbm_tcm", "dir": "down"},
    "fetch":     {"fill": COL_FS,   "band": "between_tcm_reg", "dir": "down"},
    "gemm":      {"fill": COL_GEMM, "band": "regfile",          "dir": None},
    "store":     {"fill": COL_FS,   "band": "between_tcm_reg", "dir": "up"},
    "dma_write": {"fill": COL_DMA,  "band": "between_hbm_tcm", "dir": "up"},
}


def _draw_sequence_panel(slide, ops, *, x_left=0.6, x_right=12.9,
                         y_top=1.7, panel_h=4.4,
                         tile_boundary_after=None,
                         tile_labels=None,
                         contention_after=None):
    """Generic sequence-diagram drawer.

    Draws three horizontal memory bands (HBM, TCM, RegFile), one column per
    op with a "#step" caption, a coloured op box and direction arrows, then
    the optional tile divider and contention callout, and finally the engine
    legend. All coordinates are in inches.

    Args:
        slide: Slide the shapes are added to.
        ops: list of (step, name, kind, bytes_str, ns_str); ``kind`` must be
            a key of ``_KIND_INFO``.
        x_left, x_right: horizontal extent of the memory bands.
        y_top: top edge of the HBM band.
        panel_h: distance from the top of the HBM band to the bottom of the
            RegFile band.
        tile_boundary_after: number of op columns to the left of the red
            dashed divider; None draws no divider.
        tile_labels: captions for the divider, left side first then right
            side. Fewer than two entries draws only the captions supplied;
            entries beyond the second are ignored. Only used when
            ``tile_boundary_after`` is set.
        contention_after: number of op columns to the left of the point the
            HBM-contention callout's arrow aims at; None draws no callout.

    Raises:
        KeyError: if an op's ``kind`` is not in ``_KIND_INFO``.
    """
    n_ops = len(ops)
    band_x = x_left
    band_w = x_right - x_left
    band_h = 0.55
    hbm_y = y_top
    tcm_y = y_top + panel_h * 0.5 - band_h / 2
    reg_y = y_top + panel_h - band_h

    # Memory bands first, then their labels on top (same order as before).
    for band_top, band_fill, band_border in (
        (hbm_y, COL_HBM_BG, COL_HBM_BORDER),
        (tcm_y, COL_TCM_BG, COL_TCM_BORDER),
        (reg_y, COL_REG_BG, COL_REG_BORDER),
    ):
        _rect_band(slide, band_x, band_top, band_w, band_h,
                   band_fill, band_border)
    for band_top, label_w, band_name, band_border in (
        (hbm_y, 0.8, "HBM", COL_HBM_BORDER),
        (tcm_y, 0.8, "TCM", COL_TCM_BORDER),
        (reg_y, 0.85, "RegFile", COL_REG_BORDER),
    ):
        _textbox(slide, band_x + 0.05, band_top, label_w, band_h, band_name,
                 size=12, bold=True, color=band_border, align=PP_ALIGN.LEFT)

    # Op columns span from after the level labels (~0.9 in margin) to right edge
    ops_left = band_x + 1.0
    ops_w = band_w - 1.1
    col_w = ops_w / max(n_ops, 1)  # max() guards the empty-ops case
    op_box_w = col_w * 0.86
    op_box_h = 0.70

    hbm_bot = hbm_y + band_h
    tcm_top = tcm_y
    tcm_bot = tcm_y + band_h
    reg_top = reg_y
    # Vertical midpoints of the HBM–TCM and TCM–RegFile gaps.
    gap_ht = (hbm_bot + tcm_top) / 2
    gap_tr = (tcm_bot + reg_top) / 2

    for idx, (step, name, kind, byt, ns) in enumerate(ops):
        info = _KIND_INFO[kind]
        x_center = ops_left + idx * col_w + col_w / 2
        x = x_center - op_box_w / 2
        # Step number
        _textbox(slide, x, hbm_y - 0.35, op_box_w, 0.25,
                 f"#{step}", size=9, bold=True, color=COL_TEXT_DARK)
        if info["band"] == "between_hbm_tcm":
            y_box = gap_ht - op_box_h / 2
        elif info["band"] == "between_tcm_reg":
            y_box = gap_tr - op_box_h / 2
        else:
            y_box = reg_y + band_h / 2 - op_box_h / 2
        text_color = COL_TEXT_LIGHT if kind != "gemm" else COL_TEXT_DARK
        label = f"{name}\n{byt}  {ns}"
        _rrect(slide, x, y_box, op_box_w, op_box_h,
               info["fill"], COL_TEXT_DARK, label,
               size=8, bold=True, color=text_color)
        # Arrows: one into the op box and one out of it, between the two
        # bands the op moves data across.
        if info["dir"] == "down":
            src = hbm_bot if info["band"] == "between_hbm_tcm" else tcm_bot
            dst = tcm_top if info["band"] == "between_hbm_tcm" else reg_top
            _arrow(slide, x_center, src, x_center, y_box,
                   color=info["fill"], width_pt=1.6)
            _arrow(slide, x_center, y_box + op_box_h, x_center, dst,
                   color=info["fill"], width_pt=1.6)
        elif info["dir"] == "up":
            src = reg_top if info["band"] == "between_tcm_reg" else tcm_top
            dst = tcm_bot if info["band"] == "between_tcm_reg" else hbm_bot
            _arrow(slide, x_center, src, x_center, y_box + op_box_h,
                   color=info["fill"], width_pt=1.6)
            _arrow(slide, x_center, y_box, x_center, dst,
                   color=info["fill"], width_pt=1.6)

    # Tile boundary
    if tile_boundary_after is not None:
        bx = ops_left + tile_boundary_after * col_w
        _vline_dashed(slide, bx, hbm_y - 0.05, reg_y + band_h + 0.05,
                      COL_RED, width_pt=2.0)
        # (x, width, alignment) of the caption slot left / right of the line.
        caption_slots = (
            (bx - 1.7, 1.6, PP_ALIGN.RIGHT),
            (bx + 0.1, 1.9, PP_ALIGN.LEFT),
        )
        # zip() stops at the shorter input, so a one-element tile_labels
        # draws only the left caption instead of raising IndexError.
        for caption, (cap_x, cap_w, cap_align) in zip(tile_labels or (),
                                                      caption_slots):
            _textbox(slide, cap_x, hbm_y - 0.65, cap_w, 0.3,
                     caption, size=10, bold=True,
                     color=COL_RED, align=cap_align)

    # HBM contention callout
    if contention_after is not None:
        cx = ops_left + contention_after * col_w
        cy = hbm_y + band_h + 0.05
        callout_w = 2.6
        callout_h = 0.7
        # Centre the callout on cx, clamped 0.5 in inside the band's ends.
        callout_x = max(band_x + 0.5, cx - callout_w / 2)
        callout_x = min(callout_x, band_x + band_w - callout_w - 0.5)
        _rrect(slide, callout_x, cy + 0.05, callout_w, callout_h,
               RGBColor(0xFE, 0xE2, 0xE2), COL_RED,
               "HBM contention:\nDMA_WRITE out + next DMA_READ A,B\ncompete for HBM BW",
               size=8, bold=True, color=COL_RED)
        _arrow(slide, callout_x + callout_w / 2, cy + 0.05,
               cx, hbm_y + band_h * 0.5,
               color=COL_RED, width_pt=1.8)

    # Engine legend (below the panel): colour swatch followed by its caption.
    # Offsets are kept as the original literals so positions do not drift.
    legend_y = reg_y + band_h + 0.6
    legend = (
        (0.1, 0.4, 2.4, COL_DMA, "pe_dma (HBM↔TCM)"),
        (3.0, 3.3, 3.0, COL_FS, "pe_fetch_store (TCM↔RegFile)"),
        (6.6, 6.9, 2.6, COL_GEMM, "pe_gemm (MAC compute)"),
    )
    for swatch_dx, text_dx, text_w, swatch_fill, caption in legend:
        _rect_band(slide, band_x + swatch_dx, legend_y, 0.25, 0.22,
                   swatch_fill, COL_TEXT_DARK)
        _textbox(slide, band_x + text_dx, legend_y - 0.05, text_w, 0.35,
                 caption, size=10, color=COL_TEXT_DARK,
                 align=PP_ALIGN.LEFT)


def _draw_composite_setup_block(slide):
    """Draw the purple "PLAN-GEN + FEED (setup)" block ahead of the op columns.

    The block stands for the scheduler-side delay between PE_CPU issuing the
    CompositeCmd and the first DMA_R firing: plan generation, FIFO feeder
    enqueue, and the per-cmd scheduler overhead. A "scheduler" caption goes
    under the block and an arrow leads from its right edge towards the first
    op column.
    """
    violet = RGBColor(0x7C, 0x3A, 0xED)
    lavender = RGBColor(0xF3, 0xE8, 0xFF)
    left, top = 0.65, 2.45
    width, height = 0.90, 1.05
    mid_y = top + height / 2

    _rrect(slide, left, top, width, height,
           lavender, violet,
           "PLAN-GEN\n+ FEED\n(setup)",
           size=8, bold=True, color=violet)
    _textbox(slide, left, top + height + 0.05, width, 0.22,
             "scheduler",
             size=8, color=violet, align=PP_ALIGN.CENTER)
    # The arrow ends at x = 1.62, just past the sequence panel's default
    # ops_left (band_x + 1.0 = 1.6).
    _arrow(slide, left + width + 0.02, mid_y,
           1.62, mid_y,
           color=violet, width_pt=1.5)


def _render_sequence_32x128x32(slide):
    """Draw the matmul(32, 128, 32) execution-sequence slide.

    Adds the load_ref explanation at the top, an eight-step sequence panel
    with the scheduler setup block in front of it, an "A pinned" annotation
    at the left of the panel, and a closing note about pipeline balance.
    """
    intro = (
        "load_ref assumption: A (32×128) is pre-loaded into TCM via "
        "tl.load before the kernel starts; only B is DMA_R'd per "
        "tile. FETCH can start as soon as the first DMA_R(B) finishes "
        "— A is already in TCM. Scheduler tile = 32×64×32 → 1·1·2 = "
        "2 tiles. The PLAN-GEN/FEED block is the scheduler-side "
        "setup delay before the first DMA fires."
    )
    _textbox(slide, 0.4, 1.05, 12.6, 0.65, intro,
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # load_ref: no DMA_R for A; only B per tile. Step numbers are added
    # below, counting from 1 in list order.
    timeline = [
        ("DMA_R B (tile 0)",  "dma_read",  "4 KB", "~16 ns"),
        ("FETCH (tile 0)",    "fetch",     "8 KB", "~16 ns"),
        ("DMA_R B (tile 1)",  "dma_read",  "4 KB", "~16 ns"),
        ("GEMM K=0 (accum)",  "gemm",      "—",    "~17 ns"),
        ("FETCH (tile 1)",    "fetch",     "8 KB", "~16 ns"),
        ("GEMM K=1 (last)",   "gemm",      "—",    "~17 ns"),
        ("STORE final",       "store",     "2 KB", "~4 ns"),
        ("DMA_W out",         "dma_write", "2 KB", "~8 ns"),
    ]
    ops = [(step, *row) for step, row in enumerate(timeline, start=1)]
    _draw_sequence_panel(slide, ops)
    _draw_composite_setup_block(slide)

    # "A pinned in TCM" annotation overlaid on the TCM band, left side
    _textbox(slide, 0.65, 3.92, 1.6, 0.22,
             "[ A pinned via tl.load ]",
             size=9, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.LEFT)

    closing = (
        "Pipeline is balanced (DMA, FETCH, GEMM all ~16 ns/tile) — "
        "DMA engine carries only B, so per-tile DMA cost halves vs "
        "ref_ref. Wall = setup + head_latency + N_tiles·T_stage + "
        "final STORE+DMA_W."
    )
    _textbox(slide, 0.4, 6.55, 12.6, 0.45, closing,
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)


def _render_sequence_32x128x128(slide):
    """Draw the matmul(32, 128, 128) execution-sequence slide (load_ref).

    Shows the first 12 ops: the full (m,n)=(0,0) execution followed by
    the start of (0,1), with a tile boundary and a contention marker
    requested after op 9. Two footers follow: the HBM half-duplex caveat
    and a note that the pattern repeats for the remaining (m,n) pairs.
    """
    header = (
        "load_ref assumption: A (32×128) is pre-loaded into TCM via "
        "tl.load before the kernel starts; only B is DMA_R'd per "
        "tile. FETCH starts as soon as the corresponding B arrives — "
        "A is already in TCM. Scheduler tile = 32×64×32 → 1·4·2 = 8 "
        "tiles. PLAN-GEN/FEED block = scheduler-side setup delay "
        "before the first DMA."
    )
    hbm_caveat = (
        "HBM half-duplex caveat: real HBM channels can't read and write "
        "simultaneously. DMA_W out₀₀ competes for HBM bandwidth with "
        "DMA_R(B) of (0,1). Simulator currently models PE_DMA read / "
        "write as separate resources (full-duplex) — flag for revisit "
        "if half-duplex matters."
    )
    repeat_note = (
        "Pattern repeats for (m,n) = (0,1), (0,2), (0,3). DMA engine "
        "only carries B per tile, so the pipeline stays balanced — "
        "no DMA bottleneck like ref_ref."
    )

    # load_ref: only B per tile, A pre-loaded. DMA_R scattered across timeline.
    steps = [
        ("DMA_R B (0,0,0)", "dma_read", "4 KB", "~16 ns"),
        ("FETCH (0,0,0)", "fetch", "8 KB", "~16 ns"),
        ("DMA_R B (0,0,1)", "dma_read", "4 KB", "~16 ns"),
        ("GEMM K=0", "gemm", "—", "~17 ns"),
        ("FETCH (0,0,1)", "fetch", "8 KB", "~16 ns"),
        ("DMA_R B (0,1,0)", "dma_read", "4 KB", "~16 ns"),
        ("GEMM K=1 last", "gemm", "—", "~17 ns"),
        ("STORE out₀₀", "store", "2 KB", "~4 ns"),
        ("DMA_W out₀₀", "dma_write", "2 KB", "~8 ns"),
        ("FETCH (0,1,0)", "fetch", "8 KB", "~16 ns"),
        ("DMA_R B (0,1,1)", "dma_read", "4 KB", "~16 ns"),
        ("GEMM K=0 (0,1)", "gemm", "—", "~17 ns"),
    ]
    # Prefix every step with its 1-based sequence number.
    ops = [(number, *step) for number, step in enumerate(steps, start=1)]
    tile_labels = [
        "── (m,n)=(0,0) full execution ──",
        "── (0,1) starts ──",
    ]

    _textbox(slide, 0.4, 1.05, 12.6, 0.70, header,
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)
    _draw_sequence_panel(slide, ops,
                         tile_boundary_after=9,
                         tile_labels=tile_labels,
                         contention_after=9)
    _draw_composite_setup_block(slide)

    # "A pinned in TCM" annotation overlaid on the TCM band
    _textbox(slide, 0.65, 3.92, 1.6, 0.22, "[ A pinned via tl.load ]",
             size=9, bold=True, color=COL_TCM_BORDER, align=PP_ALIGN.LEFT)

    _textbox(slide, 0.4, 6.55, 12.6, 0.4, hbm_caveat,
             size=10, color=COL_RED, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, 6.95, 12.6, 0.3, repeat_note,
             size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)


# ── Slides 10 & 11: Tiling walkthroughs ────────────────────────────────────

def _draw_matrix_tiles(slide, *, x, y, n_rows, n_cols, tile_w, tile_h,
                       fill, border, label_prefix, label_fmt=None,
                       title=None, title_size=11):
    """Draw an ``n_rows`` × ``n_cols`` grid of labelled tiles.

    The grid's top-left corner is at (x, y) and every tile is
    ``tile_w`` × ``tile_h``. When ``title`` is given it is centred in a
    strip 0.27 above the grid. ``border`` is used both as the tile
    outline and as the label colour.

    Labels come from ``label_fmt(r, c)`` when supplied; otherwise a
    single-row grid is labelled ``{label_prefix}{c}`` and any other grid
    ``{label_prefix}{r}{c}``.

    Returns:
        ``(right_x, bottom_y)`` — the grid's right and bottom edges.
    """
    grid_w = n_cols * tile_w
    grid_h = n_rows * tile_h
    single_row = n_rows == 1

    def _row_only_label(r, c):
        return f"{label_prefix}{c}"

    def _row_col_label(r, c):
        return f"{label_prefix}{r}{c}"

    # Resolve the labelling rule once instead of once per tile.
    if label_fmt is not None:
        make_label = label_fmt
    elif single_row:
        make_label = _row_only_label
    else:
        make_label = _row_col_label
    font_size = 10 if single_row else 9

    if title is not None:
        _textbox(slide, x, y - 0.27, grid_w, 0.22,
                 title, size=title_size, bold=True,
                 color=COL_TEXT_DARK, align=PP_ALIGN.CENTER)

    for row in range(n_rows):
        top = y + row * tile_h
        for col in range(n_cols):
            _rrect(slide, x + col * tile_w, top,
                   tile_w, tile_h,
                   fill, border, make_label(row, col),
                   size=font_size, bold=True, color=border)

    return x + grid_w, y + grid_h


def _render_tiling_32x128x32(slide):
    """32×128×32 — K-loop only, single (m,n) → NO inter-(m,n) flush.

    The slide has four parts, top to bottom: an intro line, the tiled
    matrix picture ``C = A · B`` with its formula, a one-pair execution
    timeline (K=0 accumulate, then K=1 with the final STORE + DMA_W),
    and a closing note on why a larger K amortises the single drain.
    """
    amber_fill = RGBColor(0xFE, 0xF3, 0xC7)
    blue_fill = RGBColor(0xDB, 0xEA, 0xFE)
    green_fill = RGBColor(0xD1, 0xFA, 0xE5)

    intro = (
        "Scheduler tile = 32×64×32 → 1·2·1 = 2 tiles. Only ONE (m,n) "
        "output → the K-loop accumulates entirely in RegFile, and "
        "STORE + DMA_W fire just once at the very end. No inter-(m,n) "
        "flush — DMA never has to drain mid-compute."
    )
    formula = (
        "C  =  A0·B0  +  A1·B1     (K-loop, 2 iterations — "
        "accumulator stays in RegFile)"
    )
    bottom_note = (
        "Why growing K helps: each (m,n) pair amortises its single "
        "STORE+DMA_W over K_tiles iterations of pure compute. With "
        "N=32 (one N-tile), there is no NEXT (m,n) pair, so no "
        "inter-pair flush at all. Pipeline efficiency is bottlenecked "
        "only by head latency and the final drain."
    )

    _textbox(slide, 0.4, 1.0, 12.6, 0.55, intro,
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # ── Matrix picture: C = A · B ────────────────────────────────────
    tile_w = 0.60
    tile_h = 0.60
    matrix_top = 2.10

    # C (1 × 1) — single output tile
    c_x = 1.6
    c_y = matrix_top
    _draw_matrix_tiles(slide, x=c_x, y=c_y, n_rows=1, n_cols=1,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=amber_fill, border=COL_REG_BORDER,
                       label_prefix="C", title="C  (32 × 32)")

    # "=" sign between C and A
    _textbox(slide, c_x + tile_w + 0.10, c_y - 0.05, 0.40, 0.70,
             "=", size=26, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # A (1 × 2) — 2 K-tiles
    a_x = c_x + tile_w + 0.60
    a_y = matrix_top
    _draw_matrix_tiles(slide, x=a_x, y=a_y, n_rows=1, n_cols=2,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=blue_fill, border=COL_HBM_BORDER,
                       label_prefix="A", title="A  (32 × 128)")
    _textbox(slide, a_x, a_y + tile_h + 0.05, 2 * tile_w, 0.22,
             "← K (2 tiles) →",
             size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # "·" sign between A and B
    _textbox(slide, a_x + 2 * tile_w + 0.10, c_y - 0.10, 0.30, 0.70,
             "·", size=28, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # B (2 × 1) — 2 K-tiles, 1 N-tile; shifted up half a tile so the
    # two-row column is vertically centred on the A / C row.
    b_x = a_x + 2 * tile_w + 0.55
    b_y = matrix_top - tile_h / 2
    _draw_matrix_tiles(slide, x=b_x, y=b_y, n_rows=2, n_cols=1,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=blue_fill, border=COL_HBM_BORDER,
                       label_prefix="B", title="B  (128 × 32)",
                       label_fmt=lambda r, c: f"B{r}")
    _textbox(slide, b_x - 0.45, b_y + tile_h - 0.10, 0.40, 0.30,
             "K\n↓", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # Formula line under the matrices
    below_matrices = matrix_top + 2 * tile_h
    _textbox(slide, 0.4, below_matrices + 0.30, 12.6, 0.40, formula,
             size=13, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # ── Timeline: single (m,n) pair, NO flush ───────────────────────
    tl_y = below_matrices + 1.10
    _textbox(slide, 0.4, tl_y - 0.30, 12.6, 0.22,
             "Execution timeline — 1 (m,n) output, 0 inter-(m,n) flushes:",
             size=12, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.LEFT)

    pair_w = 4.50
    pair_h = 0.85
    pair_x = (SLIDE_W_IN - pair_w) / 2  # centre the pair horizontally
    k0_w = pair_w * 0.45
    k1_w = pair_w * 0.53

    _rrect(slide, pair_x, tl_y, k0_w, pair_h,
           green_fill, COL_TCM_BORDER,
           "K=0  (accumulate into RegFile)",
           size=11, bold=True, color=COL_TCM_BORDER)
    _rrect(slide, pair_x + k0_w + 0.02, tl_y, k1_w, pair_h,
           amber_fill, COL_REG_BORDER,
           "K=1 last\nSTORE + DMA_W  (final drain)",
           size=11, bold=True, color=COL_REG_BORDER)
    _textbox(slide, pair_x, tl_y + pair_h + 0.05, pair_w, 0.25,
             "(m,n)=(0,0)  →  C   (single output tile)",
             size=11, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)
    # "no flush" callout to the right of the pair
    _textbox(slide, pair_x + pair_w + 0.20, tl_y + 0.15, 2.5, 0.55,
             "✓ No inter-(m,n) flush\n   (only 1 output pair)",
             size=11, bold=True, color=COL_TCM_BORDER,
             align=PP_ALIGN.LEFT)

    # ── Bottom note ─────────────────────────────────────────────────
    _textbox(slide, 0.4, 6.30, 12.6, 0.70, bottom_note,
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)


def _render_tiling_32x128x128(slide):
    """32×128×128 — K-loop + 3 inter-(m,n) flushes (N_tiles=4 → 4 pairs).

    Top to bottom: an intro line, the tiled matrix picture ``C = A · B``
    with its formula, a timeline of four (m,n) pairs separated by three
    FLUSH boxes, and a closing note on why the flushes cost bandwidth.
    """
    amber_fill = RGBColor(0xFE, 0xF3, 0xC7)
    blue_fill = RGBColor(0xDB, 0xEA, 0xFE)
    green_fill = RGBColor(0xD1, 0xFA, 0xE5)
    red_fill = RGBColor(0xFE, 0xE2, 0xE2)

    intro = (
        "Scheduler tile = 32×64×32 → 1·2·4 = 8 tiles. A is split along "
        "K (2 tiles); B along K and N (2×4); C along N (4). For each "
        "(m,n) the K-loop accumulates in RegFile; STORE + DMA_W fire "
        "only on last K → 3 inter-(m,n) flushes between the 4 pairs."
    )
    formula = (
        "Cn  =  A0·B0n  +  A1·B1n     (K-loop, 2 iters per (m,n) — "
        "accumulator stays in RegFile)"
    )
    bottom_note = (
        "Why flushes hurt: at every (m,n) boundary, DMA_W of Cn "
        "competes with DMA_R of the next pair for the cube-shared "
        "HBM channel. Inter-flush count = (M_tiles · N_tiles − 1) — "
        "for 32×128×128 that's 1·4−1 = 3 flushes. Bigger N → more "
        "flushes; bigger K alone (with small N) → none."
    )

    _textbox(slide, 0.4, 1.0, 12.6, 0.55, intro,
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # ── Matrix picture: C = A · B ────────────────────────────────────
    tile_w = 0.55
    tile_h = 0.55
    matrix_top = 2.10

    # C (1 × 4)
    c_x = 1.50
    c_y = matrix_top
    _draw_matrix_tiles(slide, x=c_x, y=c_y, n_rows=1, n_cols=4,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=amber_fill, border=COL_REG_BORDER,
                       label_prefix="C", title="C  (32 × 128)")
    _textbox(slide, c_x, c_y + tile_h + 0.05, 4 * tile_w, 0.22,
             "← N (4 tiles) →",
             size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # "=" sign between C and A
    eq_x = c_x + 4 * tile_w + 0.20
    _textbox(slide, eq_x, c_y - 0.05, 0.40, 0.70,
             "=", size=26, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # A (1 × 2)
    a_x = eq_x + 0.55
    a_y = matrix_top
    _draw_matrix_tiles(slide, x=a_x, y=a_y, n_rows=1, n_cols=2,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=blue_fill, border=COL_HBM_BORDER,
                       label_prefix="A", title="A  (32 × 128)")
    _textbox(slide, a_x, a_y + tile_h + 0.05, 2 * tile_w, 0.22,
             "← K (2 tiles) →",
             size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # "·" sign between A and B
    dot_x = a_x + 2 * tile_w + 0.15
    _textbox(slide, dot_x, c_y - 0.10, 0.30, 0.70,
             "·", size=28, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # B (2 × 4); shifted up half a tile so its two rows are centred on
    # the single row of A and C.
    b_x = dot_x + 0.45
    b_y = matrix_top - tile_h / 2
    _draw_matrix_tiles(slide, x=b_x, y=b_y, n_rows=2, n_cols=4,
                       tile_w=tile_w, tile_h=tile_h,
                       fill=blue_fill, border=COL_HBM_BORDER,
                       label_prefix="B", title="B  (128 × 128)")
    _textbox(slide, b_x, b_y + 2 * tile_h + 0.05, 4 * tile_w, 0.22,
             "← N (4 tiles) →",
             size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)
    _textbox(slide, b_x - 0.45, b_y + tile_h - 0.10, 0.40, 0.30,
             "K\n↓", size=9, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # Formula line under the matrices
    below_matrices = matrix_top + 2 * tile_h
    _textbox(slide, 0.4, below_matrices + 0.30, 12.6, 0.40, formula,
             size=13, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # ── Timeline: 4 (m,n) pairs + 3 inter-pair flushes ──────────────
    tl_y = below_matrices + 1.10
    _textbox(slide, 0.4, tl_y - 0.30, 12.6, 0.22,
             "Execution timeline — 4 (m,n) outputs, 3 inter-(m,n) flushes:",
             size=12, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.LEFT)

    pair_w = 1.85
    pair_h = 0.75
    flush_w = 0.65
    total_w = 4 * pair_w + 3 * flush_w
    start_x = (SLIDE_W_IN - total_w) / 2  # centre the whole strip

    # Widths of the two halves of a pair do not change between pairs.
    k0_w = pair_w * 0.45
    k1_w = pair_w * 0.53
    last_pair = 3

    pair_x = start_x
    for n in range(4):
        _rrect(slide, pair_x, tl_y, k0_w, pair_h,
               green_fill, COL_TCM_BORDER,
               "K=0\n(accum)",
               size=9, bold=True, color=COL_TCM_BORDER)
        _rrect(slide, pair_x + k0_w + 0.02, tl_y, k1_w, pair_h,
               amber_fill, COL_REG_BORDER,
               "K=1 last\nSTORE +\nDMA_W",
               size=9, bold=True, color=COL_REG_BORDER)
        _textbox(slide, pair_x, tl_y + pair_h + 0.05, pair_w, 0.22,
                 f"(0,{n}) → C{n}",
                 size=10, bold=True, color=COL_TEXT_DARK,
                 align=PP_ALIGN.CENTER)
        if n == last_pair:
            # Nothing follows the final pair, so no flush box is drawn.
            break
        flush_x = pair_x + pair_w + 0.05
        _rrect(slide, flush_x, tl_y, flush_w - 0.10, pair_h,
               red_fill, COL_RED,
               "FLUSH\n(DMA_W\n vs DMA_R)",
               size=8, bold=True, color=COL_RED)
        pair_x = flush_x + flush_w - 0.05

    # ── Bottom note ────────────────────────────────────────────────
    _textbox(slide, 0.4, 6.30, 12.6, 0.70, bottom_note,
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)


# Renderer key → function that draws that slide's native shapes.
# How the keys are looked up is outside this section — presumably from
# the slide specs (verify against the caller).
_RENDERERS = {
    "pipeline_structure": _render_pipeline_structure,
    "scheduler": _render_scheduler,
    "sequence_32x128x32": _render_sequence_32x128x32,
    "sequence_32x128x128": _render_sequence_32x128x128,
    "tiling_32x128x32": _render_tiling_32x128x32,
    "tiling_32x128x128": _render_tiling_32x128x128,
}


# ── Bar-chart slides (read from docs/diagrams/gemm_sweep.json) ─────────────

import json

# Sweep results consumed by the bar-chart slides; the slides' own
# "no data" message points at scripts/gemm_sweep.py as the producer.
GEMM_SWEEP_JSON = DIAG.joinpath("gemm_sweep.json")


def _under_tile(M, K, N, tile_M, tile_K, tile_N):
    """Return True when any of M, K, N is smaller than its tile size."""
    dims = (M, K, N)
    tiles = (tile_M, tile_K, tile_N)
    return any(dim < tile for dim, tile in zip(dims, tiles))

# Stages plotted by the stage-breakdown charts, in bar order.
STAGE_KEYS = [
    "DMA_READ",
    "FETCH",
    "GEMM",
    "DMA_WRITE",
]

# Legend text per stage key. MATH has a display name but is not one of
# the plotted STAGE_KEYS.
STAGE_DISPLAY = dict(
    DMA_READ="DMA in",
    FETCH="Fetch",
    GEMM="GEMM",
    MATH="Math",
    DMA_WRITE="DMA out",
)
# Bar fill per stage key (passed as colors_map by the stage-breakdown chart).
STAGE_COLORS_PPTX = {
    "DMA_READ": RGBColor(0x3B, 0x82, 0xF6),
    "FETCH": RGBColor(0x10, 0xB9, 0x81),
    "GEMM": RGBColor(0xF5, 0x9E, 0x0B),
    "MATH": RGBColor(0xEF, 0x44, 0x44),
    "DMA_WRITE": RGBColor(0xA8, 0x55, 0xF7),
}

# Bar fill per kernel variant (passed as colors_map by the variant chart).
VARIANT_COLORS_PPTX = {
    "ref_ref": RGBColor(0x10, 0xB9, 0x81),
    "load_ref": RGBColor(0xF5, 0x9E, 0x0B),
    "load_load": RGBColor(0xEF, 0x44, 0x44),
}


def _shape_label(r: dict) -> str:
    """Return the axis label for a sweep row's GEMM shape.

    Cubic shapes collapse to a single ``M=K=N=<n>`` line; any other
    shape is written as three lines, one per dimension.
    """
    m, k, n = r["M"], r["K"], r["N"]
    if m == k == n:
        return f"M=K=N={m}"
    return "\n".join((f"M={m}", f"K={k}", f"N={n}"))


def _draw_native_bar_chart(slide, *, plot_x, plot_y, plot_w, plot_h,
                           shape_labels, flagged, tile_counts,
                           series, colors_map, display_map,
                           wall_clocks=None,
                           y_label="ns",
                           legend_x, legend_w,
                           foot_note=None,
                           threshold_line=None,
                           flagged_bar_color=None,
                           flagged_series_only=None):
    """Render a grouped bar chart natively in PPTX. Linear Y scale.

    All positions and sizes are in inches (they are wrapped in
    ``Inches`` when shapes are created). Nothing is drawn when
    ``shape_labels`` is empty.

    Args:
        slide: slide the shapes are added to.
        plot_x, plot_y, plot_w, plot_h: plot-area box.
        shape_labels: one label per bar group, drawn under the group.
        flagged: one bool per group; a flagged group gets a red bold
            label and an "under-tile" marker below it.
        tile_counts: one tile count per group, drawn under the label;
            skipped entirely when falsy.
        series: dict[str -> list[float]] — series_name → value per
            group. Values <= 0 produce no bar.
        colors_map: series_name → fill colour.
        display_map: series_name → legend text (falls back to the name).
        wall_clocks: optional value per group drawn as a dot centred on
            the group; values <= 0 are skipped.
        y_label: Y-axis title.
        legend_x, legend_w: left edge and width of the legend column.
        foot_note: optional text drawn below the chart.
        threshold_line: if set, draws a dashed horizontal reference line
            at this y-value, labelled "<value>% peak".
        flagged_bar_color: if set, overrides colors_map[sname] for
            flagged groups.
        flagged_series_only: if set, the flag override applies only to
            this series.
    """
    n_shapes = len(shape_labels)
    if n_shapes == 0:
        return
    series_names = list(series.keys())
    n_series = len(series_names)

    # Y-axis range: linear, top = max value * 1.10 (include wall-clock).
    all_vals = [v for vals in series.values() for v in vals if v > 0]
    if wall_clocks:
        all_vals.extend([w for w in wall_clocks if w > 0])
    # Only a positive threshold may set the range: a zero or negative one
    # on an otherwise empty chart would make y_max <= 0 and break y_of().
    if threshold_line is not None and threshold_line > 0:
        all_vals.append(threshold_line)
    y_max = max(all_vals) * 1.10 if all_vals else 1.0

    def y_of(v):
        # Map a data value to a slide Y coordinate (negative values are
        # clamped to the baseline).
        v = max(v, 0.0)
        return plot_y + plot_h * (1 - v / y_max)

    # Plot box background
    _rect_band(slide, plot_x, plot_y, plot_w, plot_h,
               RGBColor(0xFF, 0xFF, 0xFF), RGBColor(0xCB, 0xD5, 0xE1))

    # Y-axis ticks (6 levels)
    for i in range(6):
        v = y_max * i / 5
        y = y_of(v)
        # gridline
        s = slide.shapes.add_connector(1, Inches(plot_x), Inches(y),
                                       Inches(plot_x + plot_w), Inches(y))
        s.line.color.rgb = RGBColor(0xE2, 0xE8, 0xF0)
        s.line.width = Pt(0.5)
        # label
        _textbox(slide, plot_x - 0.85, y - 0.12, 0.75, 0.25,
                 f"{v:>8.0f}", size=9, color=COL_MUTED, align=PP_ALIGN.RIGHT)

    # Y-axis title
    _textbox(slide, plot_x - 0.85, plot_y + plot_h / 2 - 0.15, 0.75, 0.3,
             y_label, size=10, color=COL_TEXT_DARK)

    # Threshold reference line (dashed)
    if threshold_line is not None:
        ty = y_of(threshold_line)
        line = slide.shapes.add_connector(
            1, Inches(plot_x), Inches(ty),
            Inches(plot_x + plot_w), Inches(ty),
        )
        line.line.color.rgb = COL_TEXT_DARK
        line.line.width = Pt(1.5)
        # The dash style is set directly on the line's XML element.
        from pptx.oxml.ns import qn
        from lxml import etree
        ln = line.line._get_or_add_ln()
        pr = ln.find(qn("a:prstDash"))
        if pr is None:
            pr = etree.SubElement(ln, qn("a:prstDash"))
        pr.set("val", "dash")
        _textbox(slide, plot_x + plot_w - 0.7, ty - 0.30, 0.7, 0.25,
                 f"{threshold_line:.0f}% peak",
                 size=9, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.RIGHT)

    # Geometry per shape group: groups take 1/1.4 of the plot width, the
    # rest is split into equal gaps around and between them.
    group_w = plot_w / (n_shapes * 1.4)
    bar_w = group_w / max(n_series, 1)
    gap = (plot_w - n_shapes * group_w) / (n_shapes + 1)

    y_base = plot_y + plot_h
    for i in range(n_shapes):
        x_group = plot_x + gap + i * (group_w + gap)
        cx = x_group + group_w / 2

        for j, sname in enumerate(series_names):
            v = series[sname][i]
            if v <= 0:
                continue
            bx = x_group + j * bar_w
            y_top = y_of(v)
            # Bars are drawn at 85% of their slot to leave a small gap.
            s = slide.shapes.add_shape(
                MSO_SHAPE.RECTANGLE,
                Inches(bx), Inches(y_top),
                Inches(bar_w * 0.85), Inches(y_base - y_top),
            )
            s.fill.solid()
            apply_flag = (flagged_bar_color is not None and flagged[i]
                          and (flagged_series_only is None
                               or sname == flagged_series_only))
            fill_color = flagged_bar_color if apply_flag else colors_map[sname]
            s.fill.fore_color.rgb = fill_color
            s.line.color.rgb = COL_TEXT_DARK
            s.line.width = Pt(0.4)

        # Wall-clock dot
        if wall_clocks and wall_clocks[i] > 0:
            wy = y_of(wall_clocks[i])
            r = 0.05
            d = slide.shapes.add_shape(
                MSO_SHAPE.OVAL,
                Inches(cx - r), Inches(wy - r),
                Inches(r * 2), Inches(r * 2),
            )
            d.fill.solid()
            d.fill.fore_color.rgb = COL_TEXT_DARK
            d.line.color.rgb = COL_TEXT_DARK

        # Shape label below the group
        lab_color = COL_RED if flagged[i] else COL_TEXT_DARK
        _textbox(slide, x_group, y_base + 0.05, group_w, 0.7,
                 shape_labels[i], size=9, bold=flagged[i], color=lab_color)
        if tile_counts:
            _textbox(slide, x_group, y_base + 0.65, group_w, 0.3,
                     f"{tile_counts[i]} tiles",
                     size=8, color=COL_MUTED)
        if flagged[i]:
            _textbox(slide, x_group, y_base + 0.92, group_w, 0.3,
                     "↑ under-tile",
                     size=8, bold=True, color=COL_RED)

    # Legend on the right; the heading is chosen by whether the series
    # are pipeline stages (a DMA_READ series is present) or variants.
    _textbox(slide, legend_x, plot_y, legend_w, 0.3,
             "Stages (per bar):" if "DMA_READ" in series_names
             else "Variants (per bar):",
             size=11, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    ly = plot_y + 0.4
    for sname in series_names:
        # color swatch
        sw = slide.shapes.add_shape(
            MSO_SHAPE.RECTANGLE,
            Inches(legend_x), Inches(ly), Inches(0.2), Inches(0.2),
        )
        sw.fill.solid()
        sw.fill.fore_color.rgb = colors_map[sname]
        sw.line.color.rgb = COL_TEXT_DARK
        sw.line.width = Pt(0.5)
        _textbox(slide, legend_x + 0.28, ly - 0.05, legend_w - 0.3, 0.3,
                 display_map.get(sname, sname),
                 size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
        ly += 0.28

    if wall_clocks:
        ly += 0.08
        d = slide.shapes.add_shape(
            MSO_SHAPE.OVAL,
            Inches(legend_x + 0.05), Inches(ly + 0.04),
            Inches(0.12), Inches(0.12),
        )
        d.fill.solid()
        d.fill.fore_color.rgb = COL_TEXT_DARK
        d.line.color.rgb = COL_TEXT_DARK
        _textbox(slide, legend_x + 0.28, ly - 0.05, legend_w - 0.3, 0.3,
                 "kernel wall-clock",
                 size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)

    # Extra legend entry for flagged (under-tile) bars
    if flagged_bar_color is not None and any(flagged):
        ly += 0.30
        sw = slide.shapes.add_shape(
            MSO_SHAPE.RECTANGLE,
            Inches(legend_x), Inches(ly), Inches(0.2), Inches(0.2),
        )
        sw.fill.solid()
        sw.fill.fore_color.rgb = flagged_bar_color
        sw.line.color.rgb = COL_TEXT_DARK
        sw.line.width = Pt(0.5)
        # Name the entry after the flagged series, else after the first
        # display_map key, else the generic word "value".
        flagged_label_target = flagged_series_only or (
            next(iter(display_map)) if display_map else "value"
        )
        flagged_label = (
            display_map.get(flagged_label_target, flagged_label_target)
            + " (under-tile)"
        )
        _textbox(slide, legend_x + 0.28, ly - 0.05, legend_w - 0.3, 0.3,
                 flagged_label,
                 size=10, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)

    if foot_note:
        _textbox(slide, plot_x, y_base + 1.3, plot_w, 0.4,
                 foot_note, size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)


# (M, K, N) shapes whose rows are dropped from the loaded sweep data.
EXCLUDED_SHAPES = {(512, 512, 512)}


def _load_sweep_data() -> dict:
    """Load the GEMM sweep JSON and drop rows for excluded shapes.

    Returns:
        The parsed JSON object with its ``"rows"`` list filtered so that
        no row's ``(M, K, N)`` is in ``EXCLUDED_SHAPES``. When the sweep
        file does not exist, ``{"rows": []}`` is returned instead, so
        callers can test ``data["rows"]`` before reading other keys.
    """
    if not GEMM_SWEEP_JSON.exists():
        return {"rows": []}
    # JSON is UTF-8; do not depend on the platform's locale encoding.
    data = json.loads(GEMM_SWEEP_JSON.read_text(encoding="utf-8"))
    data["rows"] = [
        r for r in data.get("rows", [])
        if (r["M"], r["K"], r["N"]) not in EXCLUDED_SHAPES
    ]
    return data


def _render_stage_breakdown(slide, variant: str, *, per_tile: bool = False):
    """Draw the per-stage bar chart for one variant on a linear Y axis.

    One bar group per sweep row of ``variant``, one bar per entry of
    ``STAGE_KEYS``. Bar height is the stage's ``wall_ns``; with
    ``per_tile`` it is divided by the row's expected tile count (a
    missing or zero count is treated as 1).

    wall_ns is the interval-union of records on each engine — the honest
    engine-active time. It avoids the double-counting that occupancy_ns
    has when multiple ops overlap on a contended resource (HBM_CTRL
    queue).

    When no rows exist for ``variant`` a red notice is drawn instead.
    """
    data = _load_sweep_data()
    rows = [row for row in data["rows"] if row.get("variant") == variant]
    if not rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 f"No sweep data found for variant '{variant}'. "
                 f"Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return

    tile = data["tile_sizes"]
    if per_tile:
        subtitle_unit = "Per-tile"
        axis_label = "ns/tile"
        foot_note = (
            "Bars = engine wall-clock ÷ tile count (amortized per-tile cost). "
            "Falls with tile count as the pipeline fills."
        )
    else:
        subtitle_unit = "Per-stage"
        axis_label = "ns"
        foot_note = (
            "Bars = engine wall-clock interval (max t_end − min t_start, "
            "merged overlaps). Strips queue-wait double-counting."
        )

    _textbox(slide, 0.4, 1.0, 12.6, 0.45,
             f"Variant: {variant}   |   {subtitle_unit} engine wall-clock "
             f"(linear) — DMA in / Fetch / GEMM / DMA out per shape. "
             f"Tile size {tile['M']}×{tile['K']}×{tile['N']}.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    shape_labels = [_shape_label(row) for row in rows]
    flagged = [
        _under_tile(row["M"], row["K"], row["N"],
                    tile["M"], tile["K"], tile["N"])
        for row in rows
    ]
    tile_counts = [row["tile_count_expected"] for row in rows]

    def stage_ns(row, stage):
        # A stage missing from the row counts as 0.0 and draws no bar.
        wall = row.get("stages", {}).get(stage, {}).get("wall_ns", 0.0)
        if not per_tile:
            return wall
        return wall / (row.get("tile_count_expected", 0) or 1)

    series = {
        stage: [stage_ns(row, stage) for row in rows]
        for stage in STAGE_KEYS
    }

    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=1.65, plot_w=10.0, plot_h=4.45,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=STAGE_COLORS_PPTX,
        display_map=STAGE_DISPLAY,
        wall_clocks=None,
        y_label=axis_label,
        legend_x=11.4, legend_w=1.85,
        foot_note=foot_note,
    )


def _render_stage_breakdown_ref_ref(slide):
    """Draw the per-stage breakdown chart for the ``ref_ref`` variant."""
    _render_stage_breakdown(slide, variant="ref_ref")


def _render_stage_breakdown_load_ref(slide):
    """Draw the per-stage breakdown chart for the ``load_ref`` variant."""
    _render_stage_breakdown(slide, variant="load_ref")


def _render_stage_breakdown_load_load(slide):
    """Draw the per-stage breakdown chart for the ``load_load`` variant."""
    _render_stage_breakdown(slide, variant="load_load")


def _render_variant_comparison(slide):
    """Draw kernel wall-clock per shape, one bar per variant.

    Rows are grouped by ``(M, K, N)`` in first-seen order. Each group
    gets one bar per variant whose height is that row's
    ``pe_window_ns``; a variant with no row for a shape contributes 0.0,
    which the chart skips. The variant list comes from the sweep data's
    ``"variants"`` key, defaulting to ref_ref / load_ref / load_load.

    When the sweep has no rows a red notice is drawn instead.
    """
    data = _load_sweep_data()
    rows = data["rows"]
    if not rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return
    tile = data["tile_sizes"]
    variants = data.get("variants", ["ref_ref", "load_ref", "load_load"])

    # Group by shape (preserve first-seen order).
    by_shape: dict = {}
    for r in rows:
        key = (r["M"], r["K"], r["N"])
        by_shape.setdefault(key, {})[r["variant"]] = r
    shapes = list(by_shape.keys())

    # The label and tile count are read from the first row recorded for
    # each shape.
    first_rows = [next(iter(by_shape[k].values())) for k in shapes]
    shape_labels = [_shape_label(r) for r in first_rows]
    flagged = [_under_tile(k[0], k[1], k[2], tile["M"], tile["K"], tile["N"])
               for k in shapes]
    tile_counts = [r["tile_count_expected"] for r in first_rows]
    series = {
        v: [(by_shape[k].get(v) or {}).get("pe_window_ns", 0.0) for k in shapes]
        for v in variants
    }
    display_map = {v: v for v in variants}

    _textbox(slide, 0.4, 1.0, 12.6, 0.4,
             "Kernel wall-clock per variant per shape (linear). "
             "ref_ref baseline, load_ref pins A, load_load pins both A and B.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=1.6, plot_w=10.0, plot_h=4.5,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=VARIANT_COLORS_PPTX,
        display_map=display_map,
        wall_clocks=None,
        y_label="wall ns",
        legend_x=11.4, legend_w=1.85,
        foot_note=("After Phase 2 fix (gated STORE/DMA_WRITE + pinned operand "
                   "skip): load_ref / load_load are faster than ref_ref."),
    )


def _render_hbm_topology(slide):
    """Show cube-shared HBM_CTRL path: PE → router → HBM_CTRL → HBM.

    Left to right: a cube outline holding a 4 × 2 grid of PEs (each with
    its TCM box beneath) and a router strip on the cube's right edge,
    then the HBM_CTRL box flagged as the bottleneck, then the HBM banks.
    Two takeaway notes are placed side by side at the bottom.
    """
    intro = (
        "DMA reads cross the cube fabric — HBM_CTRL is one per cube "
        "(NOT per PE). All 8 PEs serialize at the controller's single "
        "channel resource. Even one active PE pays the round-trip on "
        "every K-tile miss."
    )
    _textbox(slide, 0.4, 1.0, 12.6, 0.55, intro,
             size=13, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # Cube outline (encloses 8 PEs + their TCMs + the router mesh)
    cube_x, cube_y, cube_w, cube_h = 0.4, 1.75, 6.4, 4.45
    outline = slide.shapes.add_shape(
        MSO_SHAPE.RECTANGLE,
        Inches(cube_x), Inches(cube_y), Inches(cube_w), Inches(cube_h),
    )
    outline.fill.background()
    outline.line.color.rgb = COL_MUTED
    outline.line.width = Pt(1.5)
    _textbox(slide, cube_x + 0.1, cube_y + 0.04, 2.5, 0.3,
             "CUBE (8 PEs share HBM)", size=11, bold=True, color=COL_MUTED,
             align=PP_ALIGN.LEFT)

    # 8 PEs in a 4-cols × 2-rows grid, each with TCM beneath
    pe_w = 1.05
    pe_h = 0.62
    tcm_h = 0.42
    col_gap = 0.18
    row_gap = 0.30
    grid_x0 = cube_x + 0.25
    grid_y0 = cube_y + 0.50
    row_h = pe_h + tcm_h + row_gap
    # (right-edge x, centre y) of the last PE in each row — the points
    # the PE → router arrows start from.
    row_exits = []
    for i in range(8):
        row, col = divmod(i, 4)
        px = grid_x0 + col * (pe_w + col_gap)
        py = grid_y0 + row * row_h
        _rrect(slide, px, py, pe_w, pe_h,
               RGBColor(0xFE, 0xF3, 0xC7), COL_REG_BORDER,
               f"PE{i}", size=11, bold=True, color=COL_TEXT_DARK)
        _rrect(slide, px, py + pe_h + 0.05, pe_w, tcm_h,
               COL_TCM_BG, COL_TCM_BORDER,
               "TCM (local)\n512 GB/s",
               size=8, color=COL_TCM_BORDER)
        if col == 3:
            row_exits.append((px + pe_w, py + pe_h / 2))

    # Router mesh strip on the right edge of the cube
    router_x = cube_x + cube_w - 0.85
    router_y = cube_y + 0.50
    router_w = 0.65
    router_h = cube_h - 0.65
    _rrect(slide, router_x, router_y, router_w, router_h,
           RGBColor(0xDB, 0xEA, 0xFE), COL_DMA,
           "ROUTER\nMESH\n\n256 GB/s\nper link",
           size=10, bold=True, color=COL_DMA)

    # Arrows from each row's last PE → router strip
    for exit_x, exit_y in row_exits:
        _arrow(slide, exit_x + 0.02, exit_y, router_x, exit_y,
               color=COL_DMA, width_pt=1.4)

    # HBM_CTRL just outside the cube on the right, vertically centred
    ctrl_x = cube_x + cube_w + 0.45
    ctrl_w = 1.85
    ctrl_h = 1.55
    ctrl_y = cube_y + cube_h / 2 - ctrl_h / 2
    _rrect(slide, ctrl_x, ctrl_y, ctrl_w, ctrl_h,
           RGBColor(0xFE, 0xE2, 0xE2), COL_RED,
           "HBM_CTRL\n(1 per cube)\n\nread channel\ncap = 1\n",
           size=11, bold=True, color=COL_RED)

    # Bottleneck label under the controller
    _textbox(slide, ctrl_x - 0.1, ctrl_y + ctrl_h + 0.05, ctrl_w + 0.2, 0.35,
             "BOTTLENECK", size=12, bold=True, color=COL_RED,
             align=PP_ALIGN.CENTER)

    # Router strip → HBM_CTRL arrow, with its bandwidth label above it
    link_y = cube_y + cube_h / 2
    router_right = router_x + router_w
    _arrow(slide, router_right, link_y,
           ctrl_x, link_y, color=COL_DMA, width_pt=3.0)
    _textbox(slide, router_right + 0.02,
             link_y - 0.36, ctrl_x - router_right - 0.05, 0.3,
             "256 GB/s", size=10, bold=True, color=COL_DMA,
             align=PP_ALIGN.CENTER)

    # HBM banks on the far right
    hbm_x = ctrl_x + ctrl_w + 0.45
    hbm_w = 1.85
    hbm_h = ctrl_h + 0.35
    hbm_y = ctrl_y - 0.175
    _rrect(slide, hbm_x, hbm_y, hbm_w, hbm_h,
           COL_HBM_BG, COL_HBM_BORDER,
           "HBM BANKS\n(per-cube)\n\n256 GB/s\naggregated",
           size=11, bold=True, color=COL_HBM_BORDER)
    _arrow(slide, ctrl_x + ctrl_w, link_y,
           hbm_x, link_y, color=COL_DMA, width_pt=3.0)

    # Side-by-side key takeaways at the bottom
    _textbox(slide, 0.4, 6.40, 6.3, 0.55,
             "TCM is per-PE local → fetch/store don't contend.\n"
             "HBM_CTRL is cube-shared → every DMA serializes on cap=1 channel.",
             size=11, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 6.9, 6.40, 6.2, 0.55,
             "Per-op DMA cost grows with #in-flight ops even on 1 PE.\n"
             "load_ref/load_load pin operands → 1 HBM trip instead of per K-tile.",
             size=11, color=COL_RED, align=PP_ALIGN.LEFT)


def _render_per_op_dma(slide):
    """Chart the amortized DMA_READ cost per op for each sweep shape.

    The plotted value is the DMA_READ stage's ``wall_ns`` divided by its
    ``record_count``.  ``wall_ns`` is the interval union of all DMA_READ
    records (roughly ``max t_end - min t_start`` when ops overlap), so the
    quotient is the amortized cost of one op inside the DMA window, which
    converges to the bandwidth-bound floor.

    Only ``ref_ref`` and ``load_ref`` are plotted.  ``load_load`` issues its
    DMAs eagerly, outside the composite plan, so they are not recorded with
    the DMA_READ stage type and have nothing to show here.

    When the sweep data has no rows, a hint to run the sweep script is drawn
    instead of the chart.
    """
    sweep = _load_sweep_data()
    records = sweep["rows"]
    if not records:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return
    tile_dims = sweep["tile_sizes"]
    plotted_variants = ("ref_ref", "load_ref")

    # (M, K, N) -> {variant: record}; dict order keeps shapes in the order
    # they first appear in the sweep.
    grouped: dict = {}
    for rec in records:
        per_variant = grouped.setdefault((rec["M"], rec["K"], rec["N"]), {})
        per_variant[rec["variant"]] = rec
    shape_keys = list(grouped)

    # One representative record per shape: whichever variant was seen first.
    representative = {
        dims: next(iter(grouped[dims].values())) for dims in shape_keys
    }

    shape_labels = [_shape_label(representative[dims]) for dims in shape_keys]
    flagged = [
        _under_tile(m, k, n, tile_dims["M"], tile_dims["K"], tile_dims["N"])
        for (m, k, n) in shape_keys
    ]
    tile_counts = [
        representative[dims]["tile_count_expected"] for dims in shape_keys
    ]

    def _amortized_ns(rec):
        # Missing stage or zero records -> 0.0 rather than a division error.
        dma = rec.get("stages", {}).get("DMA_READ", {})
        op_count = dma.get("record_count", 0)
        if not op_count:
            return 0.0
        return dma.get("wall_ns", 0.0) / op_count

    series = {}
    for variant in plotted_variants:
        series[variant] = [
            _amortized_ns(grouped[dims].get(variant) or {"stages": {}})
            for dims in shape_keys
        ]
    # Legend entries are the raw variant names.
    display_map = dict(zip(plotted_variants, plotted_variants))

    explanation = (
        "Amortized per-op cost = (DMA window wall-clock) ÷ (#DMA ops). "
        "Strips out queue-wait double-counting: when N ops overlap, the "
        "window is N·drain_ns, so the average per op = drain_ns ≈ 16 ns "
        "(bandwidth-bound floor at 4 KB ÷ 256 GB/s)."
    )
    _textbox(slide, 0.4, 1.0, 12.6, 0.55, explanation,
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    chart_note = ("Flat ~16-20 ns across shapes confirms the per-op "
                  "transfer is constant — what looked like growing per-op "
                  "cost on slide before was queue wait being absorbed.")
    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=1.7, plot_w=10.0, plot_h=4.4,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=VARIANT_COLORS_PPTX,
        display_map=display_map,
        wall_clocks=None,
        y_label="ns / op",
        legend_x=11.4, legend_w=1.85,
        foot_note=chart_note,
    )


def _render_mac_utilization(slide):
    """Chart GEMM util and Useful eff per shape for the ``load_ref`` variant.

    GEMM util is the shape-fill ratio: useful FLOPs over the FLOPs of all
    tiles that get executed.  Useful eff comes from the analytic pipeline
    model rather than from measured timings:

      wall       = head + N_tiles × T_stage + (N_mn − 1) × DMA_W
      Useful eff = (N_tiles × T_stage × GEMM_util) / wall

    When the sweep data has no rows, a hint to run the sweep script is drawn
    instead of the chart.
    """
    sweep = _load_sweep_data()
    all_rows = sweep["rows"]
    if not all_rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return
    dims = sweep["tile_sizes"]
    tm, tk, tn = dims["M"], dims["K"], dims["N"]
    flops_per_tile = 2 * tm * tk * tn

    # Constants of the analytic pipeline model.
    hbm_bytes_per_ns = 256.0
    bytes_per_elem = 2
    stage_ns = 16.0
    n_stages = 3
    fill_ns = (n_stages - 1) * stage_ns
    flush_ns = (tm * tn * bytes_per_elem) / hbm_bytes_per_ns

    # load_ref rows keyed by shape (a later duplicate replaces an earlier
    # one but keeps its position).
    picked = {
        (row["M"], row["K"], row["N"]): row
        for row in all_rows
        if row["variant"] == "load_ref"
    }
    shape_keys = list(picked)

    shape_labels = [_shape_label(picked[key]) for key in shape_keys]
    flagged = [_under_tile(m, k, n, tm, tk, tn) for (m, k, n) in shape_keys]
    tile_counts = [picked[key]["tile_count_expected"] for key in shape_keys]

    def _ceil_div(a, b):
        return (a + b - 1) // b

    def _metrics(row):
        """Return (GEMM util %, Useful eff %) for one sweep row."""
        m, k, n = row["M"], row["K"], row["N"]
        useful_flops = 2 * m * k * n
        tiles = row["tile_count_expected"]
        util_pct = useful_flops / (flops_per_tile * tiles) * 100

        groups = _ceil_div(m, tm) * _ceil_div(n, tn)
        busy_ns = tiles * stage_ns
        flush_total_ns = max(0, groups - 1) * flush_ns
        wall_ns = fill_ns + busy_ns + flush_total_ns
        if wall_ns > 0:
            eff_pct = (busy_ns * (util_pct / 100.0) / wall_ns) * 100
        else:
            eff_pct = 0.0
        return util_pct, eff_pct

    per_shape = [_metrics(picked[key]) for key in shape_keys]
    gemm_util = [util for util, _ in per_shape]
    useful_eff = [eff for _, eff in per_shape]

    series = {
        "GEMM util":   gemm_util,
        "Useful eff":  useful_eff,
    }
    colors_map = {
        "GEMM util":   COL_FS,                       # emerald
        "Useful eff":  RGBColor(0xF5, 0x9E, 0x0B),   # amber
    }
    display_map = {
        "GEMM util":   "GEMM util %",
        "Useful eff":  "Useful eff %",
    }

    explanation = (
        "GEMM util = useful FLOPs ÷ (tile FLOPs × tile count) — pure "
        "shape-vs-tile metric. "
        "Useful eff = (N_tiles × T_stage × GEMM_util) ÷ wall — "
        "useful FLOPs delivered as a fraction of peak over the "
        "ideal-pipelined wall (head + K-loop + inter-(m,n) DMA_W)."
    )
    _textbox(slide, 0.4, 1.0, 12.6, 0.70, explanation,
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)

    chart_note = ("GEMM util < 100% → shape mismatch (padded zeros). "
                  "Useful eff < GEMM util → pipeline overhead "
                  "(head + inter-(m,n) DMA_W) eats more of the wall.")
    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=1.75, plot_w=10.0, plot_h=4.35,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=colors_map,
        display_map=display_map,
        wall_clocks=None,
        y_label="%",
        legend_x=11.4, legend_w=1.85,
        foot_note=chart_note,
        threshold_line=100.0,
        flagged_bar_color=COL_RED,
        flagged_series_only="GEMM util",
    )


def _render_mac_utilization_ref_ref(slide):
    """Chart GEMM util and Useful eff per shape for the ``ref_ref`` variant.

    Same two metrics as the load_ref utilization slide, with a different
    pipeline model.  In ref_ref the scheduler loads both A and B from HBM,
    i.e. two back-to-back DMA_R per tile, so the DMA stage is modelled at
    2 × 16 = 32 ns/tile while FETCH and GEMM stay at 16 ns/tile.  The
    steady-state cycle is therefore DMA-bound (32 ns/tile) and useful
    efficiency caps near 50 % × GEMM_util.

    When the sweep data has no rows, a hint to run the sweep script is drawn
    instead of the chart.
    """
    sweep = _load_sweep_data()
    all_rows = sweep["rows"]
    if not all_rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return
    dims = sweep["tile_sizes"]
    tm, tk, tn = dims["M"], dims["K"], dims["N"]
    flops_per_tile = 2 * tm * tk * tn

    # Constants of the analytic pipeline model.
    hbm_bytes_per_ns = 256.0
    bytes_per_elem = 2
    compute_stage_ns = 16.0               # FETCH = GEMM = 16 ns/tile
    cycle_ns = 2 * 16.0                   # 2 DMA_R per tile (A + B): DMA-bound
    n_stages = 3                          # DMA, FETCH, GEMM
    fill_ns = (n_stages - 1) * compute_stage_ns   # pipeline fill = 32 ns
    flush_ns = (tm * tn * bytes_per_elem) / hbm_bytes_per_ns

    # ref_ref rows keyed by shape (a later duplicate replaces an earlier
    # one but keeps its position).
    picked = {
        (row["M"], row["K"], row["N"]): row
        for row in all_rows
        if row["variant"] == "ref_ref"
    }
    shape_keys = list(picked)

    shape_labels = [_shape_label(picked[key]) for key in shape_keys]
    flagged = [_under_tile(m, k, n, tm, tk, tn) for (m, k, n) in shape_keys]
    tile_counts = [picked[key]["tile_count_expected"] for key in shape_keys]

    def _ceil_div(a, b):
        return (a + b - 1) // b

    def _metrics(row):
        """Return (GEMM util %, Useful eff %) for one sweep row."""
        m, k, n = row["M"], row["K"], row["N"]
        useful_flops = 2 * m * k * n
        tiles = row["tile_count_expected"]
        util_pct = useful_flops / (flops_per_tile * tiles) * 100

        groups = _ceil_div(m, tm) * _ceil_div(n, tn)
        # Each tile contributes one compute stage of MAC work, but the
        # pipeline only advances once per DMA-bound cycle.
        mac_ns = tiles * compute_stage_ns
        steady_ns = tiles * cycle_ns
        flush_total_ns = max(0, groups - 1) * flush_ns
        wall_ns = fill_ns + steady_ns + flush_total_ns
        if wall_ns > 0:
            eff_pct = (mac_ns * (util_pct / 100.0) / wall_ns) * 100
        else:
            eff_pct = 0.0
        return util_pct, eff_pct

    per_shape = [_metrics(picked[key]) for key in shape_keys]
    gemm_util = [util for util, _ in per_shape]
    useful_eff = [eff for _, eff in per_shape]

    series = {
        "GEMM util":   gemm_util,
        "Useful eff":  useful_eff,
    }
    colors_map = {
        "GEMM util":   COL_FS,
        "Useful eff":  RGBColor(0xF5, 0x9E, 0x0B),
    }
    display_map = {
        "GEMM util":   "GEMM util %",
        "Useful eff":  "Useful eff % (ref_ref)",
    }

    explanation = (
        "ref_ref: scheduler issues DMA_R for BOTH A and B every tile. "
        "Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH and GEMM stay "
        "at 16 ns. Pipeline cycle is DMA-bound → useful eff caps near "
        "50 % × GEMM_util, regardless of K-loop length."
    )
    _textbox(slide, 0.4, 1.0, 12.6, 0.75, explanation,
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)

    chart_note = ("Compare to load_ref (prev slide): tl.load pins A in "
                  "TCM once, eliminating the per-tile A DMA_R → DMA stage "
                  "halves to 16 ns/tile → useful eff roughly doubles at "
                  "the same GEMM util.")
    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=1.85, plot_w=10.0, plot_h=4.25,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=colors_map,
        display_map=display_map,
        wall_clocks=None,
        y_label="%",
        legend_x=11.4, legend_w=1.85,
        foot_note=chart_note,
        threshold_line=100.0,
        flagged_bar_color=COL_RED,
        flagged_series_only="GEMM util",
    )


def _render_tflops_table(slide):
    """Draw the per-shape table of GEMM util and ideal-pipeline Useful eff.

    The model assumes non-blocking tl.load and multi-channel HBM, so the
    three stages (DMA_R → FETCH → GEMM) are all balanced at
    T_stage = 16 ns/tile:

      wall       = (D − 1) × T_stage + N_tiles × T_stage + (N_mn − 1) × DMA_W
      useful eff = (N_tiles × T_stage × GEMM_util) / wall

    The final write-back is treated as tail and left out of the wall.
    Shapes reported by ``_under_tile`` are highlighted in red.

    When the sweep data has no rows, a hint to run the sweep script is drawn
    instead of the table.
    """
    sweep = _load_sweep_data()
    all_rows = sweep["rows"]
    if not all_rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return
    dims = sweep["tile_sizes"]
    tm, tk, tn = dims["M"], dims["K"], dims["N"]
    flops_per_tile = 2 * tm * tk * tn

    # Constants of the analytic pipeline model.
    hbm_bytes_per_ns = 256.0   # 256 GB/s expressed as bytes/ns
    bytes_per_elem = 2         # f16
    stage_ns = 16.0            # every stage is bandwidth-balanced
    n_stages = 3               # DMA_R, FETCH, GEMM
    fill_ns = (n_stages - 1) * stage_ns
    flush_ns = (tm * tn * bytes_per_elem) / hbm_bytes_per_ns

    # (M, K, N) -> {variant: row}; dict order keeps first-seen shape order.
    grouped: dict = {}
    for row in all_rows:
        per_variant = grouped.setdefault((row["M"], row["K"], row["N"]), {})
        per_variant[row["variant"]] = row
    shape_keys = list(grouped)

    intro = (
        "Ideal pipelined model — assumes non-blocking tl.load + "
        "multi-channel HBM so DMA, FETCH and GEMM all run at "
        f"T_stage = {stage_ns:.0f} ns/tile.\n"
        "   wall = head_latency + N_tiles × T_stage + Σ inter-(m,n) DMA_W\n"
        f"   head_latency = (D−1) × T_stage = {fill_ns:.0f} ns "
        f"(pipeline fill, D = {n_stages} stages).\n"
        f"   inter DMA_W = (N_mn − 1) × {flush_ns:.0f} ns   "
        "(final flush is tail — excluded).\n"
        "   useful eff = (N_tiles × T_stage × GEMM_util) / wall — "
        "MAC time producing real output, not padded zeros."
    )
    _textbox(slide, 0.4, 1.0, 12.6, 0.85, intro,
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)

    header_labels = ["Input shape", "GEMM util %", "Useful eff %"]
    graphic_frame = slide.shapes.add_table(
        len(shape_keys) + 1, len(header_labels),
        Inches(2.5), Inches(2.10), Inches(8.4), Inches(4.30),
    )
    grid = graphic_frame.table

    for col, width_in in enumerate([3.0, 2.5, 2.9]):
        grid.columns[col].width = Inches(width_in)

    def _fill_cell(col, row, text, *, bold=False, color=COL_TEXT_DARK,
                   fill=None, size=12, align=PP_ALIGN.CENTER):
        """Write one Consolas text run into cell (row, col)."""
        cell = grid.cell(row, col)
        if fill is not None:
            cell.fill.solid()
            cell.fill.fore_color.rgb = fill
        frame = cell.text_frame
        frame.margin_left = Emu(36000)
        frame.margin_right = Emu(36000)
        frame.margin_top = Emu(18000)
        frame.margin_bottom = Emu(18000)
        frame.word_wrap = True
        para = frame.paragraphs[0]
        para.alignment = align
        para.text = ""
        run = para.add_run()
        run.text = text
        run.font.size = Pt(size)
        run.font.bold = bold
        run.font.name = "Consolas"
        run.font.color.rgb = color

    header_fill = RGBColor(0x10, 0x2A, 0x55)
    for col, label in enumerate(header_labels):
        _fill_cell(col, 0, label, bold=True, color=COL_TEXT_LIGHT,
                   fill=header_fill, size=12)

    def _ceil_div(a, b):
        return (a + b - 1) // b

    for row_no, (m, k, n) in enumerate(shape_keys, start=1):
        useful_flops = 2 * m * k * n
        # Take the tile count from whichever variant was recorded first.
        sample = next(iter(grouped[(m, k, n)].values()))
        tiles = sample["tile_count_expected"]
        util_pct = useful_flops / (flops_per_tile * tiles) * 100
        under = _under_tile(m, k, n, tm, tk, tn)

        groups = _ceil_div(m, tm) * _ceil_div(n, tn)
        busy_ns = tiles * stage_ns
        flush_total_ns = max(0, groups - 1) * flush_ns
        wall_ns = fill_ns + busy_ns + flush_total_ns
        # Useful eff = pipeline efficiency × GEMM util (shape waste included).
        if wall_ns > 0:
            eff_pct = ((busy_ns * (util_pct / 100.0)) / wall_ns) * 100
        else:
            eff_pct = 0.0

        if under:
            background = RGBColor(0xFE, 0xF2, 0xF2)
            ink = COL_RED
            suffix = "  (under-tile)"
        else:
            background = RGBColor(0xF8, 0xFA, 0xFC)
            ink = COL_TEXT_DARK
            suffix = ""

        _fill_cell(0, row_no, f"{m}×{k}×{n}" + suffix, bold=under, color=ink,
                   fill=background, align=PP_ALIGN.LEFT)
        _fill_cell(1, row_no, f"{util_pct:.1f} %", bold=under,
                   color=ink, fill=background)
        _fill_cell(2, row_no, f"{eff_pct:.1f} %", fill=background,
                   color=COL_TEXT_DARK, bold=True)

    closing_note = (
        "What the model could achieve with non-blocking tl.load + "
        "multi-channel HBM (A streams tile-by-tile, no upfront serial). "
        "Today's simulator caps at ~50 % for tall-skinny K because tl.load "
        "is whole-operand blocking and the cube has a single HBM channel."
    )
    _textbox(slide, 0.4, 6.50, 12.6, 0.6, closing_note,
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)


def _render_pipeline_eff_walkthrough(slide):
    """Visualize how Useful eff is calculated for one example shape.

    Example: 32×128×128 (8 K-tiles in 4 (m,n) groups, head + 3 inter DMA_W).
    The slide is drawn in three bands, top to bottom:

      1. the 3-stage pipeline structure (DMA_R → FETCH → GEMM),
      2. a to-scale timeline: head, then one K-loop per (m,n) group with a
         flush between groups, then the tail flush (drawn, but excluded
         from the wall),
      3. the wall / useful-time / efficiency formulas evaluated numerically.

    All numbers are hardcoded for this one illustrative shape; nothing is
    read from the sweep data.
    """
    # Example parameters (kept hardcoded — this slide is illustrative)
    M, K, N = 32, 128, 128
    T_STAGE = 16
    D_STAGES = 3                  # DMA_R, FETCH, GEMM
    K_TILES = 2                   # K=128 → K_tiles = 2
    N_MN = 4                      # M_tile × N_tile = 1 × 4
    TOTAL_TILES = K_TILES * N_MN  # 8
    HEAD_NS = (D_STAGES - 1) * T_STAGE        # 32
    DMA_W_NS = 8                              # 32×32×2 / 256 = 8 ns
    INTER_FLUSH_COUNT = N_MN - 1              # 3 flushes (final excluded)
    GEMM_TOTAL = TOTAL_TILES * T_STAGE        # 128
    INTER_DMA_W_TOTAL = INTER_FLUSH_COUNT * DMA_W_NS  # 24
    WALL = HEAD_NS + GEMM_TOTAL + INTER_DMA_W_TOTAL   # 184
    USEFUL_EFF = GEMM_TOTAL / WALL * 100              # 69.6%

    # Colors
    C_HEAD  = RGBColor(0x94, 0xA3, 0xB8)  # slate gray
    C_GEMM  = RGBColor(0x10, 0xB9, 0x81)  # emerald (useful)
    C_FLUSH = RGBColor(0xF5, 0x9E, 0x0B)  # amber (inter-(m,n))
    C_TAIL  = RGBColor(0xFC, 0xA5, 0xA5)  # light red (excluded)
    C_DMA_R = RGBColor(0x3B, 0x82, 0xF6)  # blue
    C_FETCH = RGBColor(0x10, 0xB9, 0x81)
    C_GEMM_BOX = RGBColor(0xF5, 0x9E, 0x0B)

    # The caption is built from M/K/N so it cannot drift away from the
    # example actually drawn below (it used to read "M=N=32 ... N-output=128",
    # which contradicted the 32×128×128 shape).
    _textbox(slide, 0.4, 1.0, 12.6, 0.5,
             f"Example: M={M}, K={K}, N={N} → {TOTAL_TILES} tiles in "
             f"{N_MN} output groups of {K_TILES} K-tiles each. T_stage = "
             f"{T_STAGE} ns/tile, pipeline depth D = {D_STAGES}.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # ── Top: 3-stage pipeline structure diagram ─────────────────────
    pd_y = 1.7
    pd_h = 0.55
    box_w = 1.4
    box_gap = 0.35
    pd_total_w = 3 * box_w + 2 * box_gap
    pd_x0 = (SLIDE_W_IN - pd_total_w) / 2   # center the row of boxes
    stages = [("DMA_R\nHBM → TCM", C_DMA_R, COL_TEXT_LIGHT),
              ("FETCH\nTCM → Reg",  C_FETCH, COL_TEXT_LIGHT),
              ("GEMM\nMAC array",   C_GEMM_BOX, COL_TEXT_DARK)]
    for i, (lbl, fill, tc) in enumerate(stages):
        x = pd_x0 + i * (box_w + box_gap)
        _rrect(slide, x, pd_y, box_w, pd_h, fill, COL_TEXT_DARK,
               lbl, size=10, bold=True, color=tc)
        if i < 2:
            # Connector to the next stage, inset slightly from both boxes.
            ax1 = x + box_w + 0.03
            ax2 = x + box_w + box_gap - 0.03
            _arrow(slide, ax1, pd_y + pd_h / 2, ax2, pd_y + pd_h / 2,
                   color=COL_MUTED, width_pt=1.8)
    _textbox(slide, pd_x0, pd_y + pd_h + 0.05, pd_total_w, 0.28,
             f"each stage = {T_STAGE} ns/tile  →  "
             f"head latency = (D−1) × T_stage = {HEAD_NS} ns",
             size=10, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # ── Middle: timeline of the full kernel ─────────────────────────
    # Layout: head + 4 K-loops separated by 3 DMA_W + 1 tail (excluded)
    tl_y = 3.4
    tl_h = 0.85
    tl_label_y = tl_y - 0.3

    margin = 0.8
    tl_total_ns = WALL + DMA_W_NS  # include tail visually
    tl_w_total = SLIDE_W_IN - 2 * margin
    px_per_ns = tl_w_total / tl_total_ns   # horizontal scale: inches per ns

    cur_x = margin
    # Head
    w = HEAD_NS * px_per_ns
    _rect_band(slide, cur_x, tl_y, w, tl_h, C_HEAD, COL_TEXT_DARK)
    _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3,
             f"head\n{HEAD_NS} ns",
             size=9, bold=True, color=COL_TEXT_LIGHT)
    cur_x += w

    # K-loops + inter DMA_W
    for g in range(N_MN):
        # K-loop: K_TILES GEMM blocks
        for t in range(K_TILES):
            w = T_STAGE * px_per_ns
            _rect_band(slide, cur_x, tl_y, w, tl_h, C_GEMM, COL_TEXT_DARK)
            tile_no = g * K_TILES + t + 1   # 1-based running tile number
            _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3,
                     f"GEMM\nT{tile_no}",
                     size=9, bold=True, color=COL_TEXT_LIGHT)
            cur_x += w
        # Mark (m,n) group label above
        group_start_x = cur_x - K_TILES * T_STAGE * px_per_ns
        group_w = K_TILES * T_STAGE * px_per_ns
        _textbox(slide, group_start_x, tl_label_y,
                 group_w, 0.25,
                 f"(m=0,n={g}) K-loop",
                 size=9, bold=True, color=COL_TEXT_DARK)
        if g < N_MN - 1:
            # Flush between groups; the one after the last group is the tail.
            w = DMA_W_NS * px_per_ns
            _rect_band(slide, cur_x, tl_y, w, tl_h, C_FLUSH, COL_TEXT_DARK)
            _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3,
                     "flush", size=8, bold=True, color=COL_TEXT_DARK)
            cur_x += w

    # Tail flush (excluded)
    w = DMA_W_NS * px_per_ns
    _rect_band(slide, cur_x, tl_y, w, tl_h, C_TAIL, COL_RED)
    _textbox(slide, cur_x, tl_y + tl_h / 2 - 0.13, w, 0.3,
             "tail", size=8, bold=True, color=COL_RED)
    _textbox(slide, cur_x - 0.3, tl_y + tl_h + 0.02, w + 0.6, 0.28,
             "(excluded\nfrom wall)",
             size=8, bold=True, color=COL_RED)

    # Wall span indicator below the timeline: one arrow in each direction
    # over the same segment, ending where the wall ends (before the tail).
    wall_end_x = margin + WALL * px_per_ns
    _arrow(slide, margin, tl_y + tl_h + 0.55,
           wall_end_x, tl_y + tl_h + 0.55, color=COL_TEXT_DARK, width_pt=1.5)
    _arrow(slide, wall_end_x, tl_y + tl_h + 0.55,
           margin, tl_y + tl_h + 0.55, color=COL_TEXT_DARK, width_pt=1.5)
    _textbox(slide, margin, tl_y + tl_h + 0.6,
             wall_end_x - margin, 0.3,
             f"wall = {WALL} ns",
             size=11, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER)

    # ── Bottom: numerical evaluation ───────────────────────────────
    formula_y = 5.7
    _textbox(slide, 0.4, formula_y, 12.6, 0.35,
             f"wall = head + N_tiles × T_stage + (N_mn − 1) × T_dma_w  =  "
             f"{HEAD_NS} + {TOTAL_TILES}×{T_STAGE} + {INTER_FLUSH_COUNT}×{DMA_W_NS}  "
             f"=  {WALL} ns",
             size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 0.35, 12.6, 0.35,
             f"GEMM useful time = N_tiles × T_stage × GEMM_util  =  "
             f"{TOTAL_TILES}×{T_STAGE}×100 %  =  {GEMM_TOTAL} ns",
             size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 0.7, 12.6, 0.45,
             f"Useful efficiency = {GEMM_TOTAL} / {WALL}  =  {USEFUL_EFF:.1f} %",
             size=14, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 1.15, 12.6, 0.35,
             "Overhead = head (pipeline fill) + inter-(m,n) flushes. "
             "Bigger K (more amortization) and smaller N (fewer groups) "
             "both raise the efficiency.",
             size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)


def _render_pipeline_eff_walkthrough_largeK(slide):
    """Walk through Useful eff for the tall-skinny example M=N=32, K=3072.

    There is a single output group (N_mn = 1), so no inter-(m,n) DMA_W
    flushes occur and the wall is just head + 48 GEMM tiles.  The timeline
    cannot show 48 tiles to scale, so it draws the first 4, an ellipsis
    block six tiles wide, and the last 2, followed by the tail flush that
    is excluded from the wall.

    All numbers are hardcoded for this one illustrative shape; nothing is
    read from the sweep data.
    """
    # Illustrative model constants for the 32×3072×32 example.
    stage_ns = 16
    depth = 3
    tile_total = 48                       # K-tiles; the only (m,n) group
    fill_ns = (depth - 1) * stage_ns
    tail_ns = 8
    busy_ns = tile_total * stage_ns
    inter_flush_ns = 0                    # N_mn = 1 → nothing between groups
    wall_ns = fill_ns + busy_ns + inter_flush_ns
    eff_pct = busy_ns / wall_ns * 100

    head_fill = RGBColor(0x94, 0xA3, 0xB8)
    gemm_fill = RGBColor(0x10, 0xB9, 0x81)
    tail_fill = RGBColor(0xFC, 0xA5, 0xA5)
    dma_box_fill = RGBColor(0x3B, 0x82, 0xF6)
    fetch_box_fill = RGBColor(0x10, 0xB9, 0x81)
    gemm_box_fill = RGBColor(0xF5, 0x9E, 0x0B)

    _textbox(slide, 0.4, 1.0, 12.6, 0.5,
             f"Example: M=N=32, K=3072 → {tile_total} tiles, "
             "N_mn = M_tiles × N_tiles = 1 × 1 = 1 → "
             "no inter-(m,n) DMA_W flushes. Long K-loop amortizes the head.",
             size=12, color=COL_MUTED, align=PP_ALIGN.LEFT)

    # ── 3-stage pipeline structure, centered on the slide ───────────
    pd_y = 1.7
    pd_h = 0.55
    box_w = 1.4
    box_gap = 0.35
    row_w = 3 * box_w + 2 * box_gap
    row_x0 = (SLIDE_W_IN - row_w) / 2
    stage_boxes = (
        ("DMA_R\nHBM → TCM", dma_box_fill, COL_TEXT_LIGHT),
        ("FETCH\nTCM → Reg", fetch_box_fill, COL_TEXT_LIGHT),
        ("GEMM\nMAC array", gemm_box_fill, COL_TEXT_DARK),
    )
    final_idx = len(stage_boxes) - 1
    for idx, (caption, fill_rgb, text_rgb) in enumerate(stage_boxes):
        left = row_x0 + idx * (box_w + box_gap)
        _rrect(slide, left, pd_y, box_w, pd_h, fill_rgb, COL_TEXT_DARK,
               caption, size=10, bold=True, color=text_rgb)
        if idx == final_idx:
            continue
        # Connector to the next stage, inset slightly from both boxes.
        _arrow(slide, left + box_w + 0.03, pd_y + pd_h / 2,
               left + box_w + box_gap - 0.03, pd_y + pd_h / 2,
               color=COL_MUTED, width_pt=1.8)
    _textbox(slide, row_x0, pd_y + pd_h + 0.05, row_w, 0.28,
             f"each stage = {stage_ns} ns/tile  →  "
             f"head latency = (D−1) × T_stage = {fill_ns} ns",
             size=10, color=COL_MUTED, align=PP_ALIGN.CENTER)

    # ── Compressed timeline: 4 tiles + ellipsis + 2 tiles + tail ────
    band_y = 3.4
    band_h = 0.85
    group_label_y = band_y - 0.3

    side_margin = 0.8
    shown_first = 4
    shown_last = 2
    hidden = tile_total - shown_first - shown_last
    # The ellipsis block is given the width of six tiles.
    gap_ns = 6 * stage_ns
    drawn_ns = (fill_ns + (shown_first + shown_last) * stage_ns
                + gap_ns + tail_ns)
    usable_w = SLIDE_W_IN - 2 * side_margin
    in_per_ns = usable_w / drawn_ns

    def _gemm_tile(x, number):
        """Draw one GEMM tile band at x and return the x just past it."""
        width = stage_ns * in_per_ns
        _rect_band(slide, x, band_y, width, band_h, gemm_fill, COL_TEXT_DARK)
        _textbox(slide, x, band_y + band_h / 2 - 0.13, width, 0.3,
                 f"GEMM\nT{number}",
                 size=9, bold=True, color=COL_TEXT_LIGHT)
        return x + width

    cursor = side_margin

    # Head (pipeline fill)
    head_w = fill_ns * in_per_ns
    _rect_band(slide, cursor, band_y, head_w, band_h, head_fill, COL_TEXT_DARK)
    _textbox(slide, cursor, band_y + band_h / 2 - 0.13, head_w, 0.3,
             f"head\n{fill_ns} ns",
             size=9, bold=True, color=COL_TEXT_LIGHT)
    cursor += head_w

    kloop_left = cursor
    # Leading tiles T1..T4
    for number in range(1, shown_first + 1):
        cursor = _gemm_tile(cursor, number)

    # Ellipsis block standing in for the hidden tiles
    gap_w = gap_ns * in_per_ns
    _rect_band(slide, cursor, band_y, gap_w, band_h,
               RGBColor(0x86, 0xEF, 0xAC), COL_TEXT_DARK)
    _textbox(slide, cursor, band_y + band_h / 2 - 0.2, gap_w, 0.5,
             f". . .  {hidden} more GEMM tiles  . . .",
             size=10, bold=True, color=COL_TEXT_DARK)
    cursor += gap_w

    # Trailing tiles T47, T48
    for number in range(tile_total - shown_last + 1, tile_total + 1):
        cursor = _gemm_tile(cursor, number)

    # One label spanning the whole K-loop
    kloop_right = cursor
    _textbox(slide, kloop_left, group_label_y,
             kloop_right - kloop_left, 0.25,
             f"(m=0,n=0) K-loop  —  all {tile_total} tiles in one group, "
             "NO inter flushes",
             size=10, bold=True, color=COL_TEXT_DARK,
             align=PP_ALIGN.CENTER)

    # Tail flush (drawn, but not part of the wall)
    tail_w = tail_ns * in_per_ns
    _rect_band(slide, cursor, band_y, tail_w, band_h, tail_fill, COL_RED)
    _textbox(slide, cursor, band_y + band_h / 2 - 0.13, tail_w, 0.3,
             "tail", size=8, bold=True, color=COL_RED)
    _textbox(slide, cursor - 0.3, band_y + band_h + 0.02, tail_w + 0.6, 0.28,
             "(excluded\nfrom wall)",
             size=8, bold=True, color=COL_RED)

    # Wall span: one arrow in each direction over the same segment,
    # covering everything drawn except the tail.
    wall_right = side_margin + (
        fill_ns + (shown_first + shown_last) * stage_ns + gap_ns
    ) * in_per_ns
    _arrow(slide, side_margin, band_y + band_h + 0.55,
           wall_right, band_y + band_h + 0.55,
           color=COL_TEXT_DARK, width_pt=1.5)
    _arrow(slide, wall_right, band_y + band_h + 0.55,
           side_margin, band_y + band_h + 0.55,
           color=COL_TEXT_DARK, width_pt=1.5)
    _textbox(slide, side_margin, band_y + band_h + 0.6,
             wall_right - side_margin, 0.3,
             f"wall = {wall_ns} ns",
             size=11, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.CENTER)

    # ── Formula evaluation ──────────────────────────────────────────
    formula_y = 5.7
    _textbox(slide, 0.4, formula_y, 12.6, 0.35,
             "wall = head + N_tiles × T_stage + (N_mn − 1) × T_dma_w  =  "
             f"{fill_ns} + {tile_total}×{stage_ns} + 0  =  {wall_ns} ns",
             size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 0.35, 12.6, 0.35,
             f"GEMM useful time = {tile_total}×{stage_ns}×100 %  =  {busy_ns} ns",
             size=12, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 0.7, 12.6, 0.45,
             f"Useful efficiency = {busy_ns} / {wall_ns}  =  {eff_pct:.1f} %",
             size=14, bold=True, color=COL_TEXT_DARK, align=PP_ALIGN.LEFT)
    _textbox(slide, 0.4, formula_y + 1.15, 12.6, 0.35,
             "Long K-loop, one output group → head amortized over 48 GEMM "
             "tiles, no flush penalty. Approaches 100 % as K grows.",
             size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)


# Register the sweep/pipeline slide renderers under the keys that a SLIDES
# entry names in its "render" field (looked up by build()).
_RENDERERS.update(
    stage_breakdown_ref_ref=_render_stage_breakdown_ref_ref,
    stage_breakdown_load_ref=_render_stage_breakdown_load_ref,
    stage_breakdown_load_load=_render_stage_breakdown_load_load,
    variant_comparison=_render_variant_comparison,
    hbm_topology=_render_hbm_topology,
    per_op_dma=_render_per_op_dma,
    mac_utilization=_render_mac_utilization,
    mac_utilization_ref_ref=_render_mac_utilization_ref_ref,
    tflops_table=_render_tflops_table,
    pipeline_eff_walkthrough=_render_pipeline_eff_walkthrough,
    pipeline_eff_walkthrough_largeK=_render_pipeline_eff_walkthrough_largeK,
)


def build():
    """Build the whole deck from SLIDES and write it to OUT.

    Each SLIDES entry becomes one slide on the blank layout, with a title
    and a footer carrying its 1-based number and the slide total.  An entry
    that has a ``"render"`` key is drawn by the matching function in
    ``_RENDERERS``; any other entry gets its ``"image"`` on the left and
    its ``"bullets"`` on the right.

    The output directory is created if needed, and the written path is
    printed when done.
    """
    deck = Presentation()
    deck.slide_width = Inches(SLIDE_W_IN)
    deck.slide_height = Inches(SLIDE_H_IN)
    blank_layout = deck.slide_layouts[6]
    slide_total = len(SLIDES)

    for number, spec in enumerate(SLIDES, start=1):
        page = deck.slides.add_slide(blank_layout)
        _add_title(page, spec["title"])

        if "render" not in spec:
            # Default layout: image on the left (at most 8.3 in wide),
            # bullets on the right.
            _add_image_centered(
                page, spec["image"],
                left_in=0.3, top_in=1.05,
                max_w_in=8.3, max_h_in=5.9,
            )
            _add_bullets(
                page, spec["bullets"],
                left_in=8.8, top_in=1.2,
                width_in=4.3, height_in=5.7,
            )
        else:
            # Shape-drawn slide (sequence diagram / pipeline structure).
            draw = _RENDERERS[spec["render"]]
            draw(page)

        _add_footer(page, number, slide_total)

    OUT.parent.mkdir(parents=True, exist_ok=True)
    deck.save(OUT)
    print(f"wrote {OUT}")


# Script entry point: generate the deck only when this file is run directly,
# not when it is imported.
if __name__ == "__main__":
    build()