Add deck builder + overview-with-ref diagram scripts

scripts/build_overview_slides.py renders a 5-slide PPTX (kernbench2_overview.pptx) summarizing architecture, model correctness, IPCQ, allreduce, and the buffer-kind tier comparison.

scripts/emit_overview_with_external_ref.py renders log-y and broken-y variants of the allreduce overview (overview_log.png, overview_broken.png), including a 366 µs ext-sim reference marker at 96 KB / PE.

Also includes cube_mesh_view.png rendered from the SVG.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 18:20:54 -07:00
parent a563169e89
commit 5accd98171
6 changed files with 363 additions and 0 deletions
@@ -0,0 +1,171 @@
+"""Generate a 5-slide PPTX summarizing the kernbench2 model.
+
+Slides (in order):
+  1. Overall architecture — how PEs are connected (cube_mesh_view)
+  2. Model correctness — DMA vs P2P latency (pe2pe overview)
+  3. PE-to-PE IPCQ communication (ipcq_two_pe_dma)
+  4. 6-device allreduce — model vs theoretical vs ext-sim (overview_broken)
+  5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (buffer_kind_sweep)
+
+This is a derived-artifact generator — no production code touched.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+from PIL import Image
+from pptx import Presentation
+from pptx.dml.color import RGBColor
+from pptx.enum.shapes import MSO_SHAPE
+from pptx.util import Emu, Inches, Pt
+
+# Two levels up from this file: the directory containing this script's folder.
+ROOT = Path(__file__).resolve().parents[1]
+# Folder the slide images are read from; the deck is written here as well.
+DIAG = ROOT.joinpath("docs", "diagrams")
+OUT = DIAG.joinpath("kernbench2_overview.pptx")
+
+# Slide canvas size in inches: 16:9 widescreen (13.333 x 7.5).
+SLIDE_W_IN, SLIDE_H_IN = 13.333, 7.5
+
+# Deck content, one dict per slide, in presentation order. Each dict has:
+#   "title"   - heading text drawn across the top of the slide
+#   "image"   - path of the picture placed in the left-hand region
+#   "bullets" - list of strings, each rendered as one bullet on the right
+SLIDES = [
+    {
+        "title": "1. CUBE Architecture: NOC Router Mesh + PE Connectivity",
+        "image": DIAG / "cube_mesh_view.png",
+        "bullets": [
+            "Each CUBE holds an 8-PE NOC mesh wired through routers (R0..R7)",
+            "Every PE has IO_CPU, M_CPU, PE_CPU + IPCQ engine + DMA engine",
+            "Inter-cube traffic exits via UCIe/UAL ports; SIPs stitch into ring/torus/mesh",
+            "Foundation for every latency, IPCQ, and allreduce experiment that follows",
+        ],
+    },
+    {
+        "title": "2. Model Correctness: DMA vs P2P Latency Sweep",
+        "image": DIAG / "pe2pe_latency_plots" / "overview.png",
+        "bullets": [
+            "Sweeps payload size across PE-to-PE paths and compares to DMA",
+            "Confirms the simulator reproduces the expected DMA/P2P crossover",
+            "Acts as the per-hop ground truth that feeds collective-level models",
+        ],
+    },
+    {
+        "title": "3. IPCQ: How Two PEs Communicate (DMA + Slot Memory)",
+        "image": DIAG / "ipcq_diagram_plots" / "ipcq_two_pe_dma.png",
+        "bullets": [
+            "Sender pushes payload through PE_DMA → fabric → receiver IPCQ slot",
+            "Slot memory (TCM/SRAM/HBM) charges a write on arrival, a read on consume",
+            "Credit return rides the fabric path back (16 B packet, no slot-IO)",
+            "This is the building block the multi-device allreduce composes",
+        ],
+    },
+    {
+        "title": "4. 6-Device Allreduce: Model vs Theoretical vs External Simulator",
+        "image": DIAG / "allreduce_latency_plots" / "overview_broken.png",
+        "bullets": [
+            "Three SIP topologies (ring / torus / mesh) swept 16 B → 96 KB per PE",
+            "Dashed red curve: hand-derived theoretical model for torus_2d (6 SIPs)",
+            "Top panel (broken y-axis): single-device reduce on ext-sim ≈ 366 µs",
+            "Our 6-device collective lands at ~17–22 µs — ~17× faster than ext-sim baseline",
+        ],
+    },
+    {
+        "title": "5. IPCQ Slot Memory: TCM vs SRAM vs HBM",
+        "image": DIAG / "allreduce_latency_plots" / "buffer_kind_sweep.png",
+        "bullets": [
+            "Same allreduce with slot memory swapped: TCM (per-PE local) / SRAM / HBM (cube-shared, behind router link)",
+            "Cost = NoC drain + slot-IO + PE↔bank hop; only TCM skips the bank hop",
+            "Topology link BWs set the order: SRAM bank link 128 GB/s is the narrowest in the system, HBM 256 GB/s",
+            "At 64 KB / PE: TCM 12.0 µs < HBM 21.4 µs < SRAM 24.3 µs — SRAM is slowest because of its narrow bank link",
+        ],
+    },
+]
+
+
+def _add_title(slide, text):
+    """Draw the slide heading in a text box along the top of ``slide``.
+
+    The box starts 0.4 in from the left edge and 0.25 in from the top, is
+    0.7 in tall, and is 0.8 in narrower than the slide. ``text`` is written
+    as a single bold 26 pt run in dark blue.
+
+    Returns the text-box shape that was added.
+    """
+    title_box = slide.shapes.add_textbox(
+        Inches(0.4),
+        Inches(0.25),
+        Inches(SLIDE_W_IN - 0.8),
+        Inches(0.7),
+    )
+
+    # Remove all internal padding so the text sits flush with the box edges.
+    frame = title_box.text_frame
+    no_padding = Emu(0)
+    frame.margin_left = no_padding
+    frame.margin_right = no_padding
+    frame.margin_top = no_padding
+    frame.margin_bottom = no_padding
+
+    heading = frame.paragraphs[0].add_run()
+    heading.text = text
+    heading_font = heading.font
+    heading_font.size = Pt(26)
+    heading_font.bold = True
+    heading_font.color.rgb = RGBColor(0x10, 0x2A, 0x55)
+    return title_box
+
+
+def _add_image_centered(slide, img_path, *, left_in, top_in, max_w_in, max_h_in):
+    """Add a picture scaled to fit, and centered within, a rectangular region.
+
+    The region has its top-left corner at (``left_in``, ``top_in``) and
+    measures ``max_w_in`` by ``max_h_in``; all four values are in inches.
+    A single scale factor is applied to both axes, so the image keeps its
+    aspect ratio while fitting inside the region.
+    """
+    # Only the pixel dimensions are needed; the image file is closed right away.
+    with Image.open(img_path) as picture:
+        px_w, px_h = picture.size
+
+    region_w = Inches(max_w_in)
+    region_h = Inches(max_h_in)
+
+    # EMU per pixel: take the tighter of the width and height constraints.
+    emu_per_px = min(region_w / px_w, region_h / px_h)
+    pic_w = int(px_w * emu_per_px)
+    pic_h = int(px_h * emu_per_px)
+
+    # Split the leftover space evenly on each axis to center the picture.
+    x = Inches(left_in) + (region_w - pic_w) // 2
+    y = Inches(top_in) + (region_h - pic_h) // 2
+    slide.shapes.add_picture(str(img_path), x, y, width=pic_w, height=pic_h)
+
+
+def _add_bullets(slide, bullets, *, left_in, top_in, width_in, height_in):
+    """Add a word-wrapped text box with one paragraph per entry in ``bullets``.
+
+    The box position and size are given in inches. Each entry is prefixed
+    with a literal bullet character and rendered at 15 pt in dark grey, with
+    6 pt of spacing after the paragraph.
+    """
+    frame = slide.shapes.add_textbox(
+        Inches(left_in),
+        Inches(top_in),
+        Inches(width_in),
+        Inches(height_in),
+    ).text_frame
+    frame.word_wrap = True
+
+    for position, bullet_text in enumerate(bullets):
+        # The first bullet goes into the frame's existing first paragraph;
+        # every later bullet gets a freshly appended paragraph.
+        para = frame.add_paragraph() if position else frame.paragraphs[0]
+        para.level = 0
+        entry = para.add_run()
+        entry.text = "• " + bullet_text
+        entry_font = entry.font
+        entry_font.size = Pt(15)
+        entry_font.color.rgb = RGBColor(0x22, 0x22, 0x22)
+        para.space_after = Pt(6)
+
+
+def _add_footer(slide, idx, total):
+    """Write an ``idx / total`` page marker near the bottom-right corner.
+
+    The marker sits in a 1.0 x 0.3 in text box positioned 1.2 in from the
+    right edge and 0.45 in from the bottom edge, in 10 pt grey text.
+    """
+    x = Inches(SLIDE_W_IN - 1.2)
+    y = Inches(SLIDE_H_IN - 0.45)
+    footer_box = slide.shapes.add_textbox(x, y, Inches(1.0), Inches(0.3))
+
+    marker = footer_box.text_frame.paragraphs[0].add_run()
+    marker.text = f"{idx} / {total}"
+    marker_font = marker.font
+    marker_font.size = Pt(10)
+    marker_font.color.rgb = RGBColor(0x88, 0x88, 0x88)
+
+
+def build():
+    """Assemble the deck described by ``SLIDES`` and write it to ``OUT``.
+
+    Every slide gets a title across the top, its image fitted into a region
+    on the left, a bullet list on the right, and an ``i / N`` page marker.
+    The output directory is created if needed, and the output path is
+    printed once the file has been saved.
+
+    Raises:
+        FileNotFoundError: if any slide image does not exist. All missing
+            paths are listed, and nothing is written in that case.
+    """
+    # Fail fast and name every missing input at once, rather than stopping
+    # at the first unreadable image partway through the build.
+    missing = [str(cfg["image"]) for cfg in SLIDES if not cfg["image"].exists()]
+    if missing:
+        raise FileNotFoundError(
+            "missing slide image(s): " + ", ".join(missing)
+        )
+
+    prs = Presentation()
+    prs.slide_width = Inches(SLIDE_W_IN)
+    prs.slide_height = Inches(SLIDE_H_IN)
+    # Layout index 6 is used as the blank layout, so every shape on the
+    # slide is one added explicitly below.
+    blank = prs.slide_layouts[6]
+
+    total = len(SLIDES)
+    for i, cfg in enumerate(SLIDES, start=1):
+        slide = prs.slides.add_slide(blank)
+        _add_title(slide, cfg["title"])
+
+        # Layout: image region on the left (8.3 in wide), bullets on the
+        # right (4.3 in wide), both below the title.
+        _add_image_centered(
+            slide, cfg["image"],
+            left_in=0.3, top_in=1.05,
+            max_w_in=8.3, max_h_in=5.9,
+        )
+        _add_bullets(
+            slide, cfg["bullets"],
+            left_in=8.8, top_in=1.2,
+            width_in=4.3, height_in=5.7,
+        )
+        _add_footer(slide, i, total)
+
+    OUT.parent.mkdir(parents=True, exist_ok=True)
+    # save() is documented as taking a path string or a file-like object;
+    # pass a str here, the same way add_picture() is given str(img_path).
+    prs.save(str(OUT))
+    print(f"wrote {OUT}")
+
+
+# Build the deck only when this file is run as a script, not when imported.
+if __name__ == "__main__":
+    build()