"""Generate a 5-slide PPTX summarizing the kernbench2 model. Slides (in order): 1. Overall architecture — how PEs are connected (cube_mesh_view) 2. Model correctness — DMA vs P2P latency (pe2pe overview) 3. PE-to-PE IPCQ communication (ipcq_two_pe_dma) 4. 6-device allreduce — model vs theoretical vs ext-sim (overview_broken) 5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (buffer_kind_sweep) This is a derived-artifact generator — no production code touched. """ from __future__ import annotations from pathlib import Path from PIL import Image from pptx import Presentation from pptx.dml.color import RGBColor from pptx.enum.shapes import MSO_SHAPE from pptx.util import Emu, Inches, Pt ROOT = Path(__file__).resolve().parent.parent DIAG = ROOT / "docs" / "diagrams" OUT = DIAG / "kernbench2_overview.pptx" # 16:9 widescreen — 13.333 × 7.5 in SLIDE_W_IN = 13.333 SLIDE_H_IN = 7.5 SLIDES = [ { "title": "1. CUBE Architecture: NOC Router Mesh + PE Connectivity", "image": DIAG / "cube_mesh_view.png", "bullets": [ "Each CUBE holds an 8-PE NOC mesh wired through routers (R0..R7)", "Every PE has IO_CPU, M_CPU, PE_CPU + IPCQ engine + DMA engine", "Inter-cube traffic exits via UCIe/UAL ports; SIPs stitch into ring/torus/mesh", "Foundation for every latency, IPCQ, and allreduce experiment that follows", ], }, { "title": "2. Model Correctness: DMA vs P2P Latency Sweep", "image": DIAG / "pe2pe_latency_plots" / "overview.png", "bullets": [ "Sweeps payload size across PE-to-PE paths and compares to DMA", "Confirms the simulator reproduces the expected DMA/P2P crossover", "Acts as the per-hop ground truth that feeds collective-level models", ], }, { "title": "3. IPCQ: How Two PEs Communicate (DMA + Slot Memory)", "image": DIAG / "ipcq_diagram_plots" / "ipcq_two_pe_dma.png", "bullets": [ "Sender pushes payload through PE_DMA → fabric → receiver IPCQ slot", "Slot memory (TCM/SRAM/HBM) charges a write on arrival, a read on consume", "Credit return rides the fabric path back (16 B packet, no slot-IO)", "This is the building block the multi-device allreduce composes", ], }, { "title": "4. 6-Device Allreduce: Model vs Theoretical vs External Simulator", "image": DIAG / "allreduce_latency_plots" / "overview_broken.png", "bullets": [ "Three SIP topologies (ring / torus / mesh) swept 16 B → 96 KB per PE", "Dashed red curve: hand-derived theoretical model for torus_2d (6 SIPs)", "Top panel (broken y-axis): single-device reduce on ext-sim ≈ 366 µs", "Our 6-device collective lands at ~17–22 µs — ~17× faster than ext-sim baseline", ], }, { "title": "5. IPCQ Slot Memory: TCM vs SRAM vs HBM", "image": DIAG / "allreduce_latency_plots" / "buffer_kind_sweep.png", "bullets": [ "Same allreduce with slot memory swapped: TCM (per-PE local) / SRAM / HBM (cube-shared, behind router link)", "Cost = NoC drain + slot-IO + PE↔bank hop; only TCM skips the bank hop", "Topology link BWs set the order: SRAM bank link 128 GB/s is the narrowest in the system, HBM 256 GB/s", "At 64 KB / PE: TCM 12.0 µs < HBM 21.4 µs < SRAM 24.3 µs — SRAM is slowest because of its narrow bank link", ], }, ] def _add_title(slide, text): left = Inches(0.4) top = Inches(0.25) width = Inches(SLIDE_W_IN - 0.8) height = Inches(0.7) box = slide.shapes.add_textbox(left, top, width, height) tf = box.text_frame tf.margin_left = tf.margin_right = Emu(0) tf.margin_top = tf.margin_bottom = Emu(0) p = tf.paragraphs[0] run = p.add_run() run.text = text run.font.size = Pt(26) run.font.bold = True run.font.color.rgb = RGBColor(0x10, 0x2A, 0x55) return box def _add_image_centered(slide, img_path, *, left_in, top_in, max_w_in, max_h_in): with Image.open(img_path) as im: iw, ih = im.size max_w_emu = Inches(max_w_in) max_h_emu = Inches(max_h_in) scale = min(max_w_emu / iw, max_h_emu / ih) w = int(iw * scale) h = int(ih * scale) left = Inches(left_in) + (max_w_emu - w) // 2 top = Inches(top_in) + (max_h_emu - h) // 2 slide.shapes.add_picture(str(img_path), left, top, width=w, height=h) def _add_bullets(slide, bullets, *, left_in, top_in, width_in, height_in): box = slide.shapes.add_textbox( Inches(left_in), Inches(top_in), Inches(width_in), Inches(height_in), ) tf = box.text_frame tf.word_wrap = True for i, line in enumerate(bullets): p = tf.paragraphs[0] if i == 0 else tf.add_paragraph() p.level = 0 run = p.add_run() run.text = "• " + line run.font.size = Pt(15) run.font.color.rgb = RGBColor(0x22, 0x22, 0x22) p.space_after = Pt(6) def _add_footer(slide, idx, total): box = slide.shapes.add_textbox( Inches(SLIDE_W_IN - 1.2), Inches(SLIDE_H_IN - 0.45), Inches(1.0), Inches(0.3), ) p = box.text_frame.paragraphs[0] run = p.add_run() run.text = f"{idx} / {total}" run.font.size = Pt(10) run.font.color.rgb = RGBColor(0x88, 0x88, 0x88) def build(): prs = Presentation() prs.slide_width = Inches(SLIDE_W_IN) prs.slide_height = Inches(SLIDE_H_IN) blank = prs.slide_layouts[6] for i, cfg in enumerate(SLIDES, start=1): slide = prs.slides.add_slide(blank) _add_title(slide, cfg["title"]) # Layout: image on the left (8.4 in wide), bullets on the right (4.4 in). _add_image_centered( slide, cfg["image"], left_in=0.3, top_in=1.05, max_w_in=8.3, max_h_in=5.9, ) _add_bullets( slide, cfg["bullets"], left_in=8.8, top_in=1.2, width_in=4.3, height_in=5.7, ) _add_footer(slide, i, len(SLIDES)) OUT.parent.mkdir(parents=True, exist_ok=True) prs.save(OUT) print(f"wrote {OUT}") if __name__ == "__main__": build()