diff --git a/docs/diagrams/allreduce_latency_plots/overview_broken.png b/docs/diagrams/allreduce_latency_plots/overview_broken.png
new file mode 100644
index 0000000..3d56281
Binary files /dev/null and b/docs/diagrams/allreduce_latency_plots/overview_broken.png differ
diff --git a/docs/diagrams/allreduce_latency_plots/overview_log.png b/docs/diagrams/allreduce_latency_plots/overview_log.png
new file mode 100644
index 0000000..8c8c2b9
Binary files /dev/null and b/docs/diagrams/allreduce_latency_plots/overview_log.png differ
diff --git a/docs/diagrams/cube_mesh_view.png b/docs/diagrams/cube_mesh_view.png
new file mode 100644
index 0000000..2736c29
Binary files /dev/null and b/docs/diagrams/cube_mesh_view.png differ
diff --git a/docs/diagrams/kernbench2_overview.pptx b/docs/diagrams/kernbench2_overview.pptx
new file mode 100644
index 0000000..0941ab1
Binary files /dev/null and b/docs/diagrams/kernbench2_overview.pptx differ
diff --git a/scripts/build_overview_slides.py b/scripts/build_overview_slides.py
new file mode 100644
index 0000000..1219762
--- /dev/null
+++ b/scripts/build_overview_slides.py
@@ -0,0 +1,181 @@
+"""Generate a 5-slide PPTX summarizing the kernbench2 model.
+
+Slides (in order):
+  1. Overall architecture — how PEs are connected (cube_mesh_view)
+  2. Model correctness — DMA vs P2P latency (pe2pe overview)
+  3. PE-to-PE IPCQ communication (ipcq_two_pe_dma)
+  4. 6-device allreduce — model vs theoretical vs ext-sim (overview_broken)
+  5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (buffer_kind_sweep)
+
+This is a derived-artifact generator — no production code touched.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+from PIL import Image
+from pptx import Presentation
+from pptx.dml.color import RGBColor
+from pptx.enum.shapes import MSO_SHAPE
+from pptx.util import Emu, Inches, Pt
+
+ROOT = Path(__file__).resolve().parent.parent
+DIAG = ROOT / "docs" / "diagrams"
+OUT = DIAG / "kernbench2_overview.pptx"
+
+# 16:9 widescreen — 13.333 × 7.5 in
+SLIDE_W_IN = 13.333
+SLIDE_H_IN = 7.5
+
+SLIDES = [
+    {
+        "title": "1. CUBE Architecture: NOC Router Mesh + PE Connectivity",
+        "image": DIAG / "cube_mesh_view.png",
+        "bullets": [
+            "Each CUBE holds an 8-PE NOC mesh wired through routers (R0..R7)",
+            "Every PE has IO_CPU, M_CPU, PE_CPU + IPCQ engine + DMA engine",
+            "Inter-cube traffic exits via UCIe/UAL ports; SIPs stitch into ring/torus/mesh",
+            "Foundation for every latency, IPCQ, and allreduce experiment that follows",
+        ],
+    },
+    {
+        "title": "2. Model Correctness: DMA vs P2P Latency Sweep",
+        "image": DIAG / "pe2pe_latency_plots" / "overview.png",
+        "bullets": [
+            "Sweeps payload size across PE-to-PE paths and compares to DMA",
+            "Confirms the simulator reproduces the expected DMA/P2P crossover",
+            "Acts as the per-hop ground truth that feeds collective-level models",
+        ],
+    },
+    {
+        "title": "3. IPCQ: How Two PEs Communicate (DMA + Slot Memory)",
+        "image": DIAG / "ipcq_diagram_plots" / "ipcq_two_pe_dma.png",
+        "bullets": [
+            "Sender pushes payload through PE_DMA → fabric → receiver IPCQ slot",
+            "Slot memory (TCM/SRAM/HBM) charges a write on arrival, a read on consume",
+            "Credit return rides the fabric path back (16 B packet, no slot-IO)",
+            "This is the building block the multi-device allreduce composes",
+        ],
+    },
+    {
+        "title": "4. 6-Device Allreduce: Model vs Theoretical vs External Simulator",
+        "image": DIAG / "allreduce_latency_plots" / "overview_broken.png",
+        "bullets": [
+            "Three SIP topologies (ring / torus / mesh) swept 16 B → 96 KB per PE",
+            "Dashed red curve: hand-derived theoretical model for torus_2d (6 SIPs)",
+            "Top panel (broken y-axis): single-device reduce on ext-sim ≈ 366 µs",
+            "Our 6-device collective lands at ~17–22 µs — ~17× faster than ext-sim baseline",
+        ],
+    },
+    {
+        "title": "5. IPCQ Slot Memory: TCM vs SRAM vs HBM",
+        "image": DIAG / "allreduce_latency_plots" / "buffer_kind_sweep.png",
+        "bullets": [
+            "Same allreduce with slot memory swapped: TCM (per-PE local) / SRAM / HBM (cube-shared, behind router link)",
+            "Cost = NoC drain + slot-IO + PE↔bank hop; only TCM skips the bank hop",
+            "Topology link BWs set the order: SRAM bank link 128 GB/s is the narrowest in the system, HBM 256 GB/s",
+            "At 64 KB / PE: TCM 12.0 µs < HBM 21.4 µs < SRAM 24.3 µs — SRAM is slowest because of its narrow bank link",
+        ],
+    },
+]
+
+
+def _add_title(slide, text):
+    """Add a bold 26 pt title text box near the top of *slide*; return the box."""
+    left = Inches(0.4)
+    top = Inches(0.25)
+    width = Inches(SLIDE_W_IN - 0.8)
+    height = Inches(0.7)
+    box = slide.shapes.add_textbox(left, top, width, height)
+    tf = box.text_frame
+    tf.margin_left = tf.margin_right = Emu(0)
+    tf.margin_top = tf.margin_bottom = Emu(0)
+    p = tf.paragraphs[0]
+    run = p.add_run()
+    run.text = text
+    run.font.size = Pt(26)
+    run.font.bold = True
+    run.font.color.rgb = RGBColor(0x10, 0x2A, 0x55)
+    return box
+
+
+def _add_image_centered(slide, img_path, *, left_in, top_in, max_w_in, max_h_in):
+    """Place *img_path* on *slide*, scaled to fit and centered in a bounding box.
+
+    The box is given in inches (origin ``left_in``/``top_in``, size
+    ``max_w_in`` x ``max_h_in``); the image keeps its pixel aspect ratio.
+    """
+    with Image.open(img_path) as im:
+        iw, ih = im.size
+    max_w_emu = Inches(max_w_in)
+    max_h_emu = Inches(max_h_in)
+    # Single factor for both dimensions so the image just fits inside the box.
+    scale = min(max_w_emu / iw, max_h_emu / ih)
+    w = int(iw * scale)
+    h = int(ih * scale)
+    left = Inches(left_in) + (max_w_emu - w) // 2
+    top = Inches(top_in) + (max_h_emu - h) // 2
+    slide.shapes.add_picture(str(img_path), left, top, width=w, height=h)
+
+
+def _add_bullets(slide, bullets, *, left_in, top_in, width_in, height_in):
+    """Add a word-wrapped text box with one "• "-prefixed paragraph per bullet."""
+    box = slide.shapes.add_textbox(
+        Inches(left_in), Inches(top_in), Inches(width_in), Inches(height_in),
+    )
+    tf = box.text_frame
+    tf.word_wrap = True
+    for i, line in enumerate(bullets):
+        p = tf.paragraphs[0] if i == 0 else tf.add_paragraph()
+        p.level = 0
+        run = p.add_run()
+        run.text = "• " + line
+        run.font.size = Pt(15)
+        run.font.color.rgb = RGBColor(0x22, 0x22, 0x22)
+        p.space_after = Pt(6)
+
+
+def _add_footer(slide, idx, total):
+    """Add a small grey "idx / total" page marker near the bottom-right corner."""
+    box = slide.shapes.add_textbox(
+        Inches(SLIDE_W_IN - 1.2), Inches(SLIDE_H_IN - 0.45),
+        Inches(1.0), Inches(0.3),
+    )
+    p = box.text_frame.paragraphs[0]
+    run = p.add_run()
+    run.text = f"{idx} / {total}"
+    run.font.size = Pt(10)
+    run.font.color.rgb = RGBColor(0x88, 0x88, 0x88)
+
+
+def build():
+    """Build one slide per ``SLIDES`` entry and save the deck to ``OUT``."""
+    prs = Presentation()
+    prs.slide_width = Inches(SLIDE_W_IN)
+    prs.slide_height = Inches(SLIDE_H_IN)
+    blank = prs.slide_layouts[6]
+
+    for i, cfg in enumerate(SLIDES, start=1):
+        slide = prs.slides.add_slide(blank)
+        _add_title(slide, cfg["title"])
+
+        # Layout: image box on the left (8.3 in wide), bullets on the right (4.3 in).
+        _add_image_centered(
+            slide, cfg["image"],
+            left_in=0.3, top_in=1.05,
+            max_w_in=8.3, max_h_in=5.9,
+        )
+        _add_bullets(
+            slide, cfg["bullets"],
+            left_in=8.8, top_in=1.2,
+            width_in=4.3, height_in=5.7,
+        )
+        _add_footer(slide, i, len(SLIDES))
+
+    OUT.parent.mkdir(parents=True, exist_ok=True)
+    prs.save(OUT)
+    print(f"wrote {OUT}")
+
+
+if __name__ == "__main__":
+    build()
diff --git a/scripts/emit_overview_with_external_ref.py b/scripts/emit_overview_with_external_ref.py
new file mode 100644
index 0000000..a8a2863
--- /dev/null
+++ b/scripts/emit_overview_with_external_ref.py
@@ -0,0 +1,225 @@
+"""One-shot: render overview.png with an external 366 µs reference, in two
+variants — log scale and broken y-axis. Reads docs/diagrams/allreduce_latency_plots/summary.csv
+and writes overview_log.png and overview_broken.png alongside it.
+
+This is a derived-artifact generator (per CLAUDE.md): plotting only, no production
+or test logic touched.
+"""
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mticker
+
+ROOT = Path(__file__).resolve().parent.parent
+PLOT_DIR = ROOT / "docs" / "diagrams" / "allreduce_latency_plots"
+CSV_PATH = PLOT_DIR / "summary.csv"
+
+EXT_LABEL = "ext-sim single-device reduce: 366 µs"
+EXT_LATENCY_NS = 366_000.0
+
+COLORS = {
+    "ring_1d": "tab:blue",
+    "torus_2d": "tab:orange",
+    "mesh_2d_no_wrap": "tab:green",
+}
+
+# Hand-derived theoretical model for torus_2d (6 SIPs). Mirrors
+# _aggregate_sweep_plots in tests/test_allreduce_multidevice.py.
+NOC_PACKET_BYTES = 128
+PES_PER_CUBE = 8
+T_STARTUP_NS = 1346.0
+TAU_NS = (8741.0 - 1346.0) / (6144 - 1)
+
+
+def _theoretical_torus_2d_ns(bytes_per_pe: int) -> float:
+    """Return the theoretical torus_2d latency in ns for *bytes_per_pe*.
+
+    The per-cube payload (``bytes_per_pe * PES_PER_CUBE``) is split into
+    ``NOC_PACKET_BYTES``-sized packets; the first packet costs
+    ``T_STARTUP_NS`` and each further packet adds ``TAU_NS``.
+    """
+    bytes_per_cube = int(bytes_per_pe) * PES_PER_CUBE
+    # Ceiling division; a zero-byte payload still counts as one packet.
+    n_packets = max(1, -(-bytes_per_cube // NOC_PACKET_BYTES))
+    return T_STARTUP_NS + (n_packets - 1) * TAU_NS
+
+
+def _plot_theoretical(ax, records):
+    """Draw the theoretical torus_2d curve at the payload sizes in *records*.
+
+    Does nothing when *records* contains no torus_2d rows.
+    """
+    torus_rs = sorted(
+        [r for r in records if r["sip_topology"] == "torus_2d"],
+        key=lambda r: r["bytes_per_pe"],
+    )
+    if not torus_rs:
+        return
+    ax.plot(
+        [r["bytes_per_pe"] for r in torus_rs],
+        [_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs],
+        color="tab:red", linestyle="--", linewidth=1.6, marker="x",
+        label="theoretical torus_2d (6 SIPs)",
+    )
+
+
+def _bytes_fmt(x, _pos):
+    """Format a byte count as a tick label with a K/M suffix (powers of 1024)."""
+    if x >= 1024 * 1024:
+        return f"{x / (1024 * 1024):.0f}M"
+    if x >= 1024:
+        return f"{x / 1024:.0f}K"
+    return f"{int(x)}"
+
+
+def _load_records():
+    """Read ``CSV_PATH`` into a list of dicts with typed fields.
+
+    Each dict has ``sip_topology`` (str), ``bytes_per_pe`` (int) and
+    ``latency_ns`` (float); other CSV columns are ignored.
+    """
+    with open(CSV_PATH, newline="", encoding="utf-8") as f:
+        return [
+            {
+                "sip_topology": row["sip_topology"],
+                "bytes_per_pe": int(row["bytes_per_pe"]),
+                "latency_ns": float(row["latency_ns"]),
+            }
+            for row in csv.DictReader(f)
+        ]
+
+
+def _ext_x(records):
+    """Return the x position for the external reference marker.
+
+    The marker is anchored at the largest ``bytes_per_pe`` in *records*.
+    """
+    return max(r["bytes_per_pe"] for r in records)
+
+
+def _plot_curves(ax, records, topologies):
+    """Plot one latency-vs-payload curve per topology, skipping empty ones."""
+    for topo in topologies:
+        rs = sorted([r for r in records if r["sip_topology"] == topo],
+                    key=lambda r: r["bytes_per_pe"])
+        if not rs:
+            continue
+        ax.plot(
+            [r["bytes_per_pe"] for r in rs],
+            [r["latency_ns"] for r in rs],
+            marker="o",
+            label=f"{topo}",
+            color=COLORS.get(topo),
+        )
+
+
+def emit_log(records):
+    """Write overview_log.png: all curves plus the reference on log-log axes."""
+    topologies = sorted({r["sip_topology"] for r in records})
+    fig, ax = plt.subplots(figsize=(9, 6))
+    _plot_curves(ax, records, topologies)
+    _plot_theoretical(ax, records)
+    ax.scatter(
+        [_ext_x(records)], [EXT_LATENCY_NS],
+        marker="*", s=220, color="tab:red", zorder=5,
+        label=EXT_LABEL,
+    )
+    ax.set_xscale("log", base=2)
+    ax.set_yscale("log")
+    ax.set_xlabel("Bytes per PE (log scale)")
+    ax.set_ylabel("Time (ns) — log scale")
+    ax.set_title("Multi-device allreduce latency vs external single-device reference")
+    ax.grid(True, which="both", alpha=0.3)
+    ax.xaxis.set_major_formatter(mticker.FuncFormatter(_bytes_fmt))
+    ax.legend(loc="upper left")
+    fig.tight_layout()
+    out = PLOT_DIR / "overview_log.png"
+    fig.savefig(out, dpi=120)
+    plt.close(fig)
+    print(f"wrote {out}")
+
+
+def emit_broken(records):
+    """Write overview_broken.png: linear curves below, external reference above.
+
+    The two stacked panels share the x-axis and are joined by diagonal
+    "break" marks so the reference can sit far above the measured curves.
+    """
+    topologies = sorted({r["sip_topology"] for r in records})
+    max_local = max(r["latency_ns"] for r in records)
+    # Top : bottom panel heights; also used to size the break marks below.
+    height_ratios = [1, 4]
+
+    fig, (ax_top, ax_bot) = plt.subplots(
+        2, 1, sharex=True,
+        gridspec_kw={"height_ratios": height_ratios, "hspace": 0.05},
+        figsize=(9, 6.5),
+    )
+
+    # Bottom panel: today's three curves + theoretical, linear y.
+    _plot_curves(ax_bot, records, topologies)
+    _plot_theoretical(ax_bot, records)
+    ax_bot.set_ylim(0, max_local * 1.10)
+
+    # Top panel: only the external reference marker, linear y around 366 µs.
+    ax_top.scatter(
+        [_ext_x(records)], [EXT_LATENCY_NS],
+        marker="*", s=240, color="tab:red", zorder=5,
+        label=EXT_LABEL,
+    )
+    ax_top.set_ylim(EXT_LATENCY_NS * 0.93, EXT_LATENCY_NS * 1.05)
+
+    # Hide the spine between the two panels and draw diagonal "break" ticks.
+    ax_top.spines["bottom"].set_visible(False)
+    ax_bot.spines["top"].set_visible(False)
+    ax_top.tick_params(labeltop=False, bottom=False)
+    ax_bot.xaxis.tick_bottom()
+
+    d = 0.012  # diagonal-tick size, in axis-fraction
+    # transAxes coordinates are fractions of each panel's own height, so the
+    # shorter top panel needs its vertical extent scaled up by the height
+    # ratio for both pairs of marks to have the same on-screen slope.
+    d_top = d * height_ratios[1] / height_ratios[0]
+    kw = dict(transform=ax_top.transAxes, color="k", clip_on=False, lw=1)
+    ax_top.plot((-d, +d), (-d_top, +d_top), **kw)
+    ax_top.plot((1 - d, 1 + d), (-d_top, +d_top), **kw)
+    kw.update(transform=ax_bot.transAxes)
+    ax_bot.plot((-d, +d), (1 - d, 1 + d), **kw)
+    ax_bot.plot((1 - d, 1 + d), (1 - d, 1 + d), **kw)
+
+    ax_bot.set_xscale("log", base=2)
+    ax_bot.set_xlabel("Bytes per PE (log scale)")
+    ax_bot.set_ylabel("Time (ns)")
+    ax_top.set_ylabel("Time (ns)")
+    ax_bot.grid(True, alpha=0.3)
+    ax_top.grid(True, alpha=0.3)
+    ax_bot.xaxis.set_major_formatter(mticker.FuncFormatter(_bytes_fmt))
+
+    # One legend covering both axes.
+    handles_bot, labels_bot = ax_bot.get_legend_handles_labels()
+    handles_top, labels_top = ax_top.get_legend_handles_labels()
+    ax_bot.legend(handles_bot + handles_top, labels_bot + labels_top,
+                  loc="upper left")
+
+    fig.suptitle("Multi-device allreduce latency vs external single-device reference (broken y-axis)")
+    fig.tight_layout()
+    out = PLOT_DIR / "overview_broken.png"
+    fig.savefig(out, dpi=120)
+    plt.close(fig)
+    print(f"wrote {out}")
+
+
+def main():
+    """Load the summary CSV and emit both overview variants."""
+    records = _load_records()
+    if not records:
+        raise SystemExit(f"no rows in {CSV_PATH}")
+    emit_log(records)
+    emit_broken(records)
+
+
+if __name__ == "__main__":
+    main()