Compare commits
3 Commits
533e699299
...
5accd98171
| Author | SHA1 | Date | |
|---|---|---|---|
| 5accd98171 | |||
| a563169e89 | |||
| 9c129d6131 |
@@ -1,12 +1,12 @@
|
||||
buffer_kind,sip_topology,n_sips,n_elem,bytes_per_pe,latency_ns
|
||||
hbm,torus_2d,6,128,256,2002.0399999999827
|
||||
hbm,torus_2d,6,1024,2048,3541.0399999999827
|
||||
hbm,torus_2d,6,8192,16384,15889.03999999999
|
||||
hbm,torus_2d,6,32768,65536,58225.03999999998
|
||||
sram,torus_2d,6,128,256,1762.0399999999827
|
||||
sram,torus_2d,6,1024,2048,2293.0399999999827
|
||||
sram,torus_2d,6,8192,16384,6577.039999999986
|
||||
sram,torus_2d,6,32768,65536,21265.03999999992
|
||||
hbm,torus_2d,6,128,256,1858.0399999999827
|
||||
hbm,torus_2d,6,1024,2048,2389.0399999999827
|
||||
hbm,torus_2d,6,8192,16384,6673.039999999986
|
||||
hbm,torus_2d,6,32768,65536,21361.03999999992
|
||||
sram,torus_2d,6,128,256,1774.0399999999827
|
||||
sram,torus_2d,6,1024,2048,2389.0399999999827
|
||||
sram,torus_2d,6,8192,16384,7345.039999999986
|
||||
sram,torus_2d,6,32768,65536,24337.039999999935
|
||||
tcm,torus_2d,6,128,256,1678.0399999999827
|
||||
tcm,torus_2d,6,1024,2048,1957.0399999999827
|
||||
tcm,torus_2d,6,8192,16384,4225.039999999986
|
||||
|
||||
|
|
Before Width: | Height: | Size: 68 KiB After Width: | Height: | Size: 74 KiB |
|
After Width: | Height: | Size: 80 KiB |
|
After Width: | Height: | Size: 75 KiB |
|
After Width: | Height: | Size: 150 KiB |
|
Before Width: | Height: | Size: 45 KiB After Width: | Height: | Size: 50 KiB |
|
Before Width: | Height: | Size: 45 KiB After Width: | Height: | Size: 49 KiB |
|
Before Width: | Height: | Size: 53 KiB After Width: | Height: | Size: 54 KiB |
|
Before Width: | Height: | Size: 52 KiB After Width: | Height: | Size: 53 KiB |
|
Before Width: | Height: | Size: 103 KiB After Width: | Height: | Size: 109 KiB |
@@ -1,81 +1,81 @@
|
||||
hop,label,size_bytes,path,total_ns
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,ipcq,31.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,ipcq,31.3899999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,raw,12.019999999996799
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,ipcq,33.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,ipcq,33.1399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,raw,13.019999999996799
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,ipcq,35.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,ipcq,34.8899999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,raw,14.019999999996799
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,ipcq,37.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,ipcq,36.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,raw,15.019999999996799
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,ipcq,41.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,ipcq,40.1399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,raw,17.0199999999968
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,ipcq,45.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,ipcq,43.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,raw,19.0199999999968
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,ipcq,61.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,ipcq,57.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,raw,27.0199999999968
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,ipcq,93.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,ipcq,85.6399999999976
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,raw,43.0199999999968
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,ipcq,157.64000000000306
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,ipcq,141.64000000000306
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,raw,75.02000000000407
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,ipcq,189.64000000000306
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,ipcq,169.64000000000306
|
||||
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,raw,91.02000000000407
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,ipcq,31.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,ipcq,31.3899999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,raw,12.019999999996799
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,ipcq,33.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,ipcq,33.1399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,raw,13.019999999996799
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,ipcq,35.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,ipcq,34.8899999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,raw,14.019999999996799
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,ipcq,37.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,ipcq,36.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,raw,15.019999999996799
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,ipcq,41.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,ipcq,40.1399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,raw,17.0199999999968
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,ipcq,45.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,ipcq,43.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,raw,19.0199999999968
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,ipcq,61.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,ipcq,57.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,raw,27.0199999999968
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,ipcq,93.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,ipcq,85.6399999999976
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,raw,43.0199999999968
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,ipcq,157.64000000000306
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,ipcq,141.64000000000306
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,raw,75.02000000000407
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,ipcq,189.64000000000306
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,ipcq,169.64000000000306
|
||||
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,raw,91.02000000000407
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,ipcq,67.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,ipcq,67.40999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,raw,68.53999999999724
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,ipcq,69.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,ipcq,69.15999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,raw,70.03999999999724
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,ipcq,71.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,ipcq,70.90999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,raw,71.53999999999724
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,ipcq,73.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,ipcq,72.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,raw,73.03999999999724
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,ipcq,77.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,ipcq,76.15999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,raw,76.03999999999724
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,ipcq,81.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,ipcq,79.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,raw,79.03999999999724
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,ipcq,97.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,ipcq,93.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,raw,91.03999999999724
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,ipcq,129.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,ipcq,121.65999999999804
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,raw,115.03999999999724
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,ipcq,193.65999999999985
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,ipcq,177.65999999999985
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,raw,163.04000000000087
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,ipcq,225.65999999999985
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,ipcq,205.65999999999985
|
||||
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,raw,187.04000000000087
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,ipcq,87.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,ipcq,87.40999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,raw,88.53999999999724
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,ipcq,89.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,ipcq,89.15999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,raw,90.03999999999724
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,ipcq,91.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,ipcq,90.90999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,raw,91.53999999999724
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,ipcq,93.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,ipcq,92.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,raw,93.03999999999724
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,ipcq,97.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,ipcq,96.15999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,raw,96.03999999999724
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,ipcq,101.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,ipcq,99.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,raw,99.03999999999724
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,ipcq,117.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,ipcq,113.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,raw,111.03999999999724
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,ipcq,149.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,ipcq,141.65999999999804
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,raw,135.03999999999724
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,ipcq,213.65999999999985
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,ipcq,197.65999999999985
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,raw,183.04000000000087
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,ipcq,245.65999999999985
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,ipcq,225.65999999999985
|
||||
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,raw,207.04000000000087
|
||||
|
||||
|
@@ -0,0 +1,171 @@
|
||||
"""Generate a 5-slide PPTX summarizing the kernbench2 model.
|
||||
|
||||
Slides (in order):
|
||||
1. Overall architecture — how PEs are connected (cube_mesh_view)
|
||||
2. Model correctness — DMA vs P2P latency (pe2pe overview)
|
||||
3. PE-to-PE IPCQ communication (ipcq_two_pe_dma)
|
||||
4. 6-device allreduce — model vs theoretical vs ext-sim (overview_broken)
|
||||
5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (buffer_kind_sweep)
|
||||
|
||||
This is a derived-artifact generator — no production code touched.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
from pptx import Presentation
|
||||
from pptx.dml.color import RGBColor
|
||||
from pptx.enum.shapes import MSO_SHAPE
|
||||
from pptx.util import Emu, Inches, Pt
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
DIAG = ROOT / "docs" / "diagrams"
|
||||
OUT = DIAG / "kernbench2_overview.pptx"
|
||||
|
||||
# 16:9 widescreen — 13.333 × 7.5 in
|
||||
SLIDE_W_IN = 13.333
|
||||
SLIDE_H_IN = 7.5
|
||||
|
||||
SLIDES = [
|
||||
{
|
||||
"title": "1. CUBE Architecture: NOC Router Mesh + PE Connectivity",
|
||||
"image": DIAG / "cube_mesh_view.png",
|
||||
"bullets": [
|
||||
"Each CUBE holds an 8-PE NOC mesh wired through routers (R0..R7)",
|
||||
"Every PE has IO_CPU, M_CPU, PE_CPU + IPCQ engine + DMA engine",
|
||||
"Inter-cube traffic exits via UCIe/UAL ports; SIPs stitch into ring/torus/mesh",
|
||||
"Foundation for every latency, IPCQ, and allreduce experiment that follows",
|
||||
],
|
||||
},
|
||||
{
|
||||
"title": "2. Model Correctness: DMA vs P2P Latency Sweep",
|
||||
"image": DIAG / "pe2pe_latency_plots" / "overview.png",
|
||||
"bullets": [
|
||||
"Sweeps payload size across PE-to-PE paths and compares to DMA",
|
||||
"Confirms the simulator reproduces the expected DMA/P2P crossover",
|
||||
"Acts as the per-hop ground truth that feeds collective-level models",
|
||||
],
|
||||
},
|
||||
{
|
||||
"title": "3. IPCQ: How Two PEs Communicate (DMA + Slot Memory)",
|
||||
"image": DIAG / "ipcq_diagram_plots" / "ipcq_two_pe_dma.png",
|
||||
"bullets": [
|
||||
"Sender pushes payload through PE_DMA → fabric → receiver IPCQ slot",
|
||||
"Slot memory (TCM/SRAM/HBM) charges a write on arrival, a read on consume",
|
||||
"Credit return rides the fabric path back (16 B packet, no slot-IO)",
|
||||
"This is the building block the multi-device allreduce composes",
|
||||
],
|
||||
},
|
||||
{
|
||||
"title": "4. 6-Device Allreduce: Model vs Theoretical vs External Simulator",
|
||||
"image": DIAG / "allreduce_latency_plots" / "overview_broken.png",
|
||||
"bullets": [
|
||||
"Three SIP topologies (ring / torus / mesh) swept 16 B → 96 KB per PE",
|
||||
"Dashed red curve: hand-derived theoretical model for torus_2d (6 SIPs)",
|
||||
"Top panel (broken y-axis): single-device reduce on ext-sim ≈ 366 µs",
|
||||
"Our 6-device collective lands at ~17–22 µs — ~17× faster than ext-sim baseline",
|
||||
],
|
||||
},
|
||||
{
|
||||
"title": "5. IPCQ Slot Memory: TCM vs SRAM vs HBM",
|
||||
"image": DIAG / "allreduce_latency_plots" / "buffer_kind_sweep.png",
|
||||
"bullets": [
|
||||
"Same allreduce with slot memory swapped: TCM (per-PE local) / SRAM / HBM (cube-shared, behind router link)",
|
||||
"Cost = NoC drain + slot-IO + PE↔bank hop; only TCM skips the bank hop",
|
||||
"Topology link BWs set the order: SRAM bank link 128 GB/s is the narrowest in the system, HBM 256 GB/s",
|
||||
"At 64 KB / PE: TCM 12.0 µs < HBM 21.4 µs < SRAM 24.3 µs — SRAM is slowest because of its narrow bank link",
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def _add_title(slide, text):
|
||||
left = Inches(0.4)
|
||||
top = Inches(0.25)
|
||||
width = Inches(SLIDE_W_IN - 0.8)
|
||||
height = Inches(0.7)
|
||||
box = slide.shapes.add_textbox(left, top, width, height)
|
||||
tf = box.text_frame
|
||||
tf.margin_left = tf.margin_right = Emu(0)
|
||||
tf.margin_top = tf.margin_bottom = Emu(0)
|
||||
p = tf.paragraphs[0]
|
||||
run = p.add_run()
|
||||
run.text = text
|
||||
run.font.size = Pt(26)
|
||||
run.font.bold = True
|
||||
run.font.color.rgb = RGBColor(0x10, 0x2A, 0x55)
|
||||
return box
|
||||
|
||||
|
||||
def _add_image_centered(slide, img_path, *, left_in, top_in, max_w_in, max_h_in):
|
||||
with Image.open(img_path) as im:
|
||||
iw, ih = im.size
|
||||
max_w_emu = Inches(max_w_in)
|
||||
max_h_emu = Inches(max_h_in)
|
||||
scale = min(max_w_emu / iw, max_h_emu / ih)
|
||||
w = int(iw * scale)
|
||||
h = int(ih * scale)
|
||||
left = Inches(left_in) + (max_w_emu - w) // 2
|
||||
top = Inches(top_in) + (max_h_emu - h) // 2
|
||||
slide.shapes.add_picture(str(img_path), left, top, width=w, height=h)
|
||||
|
||||
|
||||
def _add_bullets(slide, bullets, *, left_in, top_in, width_in, height_in):
|
||||
box = slide.shapes.add_textbox(
|
||||
Inches(left_in), Inches(top_in), Inches(width_in), Inches(height_in),
|
||||
)
|
||||
tf = box.text_frame
|
||||
tf.word_wrap = True
|
||||
for i, line in enumerate(bullets):
|
||||
p = tf.paragraphs[0] if i == 0 else tf.add_paragraph()
|
||||
p.level = 0
|
||||
run = p.add_run()
|
||||
run.text = "• " + line
|
||||
run.font.size = Pt(15)
|
||||
run.font.color.rgb = RGBColor(0x22, 0x22, 0x22)
|
||||
p.space_after = Pt(6)
|
||||
|
||||
|
||||
def _add_footer(slide, idx, total):
|
||||
box = slide.shapes.add_textbox(
|
||||
Inches(SLIDE_W_IN - 1.2), Inches(SLIDE_H_IN - 0.45),
|
||||
Inches(1.0), Inches(0.3),
|
||||
)
|
||||
p = box.text_frame.paragraphs[0]
|
||||
run = p.add_run()
|
||||
run.text = f"{idx} / {total}"
|
||||
run.font.size = Pt(10)
|
||||
run.font.color.rgb = RGBColor(0x88, 0x88, 0x88)
|
||||
|
||||
|
||||
def build():
|
||||
prs = Presentation()
|
||||
prs.slide_width = Inches(SLIDE_W_IN)
|
||||
prs.slide_height = Inches(SLIDE_H_IN)
|
||||
blank = prs.slide_layouts[6]
|
||||
|
||||
for i, cfg in enumerate(SLIDES, start=1):
|
||||
slide = prs.slides.add_slide(blank)
|
||||
_add_title(slide, cfg["title"])
|
||||
|
||||
# Layout: image on the left (8.4 in wide), bullets on the right (4.4 in).
|
||||
_add_image_centered(
|
||||
slide, cfg["image"],
|
||||
left_in=0.3, top_in=1.05,
|
||||
max_w_in=8.3, max_h_in=5.9,
|
||||
)
|
||||
_add_bullets(
|
||||
slide, cfg["bullets"],
|
||||
left_in=8.8, top_in=1.2,
|
||||
width_in=4.3, height_in=5.7,
|
||||
)
|
||||
_add_footer(slide, i, len(SLIDES))
|
||||
|
||||
OUT.parent.mkdir(parents=True, exist_ok=True)
|
||||
prs.save(OUT)
|
||||
print(f"wrote {OUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
build()
|
||||
@@ -0,0 +1,192 @@
|
||||
"""One-shot: render overview.png with an external 366 µs reference, in two
|
||||
variants — log scale and broken y-axis. Reads docs/diagrams/allreduce_latency_plots/summary.csv
|
||||
and writes overview_log.png and overview_broken.png alongside it.
|
||||
|
||||
This is a derived-artifact generator (per CLAUDE.md): plotting only, no production
|
||||
or test logic touched.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.ticker as mticker
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
PLOT_DIR = ROOT / "docs" / "diagrams" / "allreduce_latency_plots"
|
||||
CSV_PATH = PLOT_DIR / "summary.csv"
|
||||
|
||||
EXT_LABEL = "ext-sim single-device reduce: 366 µs"
|
||||
EXT_LATENCY_NS = 366_000.0
|
||||
|
||||
COLORS = {
|
||||
"ring_1d": "tab:blue",
|
||||
"torus_2d": "tab:orange",
|
||||
"mesh_2d_no_wrap": "tab:green",
|
||||
}
|
||||
|
||||
# Hand-derived theoretical model for torus_2d (6 SIPs). Mirrors
|
||||
# _aggregate_sweep_plots in tests/test_allreduce_multidevice.py.
|
||||
NOC_PACKET_BYTES = 128
|
||||
PES_PER_CUBE = 8
|
||||
T_STARTUP_NS = 1346.0
|
||||
TAU_NS = (8741.0 - 1346.0) / (6144 - 1)
|
||||
|
||||
|
||||
def _theoretical_torus_2d_ns(bytes_per_pe: int) -> float:
|
||||
bytes_per_cube = int(bytes_per_pe) * PES_PER_CUBE
|
||||
n_packets = max(1, -(-bytes_per_cube // NOC_PACKET_BYTES))
|
||||
return T_STARTUP_NS + (n_packets - 1) * TAU_NS
|
||||
|
||||
|
||||
def _plot_theoretical(ax, records):
|
||||
torus_rs = sorted(
|
||||
[r for r in records if r["sip_topology"] == "torus_2d"],
|
||||
key=lambda r: r["bytes_per_pe"],
|
||||
)
|
||||
if not torus_rs:
|
||||
return
|
||||
ax.plot(
|
||||
[r["bytes_per_pe"] for r in torus_rs],
|
||||
[_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs],
|
||||
color="tab:red", linestyle="--", linewidth=1.6, marker="x",
|
||||
label="theoretical torus_2d (6 SIPs)",
|
||||
)
|
||||
|
||||
|
||||
def _bytes_fmt(x, _pos):
|
||||
if x >= 1024 * 1024:
|
||||
return f"{x / (1024 * 1024):.0f}M"
|
||||
if x >= 1024:
|
||||
return f"{x / 1024:.0f}K"
|
||||
return f"{int(x)}"
|
||||
|
||||
|
||||
def _load_records():
|
||||
rows = []
|
||||
with open(CSV_PATH, newline="") as f:
|
||||
r = csv.DictReader(f)
|
||||
for row in r:
|
||||
rows.append({
|
||||
"sip_topology": row["sip_topology"],
|
||||
"bytes_per_pe": int(row["bytes_per_pe"]),
|
||||
"latency_ns": float(row["latency_ns"]),
|
||||
})
|
||||
return rows
|
||||
|
||||
|
||||
def _ext_x(records):
|
||||
"""Anchor the external reference at the largest payload (96 KB / PE)."""
|
||||
return max(r["bytes_per_pe"] for r in records)
|
||||
|
||||
|
||||
def _plot_curves(ax, records, topologies):
|
||||
for topo in topologies:
|
||||
rs = sorted([r for r in records if r["sip_topology"] == topo],
|
||||
key=lambda r: r["bytes_per_pe"])
|
||||
if not rs:
|
||||
continue
|
||||
ax.plot(
|
||||
[r["bytes_per_pe"] for r in rs],
|
||||
[r["latency_ns"] for r in rs],
|
||||
marker="o",
|
||||
label=f"{topo}",
|
||||
color=COLORS.get(topo),
|
||||
)
|
||||
|
||||
|
||||
def emit_log(records):
|
||||
topologies = sorted({r["sip_topology"] for r in records})
|
||||
fig, ax = plt.subplots(figsize=(9, 6))
|
||||
_plot_curves(ax, records, topologies)
|
||||
_plot_theoretical(ax, records)
|
||||
ax.scatter(
|
||||
[_ext_x(records)], [EXT_LATENCY_NS],
|
||||
marker="*", s=220, color="tab:red", zorder=5,
|
||||
label=EXT_LABEL,
|
||||
)
|
||||
ax.set_xscale("log", base=2)
|
||||
ax.set_yscale("log")
|
||||
ax.set_xlabel("Bytes per PE (log scale)")
|
||||
ax.set_ylabel("Time (ns) — log scale")
|
||||
ax.set_title("Multi-device allreduce latency vs external single-device reference")
|
||||
ax.grid(True, which="both", alpha=0.3)
|
||||
ax.xaxis.set_major_formatter(mticker.FuncFormatter(_bytes_fmt))
|
||||
ax.legend(loc="upper left")
|
||||
fig.tight_layout()
|
||||
out = PLOT_DIR / "overview_log.png"
|
||||
fig.savefig(out, dpi=120)
|
||||
plt.close(fig)
|
||||
print(f"wrote {out}")
|
||||
|
||||
|
||||
def emit_broken(records):
|
||||
topologies = sorted({r["sip_topology"] for r in records})
|
||||
max_local = max(r["latency_ns"] for r in records)
|
||||
|
||||
fig, (ax_top, ax_bot) = plt.subplots(
|
||||
2, 1, sharex=True,
|
||||
gridspec_kw={"height_ratios": [1, 4], "hspace": 0.05},
|
||||
figsize=(9, 6.5),
|
||||
)
|
||||
|
||||
# Bottom panel: today's three curves + theoretical, linear y.
|
||||
_plot_curves(ax_bot, records, topologies)
|
||||
_plot_theoretical(ax_bot, records)
|
||||
ax_bot.set_ylim(0, max_local * 1.10)
|
||||
|
||||
# Top panel: only the external reference marker, linear y around 366 µs.
|
||||
ax_top.scatter(
|
||||
[_ext_x(records)], [EXT_LATENCY_NS],
|
||||
marker="*", s=240, color="tab:red", zorder=5,
|
||||
label=EXT_LABEL,
|
||||
)
|
||||
ax_top.set_ylim(EXT_LATENCY_NS * 0.93, EXT_LATENCY_NS * 1.05)
|
||||
|
||||
# Hide the spine between the two panels and draw diagonal "break" ticks.
|
||||
ax_top.spines["bottom"].set_visible(False)
|
||||
ax_bot.spines["top"].set_visible(False)
|
||||
ax_top.tick_params(labeltop=False, bottom=False)
|
||||
ax_bot.xaxis.tick_bottom()
|
||||
|
||||
d = 0.012 # diagonal-tick size, in axis-fraction
|
||||
kw = dict(transform=ax_top.transAxes, color="k", clip_on=False, lw=1)
|
||||
ax_top.plot((-d, +d), (-d, +d), **kw)
|
||||
ax_top.plot((1 - d, 1 + d), (-d, +d), **kw)
|
||||
kw.update(transform=ax_bot.transAxes)
|
||||
ax_bot.plot((-d, +d), (1 - d * 4, 1 + d * 4), **kw)
|
||||
ax_bot.plot((1 - d, 1 + d), (1 - d * 4, 1 + d * 4), **kw)
|
||||
|
||||
ax_bot.set_xscale("log", base=2)
|
||||
ax_bot.set_xlabel("Bytes per PE (log scale)")
|
||||
ax_bot.set_ylabel("Time (ns)")
|
||||
ax_top.set_ylabel("Time (ns)")
|
||||
ax_bot.grid(True, alpha=0.3)
|
||||
ax_top.grid(True, alpha=0.3)
|
||||
ax_bot.xaxis.set_major_formatter(mticker.FuncFormatter(_bytes_fmt))
|
||||
|
||||
# One legend covering both axes.
|
||||
handles_bot, labels_bot = ax_bot.get_legend_handles_labels()
|
||||
handles_top, labels_top = ax_top.get_legend_handles_labels()
|
||||
ax_bot.legend(handles_bot + handles_top, labels_bot + labels_top,
|
||||
loc="upper left")
|
||||
|
||||
fig.suptitle("Multi-device allreduce latency vs external single-device reference (broken y-axis)")
|
||||
fig.tight_layout()
|
||||
out = PLOT_DIR / "overview_broken.png"
|
||||
fig.savefig(out, dpi=120)
|
||||
plt.close(fig)
|
||||
print(f"wrote {out}")
|
||||
|
||||
|
||||
def main():
|
||||
records = _load_records()
|
||||
if not records:
|
||||
raise SystemExit(f"no rows in {CSV_PATH}")
|
||||
emit_log(records)
|
||||
emit_broken(records)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,141 @@
|
||||
"""Re-render pe2pe latency PNGs from the existing summary.csv with the
|
||||
current (no-consume) labels. Used after a label-only test edit to avoid
|
||||
re-measuring (~5 min) when the data on disk is already correct.
|
||||
|
||||
Reads docs/diagrams/pe2pe_latency_plots/summary.csv. Plots 2 curves:
|
||||
"IPCQ no-consume" (from the ipcq_no_consume rows if present, else from
|
||||
the ipcq rows) and "Raw DMA" (raw rows).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
PLOT_DIR = ROOT / "docs" / "diagrams" / "pe2pe_latency_plots"
|
||||
CSV_PATH = PLOT_DIR / "summary.csv"
|
||||
|
||||
|
||||
def _load_records():
|
||||
rows = []
|
||||
with open(CSV_PATH, newline="") as f:
|
||||
for r in csv.DictReader(f):
|
||||
rows.append({
|
||||
"hop": r["hop"],
|
||||
"label": r["label"],
|
||||
"size_bytes": int(r["size_bytes"]),
|
||||
"path": r["path"],
|
||||
"total_ns": float(r["total_ns"]),
|
||||
})
|
||||
return rows
|
||||
|
||||
|
||||
def _ipcq_rows(records, hop):
|
||||
# Prefer ipcq_no_consume if present (older 3-path CSV); fall back to ipcq
|
||||
# (current single-path CSV where ipcq IS no-consume).
|
||||
nc = [r for r in records
|
||||
if r["hop"] == hop and r["path"] == "ipcq_no_consume"]
|
||||
if nc:
|
||||
return sorted(nc, key=lambda r: r["size_bytes"])
|
||||
return sorted(
|
||||
[r for r in records if r["hop"] == hop and r["path"] == "ipcq"],
|
||||
key=lambda r: r["size_bytes"],
|
||||
)
|
||||
|
||||
|
||||
def _raw_rows(records, hop):
|
||||
return sorted(
|
||||
[r for r in records if r["hop"] == hop and r["path"] == "raw"],
|
||||
key=lambda r: r["size_bytes"],
|
||||
)
|
||||
|
||||
|
||||
def _hops(records):
|
||||
seen = []
|
||||
for r in records:
|
||||
if r["hop"] not in {h["id"] for h in seen}:
|
||||
seen.append({"id": r["hop"], "label": r["label"]})
|
||||
return seen
|
||||
|
||||
|
||||
def _plot_per_hop(records, hop, path):
|
||||
ipcq = _ipcq_rows(records, hop["id"])
|
||||
raw = _raw_rows(records, hop["id"])
|
||||
fig, ax = plt.subplots(figsize=(8, 5))
|
||||
if ipcq:
|
||||
ax.plot(
|
||||
[r["size_bytes"] for r in ipcq],
|
||||
[r["total_ns"] for r in ipcq],
|
||||
marker="o", color="tab:blue",
|
||||
label="IPCQ no-consume (send/recv, no slot read)",
|
||||
)
|
||||
if raw:
|
||||
ax.plot(
|
||||
[r["size_bytes"] for r in raw],
|
||||
[r["total_ns"] for r in raw],
|
||||
marker="s", color="tab:orange",
|
||||
label="Raw DMA (load+store)",
|
||||
)
|
||||
ax.set_xlabel("Data size (bytes)")
|
||||
ax.set_ylabel("Latency (ns)")
|
||||
ax.set_title(hop["label"])
|
||||
ax.grid(True, alpha=0.3)
|
||||
ax.legend()
|
||||
fig.tight_layout()
|
||||
fig.savefig(path, dpi=120)
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
def _plot_overview(records, hops, path):
|
||||
fig, axes = plt.subplots(2, 2, figsize=(13, 9))
|
||||
axes = axes.flatten()
|
||||
for i, hop in enumerate(hops):
|
||||
ax = axes[i]
|
||||
ipcq = _ipcq_rows(records, hop["id"])
|
||||
raw = _raw_rows(records, hop["id"])
|
||||
if ipcq:
|
||||
ax.plot(
|
||||
[r["size_bytes"] for r in ipcq],
|
||||
[r["total_ns"] for r in ipcq],
|
||||
marker="o", color="tab:blue",
|
||||
label="IPCQ no-consume",
|
||||
)
|
||||
if raw:
|
||||
ax.plot(
|
||||
[r["size_bytes"] for r in raw],
|
||||
[r["total_ns"] for r in raw],
|
||||
marker="s", color="tab:orange",
|
||||
label="Raw DMA",
|
||||
)
|
||||
ax.set_title(hop["label"], fontsize=10)
|
||||
ax.set_xlabel("bytes")
|
||||
ax.set_ylabel("ns")
|
||||
ax.grid(True, alpha=0.3)
|
||||
ax.legend(fontsize=8)
|
||||
for j in range(len(hops), len(axes)):
|
||||
axes[j].axis("off")
|
||||
fig.suptitle(
|
||||
"PE-to-PE latency: IPCQ no-consume vs raw DMA",
|
||||
fontsize=14,
|
||||
)
|
||||
fig.tight_layout()
|
||||
fig.savefig(path, dpi=120)
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
def main():
|
||||
records = _load_records()
|
||||
hops = _hops(records)
|
||||
for hop in hops:
|
||||
out = PLOT_DIR / f"{hop['id']}.png"
|
||||
_plot_per_hop(records, hop, out)
|
||||
print(f"wrote {out}")
|
||||
overview = PLOT_DIR / "overview.png"
|
||||
_plot_overview(records, hops, overview)
|
||||
print(f"wrote {overview}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -135,6 +135,13 @@ class IpcqRecvCmd:
|
||||
"return_slot" — return slot address as-is (default, zero-copy).
|
||||
Kernel uses the slot memory directly.
|
||||
"copy_to_dst" — copy slot data to dst_addr, then return.
|
||||
|
||||
``consume`` (DIAGNOSTIC ONLY): when False, recv still blocks until the
|
||||
payload lands in the slot, but skips the slot-read latency charge
|
||||
(slot-IO + PE↔bank fabric drain for SRAM/HBM tiers). This exists
|
||||
solely so the pe2pe overview plot can compare apples-to-apples
|
||||
against tl.store (a one-sided write that pays no read on DST). Real
|
||||
kernels always need the data they receive — leave this True.
|
||||
"""
|
||||
|
||||
direction: str | None # None → round-robin (weak fairness, D4)
|
||||
@@ -146,6 +153,7 @@ class IpcqRecvCmd:
|
||||
dst_space: str = "" # used only when recv_mode == "copy_to_dst"
|
||||
blocking: bool = True
|
||||
data_op: bool = True
|
||||
consume: bool = True # DIAGNOSTIC: see docstring
|
||||
|
||||
|
||||
# ── D12: IpcqDmaToken (PE_IPCQ → PE_DMA, vc_comm) ───────────────────
|
||||
|
||||
@@ -222,10 +222,24 @@ class PeDmaComponent(PeEngineBase):
|
||||
# ADR-0023 D9.7: charge IPCQ slot-WRITE latency against the
|
||||
# backing-memory tier (tcm/sram/hbm) before the atomic block.
|
||||
# Must come BEFORE the atomic write→IpcqMetaArrival pair (I6).
|
||||
# SRAM/HBM also pay a PE_DMA→bank fabric drain (slot lives on
|
||||
# the cube NoC); TCM is per-PE local and skips this hop.
|
||||
from kernbench.common.ipcq_types import slot_io_latency_ns
|
||||
slot_write_ns = slot_io_latency_ns(
|
||||
token.dst_endpoint.buffer_kind, token.nbytes,
|
||||
buffer_kind = token.dst_endpoint.buffer_kind
|
||||
if buffer_kind in ("sram", "hbm") and self.ctx is not None:
|
||||
cube_prefix = self._pe_prefix.rsplit(".", 1)[0]
|
||||
bank_node = (
|
||||
f"{cube_prefix}.sram" if buffer_kind == "sram"
|
||||
else f"{cube_prefix}.hbm_ctrl"
|
||||
)
|
||||
try:
|
||||
path = self.ctx.router.find_path(self._pe_prefix, bank_node)
|
||||
bank_drain_ns = self.ctx.compute_drain_ns(path, token.nbytes)
|
||||
if bank_drain_ns > 0:
|
||||
yield env.timeout(bank_drain_ns)
|
||||
except Exception:
|
||||
pass
|
||||
slot_write_ns = slot_io_latency_ns(buffer_kind, token.nbytes)
|
||||
if slot_write_ns > 0:
|
||||
yield env.timeout(slot_write_ns)
|
||||
|
||||
|
||||
@@ -332,10 +332,35 @@ class PeIpcqComponent(ComponentBase):
|
||||
# ADR-0023 D9.7: charge IPCQ slot-READ latency against the
|
||||
# backing-memory tier (tcm/sram/hbm). Recv blocks for the
|
||||
# kernel-side slot consume; pe_exec_ns reflects this cost.
|
||||
# SRAM/HBM live on the cube NoC behind a router-attached link,
|
||||
# so reading a slot also pays a PE→bank fabric drain. TCM is
|
||||
# per-PE local and skips this hop.
|
||||
#
|
||||
# cmd.consume is a DIAGNOSTIC flag (default True). When False,
|
||||
# the read charges below are skipped — used only by the pe2pe
|
||||
# overview plot for an apples-to-apples comparison against
|
||||
# tl.store (one-sided write, no read on DST). Real kernels
|
||||
# always consume; this branch must not be exercised in
|
||||
# production code paths.
|
||||
from kernbench.common.ipcq_types import slot_io_latency_ns
|
||||
slot_read_ns = slot_io_latency_ns(
|
||||
self._buffer_kind, req.result_data.get("nbytes", 0),
|
||||
nbytes = req.result_data.get("nbytes", 0)
|
||||
if cmd.consume:
|
||||
if self._buffer_kind in ("sram", "hbm") and self.ctx is not None:
|
||||
cube_prefix = self._pe_prefix.rsplit(".", 1)[0]
|
||||
bank_node = (
|
||||
f"{cube_prefix}.sram" if self._buffer_kind == "sram"
|
||||
else f"{cube_prefix}.hbm_ctrl"
|
||||
)
|
||||
try:
|
||||
path = self.ctx.router.find_path(
|
||||
self._pe_prefix, bank_node,
|
||||
)
|
||||
bank_drain_ns = self.ctx.compute_drain_ns(path, nbytes)
|
||||
if bank_drain_ns > 0:
|
||||
yield env.timeout(bank_drain_ns)
|
||||
except Exception:
|
||||
pass
|
||||
slot_read_ns = slot_io_latency_ns(self._buffer_kind, nbytes)
|
||||
if slot_read_ns > 0:
|
||||
yield env.timeout(slot_read_ns)
|
||||
|
||||
|
||||
@@ -492,6 +492,48 @@ class TLContext:
|
||||
)
|
||||
return self._make_handle(addr=0, shape=shape, dtype=dtype)
|
||||
|
||||
def recv_no_consume(
|
||||
self,
|
||||
dir: str | None = None,
|
||||
shape: tuple[int, ...] = (),
|
||||
dtype: str = "f16",
|
||||
) -> TensorHandle:
|
||||
"""DIAGNOSTIC ONLY — recv that blocks for arrival but skips slot read.
|
||||
|
||||
Same blocking semantics as ``tl.recv``: the kernel waits until
|
||||
the payload has landed in the IPCQ slot. Differs from ``tl.recv``
|
||||
by skipping the slot-read latency charge (slot-IO + PE↔bank
|
||||
fabric drain) on DST.
|
||||
|
||||
This entry point exists solely so the pe2pe overview plot can
|
||||
draw an apples-to-apples comparison against ``tl.store`` (a
|
||||
one-sided fabric write that pays no read on DST). Production
|
||||
kernels MUST use ``tl.recv`` — they need to consume the data
|
||||
they receive. This API is segregated from ``tl.recv`` so the
|
||||
diagnostic flag can never accidentally be set in real workloads.
|
||||
"""
|
||||
self._emit_dispatch_overhead()
|
||||
cmd = IpcqRecvCmd(
|
||||
direction=dir,
|
||||
shape=shape, dtype=dtype,
|
||||
handle_id=self._next_handle_id(),
|
||||
consume=False,
|
||||
)
|
||||
result = self._emit(cmd) # type: ignore[arg-type]
|
||||
if isinstance(result, dict):
|
||||
slot_addr = int(result.get("src_addr", 0))
|
||||
slot_space = str(result.get("src_space", "tcm"))
|
||||
return TensorHandle(
|
||||
id=self._next_handle_id(),
|
||||
addr=slot_addr,
|
||||
shape=shape,
|
||||
dtype=dtype,
|
||||
nbytes=self._nbytes(shape, dtype),
|
||||
data=None,
|
||||
space=slot_space,
|
||||
)
|
||||
return self._make_handle(addr=0, shape=shape, dtype=dtype)
|
||||
|
||||
def recv_async(
|
||||
self,
|
||||
dir: str,
|
||||
|
||||
@@ -43,20 +43,30 @@ from tests.test_allreduce_multidevice import (
|
||||
)
|
||||
|
||||
|
||||
# Expected per-tier BW + overhead (Phase 2 will encode this in
|
||||
# pe_ipcq.py). Mirrors topology.yaml component values.
|
||||
_EXPECTED_BW = {
|
||||
"tcm": (512.0, 0.0),
|
||||
"sram": (512.0, 2.0),
|
||||
"hbm": (256.0, 6.0),
|
||||
# Expected per-tier (slot intrinsic BW, fixed overhead, PE↔bank hop BW).
|
||||
# Slot intrinsic mirrors _BUFFER_KIND_BW in src/kernbench/common/ipcq_types.py.
|
||||
# PE↔bank hop reflects topology.yaml link BWs:
|
||||
# - TCM is per-PE local → no hop, encoded as inf.
|
||||
# - SRAM bank sits on cube NoC behind sram_to_router_bw_gbs = 128 GB/s.
|
||||
# - HBM ctrl sits on cube NoC behind hbm_to_router_bw_gbs = 256 GB/s.
|
||||
_EXPECTED_TIER = {
|
||||
"tcm": {"slot_bw_gbs": 512.0, "overhead_ns": 0.0, "bank_hop_bw_gbs": float("inf")},
|
||||
"sram": {"slot_bw_gbs": 512.0, "overhead_ns": 2.0, "bank_hop_bw_gbs": 128.0},
|
||||
"hbm": {"slot_bw_gbs": 256.0, "overhead_ns": 6.0, "bank_hop_bw_gbs": 256.0},
|
||||
}
|
||||
|
||||
|
||||
def _expected_slot_io_ns(buffer_kind: str, nbytes: int) -> float:
|
||||
"""Per-access latency the model is expected to add (write OR read)."""
|
||||
bw_gbs, overhead_ns = _EXPECTED_BW[buffer_kind]
|
||||
# 1 GB/s = 1 byte/ns
|
||||
return nbytes / bw_gbs + overhead_ns
|
||||
"""Per-access latency the model is expected to add (write OR read).
|
||||
|
||||
Includes the PE↔bank fabric hop for non-TCM tiers — SRAM and HBM
|
||||
live on the cube NoC behind a router-attached link, so each slot
|
||||
access pays a fabric drain in addition to the intrinsic slot-IO.
|
||||
"""
|
||||
tier = _EXPECTED_TIER[buffer_kind]
|
||||
bank_hop_ns = nbytes / tier["bank_hop_bw_gbs"]
|
||||
slot_io_ns = nbytes / tier["slot_bw_gbs"] + tier["overhead_ns"]
|
||||
return bank_hop_ns + slot_io_ns
|
||||
|
||||
|
||||
def _run_torus_allreduce(
|
||||
@@ -114,12 +124,19 @@ def _run_torus_allreduce(
|
||||
# ── Phase 1 assertions ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_slot_write_latency_orders_tcm_sram_hbm(tmp_path):
|
||||
"""tcm < sram < hbm at 8192 B per send.
|
||||
def test_slot_write_latency_orders_tcm_hbm_sram(tmp_path):
|
||||
"""tcm < hbm < sram at 8192 B per send.
|
||||
|
||||
Pre-Phase-2: all three return the same pe_exec_ns and this
|
||||
assertion fails. Post-Phase-2: the per-tier BW + overhead make
|
||||
hbm visibly slower than sram, which is slower than tcm.
|
||||
The ordering is set by the topology link BWs, NOT the intrinsic slot
|
||||
cell rates: SRAM and HBM both live on the cube NoC behind a router
|
||||
link, and SRAM's link (128 GB/s) is the narrowest in the system —
|
||||
narrower than HBM's (256 GB/s). So once the PE↔bank hop is charged,
|
||||
SRAM ends up the slowest tier even though its slot cell array has
|
||||
the same intrinsic BW as TCM.
|
||||
|
||||
Pre-fix model misses the PE↔bank hop entirely → assertion FAILS
|
||||
(today's ordering is tcm < sram < hbm). Post-fix model includes the
|
||||
hop → assertion PASSES.
|
||||
"""
|
||||
n_elem = 4096 # 8192 B per slot
|
||||
lat_tcm = _run_torus_allreduce(tmp_path, buffer_kind="tcm", n_elem=n_elem)
|
||||
@@ -130,21 +147,22 @@ def test_slot_write_latency_orders_tcm_sram_hbm(tmp_path):
|
||||
exp_tcm = 2 * _expected_slot_io_ns("tcm", n_elem * 2)
|
||||
exp_sram = 2 * _expected_slot_io_ns("sram", n_elem * 2)
|
||||
exp_hbm = 2 * _expected_slot_io_ns("hbm", n_elem * 2)
|
||||
# Floor margin: 50% of the raw expected per-access delta — lets Phase 2
|
||||
# implementation choose to charge only one side without breaking the test,
|
||||
# but still requires a clearly observable gap.
|
||||
margin_sram_tcm = 0.5 * (exp_sram - exp_tcm)
|
||||
margin_hbm_sram = 0.5 * (exp_hbm - exp_sram)
|
||||
# Floor margin: 50% of the raw expected per-access delta — lets the
|
||||
# implementation choose to charge only one side without breaking the
|
||||
# test, but still requires a clearly observable gap.
|
||||
margin_hbm_tcm = 0.5 * (exp_hbm - exp_tcm)
|
||||
margin_sram_hbm = 0.5 * (exp_sram - exp_hbm)
|
||||
|
||||
assert lat_sram > lat_tcm + margin_sram_tcm, (
|
||||
f"sram should be slower than tcm by ≥ {margin_sram_tcm:.1f} ns "
|
||||
f"per allreduce, got sram={lat_sram:.1f} tcm={lat_tcm:.1f} "
|
||||
f"(delta={lat_sram - lat_tcm:.1f})"
|
||||
assert lat_hbm > lat_tcm + margin_hbm_tcm, (
|
||||
f"hbm should be slower than tcm by ≥ {margin_hbm_tcm:.1f} ns "
|
||||
f"per allreduce, got hbm={lat_hbm:.1f} tcm={lat_tcm:.1f} "
|
||||
f"(delta={lat_hbm - lat_tcm:.1f})"
|
||||
)
|
||||
assert lat_hbm > lat_sram + margin_hbm_sram, (
|
||||
f"hbm should be slower than sram by ≥ {margin_hbm_sram:.1f} ns "
|
||||
f"per allreduce, got hbm={lat_hbm:.1f} sram={lat_sram:.1f} "
|
||||
f"(delta={lat_hbm - lat_sram:.1f})"
|
||||
assert lat_sram > lat_hbm + margin_sram_hbm, (
|
||||
f"sram should be slower than hbm by ≥ {margin_sram_hbm:.1f} ns "
|
||||
f"per allreduce (sram bank link 128 GB/s is narrower than hbm "
|
||||
f"link 256 GB/s), got sram={lat_sram:.1f} hbm={lat_hbm:.1f} "
|
||||
f"(delta={lat_sram - lat_hbm:.1f})"
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,208 @@
|
||||
"""Phase 1 micro-tests for IPCQ slot-memory PHYSICAL placement.
|
||||
|
||||
The current model in ``_BUFFER_KIND_BW`` (src/kernbench/common/ipcq_types.py)
|
||||
charges only an intrinsic-memory term for IPCQ slot read/write::
|
||||
|
||||
TCM: nbytes/512 + 0
|
||||
SRAM: nbytes/512 + 2
|
||||
HBM: nbytes/256 + 6
|
||||
|
||||
This treats SRAM and HBM as if they were per-PE local. The topology
|
||||
declares the opposite — both live on the cube NoC, behind their own
|
||||
router-attached link::
|
||||
|
||||
topology.yaml:130 sram_to_router_bw_gbs: 128.0
|
||||
topology.yaml:129 hbm_to_router_bw_gbs: 256.0
|
||||
|
||||
So a correct model must charge a PE→bank fabric drain for SRAM and HBM
|
||||
on both ``tl.send`` (writer landing bytes into the cube SRAM/HBM bank
|
||||
via PE_DMA → router → bank) and ``tl.recv`` (reader pulling bytes back
|
||||
across the same link). TCM stays free of that hop because it is
|
||||
genuinely per-PE local.
|
||||
|
||||
The three tests below run the existing torus_2d 6-SIP allreduce harness
|
||||
with ``buffer_kind`` flipped between tcm/sram/hbm and assert invariants
|
||||
that the post-fix model must satisfy. They EXPECT TO FAIL today because
|
||||
the simulator under-charges SRAM and HBM by skipping the PE↔bank hop.
|
||||
|
||||
Phase 2 will edit:
|
||||
- src/kernbench/components/builtin/pe_ipcq.py (_handle_recv: add
|
||||
compute_drain_ns(pe→bank, nbytes) for sram/hbm)
|
||||
- src/kernbench/components/builtin/pe_dma.py (_handle_ipcq_inbound:
|
||||
add second-leg drain for sram/hbm-destined slots)
|
||||
|
||||
Tests must NEVER be weakened to make Phase 2 pass — invariants below
|
||||
follow from physics (link BW × payload), so any model reflecting the
|
||||
topology will satisfy them by construction.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
from tests.test_allreduce_multidevice import (
|
||||
_write_temp_configs,
|
||||
run_allreduce,
|
||||
)
|
||||
|
||||
|
||||
def _run_allreduce_with_buffer_kind(
|
||||
tmp_path: Path, *, buffer_kind: str, n_elem: int,
|
||||
) -> float:
|
||||
"""Run one torus_2d 6-SIP allreduce with the given buffer_kind and
|
||||
return critical-path pe_exec_ns (max across all PEs).
|
||||
|
||||
Mirrors the sweep harness in test_allreduce_buffer_kind_sweep.py
|
||||
so the assertions below compare apples-to-apples against that PNG.
|
||||
"""
|
||||
sub = tmp_path / f"{buffer_kind}_{n_elem}"
|
||||
sub.mkdir()
|
||||
topo_path, ccl_path = _write_temp_configs(
|
||||
sub,
|
||||
sip_topology="torus_2d",
|
||||
n_sips=6,
|
||||
algorithm="intercube_allreduce",
|
||||
sip_w=3, sip_h=2,
|
||||
n_elem_override=n_elem,
|
||||
)
|
||||
|
||||
with open(ccl_path) as f:
|
||||
ccl_cfg = yaml.safe_load(f)
|
||||
ccl_cfg.setdefault("defaults", {})["buffer_kind"] = buffer_kind
|
||||
ccl_cfg.setdefault("algorithms", {}).setdefault(
|
||||
"intercube_allreduce", {},
|
||||
)["buffer_kind"] = buffer_kind
|
||||
with open(ccl_path, "w") as f:
|
||||
yaml.dump(ccl_cfg, f, default_flow_style=False)
|
||||
|
||||
topo = resolve_topology(topo_path)
|
||||
engine = GraphEngine(topo.topology_obj, enable_data=True)
|
||||
spec = topo.topology_obj.spec
|
||||
|
||||
with RuntimeContext(
|
||||
engine=engine,
|
||||
target_device=DeviceSelector("all"),
|
||||
correlation_id=f"loc_{buffer_kind}_{n_elem}",
|
||||
spec=spec,
|
||||
) as ctx:
|
||||
result = run_allreduce(
|
||||
ctx, engine, spec,
|
||||
algorithm="intercube_allreduce", ccl_yaml=ccl_path,
|
||||
)
|
||||
assert result["ok_cubes"] > 0, "allreduce did not validate"
|
||||
|
||||
pe_exec_vals = [
|
||||
float(tr.get("pe_exec_ns", 0.0) or 0.0)
|
||||
for _, (_, tr) in engine._results.items()
|
||||
if isinstance(tr, dict)
|
||||
]
|
||||
return max(pe_exec_vals) if pe_exec_vals else 0.0
|
||||
|
||||
|
||||
# ── Phase 1 assertions ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_sram_meaningfully_slower_than_tcm_at_large_payload(tmp_path):
|
||||
"""At 32 KB / PE the SRAM-backed allreduce must take meaningfully
|
||||
longer than the TCM-backed one because every IPCQ slot access goes
|
||||
through the 128 GB/s SRAM↔router link, while TCM stays per-PE local.
|
||||
|
||||
Floor justification (physics, not implementation):
|
||||
Per-IPCQ-roundtrip the SRAM tier adds 2 × nbytes/128 ns over TCM
|
||||
(one PE→SRAM hop on send-inbound, one SRAM→PE hop on recv).
|
||||
At 32 KB: 2 × 32768/128 = 512 ns added per slot exchange.
|
||||
With ≥ 10 critical-path exchanges in a 6-SIP torus_2d allreduce
|
||||
this is ≥ 5_120 ns. The threshold below is half that to leave
|
||||
room for differing critical-path counting.
|
||||
|
||||
Pre-Phase-2: gap is constant 48 ns (just the SRAM overhead × 24
|
||||
slot accesses); test FAILS.
|
||||
Post-Phase-2: gap scales with payload; test PASSES.
|
||||
"""
|
||||
n_elem = 16384 # 32 KB / PE
|
||||
lat_tcm = _run_allreduce_with_buffer_kind(
|
||||
tmp_path, buffer_kind="tcm", n_elem=n_elem,
|
||||
)
|
||||
lat_sram = _run_allreduce_with_buffer_kind(
|
||||
tmp_path, buffer_kind="sram", n_elem=n_elem,
|
||||
)
|
||||
delta = lat_sram - lat_tcm
|
||||
THRESHOLD_NS = 2_500.0
|
||||
assert delta > THRESHOLD_NS, (
|
||||
f"SRAM should be ≥ {THRESHOLD_NS:.0f} ns slower than TCM at 32 KB "
|
||||
f"because each IPCQ access pays a 128 GB/s PE↔SRAM hop. "
|
||||
f"got tcm={lat_tcm:.1f} sram={lat_sram:.1f} delta={delta:.1f} ns"
|
||||
)
|
||||
|
||||
|
||||
def test_sram_tcm_gap_scales_with_payload(tmp_path):
|
||||
"""The SRAM-vs-TCM gap must grow roughly linearly with payload size.
|
||||
|
||||
Pre-Phase-2: the only difference between TCM and SRAM is the SRAM
|
||||
per-access ``overhead_ns = 2``, which does NOT scale with payload —
|
||||
so the gap is the same constant 48 ns at 8 KB and at 32 KB. Ratio = 1.
|
||||
|
||||
Post-Phase-2: the dominant term is 2 × nbytes/128 (PE↔SRAM hop on
|
||||
write+read) which IS linear in payload. Going 8 KB → 32 KB (4×)
|
||||
should produce a gap roughly 4× larger.
|
||||
|
||||
Threshold below is 3× to keep slack for fixed-overhead effects.
|
||||
"""
|
||||
lat_tcm_small = _run_allreduce_with_buffer_kind(
|
||||
tmp_path, buffer_kind="tcm", n_elem=4096, # 8 KB
|
||||
)
|
||||
lat_sram_small = _run_allreduce_with_buffer_kind(
|
||||
tmp_path, buffer_kind="sram", n_elem=4096,
|
||||
)
|
||||
lat_tcm_large = _run_allreduce_with_buffer_kind(
|
||||
tmp_path, buffer_kind="tcm", n_elem=16384, # 32 KB
|
||||
)
|
||||
lat_sram_large = _run_allreduce_with_buffer_kind(
|
||||
tmp_path, buffer_kind="sram", n_elem=16384,
|
||||
)
|
||||
|
||||
gap_small = lat_sram_small - lat_tcm_small
|
||||
gap_large = lat_sram_large - lat_tcm_large
|
||||
|
||||
assert gap_small > 0, (
|
||||
f"sanity: SRAM should never be FASTER than TCM, "
|
||||
f"got gap_small={gap_small:.1f} ns"
|
||||
)
|
||||
assert gap_large > 3.0 * gap_small, (
|
||||
f"4× payload should produce ≥3× SRAM/TCM gap (linear in nbytes "
|
||||
f"because of the 128 GB/s PE↔SRAM hop). "
|
||||
f"got gap_small={gap_small:.1f} (8KB), gap_large={gap_large:.1f} "
|
||||
f"(32KB), ratio={gap_large / max(gap_small, 1e-9):.2f}"
|
||||
)
|
||||
|
||||
|
||||
def test_hbm_pe_hop_charged_at_large_payload(tmp_path):
|
||||
"""At 32 KB / PE the HBM-vs-TCM gap must exceed the gap that comes
|
||||
purely from HBM's 256 GB/s intrinsic slot-IO disadvantage.
|
||||
|
||||
Pre-Phase-2 the entire HBM/TCM gap is just the slot-IO term
|
||||
(24 × (nbytes/512 + 6) ≈ 1_700 ns at 32 KB). Post-fix adds another
|
||||
24 × (nbytes/256) × 2 ≈ 6_144 ns from the PE↔HBM hop on send and
|
||||
recv, so the total HBM/TCM gap should clearly clear 4 µs.
|
||||
"""
|
||||
n_elem = 16384 # 32 KB / PE
|
||||
lat_tcm = _run_allreduce_with_buffer_kind(
|
||||
tmp_path, buffer_kind="tcm", n_elem=n_elem,
|
||||
)
|
||||
lat_hbm = _run_allreduce_with_buffer_kind(
|
||||
tmp_path, buffer_kind="hbm", n_elem=n_elem,
|
||||
)
|
||||
delta = lat_hbm - lat_tcm
|
||||
THRESHOLD_NS = 4_000.0
|
||||
assert delta > THRESHOLD_NS, (
|
||||
f"HBM should be ≥ {THRESHOLD_NS:.0f} ns slower than TCM at 32 KB "
|
||||
f"once the 256 GB/s PE↔HBM hop is charged on each IPCQ access. "
|
||||
f"got tcm={lat_tcm:.1f} hbm={lat_hbm:.1f} delta={delta:.1f} ns"
|
||||
)
|
||||
@@ -1,7 +1,12 @@
|
||||
"""PE-to-PE latency sweep across hop types and data sizes.
|
||||
|
||||
Compares IPCQ send/recv vs raw-DMA (tl.load + tl.store) latency for four
|
||||
hop types:
|
||||
hop types. The IPCQ path uses ``tl.recv_no_consume(...)`` so that DST
|
||||
does not pay the slot-read latency — apples-to-apples with the DMA
|
||||
path, which is a one-sided write that has no read on DST.
|
||||
|
||||
``tl.recv_no_consume`` is a DIAGNOSTIC-only entry point that exists
|
||||
solely to draw this graph; production kernels use ``tl.recv``.
|
||||
|
||||
H1 Intra-cube horizontal pe0 → pe1
|
||||
H2 Intra-cube vertical pe0 → pe4
|
||||
@@ -28,7 +33,9 @@ from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
PLOT_DIR = Path(__file__).parent / "pe2pe_latency_plots"
|
||||
PLOT_DIR = (
|
||||
Path(__file__).parent.parent / "docs" / "diagrams" / "pe2pe_latency_plots"
|
||||
)
|
||||
|
||||
SIZES = [128, 256, 384, 512, 768, 1024, 2048, 4096, 8192, 10240]
|
||||
|
||||
@@ -101,7 +108,12 @@ def _measure_ipcq(hop: Hop, nbytes: int) -> float:
|
||||
data = tl.load(t_ptr, shape=(n_elem,), dtype="f16")
|
||||
tl.send(dir=send_dir, src=data)
|
||||
elif cube_id == dst_cube and pe_id == dst_pe:
|
||||
tl.recv(dir=recv_dir, shape=(n_elem,), dtype="f16")
|
||||
# tl.recv_no_consume: DST blocks until bytes land in
|
||||
# slot but skips slot-read latency. Apples-to-apples
|
||||
# with the raw-DMA path below, which has no DST read.
|
||||
# Diagnostic-only — production kernels use tl.recv.
|
||||
tl.recv_no_consume(dir=recv_dir,
|
||||
shape=(n_elem,), dtype="f16")
|
||||
|
||||
tensors = []
|
||||
for s in sorted({src_sip, dst_sip}):
|
||||
@@ -238,7 +250,8 @@ def _plot_per_hop(records, hop: Hop, path: Path) -> None:
|
||||
ax.plot(
|
||||
[r["size_bytes"] for r in ipcq],
|
||||
[r["total_ns"] for r in ipcq],
|
||||
marker="o", label="IPCQ (send/recv)", color="tab:blue",
|
||||
marker="o", label="IPCQ no-consume (send/recv, no slot read)",
|
||||
color="tab:blue",
|
||||
)
|
||||
if raw:
|
||||
ax.plot(
|
||||
@@ -275,13 +288,13 @@ def _plot_overview(records, path: Path) -> None:
|
||||
ax.plot(
|
||||
[r["size_bytes"] for r in ipcq],
|
||||
[r["total_ns"] for r in ipcq],
|
||||
marker="o", label="IPCQ", color="tab:blue",
|
||||
marker="o", label="IPCQ no-consume", color="tab:blue",
|
||||
)
|
||||
if raw:
|
||||
ax.plot(
|
||||
[r["size_bytes"] for r in raw],
|
||||
[r["total_ns"] for r in raw],
|
||||
marker="s", label="Raw", color="tab:orange",
|
||||
marker="s", label="Raw DMA", color="tab:orange",
|
||||
)
|
||||
ax.set_title(hop.label, fontsize=10)
|
||||
ax.set_xlabel("bytes")
|
||||
@@ -291,7 +304,7 @@ def _plot_overview(records, path: Path) -> None:
|
||||
for j in range(len(HOPS), len(axes)):
|
||||
axes[j].axis("off")
|
||||
fig.suptitle(
|
||||
"PE-to-PE latency: IPCQ vs raw DMA",
|
||||
"PE-to-PE latency: IPCQ no-consume vs raw DMA",
|
||||
fontsize=14,
|
||||
)
|
||||
fig.tight_layout()
|
||||
@@ -307,7 +320,8 @@ def test_pe_to_pe_latency_sweep():
|
||||
|
||||
for hop in HOPS:
|
||||
for size in SIZES:
|
||||
# IPCQ path
|
||||
# IPCQ path uses tl.recv(consume=False) — apples-to-apples
|
||||
# with the raw-DMA path, which has no DST read either.
|
||||
ipcq_ns = _measure_ipcq(hop, size)
|
||||
records.append({
|
||||
"hop": hop.id, "label": hop.label,
|
||||
|
||||