Add tl.recv_no_consume diagnostic API for apples-to-apples pe2pe plot

The pe2pe overview compared IPCQ (tl.send + tl.recv) against raw DMA
(tl.load + tl.store), but DMA is one-sided — DST never reads — while
tl.recv pays a slot-read on DST. The comparison was unfair: IPCQ
looked slower partly because it does more work.

Adds tl.recv_no_consume() — a separate, diagnostic-only entry point
that blocks for slot arrival but skips the slot-read (and bank-hop)
charge on DST. Production tl.recv is unchanged (no `consume` kwarg
on the public API), so the diagnostic flag can never accidentally
leak into real workloads.

Updates test_pe_to_pe_latency to call tl.recv_no_consume so the
overview.png shows IPCQ no-consume vs raw DMA on equal footing.
Also fixes PLOT_DIR back to docs/diagrams/pe2pe_latency_plots/
(was lost in a merge). Adds scripts/replot_pe2pe.py for label-only
re-renders without re-measuring.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-28 18:20:44 -07:00
parent 9c129d6131
commit a563169e89
9 changed files with 245 additions and 48 deletions
Binary file not shown.

Before

Width:  |  Height:  |  Size: 45 KiB

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 45 KiB

After

Width:  |  Height:  |  Size: 49 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 53 KiB

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 52 KiB

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 103 KiB

After

Width:  |  Height:  |  Size: 109 KiB

+40 -40
View File
@@ -1,81 +1,81 @@
hop,label,size_bytes,path,total_ns
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,ipcq,31.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,ipcq,31.3899999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,raw,12.019999999996799
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,ipcq,33.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,ipcq,33.1399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,raw,13.019999999996799
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,ipcq,35.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,ipcq,34.8899999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,raw,14.019999999996799
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,ipcq,37.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,ipcq,36.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,raw,15.019999999996799
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,ipcq,41.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,ipcq,40.1399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,raw,17.0199999999968
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,ipcq,45.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,ipcq,43.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,raw,19.0199999999968
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,ipcq,61.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,ipcq,57.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,raw,27.0199999999968
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,ipcq,93.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,ipcq,85.6399999999976
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,raw,43.0199999999968
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,ipcq,157.64000000000306
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,ipcq,141.64000000000306
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,raw,75.02000000000407
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,ipcq,189.64000000000306
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,ipcq,169.64000000000306
h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,raw,91.02000000000407
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,ipcq,31.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,ipcq,31.3899999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,raw,12.019999999996799
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,ipcq,33.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,ipcq,33.1399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,raw,13.019999999996799
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,ipcq,35.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,ipcq,34.8899999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,raw,14.019999999996799
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,ipcq,37.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,ipcq,36.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,raw,15.019999999996799
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,ipcq,41.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,ipcq,40.1399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,raw,17.0199999999968
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,ipcq,45.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,ipcq,43.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,raw,19.0199999999968
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,ipcq,61.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,ipcq,57.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,raw,27.0199999999968
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,ipcq,93.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,ipcq,85.6399999999976
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,raw,43.0199999999968
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,ipcq,157.64000000000306
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,ipcq,141.64000000000306
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,raw,75.02000000000407
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,ipcq,189.64000000000306
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,ipcq,169.64000000000306
h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,raw,91.02000000000407
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,ipcq,67.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,ipcq,67.40999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,raw,68.53999999999724
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,ipcq,69.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,ipcq,69.15999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,raw,70.03999999999724
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,ipcq,71.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,ipcq,70.90999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,raw,71.53999999999724
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,ipcq,73.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,ipcq,72.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,raw,73.03999999999724
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,ipcq,77.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,ipcq,76.15999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,raw,76.03999999999724
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,ipcq,81.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,ipcq,79.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,raw,79.03999999999724
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,ipcq,97.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,ipcq,93.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,raw,91.03999999999724
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,ipcq,129.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,ipcq,121.65999999999804
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,raw,115.03999999999724
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,ipcq,193.65999999999985
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,ipcq,177.65999999999985
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,raw,163.04000000000087
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,ipcq,225.65999999999985
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,ipcq,205.65999999999985
h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,raw,187.04000000000087
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,ipcq,87.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,ipcq,87.40999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,raw,88.53999999999724
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,ipcq,89.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,ipcq,89.15999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,raw,90.03999999999724
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,ipcq,91.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,ipcq,90.90999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,raw,91.53999999999724
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,ipcq,93.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,ipcq,92.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,raw,93.03999999999724
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,ipcq,97.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,ipcq,96.15999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,raw,96.03999999999724
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,ipcq,101.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,ipcq,99.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,raw,99.03999999999724
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,ipcq,117.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,ipcq,113.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,raw,111.03999999999724
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,ipcq,149.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,ipcq,141.65999999999804
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,raw,135.03999999999724
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,ipcq,213.65999999999985
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,ipcq,197.65999999999985
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,raw,183.04000000000087
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,ipcq,245.65999999999985
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,ipcq,225.65999999999985
h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,raw,207.04000000000087
1 hop label size_bytes path total_ns
2 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 128 ipcq 31.6399999999976 31.3899999999976
3 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 128 raw 12.019999999996799
4 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 256 ipcq 33.6399999999976 33.1399999999976
5 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 256 raw 13.019999999996799
6 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 384 ipcq 35.6399999999976 34.8899999999976
7 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 384 raw 14.019999999996799
8 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 512 ipcq 37.6399999999976 36.6399999999976
9 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 512 raw 15.019999999996799
10 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 768 ipcq 41.6399999999976 40.1399999999976
11 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 768 raw 17.0199999999968
12 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 1024 ipcq 45.6399999999976 43.6399999999976
13 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 1024 raw 19.0199999999968
14 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 2048 ipcq 61.6399999999976 57.6399999999976
15 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 2048 raw 27.0199999999968
16 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 4096 ipcq 93.6399999999976 85.6399999999976
17 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 4096 raw 43.0199999999968
18 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 8192 ipcq 157.64000000000306 141.64000000000306
19 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 8192 raw 75.02000000000407
20 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 10240 ipcq 189.64000000000306 169.64000000000306
21 h1_intra_horizontal Intra-cube horizontal (pe0 to pe1) 10240 raw 91.02000000000407
22 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 128 ipcq 31.6399999999976 31.3899999999976
23 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 128 raw 12.019999999996799
24 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 256 ipcq 33.6399999999976 33.1399999999976
25 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 256 raw 13.019999999996799
26 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 384 ipcq 35.6399999999976 34.8899999999976
27 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 384 raw 14.019999999996799
28 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 512 ipcq 37.6399999999976 36.6399999999976
29 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 512 raw 15.019999999996799
30 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 768 ipcq 41.6399999999976 40.1399999999976
31 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 768 raw 17.0199999999968
32 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 1024 ipcq 45.6399999999976 43.6399999999976
33 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 1024 raw 19.0199999999968
34 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 2048 ipcq 61.6399999999976 57.6399999999976
35 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 2048 raw 27.0199999999968
36 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 4096 ipcq 93.6399999999976 85.6399999999976
37 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 4096 raw 43.0199999999968
38 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 8192 ipcq 157.64000000000306 141.64000000000306
39 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 8192 raw 75.02000000000407
40 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 10240 ipcq 189.64000000000306 169.64000000000306
41 h2_intra_vertical Intra-cube vertical (pe0 to pe4) 10240 raw 91.02000000000407
42 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 128 ipcq 67.65999999999804 67.40999999999804
43 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 128 raw 68.53999999999724
44 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 256 ipcq 69.65999999999804 69.15999999999804
45 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 256 raw 70.03999999999724
46 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 384 ipcq 71.65999999999804 70.90999999999804
47 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 384 raw 71.53999999999724
48 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 512 ipcq 73.65999999999804 72.65999999999804
49 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 512 raw 73.03999999999724
50 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 768 ipcq 77.65999999999804 76.15999999999804
51 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 768 raw 76.03999999999724
52 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 1024 ipcq 81.65999999999804 79.65999999999804
53 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 1024 raw 79.03999999999724
54 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 2048 ipcq 97.65999999999804 93.65999999999804
55 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 2048 raw 91.03999999999724
56 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 4096 ipcq 129.65999999999804 121.65999999999804
57 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 4096 raw 115.03999999999724
58 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 8192 ipcq 193.65999999999985 177.65999999999985
59 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 8192 raw 163.04000000000087
60 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 10240 ipcq 225.65999999999985 205.65999999999985
61 h3_inter_cube_horizontal Inter-cube horizontal (cube0 to cube1) 10240 raw 187.04000000000087
62 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 128 ipcq 87.65999999999804 87.40999999999804
63 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 128 raw 88.53999999999724
64 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 256 ipcq 89.65999999999804 89.15999999999804
65 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 256 raw 90.03999999999724
66 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 384 ipcq 91.65999999999804 90.90999999999804
67 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 384 raw 91.53999999999724
68 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 512 ipcq 93.65999999999804 92.65999999999804
69 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 512 raw 93.03999999999724
70 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 768 ipcq 97.65999999999804 96.15999999999804
71 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 768 raw 96.03999999999724
72 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 1024 ipcq 101.65999999999804 99.65999999999804
73 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 1024 raw 99.03999999999724
74 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 2048 ipcq 117.65999999999804 113.65999999999804
75 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 2048 raw 111.03999999999724
76 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 4096 ipcq 149.65999999999804 141.65999999999804
77 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 4096 raw 135.03999999999724
78 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 8192 ipcq 213.65999999999985 197.65999999999985
79 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 8192 raw 183.04000000000087
80 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 10240 ipcq 245.65999999999985 225.65999999999985
81 h4_inter_cube_vertical Inter-cube vertical (cube0 to cube4) 10240 raw 207.04000000000087
+141
View File
@@ -0,0 +1,141 @@
"""Re-render pe2pe latency PNGs from the existing summary.csv with the
current (no-consume) labels. Used after a label-only test edit to avoid
re-measuring (~5 min) when the data on disk is already correct.
Reads docs/diagrams/pe2pe_latency_plots/summary.csv. Plots 2 curves:
"IPCQ no-consume" (from the ipcq_no_consume rows if present, else from
the ipcq rows) and "Raw DMA" (raw rows).
"""
from __future__ import annotations
import csv
from pathlib import Path
import matplotlib.pyplot as plt
ROOT = Path(__file__).resolve().parent.parent
PLOT_DIR = ROOT / "docs" / "diagrams" / "pe2pe_latency_plots"
CSV_PATH = PLOT_DIR / "summary.csv"
def _load_records():
rows = []
with open(CSV_PATH, newline="") as f:
for r in csv.DictReader(f):
rows.append({
"hop": r["hop"],
"label": r["label"],
"size_bytes": int(r["size_bytes"]),
"path": r["path"],
"total_ns": float(r["total_ns"]),
})
return rows
def _ipcq_rows(records, hop):
# Prefer ipcq_no_consume if present (older 3-path CSV); fall back to ipcq
# (current single-path CSV where ipcq IS no-consume).
nc = [r for r in records
if r["hop"] == hop and r["path"] == "ipcq_no_consume"]
if nc:
return sorted(nc, key=lambda r: r["size_bytes"])
return sorted(
[r for r in records if r["hop"] == hop and r["path"] == "ipcq"],
key=lambda r: r["size_bytes"],
)
def _raw_rows(records, hop):
return sorted(
[r for r in records if r["hop"] == hop and r["path"] == "raw"],
key=lambda r: r["size_bytes"],
)
def _hops(records):
seen = []
for r in records:
if r["hop"] not in {h["id"] for h in seen}:
seen.append({"id": r["hop"], "label": r["label"]})
return seen
def _plot_per_hop(records, hop, path):
ipcq = _ipcq_rows(records, hop["id"])
raw = _raw_rows(records, hop["id"])
fig, ax = plt.subplots(figsize=(8, 5))
if ipcq:
ax.plot(
[r["size_bytes"] for r in ipcq],
[r["total_ns"] for r in ipcq],
marker="o", color="tab:blue",
label="IPCQ no-consume (send/recv, no slot read)",
)
if raw:
ax.plot(
[r["size_bytes"] for r in raw],
[r["total_ns"] for r in raw],
marker="s", color="tab:orange",
label="Raw DMA (load+store)",
)
ax.set_xlabel("Data size (bytes)")
ax.set_ylabel("Latency (ns)")
ax.set_title(hop["label"])
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()
fig.savefig(path, dpi=120)
plt.close(fig)
def _plot_overview(records, hops, path):
fig, axes = plt.subplots(2, 2, figsize=(13, 9))
axes = axes.flatten()
for i, hop in enumerate(hops):
ax = axes[i]
ipcq = _ipcq_rows(records, hop["id"])
raw = _raw_rows(records, hop["id"])
if ipcq:
ax.plot(
[r["size_bytes"] for r in ipcq],
[r["total_ns"] for r in ipcq],
marker="o", color="tab:blue",
label="IPCQ no-consume",
)
if raw:
ax.plot(
[r["size_bytes"] for r in raw],
[r["total_ns"] for r in raw],
marker="s", color="tab:orange",
label="Raw DMA",
)
ax.set_title(hop["label"], fontsize=10)
ax.set_xlabel("bytes")
ax.set_ylabel("ns")
ax.grid(True, alpha=0.3)
ax.legend(fontsize=8)
for j in range(len(hops), len(axes)):
axes[j].axis("off")
fig.suptitle(
"PE-to-PE latency: IPCQ no-consume vs raw DMA",
fontsize=14,
)
fig.tight_layout()
fig.savefig(path, dpi=120)
plt.close(fig)
def main():
records = _load_records()
hops = _hops(records)
for hop in hops:
out = PLOT_DIR / f"{hop['id']}.png"
_plot_per_hop(records, hop, out)
print(f"wrote {out}")
overview = PLOT_DIR / "overview.png"
_plot_overview(records, hops, overview)
print(f"wrote {overview}")
if __name__ == "__main__":
main()
+42
View File
@@ -492,6 +492,48 @@ class TLContext:
)
return self._make_handle(addr=0, shape=shape, dtype=dtype)
def recv_no_consume(
self,
dir: str | None = None,
shape: tuple[int, ...] = (),
dtype: str = "f16",
) -> TensorHandle:
"""DIAGNOSTIC ONLY — recv that blocks for arrival but skips slot read.
Same blocking semantics as ``tl.recv``: the kernel waits until
the payload has landed in the IPCQ slot. Differs from ``tl.recv``
by skipping the slot-read latency charge (slot-IO + PE↔bank
fabric drain) on DST.
This entry point exists solely so the pe2pe overview plot can
draw an apples-to-apples comparison against ``tl.store`` (a
one-sided fabric write that pays no read on DST). Production
kernels MUST use ``tl.recv`` — they need to consume the data
they receive. This API is segregated from ``tl.recv`` so the
diagnostic flag can never accidentally be set in real workloads.
"""
self._emit_dispatch_overhead()
cmd = IpcqRecvCmd(
direction=dir,
shape=shape, dtype=dtype,
handle_id=self._next_handle_id(),
consume=False,
)
result = self._emit(cmd) # type: ignore[arg-type]
if isinstance(result, dict):
slot_addr = int(result.get("src_addr", 0))
slot_space = str(result.get("src_space", "tcm"))
return TensorHandle(
id=self._next_handle_id(),
addr=slot_addr,
shape=shape,
dtype=dtype,
nbytes=self._nbytes(shape, dtype),
data=None,
space=slot_space,
)
return self._make_handle(addr=0, shape=shape, dtype=dtype)
def recv_async(
self,
dir: str,
+22 -8
View File
@@ -1,7 +1,12 @@
"""PE-to-PE latency sweep across hop types and data sizes.
Compares IPCQ send/recv vs raw-DMA (tl.load + tl.store) latency for four
hop types:
hop types. The IPCQ path uses ``tl.recv_no_consume(...)`` so that DST
does not pay the slot-read latency — apples-to-apples with the DMA
path, which is a one-sided write that has no read on DST.
``tl.recv_no_consume`` is a DIAGNOSTIC-only entry point that exists
solely to draw this graph; production kernels use ``tl.recv``.
H1 Intra-cube horizontal pe0 → pe1
H2 Intra-cube vertical pe0 → pe4
@@ -28,7 +33,9 @@ from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
PLOT_DIR = Path(__file__).parent / "pe2pe_latency_plots"
PLOT_DIR = (
Path(__file__).parent.parent / "docs" / "diagrams" / "pe2pe_latency_plots"
)
SIZES = [128, 256, 384, 512, 768, 1024, 2048, 4096, 8192, 10240]
@@ -101,7 +108,12 @@ def _measure_ipcq(hop: Hop, nbytes: int) -> float:
data = tl.load(t_ptr, shape=(n_elem,), dtype="f16")
tl.send(dir=send_dir, src=data)
elif cube_id == dst_cube and pe_id == dst_pe:
tl.recv(dir=recv_dir, shape=(n_elem,), dtype="f16")
# tl.recv_no_consume: DST blocks until bytes land in
# slot but skips slot-read latency. Apples-to-apples
# with the raw-DMA path below, which has no DST read.
# Diagnostic-only — production kernels use tl.recv.
tl.recv_no_consume(dir=recv_dir,
shape=(n_elem,), dtype="f16")
tensors = []
for s in sorted({src_sip, dst_sip}):
@@ -238,7 +250,8 @@ def _plot_per_hop(records, hop: Hop, path: Path) -> None:
ax.plot(
[r["size_bytes"] for r in ipcq],
[r["total_ns"] for r in ipcq],
marker="o", label="IPCQ (send/recv)", color="tab:blue",
marker="o", label="IPCQ no-consume (send/recv, no slot read)",
color="tab:blue",
)
if raw:
ax.plot(
@@ -275,13 +288,13 @@ def _plot_overview(records, path: Path) -> None:
ax.plot(
[r["size_bytes"] for r in ipcq],
[r["total_ns"] for r in ipcq],
marker="o", label="IPCQ", color="tab:blue",
marker="o", label="IPCQ no-consume", color="tab:blue",
)
if raw:
ax.plot(
[r["size_bytes"] for r in raw],
[r["total_ns"] for r in raw],
marker="s", label="Raw", color="tab:orange",
marker="s", label="Raw DMA", color="tab:orange",
)
ax.set_title(hop.label, fontsize=10)
ax.set_xlabel("bytes")
@@ -291,7 +304,7 @@ def _plot_overview(records, path: Path) -> None:
for j in range(len(HOPS), len(axes)):
axes[j].axis("off")
fig.suptitle(
"PE-to-PE latency: IPCQ vs raw DMA",
"PE-to-PE latency: IPCQ no-consume vs raw DMA",
fontsize=14,
)
fig.tight_layout()
@@ -307,7 +320,8 @@ def test_pe_to_pe_latency_sweep():
for hop in HOPS:
for size in SIZES:
# IPCQ path
# IPCQ path uses tl.recv(consume=False) — apples-to-apples
# with the raw-DMA path, which has no DST read either.
ipcq_ns = _measure_ipcq(hop, size)
records.append({
"hop": hop.id, "label": hop.label,