diff --git a/docs/diagrams/allreduce_latency_plots/mesh_2d_no_wrap.png b/docs/diagrams/allreduce_latency_plots/mesh_2d_no_wrap.png index 6a4d0ca..33ae4ea 100644 Binary files a/docs/diagrams/allreduce_latency_plots/mesh_2d_no_wrap.png and b/docs/diagrams/allreduce_latency_plots/mesh_2d_no_wrap.png differ diff --git a/docs/diagrams/allreduce_latency_plots/overview.png b/docs/diagrams/allreduce_latency_plots/overview.png index 0622007..bad2afa 100644 Binary files a/docs/diagrams/allreduce_latency_plots/overview.png and b/docs/diagrams/allreduce_latency_plots/overview.png differ diff --git a/docs/diagrams/allreduce_latency_plots/ring_1d.png b/docs/diagrams/allreduce_latency_plots/ring_1d.png index beb73fa..6fcc8a5 100644 Binary files a/docs/diagrams/allreduce_latency_plots/ring_1d.png and b/docs/diagrams/allreduce_latency_plots/ring_1d.png differ diff --git a/docs/diagrams/allreduce_latency_plots/summary.csv b/docs/diagrams/allreduce_latency_plots/summary.csv index d40f782..2aa2778 100644 --- a/docs/diagrams/allreduce_latency_plots/summary.csv +++ b/docs/diagrams/allreduce_latency_plots/summary.csv @@ -1,37 +1,37 @@ algorithm,sip_topology,n_sips,n_elem,bytes_per_pe,bytes_per_sip,latency_ns -intercube_allreduce,mesh_2d_no_wrap,6,8,16,256,3508.4249999999993 -intercube_allreduce,mesh_2d_no_wrap,6,32,64,1024,3515.55 -intercube_allreduce,mesh_2d_no_wrap,6,64,128,2048,3525.0499999999975 -intercube_allreduce,mesh_2d_no_wrap,6,128,256,4096,3544.049999999992 -intercube_allreduce,mesh_2d_no_wrap,6,512,1024,16384,3667.049999999992 -intercube_allreduce,mesh_2d_no_wrap,6,1024,2048,32768,3837.049999999992 -intercube_allreduce,mesh_2d_no_wrap,6,2048,4096,65536,4177.049999999992 -intercube_allreduce,mesh_2d_no_wrap,6,4096,8192,131072,4857.049999999959 -intercube_allreduce,mesh_2d_no_wrap,6,8192,16384,262144,6217.049999999945 -intercube_allreduce,mesh_2d_no_wrap,6,16384,32768,524288,8937.049999999937 -intercube_allreduce,mesh_2d_no_wrap,6,32768,65536,1048576,14377.049999999872 -intercube_allreduce,mesh_2d_no_wrap,6,49152,98304,1572864,19817.049999999872 -intercube_allreduce,ring_1d,6,8,16,256,3073.1299999999937 -intercube_allreduce,ring_1d,6,32,64,1024,3079.8799999999947 -intercube_allreduce,ring_1d,6,64,128,2048,3088.879999999992 -intercube_allreduce,ring_1d,6,128,256,4096,3106.8799999999865 -intercube_allreduce,ring_1d,6,512,1024,16384,3225.8799999999865 -intercube_allreduce,ring_1d,6,1024,2048,32768,3391.8799999999865 -intercube_allreduce,ring_1d,6,2048,4096,65536,3723.8799999999865 -intercube_allreduce,ring_1d,6,4096,8192,131072,4387.879999999965 -intercube_allreduce,ring_1d,6,8192,16384,262144,5715.879999999957 -intercube_allreduce,ring_1d,6,16384,32768,524288,8371.879999999932 -intercube_allreduce,ring_1d,6,32768,65536,1048576,13683.879999999903 -intercube_allreduce,ring_1d,6,49152,98304,1572864,18995.879999999917 -intercube_allreduce,torus_2d,6,8,16,256,2190.4799999999923 -intercube_allreduce,torus_2d,6,32,64,1024,2196.479999999993 -intercube_allreduce,torus_2d,6,64,128,2048,2204.4799999999905 -intercube_allreduce,torus_2d,6,128,256,4096,2220.479999999985 -intercube_allreduce,torus_2d,6,512,1024,16384,2325.479999999985 -intercube_allreduce,torus_2d,6,1024,2048,32768,2471.479999999985 -intercube_allreduce,torus_2d,6,2048,4096,65536,2763.479999999985 -intercube_allreduce,torus_2d,6,4096,8192,131072,3347.4799999999777 -intercube_allreduce,torus_2d,6,8192,16384,262144,4515.4799999999705 -intercube_allreduce,torus_2d,6,16384,32768,524288,6851.479999999952 -intercube_allreduce,torus_2d,6,32768,65536,1048576,11523.479999999923 -intercube_allreduce,torus_2d,6,49152,98304,1572864,16195.479999999952 +intercube_allreduce,mesh_2d_no_wrap,6,8,16,256,2626.302499999998 +intercube_allreduce,mesh_2d_no_wrap,6,32,64,1024,2634.7399999999952 +intercube_allreduce,mesh_2d_no_wrap,6,64,128,2048,2645.9899999999925 +intercube_allreduce,mesh_2d_no_wrap,6,128,256,4096,2668.489999999987 +intercube_allreduce,mesh_2d_no_wrap,6,512,1024,16384,2812.489999999987 +intercube_allreduce,mesh_2d_no_wrap,6,1024,2048,32768,3010.489999999987 +intercube_allreduce,mesh_2d_no_wrap,6,2048,4096,65536,3406.489999999987 +intercube_allreduce,mesh_2d_no_wrap,6,4096,8192,131072,4198.489999999965 +intercube_allreduce,mesh_2d_no_wrap,6,8192,16384,262144,5782.489999999969 +intercube_allreduce,mesh_2d_no_wrap,6,16384,32768,524288,8950.489999999925 +intercube_allreduce,mesh_2d_no_wrap,6,32768,65536,1048576,15286.48999999986 +intercube_allreduce,mesh_2d_no_wrap,6,49152,98304,1572864,21622.489999999932 +intercube_allreduce,ring_1d,6,8,16,256,2302.9849999999933 +intercube_allreduce,ring_1d,6,32,64,1024,2310.8599999999906 +intercube_allreduce,ring_1d,6,64,128,2048,2321.359999999988 +intercube_allreduce,ring_1d,6,128,256,4096,2342.3599999999824 +intercube_allreduce,ring_1d,6,512,1024,16384,2479.3599999999824 +intercube_allreduce,ring_1d,6,1024,2048,32768,2669.3599999999824 +intercube_allreduce,ring_1d,6,2048,4096,65536,3049.3599999999824 +intercube_allreduce,ring_1d,6,4096,8192,131072,3809.3599999999715 +intercube_allreduce,ring_1d,6,8192,16384,262144,5329.359999999979 +intercube_allreduce,ring_1d,6,16384,32768,524288,8369.35999999992 +intercube_allreduce,ring_1d,6,32768,65536,1048576,14449.359999999899 +intercube_allreduce,ring_1d,6,49152,98304,1572864,20529.35999999997 +intercube_allreduce,torus_2d,6,8,16,256,1644.2899999999936 +intercube_allreduce,torus_2d,6,32,64,1024,1651.0399999999909 +intercube_allreduce,torus_2d,6,64,128,2048,1660.0399999999881 +intercube_allreduce,torus_2d,6,128,256,4096,1678.0399999999827 +intercube_allreduce,torus_2d,6,512,1024,16384,1795.0399999999827 +intercube_allreduce,torus_2d,6,1024,2048,32768,1957.0399999999827 +intercube_allreduce,torus_2d,6,2048,4096,65536,2281.0399999999827 +intercube_allreduce,torus_2d,6,4096,8192,131072,2929.039999999979 +intercube_allreduce,torus_2d,6,8192,16384,262144,4225.039999999986 +intercube_allreduce,torus_2d,6,16384,32768,524288,6817.039999999943 +intercube_allreduce,torus_2d,6,32768,65536,1048576,12001.03999999992 +intercube_allreduce,torus_2d,6,49152,98304,1572864,17185.039999999994 diff --git a/docs/diagrams/allreduce_latency_plots/torus_2d.png b/docs/diagrams/allreduce_latency_plots/torus_2d.png index ce4b502..bccab92 100644 Binary files a/docs/diagrams/allreduce_latency_plots/torus_2d.png and b/docs/diagrams/allreduce_latency_plots/torus_2d.png differ diff --git a/docs/diagrams/ipcq_diagram_plots/ipcq_send_recv.png b/docs/diagrams/ipcq_diagram_plots/ipcq_send_recv.png new file mode 100644 index 0000000..99dbf11 Binary files /dev/null and b/docs/diagrams/ipcq_diagram_plots/ipcq_send_recv.png differ diff --git a/docs/diagrams/ipcq_diagram_plots/ipcq_two_pe_dma.png b/docs/diagrams/ipcq_diagram_plots/ipcq_two_pe_dma.png new file mode 100644 index 0000000..78f2f42 Binary files /dev/null and b/docs/diagrams/ipcq_diagram_plots/ipcq_two_pe_dma.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png b/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png index 23a4db0..26a54be 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png and b/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png b/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png index a7af541..62f5f44 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png and b/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png b/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png index 94b9eef..bce23ea 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png and b/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png b/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png index 3f685da..5c77c81 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png and b/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/overview.png b/docs/diagrams/pe2pe_latency_plots/overview.png index 8914ae7..f9a8191 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/overview.png and b/docs/diagrams/pe2pe_latency_plots/overview.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/summary.csv b/docs/diagrams/pe2pe_latency_plots/summary.csv index a94b106..288abec 100644 --- a/docs/diagrams/pe2pe_latency_plots/summary.csv +++ b/docs/diagrams/pe2pe_latency_plots/summary.csv @@ -1,81 +1,81 @@ hop,label,size_bytes,path,total_ns -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,ipcq,31.1399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,ipcq,31.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,raw,12.019999999996799 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,ipcq,32.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,ipcq,33.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,raw,13.019999999996799 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,ipcq,34.1399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,ipcq,35.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,raw,14.019999999996799 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,ipcq,35.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,ipcq,37.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,raw,15.019999999996799 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,ipcq,38.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,ipcq,41.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,raw,17.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,ipcq,41.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,ipcq,45.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,raw,19.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,ipcq,53.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,ipcq,61.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,raw,27.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,ipcq,77.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,ipcq,93.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,raw,43.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,ipcq,125.64000000000306 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,ipcq,157.64000000000306 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,raw,75.02000000000407 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,ipcq,149.64000000000306 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,ipcq,189.64000000000306 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,raw,91.02000000000407 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,ipcq,31.1399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,ipcq,31.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,raw,12.019999999996799 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,ipcq,32.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,ipcq,33.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,raw,13.019999999996799 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,ipcq,34.1399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,ipcq,35.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,raw,14.019999999996799 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,ipcq,35.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,ipcq,37.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,raw,15.019999999996799 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,ipcq,38.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,ipcq,41.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,raw,17.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,ipcq,41.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,ipcq,45.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,raw,19.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,ipcq,53.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,ipcq,61.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,raw,27.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,ipcq,77.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,ipcq,93.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,raw,43.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,ipcq,125.64000000000306 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,ipcq,157.64000000000306 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,raw,75.02000000000407 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,ipcq,149.64000000000306 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,ipcq,189.64000000000306 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,raw,91.02000000000407 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,ipcq,67.15999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,ipcq,67.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,raw,68.53999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,ipcq,68.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,ipcq,69.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,raw,70.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,ipcq,70.15999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,ipcq,71.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,raw,71.53999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,ipcq,71.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,ipcq,73.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,raw,73.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,ipcq,74.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,ipcq,77.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,raw,76.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,ipcq,77.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,ipcq,81.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,raw,79.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,ipcq,89.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,ipcq,97.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,raw,91.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,ipcq,113.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,ipcq,129.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,raw,115.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,ipcq,161.65999999999985 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,ipcq,193.65999999999985 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,raw,163.04000000000087 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,ipcq,185.65999999999985 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,ipcq,225.65999999999985 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,raw,187.04000000000087 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,ipcq,87.15999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,ipcq,87.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,raw,88.53999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,ipcq,88.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,ipcq,89.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,raw,90.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,ipcq,90.15999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,ipcq,91.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,raw,91.53999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,ipcq,91.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,ipcq,93.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,raw,93.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,ipcq,94.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,ipcq,97.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,raw,96.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,ipcq,97.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,ipcq,101.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,raw,99.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,ipcq,109.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,ipcq,117.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,raw,111.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,ipcq,133.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,ipcq,149.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,raw,135.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,ipcq,181.65999999999985 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,ipcq,213.65999999999985 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,raw,183.04000000000087 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,ipcq,205.65999999999985 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,ipcq,245.65999999999985 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,raw,207.04000000000087 diff --git a/src/kernbench/ccl/algorithms/intercube_allreduce.py b/src/kernbench/ccl/algorithms/intercube_allreduce.py index 32be7cd..a141942 100644 --- a/src/kernbench/ccl/algorithms/intercube_allreduce.py +++ b/src/kernbench/ccl/algorithms/intercube_allreduce.py @@ -111,6 +111,11 @@ def allreduce_intercube_multidevice( ): """Intercube all-reduce (pe0-only) with configurable SIP topology. + Root cube sits at the geometric center (cube_w//2, cube_h//2) and + each phase converges bidirectionally so the intra-SIP critical path + is ~half what a corner-root walk would be (e.g., 4×4 mesh: 4 hops + reduce + 4 hops broadcast vs 6+6 with corner root). + Args: t_ptr: VA base of the row-wise-sharded tensor on this SIP. n_elem: f16 elements per cube tile. @@ -128,34 +133,59 @@ def allreduce_intercube_multidevice( col = cube_id % cube_w nbytes = n_elem * 2 + root_col = cube_w // 2 + root_row = cube_h // 2 + root_cube = root_row * cube_w + root_col + pe_addr = t_ptr + cube_id * nbytes acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16") - # ── Phase 1: row reduce W → E ── - if col == 0: + # ── Phase 1: row reduce — converge at col == root_col ── + # Left half (col < root_col) walks W→E; right half (col > root_col) + # walks E→W; the root_col cube merges both sides. + if col == 0 and root_col > 0: tl.send(dir="E", src=acc) - elif col < cube_w - 1: + elif 0 < col < root_col: recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16") acc = acc + recv tl.send(dir="E", src=acc) - else: - recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16") + elif col == root_col: + if root_col > 0: + recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16") + acc = acc + recv + if cube_w - 1 > root_col: + recv = tl.recv(dir="E", shape=(n_elem,), dtype="f16") + acc = acc + recv + elif root_col < col < cube_w - 1: + recv = tl.recv(dir="E", shape=(n_elem,), dtype="f16") acc = acc + recv + tl.send(dir="W", src=acc) + elif col == cube_w - 1 and cube_w - 1 > root_col: + tl.send(dir="W", src=acc) - # ── Phase 2: col reduce N → S on rightmost column ── - if col == cube_w - 1: - if row == 0: + # ── Phase 2: col reduce on col == root_col — converge at row == root_row ── + if col == root_col: + if row == 0 and root_row > 0: tl.send(dir="S", src=acc) - elif row < cube_h - 1: + elif 0 < row < root_row: recv = tl.recv(dir="N", shape=(n_elem,), dtype="f16") acc = acc + recv tl.send(dir="S", src=acc) - else: - recv = tl.recv(dir="N", shape=(n_elem,), dtype="f16") + elif row == root_row: + if root_row > 0: + recv = tl.recv(dir="N", shape=(n_elem,), dtype="f16") + acc = acc + recv + if cube_h - 1 > root_row: + recv = tl.recv(dir="S", shape=(n_elem,), dtype="f16") + acc = acc + recv + elif root_row < row < cube_h - 1: + recv = tl.recv(dir="S", shape=(n_elem,), dtype="f16") acc = acc + recv + tl.send(dir="N", src=acc) + elif row == cube_h - 1 and cube_h - 1 > root_row: + tl.send(dir="N", src=acc) # ── Phase 3: inter-SIP exchange on root cube ── - root_cube = (cube_h - 1) * cube_w + (cube_w - 1) if cube_id == root_cube and n_sips > 1: if sip_topo_kind == SIP_TOPO_RING: acc = _inter_sip_ring(acc, n_sips, n_elem, tl) @@ -164,24 +194,36 @@ def allreduce_intercube_multidevice( elif sip_topo_kind == SIP_TOPO_MESH: acc = _inter_sip_mesh_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl) - # ── Phase 4: col broadcast S → N on rightmost column ── - if col == cube_w - 1: - if row == cube_h - 1: - tl.send(dir="N", src=acc) - elif row > 0: - acc = tl.recv(dir="S", shape=(n_elem,), dtype="f16") - tl.send(dir="N", src=acc) - else: + # ── Phase 4: col broadcast on col == root_col, outward from root_row ── + if col == root_col: + if row == root_row: + if root_row > 0: + tl.send(dir="N", src=acc) + if cube_h - 1 > root_row: + tl.send(dir="S", src=acc) + elif row < root_row: acc = tl.recv(dir="S", shape=(n_elem,), dtype="f16") + if row > 0: + tl.send(dir="N", src=acc) + elif row > root_row: + acc = tl.recv(dir="N", shape=(n_elem,), dtype="f16") + if row < cube_h - 1: + tl.send(dir="S", src=acc) - # ── Phase 5: row broadcast E → W ── - if col == cube_w - 1: - tl.send(dir="W", src=acc) - elif col > 0: - acc = tl.recv(dir="E", shape=(n_elem,), dtype="f16") - tl.send(dir="W", src=acc) - else: + # ── Phase 5: row broadcast outward from root_col ── + if col == root_col: + if root_col > 0: + tl.send(dir="W", src=acc) + if cube_w - 1 > root_col: + tl.send(dir="E", src=acc) + elif col < root_col: acc = tl.recv(dir="E", shape=(n_elem,), dtype="f16") + if col > 0: + tl.send(dir="W", src=acc) + elif col > root_col: + acc = tl.recv(dir="W", shape=(n_elem,), dtype="f16") + if col < cube_w - 1: + tl.send(dir="E", src=acc) tl.store(pe_addr, acc) diff --git a/tests/test_allreduce_multidevice.py b/tests/test_allreduce_multidevice.py index 783b819..351f03b 100644 --- a/tests/test_allreduce_multidevice.py +++ b/tests/test_allreduce_multidevice.py @@ -289,7 +289,8 @@ _SWEEP_TOPOLOGIES = [ # parametrized invocation writes one JSON file here; the aggregator # (run from conftest.pytest_sessionfinish) reads them and emits the # combined CSV + PNG plots. -_SWEEP_OUT_DIR = Path(__file__).parent / "allreduce_latency_plots" +_SWEEP_OUT_DIR = (Path(__file__).parent.parent / "docs" / "diagrams" + / "allreduce_latency_plots") _SWEEP_ROWS_DIR = _SWEEP_OUT_DIR / "_rows" @@ -447,7 +448,7 @@ def _aggregate_sweep_plots() -> bool: ax.plot(xs, ys, marker="o", color="tab:blue") ax.set_xscale("log", base=2) ax.set_xlabel("Bytes per PE (log scale)") - ax.set_ylabel("max pe_exec_ns (critical path)") + ax.set_ylabel("Time (ns)") ax.set_title(title) ax.grid(True, alpha=0.3) ax.xaxis.set_major_formatter(_bytes_fmt) @@ -457,7 +458,28 @@ def _aggregate_sweep_plots() -> bool: colors = {"ring_1d": "tab:blue", "torus_2d": "tab:orange", "mesh_2d_no_wrap": "tab:green"} - THEORETICAL_TORUS_2D_6SIP_NS = 10600.0 + + # ── Hand-derived theoretical model for torus_2d (6 SIPs) ── + # Critical-path analysis (per packet, packet = 128 B at NoC): + # local intra-SIP reduce + broadcast = 8 hops × 57 ns = 456 ns + # global X-direction reduce = 5 UCIe + 1 UAL = 445 ns + # global Y-direction reduce = 5 UCIe + 1 UAL = 445 ns + # per-packet startup latency = 456 + 445 + 445 = 1346 ns + # Packet count is PER CUBE (8 PEs/cube cooperate on the cube tile). + # At 6144 packets/cube the pipelined total is 8741 ns, so the + # bottleneck-stage interval τ = (8741 − 1346) / (6144 − 1) ≈ 1.204 ns. + # T_theoretical(N) = 1346 + (N − 1) × τ + # where N = ceil((bytes_per_pe × 8) / 128) = ceil(bytes_per_pe / 16) + NOC_PACKET_BYTES = 128 + PES_PER_CUBE = 8 + T_STARTUP_NS = 1346.0 + TAU_NS = (8741.0 - 1346.0) / (6144 - 1) # ≈ 1.2038 ns/packet + + def _theoretical_torus_2d_ns(bytes_per_pe: int) -> float: + bytes_per_cube = int(bytes_per_pe) * PES_PER_CUBE + n_packets = max(1, -(-bytes_per_cube // NOC_PACKET_BYTES)) # ceil + return T_STARTUP_NS + (n_packets - 1) * TAU_NS + fig, ax = plt.subplots(figsize=(9, 6)) for topo_name in topologies: rs = sorted( @@ -473,64 +495,28 @@ def _aggregate_sweep_plots() -> bool: label=f"{topo_name} (n_sips={rs[0]['n_sips']})", color=colors.get(topo_name), ) - ax.axhline( - y=THEORETICAL_TORUS_2D_6SIP_NS, - color="tab:red", linestyle="--", linewidth=1.5, - label=f"theoretical torus_2d (6 SIPs) = " - f"{THEORETICAL_TORUS_2D_6SIP_NS:.0f} ns", + + # Theoretical torus_2d curve across all payload sizes. + torus_rs = sorted( + [r for r in records if r["sip_topology"] == "torus_2d"], + key=lambda r: r["bytes_per_pe"], ) - BYTES_96KB = 96 * 1024 - ax.axvline( - x=BYTES_96KB, ymin=0, ymax=1, - color="tab:red", linestyle=":", linewidth=1.2, - ) - ax.plot( - [BYTES_96KB], [THEORETICAL_TORUS_2D_6SIP_NS], - marker="x", color="tab:red", markersize=10, markeredgewidth=2, - ) - # Find simulated torus_2d latency at 96 KB (if present) for direct - # comparison with the theoretical value. - sim_torus_at_96kb = next( - (r["latency_ns"] for r in records - if r["sip_topology"] == "torus_2d" and r["bytes_per_pe"] == BYTES_96KB), - None, - ) - if sim_torus_at_96kb is not None: + if torus_rs: + xs_th = [r["bytes_per_pe"] for r in torus_rs] + ys_th = [_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs] ax.plot( - [BYTES_96KB], [sim_torus_at_96kb], - marker="o", color="tab:orange", - markersize=10, markeredgecolor="black", markeredgewidth=1.2, - ) - ax.annotate( - f"96 KB\n" - f"theoretical = {THEORETICAL_TORUS_2D_6SIP_NS:.0f} ns\n" - f"simulated = {sim_torus_at_96kb:.0f} ns", - xy=(BYTES_96KB, sim_torus_at_96kb), - xytext=(10, -20), textcoords="offset points", - color="tab:red", fontsize=9, - ) - else: - ax.annotate( - f"96 KB\n→ theoretical {THEORETICAL_TORUS_2D_6SIP_NS:.0f} ns", - xy=(BYTES_96KB, THEORETICAL_TORUS_2D_6SIP_NS), - xytext=(8, -20), textcoords="offset points", - color="tab:red", fontsize=9, + xs_th, ys_th, + color="tab:red", linestyle="--", linewidth=1.6, marker="x", + label="theoretical torus_2d (6 SIPs)", ) + ax.set_xscale("log", base=2) ax.set_xlabel("Bytes per PE (log scale)") - ax.set_ylabel("max pe_exec_ns (critical path)") + ax.set_ylabel("Time (ns)") ax.set_title("Multi-device allreduce latency by topology") ax.grid(True, alpha=0.3) - - # Drop 128 KB tick (overlaps visually with the explicit 96 KB marker) - # and add 96 KB. - BYTES_128KB = 128 * 1024 - existing_ticks = [t for t in ax.get_xticks() if int(t) != BYTES_128KB] - if BYTES_96KB not in existing_ticks: - existing_ticks.append(BYTES_96KB) - ax.set_xticks(sorted(existing_ticks)) ax.set_xlim(left=min(r["bytes_per_pe"] for r in records) / 2, - right=BYTES_96KB * 1.5) + right=max(r["bytes_per_pe"] for r in records) * 1.5) ax.legend() ax.xaxis.set_major_formatter(_bytes_fmt) fig.tight_layout() @@ -811,7 +797,7 @@ def _draw_cube_reduction(ax): def emit_topology_diagram() -> str: - """Emit a 2×2-panel topology diagram into allreduce_latency_plots/. + """Emit a 2×2-panel topology diagram into docs/diagrams/allreduce_latency_plots/. Top row: ring_1d | torus_2d (2×3) Bot row: mesh_2d_no_wrap (2×3) | cube-level reduction in SIP 0 diff --git a/tests/test_intercube_root_center.py b/tests/test_intercube_root_center.py new file mode 100644 index 0000000..ca40e3a --- /dev/null +++ b/tests/test_intercube_root_center.py @@ -0,0 +1,139 @@ +"""Phase 1 test for moving the intercube_allreduce root cube from the +bottom-right corner (3,3) to the geometric center (2,2). + +Today's algorithm (intercube_allreduce.py) hardcodes +``root_cube = (cube_h-1) * cube_w + (cube_w-1)`` (= cube 15 in 4×4). +The intra-SIP critical path for one allreduce is therefore:: + + Phase 1 (row reduce W→E to col 3) : 3 hops + Phase 2 (col reduce N→S to row 3 on col 3): 3 hops + Phase 3 (inter-SIP at root) : (separate) + Phase 4 (col broadcast S→N) : 3 hops + Phase 5 (row broadcast E→W) : 3 hops + Total intra-SIP critical path : 12 hops + +Moving the root to (2,2) and using BIDIRECTIONAL convergence (cols 0..2 +go W→E, col 3 goes E→W in parallel; rows 0..2 go N→S, row 3 goes S→N +in parallel) cuts each phase's critical path from 3 hops to 2:: + + Phase 1 critical path : max(2, 1) = 2 hops + Phase 2 critical path : max(2, 1) = 2 hops + Phase 4 critical path : 2 hops + Phase 5 critical path : 2 hops + Total intra-SIP critical path : 8 hops + +Per-hop cost at 96 KB on TCM ≈ 600 ns (slot IO write+read 384 ns + +fabric drain ~217 ns). 4 fewer hops ⇒ ~2.4 µs reduction. + +EXPECTED Phase 1 outcome: + - Today (root = corner) : ~22.0 µs ← test FAILS (> 20500 ns) + - After Phase 2 (root = center) : ~19.6 µs ← test PASSES (< 20500 ns) +""" +from __future__ import annotations + +from pathlib import Path + +import pytest + +from kernbench.runtime_api.context import RuntimeContext +from kernbench.runtime_api.types import DeviceSelector +from kernbench.sim_engine.engine import GraphEngine +from kernbench.topology.builder import resolve_topology + +from tests.test_allreduce_multidevice import ( + _write_temp_configs, + run_allreduce, +) + + +def _run_torus_96kb(tmp_path: Path) -> float: + """Run torus_2d 6-SIP allreduce at 96 KB / slot, return critical-path + pe_exec_ns. Fixed at TCM (the project default).""" + sub = tmp_path / "torus_root_center" + sub.mkdir() + topo_path, ccl_path = _write_temp_configs( + sub, + sip_topology="torus_2d", + n_sips=6, + algorithm="intercube_allreduce", + sip_w=3, sip_h=2, + n_elem_override=49152, # 49152 × 2 = 96 KB / slot + ) + topo = resolve_topology(topo_path) + engine = GraphEngine(topo.topology_obj, enable_data=True) + spec = topo.topology_obj.spec + with RuntimeContext( + engine=engine, + target_device=DeviceSelector("all"), + correlation_id="root_center_phase1", + spec=spec, + ) as ctx: + result = run_allreduce( + ctx, engine, spec, + algorithm="intercube_allreduce", ccl_yaml=ccl_path, + ) + assert result["ok_cubes"] > 0 + pe_exec_vals = [ + float(tr.get("pe_exec_ns", 0.0) or 0.0) + for _, (_, tr) in engine._results.items() + if isinstance(tr, dict) + ] + return max(pe_exec_vals) if pe_exec_vals else 0.0 + + +def test_intra_sip_critical_path_at_96k_below_threshold(tmp_path): + """Post-Phase-2 (root=center, bidirectional reduce) the torus_2d + 96 KB allreduce on TCM should drop below 20.5 µs. + + Today's value: ~22.0 µs (12-hop critical path with corner root). + Expected post-Phase-2: ~19.6 µs (8-hop critical path with + center root) — model estimate, ~11% reduction end-to-end. + """ + lat_ns = _run_torus_96kb(tmp_path) + THRESHOLD_NS = 20_500.0 + assert lat_ns < THRESHOLD_NS, ( + f"torus_2d 6-SIP 96 KB allreduce should land below " + f"{THRESHOLD_NS:.0f} ns post-Phase-2 (root=center, " + f"bidirectional reduce). got {lat_ns:.1f} ns " + f"({lat_ns / 1000:.2f} µs)" + ) + + +def test_correctness_preserved(tmp_path): + """Smoke check: at small n_elem the new algorithm must still produce + the correct sum across all 96 cubes. ``run_allreduce`` validates + every cube against the expected reduce result (``ok_cubes`` must be + 96 = 6 SIPs × 16 cubes). + + This guards against the obvious Phase 2 risk: bidirectional reduce + sums each contribution exactly once. If implemented wrong (double- + counting or skipping the right edge column / bottom row), the + asserts inside run_allreduce fail. + """ + sub = tmp_path / "correctness" + sub.mkdir() + topo_path, ccl_path = _write_temp_configs( + sub, + sip_topology="torus_2d", + n_sips=6, + algorithm="intercube_allreduce", + sip_w=3, sip_h=2, + n_elem_override=128, # tiny payload to keep this fast + ) + topo = resolve_topology(topo_path) + engine = GraphEngine(topo.topology_obj, enable_data=True) + spec = topo.topology_obj.spec + with RuntimeContext( + engine=engine, + target_device=DeviceSelector("all"), + correlation_id="root_center_correctness", + spec=spec, + ) as ctx: + result = run_allreduce( + ctx, engine, spec, + algorithm="intercube_allreduce", ccl_yaml=ccl_path, + ) + n_cubes = 6 * 16 # 6 SIPs × 16 cubes/SIP + assert result["ok_cubes"] == n_cubes, ( + f"all 96 cubes must validate; got {result['ok_cubes']} OK" + )