diff --git a/docs/diagrams/gemm_sweep.json b/docs/diagrams/gemm_sweep.json index 9ab6fbd..e31c35d 100644 --- a/docs/diagrams/gemm_sweep.json +++ b/docs/diagrams/gemm_sweep.json @@ -33,7 +33,7 @@ "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, - "sim_wall_clock_s": 0.569, + "sim_wall_clock_s": 0.574, "engines": { "pe_dma": { "occupancy_ns": 52.0, @@ -88,7 +88,8 @@ "record_count": 0 } }, - "pe_window_ns": 88.38400000000001 + "pe_window_ns": 88.38400000000001, + "composite_window_ns": 88.38400000000001 }, { "M": 32, @@ -99,11 +100,11 @@ "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, - "sim_wall_clock_s": 0.409, + "sim_wall_clock_s": 0.392, "engines": { "pe_dma": { - "occupancy_ns": 44.5, - "wall_ns": 44.5, + "occupancy_ns": 44.0, + "wall_ns": 44.0, "record_count": 3 }, "pe_fetch_store": { @@ -154,7 +155,8 @@ "record_count": 0 } }, - "pe_window_ns": 81.894 + "pe_window_ns": 81.394, + "composite_window_ns": 68.38400000000001 }, { "M": 32, @@ -165,11 +167,11 @@ "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, - "sim_wall_clock_s": 0.567, + "sim_wall_clock_s": 0.53, "engines": { "pe_dma": { - "occupancy_ns": 37.0, - "wall_ns": 37.0, + "occupancy_ns": 36.0, + "wall_ns": 36.0, "record_count": 3 }, "pe_fetch_store": { @@ -220,7 +222,8 @@ "record_count": 0 } }, - "pe_window_ns": 75.404 + "pe_window_ns": 74.904, + "composite_window_ns": 48.384000000000015 }, { "M": 32, @@ -231,7 +234,7 @@ "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, - "sim_wall_clock_s": 0.838, + "sim_wall_clock_s": 0.625, "engines": { "pe_dma": { "occupancy_ns": 52.0, @@ -286,7 +289,8 @@ "record_count": 0 } }, - "pe_window_ns": 88.38400000000001 + "pe_window_ns": 88.38400000000001, + "composite_window_ns": 88.38400000000001 }, { "M": 32, @@ -297,11 +301,11 @@ "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, - "sim_wall_clock_s": 1.097, + 
"sim_wall_clock_s": 0.513, "engines": { "pe_dma": { - "occupancy_ns": 52.5, - "wall_ns": 52.5, + "occupancy_ns": 52.0, + "wall_ns": 52.0, "record_count": 3 }, "pe_fetch_store": { @@ -352,7 +356,8 @@ "record_count": 0 } }, - "pe_window_ns": 89.894 + "pe_window_ns": 89.394, + "composite_window_ns": 68.38400000000001 }, { "M": 32, @@ -363,11 +368,11 @@ "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, - "sim_wall_clock_s": 1.264, + "sim_wall_clock_s": 0.6, "engines": { "pe_dma": { - "occupancy_ns": 53.0, - "wall_ns": 53.0, + "occupancy_ns": 52.0, + "wall_ns": 52.0, "record_count": 3 }, "pe_fetch_store": { @@ -418,7 +423,8 @@ "record_count": 0 } }, - "pe_window_ns": 91.404 + "pe_window_ns": 90.904, + "composite_window_ns": 48.384000000000015 }, { "M": 32, @@ -429,20 +435,20 @@ "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, - "sim_wall_clock_s": 1.187, + "sim_wall_clock_s": 0.572, "engines": { "pe_dma": { - "occupancy_ns": 131.995, - "wall_ns": 80.0, + "occupancy_ns": 92.0, + "wall_ns": 92.0, "record_count": 5 }, "pe_fetch_store": { - "occupancy_ns": 36.0, + "occupancy_ns": 39.615999999999985, "wall_ns": 36.0, "record_count": 3 }, "pe_gemm": { - "occupancy_ns": 33.152000000000044, + "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, @@ -454,8 +460,8 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 119.995, - "wall_ns": 68.0, + "occupancy_ns": 80.0, + "wall_ns": 80.0, "record_count": 4 }, "DMA_WRITE": { @@ -469,12 +475,12 @@ "record_count": 2 }, "STORE": { - "occupancy_ns": 4.0, - "wall_ns": 4.0, + "occupancy_ns": 7.6159999999999854, + "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { - "occupancy_ns": 33.152000000000044, + "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, @@ -484,7 +490,8 @@ "record_count": 0 } }, - "pe_window_ns": 100.76800000000003 + "pe_window_ns": 112.38400000000001, + "composite_window_ns": 
112.38400000000001 }, { "M": 32, @@ -495,20 +502,20 @@ "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, - "sim_wall_clock_s": 1.13, + "sim_wall_clock_s": 0.606, "engines": { "pe_dma": { - "occupancy_ns": 104.495, - "wall_ns": 84.5, + "occupancy_ns": 88.0, + "wall_ns": 88.0, "record_count": 4 }, "pe_fetch_store": { - "occupancy_ns": 36.0, + "occupancy_ns": 39.615999999999985, "wall_ns": 36.0, "record_count": 3 }, "pe_gemm": { - "occupancy_ns": 33.152000000000044, + "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, @@ -520,8 +527,8 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 55.995000000000005, - "wall_ns": 36.0, + "occupancy_ns": 40.0, + "wall_ns": 40.0, "record_count": 2 }, "DMA_WRITE": { @@ -535,12 +542,12 @@ "record_count": 2 }, "STORE": { - "occupancy_ns": 4.0, - "wall_ns": 4.0, + "occupancy_ns": 7.6159999999999854, + "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { - "occupancy_ns": 33.152000000000044, + "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, @@ -550,7 +557,8 @@ "record_count": 0 } }, - "pe_window_ns": 106.27300000000002 + "pe_window_ns": 109.38900000000001, + "composite_window_ns": 72.38400000000001 }, { "M": 32, @@ -561,11 +569,11 @@ "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, - "sim_wall_clock_s": 1.113, + "sim_wall_clock_s": 0.535, "engines": { "pe_dma": { - "occupancy_ns": 85.0, - "wall_ns": 85.0, + "occupancy_ns": 84.0, + "wall_ns": 84.0, "record_count": 3 }, "pe_fetch_store": { @@ -616,7 +624,8 @@ "record_count": 0 } }, - "pe_window_ns": 123.78300000000002 + "pe_window_ns": 123.28300000000002, + "composite_window_ns": 48.76800000000003 }, { "M": 32, @@ -627,20 +636,20 @@ "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, - "sim_wall_clock_s": 1.451, + "sim_wall_clock_s": 0.746, "engines": { "pe_dma": { - "occupancy_ns": 1687.995, - "wall_ns": 
272.0, + "occupancy_ns": 408.0, + "wall_ns": 348.0, "record_count": 20 }, "pe_fetch_store": { - "occupancy_ns": 201.6959999999999, - "wall_ns": 132.0, + "occupancy_ns": 162.46399999999994, + "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -652,27 +661,27 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 1631.995, - "wall_ns": 260.0, + "occupancy_ns": 328.0, + "wall_ns": 328.0, "record_count": 16 }, "DMA_WRITE": { - "occupancy_ns": 56.0, - "wall_ns": 40.0, + "occupancy_ns": 80.0, + "wall_ns": 80.0, "record_count": 4 }, "FETCH": { - "occupancy_ns": 148.0, - "wall_ns": 132.0, + "occupancy_ns": 128.0, + "wall_ns": 128.0, "record_count": 8 }, "STORE": { - "occupancy_ns": 53.69599999999991, - "wall_ns": 47.23199999999997, + "occupancy_ns": 34.46399999999994, + "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -682,7 +691,8 @@ "record_count": 0 } }, - "pe_window_ns": 297.9200000000001 + "pe_window_ns": 360.384, + "composite_window_ns": 360.384 }, { "M": 32, @@ -693,20 +703,20 @@ "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, - "sim_wall_clock_s": 1.269, + "sim_wall_clock_s": 0.728, "engines": { "pe_dma": { - "occupancy_ns": 700.495, - "wall_ns": 180.5, + "occupancy_ns": 284.0, + "wall_ns": 224.0, "record_count": 13 }, "pe_fetch_store": { - "occupancy_ns": 201.6959999999999, - "wall_ns": 132.0, + "occupancy_ns": 162.46399999999994, + "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -718,27 +728,27 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 607.995, - "wall_ns": 132.0, + "occupancy_ns": 168.0, + "wall_ns": 168.0, 
"record_count": 8 }, "DMA_WRITE": { - "occupancy_ns": 56.0, - "wall_ns": 40.0, + "occupancy_ns": 80.0, + "wall_ns": 80.0, "record_count": 4 }, "FETCH": { - "occupancy_ns": 148.0, - "wall_ns": 132.0, + "occupancy_ns": 128.0, + "wall_ns": 128.0, "record_count": 8 }, "STORE": { - "occupancy_ns": 53.69599999999991, - "wall_ns": 47.23199999999997, + "occupancy_ns": 34.46399999999994, + "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -748,7 +758,8 @@ "record_count": 0 } }, - "pe_window_ns": 207.42500000000007 + "pe_window_ns": 237.389, + "composite_window_ns": 200.38400000000001 }, { "M": 32, @@ -759,11 +770,11 @@ "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, - "sim_wall_clock_s": 1.225, + "sim_wall_clock_s": 0.584, "engines": { "pe_dma": { - "occupancy_ns": 217.0, - "wall_ns": 217.0, + "occupancy_ns": 216.0, + "wall_ns": 216.0, "record_count": 6 }, "pe_fetch_store": { @@ -814,7 +825,8 @@ "record_count": 0 } }, - "pe_window_ns": 318.0870000000001 + "pe_window_ns": 317.5870000000001, + "composite_window_ns": 147.07200000000012 }, { "M": 32, @@ -825,20 +837,20 @@ "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, - "sim_wall_clock_s": 2.724, + "sim_wall_clock_s": 1.348, "engines": { "pe_dma": { - "occupancy_ns": 55883.995, - "wall_ns": 1552.0, + "occupancy_ns": 1932.0, + "wall_ns": 1932.0, "record_count": 97 }, "pe_fetch_store": { - "occupancy_ns": 791.1039999999994, + "occupancy_ns": 775.616, "wall_ns": 772.0, "record_count": 49 }, "pe_gemm": { - "occupancy_ns": 1215.584000000017, + "occupancy_ns": 786.4320000000007, "wall_ns": 786.4320000000007, "record_count": 48 }, @@ -850,8 +862,8 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 55871.995, - "wall_ns": 1540.0, + "occupancy_ns": 1920.0, + "wall_ns": 1920.0, "record_count": 96 }, "DMA_WRITE": { @@ -860,17 
+872,17 @@ "record_count": 1 }, "FETCH": { - "occupancy_ns": 772.0, - "wall_ns": 772.0, + "occupancy_ns": 768.0, + "wall_ns": 768.0, "record_count": 48 }, "STORE": { - "occupancy_ns": 19.10399999999936, - "wall_ns": 19.10399999999936, + "occupancy_ns": 7.6159999999999854, + "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { - "occupancy_ns": 1215.584000000017, + "occupancy_ns": 786.4320000000007, "wall_ns": 786.4320000000007, "record_count": 48 }, @@ -880,7 +892,8 @@ "record_count": 0 } }, - "pe_window_ns": 1590.4320000000007 + "pe_window_ns": 1952.384, + "composite_window_ns": 1952.384 }, { "M": 32, @@ -891,21 +904,21 @@ "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, - "sim_wall_clock_s": 2.137, + "sim_wall_clock_s": 1.075, "engines": { "pe_dma": { - "occupancy_ns": 19792.495, - "wall_ns": 1556.5, + "occupancy_ns": 1744.0000000000002, + "wall_ns": 1744.0000000000002, "record_count": 50 }, "pe_fetch_store": { - "occupancy_ns": 791.1039999999994, - "wall_ns": 772.0, + "occupancy_ns": 775.6160000000002, + "wall_ns": 772.0000000000002, "record_count": 49 }, "pe_gemm": { - "occupancy_ns": 1215.584000000017, - "wall_ns": 786.4320000000007, + "occupancy_ns": 786.4320000000005, + "wall_ns": 786.4320000000005, "record_count": 48 }, "pe_math": { @@ -916,8 +929,8 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 19007.995, - "wall_ns": 772.0, + "occupancy_ns": 960.0000000000002, + "wall_ns": 960.0000000000002, "record_count": 48 }, "DMA_WRITE": { @@ -926,18 +939,18 @@ "record_count": 1 }, "FETCH": { - "occupancy_ns": 772.0, - "wall_ns": 772.0, + "occupancy_ns": 768.0000000000002, + "wall_ns": 768.0000000000002, "record_count": 48 }, "STORE": { - "occupancy_ns": 19.10399999999936, - "wall_ns": 19.10399999999936, + "occupancy_ns": 7.6159999999999854, + "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { - "occupancy_ns": 1215.584000000017, - "wall_ns": 786.4320000000007, + "occupancy_ns": 786.4320000000005, + 
"wall_ns": 786.4320000000005, "record_count": 48 }, "MATH": { @@ -946,7 +959,8 @@ "record_count": 0 } }, - "pe_window_ns": 1595.9370000000008 + "pe_window_ns": 1765.3890000000004, + "composite_window_ns": 992.3840000000002 }, { "M": 32, @@ -957,11 +971,11 @@ "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, - "sim_wall_clock_s": 1.245, + "sim_wall_clock_s": 0.635, "engines": { "pe_dma": { - "occupancy_ns": 1557.0, - "wall_ns": 1557.0, + "occupancy_ns": 1556.0, + "wall_ns": 1556.0, "record_count": 3 }, "pe_fetch_store": { @@ -1012,7 +1026,8 @@ "record_count": 0 } }, - "pe_window_ns": 2349.447000000001 + "pe_window_ns": 2348.947000000001, + "composite_window_ns": 802.4320000000005 }, { "M": 8, @@ -1023,20 +1038,20 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, - "sim_wall_clock_s": 1.477, + "sim_wall_clock_s": 0.706, "engines": { "pe_dma": { - "occupancy_ns": 1687.995, - "wall_ns": 272.0, + "occupancy_ns": 408.0, + "wall_ns": 348.0, "record_count": 20 }, "pe_fetch_store": { - "occupancy_ns": 201.6959999999999, - "wall_ns": 132.0, + "occupancy_ns": 162.46399999999994, + "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -1048,27 +1063,27 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 1631.995, - "wall_ns": 260.0, + "occupancy_ns": 328.0, + "wall_ns": 328.0, "record_count": 16 }, "DMA_WRITE": { - "occupancy_ns": 56.0, - "wall_ns": 40.0, + "occupancy_ns": 80.0, + "wall_ns": 80.0, "record_count": 4 }, "FETCH": { - "occupancy_ns": 148.0, - "wall_ns": 132.0, + "occupancy_ns": 128.0, + "wall_ns": 128.0, "record_count": 8 }, "STORE": { - "occupancy_ns": 53.69599999999991, - "wall_ns": 47.23199999999997, + "occupancy_ns": 34.46399999999994, + "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 
131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -1078,7 +1093,8 @@ "record_count": 0 } }, - "pe_window_ns": 297.9200000000001 + "pe_window_ns": 360.384, + "composite_window_ns": 360.384 }, { "M": 8, @@ -1089,20 +1105,20 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, - "sim_wall_clock_s": 1.443, + "sim_wall_clock_s": 0.678, "engines": { "pe_dma": { - "occupancy_ns": 676.495, - "wall_ns": 156.5, + "occupancy_ns": 260.0, + "wall_ns": 200.0, "record_count": 13 }, "pe_fetch_store": { - "occupancy_ns": 201.6959999999999, - "wall_ns": 132.0, + "occupancy_ns": 162.46399999999994, + "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -1114,27 +1130,27 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 607.995, - "wall_ns": 132.0, + "occupancy_ns": 168.0, + "wall_ns": 168.0, "record_count": 8 }, "DMA_WRITE": { - "occupancy_ns": 56.0, - "wall_ns": 40.0, + "occupancy_ns": 80.0, + "wall_ns": 80.0, "record_count": 4 }, "FETCH": { - "occupancy_ns": 148.0, - "wall_ns": 132.0, + "occupancy_ns": 128.0, + "wall_ns": 128.0, "record_count": 8 }, "STORE": { - "occupancy_ns": 53.69599999999991, - "wall_ns": 47.23199999999997, + "occupancy_ns": 34.46399999999994, + "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -1144,7 +1160,8 @@ "record_count": 0 } }, - "pe_window_ns": 183.42500000000007 + "pe_window_ns": 213.389, + "composite_window_ns": 200.38400000000001 }, { "M": 8, @@ -1155,11 +1172,11 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, - "sim_wall_clock_s": 1.198, + "sim_wall_clock_s": 0.632, "engines": { "pe_dma": { - "occupancy_ns": 193.0, - "wall_ns": 193.0, + "occupancy_ns": 192.0, + "wall_ns": 192.0, 
"record_count": 6 }, "pe_fetch_store": { @@ -1210,7 +1227,8 @@ "record_count": 0 } }, - "pe_window_ns": 294.0870000000001 + "pe_window_ns": 293.5870000000001, + "composite_window_ns": 147.07200000000012 }, { "M": 128, @@ -1221,21 +1239,21 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, - "sim_wall_clock_s": 1.983, + "sim_wall_clock_s": 0.978, "engines": { "pe_dma": { - "occupancy_ns": 6547.771, - "wall_ns": 560.0, + "occupancy_ns": 1040.0000000000002, + "wall_ns": 732.0000000000001, "record_count": 48 }, "pe_fetch_store": { - "occupancy_ns": 481.72799999999916, - "wall_ns": 268.0, + "occupancy_ns": 422.2400000000001, + "wall_ns": 320.0000000000001, "record_count": 32 }, "pe_gemm": { - "occupancy_ns": 293.2480000000014, - "wall_ns": 262.14400000000023, + "occupancy_ns": 262.1440000000001, + "wall_ns": 262.1440000000001, "record_count": 16 }, "pe_math": { @@ -1246,28 +1264,28 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 6335.995, - "wall_ns": 516.0, + "occupancy_ns": 688.0000000000001, + "wall_ns": 688.0000000000001, "record_count": 32 }, "DMA_WRITE": { - "occupancy_ns": 211.77599999999984, - "wall_ns": 169.15200000000004, + "occupancy_ns": 352.0000000000001, + "wall_ns": 352.0000000000001, "record_count": 16 }, "FETCH": { - "occupancy_ns": 308.0, - "wall_ns": 260.0, + "occupancy_ns": 256.0, + "wall_ns": 256.0, "record_count": 16 }, "STORE": { - "occupancy_ns": 173.72799999999916, - "wall_ns": 164.2559999999994, + "occupancy_ns": 166.24000000000012, + "wall_ns": 166.24000000000012, "record_count": 16 }, "GEMM": { - "occupancy_ns": 293.2480000000014, - "wall_ns": 262.14400000000023, + "occupancy_ns": 262.1440000000001, + "wall_ns": 262.1440000000001, "record_count": 16 }, "MATH": { @@ -1276,7 +1294,8 @@ "record_count": 0 } }, - "pe_window_ns": 572.9920000000002 + "pe_window_ns": 736.3840000000001, + "composite_window_ns": 736.3840000000001 }, { "M": 128, @@ -1287,20 +1306,20 @@ "bytes_hbm": 36864, "arith_intensity": 
7.111111111111111, "tile_count_expected": 16, - "sim_wall_clock_s": 1.699, + "sim_wall_clock_s": 0.999, "engines": { "pe_dma": { - "occupancy_ns": 2464.2709999999997, - "wall_ns": 316.5, + "occupancy_ns": 732.0, + "wall_ns": 424.0, "record_count": 33 }, "pe_fetch_store": { - "occupancy_ns": 481.72799999999916, - "wall_ns": 268.0, + "occupancy_ns": 422.2399999999998, + "wall_ns": 320.0, "record_count": 32 }, "pe_gemm": { - "occupancy_ns": 293.2480000000014, + "occupancy_ns": 262.14400000000023, "wall_ns": 262.14400000000023, "record_count": 16 }, @@ -1312,27 +1331,27 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 2239.995, - "wall_ns": 260.0, + "occupancy_ns": 368.0, + "wall_ns": 368.0, "record_count": 16 }, "DMA_WRITE": { - "occupancy_ns": 211.77599999999984, - "wall_ns": 169.15200000000004, + "occupancy_ns": 352.0, + "wall_ns": 352.0, "record_count": 16 }, "FETCH": { - "occupancy_ns": 308.0, - "wall_ns": 260.0, + "occupancy_ns": 256.0, + "wall_ns": 256.0, "record_count": 16 }, "STORE": { - "occupancy_ns": 173.72799999999916, - "wall_ns": 164.2559999999994, + "occupancy_ns": 166.23999999999978, + "wall_ns": 166.23999999999978, "record_count": 16 }, "GEMM": { - "occupancy_ns": 293.2480000000014, + "occupancy_ns": 262.14400000000023, "wall_ns": 262.14400000000023, "record_count": 16 }, @@ -1342,7 +1361,8 @@ "record_count": 0 } }, - "pe_window_ns": 330.4970000000002 + "pe_window_ns": 429.389, + "composite_window_ns": 416.384 }, { "M": 128, @@ -1353,11 +1373,11 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, - "sim_wall_clock_s": 1.402, + "sim_wall_clock_s": 0.776, "engines": { "pe_dma": { - "occupancy_ns": 217.0, - "wall_ns": 217.0, + "occupancy_ns": 216.0, + "wall_ns": 216.0, "record_count": 18 }, "pe_fetch_store": { @@ -1408,7 +1428,8 @@ "record_count": 0 } }, - "pe_window_ns": 321.1590000000002 + "pe_window_ns": 320.6590000000002, + "composite_window_ns": 294.14400000000023 }, { "M": 512, @@ -1419,21 +1440,21 @@ 
"bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, - "sim_wall_clock_s": 89.111, + "sim_wall_clock_s": 43.303, "engines": { "pe_dma": { - "occupancy_ns": 100690943.995, - "wall_ns": 65612.00000000001, + "occupancy_ns": 90092.00000000001, + "wall_ns": 82952.00000000001, "record_count": 4352 }, "pe_fetch_store": { - "occupancy_ns": 43566.52800034459, - "wall_ns": 32796.00000000001, + "occupancy_ns": 34717.695999999676, + "wall_ns": 33792.00000000001, "record_count": 2304 }, "pe_gemm": { - "occupancy_ns": 833762.8159962555, - "wall_ns": 33554.431999996836, + "occupancy_ns": 33554.43200000262, + "wall_ns": 33554.43200000262, "record_count": 2048 }, "pe_math": { @@ -1444,28 +1465,28 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 100687871.995, - "wall_ns": 65540.00000000001, + "occupancy_ns": 82940.00000000001, + "wall_ns": 82940.00000000001, "record_count": 4096 }, "DMA_WRITE": { - "occupancy_ns": 3072.0, - "wall_ns": 3072.0, + "occupancy_ns": 7152.0, + "wall_ns": 7152.0, "record_count": 256 }, "FETCH": { - "occupancy_ns": 40936.00000000001, - "wall_ns": 32772.00000000001, + "occupancy_ns": 32768.00000000001, + "wall_ns": 32768.00000000001, "record_count": 2048 }, "STORE": { - "occupancy_ns": 2630.5280003445805, - "wall_ns": 2630.5280003445805, + "occupancy_ns": 1949.6959999996689, + "wall_ns": 1949.6959999996689, "record_count": 256 }, "GEMM": { - "occupancy_ns": 833762.8159962555, - "wall_ns": 33554.431999996836, + "occupancy_ns": 33554.43200000262, + "wall_ns": 33554.43200000262, "record_count": 2048 }, "MATH": { @@ -1474,7 +1495,8 @@ "record_count": 0 } }, - "pe_window_ns": 66360.12799999685 + "pe_window_ns": 82972.38400000002, + "composite_window_ns": 82972.38400000002 }, { "M": 512, @@ -1485,21 +1507,21 @@ "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, - "sim_wall_clock_s": 48.616, + "sim_wall_clock_s": 23.224, "engines": { "pe_dma": { - "occupancy_ns": 33584132.495, - "wall_ns": 
34896.5, + "occupancy_ns": 51184.0, + "wall_ns": 44044.0, "record_count": 2305 }, "pe_fetch_store": { - "occupancy_ns": 43562.81600011295, - "wall_ns": 32796.0, + "occupancy_ns": 34717.69600000031, + "wall_ns": 33792.0, "record_count": 2304 }, "pe_gemm": { - "occupancy_ns": 833762.8159987241, - "wall_ns": 33554.43199999785, + "occupancy_ns": 33554.43199999754, + "wall_ns": 33554.43199999755, "record_count": 2048 }, "pe_math": { @@ -1510,28 +1532,28 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 33579007.995, - "wall_ns": 32772.0, + "occupancy_ns": 41980.0, + "wall_ns": 41980.0, "record_count": 2048 }, "DMA_WRITE": { - "occupancy_ns": 3072.0, - "wall_ns": 3072.0, + "occupancy_ns": 7151.999999999999, + "wall_ns": 7151.999999999999, "record_count": 256 }, "FETCH": { - "occupancy_ns": 40936.0, - "wall_ns": 32772.0, + "occupancy_ns": 32768.0, + "wall_ns": 32768.0, "record_count": 2048 }, "STORE": { - "occupancy_ns": 2626.816000112947, - "wall_ns": 2626.816000112947, + "occupancy_ns": 1949.6960000003073, + "wall_ns": 1949.6960000003073, "record_count": 256 }, "GEMM": { - "occupancy_ns": 833762.8159987241, - "wall_ns": 33554.43199999785, + "occupancy_ns": 33554.43199999754, + "wall_ns": 33554.43199999755, "record_count": 2048 }, "MATH": { @@ -1540,7 +1562,8 @@ "record_count": 0 } }, - "pe_window_ns": 35645.632999997855 + "pe_window_ns": 44065.389, + "composite_window_ns": 42012.384 }, { "M": 512, @@ -1551,11 +1574,11 @@ "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, - "sim_wall_clock_s": 7.072, + "sim_wall_clock_s": 3.48, "engines": { "pe_dma": { - "occupancy_ns": 7177.0, - "wall_ns": 7177.0, + "occupancy_ns": 7176.0, + "wall_ns": 7176.0, "record_count": 258 }, "pe_fetch_store": { @@ -1606,7 +1629,8 @@ "record_count": 0 } }, - "pe_window_ns": 37677.44699999763 + "pe_window_ns": 37676.94699999763, + "composite_window_ns": 33570.43199999763 } ] } \ No newline at end of file diff --git a/docs/diagrams/kernbench2_overview.pptx 
b/docs/diagrams/kernbench2_overview.pptx index dde76aa..35bfaba 100644 Binary files a/docs/diagrams/kernbench2_overview.pptx and b/docs/diagrams/kernbench2_overview.pptx differ diff --git a/scripts/build_overview_slides.py b/scripts/build_overview_slides.py index 2eb0df5..8ccfded 100644 --- a/scripts/build_overview_slides.py +++ b/scripts/build_overview_slides.py @@ -114,23 +114,19 @@ SLIDES = [ "render": "hbm_topology", }, { - "title": "14. GEMM Utilization + Useful Pipeline Efficiency (load_ref)", - "render": "mac_utilization", + "title": "14. Formula vs Measured Useful Eff (load_ref) — sim validates model", + "render": "mac_utilization_formula_vs_measured", }, { - "title": "15. GEMM Utilization + Useful Pipeline Efficiency (ref_ref — both A & B via DMA_R)", - "render": "mac_utilization_ref_ref", - }, - { - "title": "16. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)", + "title": "15. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)", "render": "pipeline_eff_walkthrough", }, { - "title": "17. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)", + "title": "16. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)", "render": "pipeline_eff_walkthrough_largeK", }, { - "title": "18. Useful Pipelined Efficiency (ideal pipeline × GEMM util)", + "title": "17. Useful Pipelined Efficiency (ideal pipeline × GEMM util)", "render": "tflops_table", }, ] @@ -1715,12 +1711,12 @@ def _render_mac_utilization(slide): "Useful eff": "Useful eff %", } - _textbox(slide, 0.4, 1.0, 12.6, 0.70, - f"GEMM util = useful FLOPs ÷ (tile FLOPs × tile count) — pure " - f"shape-vs-tile metric. " - f"Useful eff = (N_tiles × T_stage × GEMM_util) ÷ wall — " - f"useful FLOPs delivered as a fraction of peak over the " - f"ideal-pipelined wall (head + K-loop + inter-(m,n) DMA_W).", + _textbox(slide, 0.4, 1.0, 12.6, 0.80, + "FORMULA-generated (analytical ideal-pipeline model — not " + "simulator data). 
GEMM util = useful FLOPs ÷ (tile FLOPs × " + "tile count). Useful eff = (N_tiles × T_stage × GEMM_util) " + "÷ wall, where wall = head + K-loop + inter-(m,n) DMA_W. " + "Slide 16 overlays this against measured pe_window_ns.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) _draw_native_bar_chart( @@ -1819,11 +1815,12 @@ def _render_mac_utilization_ref_ref(slide): "Useful eff": "Useful eff % (ref_ref)", } - _textbox(slide, 0.4, 1.0, 12.6, 0.75, - "ref_ref: scheduler issues DMA_R for BOTH A and B every tile. " - "Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH and GEMM stay " - "at 16 ns. Pipeline cycle is DMA-bound → useful eff caps near " - "50 % × GEMM_util, regardless of K-loop length.", + _textbox(slide, 0.4, 1.0, 12.6, 0.85, + "FORMULA-generated (ideal-pipeline model, ref_ref variant — " + "not simulator data). Scheduler issues DMA_R for BOTH A and B " + "every tile. Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH " + "and GEMM stay at 16 ns. Pipeline cycle is DMA-bound → useful " + "eff caps near 50 % × GEMM_util, regardless of K-loop length.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) _draw_native_bar_chart( @@ -1846,6 +1843,136 @@ def _render_mac_utilization_ref_ref(slide): ) +def _render_mac_utilization_formula_vs_measured(slide): + """Overlay slide: formula useful_eff vs measured useful_eff (load_ref). + + Formula = (N_tiles × T_stage × GEMM_util) / wall_formula × 100 + wall_formula = head + N_tiles·T_stage + inter·DMA_W + Measured = (useful_FLOPs / pe_window_ns) / peak_FLOPs_per_ns × 100 + pe_window_ns from gemm_sweep.json (honest post Option B). + + Agreement validates the analytical model against the simulator. + """ + data = _load_sweep_data() + rows = data["rows"] + if not rows: + _textbox(slide, 0.4, 3.0, 12.6, 1.0, + "No sweep data. 
Run scripts/gemm_sweep.py first.", + size=14, color=COL_RED, align=PP_ALIGN.LEFT) + return + tile = data["tile_sizes"] + TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"] + tile_flops = 2 * TILE_M * TILE_K * TILE_N + + HBM_GBS = 256.0 + bpe = 2 + T_STAGE = 16.0 + D_STAGES = 3 + head_ns = (D_STAGES - 1) * T_STAGE + dma_w_per_pair_ns = (TILE_M * TILE_N * bpe) / HBM_GBS + peak_per_ns = tile_flops / T_STAGE # MAC peak throughput, flops/ns + + by_shape: dict = {} + for r in rows: + if r["variant"] != "load_ref": + continue + by_shape[(r["M"], r["K"], r["N"])] = r + shapes = list(by_shape.keys()) + + shape_labels = [_shape_label(by_shape[k]) for k in shapes] + flagged = [_under_tile(k[0], k[1], k[2], TILE_M, TILE_K, TILE_N) + for k in shapes] + tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes] + + gemm_util_formula: list[float] = [] + gemm_util_measured: list[float] = [] + formula_eff: list[float] = [] + measured_eff: list[float] = [] + for k in shapes: + r = by_shape[k] + M, K, N = r["M"], r["K"], r["N"] + useful = 2 * M * K * N + tiles = r["tile_count_expected"] + gu_formula = useful / (tile_flops * tiles) + gemm_util_formula.append(gu_formula * 100) + + # Measured GEMM util uses the GEMM stage record count from op_log + # (i.e. the actual number of GEMM tiles the simulator ran). 
+ gemm_record_count = ( + r.get("stages", {}).get("GEMM", {}).get("record_count", 0) + or tiles # fallback if stages dict missing for older sweep data + ) + gu_measured = useful / (tile_flops * gemm_record_count) \ + if gemm_record_count > 0 else 0.0 + gemm_util_measured.append(gu_measured * 100) + + m_tiles = (M + TILE_M - 1) // TILE_M + n_tiles = (N + TILE_N - 1) // TILE_N + n_mn = m_tiles * n_tiles + compute_total = tiles * T_STAGE + inter_dma_w = max(0, n_mn - 1) * dma_w_per_pair_ns + wall_formula = head_ns + compute_total + inter_dma_w + feff = (compute_total * gu_formula / wall_formula) * 100 \ + if wall_formula > 0 else 0.0 + formula_eff.append(feff) + + comp_window_ns = r.get("composite_window_ns", 0.0) or 0.0 + if comp_window_ns > 0: + meff = (useful / comp_window_ns / peak_per_ns) * 100 + else: + meff = 0.0 + measured_eff.append(meff) + + series = { + "GEMM util F": gemm_util_formula, + "GEMM util M": gemm_util_measured, + "Formula eff": formula_eff, + "Measured eff": measured_eff, + } + colors_map = { + "GEMM util F": COL_FS, # emerald (formula ceiling) + "GEMM util M": RGBColor(0x6E, 0xE7, 0xB7), # mint (measured ceiling) + "Formula eff": RGBColor(0xF5, 0x9E, 0x0B), # amber (formula eff) + "Measured eff": COL_DMA, # blue (measured eff) + } + display_map = { + "GEMM util F": "GEMM util % (formula)", + "GEMM util M": "GEMM util % (measured, op_log)", + "Formula eff": "Formula useful eff %", + "Measured eff": "Measured useful eff %", + } + + _textbox(slide, 0.4, 1.0, 12.6, 1.10, + "Four series per shape:\n" + " • GEMM util (formula, green): useful_FLOPs ÷ (tile_FLOPs × " + "tile_count_expected) — structural shape-vs-tile ceiling.\n" + " • GEMM util (measured, mint): useful_FLOPs ÷ (tile_FLOPs × " + "GEMM_record_count) — uses the actual GEMM ops the simulator " + "ran. 
Should equal formula → validates plan execution.\n" + " • Formula useful eff (amber): GEMM_util × ideal pipeline efficiency.\n" + " • Measured useful eff (blue): (useful_FLOPs ÷ " + "composite_window_ns) ÷ peak — composite_window_ns covers " + "only the tl.composite pipeline (excludes upfront tl.load).", + size=10, color=COL_MUTED, align=PP_ALIGN.LEFT) + + _draw_native_bar_chart( + slide, + plot_x=1.0, plot_y=2.25, plot_w=10.0, plot_h=3.85, + shape_labels=shape_labels, flagged=flagged, + tile_counts=tile_counts, + series=series, colors_map=colors_map, + display_map=display_map, + wall_clocks=None, + y_label="%", + legend_x=11.4, legend_w=1.85, + foot_note=("Reading: GEMM util F = GEMM util M → simulator ran the " + "expected tile plan. Measured eff uses composite-only " + "window (excludes upfront tl.load), so it isolates " + "pipeline efficiency. Variant: load_ref."), + threshold_line=100.0, + ) + + def _render_tflops_table(slide): """Ideal pipelined pipe_eff: assumes non-blocking tl.load + multi-channel HBM. 
@@ -2282,8 +2409,9 @@ _RENDERERS.update({ "variant_comparison": _render_variant_comparison, "hbm_topology": _render_hbm_topology, "per_op_dma": _render_per_op_dma, - "mac_utilization": _render_mac_utilization, - "mac_utilization_ref_ref": _render_mac_utilization_ref_ref, + "mac_utilization": _render_mac_utilization, + "mac_utilization_ref_ref": _render_mac_utilization_ref_ref, + "mac_utilization_formula_vs_measured": _render_mac_utilization_formula_vs_measured, "tflops_table": _render_tflops_table, "pipeline_eff_walkthrough": _render_pipeline_eff_walkthrough, "pipeline_eff_walkthrough_largeK": _render_pipeline_eff_walkthrough_largeK, diff --git a/scripts/gemm_sweep.py b/scripts/gemm_sweep.py index c991ed6..5400469 100644 --- a/scripts/gemm_sweep.py +++ b/scripts/gemm_sweep.py @@ -179,6 +179,13 @@ def _run_one(M: int, K: int, N: int, topology: str, variant: str = "ref_ref") -> - min(r.t_start for r in pe_records) else: row["pe_window_ns"] = 0.0 + stage_records = [r for r in op_log + if r.params.get("stage_type") in STAGES] + if stage_records: + row["composite_window_ns"] = max(r.t_end for r in stage_records) \ + - min(r.t_start for r in stage_records) + else: + row["composite_window_ns"] = 0.0 return row diff --git a/src/kernbench/components/base.py b/src/kernbench/components/base.py index 336b631..932b9e9 100644 --- a/src/kernbench/components/base.py +++ b/src/kernbench/components/base.py @@ -138,8 +138,16 @@ class PeEngineBase(ComponentBase): env.process(self._forward_txn(env, msg)) def _handle_with_hooks(self, env: simpy.Environment, pe_txn: Any) -> Generator: - """Wrap handle_command with op log hooks on the inner command.""" - self._on_process_start(env, pe_txn.command) + """Wrap handle_command with op log hooks on the inner command. + + Subclasses that need to defer record_start until after a resource + wait (e.g. 
pe_dma's DMA-channel acquire) set + ``_DEFER_RECORD_START = True`` and call + ``self._on_process_start(env, pe_txn.command)`` themselves at the + post-wait moment. record_end still fires here. + """ + if not getattr(self, "_DEFER_RECORD_START", False): + self._on_process_start(env, pe_txn.command) yield from self.handle_command(env, pe_txn) self._on_process_end(env, pe_txn.command) diff --git a/src/kernbench/components/builtin/pe_dma.py b/src/kernbench/components/builtin/pe_dma.py index 7e11b8f..ed3de9e 100644 --- a/src/kernbench/components/builtin/pe_dma.py +++ b/src/kernbench/components/builtin/pe_dma.py @@ -27,6 +27,12 @@ class PeDmaComponent(PeEngineBase): (DmaReadCmd → HBM read, DmaWriteCmd → HBM write) """ + # Defer op_log record_start until AFTER the DMA channel is acquired so + # t_start reflects the serve-start moment (post queueing) rather than + # the queue-enter moment. PeEngineBase._handle_with_hooks consults this + # flag. + _DEFER_RECORD_START = True + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: super().__init__(node, ctx) self._dma_read: simpy.Resource | None = None @@ -80,9 +86,16 @@ class PeDmaComponent(PeEngineBase): path = self.ctx.router.find_path(self._pe_prefix, dst_node) drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes) - # Acquire DMA channel (command issue serialization) + # Acquire DMA channel — held through the entire round-trip so the + # channel models "one DMA in flight per PE per direction" rather + # than just issue-time serialization. This is what makes Option B + # meaningful: t_start = serve-start covers the actual transfer. with dma_res.request() as req: yield req + # Option B: record_start fires AFTER channel acquired, so t_start + # = serve-start (excludes queue wait). _DEFER_RECORD_START=True + # suppresses the auto-start in PeEngineBase._handle_with_hooks. 
+ self._on_process_start(env, cmd) # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly) sub_done = env.event() sub_request = PeDmaMsg( @@ -99,10 +112,8 @@ class PeDmaComponent(PeEngineBase): # Send to next hop (path[0] is pe_dma itself, path[1] is router) if len(path) > 1: yield self.out_ports[path[1]].put(sub_txn.advance()) - # DMA channel released after issue - - # Wait for HBM transfer completion - yield sub_done + # Wait for HBM transfer completion BEFORE releasing the channel. + yield sub_done pe_txn.done.succeed() def _worker(self, env: simpy.Environment) -> Generator: @@ -293,15 +304,17 @@ class PeDmaComponent(PeEngineBase): txn.done.succeed() def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator: - """Pipeline mode: DMA read/write via fabric, then self-route.""" - self._on_process_start(env, token) + """Pipeline mode: DMA read/write via fabric, then self-route. + + Option B: record_start is fired *inside* _do_pipeline_dma, after the + DMA channel is acquired — record_end stays here. + """ yield from self._do_pipeline_dma(env, token) self._on_process_end(env, token) # Self-routing (handle same-component consecutive stages) next_stage = token.advance() while next_stage is not None and next_stage.component == self.node.id: - self._on_process_start(env, token) yield from self._do_pipeline_dma(env, token) self._on_process_end(env, token) next_stage = token.advance() @@ -340,8 +353,13 @@ class PeDmaComponent(PeEngineBase): path = self.ctx.router.find_path(self._pe_prefix, dst_node) drain_ns = self.ctx.compute_drain_ns(path, nbytes) + # Hold dma_res through the full round-trip — one DMA in flight + # per PE per direction — so Option B's t_start (post-acquire) + # bounds the actual transfer interval. with dma_res.request() as req: yield req + # Option B: t_start = post-acquire moment. 
+ self._on_process_start(env, token) sub_done = env.event() sub_request = PeDmaMsg( correlation_id="pipeline", @@ -356,8 +374,11 @@ class PeDmaComponent(PeEngineBase): ) if len(path) > 1: yield self.out_ports[path[1]].put(sub_txn.advance()) - - yield sub_done + yield sub_done + else: + # No-op (nbytes==0 or no ctx): no channel wait, but still record + # so _on_process_end has a matching pending entry to finalise. + self._on_process_start(env, token) def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator: """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition.""" diff --git a/tests/test_pe_pipeline.py b/tests/test_pe_pipeline.py index cb7d7a6..910f5c5 100644 --- a/tests/test_pe_pipeline.py +++ b/tests/test_pe_pipeline.py @@ -307,3 +307,95 @@ def test_pipeline_overlap_within_command(): assert stage_times[(1, "dma")] == 10.0 # tile1 gemm starts when tile0 gemm finishes (serialized at gemm queue) assert stage_times[(1, "gemm")] == 30.0 + + +# ── 6. Option B: pe_dma record_start fires post channel-acquire ──────── + + +def test_pe_dma_record_start_after_channel_acquire(): + """Three back-to-back DMA_READs serialise on pe_dma.cap=1. + + With ``_DEFER_RECORD_START = True`` on PeDmaComponent, each op's + ``t_start`` is captured right after ``yield req`` succeeds. Result: + + - op N's ``(t_end - t_start)`` is the *actual transfer time* — same + across all three ops (no queueing inflation). + - op N+1's ``t_start`` >= op N's ``t_end - epsilon`` (waited for the + previous holder to release the channel before being recorded). + + Counter-example (the bug this fix addresses): if ``record_start`` fired + on command entry, all three ops would share ``t_start == 0`` and the + second/third would show inflated ``t_end - t_start``. 
+ """ + from pathlib import Path + + from kernbench.common.pe_commands import DmaReadCmd, PeInternalTxn, TensorHandle + from kernbench.policy.address.phyaddr import PhysAddr + from kernbench.sim_engine.engine import GraphEngine + from kernbench.topology.builder import load_topology + + TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" + + def _hbm_pa() -> int: + slice_bytes = 48 * (1 << 30) // 8 + pa = PhysAddr.pe_hbm_addr( + sip_id=0, die_id=0, pe_id=0, + pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, + ) + return pa.encode() + + # enable_data=True wires the OpLogger into every component. + engine = GraphEngine(load_topology(TOPOLOGY_PATH), enable_data=True) + pe_dma_id = "sip0.cube0.pe0.pe_dma" + pe_dma = engine._components[pe_dma_id] + env = engine._env + + # Three back-to-back DMA_READ commands fed straight into pe_dma's inbox + # at t=0 so they all race for the cap=1 channel. + handles = [ + TensorHandle(id=f"r{i}", addr=0x1000 + i * 0x1000, + shape=(64, 32), dtype="f16", nbytes=4096) + for i in range(3) + ] + cmds = [ + DmaReadCmd(handle=h, src_addr=_hbm_pa(), nbytes=4096) + for h in handles + ] + txns = [PeInternalTxn(command=c, done=env.event()) for c in cmds] + + def submit_all(): + for txn in txns: + yield pe_dma._inbox.put(txn) + + env.process(submit_all()) + env.run() + + # Pull the three dma_read records out of the op log in order + dma_records = [ + r for r in engine.op_log + if r.op_name == "dma_read" and r.component_id == pe_dma_id + ] + assert len(dma_records) == 3, ( + f"expected 3 dma_read records, got {len(dma_records)}: {dma_records}" + ) + + durations = [r.t_end - r.t_start for r in dma_records] + # All three should have the same actual transfer time within ±1 ns. 
+ base = durations[0] + assert base > 0, f"first dma duration must be positive, got {base}" + for i, d in enumerate(durations): + assert abs(d - base) <= 1.0, ( + f"op {i} duration {d} differs from baseline {base} by >1 ns " + f"— record_start may still be including queue wait" + ) + + # Each subsequent op's t_start must be at or after the previous op's + # t_end (within the 1 ns tolerance asserted below) — i.e. the wait is + # *excluded* from the recorded interval, not folded into it. + for i in range(1, len(dma_records)): + prev_end = dma_records[i - 1].t_end + cur_start = dma_records[i].t_start + assert cur_start >= prev_end - 1.0, ( + f"op {i} t_start={cur_start} began before op {i-1} t_end={prev_end} " + f"— channel was not actually held, fix is incorrect" + )