{ "tile_sizes": { "M": 32, "K": 64, "N": 32 }, "engines": [ "pe_dma", "pe_fetch_store", "pe_gemm", "pe_math" ], "stages": [ "DMA_READ", "DMA_WRITE", "FETCH", "STORE", "GEMM", "MATH" ], "variants": [ "ref_ref", "load_ref", "load_load" ], "rows": [ { "M": 32, "K": 32, "N": 32, "variant": "ref_ref", "flops": 65536, "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, "sim_wall_clock_s": 0.574, "engines": { "pe_dma": { "occupancy_ns": 52.0, "wall_ns": 52.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 40.0, "wall_ns": 40.0, "record_count": 2 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 88.38400000000001, "composite_window_ns": 88.38400000000001 }, { "M": 32, "K": 32, "N": 32, "variant": "load_ref", "flops": 65536, "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, "sim_wall_clock_s": 0.392, "engines": { "pe_dma": { "occupancy_ns": 44.0, "wall_ns": 44.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 1 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 81.394, "composite_window_ns": 68.38400000000001 }, { "M": 32, "K": 32, "N": 32, "variant": "load_load", "flops": 65536, "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, "sim_wall_clock_s": 0.53, "engines": { "pe_dma": { "occupancy_ns": 36.0, "wall_ns": 36.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 74.904, "composite_window_ns": 48.384000000000015 }, { "M": 32, "K": 64, "N": 32, "variant": "ref_ref", "flops": 131072, "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, "sim_wall_clock_s": 0.625, "engines": { "pe_dma": { "occupancy_ns": 52.0, "wall_ns": 52.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 40.0, "wall_ns": 40.0, "record_count": 2 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 88.38400000000001, "composite_window_ns": 88.38400000000001 }, { "M": 32, "K": 64, "N": 32, "variant": "load_ref", "flops": 131072, "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, "sim_wall_clock_s": 0.513, "engines": { "pe_dma": { "occupancy_ns": 52.0, "wall_ns": 52.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 1 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 89.394, "composite_window_ns": 68.38400000000001 }, { "M": 32, "K": 64, "N": 32, "variant": "load_load", "flops": 131072, "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, "sim_wall_clock_s": 0.6, "engines": { "pe_dma": { "occupancy_ns": 52.0, "wall_ns": 52.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 90.904, "composite_window_ns": 48.384000000000015 }, { "M": 32, "K": 128, "N": 32, "variant": "ref_ref", "flops": 262144, "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, "sim_wall_clock_s": 0.572, "engines": { "pe_dma": { "occupancy_ns": 92.0, "wall_ns": 92.0, "record_count": 5 }, "pe_fetch_store": { "occupancy_ns": 39.615999999999985, "wall_ns": 36.0, "record_count": 3 }, "pe_gemm": { "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 80.0, "wall_ns": 80.0, "record_count": 4 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 32.0, "wall_ns": 32.0, "record_count": 2 }, "STORE": { "occupancy_ns": 7.6159999999999854, "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 112.38400000000001, "composite_window_ns": 112.38400000000001 }, { "M": 32, "K": 128, "N": 32, "variant": "load_ref", "flops": 262144, "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, "sim_wall_clock_s": 0.606, "engines": { "pe_dma": { "occupancy_ns": 88.0, "wall_ns": 88.0, "record_count": 4 }, "pe_fetch_store": { "occupancy_ns": 39.615999999999985, "wall_ns": 36.0, "record_count": 3 }, "pe_gemm": { "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 40.0, "wall_ns": 40.0, "record_count": 2 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 32.0, "wall_ns": 32.0, "record_count": 2 }, "STORE": { "occupancy_ns": 7.6159999999999854, "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 109.38900000000001, "composite_window_ns": 72.38400000000001 }, { "M": 32, "K": 128, "N": 32, "variant": "load_load", "flops": 262144, "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, "sim_wall_clock_s": 0.535, "engines": { "pe_dma": { "occupancy_ns": 84.0, "wall_ns": 84.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 51.995000000000005, "wall_ns": 36.0, "record_count": 3 }, "pe_gemm": { "occupancy_ns": 33.152000000000044, "wall_ns": 32.76800000000003, "record_count": 2 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 47.995000000000005, "wall_ns": 32.0, "record_count": 2 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 33.152000000000044, "wall_ns": 32.76800000000003, "record_count": 2 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 123.28300000000002, "composite_window_ns": 48.76800000000003 }, { "M": 32, "K": 128, "N": 128, "variant": "ref_ref", "flops": 1048576, "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, "sim_wall_clock_s": 0.746, "engines": { "pe_dma": { "occupancy_ns": 408.0, "wall_ns": 348.0, "record_count": 20 }, "pe_fetch_store": { "occupancy_ns": 162.46399999999994, "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 328.0, "wall_ns": 328.0, "record_count": 16 }, "DMA_WRITE": { "occupancy_ns": 80.0, "wall_ns": 80.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 128.0, "wall_ns": 128.0, "record_count": 8 }, "STORE": { "occupancy_ns": 34.46399999999994, "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 360.384, "composite_window_ns": 360.384 }, { "M": 32, "K": 128, "N": 128, "variant": "load_ref", "flops": 1048576, "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, "sim_wall_clock_s": 0.728, "engines": { "pe_dma": { "occupancy_ns": 284.0, "wall_ns": 224.0, "record_count": 13 }, "pe_fetch_store": { "occupancy_ns": 162.46399999999994, "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 168.0, "wall_ns": 168.0, "record_count": 8 }, "DMA_WRITE": { "occupancy_ns": 80.0, "wall_ns": 80.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 128.0, "wall_ns": 128.0, "record_count": 8 }, "STORE": { "occupancy_ns": 34.46399999999994, "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 237.389, "composite_window_ns": 200.38400000000001 }, { "M": 32, "K": 128, "N": 128, "variant": "load_load", "flops": 1048576, "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, "sim_wall_clock_s": 0.584, "engines": { "pe_dma": { "occupancy_ns": 216.0, "wall_ns": 216.0, "record_count": 6 }, "pe_fetch_store": { "occupancy_ns": 591.995, "wall_ns": 128.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 141.82400000000052, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 48.0, "wall_ns": 48.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 575.995, "wall_ns": 128.0, "record_count": 8 }, "STORE": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 4 }, "GEMM": { "occupancy_ns": 141.82400000000052, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 317.5870000000001, "composite_window_ns": 147.07200000000012 }, { "M": 32, "K": 3072, "N": 32, "variant": "ref_ref", "flops": 6291456, "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, "sim_wall_clock_s": 1.348, "engines": { "pe_dma": { "occupancy_ns": 1932.0, "wall_ns": 1932.0, "record_count": 97 }, "pe_fetch_store": { "occupancy_ns": 775.616, "wall_ns": 772.0, "record_count": 49 }, "pe_gemm": { "occupancy_ns": 786.4320000000007, "wall_ns": 786.4320000000007, "record_count": 48 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 1920.0, "wall_ns": 1920.0, "record_count": 96 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 768.0, "wall_ns": 768.0, "record_count": 48 }, "STORE": { "occupancy_ns": 7.6159999999999854, "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { "occupancy_ns": 786.4320000000007, "wall_ns": 786.4320000000007, "record_count": 48 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 1952.384, "composite_window_ns": 1952.384 }, { "M": 32, "K": 3072, "N": 32, "variant": "load_ref", "flops": 6291456, "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, "sim_wall_clock_s": 1.075, "engines": { "pe_dma": { "occupancy_ns": 1744.0000000000002, "wall_ns": 1744.0000000000002, "record_count": 50 }, "pe_fetch_store": { "occupancy_ns": 775.6160000000002, "wall_ns": 772.0000000000002, "record_count": 49 }, "pe_gemm": { "occupancy_ns": 786.4320000000005, "wall_ns": 786.4320000000005, "record_count": 48 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 960.0000000000002, "wall_ns": 960.0000000000002, "record_count": 48 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 768.0000000000002, "wall_ns": 768.0000000000002, "record_count": 48 }, "STORE": { "occupancy_ns": 7.6159999999999854, "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { "occupancy_ns": 786.4320000000005, "wall_ns": 786.4320000000005, "record_count": 48 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 1765.3890000000004, "composite_window_ns": 992.3840000000002 }, { "M": 32, "K": 3072, "N": 32, "variant": "load_load", "flops": 6291456, "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, "sim_wall_clock_s": 0.635, "engines": { "pe_dma": { "occupancy_ns": 1556.0, "wall_ns": 1556.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 18819.99500000001, "wall_ns": 772.0000000000002, "record_count": 49 }, "pe_gemm": { "occupancy_ns": 1219.5839999999987, "wall_ns": 786.4320000000005, "record_count": 48 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 18815.99500000001, "wall_ns": 768.0000000000002, "record_count": 48 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 1219.5839999999987, "wall_ns": 786.4320000000005, "record_count": 48 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 2348.947000000001, "composite_window_ns": 802.4320000000005 }, { "M": 8, "K": 128, "N": 128, "variant": "ref_ref", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, "sim_wall_clock_s": 0.706, "engines": { "pe_dma": { "occupancy_ns": 408.0, "wall_ns": 348.0, "record_count": 20 }, "pe_fetch_store": { "occupancy_ns": 162.46399999999994, "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 328.0, "wall_ns": 328.0, "record_count": 16 }, "DMA_WRITE": { "occupancy_ns": 80.0, "wall_ns": 80.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 128.0, "wall_ns": 128.0, "record_count": 8 }, "STORE": { "occupancy_ns": 34.46399999999994, "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 360.384, "composite_window_ns": 360.384 }, { "M": 8, "K": 128, "N": 128, "variant": "load_ref", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, "sim_wall_clock_s": 0.678, "engines": { "pe_dma": { "occupancy_ns": 260.0, "wall_ns": 200.0, "record_count": 13 }, "pe_fetch_store": { "occupancy_ns": 162.46399999999994, "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 168.0, "wall_ns": 168.0, "record_count": 8 }, "DMA_WRITE": { "occupancy_ns": 80.0, "wall_ns": 80.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 128.0, "wall_ns": 128.0, "record_count": 8 }, "STORE": { "occupancy_ns": 34.46399999999994, "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 213.389, "composite_window_ns": 200.38400000000001 }, { "M": 8, "K": 128, "N": 128, "variant": "load_load", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, "sim_wall_clock_s": 0.632, "engines": { "pe_dma": { "occupancy_ns": 192.0, "wall_ns": 192.0, "record_count": 6 }, "pe_fetch_store": { "occupancy_ns": 591.995, "wall_ns": 128.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 141.82400000000052, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 48.0, "wall_ns": 48.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 575.995, "wall_ns": 128.0, "record_count": 8 }, "STORE": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 4 }, "GEMM": { "occupancy_ns": 141.82400000000052, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 293.5870000000001, "composite_window_ns": 147.07200000000012 }, { "M": 128, "K": 8, "N": 128, "variant": "ref_ref", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, "sim_wall_clock_s": 0.978, "engines": { "pe_dma": { "occupancy_ns": 1040.0000000000002, "wall_ns": 732.0000000000001, "record_count": 48 }, "pe_fetch_store": { "occupancy_ns": 422.2400000000001, "wall_ns": 320.0000000000001, "record_count": 32 }, "pe_gemm": { "occupancy_ns": 262.1440000000001, "wall_ns": 262.1440000000001, "record_count": 16 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 688.0000000000001, "wall_ns": 688.0000000000001, "record_count": 32 }, "DMA_WRITE": { "occupancy_ns": 352.0000000000001, "wall_ns": 352.0000000000001, "record_count": 16 }, "FETCH": { "occupancy_ns": 256.0, "wall_ns": 256.0, "record_count": 16 }, "STORE": { "occupancy_ns": 166.24000000000012, "wall_ns": 166.24000000000012, "record_count": 16 }, "GEMM": { "occupancy_ns": 262.1440000000001, "wall_ns": 262.1440000000001, "record_count": 16 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 736.3840000000001, "composite_window_ns": 736.3840000000001 }, { "M": 128, "K": 8, "N": 128, "variant": "load_ref", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, "sim_wall_clock_s": 0.999, "engines": { "pe_dma": { "occupancy_ns": 732.0, "wall_ns": 424.0, "record_count": 33 }, "pe_fetch_store": { "occupancy_ns": 422.2399999999998, "wall_ns": 320.0, "record_count": 32 }, "pe_gemm": { "occupancy_ns": 262.14400000000023, "wall_ns": 262.14400000000023, "record_count": 16 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 368.0, "wall_ns": 368.0, "record_count": 16 }, "DMA_WRITE": { "occupancy_ns": 352.0, "wall_ns": 352.0, "record_count": 16 }, "FETCH": { "occupancy_ns": 256.0, "wall_ns": 256.0, "record_count": 16 }, "STORE": { "occupancy_ns": 166.23999999999978, "wall_ns": 166.23999999999978, "record_count": 16 }, "GEMM": { "occupancy_ns": 262.14400000000023, "wall_ns": 262.14400000000023, "record_count": 16 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 429.389, "composite_window_ns": 416.384 }, { "M": 128, "K": 8, "N": 128, "variant": "load_load", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, "sim_wall_clock_s": 0.776, "engines": { "pe_dma": { "occupancy_ns": 216.0, "wall_ns": 216.0, "record_count": 18 }, "pe_fetch_store": { "occupancy_ns": 2239.995, "wall_ns": 264.0, "record_count": 32 }, "pe_gemm": { "occupancy_ns": 308.224000000002, "wall_ns": 262.14400000000023, "record_count": 16 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 192.0, "wall_ns": 192.0, "record_count": 16 }, "FETCH": { "occupancy_ns": 2175.995, "wall_ns": 256.0, "record_count": 16 }, "STORE": { "occupancy_ns": 64.0, "wall_ns": 64.0, "record_count": 16 }, "GEMM": { "occupancy_ns": 308.224000000002, "wall_ns": 262.14400000000023, "record_count": 16 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 320.6590000000002, "composite_window_ns": 294.14400000000023 }, { "M": 512, "K": 512, "N": 512, "variant": "ref_ref", "flops": 268435456, "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, "sim_wall_clock_s": 43.303, "engines": { "pe_dma": { "occupancy_ns": 90092.00000000001, "wall_ns": 82952.00000000001, "record_count": 4352 }, "pe_fetch_store": { "occupancy_ns": 34717.695999999676, "wall_ns": 33792.00000000001, "record_count": 2304 }, "pe_gemm": { "occupancy_ns": 33554.43200000262, "wall_ns": 33554.43200000262, "record_count": 2048 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 82940.00000000001, "wall_ns": 82940.00000000001, "record_count": 4096 }, "DMA_WRITE": { "occupancy_ns": 7152.0, "wall_ns": 7152.0, "record_count": 256 }, "FETCH": { "occupancy_ns": 32768.00000000001, "wall_ns": 32768.00000000001, "record_count": 2048 }, "STORE": { "occupancy_ns": 1949.6959999996689, "wall_ns": 1949.6959999996689, "record_count": 256 }, "GEMM": { "occupancy_ns": 33554.43200000262, "wall_ns": 33554.43200000262, "record_count": 2048 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 82972.38400000002, "composite_window_ns": 82972.38400000002 }, { "M": 512, "K": 512, "N": 512, "variant": "load_ref", "flops": 268435456, "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, "sim_wall_clock_s": 23.224, "engines": { "pe_dma": { "occupancy_ns": 51184.0, "wall_ns": 44044.0, "record_count": 2305 }, "pe_fetch_store": { "occupancy_ns": 34717.69600000031, "wall_ns": 33792.0, "record_count": 2304 }, "pe_gemm": { "occupancy_ns": 33554.43199999754, "wall_ns": 33554.43199999755, "record_count": 2048 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 41980.0, "wall_ns": 41980.0, "record_count": 2048 }, "DMA_WRITE": { "occupancy_ns": 7151.999999999999, "wall_ns": 7151.999999999999, "record_count": 256 }, "FETCH": { "occupancy_ns": 32768.0, "wall_ns": 32768.0, "record_count": 2048 }, "STORE": { "occupancy_ns": 1949.6960000003073, "wall_ns": 1949.6960000003073, "record_count": 256 }, "GEMM": { "occupancy_ns": 33554.43199999754, "wall_ns": 33554.43199999755, "record_count": 2048 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 44065.389, "composite_window_ns": 42012.384 }, { "M": 512, "K": 512, "N": 512, "variant": "load_load", "flops": 268435456, "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, "sim_wall_clock_s": 3.48, "engines": { "pe_dma": { "occupancy_ns": 7176.0, "wall_ns": 7176.0, "record_count": 258 }, "pe_fetch_store": { "occupancy_ns": 33571839.995, "wall_ns": 32792.0, "record_count": 2304 }, "pe_gemm": { "occupancy_ns": 838467.5839984363, "wall_ns": 33554.43199999763, "record_count": 2048 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 3072.0, "wall_ns": 3072.0, "record_count": 256 }, "FETCH": { "occupancy_ns": 33570815.995, "wall_ns": 32767.999999999996, "record_count": 2048 }, "STORE": { "occupancy_ns": 1024.0, "wall_ns": 1024.0, "record_count": 256 }, "GEMM": { "occupancy_ns": 838467.5839984363, "wall_ns": 33554.43199999763, "record_count": 2048 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 37676.94699999763, "composite_window_ns": 33570.43199999763 } ] }