{ "tile_sizes": { "M": 32, "K": 64, "N": 32 }, "engines": [ "pe_dma", "pe_fetch_store", "pe_gemm", "pe_math" ], "stages": [ "DMA_READ", "DMA_WRITE", "FETCH", "STORE", "GEMM", "MATH" ], "variants": [ "ref_ref", "load_ref", "load_load" ], "rows": [ { "M": 32, "K": 32, "N": 32, "variant": "ref_ref", "flops": 65536, "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, "sim_wall_clock_s": 0.569, "engines": { "pe_dma": { "occupancy_ns": 52.0, "wall_ns": 52.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 40.0, "wall_ns": 40.0, "record_count": 2 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 88.38400000000001 }, { "M": 32, "K": 32, "N": 32, "variant": "load_ref", "flops": 65536, "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, "sim_wall_clock_s": 0.409, "engines": { "pe_dma": { "occupancy_ns": 44.5, "wall_ns": 44.5, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 1 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 81.894 }, { "M": 32, "K": 32, "N": 32, "variant": "load_load", "flops": 65536, "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, "sim_wall_clock_s": 0.567, "engines": { "pe_dma": { "occupancy_ns": 37.0, "wall_ns": 37.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 75.404 }, { "M": 32, "K": 64, "N": 32, "variant": "ref_ref", "flops": 131072, "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, "sim_wall_clock_s": 0.838, "engines": { "pe_dma": { "occupancy_ns": 52.0, "wall_ns": 52.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 40.0, "wall_ns": 40.0, "record_count": 2 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 88.38400000000001 }, { "M": 32, "K": 64, "N": 32, "variant": "load_ref", "flops": 131072, "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, "sim_wall_clock_s": 1.097, "engines": { "pe_dma": { "occupancy_ns": 52.5, "wall_ns": 52.5, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 1 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 89.894 }, { "M": 32, "K": 64, "N": 32, "variant": "load_load", "flops": 131072, "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, "sim_wall_clock_s": 1.264, "engines": { "pe_dma": { "occupancy_ns": 53.0, "wall_ns": 53.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 20.0, "wall_ns": 20.0, "record_count": 2 }, "pe_gemm": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 1 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 16.384000000000015, "wall_ns": 16.384000000000015, "record_count": 1 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 91.404 }, { "M": 32, "K": 128, "N": 32, "variant": "ref_ref", "flops": 262144, "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, "sim_wall_clock_s": 1.187, "engines": { "pe_dma": { "occupancy_ns": 131.995, "wall_ns": 80.0, "record_count": 5 }, "pe_fetch_store": { "occupancy_ns": 36.0, "wall_ns": 36.0, "record_count": 3 }, "pe_gemm": { "occupancy_ns": 33.152000000000044, "wall_ns": 32.76800000000003, "record_count": 2 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 119.995, "wall_ns": 68.0, "record_count": 4 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 32.0, "wall_ns": 32.0, "record_count": 2 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 33.152000000000044, "wall_ns": 32.76800000000003, "record_count": 2 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 100.76800000000003 }, { "M": 32, "K": 128, "N": 32, "variant": "load_ref", "flops": 262144, "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, "sim_wall_clock_s": 1.13, "engines": { "pe_dma": { "occupancy_ns": 104.495, "wall_ns": 84.5, "record_count": 4 }, "pe_fetch_store": { "occupancy_ns": 36.0, "wall_ns": 36.0, "record_count": 3 }, "pe_gemm": { "occupancy_ns": 33.152000000000044, "wall_ns": 32.76800000000003, "record_count": 2 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 55.995000000000005, "wall_ns": 36.0, "record_count": 2 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 32.0, "wall_ns": 32.0, "record_count": 2 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 33.152000000000044, "wall_ns": 32.76800000000003, "record_count": 2 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 106.27300000000002 }, { "M": 32, "K": 128, "N": 32, "variant": "load_load", "flops": 262144, "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, "sim_wall_clock_s": 1.113, "engines": { "pe_dma": { "occupancy_ns": 85.0, "wall_ns": 85.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 51.995000000000005, "wall_ns": 36.0, "record_count": 3 }, "pe_gemm": { "occupancy_ns": 33.152000000000044, "wall_ns": 32.76800000000003, "record_count": 2 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 47.995000000000005, "wall_ns": 32.0, "record_count": 2 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 33.152000000000044, "wall_ns": 32.76800000000003, "record_count": 2 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 123.78300000000002 }, { "M": 32, "K": 128, "N": 128, "variant": "ref_ref", "flops": 1048576, "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, "sim_wall_clock_s": 1.451, "engines": { "pe_dma": { "occupancy_ns": 1687.995, "wall_ns": 272.0, "record_count": 20 }, "pe_fetch_store": { "occupancy_ns": 201.6959999999999, "wall_ns": 132.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 136.0640000000003, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 1631.995, "wall_ns": 260.0, "record_count": 16 }, "DMA_WRITE": { "occupancy_ns": 56.0, "wall_ns": 40.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 148.0, "wall_ns": 132.0, "record_count": 8 }, "STORE": { "occupancy_ns": 53.69599999999991, "wall_ns": 47.23199999999997, "record_count": 4 }, "GEMM": { "occupancy_ns": 136.0640000000003, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 297.9200000000001 }, { "M": 32, "K": 128, "N": 128, "variant": "load_ref", "flops": 1048576, "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, "sim_wall_clock_s": 1.269, "engines": { "pe_dma": { "occupancy_ns": 700.495, "wall_ns": 180.5, "record_count": 13 }, "pe_fetch_store": { "occupancy_ns": 201.6959999999999, "wall_ns": 132.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 136.0640000000003, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 607.995, "wall_ns": 132.0, "record_count": 8 }, "DMA_WRITE": { "occupancy_ns": 56.0, "wall_ns": 40.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 148.0, "wall_ns": 132.0, "record_count": 8 }, "STORE": { "occupancy_ns": 53.69599999999991, "wall_ns": 47.23199999999997, "record_count": 4 }, "GEMM": { "occupancy_ns": 136.0640000000003, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 207.42500000000007 }, { "M": 32, "K": 128, "N": 128, "variant": "load_load", "flops": 1048576, "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, "sim_wall_clock_s": 1.225, "engines": { "pe_dma": { "occupancy_ns": 217.0, "wall_ns": 217.0, "record_count": 6 }, "pe_fetch_store": { "occupancy_ns": 591.995, "wall_ns": 128.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 141.82400000000052, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 48.0, "wall_ns": 48.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 575.995, "wall_ns": 128.0, "record_count": 8 }, "STORE": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 4 }, "GEMM": { "occupancy_ns": 141.82400000000052, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 318.0870000000001 }, { "M": 32, "K": 3072, "N": 32, "variant": "ref_ref", "flops": 6291456, "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, "sim_wall_clock_s": 2.724, "engines": { "pe_dma": { "occupancy_ns": 55883.995, "wall_ns": 1552.0, "record_count": 97 }, "pe_fetch_store": { "occupancy_ns": 791.1039999999994, "wall_ns": 772.0, "record_count": 49 }, "pe_gemm": { "occupancy_ns": 1215.584000000017, "wall_ns": 786.4320000000007, "record_count": 48 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 55871.995, "wall_ns": 1540.0, "record_count": 96 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 772.0, "wall_ns": 772.0, "record_count": 48 }, "STORE": { "occupancy_ns": 19.10399999999936, "wall_ns": 19.10399999999936, "record_count": 1 }, "GEMM": { "occupancy_ns": 1215.584000000017, "wall_ns": 786.4320000000007, "record_count": 48 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 1590.4320000000007 }, { "M": 32, "K": 3072, "N": 32, "variant": "load_ref", "flops": 6291456, "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, "sim_wall_clock_s": 2.137, "engines": { "pe_dma": { "occupancy_ns": 19792.495, "wall_ns": 1556.5, "record_count": 50 }, "pe_fetch_store": { "occupancy_ns": 791.1039999999994, "wall_ns": 772.0, "record_count": 49 }, "pe_gemm": { "occupancy_ns": 1215.584000000017, "wall_ns": 786.4320000000007, "record_count": 48 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 19007.995, "wall_ns": 772.0, "record_count": 48 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 772.0, "wall_ns": 772.0, "record_count": 48 }, "STORE": { "occupancy_ns": 19.10399999999936, "wall_ns": 19.10399999999936, "record_count": 1 }, "GEMM": { "occupancy_ns": 1215.584000000017, "wall_ns": 786.4320000000007, "record_count": 48 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 1595.9370000000008 }, { "M": 32, "K": 3072, "N": 32, "variant": "load_load", "flops": 6291456, "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, "sim_wall_clock_s": 1.245, "engines": { "pe_dma": { "occupancy_ns": 1557.0, "wall_ns": 1557.0, "record_count": 3 }, "pe_fetch_store": { "occupancy_ns": 18819.99500000001, "wall_ns": 772.0000000000002, "record_count": 49 }, "pe_gemm": { "occupancy_ns": 1219.5839999999987, "wall_ns": 786.4320000000005, "record_count": 48 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 12.0, "wall_ns": 12.0, "record_count": 1 }, "FETCH": { "occupancy_ns": 18815.99500000001, "wall_ns": 768.0000000000002, "record_count": 48 }, "STORE": { "occupancy_ns": 4.0, "wall_ns": 4.0, "record_count": 1 }, "GEMM": { "occupancy_ns": 1219.5839999999987, "wall_ns": 786.4320000000005, "record_count": 48 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 2349.447000000001 }, { "M": 8, "K": 128, "N": 128, "variant": "ref_ref", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, "sim_wall_clock_s": 1.477, "engines": { "pe_dma": { "occupancy_ns": 1687.995, "wall_ns": 272.0, "record_count": 20 }, "pe_fetch_store": { "occupancy_ns": 201.6959999999999, "wall_ns": 132.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 136.0640000000003, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 1631.995, "wall_ns": 260.0, "record_count": 16 }, "DMA_WRITE": { "occupancy_ns": 56.0, "wall_ns": 40.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 148.0, "wall_ns": 132.0, "record_count": 8 }, "STORE": { "occupancy_ns": 53.69599999999991, "wall_ns": 47.23199999999997, "record_count": 4 }, "GEMM": { "occupancy_ns": 136.0640000000003, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 297.9200000000001 }, { "M": 8, "K": 128, "N": 128, "variant": "load_ref", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, "sim_wall_clock_s": 1.443, "engines": { "pe_dma": { "occupancy_ns": 676.495, "wall_ns": 156.5, "record_count": 13 }, "pe_fetch_store": { "occupancy_ns": 201.6959999999999, "wall_ns": 132.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 136.0640000000003, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 607.995, "wall_ns": 132.0, "record_count": 8 }, "DMA_WRITE": { "occupancy_ns": 56.0, "wall_ns": 40.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 148.0, "wall_ns": 132.0, "record_count": 8 }, "STORE": { "occupancy_ns": 53.69599999999991, "wall_ns": 47.23199999999997, "record_count": 4 }, "GEMM": { "occupancy_ns": 136.0640000000003, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 183.42500000000007 }, { "M": 8, "K": 128, "N": 128, "variant": "load_load", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, "sim_wall_clock_s": 1.198, "engines": { "pe_dma": { "occupancy_ns": 193.0, "wall_ns": 193.0, "record_count": 6 }, "pe_fetch_store": { "occupancy_ns": 591.995, "wall_ns": 128.0, "record_count": 12 }, "pe_gemm": { "occupancy_ns": 141.82400000000052, "wall_ns": 131.07200000000012, "record_count": 8 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 48.0, "wall_ns": 48.0, "record_count": 4 }, "FETCH": { "occupancy_ns": 575.995, "wall_ns": 128.0, "record_count": 8 }, "STORE": { "occupancy_ns": 16.0, "wall_ns": 16.0, "record_count": 4 }, "GEMM": { "occupancy_ns": 141.82400000000052, "wall_ns": 131.07200000000012, "record_count": 8 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 294.0870000000001 }, { "M": 128, "K": 8, "N": 128, "variant": "ref_ref", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, "sim_wall_clock_s": 1.983, "engines": { "pe_dma": { "occupancy_ns": 6547.771, "wall_ns": 560.0, "record_count": 48 }, "pe_fetch_store": { "occupancy_ns": 481.72799999999916, "wall_ns": 268.0, "record_count": 32 }, "pe_gemm": { "occupancy_ns": 293.2480000000014, "wall_ns": 262.14400000000023, "record_count": 16 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 6335.995, "wall_ns": 516.0, "record_count": 32 }, "DMA_WRITE": { "occupancy_ns": 211.77599999999984, "wall_ns": 169.15200000000004, "record_count": 16 }, "FETCH": { "occupancy_ns": 308.0, "wall_ns": 260.0, "record_count": 16 }, "STORE": { "occupancy_ns": 173.72799999999916, "wall_ns": 164.2559999999994, "record_count": 16 }, "GEMM": { "occupancy_ns": 293.2480000000014, "wall_ns": 262.14400000000023, "record_count": 16 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 572.9920000000002 }, { "M": 128, "K": 8, "N": 128, "variant": "load_ref", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, "sim_wall_clock_s": 1.699, "engines": { "pe_dma": { "occupancy_ns": 2464.2709999999997, "wall_ns": 316.5, "record_count": 33 }, "pe_fetch_store": { "occupancy_ns": 481.72799999999916, "wall_ns": 268.0, "record_count": 32 }, "pe_gemm": { "occupancy_ns": 293.2480000000014, "wall_ns": 262.14400000000023, "record_count": 16 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 2239.995, "wall_ns": 260.0, "record_count": 16 }, "DMA_WRITE": { "occupancy_ns": 211.77599999999984, "wall_ns": 169.15200000000004, "record_count": 16 }, "FETCH": { "occupancy_ns": 308.0, "wall_ns": 260.0, "record_count": 16 }, "STORE": { "occupancy_ns": 173.72799999999916, "wall_ns": 164.2559999999994, "record_count": 16 }, "GEMM": { "occupancy_ns": 293.2480000000014, "wall_ns": 262.14400000000023, "record_count": 16 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 330.4970000000002 }, { "M": 128, "K": 8, "N": 128, "variant": "load_load", "flops": 262144, "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, "sim_wall_clock_s": 1.402, "engines": { "pe_dma": { "occupancy_ns": 217.0, "wall_ns": 217.0, "record_count": 18 }, "pe_fetch_store": { "occupancy_ns": 2239.995, "wall_ns": 264.0, "record_count": 32 }, "pe_gemm": { "occupancy_ns": 308.224000000002, "wall_ns": 262.14400000000023, "record_count": 16 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 192.0, "wall_ns": 192.0, "record_count": 16 }, "FETCH": { "occupancy_ns": 2175.995, "wall_ns": 256.0, "record_count": 16 }, "STORE": { "occupancy_ns": 64.0, "wall_ns": 64.0, "record_count": 16 }, "GEMM": { "occupancy_ns": 308.224000000002, "wall_ns": 262.14400000000023, "record_count": 16 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 321.1590000000002 }, { "M": 512, "K": 512, "N": 512, "variant": "ref_ref", "flops": 268435456, "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, "sim_wall_clock_s": 89.111, "engines": { "pe_dma": { "occupancy_ns": 100690943.995, "wall_ns": 65612.00000000001, "record_count": 4352 }, "pe_fetch_store": { "occupancy_ns": 43566.52800034459, "wall_ns": 32796.00000000001, "record_count": 2304 }, "pe_gemm": { "occupancy_ns": 833762.8159962555, "wall_ns": 33554.431999996836, "record_count": 2048 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 100687871.995, "wall_ns": 65540.00000000001, "record_count": 4096 }, "DMA_WRITE": { "occupancy_ns": 3072.0, "wall_ns": 3072.0, "record_count": 256 }, "FETCH": { "occupancy_ns": 40936.00000000001, "wall_ns": 32772.00000000001, "record_count": 2048 }, "STORE": { "occupancy_ns": 2630.5280003445805, "wall_ns": 2630.5280003445805, "record_count": 256 }, "GEMM": { "occupancy_ns": 833762.8159962555, "wall_ns": 33554.431999996836, "record_count": 2048 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 66360.12799999685 }, { "M": 512, "K": 512, "N": 512, "variant": "load_ref", "flops": 268435456, "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, "sim_wall_clock_s": 48.616, "engines": { "pe_dma": { "occupancy_ns": 33584132.495, "wall_ns": 34896.5, "record_count": 2305 }, "pe_fetch_store": { "occupancy_ns": 43562.81600011295, "wall_ns": 32796.0, "record_count": 2304 }, "pe_gemm": { "occupancy_ns": 833762.8159987241, "wall_ns": 33554.43199999785, "record_count": 2048 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 33579007.995, "wall_ns": 32772.0, "record_count": 2048 }, "DMA_WRITE": { "occupancy_ns": 3072.0, "wall_ns": 3072.0, "record_count": 256 }, "FETCH": { "occupancy_ns": 40936.0, "wall_ns": 32772.0, "record_count": 2048 }, "STORE": { "occupancy_ns": 2626.816000112947, "wall_ns": 2626.816000112947, "record_count": 256 }, "GEMM": { "occupancy_ns": 833762.8159987241, "wall_ns": 33554.43199999785, "record_count": 2048 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 35645.632999997855 }, { "M": 512, "K": 512, "N": 512, "variant": "load_load", "flops": 268435456, "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, "sim_wall_clock_s": 7.072, "engines": { "pe_dma": { "occupancy_ns": 7177.0, "wall_ns": 7177.0, "record_count": 258 }, "pe_fetch_store": { "occupancy_ns": 33571839.995, "wall_ns": 32792.0, "record_count": 2304 }, "pe_gemm": { "occupancy_ns": 838467.5839984363, "wall_ns": 33554.43199999763, "record_count": 2048 }, "pe_math": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "stages": { "DMA_READ": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 }, "DMA_WRITE": { "occupancy_ns": 3072.0, "wall_ns": 3072.0, "record_count": 256 }, "FETCH": { "occupancy_ns": 33570815.995, "wall_ns": 32767.999999999996, "record_count": 2048 }, "STORE": { "occupancy_ns": 1024.0, "wall_ns": 1024.0, "record_count": 256 }, "GEMM": { "occupancy_ns": 838467.5839984363, "wall_ns": 33554.43199999763, "record_count": 2048 }, "MATH": { "occupancy_ns": 0, "wall_ns": 0.0, "record_count": 0 } }, "pe_window_ns": 37677.44699999763 } ] }