Files
kernbench2/docs/diagrams/gemm_sweep.json
T
2026-05-13 15:00:41 -07:00

1612 lines
38 KiB
JSON

{
"tile_sizes": {
"M": 32,
"K": 64,
"N": 32
},
"engines": [
"pe_dma",
"pe_fetch_store",
"pe_gemm",
"pe_math"
],
"stages": [
"DMA_READ",
"DMA_WRITE",
"FETCH",
"STORE",
"GEMM",
"MATH"
],
"variants": [
"ref_ref",
"load_ref",
"load_load"
],
"rows": [
{
"M": 32,
"K": 32,
"N": 32,
"variant": "ref_ref",
"flops": 65536,
"bytes_hbm": 6144,
"arith_intensity": 10.666666666666666,
"tile_count_expected": 1,
"sim_wall_clock_s": 0.569,
"engines": {
"pe_dma": {
"occupancy_ns": 52.0,
"wall_ns": 52.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 40.0,
"wall_ns": 40.0,
"record_count": 2
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 88.38400000000001
},
{
"M": 32,
"K": 32,
"N": 32,
"variant": "load_ref",
"flops": 65536,
"bytes_hbm": 6144,
"arith_intensity": 10.666666666666666,
"tile_count_expected": 1,
"sim_wall_clock_s": 0.409,
"engines": {
"pe_dma": {
"occupancy_ns": 44.5,
"wall_ns": 44.5,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 1
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 81.894
},
{
"M": 32,
"K": 32,
"N": 32,
"variant": "load_load",
"flops": 65536,
"bytes_hbm": 6144,
"arith_intensity": 10.666666666666666,
"tile_count_expected": 1,
"sim_wall_clock_s": 0.567,
"engines": {
"pe_dma": {
"occupancy_ns": 37.0,
"wall_ns": 37.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 75.404
},
{
"M": 32,
"K": 64,
"N": 32,
"variant": "ref_ref",
"flops": 131072,
"bytes_hbm": 10240,
"arith_intensity": 12.8,
"tile_count_expected": 1,
"sim_wall_clock_s": 0.838,
"engines": {
"pe_dma": {
"occupancy_ns": 52.0,
"wall_ns": 52.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 40.0,
"wall_ns": 40.0,
"record_count": 2
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 88.38400000000001
},
{
"M": 32,
"K": 64,
"N": 32,
"variant": "load_ref",
"flops": 131072,
"bytes_hbm": 10240,
"arith_intensity": 12.8,
"tile_count_expected": 1,
"sim_wall_clock_s": 1.097,
"engines": {
"pe_dma": {
"occupancy_ns": 52.5,
"wall_ns": 52.5,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 1
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 89.894
},
{
"M": 32,
"K": 64,
"N": 32,
"variant": "load_load",
"flops": 131072,
"bytes_hbm": 10240,
"arith_intensity": 12.8,
"tile_count_expected": 1,
"sim_wall_clock_s": 1.264,
"engines": {
"pe_dma": {
"occupancy_ns": 53.0,
"wall_ns": 53.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 91.404
},
{
"M": 32,
"K": 128,
"N": 32,
"variant": "ref_ref",
"flops": 262144,
"bytes_hbm": 18432,
"arith_intensity": 14.222222222222221,
"tile_count_expected": 2,
"sim_wall_clock_s": 1.187,
"engines": {
"pe_dma": {
"occupancy_ns": 131.995,
"wall_ns": 80.0,
"record_count": 5
},
"pe_fetch_store": {
"occupancy_ns": 36.0,
"wall_ns": 36.0,
"record_count": 3
},
"pe_gemm": {
"occupancy_ns": 33.152000000000044,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 119.995,
"wall_ns": 68.0,
"record_count": 4
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 32.0,
"wall_ns": 32.0,
"record_count": 2
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 33.152000000000044,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 100.76800000000003
},
{
"M": 32,
"K": 128,
"N": 32,
"variant": "load_ref",
"flops": 262144,
"bytes_hbm": 18432,
"arith_intensity": 14.222222222222221,
"tile_count_expected": 2,
"sim_wall_clock_s": 1.13,
"engines": {
"pe_dma": {
"occupancy_ns": 104.495,
"wall_ns": 84.5,
"record_count": 4
},
"pe_fetch_store": {
"occupancy_ns": 36.0,
"wall_ns": 36.0,
"record_count": 3
},
"pe_gemm": {
"occupancy_ns": 33.152000000000044,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 55.995000000000005,
"wall_ns": 36.0,
"record_count": 2
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 32.0,
"wall_ns": 32.0,
"record_count": 2
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 33.152000000000044,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 106.27300000000002
},
{
"M": 32,
"K": 128,
"N": 32,
"variant": "load_load",
"flops": 262144,
"bytes_hbm": 18432,
"arith_intensity": 14.222222222222221,
"tile_count_expected": 2,
"sim_wall_clock_s": 1.113,
"engines": {
"pe_dma": {
"occupancy_ns": 85.0,
"wall_ns": 85.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 51.995000000000005,
"wall_ns": 36.0,
"record_count": 3
},
"pe_gemm": {
"occupancy_ns": 33.152000000000044,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 47.995000000000005,
"wall_ns": 32.0,
"record_count": 2
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 33.152000000000044,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 123.78300000000002
},
{
"M": 32,
"K": 128,
"N": 128,
"variant": "ref_ref",
"flops": 1048576,
"bytes_hbm": 49152,
"arith_intensity": 21.333333333333332,
"tile_count_expected": 8,
"sim_wall_clock_s": 1.451,
"engines": {
"pe_dma": {
"occupancy_ns": 1687.995,
"wall_ns": 272.0,
"record_count": 20
},
"pe_fetch_store": {
"occupancy_ns": 201.6959999999999,
"wall_ns": 132.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 136.0640000000003,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 1631.995,
"wall_ns": 260.0,
"record_count": 16
},
"DMA_WRITE": {
"occupancy_ns": 56.0,
"wall_ns": 40.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 148.0,
"wall_ns": 132.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 53.69599999999991,
"wall_ns": 47.23199999999997,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 136.0640000000003,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 297.9200000000001
},
{
"M": 32,
"K": 128,
"N": 128,
"variant": "load_ref",
"flops": 1048576,
"bytes_hbm": 49152,
"arith_intensity": 21.333333333333332,
"tile_count_expected": 8,
"sim_wall_clock_s": 1.269,
"engines": {
"pe_dma": {
"occupancy_ns": 700.495,
"wall_ns": 180.5,
"record_count": 13
},
"pe_fetch_store": {
"occupancy_ns": 201.6959999999999,
"wall_ns": 132.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 136.0640000000003,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 607.995,
"wall_ns": 132.0,
"record_count": 8
},
"DMA_WRITE": {
"occupancy_ns": 56.0,
"wall_ns": 40.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 148.0,
"wall_ns": 132.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 53.69599999999991,
"wall_ns": 47.23199999999997,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 136.0640000000003,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 207.42500000000007
},
{
"M": 32,
"K": 128,
"N": 128,
"variant": "load_load",
"flops": 1048576,
"bytes_hbm": 49152,
"arith_intensity": 21.333333333333332,
"tile_count_expected": 8,
"sim_wall_clock_s": 1.225,
"engines": {
"pe_dma": {
"occupancy_ns": 217.0,
"wall_ns": 217.0,
"record_count": 6
},
"pe_fetch_store": {
"occupancy_ns": 591.995,
"wall_ns": 128.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 141.82400000000052,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 48.0,
"wall_ns": 48.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 575.995,
"wall_ns": 128.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 141.82400000000052,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 318.0870000000001
},
{
"M": 32,
"K": 3072,
"N": 32,
"variant": "ref_ref",
"flops": 6291456,
"bytes_hbm": 395264,
"arith_intensity": 15.917098445595855,
"tile_count_expected": 48,
"sim_wall_clock_s": 2.724,
"engines": {
"pe_dma": {
"occupancy_ns": 55883.995,
"wall_ns": 1552.0,
"record_count": 97
},
"pe_fetch_store": {
"occupancy_ns": 791.1039999999994,
"wall_ns": 772.0,
"record_count": 49
},
"pe_gemm": {
"occupancy_ns": 1215.584000000017,
"wall_ns": 786.4320000000007,
"record_count": 48
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 55871.995,
"wall_ns": 1540.0,
"record_count": 96
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 772.0,
"wall_ns": 772.0,
"record_count": 48
},
"STORE": {
"occupancy_ns": 19.10399999999936,
"wall_ns": 19.10399999999936,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 1215.584000000017,
"wall_ns": 786.4320000000007,
"record_count": 48
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 1590.4320000000007
},
{
"M": 32,
"K": 3072,
"N": 32,
"variant": "load_ref",
"flops": 6291456,
"bytes_hbm": 395264,
"arith_intensity": 15.917098445595855,
"tile_count_expected": 48,
"sim_wall_clock_s": 2.137,
"engines": {
"pe_dma": {
"occupancy_ns": 19792.495,
"wall_ns": 1556.5,
"record_count": 50
},
"pe_fetch_store": {
"occupancy_ns": 791.1039999999994,
"wall_ns": 772.0,
"record_count": 49
},
"pe_gemm": {
"occupancy_ns": 1215.584000000017,
"wall_ns": 786.4320000000007,
"record_count": 48
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 19007.995,
"wall_ns": 772.0,
"record_count": 48
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 772.0,
"wall_ns": 772.0,
"record_count": 48
},
"STORE": {
"occupancy_ns": 19.10399999999936,
"wall_ns": 19.10399999999936,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 1215.584000000017,
"wall_ns": 786.4320000000007,
"record_count": 48
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 1595.9370000000008
},
{
"M": 32,
"K": 3072,
"N": 32,
"variant": "load_load",
"flops": 6291456,
"bytes_hbm": 395264,
"arith_intensity": 15.917098445595855,
"tile_count_expected": 48,
"sim_wall_clock_s": 1.245,
"engines": {
"pe_dma": {
"occupancy_ns": 1557.0,
"wall_ns": 1557.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 18819.99500000001,
"wall_ns": 772.0000000000002,
"record_count": 49
},
"pe_gemm": {
"occupancy_ns": 1219.5839999999987,
"wall_ns": 786.4320000000005,
"record_count": 48
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 18815.99500000001,
"wall_ns": 768.0000000000002,
"record_count": 48
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 1219.5839999999987,
"wall_ns": 786.4320000000005,
"record_count": 48
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 2349.447000000001
},
{
"M": 8,
"K": 128,
"N": 128,
"variant": "ref_ref",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 8,
"sim_wall_clock_s": 1.477,
"engines": {
"pe_dma": {
"occupancy_ns": 1687.995,
"wall_ns": 272.0,
"record_count": 20
},
"pe_fetch_store": {
"occupancy_ns": 201.6959999999999,
"wall_ns": 132.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 136.0640000000003,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 1631.995,
"wall_ns": 260.0,
"record_count": 16
},
"DMA_WRITE": {
"occupancy_ns": 56.0,
"wall_ns": 40.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 148.0,
"wall_ns": 132.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 53.69599999999991,
"wall_ns": 47.23199999999997,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 136.0640000000003,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 297.9200000000001
},
{
"M": 8,
"K": 128,
"N": 128,
"variant": "load_ref",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 8,
"sim_wall_clock_s": 1.443,
"engines": {
"pe_dma": {
"occupancy_ns": 676.495,
"wall_ns": 156.5,
"record_count": 13
},
"pe_fetch_store": {
"occupancy_ns": 201.6959999999999,
"wall_ns": 132.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 136.0640000000003,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 607.995,
"wall_ns": 132.0,
"record_count": 8
},
"DMA_WRITE": {
"occupancy_ns": 56.0,
"wall_ns": 40.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 148.0,
"wall_ns": 132.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 53.69599999999991,
"wall_ns": 47.23199999999997,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 136.0640000000003,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 183.42500000000007
},
{
"M": 8,
"K": 128,
"N": 128,
"variant": "load_load",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 8,
"sim_wall_clock_s": 1.198,
"engines": {
"pe_dma": {
"occupancy_ns": 193.0,
"wall_ns": 193.0,
"record_count": 6
},
"pe_fetch_store": {
"occupancy_ns": 591.995,
"wall_ns": 128.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 141.82400000000052,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 48.0,
"wall_ns": 48.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 575.995,
"wall_ns": 128.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 141.82400000000052,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 294.0870000000001
},
{
"M": 128,
"K": 8,
"N": 128,
"variant": "ref_ref",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 16,
"sim_wall_clock_s": 1.983,
"engines": {
"pe_dma": {
"occupancy_ns": 6547.771,
"wall_ns": 560.0,
"record_count": 48
},
"pe_fetch_store": {
"occupancy_ns": 481.72799999999916,
"wall_ns": 268.0,
"record_count": 32
},
"pe_gemm": {
"occupancy_ns": 293.2480000000014,
"wall_ns": 262.14400000000023,
"record_count": 16
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 6335.995,
"wall_ns": 516.0,
"record_count": 32
},
"DMA_WRITE": {
"occupancy_ns": 211.77599999999984,
"wall_ns": 169.15200000000004,
"record_count": 16
},
"FETCH": {
"occupancy_ns": 308.0,
"wall_ns": 260.0,
"record_count": 16
},
"STORE": {
"occupancy_ns": 173.72799999999916,
"wall_ns": 164.2559999999994,
"record_count": 16
},
"GEMM": {
"occupancy_ns": 293.2480000000014,
"wall_ns": 262.14400000000023,
"record_count": 16
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 572.9920000000002
},
{
"M": 128,
"K": 8,
"N": 128,
"variant": "load_ref",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 16,
"sim_wall_clock_s": 1.699,
"engines": {
"pe_dma": {
"occupancy_ns": 2464.2709999999997,
"wall_ns": 316.5,
"record_count": 33
},
"pe_fetch_store": {
"occupancy_ns": 481.72799999999916,
"wall_ns": 268.0,
"record_count": 32
},
"pe_gemm": {
"occupancy_ns": 293.2480000000014,
"wall_ns": 262.14400000000023,
"record_count": 16
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 2239.995,
"wall_ns": 260.0,
"record_count": 16
},
"DMA_WRITE": {
"occupancy_ns": 211.77599999999984,
"wall_ns": 169.15200000000004,
"record_count": 16
},
"FETCH": {
"occupancy_ns": 308.0,
"wall_ns": 260.0,
"record_count": 16
},
"STORE": {
"occupancy_ns": 173.72799999999916,
"wall_ns": 164.2559999999994,
"record_count": 16
},
"GEMM": {
"occupancy_ns": 293.2480000000014,
"wall_ns": 262.14400000000023,
"record_count": 16
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 330.4970000000002
},
{
"M": 128,
"K": 8,
"N": 128,
"variant": "load_load",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 16,
"sim_wall_clock_s": 1.402,
"engines": {
"pe_dma": {
"occupancy_ns": 217.0,
"wall_ns": 217.0,
"record_count": 18
},
"pe_fetch_store": {
"occupancy_ns": 2239.995,
"wall_ns": 264.0,
"record_count": 32
},
"pe_gemm": {
"occupancy_ns": 308.224000000002,
"wall_ns": 262.14400000000023,
"record_count": 16
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 192.0,
"wall_ns": 192.0,
"record_count": 16
},
"FETCH": {
"occupancy_ns": 2175.995,
"wall_ns": 256.0,
"record_count": 16
},
"STORE": {
"occupancy_ns": 64.0,
"wall_ns": 64.0,
"record_count": 16
},
"GEMM": {
"occupancy_ns": 308.224000000002,
"wall_ns": 262.14400000000023,
"record_count": 16
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 321.1590000000002
},
{
"M": 512,
"K": 512,
"N": 512,
"variant": "ref_ref",
"flops": 268435456,
"bytes_hbm": 1572864,
"arith_intensity": 170.66666666666666,
"tile_count_expected": 2048,
"sim_wall_clock_s": 89.111,
"engines": {
"pe_dma": {
"occupancy_ns": 100690943.995,
"wall_ns": 65612.00000000001,
"record_count": 4352
},
"pe_fetch_store": {
"occupancy_ns": 43566.52800034459,
"wall_ns": 32796.00000000001,
"record_count": 2304
},
"pe_gemm": {
"occupancy_ns": 833762.8159962555,
"wall_ns": 33554.431999996836,
"record_count": 2048
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 100687871.995,
"wall_ns": 65540.00000000001,
"record_count": 4096
},
"DMA_WRITE": {
"occupancy_ns": 3072.0,
"wall_ns": 3072.0,
"record_count": 256
},
"FETCH": {
"occupancy_ns": 40936.00000000001,
"wall_ns": 32772.00000000001,
"record_count": 2048
},
"STORE": {
"occupancy_ns": 2630.5280003445805,
"wall_ns": 2630.5280003445805,
"record_count": 256
},
"GEMM": {
"occupancy_ns": 833762.8159962555,
"wall_ns": 33554.431999996836,
"record_count": 2048
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 66360.12799999685
},
{
"M": 512,
"K": 512,
"N": 512,
"variant": "load_ref",
"flops": 268435456,
"bytes_hbm": 1572864,
"arith_intensity": 170.66666666666666,
"tile_count_expected": 2048,
"sim_wall_clock_s": 48.616,
"engines": {
"pe_dma": {
"occupancy_ns": 33584132.495,
"wall_ns": 34896.5,
"record_count": 2305
},
"pe_fetch_store": {
"occupancy_ns": 43562.81600011295,
"wall_ns": 32796.0,
"record_count": 2304
},
"pe_gemm": {
"occupancy_ns": 833762.8159987241,
"wall_ns": 33554.43199999785,
"record_count": 2048
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 33579007.995,
"wall_ns": 32772.0,
"record_count": 2048
},
"DMA_WRITE": {
"occupancy_ns": 3072.0,
"wall_ns": 3072.0,
"record_count": 256
},
"FETCH": {
"occupancy_ns": 40936.0,
"wall_ns": 32772.0,
"record_count": 2048
},
"STORE": {
"occupancy_ns": 2626.816000112947,
"wall_ns": 2626.816000112947,
"record_count": 256
},
"GEMM": {
"occupancy_ns": 833762.8159987241,
"wall_ns": 33554.43199999785,
"record_count": 2048
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 35645.632999997855
},
{
"M": 512,
"K": 512,
"N": 512,
"variant": "load_load",
"flops": 268435456,
"bytes_hbm": 1572864,
"arith_intensity": 170.66666666666666,
"tile_count_expected": 2048,
"sim_wall_clock_s": 7.072,
"engines": {
"pe_dma": {
"occupancy_ns": 7177.0,
"wall_ns": 7177.0,
"record_count": 258
},
"pe_fetch_store": {
"occupancy_ns": 33571839.995,
"wall_ns": 32792.0,
"record_count": 2304
},
"pe_gemm": {
"occupancy_ns": 838467.5839984363,
"wall_ns": 33554.43199999763,
"record_count": 2048
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 3072.0,
"wall_ns": 3072.0,
"record_count": 256
},
"FETCH": {
"occupancy_ns": 33570815.995,
"wall_ns": 32767.999999999996,
"record_count": 2048
},
"STORE": {
"occupancy_ns": 1024.0,
"wall_ns": 1024.0,
"record_count": 256
},
"GEMM": {
"occupancy_ns": 838467.5839984363,
"wall_ns": 33554.43199999763,
"record_count": 2048
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 37677.44699999763
}
]
}