Files
mukesh f6d262e359 Honest measured pipeline efficiency: two timing fixes
Two related issues made the measured pipeline efficiency look
worse than what the simulator actually models:

1. DMA timing recorded too early. The op-log start timestamp
   for a DMA op was taken when the request entered the queue,
   and the DMA channel was released as soon as the request
   was issued. Back-to-back DMAs therefore appeared to hold
   the channel simultaneously, with per-op duration drifting
   upward as queue depth grew - an artifact, not a real cost.

   Fix: defer the start timestamp until after the channel is
   acquired, and hold the channel through the full HBM
   round-trip until the response returns. Per-op duration is
   now constant and equal to the actual transfer interval;
   serialization is visible as queue wait, not as inflated
   service time.

2. Sweep timing window folded in pre-composite work. The PE
   timing window spanned every PE engine record, which
   included the upfront pinned-operand DMA issued before the
   composite GEMM begins. For large-K shapes that one-shot
   load can be nearly half of the window, conflating
   operand-staging cost with composite-pipeline behavior.

   Fix: add a second window (composite_window_ns) scoped to
   the composite pipeline by filtering op_log records to
   those tagged with a tile-pipeline stage; the legacy
   operand-load path is untagged and therefore excluded. For
   the M=32, K=3072, N=32 load_ref case the window drops from
   1765 ns (pe_window_ns) to 992 ns, and measured efficiency
   lines up with the steady-state DMA-bound stage limit
   instead of being penalized for the one-time load.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 14:19:17 -07:00

1636 lines
39 KiB
JSON

{
"tile_sizes": {
"M": 32,
"K": 64,
"N": 32
},
"engines": [
"pe_dma",
"pe_fetch_store",
"pe_gemm",
"pe_math"
],
"stages": [
"DMA_READ",
"DMA_WRITE",
"FETCH",
"STORE",
"GEMM",
"MATH"
],
"variants": [
"ref_ref",
"load_ref",
"load_load"
],
"rows": [
{
"M": 32,
"K": 32,
"N": 32,
"variant": "ref_ref",
"flops": 65536,
"bytes_hbm": 6144,
"arith_intensity": 10.666666666666666,
"tile_count_expected": 1,
"sim_wall_clock_s": 0.574,
"engines": {
"pe_dma": {
"occupancy_ns": 52.0,
"wall_ns": 52.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 40.0,
"wall_ns": 40.0,
"record_count": 2
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 88.38400000000001,
"composite_window_ns": 88.38400000000001
},
{
"M": 32,
"K": 32,
"N": 32,
"variant": "load_ref",
"flops": 65536,
"bytes_hbm": 6144,
"arith_intensity": 10.666666666666666,
"tile_count_expected": 1,
"sim_wall_clock_s": 0.392,
"engines": {
"pe_dma": {
"occupancy_ns": 44.0,
"wall_ns": 44.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 1
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 81.394,
"composite_window_ns": 68.38400000000001
},
{
"M": 32,
"K": 32,
"N": 32,
"variant": "load_load",
"flops": 65536,
"bytes_hbm": 6144,
"arith_intensity": 10.666666666666666,
"tile_count_expected": 1,
"sim_wall_clock_s": 0.53,
"engines": {
"pe_dma": {
"occupancy_ns": 36.0,
"wall_ns": 36.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 74.904,
"composite_window_ns": 48.384000000000015
},
{
"M": 32,
"K": 64,
"N": 32,
"variant": "ref_ref",
"flops": 131072,
"bytes_hbm": 10240,
"arith_intensity": 12.8,
"tile_count_expected": 1,
"sim_wall_clock_s": 0.625,
"engines": {
"pe_dma": {
"occupancy_ns": 52.0,
"wall_ns": 52.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 40.0,
"wall_ns": 40.0,
"record_count": 2
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 88.38400000000001,
"composite_window_ns": 88.38400000000001
},
{
"M": 32,
"K": 64,
"N": 32,
"variant": "load_ref",
"flops": 131072,
"bytes_hbm": 10240,
"arith_intensity": 12.8,
"tile_count_expected": 1,
"sim_wall_clock_s": 0.513,
"engines": {
"pe_dma": {
"occupancy_ns": 52.0,
"wall_ns": 52.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 1
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 89.394,
"composite_window_ns": 68.38400000000001
},
{
"M": 32,
"K": 64,
"N": 32,
"variant": "load_load",
"flops": 131072,
"bytes_hbm": 10240,
"arith_intensity": 12.8,
"tile_count_expected": 1,
"sim_wall_clock_s": 0.6,
"engines": {
"pe_dma": {
"occupancy_ns": 52.0,
"wall_ns": 52.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 20.0,
"wall_ns": 20.0,
"record_count": 2
},
"pe_gemm": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 1
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 16.384000000000015,
"wall_ns": 16.384000000000015,
"record_count": 1
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 90.904,
"composite_window_ns": 48.384000000000015
},
{
"M": 32,
"K": 128,
"N": 32,
"variant": "ref_ref",
"flops": 262144,
"bytes_hbm": 18432,
"arith_intensity": 14.222222222222221,
"tile_count_expected": 2,
"sim_wall_clock_s": 0.572,
"engines": {
"pe_dma": {
"occupancy_ns": 92.0,
"wall_ns": 92.0,
"record_count": 5
},
"pe_fetch_store": {
"occupancy_ns": 39.615999999999985,
"wall_ns": 36.0,
"record_count": 3
},
"pe_gemm": {
"occupancy_ns": 32.76800000000003,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 80.0,
"wall_ns": 80.0,
"record_count": 4
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 32.0,
"wall_ns": 32.0,
"record_count": 2
},
"STORE": {
"occupancy_ns": 7.6159999999999854,
"wall_ns": 7.6159999999999854,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 32.76800000000003,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 112.38400000000001,
"composite_window_ns": 112.38400000000001
},
{
"M": 32,
"K": 128,
"N": 32,
"variant": "load_ref",
"flops": 262144,
"bytes_hbm": 18432,
"arith_intensity": 14.222222222222221,
"tile_count_expected": 2,
"sim_wall_clock_s": 0.606,
"engines": {
"pe_dma": {
"occupancy_ns": 88.0,
"wall_ns": 88.0,
"record_count": 4
},
"pe_fetch_store": {
"occupancy_ns": 39.615999999999985,
"wall_ns": 36.0,
"record_count": 3
},
"pe_gemm": {
"occupancy_ns": 32.76800000000003,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 40.0,
"wall_ns": 40.0,
"record_count": 2
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 32.0,
"wall_ns": 32.0,
"record_count": 2
},
"STORE": {
"occupancy_ns": 7.6159999999999854,
"wall_ns": 7.6159999999999854,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 32.76800000000003,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 109.38900000000001,
"composite_window_ns": 72.38400000000001
},
{
"M": 32,
"K": 128,
"N": 32,
"variant": "load_load",
"flops": 262144,
"bytes_hbm": 18432,
"arith_intensity": 14.222222222222221,
"tile_count_expected": 2,
"sim_wall_clock_s": 0.535,
"engines": {
"pe_dma": {
"occupancy_ns": 84.0,
"wall_ns": 84.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 51.995000000000005,
"wall_ns": 36.0,
"record_count": 3
},
"pe_gemm": {
"occupancy_ns": 33.152000000000044,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 47.995000000000005,
"wall_ns": 32.0,
"record_count": 2
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 33.152000000000044,
"wall_ns": 32.76800000000003,
"record_count": 2
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 123.28300000000002,
"composite_window_ns": 48.76800000000003
},
{
"M": 32,
"K": 128,
"N": 128,
"variant": "ref_ref",
"flops": 1048576,
"bytes_hbm": 49152,
"arith_intensity": 21.333333333333332,
"tile_count_expected": 8,
"sim_wall_clock_s": 0.746,
"engines": {
"pe_dma": {
"occupancy_ns": 408.0,
"wall_ns": 348.0,
"record_count": 20
},
"pe_fetch_store": {
"occupancy_ns": 162.46399999999994,
"wall_ns": 144.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 131.07200000000012,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 328.0,
"wall_ns": 328.0,
"record_count": 16
},
"DMA_WRITE": {
"occupancy_ns": 80.0,
"wall_ns": 80.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 128.0,
"wall_ns": 128.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 34.46399999999994,
"wall_ns": 34.46399999999994,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 131.07200000000012,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 360.384,
"composite_window_ns": 360.384
},
{
"M": 32,
"K": 128,
"N": 128,
"variant": "load_ref",
"flops": 1048576,
"bytes_hbm": 49152,
"arith_intensity": 21.333333333333332,
"tile_count_expected": 8,
"sim_wall_clock_s": 0.728,
"engines": {
"pe_dma": {
"occupancy_ns": 284.0,
"wall_ns": 224.0,
"record_count": 13
},
"pe_fetch_store": {
"occupancy_ns": 162.46399999999994,
"wall_ns": 144.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 131.07200000000012,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 168.0,
"wall_ns": 168.0,
"record_count": 8
},
"DMA_WRITE": {
"occupancy_ns": 80.0,
"wall_ns": 80.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 128.0,
"wall_ns": 128.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 34.46399999999994,
"wall_ns": 34.46399999999994,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 131.07200000000012,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 237.389,
"composite_window_ns": 200.38400000000001
},
{
"M": 32,
"K": 128,
"N": 128,
"variant": "load_load",
"flops": 1048576,
"bytes_hbm": 49152,
"arith_intensity": 21.333333333333332,
"tile_count_expected": 8,
"sim_wall_clock_s": 0.584,
"engines": {
"pe_dma": {
"occupancy_ns": 216.0,
"wall_ns": 216.0,
"record_count": 6
},
"pe_fetch_store": {
"occupancy_ns": 591.995,
"wall_ns": 128.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 141.82400000000052,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 48.0,
"wall_ns": 48.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 575.995,
"wall_ns": 128.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 141.82400000000052,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 317.5870000000001,
"composite_window_ns": 147.07200000000012
},
{
"M": 32,
"K": 3072,
"N": 32,
"variant": "ref_ref",
"flops": 6291456,
"bytes_hbm": 395264,
"arith_intensity": 15.917098445595855,
"tile_count_expected": 48,
"sim_wall_clock_s": 1.348,
"engines": {
"pe_dma": {
"occupancy_ns": 1932.0,
"wall_ns": 1932.0,
"record_count": 97
},
"pe_fetch_store": {
"occupancy_ns": 775.616,
"wall_ns": 772.0,
"record_count": 49
},
"pe_gemm": {
"occupancy_ns": 786.4320000000007,
"wall_ns": 786.4320000000007,
"record_count": 48
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 1920.0,
"wall_ns": 1920.0,
"record_count": 96
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 768.0,
"wall_ns": 768.0,
"record_count": 48
},
"STORE": {
"occupancy_ns": 7.6159999999999854,
"wall_ns": 7.6159999999999854,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 786.4320000000007,
"wall_ns": 786.4320000000007,
"record_count": 48
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 1952.384,
"composite_window_ns": 1952.384
},
{
"M": 32,
"K": 3072,
"N": 32,
"variant": "load_ref",
"flops": 6291456,
"bytes_hbm": 395264,
"arith_intensity": 15.917098445595855,
"tile_count_expected": 48,
"sim_wall_clock_s": 1.075,
"engines": {
"pe_dma": {
"occupancy_ns": 1744.0000000000002,
"wall_ns": 1744.0000000000002,
"record_count": 50
},
"pe_fetch_store": {
"occupancy_ns": 775.6160000000002,
"wall_ns": 772.0000000000002,
"record_count": 49
},
"pe_gemm": {
"occupancy_ns": 786.4320000000005,
"wall_ns": 786.4320000000005,
"record_count": 48
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 960.0000000000002,
"wall_ns": 960.0000000000002,
"record_count": 48
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 768.0000000000002,
"wall_ns": 768.0000000000002,
"record_count": 48
},
"STORE": {
"occupancy_ns": 7.6159999999999854,
"wall_ns": 7.6159999999999854,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 786.4320000000005,
"wall_ns": 786.4320000000005,
"record_count": 48
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 1765.3890000000004,
"composite_window_ns": 992.3840000000002
},
{
"M": 32,
"K": 3072,
"N": 32,
"variant": "load_load",
"flops": 6291456,
"bytes_hbm": 395264,
"arith_intensity": 15.917098445595855,
"tile_count_expected": 48,
"sim_wall_clock_s": 0.635,
"engines": {
"pe_dma": {
"occupancy_ns": 1556.0,
"wall_ns": 1556.0,
"record_count": 3
},
"pe_fetch_store": {
"occupancy_ns": 18819.99500000001,
"wall_ns": 772.0000000000002,
"record_count": 49
},
"pe_gemm": {
"occupancy_ns": 1219.5839999999987,
"wall_ns": 786.4320000000005,
"record_count": 48
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 12.0,
"wall_ns": 12.0,
"record_count": 1
},
"FETCH": {
"occupancy_ns": 18815.99500000001,
"wall_ns": 768.0000000000002,
"record_count": 48
},
"STORE": {
"occupancy_ns": 4.0,
"wall_ns": 4.0,
"record_count": 1
},
"GEMM": {
"occupancy_ns": 1219.5839999999987,
"wall_ns": 786.4320000000005,
"record_count": 48
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 2348.947000000001,
"composite_window_ns": 802.4320000000005
},
{
"M": 8,
"K": 128,
"N": 128,
"variant": "ref_ref",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 8,
"sim_wall_clock_s": 0.706,
"engines": {
"pe_dma": {
"occupancy_ns": 408.0,
"wall_ns": 348.0,
"record_count": 20
},
"pe_fetch_store": {
"occupancy_ns": 162.46399999999994,
"wall_ns": 144.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 131.07200000000012,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 328.0,
"wall_ns": 328.0,
"record_count": 16
},
"DMA_WRITE": {
"occupancy_ns": 80.0,
"wall_ns": 80.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 128.0,
"wall_ns": 128.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 34.46399999999994,
"wall_ns": 34.46399999999994,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 131.07200000000012,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 360.384,
"composite_window_ns": 360.384
},
{
"M": 8,
"K": 128,
"N": 128,
"variant": "load_ref",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 8,
"sim_wall_clock_s": 0.678,
"engines": {
"pe_dma": {
"occupancy_ns": 260.0,
"wall_ns": 200.0,
"record_count": 13
},
"pe_fetch_store": {
"occupancy_ns": 162.46399999999994,
"wall_ns": 144.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 131.07200000000012,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 168.0,
"wall_ns": 168.0,
"record_count": 8
},
"DMA_WRITE": {
"occupancy_ns": 80.0,
"wall_ns": 80.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 128.0,
"wall_ns": 128.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 34.46399999999994,
"wall_ns": 34.46399999999994,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 131.07200000000012,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 213.389,
"composite_window_ns": 200.38400000000001
},
{
"M": 8,
"K": 128,
"N": 128,
"variant": "load_load",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 8,
"sim_wall_clock_s": 0.632,
"engines": {
"pe_dma": {
"occupancy_ns": 192.0,
"wall_ns": 192.0,
"record_count": 6
},
"pe_fetch_store": {
"occupancy_ns": 591.995,
"wall_ns": 128.0,
"record_count": 12
},
"pe_gemm": {
"occupancy_ns": 141.82400000000052,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 48.0,
"wall_ns": 48.0,
"record_count": 4
},
"FETCH": {
"occupancy_ns": 575.995,
"wall_ns": 128.0,
"record_count": 8
},
"STORE": {
"occupancy_ns": 16.0,
"wall_ns": 16.0,
"record_count": 4
},
"GEMM": {
"occupancy_ns": 141.82400000000052,
"wall_ns": 131.07200000000012,
"record_count": 8
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 293.5870000000001,
"composite_window_ns": 147.07200000000012
},
{
"M": 128,
"K": 8,
"N": 128,
"variant": "ref_ref",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 16,
"sim_wall_clock_s": 0.978,
"engines": {
"pe_dma": {
"occupancy_ns": 1040.0000000000002,
"wall_ns": 732.0000000000001,
"record_count": 48
},
"pe_fetch_store": {
"occupancy_ns": 422.2400000000001,
"wall_ns": 320.0000000000001,
"record_count": 32
},
"pe_gemm": {
"occupancy_ns": 262.1440000000001,
"wall_ns": 262.1440000000001,
"record_count": 16
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 688.0000000000001,
"wall_ns": 688.0000000000001,
"record_count": 32
},
"DMA_WRITE": {
"occupancy_ns": 352.0000000000001,
"wall_ns": 352.0000000000001,
"record_count": 16
},
"FETCH": {
"occupancy_ns": 256.0,
"wall_ns": 256.0,
"record_count": 16
},
"STORE": {
"occupancy_ns": 166.24000000000012,
"wall_ns": 166.24000000000012,
"record_count": 16
},
"GEMM": {
"occupancy_ns": 262.1440000000001,
"wall_ns": 262.1440000000001,
"record_count": 16
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 736.3840000000001,
"composite_window_ns": 736.3840000000001
},
{
"M": 128,
"K": 8,
"N": 128,
"variant": "load_ref",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 16,
"sim_wall_clock_s": 0.999,
"engines": {
"pe_dma": {
"occupancy_ns": 732.0,
"wall_ns": 424.0,
"record_count": 33
},
"pe_fetch_store": {
"occupancy_ns": 422.2399999999998,
"wall_ns": 320.0,
"record_count": 32
},
"pe_gemm": {
"occupancy_ns": 262.14400000000023,
"wall_ns": 262.14400000000023,
"record_count": 16
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 368.0,
"wall_ns": 368.0,
"record_count": 16
},
"DMA_WRITE": {
"occupancy_ns": 352.0,
"wall_ns": 352.0,
"record_count": 16
},
"FETCH": {
"occupancy_ns": 256.0,
"wall_ns": 256.0,
"record_count": 16
},
"STORE": {
"occupancy_ns": 166.23999999999978,
"wall_ns": 166.23999999999978,
"record_count": 16
},
"GEMM": {
"occupancy_ns": 262.14400000000023,
"wall_ns": 262.14400000000023,
"record_count": 16
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 429.389,
"composite_window_ns": 416.384
},
{
"M": 128,
"K": 8,
"N": 128,
"variant": "load_load",
"flops": 262144,
"bytes_hbm": 36864,
"arith_intensity": 7.111111111111111,
"tile_count_expected": 16,
"sim_wall_clock_s": 0.776,
"engines": {
"pe_dma": {
"occupancy_ns": 216.0,
"wall_ns": 216.0,
"record_count": 18
},
"pe_fetch_store": {
"occupancy_ns": 2239.995,
"wall_ns": 264.0,
"record_count": 32
},
"pe_gemm": {
"occupancy_ns": 308.224000000002,
"wall_ns": 262.14400000000023,
"record_count": 16
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 192.0,
"wall_ns": 192.0,
"record_count": 16
},
"FETCH": {
"occupancy_ns": 2175.995,
"wall_ns": 256.0,
"record_count": 16
},
"STORE": {
"occupancy_ns": 64.0,
"wall_ns": 64.0,
"record_count": 16
},
"GEMM": {
"occupancy_ns": 308.224000000002,
"wall_ns": 262.14400000000023,
"record_count": 16
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 320.6590000000002,
"composite_window_ns": 294.14400000000023
},
{
"M": 512,
"K": 512,
"N": 512,
"variant": "ref_ref",
"flops": 268435456,
"bytes_hbm": 1572864,
"arith_intensity": 170.66666666666666,
"tile_count_expected": 2048,
"sim_wall_clock_s": 43.303,
"engines": {
"pe_dma": {
"occupancy_ns": 90092.00000000001,
"wall_ns": 82952.00000000001,
"record_count": 4352
},
"pe_fetch_store": {
"occupancy_ns": 34717.695999999676,
"wall_ns": 33792.00000000001,
"record_count": 2304
},
"pe_gemm": {
"occupancy_ns": 33554.43200000262,
"wall_ns": 33554.43200000262,
"record_count": 2048
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 82940.00000000001,
"wall_ns": 82940.00000000001,
"record_count": 4096
},
"DMA_WRITE": {
"occupancy_ns": 7152.0,
"wall_ns": 7152.0,
"record_count": 256
},
"FETCH": {
"occupancy_ns": 32768.00000000001,
"wall_ns": 32768.00000000001,
"record_count": 2048
},
"STORE": {
"occupancy_ns": 1949.6959999996689,
"wall_ns": 1949.6959999996689,
"record_count": 256
},
"GEMM": {
"occupancy_ns": 33554.43200000262,
"wall_ns": 33554.43200000262,
"record_count": 2048
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 82972.38400000002,
"composite_window_ns": 82972.38400000002
},
{
"M": 512,
"K": 512,
"N": 512,
"variant": "load_ref",
"flops": 268435456,
"bytes_hbm": 1572864,
"arith_intensity": 170.66666666666666,
"tile_count_expected": 2048,
"sim_wall_clock_s": 23.224,
"engines": {
"pe_dma": {
"occupancy_ns": 51184.0,
"wall_ns": 44044.0,
"record_count": 2305
},
"pe_fetch_store": {
"occupancy_ns": 34717.69600000031,
"wall_ns": 33792.0,
"record_count": 2304
},
"pe_gemm": {
"occupancy_ns": 33554.43199999754,
"wall_ns": 33554.43199999755,
"record_count": 2048
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 41980.0,
"wall_ns": 41980.0,
"record_count": 2048
},
"DMA_WRITE": {
"occupancy_ns": 7151.999999999999,
"wall_ns": 7151.999999999999,
"record_count": 256
},
"FETCH": {
"occupancy_ns": 32768.0,
"wall_ns": 32768.0,
"record_count": 2048
},
"STORE": {
"occupancy_ns": 1949.6960000003073,
"wall_ns": 1949.6960000003073,
"record_count": 256
},
"GEMM": {
"occupancy_ns": 33554.43199999754,
"wall_ns": 33554.43199999755,
"record_count": 2048
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 44065.389,
"composite_window_ns": 42012.384
},
{
"M": 512,
"K": 512,
"N": 512,
"variant": "load_load",
"flops": 268435456,
"bytes_hbm": 1572864,
"arith_intensity": 170.66666666666666,
"tile_count_expected": 2048,
"sim_wall_clock_s": 3.48,
"engines": {
"pe_dma": {
"occupancy_ns": 7176.0,
"wall_ns": 7176.0,
"record_count": 258
},
"pe_fetch_store": {
"occupancy_ns": 33571839.995,
"wall_ns": 32792.0,
"record_count": 2304
},
"pe_gemm": {
"occupancy_ns": 838467.5839984363,
"wall_ns": 33554.43199999763,
"record_count": 2048
},
"pe_math": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"stages": {
"DMA_READ": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
},
"DMA_WRITE": {
"occupancy_ns": 3072.0,
"wall_ns": 3072.0,
"record_count": 256
},
"FETCH": {
"occupancy_ns": 33570815.995,
"wall_ns": 32767.999999999996,
"record_count": 2048
},
"STORE": {
"occupancy_ns": 1024.0,
"wall_ns": 1024.0,
"record_count": 256
},
"GEMM": {
"occupancy_ns": 838467.5839984363,
"wall_ns": 33554.43199999763,
"record_count": 2048
},
"MATH": {
"occupancy_ns": 0,
"wall_ns": 0.0,
"record_count": 0
}
},
"pe_window_ns": 37676.94699999763,
"composite_window_ns": 33570.43199999763
}
]
}