diff --git a/.gitignore b/.gitignore index 6b3e645..93a3b88 100644 --- a/.gitignore +++ b/.gitignore @@ -6,9 +6,6 @@ # Auto-generated mesh file cube_mesh.yaml -# Milestone bench output (regenerable: kernbench run --bench milestone-1h-*) -src/kernbench/benches/1H_milestone_output/ - # Python __pycache__/ *.py[cod] diff --git a/docs/adr-ko/ADR-0054-eval-milestone-benches.md b/docs/adr-ko/ADR-0054-eval-milestone-benches.md index 0fb1f26..3454ca0 100644 --- a/docs/adr-ko/ADR-0054-eval-milestone-benches.md +++ b/docs/adr-ko/ADR-0054-eval-milestone-benches.md @@ -59,8 +59,8 @@ ADR-0045 D5는 bench를 단일 구성(single-SIP, 또는 ADR-0024 multi-SIP CCL 쓴다(사용자 요청 — bench 옆 아티팩트). 디렉터리는 생성된 PNG/CSV/JSON만 보유하며(`.py`/`__init__.py` 없음), 따라서 eager-import audit(ADR-0045 첫 동작)이 무시한다 — `pkgutil.iter_modules`는 비-패키지 하위 디렉터리를 -yield하지 않는다. committed `docs/diagrams/` 아티팩트와 달리 -**git-ignore**된다(요청 시 재생성 가능). +yield하지 않는다. `docs/diagrams/` 아티팩트처럼 **커밋된다**(원격에서 +figure를 볼 수 있도록); bench 재실행 시 제자리에서 재생성된다. ### D4. GEMM 무거운 sweep — 기본은 fresh, `MILESTONE_FAST`로 재사용 @@ -115,7 +115,8 @@ bench 실행이 곧 재생성이다. slow 경로는 `@pytest.mark.slow` bench 드로잉을 섞는다). 대부분 평가 하니스인 "bench"는 이례적이며, 본 ADR이 이를 정당화한다. - 생성 아티팩트가 명시적 요청에 의해 source tree(`src/kernbench/benches/`) - 안에 산다; 커밋을 피하려 git-ignore. + 안에 살며 커밋된다(원격에서 figure를 볼 수 있도록); bench 재실행 시 + 재생성된다. - `milestone-1h-ccl`(및 기본 `milestone-1h-gemm`)은 분 단위 소요 — on-demand 마일스톤 아티팩트에는 수용 가능, 일상 실행에는 아님. diff --git a/docs/adr/ADR-0054-eval-milestone-benches.md b/docs/adr/ADR-0054-eval-milestone-benches.md index f816a61..139f93c 100644 --- a/docs/adr/ADR-0054-eval-milestone-benches.md +++ b/docs/adr/ADR-0054-eval-milestone-benches.md @@ -61,8 +61,9 @@ Both benches write to `src/kernbench/benches/1H_milestone_output/{gemm,ccl}/` (per user request — artifacts beside the bench). The directory holds only generated PNG/CSV/JSON (never a `.py`/`__init__.py`), so the eager-import audit (ADR-0045 first action) ignores it — `pkgutil.iter_modules` does not -yield non-package subdirectories. It is **git-ignored** (regenerable on -demand), unlike the committed `docs/diagrams/` artifacts. +yield non-package subdirectories. It is **committed** (like the +`docs/diagrams/` artifacts) so the figures are viewable on the remote; +rerunning the bench regenerates it in place. ### D4. GEMM heavy sweep — fresh by default, `MILESTONE_FAST` to reuse @@ -118,7 +119,8 @@ ADR-0045 D1). sweeps, and matplotlib drawing). A "bench" that is mostly an eval harness is unusual; this ADR legitimizes it. - Generated artifacts live inside the source tree (`src/kernbench/benches/`) - by explicit request; git-ignored to avoid committing them. + by explicit request and are committed (so the figures are viewable on the + remote); rerunning the bench regenerates them. - `milestone-1h-ccl` (and the default `milestone-1h-gemm`) take minutes — acceptable for an on-demand milestone artifact, not for routine runs. diff --git a/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2DMesh_6SiP_2x3.png b/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2DMesh_6SiP_2x3.png new file mode 100644 index 0000000..45d0b2c Binary files /dev/null and b/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2DMesh_6SiP_2x3.png differ diff --git a/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2Dtorus_6SiP_2x3.png b/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2Dtorus_6SiP_2x3.png new file mode 100644 index 0000000..eac4f1f Binary files /dev/null and b/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2Dtorus_6SiP_2x3.png differ diff --git a/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.csv b/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.csv new file mode 100644 index 0000000..013b00d --- /dev/null +++ b/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.csv @@ -0,0 +1,13 @@ +buffer_kind,sip_topology,n_sips,n_elem,bytes_per_pe,latency_ns +hbm,torus_2d,6,128,256,2120.040000000012 +hbm,torus_2d,6,1024,2048,2717.2783333333473 +hbm,torus_2d,6,8192,16384,7315.184999999989 +hbm,torus_2d,6,32768,65536,23081.26500000037 +sram,torus_2d,6,128,256,2060.040000000012 +sram,torus_2d,6,1024,2048,2909.2783333333473 +sram,torus_2d,6,8192,16384,9523.184999999869 +sram,torus_2d,6,32768,65536,32201.265000000385 +tcm,torus_2d,6,128,256,1964.040000000012 +tcm,torus_2d,6,1024,2048,2477.2783333333473 +tcm,torus_2d,6,8192,16384,6403.185000000109 +tcm,torus_2d,6,32768,65536,19865.265000000378 diff --git a/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.png b/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.png new file mode 100644 index 0000000..19d4cc2 Binary files /dev/null and b/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.png differ diff --git a/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_Ring1D_6SiP_1x6.png b/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_Ring1D_6SiP_1x6.png new file mode 100644 index 0000000..1bb1acc Binary files /dev/null and b/src/kernbench/benches/1H_milestone_output/ccl/AllReduce_LRAB_Ring1D_6SiP_1x6.png differ diff --git a/src/kernbench/benches/1H_milestone_output/ccl/comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png b/src/kernbench/benches/1H_milestone_output/ccl/comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png new file mode 100644 index 0000000..229dbd3 Binary files /dev/null and b/src/kernbench/benches/1H_milestone_output/ccl/comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png differ diff --git a/src/kernbench/benches/1H_milestone_output/ccl/summary.csv b/src/kernbench/benches/1H_milestone_output/ccl/summary.csv new file mode 100644 index 0000000..5ecd70a --- /dev/null +++ b/src/kernbench/benches/1H_milestone_output/ccl/summary.csv @@ -0,0 +1,37 @@ +algorithm,sip_topology,n_sips,n_elem,bytes_per_pe,bytes_per_sip,latency_ns +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,8,16,256,2666.552500000015 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,32,64,1024,2747.7400000000152 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,64,128,2048,2855.990000000018 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,128,256,4096,3072.490000000019 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,512,1024,16384,3337.1133333333582 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,1024,2048,32768,3708.0333333333692 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,2048,4096,65536,4449.873333333393 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,4096,8192,131072,5933.020000000124 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,8192,16384,262144,8900.379999999863 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,16384,32768,524288,14835.099999999224 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,32768,65536,1048576,26704.540000000765 +lrab_hierarchical_allreduce,mesh_2d_no_wrap,6,49152,98304,1572864,38573.97999999701 +lrab_hierarchical_allreduce,ring_1d,6,8,16,256,2365.255833333347 +lrab_hierarchical_allreduce,ring_1d,6,32,64,1024,2436.9433333333473 +lrab_hierarchical_allreduce,ring_1d,6,64,128,2048,2532.526666666683 +lrab_hierarchical_allreduce,ring_1d,6,128,256,4096,2723.693333333349 +lrab_hierarchical_allreduce,ring_1d,6,512,1024,16384,3048.635000000021 +lrab_hierarchical_allreduce,ring_1d,6,1024,2048,32768,3393.4016666666957 +lrab_hierarchical_allreduce,ring_1d,6,2048,4096,65536,4082.401666666714 +lrab_hierarchical_allreduce,ring_1d,6,4096,8192,131072,5458.80166666677 +lrab_hierarchical_allreduce,ring_1d,6,8192,16384,262144,8216.934999999943 +lrab_hierarchical_allreduce,ring_1d,6,16384,32768,524288,13733.201666665835 +lrab_hierarchical_allreduce,ring_1d,6,32768,65536,1048576,24765.73500000064 +lrab_hierarchical_allreduce,ring_1d,6,49152,98304,1572864,35798.268333331536 +lrab_hierarchical_allreduce,torus_2d,6,8,16,256,1700.6025000000095 +lrab_hierarchical_allreduce,torus_2d,6,32,64,1024,1753.2900000000102 +lrab_hierarchical_allreduce,torus_2d,6,64,128,2048,1823.540000000012 +lrab_hierarchical_allreduce,torus_2d,6,128,256,4096,1964.040000000012 +lrab_hierarchical_allreduce,torus_2d,6,512,1024,16384,2196.8183333333463 +lrab_hierarchical_allreduce,torus_2d,6,1024,2048,32768,2477.2783333333473 +lrab_hierarchical_allreduce,torus_2d,6,2048,4096,65536,3038.1983333333583 +lrab_hierarchical_allreduce,torus_2d,6,4096,8192,131072,4159.5050000000665 +lrab_hierarchical_allreduce,torus_2d,6,8192,16384,262144,6403.185000000109 +lrab_hierarchical_allreduce,torus_2d,6,16384,32768,524288,10890.5449999995 +lrab_hierarchical_allreduce,torus_2d,6,32768,65536,1048576,19865.265000000378 +lrab_hierarchical_allreduce,torus_2d,6,49152,98304,1572864,28839.98500000059 diff --git a/src/kernbench/benches/1H_milestone_output/ccl/topology.png b/src/kernbench/benches/1H_milestone_output/ccl/topology.png new file mode 100644 index 0000000..40e8719 Binary files /dev/null and b/src/kernbench/benches/1H_milestone_output/ccl/topology.png differ diff --git a/src/kernbench/benches/1H_milestone_output/gemm/gemm_mac_utilization_measured.png b/src/kernbench/benches/1H_milestone_output/gemm/gemm_mac_utilization_measured.png new file mode 100644 index 0000000..a896aaa Binary files /dev/null and b/src/kernbench/benches/1H_milestone_output/gemm/gemm_mac_utilization_measured.png differ diff --git a/src/kernbench/benches/1H_milestone_output/gemm/gemm_mac_utilization_theoretical_vs_measured.png b/src/kernbench/benches/1H_milestone_output/gemm/gemm_mac_utilization_theoretical_vs_measured.png new file mode 100644 index 0000000..c7075c7 Binary files /dev/null and b/src/kernbench/benches/1H_milestone_output/gemm/gemm_mac_utilization_theoretical_vs_measured.png differ diff --git a/src/kernbench/benches/1H_milestone_output/gemm/gemm_stage_breakdown.png b/src/kernbench/benches/1H_milestone_output/gemm/gemm_stage_breakdown.png new file mode 100644 index 0000000..bd82b68 Binary files /dev/null and b/src/kernbench/benches/1H_milestone_output/gemm/gemm_stage_breakdown.png differ diff --git a/src/kernbench/benches/1H_milestone_output/gemm/gemm_sweep.json b/src/kernbench/benches/1H_milestone_output/gemm/gemm_sweep.json new file mode 100644 index 0000000..910306a --- /dev/null +++ b/src/kernbench/benches/1H_milestone_output/gemm/gemm_sweep.json @@ -0,0 +1,1636 @@ +{ + "tile_sizes": { + "M": 32, + "K": 64, + "N": 32 + }, + "engines": [ + "pe_dma", + "pe_fetch_store", + "pe_gemm", + "pe_math" + ], + "stages": [ + "DMA_READ", + "DMA_WRITE", + "FETCH", + "STORE", + "GEMM", + "MATH" + ], + "variants": [ + "ref_ref", + "load_ref", + "load_load" + ], + "rows": [ + { + "M": 32, + "K": 32, + "N": 32, + "variant": "ref_ref", + "flops": 65536, + "bytes_hbm": 6144, + "arith_intensity": 10.666666666666666, + "tile_count_expected": 1, + "sim_wall_clock_s": 1.64, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 32, + "N": 32, + "variant": "load_ref", + "flops": 65536, + "bytes_hbm": 6144, + "arith_intensity": 10.666666666666666, + "tile_count_expected": 1, + "sim_wall_clock_s": 0.696, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 32, + "N": 32, + "variant": "load_load", + "flops": 65536, + "bytes_hbm": 6144, + "arith_intensity": 10.666666666666666, + "tile_count_expected": 1, + "sim_wall_clock_s": 0.605, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 64, + "N": 32, + "variant": "ref_ref", + "flops": 131072, + "bytes_hbm": 10240, + "arith_intensity": 12.8, + "tile_count_expected": 1, + "sim_wall_clock_s": 0.599, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 64, + "N": 32, + "variant": "load_ref", + "flops": 131072, + "bytes_hbm": 10240, + "arith_intensity": 12.8, + "tile_count_expected": 1, + "sim_wall_clock_s": 0.622, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 64, + "N": 32, + "variant": "load_load", + "flops": 131072, + "bytes_hbm": 10240, + "arith_intensity": 12.8, + "tile_count_expected": 1, + "sim_wall_clock_s": 0.63, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 128, + "N": 32, + "variant": "ref_ref", + "flops": 262144, + "bytes_hbm": 18432, + "arith_intensity": 14.222222222222221, + "tile_count_expected": 2, + "sim_wall_clock_s": 0.618, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 128, + "N": 32, + "variant": "load_ref", + "flops": 262144, + "bytes_hbm": 18432, + "arith_intensity": 14.222222222222221, + "tile_count_expected": 2, + "sim_wall_clock_s": 0.642, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 128, + "N": 32, + "variant": "load_load", + "flops": 262144, + "bytes_hbm": 18432, + "arith_intensity": 14.222222222222221, + "tile_count_expected": 2, + "sim_wall_clock_s": 0.769, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 128, + "N": 128, + "variant": "ref_ref", + "flops": 1048576, + "bytes_hbm": 49152, + "arith_intensity": 21.333333333333332, + "tile_count_expected": 8, + "sim_wall_clock_s": 0.649, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 128, + "N": 128, + "variant": "load_ref", + "flops": 1048576, + "bytes_hbm": 49152, + "arith_intensity": 21.333333333333332, + "tile_count_expected": 8, + "sim_wall_clock_s": 0.639, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 128, + "N": 128, + "variant": "load_load", + "flops": 1048576, + "bytes_hbm": 49152, + "arith_intensity": 21.333333333333332, + "tile_count_expected": 8, + "sim_wall_clock_s": 0.684, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 3072, + "N": 32, + "variant": "ref_ref", + "flops": 6291456, + "bytes_hbm": 395264, + "arith_intensity": 15.917098445595855, + "tile_count_expected": 48, + "sim_wall_clock_s": 0.623, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 3072, + "N": 32, + "variant": "load_ref", + "flops": 6291456, + "bytes_hbm": 395264, + "arith_intensity": 15.917098445595855, + "tile_count_expected": 48, + "sim_wall_clock_s": 0.763, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 32, + "K": 3072, + "N": 32, + "variant": "load_load", + "flops": 6291456, + "bytes_hbm": 395264, + "arith_intensity": 15.917098445595855, + "tile_count_expected": 48, + "sim_wall_clock_s": 0.647, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 8, + "K": 128, + "N": 128, + "variant": "ref_ref", + "flops": 262144, + "bytes_hbm": 36864, + "arith_intensity": 7.111111111111111, + "tile_count_expected": 8, + "sim_wall_clock_s": 0.637, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 8, + "K": 128, + "N": 128, + "variant": "load_ref", + "flops": 262144, + "bytes_hbm": 36864, + "arith_intensity": 7.111111111111111, + "tile_count_expected": 8, + "sim_wall_clock_s": 0.678, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 8, + "K": 128, + "N": 128, + "variant": "load_load", + "flops": 262144, + "bytes_hbm": 36864, + "arith_intensity": 7.111111111111111, + "tile_count_expected": 8, + "sim_wall_clock_s": 0.66, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 128, + "K": 8, + "N": 128, + "variant": "ref_ref", + "flops": 262144, + "bytes_hbm": 36864, + "arith_intensity": 7.111111111111111, + "tile_count_expected": 16, + "sim_wall_clock_s": 0.658, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 128, + "K": 8, + "N": 128, + "variant": "load_ref", + "flops": 262144, + "bytes_hbm": 36864, + "arith_intensity": 7.111111111111111, + "tile_count_expected": 16, + "sim_wall_clock_s": 0.646, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 128, + "K": 8, + "N": 128, + "variant": "load_load", + "flops": 262144, + "bytes_hbm": 36864, + "arith_intensity": 7.111111111111111, + "tile_count_expected": 16, + "sim_wall_clock_s": 0.673, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 512, + "K": 512, + "N": 512, + "variant": "ref_ref", + "flops": 268435456, + "bytes_hbm": 1572864, + "arith_intensity": 170.66666666666666, + "tile_count_expected": 2048, + "sim_wall_clock_s": 0.727, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 512, + "K": 512, + "N": 512, + "variant": "load_ref", + "flops": 268435456, + "bytes_hbm": 1572864, + "arith_intensity": 170.66666666666666, + "tile_count_expected": 2048, + "sim_wall_clock_s": 0.707, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + }, + { + "M": 512, + "K": 512, + "N": 512, + "variant": "load_load", + "flops": 268435456, + "bytes_hbm": 1572864, + "arith_intensity": 170.66666666666666, + "tile_count_expected": 2048, + "sim_wall_clock_s": 0.828, + "engines": { + "pe_dma": { + "occupancy_ns": 16594.192, + "wall_ns": 15271.192000000001, + "record_count": 576 + }, + "pe_fetch_store": { + "occupancy_ns": 8704.0, + "wall_ns": 7624.0, + "record_count": 320 + }, + "pe_gemm": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "pe_math": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "stages": { + "DMA_READ": { + "occupancy_ns": 15250.192000000001, + "wall_ns": 15250.192000000001, + "record_count": 512 + }, + "DMA_WRITE": { + "occupancy_ns": 1344.0, + "wall_ns": 1344.0, + "record_count": 64 + }, + "FETCH": { + "occupancy_ns": 8192.0, + "wall_ns": 7616.0, + "record_count": 256 + }, + "STORE": { + "occupancy_ns": 512.0, + "wall_ns": 512.0, + "record_count": 64 + }, + "GEMM": { + "occupancy_ns": 4194.304000000004, + "wall_ns": 4194.304000000004, + "record_count": 256 + }, + "MATH": { + "occupancy_ns": 0, + "wall_ns": 0.0, + "record_count": 0 + } + }, + "pe_window_ns": 15327.576000000001, + "composite_window_ns": 15327.576000000001 + } + ] +} \ No newline at end of file