From f6d262e359238af10850d2887cb88459bba05275 Mon Sep 17 00:00:00 2001 From: Mukesh Garg Date: Thu, 14 May 2026 14:19:17 -0700 Subject: [PATCH] Honest measured pipeline efficiency: two timing fixes Two related issues caused measured pipeline efficiency to look worse than the simulator's actual behavior: 1. DMA timing recorded too early. The op-log start timestamp for a DMA op fired when the request entered the queue, and the DMA channel was released as soon as the request was issued. Back-to-back DMAs therefore appeared to grab the channel simultaneously, with per-op duration drifting upward as queue depth grew - an artifact, not real cost. Fix: defer the start timestamp until after the channel is acquired, and hold the channel through the full HBM round-trip until the response returns. Per-op duration is now constant and equal to the actual transfer interval; serialization is visible as queue wait, not as inflated service time. 2. Sweep timing window folded in pre-composite work. The PE timing window spanned every PE engine record, which included the upfront pinned-operand DMA issued before the composite GEMM begins. For large-K shapes that one-shot load can be nearly half of the window, conflating operand-staging cost with composite-pipeline behavior. Fix: add a second window scoped to the composite pipeline by filtering op_log records to those tagged with a tile-pipeline stage; the legacy operand-load path is untagged and naturally excluded. For 32x3072x32 load_ref the window drops from 1765ns to 992ns and measured eff lines up with the steady-state DMA-bound stage limit instead of being penalized for the one-time load. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/diagrams/gemm_sweep.json | 482 +++++++++++---------- docs/diagrams/kernbench2_overview.pptx | Bin 628239 -> 625642 bytes scripts/build_overview_slides.py | 172 +++++++- scripts/gemm_sweep.py | 7 + src/kernbench/components/base.py | 12 +- src/kernbench/components/builtin/pe_dma.py | 41 +- tests/test_pe_pipeline.py | 92 ++++ 7 files changed, 543 insertions(+), 263 deletions(-) diff --git a/docs/diagrams/gemm_sweep.json b/docs/diagrams/gemm_sweep.json index 9ab6fbd..e31c35d 100644 --- a/docs/diagrams/gemm_sweep.json +++ b/docs/diagrams/gemm_sweep.json @@ -33,7 +33,7 @@ "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, - "sim_wall_clock_s": 0.569, + "sim_wall_clock_s": 0.574, "engines": { "pe_dma": { "occupancy_ns": 52.0, @@ -88,7 +88,8 @@ "record_count": 0 } }, - "pe_window_ns": 88.38400000000001 + "pe_window_ns": 88.38400000000001, + "composite_window_ns": 88.38400000000001 }, { "M": 32, @@ -99,11 +100,11 @@ "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, - "sim_wall_clock_s": 0.409, + "sim_wall_clock_s": 0.392, "engines": { "pe_dma": { - "occupancy_ns": 44.5, - "wall_ns": 44.5, + "occupancy_ns": 44.0, + "wall_ns": 44.0, "record_count": 3 }, "pe_fetch_store": { @@ -154,7 +155,8 @@ "record_count": 0 } }, - "pe_window_ns": 81.894 + "pe_window_ns": 81.394, + "composite_window_ns": 68.38400000000001 }, { "M": 32, @@ -165,11 +167,11 @@ "bytes_hbm": 6144, "arith_intensity": 10.666666666666666, "tile_count_expected": 1, - "sim_wall_clock_s": 0.567, + "sim_wall_clock_s": 0.53, "engines": { "pe_dma": { - "occupancy_ns": 37.0, - "wall_ns": 37.0, + "occupancy_ns": 36.0, + "wall_ns": 36.0, "record_count": 3 }, "pe_fetch_store": { @@ -220,7 +222,8 @@ "record_count": 0 } }, - "pe_window_ns": 75.404 + "pe_window_ns": 74.904, + "composite_window_ns": 48.384000000000015 }, { "M": 32, @@ -231,7 +234,7 @@ "bytes_hbm": 10240, "arith_intensity": 12.8, 
"tile_count_expected": 1, - "sim_wall_clock_s": 0.838, + "sim_wall_clock_s": 0.625, "engines": { "pe_dma": { "occupancy_ns": 52.0, @@ -286,7 +289,8 @@ "record_count": 0 } }, - "pe_window_ns": 88.38400000000001 + "pe_window_ns": 88.38400000000001, + "composite_window_ns": 88.38400000000001 }, { "M": 32, @@ -297,11 +301,11 @@ "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, - "sim_wall_clock_s": 1.097, + "sim_wall_clock_s": 0.513, "engines": { "pe_dma": { - "occupancy_ns": 52.5, - "wall_ns": 52.5, + "occupancy_ns": 52.0, + "wall_ns": 52.0, "record_count": 3 }, "pe_fetch_store": { @@ -352,7 +356,8 @@ "record_count": 0 } }, - "pe_window_ns": 89.894 + "pe_window_ns": 89.394, + "composite_window_ns": 68.38400000000001 }, { "M": 32, @@ -363,11 +368,11 @@ "bytes_hbm": 10240, "arith_intensity": 12.8, "tile_count_expected": 1, - "sim_wall_clock_s": 1.264, + "sim_wall_clock_s": 0.6, "engines": { "pe_dma": { - "occupancy_ns": 53.0, - "wall_ns": 53.0, + "occupancy_ns": 52.0, + "wall_ns": 52.0, "record_count": 3 }, "pe_fetch_store": { @@ -418,7 +423,8 @@ "record_count": 0 } }, - "pe_window_ns": 91.404 + "pe_window_ns": 90.904, + "composite_window_ns": 48.384000000000015 }, { "M": 32, @@ -429,20 +435,20 @@ "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, - "sim_wall_clock_s": 1.187, + "sim_wall_clock_s": 0.572, "engines": { "pe_dma": { - "occupancy_ns": 131.995, - "wall_ns": 80.0, + "occupancy_ns": 92.0, + "wall_ns": 92.0, "record_count": 5 }, "pe_fetch_store": { - "occupancy_ns": 36.0, + "occupancy_ns": 39.615999999999985, "wall_ns": 36.0, "record_count": 3 }, "pe_gemm": { - "occupancy_ns": 33.152000000000044, + "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, @@ -454,8 +460,8 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 119.995, - "wall_ns": 68.0, + "occupancy_ns": 80.0, + "wall_ns": 80.0, "record_count": 4 }, "DMA_WRITE": { @@ -469,12 +475,12 @@ "record_count": 2 }, "STORE": { - 
"occupancy_ns": 4.0, - "wall_ns": 4.0, + "occupancy_ns": 7.6159999999999854, + "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { - "occupancy_ns": 33.152000000000044, + "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, @@ -484,7 +490,8 @@ "record_count": 0 } }, - "pe_window_ns": 100.76800000000003 + "pe_window_ns": 112.38400000000001, + "composite_window_ns": 112.38400000000001 }, { "M": 32, @@ -495,20 +502,20 @@ "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, - "sim_wall_clock_s": 1.13, + "sim_wall_clock_s": 0.606, "engines": { "pe_dma": { - "occupancy_ns": 104.495, - "wall_ns": 84.5, + "occupancy_ns": 88.0, + "wall_ns": 88.0, "record_count": 4 }, "pe_fetch_store": { - "occupancy_ns": 36.0, + "occupancy_ns": 39.615999999999985, "wall_ns": 36.0, "record_count": 3 }, "pe_gemm": { - "occupancy_ns": 33.152000000000044, + "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, @@ -520,8 +527,8 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 55.995000000000005, - "wall_ns": 36.0, + "occupancy_ns": 40.0, + "wall_ns": 40.0, "record_count": 2 }, "DMA_WRITE": { @@ -535,12 +542,12 @@ "record_count": 2 }, "STORE": { - "occupancy_ns": 4.0, - "wall_ns": 4.0, + "occupancy_ns": 7.6159999999999854, + "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { - "occupancy_ns": 33.152000000000044, + "occupancy_ns": 32.76800000000003, "wall_ns": 32.76800000000003, "record_count": 2 }, @@ -550,7 +557,8 @@ "record_count": 0 } }, - "pe_window_ns": 106.27300000000002 + "pe_window_ns": 109.38900000000001, + "composite_window_ns": 72.38400000000001 }, { "M": 32, @@ -561,11 +569,11 @@ "bytes_hbm": 18432, "arith_intensity": 14.222222222222221, "tile_count_expected": 2, - "sim_wall_clock_s": 1.113, + "sim_wall_clock_s": 0.535, "engines": { "pe_dma": { - "occupancy_ns": 85.0, - "wall_ns": 85.0, + "occupancy_ns": 84.0, + "wall_ns": 84.0, "record_count": 3 }, "pe_fetch_store": 
{ @@ -616,7 +624,8 @@ "record_count": 0 } }, - "pe_window_ns": 123.78300000000002 + "pe_window_ns": 123.28300000000002, + "composite_window_ns": 48.76800000000003 }, { "M": 32, @@ -627,20 +636,20 @@ "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, - "sim_wall_clock_s": 1.451, + "sim_wall_clock_s": 0.746, "engines": { "pe_dma": { - "occupancy_ns": 1687.995, - "wall_ns": 272.0, + "occupancy_ns": 408.0, + "wall_ns": 348.0, "record_count": 20 }, "pe_fetch_store": { - "occupancy_ns": 201.6959999999999, - "wall_ns": 132.0, + "occupancy_ns": 162.46399999999994, + "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -652,27 +661,27 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 1631.995, - "wall_ns": 260.0, + "occupancy_ns": 328.0, + "wall_ns": 328.0, "record_count": 16 }, "DMA_WRITE": { - "occupancy_ns": 56.0, - "wall_ns": 40.0, + "occupancy_ns": 80.0, + "wall_ns": 80.0, "record_count": 4 }, "FETCH": { - "occupancy_ns": 148.0, - "wall_ns": 132.0, + "occupancy_ns": 128.0, + "wall_ns": 128.0, "record_count": 8 }, "STORE": { - "occupancy_ns": 53.69599999999991, - "wall_ns": 47.23199999999997, + "occupancy_ns": 34.46399999999994, + "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -682,7 +691,8 @@ "record_count": 0 } }, - "pe_window_ns": 297.9200000000001 + "pe_window_ns": 360.384, + "composite_window_ns": 360.384 }, { "M": 32, @@ -693,20 +703,20 @@ "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, - "sim_wall_clock_s": 1.269, + "sim_wall_clock_s": 0.728, "engines": { "pe_dma": { - "occupancy_ns": 700.495, - "wall_ns": 180.5, + "occupancy_ns": 284.0, + "wall_ns": 224.0, "record_count": 13 }, "pe_fetch_store": { - "occupancy_ns": 
201.6959999999999, - "wall_ns": 132.0, + "occupancy_ns": 162.46399999999994, + "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -718,27 +728,27 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 607.995, - "wall_ns": 132.0, + "occupancy_ns": 168.0, + "wall_ns": 168.0, "record_count": 8 }, "DMA_WRITE": { - "occupancy_ns": 56.0, - "wall_ns": 40.0, + "occupancy_ns": 80.0, + "wall_ns": 80.0, "record_count": 4 }, "FETCH": { - "occupancy_ns": 148.0, - "wall_ns": 132.0, + "occupancy_ns": 128.0, + "wall_ns": 128.0, "record_count": 8 }, "STORE": { - "occupancy_ns": 53.69599999999991, - "wall_ns": 47.23199999999997, + "occupancy_ns": 34.46399999999994, + "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -748,7 +758,8 @@ "record_count": 0 } }, - "pe_window_ns": 207.42500000000007 + "pe_window_ns": 237.389, + "composite_window_ns": 200.38400000000001 }, { "M": 32, @@ -759,11 +770,11 @@ "bytes_hbm": 49152, "arith_intensity": 21.333333333333332, "tile_count_expected": 8, - "sim_wall_clock_s": 1.225, + "sim_wall_clock_s": 0.584, "engines": { "pe_dma": { - "occupancy_ns": 217.0, - "wall_ns": 217.0, + "occupancy_ns": 216.0, + "wall_ns": 216.0, "record_count": 6 }, "pe_fetch_store": { @@ -814,7 +825,8 @@ "record_count": 0 } }, - "pe_window_ns": 318.0870000000001 + "pe_window_ns": 317.5870000000001, + "composite_window_ns": 147.07200000000012 }, { "M": 32, @@ -825,20 +837,20 @@ "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, - "sim_wall_clock_s": 2.724, + "sim_wall_clock_s": 1.348, "engines": { "pe_dma": { - "occupancy_ns": 55883.995, - "wall_ns": 1552.0, + "occupancy_ns": 1932.0, + "wall_ns": 1932.0, "record_count": 97 }, "pe_fetch_store": { - "occupancy_ns": 
791.1039999999994, + "occupancy_ns": 775.616, "wall_ns": 772.0, "record_count": 49 }, "pe_gemm": { - "occupancy_ns": 1215.584000000017, + "occupancy_ns": 786.4320000000007, "wall_ns": 786.4320000000007, "record_count": 48 }, @@ -850,8 +862,8 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 55871.995, - "wall_ns": 1540.0, + "occupancy_ns": 1920.0, + "wall_ns": 1920.0, "record_count": 96 }, "DMA_WRITE": { @@ -860,17 +872,17 @@ "record_count": 1 }, "FETCH": { - "occupancy_ns": 772.0, - "wall_ns": 772.0, + "occupancy_ns": 768.0, + "wall_ns": 768.0, "record_count": 48 }, "STORE": { - "occupancy_ns": 19.10399999999936, - "wall_ns": 19.10399999999936, + "occupancy_ns": 7.6159999999999854, + "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { - "occupancy_ns": 1215.584000000017, + "occupancy_ns": 786.4320000000007, "wall_ns": 786.4320000000007, "record_count": 48 }, @@ -880,7 +892,8 @@ "record_count": 0 } }, - "pe_window_ns": 1590.4320000000007 + "pe_window_ns": 1952.384, + "composite_window_ns": 1952.384 }, { "M": 32, @@ -891,21 +904,21 @@ "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, - "sim_wall_clock_s": 2.137, + "sim_wall_clock_s": 1.075, "engines": { "pe_dma": { - "occupancy_ns": 19792.495, - "wall_ns": 1556.5, + "occupancy_ns": 1744.0000000000002, + "wall_ns": 1744.0000000000002, "record_count": 50 }, "pe_fetch_store": { - "occupancy_ns": 791.1039999999994, - "wall_ns": 772.0, + "occupancy_ns": 775.6160000000002, + "wall_ns": 772.0000000000002, "record_count": 49 }, "pe_gemm": { - "occupancy_ns": 1215.584000000017, - "wall_ns": 786.4320000000007, + "occupancy_ns": 786.4320000000005, + "wall_ns": 786.4320000000005, "record_count": 48 }, "pe_math": { @@ -916,8 +929,8 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 19007.995, - "wall_ns": 772.0, + "occupancy_ns": 960.0000000000002, + "wall_ns": 960.0000000000002, "record_count": 48 }, "DMA_WRITE": { @@ -926,18 +939,18 @@ "record_count": 1 }, "FETCH": { - 
"occupancy_ns": 772.0, - "wall_ns": 772.0, + "occupancy_ns": 768.0000000000002, + "wall_ns": 768.0000000000002, "record_count": 48 }, "STORE": { - "occupancy_ns": 19.10399999999936, - "wall_ns": 19.10399999999936, + "occupancy_ns": 7.6159999999999854, + "wall_ns": 7.6159999999999854, "record_count": 1 }, "GEMM": { - "occupancy_ns": 1215.584000000017, - "wall_ns": 786.4320000000007, + "occupancy_ns": 786.4320000000005, + "wall_ns": 786.4320000000005, "record_count": 48 }, "MATH": { @@ -946,7 +959,8 @@ "record_count": 0 } }, - "pe_window_ns": 1595.9370000000008 + "pe_window_ns": 1765.3890000000004, + "composite_window_ns": 992.3840000000002 }, { "M": 32, @@ -957,11 +971,11 @@ "bytes_hbm": 395264, "arith_intensity": 15.917098445595855, "tile_count_expected": 48, - "sim_wall_clock_s": 1.245, + "sim_wall_clock_s": 0.635, "engines": { "pe_dma": { - "occupancy_ns": 1557.0, - "wall_ns": 1557.0, + "occupancy_ns": 1556.0, + "wall_ns": 1556.0, "record_count": 3 }, "pe_fetch_store": { @@ -1012,7 +1026,8 @@ "record_count": 0 } }, - "pe_window_ns": 2349.447000000001 + "pe_window_ns": 2348.947000000001, + "composite_window_ns": 802.4320000000005 }, { "M": 8, @@ -1023,20 +1038,20 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, - "sim_wall_clock_s": 1.477, + "sim_wall_clock_s": 0.706, "engines": { "pe_dma": { - "occupancy_ns": 1687.995, - "wall_ns": 272.0, + "occupancy_ns": 408.0, + "wall_ns": 348.0, "record_count": 20 }, "pe_fetch_store": { - "occupancy_ns": 201.6959999999999, - "wall_ns": 132.0, + "occupancy_ns": 162.46399999999994, + "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -1048,27 +1063,27 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 1631.995, - "wall_ns": 260.0, + "occupancy_ns": 328.0, + "wall_ns": 328.0, "record_count": 16 }, "DMA_WRITE": { - "occupancy_ns": 56.0, - "wall_ns": 40.0, + 
"occupancy_ns": 80.0, + "wall_ns": 80.0, "record_count": 4 }, "FETCH": { - "occupancy_ns": 148.0, - "wall_ns": 132.0, + "occupancy_ns": 128.0, + "wall_ns": 128.0, "record_count": 8 }, "STORE": { - "occupancy_ns": 53.69599999999991, - "wall_ns": 47.23199999999997, + "occupancy_ns": 34.46399999999994, + "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -1078,7 +1093,8 @@ "record_count": 0 } }, - "pe_window_ns": 297.9200000000001 + "pe_window_ns": 360.384, + "composite_window_ns": 360.384 }, { "M": 8, @@ -1089,20 +1105,20 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, - "sim_wall_clock_s": 1.443, + "sim_wall_clock_s": 0.678, "engines": { "pe_dma": { - "occupancy_ns": 676.495, - "wall_ns": 156.5, + "occupancy_ns": 260.0, + "wall_ns": 200.0, "record_count": 13 }, "pe_fetch_store": { - "occupancy_ns": 201.6959999999999, - "wall_ns": 132.0, + "occupancy_ns": 162.46399999999994, + "wall_ns": 144.0, "record_count": 12 }, "pe_gemm": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -1114,27 +1130,27 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 607.995, - "wall_ns": 132.0, + "occupancy_ns": 168.0, + "wall_ns": 168.0, "record_count": 8 }, "DMA_WRITE": { - "occupancy_ns": 56.0, - "wall_ns": 40.0, + "occupancy_ns": 80.0, + "wall_ns": 80.0, "record_count": 4 }, "FETCH": { - "occupancy_ns": 148.0, - "wall_ns": 132.0, + "occupancy_ns": 128.0, + "wall_ns": 128.0, "record_count": 8 }, "STORE": { - "occupancy_ns": 53.69599999999991, - "wall_ns": 47.23199999999997, + "occupancy_ns": 34.46399999999994, + "wall_ns": 34.46399999999994, "record_count": 4 }, "GEMM": { - "occupancy_ns": 136.0640000000003, + "occupancy_ns": 131.07200000000012, "wall_ns": 131.07200000000012, "record_count": 8 }, @@ -1144,7 +1160,8 @@ 
"record_count": 0 } }, - "pe_window_ns": 183.42500000000007 + "pe_window_ns": 213.389, + "composite_window_ns": 200.38400000000001 }, { "M": 8, @@ -1155,11 +1172,11 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 8, - "sim_wall_clock_s": 1.198, + "sim_wall_clock_s": 0.632, "engines": { "pe_dma": { - "occupancy_ns": 193.0, - "wall_ns": 193.0, + "occupancy_ns": 192.0, + "wall_ns": 192.0, "record_count": 6 }, "pe_fetch_store": { @@ -1210,7 +1227,8 @@ "record_count": 0 } }, - "pe_window_ns": 294.0870000000001 + "pe_window_ns": 293.5870000000001, + "composite_window_ns": 147.07200000000012 }, { "M": 128, @@ -1221,21 +1239,21 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, - "sim_wall_clock_s": 1.983, + "sim_wall_clock_s": 0.978, "engines": { "pe_dma": { - "occupancy_ns": 6547.771, - "wall_ns": 560.0, + "occupancy_ns": 1040.0000000000002, + "wall_ns": 732.0000000000001, "record_count": 48 }, "pe_fetch_store": { - "occupancy_ns": 481.72799999999916, - "wall_ns": 268.0, + "occupancy_ns": 422.2400000000001, + "wall_ns": 320.0000000000001, "record_count": 32 }, "pe_gemm": { - "occupancy_ns": 293.2480000000014, - "wall_ns": 262.14400000000023, + "occupancy_ns": 262.1440000000001, + "wall_ns": 262.1440000000001, "record_count": 16 }, "pe_math": { @@ -1246,28 +1264,28 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 6335.995, - "wall_ns": 516.0, + "occupancy_ns": 688.0000000000001, + "wall_ns": 688.0000000000001, "record_count": 32 }, "DMA_WRITE": { - "occupancy_ns": 211.77599999999984, - "wall_ns": 169.15200000000004, + "occupancy_ns": 352.0000000000001, + "wall_ns": 352.0000000000001, "record_count": 16 }, "FETCH": { - "occupancy_ns": 308.0, - "wall_ns": 260.0, + "occupancy_ns": 256.0, + "wall_ns": 256.0, "record_count": 16 }, "STORE": { - "occupancy_ns": 173.72799999999916, - "wall_ns": 164.2559999999994, + "occupancy_ns": 166.24000000000012, + "wall_ns": 166.24000000000012, "record_count": 16 }, 
"GEMM": { - "occupancy_ns": 293.2480000000014, - "wall_ns": 262.14400000000023, + "occupancy_ns": 262.1440000000001, + "wall_ns": 262.1440000000001, "record_count": 16 }, "MATH": { @@ -1276,7 +1294,8 @@ "record_count": 0 } }, - "pe_window_ns": 572.9920000000002 + "pe_window_ns": 736.3840000000001, + "composite_window_ns": 736.3840000000001 }, { "M": 128, @@ -1287,20 +1306,20 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, - "sim_wall_clock_s": 1.699, + "sim_wall_clock_s": 0.999, "engines": { "pe_dma": { - "occupancy_ns": 2464.2709999999997, - "wall_ns": 316.5, + "occupancy_ns": 732.0, + "wall_ns": 424.0, "record_count": 33 }, "pe_fetch_store": { - "occupancy_ns": 481.72799999999916, - "wall_ns": 268.0, + "occupancy_ns": 422.2399999999998, + "wall_ns": 320.0, "record_count": 32 }, "pe_gemm": { - "occupancy_ns": 293.2480000000014, + "occupancy_ns": 262.14400000000023, "wall_ns": 262.14400000000023, "record_count": 16 }, @@ -1312,27 +1331,27 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 2239.995, - "wall_ns": 260.0, + "occupancy_ns": 368.0, + "wall_ns": 368.0, "record_count": 16 }, "DMA_WRITE": { - "occupancy_ns": 211.77599999999984, - "wall_ns": 169.15200000000004, + "occupancy_ns": 352.0, + "wall_ns": 352.0, "record_count": 16 }, "FETCH": { - "occupancy_ns": 308.0, - "wall_ns": 260.0, + "occupancy_ns": 256.0, + "wall_ns": 256.0, "record_count": 16 }, "STORE": { - "occupancy_ns": 173.72799999999916, - "wall_ns": 164.2559999999994, + "occupancy_ns": 166.23999999999978, + "wall_ns": 166.23999999999978, "record_count": 16 }, "GEMM": { - "occupancy_ns": 293.2480000000014, + "occupancy_ns": 262.14400000000023, "wall_ns": 262.14400000000023, "record_count": 16 }, @@ -1342,7 +1361,8 @@ "record_count": 0 } }, - "pe_window_ns": 330.4970000000002 + "pe_window_ns": 429.389, + "composite_window_ns": 416.384 }, { "M": 128, @@ -1353,11 +1373,11 @@ "bytes_hbm": 36864, "arith_intensity": 7.111111111111111, "tile_count_expected": 16, - 
"sim_wall_clock_s": 1.402, + "sim_wall_clock_s": 0.776, "engines": { "pe_dma": { - "occupancy_ns": 217.0, - "wall_ns": 217.0, + "occupancy_ns": 216.0, + "wall_ns": 216.0, "record_count": 18 }, "pe_fetch_store": { @@ -1408,7 +1428,8 @@ "record_count": 0 } }, - "pe_window_ns": 321.1590000000002 + "pe_window_ns": 320.6590000000002, + "composite_window_ns": 294.14400000000023 }, { "M": 512, @@ -1419,21 +1440,21 @@ "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, - "sim_wall_clock_s": 89.111, + "sim_wall_clock_s": 43.303, "engines": { "pe_dma": { - "occupancy_ns": 100690943.995, - "wall_ns": 65612.00000000001, + "occupancy_ns": 90092.00000000001, + "wall_ns": 82952.00000000001, "record_count": 4352 }, "pe_fetch_store": { - "occupancy_ns": 43566.52800034459, - "wall_ns": 32796.00000000001, + "occupancy_ns": 34717.695999999676, + "wall_ns": 33792.00000000001, "record_count": 2304 }, "pe_gemm": { - "occupancy_ns": 833762.8159962555, - "wall_ns": 33554.431999996836, + "occupancy_ns": 33554.43200000262, + "wall_ns": 33554.43200000262, "record_count": 2048 }, "pe_math": { @@ -1444,28 +1465,28 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 100687871.995, - "wall_ns": 65540.00000000001, + "occupancy_ns": 82940.00000000001, + "wall_ns": 82940.00000000001, "record_count": 4096 }, "DMA_WRITE": { - "occupancy_ns": 3072.0, - "wall_ns": 3072.0, + "occupancy_ns": 7152.0, + "wall_ns": 7152.0, "record_count": 256 }, "FETCH": { - "occupancy_ns": 40936.00000000001, - "wall_ns": 32772.00000000001, + "occupancy_ns": 32768.00000000001, + "wall_ns": 32768.00000000001, "record_count": 2048 }, "STORE": { - "occupancy_ns": 2630.5280003445805, - "wall_ns": 2630.5280003445805, + "occupancy_ns": 1949.6959999996689, + "wall_ns": 1949.6959999996689, "record_count": 256 }, "GEMM": { - "occupancy_ns": 833762.8159962555, - "wall_ns": 33554.431999996836, + "occupancy_ns": 33554.43200000262, + "wall_ns": 33554.43200000262, "record_count": 2048 }, "MATH": { @@ 
-1474,7 +1495,8 @@ "record_count": 0 } }, - "pe_window_ns": 66360.12799999685 + "pe_window_ns": 82972.38400000002, + "composite_window_ns": 82972.38400000002 }, { "M": 512, @@ -1485,21 +1507,21 @@ "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, - "sim_wall_clock_s": 48.616, + "sim_wall_clock_s": 23.224, "engines": { "pe_dma": { - "occupancy_ns": 33584132.495, - "wall_ns": 34896.5, + "occupancy_ns": 51184.0, + "wall_ns": 44044.0, "record_count": 2305 }, "pe_fetch_store": { - "occupancy_ns": 43562.81600011295, - "wall_ns": 32796.0, + "occupancy_ns": 34717.69600000031, + "wall_ns": 33792.0, "record_count": 2304 }, "pe_gemm": { - "occupancy_ns": 833762.8159987241, - "wall_ns": 33554.43199999785, + "occupancy_ns": 33554.43199999754, + "wall_ns": 33554.43199999755, "record_count": 2048 }, "pe_math": { @@ -1510,28 +1532,28 @@ }, "stages": { "DMA_READ": { - "occupancy_ns": 33579007.995, - "wall_ns": 32772.0, + "occupancy_ns": 41980.0, + "wall_ns": 41980.0, "record_count": 2048 }, "DMA_WRITE": { - "occupancy_ns": 3072.0, - "wall_ns": 3072.0, + "occupancy_ns": 7151.999999999999, + "wall_ns": 7151.999999999999, "record_count": 256 }, "FETCH": { - "occupancy_ns": 40936.0, - "wall_ns": 32772.0, + "occupancy_ns": 32768.0, + "wall_ns": 32768.0, "record_count": 2048 }, "STORE": { - "occupancy_ns": 2626.816000112947, - "wall_ns": 2626.816000112947, + "occupancy_ns": 1949.6960000003073, + "wall_ns": 1949.6960000003073, "record_count": 256 }, "GEMM": { - "occupancy_ns": 833762.8159987241, - "wall_ns": 33554.43199999785, + "occupancy_ns": 33554.43199999754, + "wall_ns": 33554.43199999755, "record_count": 2048 }, "MATH": { @@ -1540,7 +1562,8 @@ "record_count": 0 } }, - "pe_window_ns": 35645.632999997855 + "pe_window_ns": 44065.389, + "composite_window_ns": 42012.384 }, { "M": 512, @@ -1551,11 +1574,11 @@ "bytes_hbm": 1572864, "arith_intensity": 170.66666666666666, "tile_count_expected": 2048, - "sim_wall_clock_s": 7.072, + "sim_wall_clock_s": 
3.48, "engines": { "pe_dma": { - "occupancy_ns": 7177.0, - "wall_ns": 7177.0, + "occupancy_ns": 7176.0, + "wall_ns": 7176.0, "record_count": 258 }, "pe_fetch_store": { @@ -1606,7 +1629,8 @@ "record_count": 0 } }, - "pe_window_ns": 37677.44699999763 + "pe_window_ns": 37676.94699999763, + "composite_window_ns": 33570.43199999763 } ] } \ No newline at end of file diff --git a/docs/diagrams/kernbench2_overview.pptx b/docs/diagrams/kernbench2_overview.pptx index dde76aa2a83a7330db3a9a8e62f4c79bdca66f89..35bfabab505d8521cc4bbd3da07e605935e4cf00 100644 GIT binary patch delta 19875 zcmZs?V~}M{(=Ob$r>AY(wmogzJ#A|myKUQ^wr$(CZDZP}@427%J?Hu2`>`V`vaZU^ z%C%S4l^M07%gIiY${KtOA`I*38g3{A_}H3D`WfSBjVU>Wb`%Zsai=iDar_R?>?PD28tEZ?nd-D#!#m< zT!;CvB&YR#{+ES1VGuH4&@WSUR-nKC9iW0^fx!P`yr{lzHulSST|74=E?~>1i~z^y zYDx*i=9hgfW%1ASRZ=tZAF{RaEN72EGUk>+KYbqSJzhYQRJm%65w~%#-#@11NL)Vb ztkmstgcfP($*%d*z!H+JRqm`A-Mut658?%=dZ~LgWy428M@Ra0huYJf{iH^}D{wJS zt2AfCf}AIuni^1y;aZ2Z+XA>pS=Z|G2&Lo(*vi44i^|w(L=*!0g646SJ#RAKT4%RWiVapRao|1#1Re`~jggXZWv$8{ zyc7apXsf}-3wlJ~zXM9iOZY0loK=9p;%P>VGWH3OnCtSO!FCMsyaBP^+!mJ9*Dn{X6+Q z?~XkI%MZbxfYdX@=MSbO{M&bvN`6#Cvmsu}v*;Y2)4Q;3)ZG#`#Vi<~JCzDR+nZXrHDqDL}ewB#XIH3W@2gxQdhTqfEzizGm6C{BTZ2%d*7k{G&uDj2Sh^ zoVQEo4cjSr14syN+&QEh$0q?C#o-08W-**aIP+d&ZH^nTvy6=#Kd90=(JST#?orM4FPlzFT)~s1g+^@LTv}> z#hsQ{*1}R3mu7ueDNM1(Vk6`NBbW)d-zW}p_EwIes`$#WSc+RDFy3=|IDckDz4m9&s>`G*c}AnBV`5t;ec^z~-h9Rh zP(m(#p$R7%Y$TUM0+opf+|oD}8h{yT42xu;NCSYmSB0E;8C9d04IJKoqs8~rc&yL& z=$hg#8Y=*k!|yl@{d&9}*u*H6s>j%%Y*|ClC>=k}pnRTGHNUwU(xwMv&BaUj?8iO& ztlQLpUI1}ZK#pF56_-^q%3@)SbjY{hDZKuM06`gTA&fS38zBC^Euvp~g|e8-oqiRZ zYSRD<(XtTV@resfgyGjvlTKb1X|&O&V!movr5o4D%Mf1VD+zqzcg2iET21j7P9^zYyUK_23t+!L?xeeQ1# zB1fZw{5w#_RsI)JhW8Ho7g9#_`FEy**c9xqV1}1DUXo($uMnNG1|E2xQ`~88N|@wWtY=;4UV15M!z;mK;heQ4)`#w^xDqw^4|!$vdI&g_q# z2G05fAtOjtWCO~Vo9m1U?{5DTp<5V>AaJxW5i=pcST0b&!PBZsj>jsO_tqrNlh*tx zSu&^`A@xflIu}zwOA1gEWY*&~>6{&}9z_Yq?m!d)@Pms~s%F%x#o0*pmkP3c!<$Fd zPTj`f&3BA#Yf^xnNQp1Kz^*tvm3iIKYcj)f<3rd5qEf82M+%q_FhX05Z%5ip?d&u# 
zRJQ&@>eLjdTTSgjR7J2^wG+{!n>%~49sXOBW^c>bsbTiFNB)%$^Dm9nsXQQ;W--?o z(O6*u0I@cBF=#|QP1vhB5GrjgNf^W?Q9Fd<>N2}b(sqD-4rYVzT>J&Mk>%WCe+l(- z8mz+AB#Vb*+mx$~Q>d-ek>WRF!9Y}4fecksoNY8`>Sa}RHotW9d1E8wHjP>c2hiT- zmJR5`cg4%0GFCbxzTBlrGxgSGcMM9mOP6DQ!2K`I?^ul@+ivrj1?GEi@O9HhoQ@vr z+ph_A^>0wHSm^kfeKeo+MV9gB%rAl0j|N? zR4HWB^AGOnxPQ@d+l||>fEGwk!VQpYhk5ffCEse-+_jU4mug<6&JiF7zst_Tf+l95 z0SYwhLKWO1;In(v2b)G3)aby~tmiY}e)W%%_H~@hwQD(!s!!0+W;6@UAvOzr8+RdS zhrTB;X?rhNs-VO+d#8x>XyROxgY28fI|y$$P0Q^a^p~lZwp<06$|AG zi7Ve~#f5*}=*d*RfcLnmcILhq+_*|D1sv`L{e0B>R&z5?!KJYx##L81*;I%YxU{$9 zJy-!3HAnG_#I^0?`Dy+AZD@-(#?vx|ZqXuFGD%T6z=Lp7@Uc}M9w)EJ^;;{Y2}2ogpbyp(d16qae0 z8!N9i{9M}t!?V}|!yrluiT&xAfoc*-D)pbH(H#caxZ;waiO3tB*QBZw$=zcG5V*HH z33d>|{&#X*L${61pe0=T@4v+6Un_?C@K%|4AE*LZEV%qFGzed670$^W1@TPPD2pmJGe-erNQNLtUR~`g77_e9JKp_O(U#U^RXzjo01-A&| z@b6NHVu&V)Vn`+GzW|jxNaiMkIY<&PV4mclB}j(9Qjwi2=6{!MFG2phY-Aa-X=EAd z68SFxCe$~jsp}Mq0*ZaPgf((-ugxZ6Xld(@pu^!n*M^0=5<{=(>|FA$B*$csAz^s# z7S#RC|HGHR98LztfF@5KkAj9tmb-)M`J2+<+hIQkg^BuYi&ci$;;k<@kCq9AxD_H;P?|6e6`|EJ9-{Wjk z%sUhmET~e-ZgM&lG~++f_!o5<4Goh74c+q>*@2;r*E9kR4U7OHnV39|4$bsGz+32l z0<9Y>$-18Wh5>C0ij6l@QVxjkdb zhLO38R4sng9>-!W zlO^CTp1-~fq8gl#PjD{;Ed?YCzpTt}b0wA+ow?_cSkaT!08vI}De&|Yxs3tx^jFeB zwYi6#)fTP>C~-pThpQEQ>?ryQXh&6WvJDP26@W{^H2$ENa;!`tpWDqF(kTh?ZR94@ zd7l>N59+3+G50;4*(29A&n{gCUc3`brD9)^y!RwG$1Jziv>EHHY3MIx%QOkt_7ff06A93g7_VfovJY@gro4=v6Hnj&Kz0i z)z^j+(U={pT|?c6bN*`kp!R;E0V44C6*>g*3m|{>n|2OTiztT3 zXz5Vx>?qvP&^kp{8ks|(e zI+|({u3>b48Z9U2?yRM(zX1}$3TKl#l~iB$ebIzjwAPG++kxUKLBxM?k{B`9C18@2 zCQ(LA1pd;{$<@UtryZ42PE7BZQ%V7zt((-=#Q*;CvbBX?2(~d)!)L$5`WQHwmq&bV zW{|Pu^WHMd>JdOAr*mzZ%il5ic{#;d4mQ1aCE0lfpPLSw`3YAvy$6sad-4rdsM0>R zL4uTM1Z}IBR1lAO{dDsVW}b0m(;gX zX}`eD#g8~(2wmb+x7VdBTi*$)csv&9p7wzX?UCC{DD${&o$(zTRl7)jh$k{gb1qSy zYKJn~yKf-4Cbo=_Pam)h(Xl=w#APo}4k2q7dwSw!#kbZ_>7*7Qs=rpR{@mPnG5G`A z1npd@A4kUT9<)OL18K*i`Sq~QE|w+s5$;FMX2+({9b0Utd%6}r=`+2-RudkB$)~D={kF@XVEWw9^bY=SvJHrF^r!MM?4X+0O_T#QsG0}e?F)%kD?EkeKq{; 
z$T-e@rcANb9iZ;;{EX_pPfu}+BnXqJL*D;@l_g9N4Zjo?k0+VfJaL1&4b&x{c`uLW z$6x>8oER;1j(kxqHQ@c9#7}!abl5Fn5XdLU6X%j(_XgJ?Xb1dmq4Y_8#A60$hdJ?Aoym zKUM!B#?;LUfNq8ty5s7v!Yo7ZF4Sl`rnb%M5c&hv&dtjYiSDW?*PRUbK-4zwk@xR{ zG;ni68GO&~4rf4s$VTycVA5ly=BC&x-2G9q1&FYj=-X3{5v8j3phC^7mjv&7o+JG} zFHkE;HJfegXFLiXha|gF{6X<7KX7}{cdMot*X1k75%DXtn&*Yn3aPT%m%2LFkFhds zIpqgdW~jiUdtQ}a=QEOA#BsktjH-2!{5!3Lm|y~~KNb#9H~)ASA{VC&Y^xIsjwsgp z0U+jqdU*Vs;BBqI^~JNLe&am28P}O!`Vh#B&5fsLyl*JmkQ1)VJ@Ow87*+}KuAc(e zwp|V{YgJ(Q8jezyvH%tgu$gHq|MA$D2G6gajwca>$>b?c&cPXLb_})U%C9$2K5B1G z{%xrBg94~#n7DX$+Z9TSg(QYFZpvAlSwQU_K0I`O&%1KV?O8^a3+DanVVdfbM*XCg zPO5hib=(5A>P0yF)9Glt+2BYkuy&nemuiP~!t zgzz#!q=zaa$Bip9Ny)xi$5xAbj*0iz<2;ZT5v;~>;IW{J7OmZLD)XK%5`T*X! zIg47%Z$h&s##Q=UOFkwt$(|0qsypssatD}isyx|1Qrub75L4tCs9Qc}E?r<#mvF4n z;e>kaKbghRtwq2}=~g2gLXon>gQdwV9q)b z7O<#ftfEVn(l%zXT8deFI~L_9DFCYLmJ@6Z5XAcumdZ+&1zHt^VS;eK|AN5K2gF*? zu?<0XN3E_6>u76oF}P8%1JW$~bBtO91HJ`zte>Z>v(VbBW&J7N6d30$(zR_YVb73P z`z5S-57D-OEyG~y=iafx*pUn0L3sH9VH806{VyhtN$uC%E(-AwfbHfKEMzJDsF)o- z8>6C!LQI5GB4#O__7vVPLaRtU_UCr*#-0$x{!d0y*5L3Quq!6Ix&7TYE1*9jhSQX) z`;qXB3KmmgR)>2t?aXfr5&AHR0xFv*LWF38G7^&7a>OVg4yHrHdshtZ#5b4#9^9$h z5}>U|!e^_u#o8NbfW9g%gEV(aNlwZ&jA?%?YINT?vYDF>1MwfRktBCQyu*zE%s#Y~ zMBH%@7zlfKW>x^5Zz@48Tgp@lSK!4GAL1QdZ$uh{(ND;_Jb62X9vOMZ&Re+g1f#q~ z@H@+q<wc~%}^xVms*1q z@E{`agWY><#EA8je$VG6K=%)p=@A2jC|>q*(#bqkdo)ud2Z@f%y}j}uLmKVirlyc} z1R_|WPcBh_VD~Vytp7Hdc7E;TAi&`I-@`0pauf6#hRcLj zEOypWne3d85vH?3%#C@ehT}!%-lOvQ8KzmLQDZQ)R}OAz73MZ=YKa>Yr4$E9DCar| zhjl0_uFhxXwn7EKsS+OvbFjuH0_Jn?cn&7UrMcOKeQ{Y(w5NGFmVbAv(F3-z4lO zrP3JHWN1QJpb}E5XlJkahO%ZN&>%IP_&5Q7Etl=ejWcRg9&2;ALxqq;R z?G=~pHm}@GVdJi`aTO5f`#0{xmFP^y80DVAal{GT{@#I9gQ=kZxRxEt8o;NO6t+la z*@xTV0*~U1gDV}f9}GKSI<7#7uKSju1p`x}BS?szu*7LMUYz4F5`h3RL4T*j#WM-8 zv9qD1EN~$W)P@dB>_Q9;cHd)Qzr2&s**LOi^LtbsY7{@58agGA=LzR^W-9V$>r@PF zeUbhB<(U@lS|e96b5%a6y~G7duphEYQO|n;0zSZs;X|J`^vfX*RH5x)mgemF9rI0` zuYd)PS5{7I=Pt9rUy?h!rLjvm0;F{a>&mDTQY1uk{=zsWB45pz)Sjj?BC|W@YTmgk 
zxl*WVJ~o_Kij3a5PfcYZE!QIxT3WGB&QFdaqpY!UD?JewDcFoF!{Cp6Zknn#D%O^2 zlkRcqq^Bfvjd|Vuh?0Vfqs1#i(x#v*F4#gh3D)YW8-52xuPj_s{se$kr11!*DhM?g zBNdVqIa&KN*v4FZJzQ4L@yZ}MgUQpDZwdBHihxUzwEu`o}=MMH^ zR3jXe_JbHRCf@W!g(oj?dbY6_=#%aC_&*L$`O@kL>zjrftxq!e(XUS4@2+px)s5Q~ zT!wWLaVN99H;L97MFBz0#L*G#FbBH+oU%Y=FMB9EFG*nmRXr~l{9EUIzmNfct*D5dC_u86hO<{MGb0iK!kw0sL)FIQO0Ud{W?-Lc2FX7)t0EhT$ zu>J`hktaiLO1wS5(G0mljtl=qjXu#M?rivfYs44>0ANO$a1TKJOzNN`hT zrC|u5k3Ja|N^S;gT>XFAKVMv(A{mbp`rr1<4IbuP(+wvy88n++o2T=;0-flpw`L}b z)F1a0K*t>LP_78{)oC_KTF^m53T2+*@UPGRZlovw)i)OR)lLEkL;u@O()7}+Ab|%0 zdK>s#F92BR+^nrGGN)|TIPiw9UQq*l^Ps26-sBiDru5Uu1`NrV{msp)d=2M9t7{7q z_gPlvvJN}_!5=&6YY?k#XUR1b<8ap$&kOSM`%Muet(59T0QaO1?>FLJcN-HNU`tdJ zejKM=VvyIY@-qZ@5Hlv954PE|TfeFVlU`@05dnJ=4|-dr=MS!1A1CSi6tKYpS2!l( z-d(PVM{KI=(Y?@Zc1FnCYBw@bn|y_SN8vbXa`u6TXs{ zj4CI*w(w0CO3elt7GFV`(U17t03cO8HCD z-*h}dr_Z@%3}Uv9*(Z?aJ1sQTRNo$=ZNyHN-xiJ#?(qVcdkhfQPp`W1f~V)V@lR}l zL6r#eaEN+->E}q>!Q`qXaI}|ENY=gihXIDVt?oTau=aWHIsxxSNy(x`qT_1{o>&0} z?oJxcR)@#IO^0)^?`qkTNApXlB3LzeP@gy^?^%5PPOY(W>~y8@b!JJP{3FgA3u;N< zQxSV@t0s56t3~;d^XTv3>~I<*j+rN4)R2mx{e^kMVzi z*Xmo9A_jR^GBHF2Q?alfnsk_)eYa+{&hMind+HNPGU$0Uhz*(LqJc|ql2%6w7|)jz zmws#t&7O1<{0fh0DLN^8?Q&=H8o--`w-EgOn(6eZ_x2v$PN=#Uy0Rw}X0W+@ub@lu zOb+aJO(gQo_v3qX1I9pZwu*^ab>`r=_e?5y7LL%_bT#k1D6C_58%P9}=TBgK*~!Un z5=5bDb}IT*_H9L2t9k$!*}#K?_siZ?m+>~>6WpUUC9U&~okACWS27{H43I8DToAOd zkWdsa$S3(;Z57$Hw4$Yi0jG>~dU?1_+Bn&)7|X7YBl7&R$k=A#i`jotZB{w6!S6z{KxIPoD-n(!xTof3*pc0E54&*Vt_Z;d?c2P zjP0`nmJ8LC2{DKX$qK$+F+iE!YQ{k{|I8FFNGjVCg+wYoU#3SN>9}Y5VvKW1%8UE(2`K0Kx^W*xuSCum#e4W$qOd2QGXu!pOX;uXWvLKm7 zhxW7k!U%E@q4nCx24b)IO%-wgl;@&|0q|0f)7vF4w_xO65`*TkI`iC2V&Z41SgW(G zX+9BikZK*(dyhbLa*;FI?;jEU!^DqjaZsDOLN!*?*M_EG?r>4Y^l)G98Ys7pQ-?U`h-d0 zN`kqUyZ8%D1I--f+yWkJ`H_XD9Gj7lvVWb&ux?*{OQjm4&g}@aR`3xt-;M0f((W^m zeA-Qb^|fcW=*@SxLw4$Go{aoMZ^r#GVt}dnesqYy!f>|2C4evheP##mu5f>SL)_-1 zncw<&?ckfs#Q^`G?kva5#gNuTyEaW>UO})vS<6Og%XX6rpsf@WTZQ?R$Sk)rD#Lpg`67%rz_WyMjtX!9UbaS 
z9F!27R>GPlP5|(P5$?(y9Vg1R98JI33j}SebvkDr9HP#<1?D5Q~GMi2cJd>2UkXZ5YiuQ}%C$rRB(;$dfo*t#eUof%v~BI$`Qdf- zeqKt$KCe}`IryLveO_n$BY;Pmh=lSsq83%1Py$J}w>0hPl&hk_(O645Mzu|CAR< z11;zwRq*Ewo41o~AZ5ZY5hW)z#gy%NA_|FLZd@9gPnJlMD^Uq5bFo{JRR)?<%FQW+ zk7wQ$fZ&t(bj*6+R6%MxG3N|Mt6=8usNX(l><-DvcQfrcuHYt{fz9R-%z(lDn==Z) zC@Ufnj_geXVkTxhNB5zvE+KIz)+CtCb1<9oH54hDDa;3BrVobic_#YTVPb#G$ZWd* zN#IU%4La210MUXKACJtz915Vt=8HobRv1osqY6qpPOu-dX-h>9S6}~Tb=}#M%Eiq#dy8d&w)d7hmG0n` zwPCMmKP*A}J78IgGPUa^L%5L^0n;3c5)kV`NJ_+!G5Ffn$?7CK6cx#rwrFnABp+WG zFSGZqvv-~mh1-|Px-Fx%Bg1$&=L1-(tI3$FG2*y`$k5={?iiBletu*)rYu>#7=Ut8wAJ91n=(}S$D_f0(TTN&b&%@7oiY6 zwJ1+KPLj6~%{-X$ZIUhUzowx{TFC0VQ_XX7wSAcj5@I5GnY!I^xq}=p%;n}6_p<(R z>+H*|b$8rFKu6ResMr33nsicrW8A@E6~CE_leLRm(m28@5?8^6P%sYcvo_JriTGasWVn~>>B+Yt&sr|7M94v&y4lV+^wDL08Z23lWM)* zSk2Ux9*Zf1yhgl!7Qj1aUA|?V|EdGGy6|;O11uhl*gg%Z_A#K$2GKb_w{h%&tFAAS zpDGR8Ft}XjT>dWH40C5=_Pjj`H@}5&Af`c@ao(raoJ9NS5K8IFJI$GVfMgYQV)AsE zF%5vt1gFskCye+_4n=)tyhYy-d;&@kk-MmG#wze|H7sp>F)ZzF&+N#_sSBbka0Dft zUq-r)iw%(DV(MJI`cg2y_)>6}oXyPD%EndxjMI=HY)XA#2aM5_bKye!a|{c_sa(fr z^4n}WFDDQ*;XaIb4-Z`=Cty5M^CChM8wK#J-X_8a-H@eHN6uBa@@&*5%4|ElXLND< zhvd+T%iRxooVUFfzYk7u|BX?+4mS;($i7{NL{{;HQI;wV`c~@mtg}QwhNO4vyYxD&HCVX)%r4lK|vH z)(0cBNPBR#7fi!$v`B*$yqXA#GRReUZwRS5V~YOGBBr>L8wBK5+Yq_r9`~<$Z!FjO zJfI$-Qayp$m-r1uMTFduO=GfJ1wn>>f0$TbDmTk=N*cLu)i@0AKX|5*C8a{#zNkhMaBS9Nbv$PUF%Z`slZ!_Say(yi801Zztu z%buJGAuM6u%OW(%AXuL0ud8w`=ixXA6{|n8W#YJy1RMQjXQvA>4gZi8x5kzErYvc%Wr9S6ep`swI!k`BA!0PX1>hi*V6h_9FmPJ8w zML}i@-P5%=i$v(Xs{2AjTodp6es{kLTK=@>1>2D|=bIVjR_Z14!{v<;+qu@+!@M|F zO?D@5eFACd|61o+)I0>7SyZ9QvxnNaL}o&}`Hdy(xXU4Y=MBc>tfJSV@w6OfaOuIpWin4-JGM0*GUE3tbFYD2%^EDwV%rPF~Li zaIwrCvHl~c+`@4V?^Q!ccr}{6puA$3`uEbW67FEz##z*fAX#1kr(~#C%gmYex9&}q z{k9#t!|!b*f}5PJnm>1xDy7Rfo^ZhfSEz+%-nVzo3OmbF4vBc^wxnrSwX}y}DO~s7 zpZ|Ry;oq7|H~y>UvX=W#%|#~Dj(P_T1Qg@*Z_UN}Z_TywRdZnlO+8UTf%=pDM7dMT z^RtJ1wnpDD4kPjBg+onm3```ABq5)~xtKAG3J3os*b%Qg5|?l>9E-von|D)4Gp*hz 
zaeDP|7q(><%m2xQ`MfRGABql<311J(?gQ0oFa-;d8M#F#=kfFzJtRcnVu@jm()Y9sxLY30>SJ+{@B<}>h$nvn6sx+(&78&CeVHse-Xiz^RTi;D) &A4HbrpX1a;Uy0K_<1=}SW%EeSK&r8#gF z=FrEX5-X3^k#fR@Ub)%u`$_bdW}*ji0M6J131B#1bqKC~2PUIZ~inzy;98{LOS;&DajKqgC9i~2jG z7$ty0Wt_0KP|5^}+bvab0WXU__G~XX9mmeoFj4msXnpj(21An!(nJfj)Lt#b+bgZvL0$e-RBHf|{V9G>XCplOF@mtW|MZVNRTYP?8;3&H1p&x* zqD!1JpQQ}D$4;N8?faIk-X1SE+sb{Xx7SXh10|_+27B__2%W!lK@xIFH7@%Ci zqxE9G)GbMPg0R8>G{Nza>0Uy|W6*m2XO+f^2pZ`e2ih)(yAEQxSknA}PHB&tnu5wB zd`Mm|My?8$M;3Ff(w*32$@g#qEsLtrgzudj2Qt1YDYYOYy4?v*w_r-EoGrXIW{Z0+ zgH$V+`7)bs6|tFwZrgUo=i|#7`DFGEp6-8cK6@t%BF(u0-01#k=i_v%sVe@N8jYeM zHOxk4q60TW7vq)2W^Bge8syyE29u05HAF^cgOgcaJ!8$K#fs!yg>}m} z^Ae=b0^cG4mU&)2j?nADJG~|X@u8CvsbFOtJPr*VpBzeClFvW(XrN?~_M;xA+pQ*% zGhikft!1bodhoks5H`pm-=Md8*;qOks%m#&>G_- z?rgG+Zmbh@(q0b#TQAjnfy-{<;)t{~>Zj({6>7;}VIc#dg7tz7SE+ucb@W<89tLP4)`^67^gTyQMi%Rl;|YGjbF!7t-*0Q#ys+n*G%9R`sfe-iiSiiB1Adw+OlC{% zV~JpkbfM+JPC4zi;+<;8G<-RKZxK?sfUM&QIIpiPSNmw}3&>t1eG9OF<==_hN2yHG zuEiKwFC=?2Uhi;sr6)DDEXlwMsR`y`6?ksYLBBJIA73H|gFh;OS(CRLNj@Q@g%p2VBKjE@g5n};R8h6;~ zXjHvf2Sf8$NdJLeLh~jf9U)buZRxj87k;Qug679`wO!tjBy15mm7=2^UJc0J6!{Yx zgYvQ?b??YvdR?;Ro%ZF$mgP(|+OfDr-j-kakFCLbYt^&0F&+-uPX zqFv5jfCbFdB>0^lNA88|38+u*g~^D!nb>X7D3_>%e4#uGW}cNCkoZ3DWi-0b;o zODtheiYGfLC%MT|i1%LfFC5%IiG_%JO%U|l*g_T`tmJiF66AG{vFL{{Av*{YTqk}w zMZxResu|LAmEp@*g>@}=l+z}v*;bByd$xS* z%B-Y%*L}9Hx(`P67Z|9e3XZbfb*S&Cx9!MWyk(PMgO&ljSs{gX|2WWM zw0a8kAdZCa#uolr54r@aHTQBu_h8iX4X@CK3R}}y}XQd3{MN zJ{Fz4V##6ZSN2=yuOGXxJuR9rRa;?yl-?AMqxrP~sNHAWwDTaj*mgw=7IOS4XqIM) z_d*^OFLosoC}!Yyg*W4Xd+_Xnp&788XEtSn7}2JkD%rXtYAxLCy@^y%zxy=--JfbH9JDJUjQ{X7 zwZA-#drx^a%Z7=$fbTl~Ug_7FV~1YNOOYCEOX@yi%aTQQPa>Dgs?5HJ*MiWy7+h6r zC(u$QJ%4a>-?GzDa#%ydKQt{eLilbzTdT1nsoIYWNg*gjX$gA5FvXpmj)N~-md0_kWyAiR} zU6_G-#U!e5YN&x%nB)hu%YZiXu%BJ!KRk`QQMXf*W=l!%6sYxs!AaA*@rgQ4$c>7! 
zSyXY}c)a>9E-pO*4o2`7_MPuqJb4YZFY2=m6YbY=zCO|qZ(C&%9Ovx-u!HAiYeBTLV5$0DX6Q-Q>X) zvjU!#1%wS!==`fNmZUgNMDrRy4Xq~Ew+nX`gx7Y&$=_`2cE^qas{0v%-2 z9yU%ok~sWIZMx7nThq4;#Qof?ardHj;_P%M58e~#nfXGx3Mu%44R_Jnj@|lpzHWbC z9*MhAH;)5%_>Kb#ZDoXZ1{6_N9x3Z07&zijIY)MBuY5Q&)!o%p zd3zFWj-bHaERuVSdUExAa$NCpJRkzFA~%wm$@B__)|@w($$ELdN$%-Ne)G6rer$Sl z!g^mm2VRMEX?=o>@8Oj)TWXQwVBQZ&{ME-b3d@62ijrjnX9h)i1C3U)i5=gIR(5D& zSIo^PR5OZ%fQoO#@&jvCD+osPW%(^WhFkr4xR#wg1Ix;#GaA)S!iyk=^)^FXI^U_jHBiHht{fh(01Xx$>o6? zQIUBR9Kt{pNGFMvk@lOW>o8V&g4IR*yW3m6Wg9@*$J1>or_)6h%seBQMd&w0B%@Hk z^h63d1vDFslxRN z$a!kU8w9O__ckG92gugn30k!o5*49MO<6*0kaYO-9Cvt>Rd{Sg7Xkm#q=_}tOrRe! zDhh|mlxrO`5t3xsbfAYD(HIdwC);FjUuVEF83<>uLfH3R=bH%pTBq<5|J!=?)kn%C zp?sH*A(Dt6DENhy6vi}uLUvUk?5vs+IjitiZ zR->l{nyF(y0RS!?da-Z}gM|DYgDB|970`)x0fO5160HSi;gg_QZpBq9IdkG73)R@t zWPIdawA{g$2)Z#Q+Glo^2Oik*Iz{J5SCdy?P(&ff&J>f`X-Wg*vp4rq1Oj+bLP%lc z$UV^Ooh2H|AsS?H_l8t(FOYogY#>2wL(ic(?*3|PRlr!vqIpvv7kJ69%Q03#yt6#Mp4!T6^X%?bXRdQ)wH}4ILMxV)XD*tYGTT5ct}5d^BrQlt?g9NJ zaMGFwaT(>tvWca>CRZ`p^>$JwvK2CHTsPfg?f|~|-vOZXJ)M*3FBel&`bl9q_|3`e z0;)3Vtu{5%#^1M)(D~IS$&!VQ~OO zsn7X5CipTGg@Zh@g3{uV+G3iLZ0(Qmg@|dW87#*Teq@E2)%6|Q)T}>5&&~O)^NVhZ zVH&lO!jzS_VWXln5-P~yq@p8Q7ICoZEGebTj#G-WSD?6wA%o^TvUS@5vRnC?$29nE z_##W%#qxu@?scLT9!Pi74|8k6)1gMlEwxA4J`pZ9Ae+?r?2^yPI(E=(tcX>|$>(LP ztKqh5$IrXtk${dx2vBy(mg7^E8gzs@qZ(zAm6(SQkpCYh6JYuuCL>omcv=7f1a!pm zFOy+U-nNIv0wzs@WWYO5gt&za00zn+U<+^>$!1u@h?viEFMiVI8N< zv5yH*;JLKw;KQ8P(jp)U74D5oacMMFw&Gp+ytHp6SBK)@MDr+6N|#8z#0-KWbCMDv zmNbv>$`6nW?HA``6Ox@&I8MN9G=CXP5V5aF$<{UV9xchKAwgjWw8}ztMa3Aign_Ke z`B{b4IuK?s(JIU2DbGJD)NOx1bQ(7%q zh{(&wFx`*8kE~(x(#9g1_1spOdDOm#?udaL0ap)ao!5^+UoLjBGbw4I$*nTJGVK2% zv8|w!cAixy>8vS$9W0kI_8+9a=Q~tL6uwDk>8f*OfV1(BdO?6Vvy*;wpITbqXw!iNIj5S;?s7F4A9FDui?0DGZOWf@*WhF zUg8u}8Ll%x ztiX-p0EGwyCd1Se0TO{Y3RlEm$kj9jJ61Dx^@O26hAh)0wjC3}rz+J9rrM z{+~LoG^(j13cti4PDB<<0v3t{A}+{QT7qc7L+i$(f*$dp7TmxpprV$7)r8;@2nZNt z@VG!-3kcS>;6epgR4BHJEfp7nwAKv~qZTS)@5~Ek@suBMW^(U$zk6rqog~b6UE8Z; 
zi+j!#3@)tCvv*Mp_<30K$xBn?t6Xy@WZ&5qm2u}{+E~-)3%gsxUr77~KMc6i)cwPT zhTAKP0wfvDUMUU=H0_~ZtgGE5>A>y-9zl~dS8C7Qcr!1}JPhI%=%J>mHRAP zy{@%I*mR`9zI?rT@80%XeW6#+9&4$of8%LWch9mha>uwMy(!(vu^;-{|1sdTX#LrS zvW}fYG(OMXJd2P2pgI0;wM%7X&*U|#%IcN#HN`7OO9D6-y{k)4+!s#&=HmzEDgQ`^ zbw;(^?dsL!Kltme-RsAV%C?Vohc=(uhu6JWbh9iYYRtxDSx@4M;zbudB!AiTL@ca! zDlnbwxv{D>g14rL*H#c(RTn)*o-E@vzA81lpAaRf%ts+FE0Z+$kDq=N-%)$_m5*J* z%gTS;ua)mS2~$eh_PKUf#S=^ZFy>%c(EuadgwK8{@=N~=huk8 zsu~=a7<7nhUD34FJG8ST#`62M7gTfQMH8Ij4s%9Mn;J8Z``ISBBE**`7L+7(C0S^k zY!^zE;>_Pn%I$tpo7!HNEs?udS|Z;<@rmi%!^`Ia>eZj0 zsGqX%&h^5I+YxmS&oHIQWwSoYuQ}tDA5e#E16*7lX2(XV^9D^zo*_9nDF3$bplD0q zp6h*vN*eZ4cE>7IPXsc@wcHKS{zqO|U5ONP!6x4_H_nyd8x$T;(HpA85-_L6!mZ0pO{1c zm3|NSE8~bvRTE8xuEtRe`%maRWSl?oX&G{-QJ#`0hcQ1@8`Uip#_!{4{o^CR6!-;^UWYA##assP;*;{zpfGcbVG zB}E3-DrX9grD@2L^72vu`JRyk1K&tP0y_C54JLa>sapLV@W@TbN@rO-7A(c2frD^3 zSpj8qd*ot>H20lICHUATZSEunCnSF}8qbzuGoXihQ}pD``sjHQiVh7Qgf#1i2QFl{ z0lJU9aUMH-iMuuZC!ot@`vlCKF5O9idn1hPsU3z6Xluh8k1!oI)W!Q0PO5LMTK+2p@l~(6UgH zC2$I**0l!K&0&-pw4h+CEvfyuD+@MF!(ii0a|_Q*M_zPE{(q}{i;L3H1X_-kbXbxS z{5l;;sFSXB1;YpocFbrTmVv~&1{O1wNXVTLg)ME7Ei|%VOF=Q@bEXDQ3gj|Z=W>ls zV-&uZft*D|@HECCY)#TMF<9T@Xc%BejWNINCG67zD)~HMDkuBXAhhDF5gw#w0whdfG9U~c6g zoVx?rVo<=du0;LLhv)Rb^Tw{F*f~ev^??oRD$UVSrownrhAE%OFhYaPdB+xWROmf? 
zIR_3I;oX`CXQ|zh01PAgPCmBJMP{_zMC6G*b7A+kiC8a?d79v8cwOh}GM05c2zInw zRPYHIFZszhw!IsR@-$aJcCRlB02k1;PYF^tatqxJzJz|Z&KN~Aj-pt5lj{unPqO}O zIJ5Dw?Z}gsUw7}^7$zuRg+}Po*cZmKfGZS2nmoBZfi3bN`GRnk_FxdNvH+VP1_jW4S~Ug`En@*13;?*TsNNo{eI}RdR6VJ zy{lLEQNP-~PkYrAi!Fz1yBtS;vZJz0x;`~ zrjO*yzT$3%aqElU9`~1g*+aMo8&J8PySEp{xqV)IheKRw2OuEm{JQ)!tV~x(Cb0RS zTgY|bt6`*^t9f3L|HDb05Eun0_@}ozEAYa9Z%`p|!4UuPp91W=N&56Zo4^H40FbxK zXTlbGbcWlL$IQn(^Cwo8Dm*Tkks_e}z5?ESRVd63wX8sKd^|)!#??S`4|2wI`7#<= zYvkx&;#mV-S|+iF=7T2>h9a)2u|uET&L_BLnjmCFAv1^v`ezU3yHnF^t>s}Q1f&S7 zEep7+cnSCDw8p91*O0O$k?}d(SwJ9@&Ft?qtGYycW9nr=)s6$=a1z{wiZ3OTz|CQM zj`TABP5Ih#`4?4G*!FZ$+j5P}@`laQgZGHjpXjT7UXNu$f@~t?_{QipnRXZHD-@;b z>PML^zm zWS-5L8}82aq{gmquo6zLO_RjV{@!0YT-xtH?B7?-R@DFc!sq6>bKAc^cUZ-DwwhnX ze|DO$3CO&RZG8{BWORK)-84LTChhE+yO;C7fEB>=cXsvCtQ91v(KsLtYcp0 zK~s?y$H6=X6JqDUY(4rgq^Qqh+Qn@sn(-jT4pnDP+KeNn)GXHyL{^qMoy zS?X~Sax>u{nzGXs&()gV<`?ZqyeeC=bnq7l|4gA=fdCOio~(+PtDSsJQ7Co`*nC2k zr&J_Ag;l3J=lGeIaA-k~G!q^;9^GR|@|c_`(`Bk7n?fuxXyG5GY`Bn_m3M(El1jLW zL#)pxg;M5LvXZEf)RvrRS zRL^p^Db(quIO|kfw-*py#dQqYZ{c7@e=$$51EM?}lW;0W_!i3J2d58~dI7)Y?I__q z)CMHMXEwM3Q*;uq#-bt5ONY=v|G*_)Mmk4c|`z39{J*Un{BP)E<9+7 z0*Wy*NuXG+-?O|JkJZH6m=Pg*n8LJ)!a$*f>x9rn6MMVN>{X+b;Hp)`sRpTnH62ON z(Gq*hxltpiWwzBbVFnNiuV@sAIuM&dnD@~_4wFCQqi5CVT^s2@B|9kJ{D>0 zm-dW@)j87d1X$auE?_q3e|}S}3y)w4#)c%oivUO@qbY zkqLiy42G*^t;bKFVhKXJSU^`^2y{?HU<5A&IpFmtz+~})EJ4~61sFL1eYUeYa56Aj zF4CKfN2|ttAtq@I?J=xIF>>F0`?7Ptoddow&62N@Qyxa#ToIt$PLiv>JGZv~YA zy2aT`EX<6Xv4meG4an7~Qj~)vIG$!2VnAX^EX|3as-mO}HSY74_-DmoR92F}?oI&Z z7J!b@paVN-NZm+{QVzwer;)>fwE{JCgp)ox)>UbMJ<*|Nw~5WaXiq2n}ilh0CgZx7}Zw<8U3gLuu50>c)Vod&CYjaU5x)3@<9U)Cp|7Z^eUFH@iea#LTw3({n% z)6>xV3-wj5oDwf|X*`}ijxcUvd|lyYgz+3e)6ipfMtQgLf*?!S1}7o?^ACXrA;7#N zLk3)z#*yz#A-=%zJp`w;roDytf}rPk`K}4IT^(4{tmg0#8C*&eAy19%ijQP4PI1LL zR_MgE8_d#X@wI!N$+f0zs-Bpbp0?g-d>YOTNYIf?z;(b80zN~&n{50AW?UevP1d#Q zkEg~IseaYtm{nZKkYDjqaIlKJGeD>6Pyctm!4Sp0qH9~H^CVen7d&>xAe?KHVMqG| zw$p`-UlR^PPK;Gkci>f15OoAZoB>}8Y?=rkTiMO>>j^28FF7n;hK*7@z!jG_Y6vCY 
zHWnS+6xDtS28W`tcczzg*e8|*F)6%Mu5QNX|V&Anxneg_EJm27{o zqv9!6nkOV#7qD8K`dEKQZTU+1g44vpCDVKLbM0JXvg`2TQ1Cs3n~~K{$lhjx-m*0z z!hFjqpW_ri;m=y}<)Pq5Acj6}4-IPmZ%Bhw^@E^+u};c-fL5q4F-@5W1a#za)R>^0 z`QG{P8BR?`8oNZm4k|pIei;Dbdz8@AVF@cfco0npC;3L!2Pv>P#G*GBT#H(k2@IY* zi^)D#O%>P|{|lZgOogk*-mPcdnJ#*M`D<2ZSGCDU$pnLwvG5!c1E``6H>`^rkgN})6`%D_qyK`8ty88~; z1CC2-eFuY2RlcK(cVx;>RG?7vw^E#75P3ZbNI&(On)X_E&1S|*{JJWyYC_%C z0V)9WT!d!&$GE9t5&FM8w=Y3AwJ*V3p#GIT2n_mcW*86{&F?UYkQ|zQ1WIyW|z*R6!&nR;nskjYvkcD*az8(Dkr=kRo z27d}@+bc}Z-%!K9>dd3UgMgTa{S7te4dx6Ee3yI|7z+c-{Ldo)@lOfV3;7)M-2%4f z?_;a`afQ>=0SgO?1O^cg?8bs+`CrTP|F--avl8My%R2B18`d5?pnAH=1P7K843H71 zqSP1V-V0;6gE}Ee0&7OiJGcrtVS6szd+j?28=4H7W{t)5a%G>BEm%95O|@YpA##B; z){|cMjcSSliUyC0w8F~=VvMPm+R?vb>&qbkW4C|{;%u7x(gf=d(iR*D7SMhocALEY zeSd4weM*@4X;p{@#j2lJO@fetTt#$lYGalvmQrbBnu@d0f1Y@s3Z_FT@Qc4JILCbK zYTM7FZ--9?X4W+pl|5y{xDH2F=GE9&;e-)=|x3CpM5w_$A#o^mtP z%`%ADB7e9qx1=_sa&t>@3CsJnVJHc8PS|F2%7VOxFe=+(81VZDP` zlPAf~Wo9mArct;_ga^ZW8s~Ld@Y)QDN%9@RdR#ZL)NW?;Xl5EO>Uyl*rHI<;G-0QS01)?F?JR^)Jt80WtjIs{EQG%w}vdQ_Ky2O!x7v=A3f3dA_eSzZ@#g z_plJsl+$XMs{*-52kjtXGRDm>(RjQSmX_w}Q}Y>QR1%*};={m={%F#kAXRMH2}**A%EeR4Q|P2xaxX@)krI9rrVyh*&j9m7t;QQA{|&n?Ko24KAAZq?1PC!TZ%fY6|}H zSjqDT&J()(E-Wn{gFn}-htD!m$1G}Q+brpc0T(c4GV|a&&vokTI^|1k0|6@2YAZAD zBQn{1Qzl{N*6C+)SVk=3TT*SJEF845sls{RwR@NJN`UhiC-;uic@<*nEz7(eKX8cn zPsa-Hm2;~<-h7Q^rhF+DC@)$EJEyeOTz;4Bb>{v|wNoX7`#X)+&Fmps5`!<})SR_Y~rc$lk2M~va%_PRJ4zThqd=i9=h zIGask%&5loWB6-(be&01;3@h^O7XOkb?X-ya=!g(`y(H0ycXOY@DPCQ)I|*DOn49v zvqNB{n&RcR+vO_P!Zs2*u+^gA!`+oe-X@zw?HWZ?XC~MgAH%|FE?9qS76pfWAED_K@^i!$nh++Sg_d#^(1!Oo7kd7o^jS4JQ(-#uhV_48O;43XG+h5H# z7KNn(>e9iE{*`OpCQ#idAR|4jE+Cb}Y8X~>Mh6!>oGKhYekK;+cl&XG=ll5fHV56r z6G3Ur#3dwo1#)DB9YfLM;c&GWYY}<_(jSf$#V3exx3yIlgWL(Y54|)Wsuct?-z5@w zfa*9UOC@-GxxUg-?0BNeIPPwD$B7`TJM1+>wuM$}1OKE2bV6~!K`1uO?TP;i|EOqvsFIl;_gwtuU` zVv@rXOB5y3LPi+J%@CogV}C|Q9M9wOEbgP0!h-6MLQXJvZgqb-a7dE zJe0)38cP29cI5ngQRvB5?2P3Z64*_&0REmG=rP3_ik$>6-Qu842Pn`TtXt#^w`yB5 z+faHAA*93_wlK6fWS9LIHHaCr+c6x|ErnsZbzk{ixoPN6LkL|8JOn)Ty 
z8`pBYNt%Dw);&HMru`Ay-8Osc?BG~@kKps?@S|INCz}1K?eHa((vRyEw*5-f{*HP^ zlW|r)p*`UB+lWBo4WRD`$j7GuS*jXvu9}NdC{Kf}U^r2l0~L>`g8JdJEAGq?_rqaA z;+pJG)NB-pJZdN{{S4~$&V_yV`g(>{Bi!5#Rn;8=H`x5$$ za~|5WfnEh~=!v}n+jZa`cH$NY$h+~o@6@z$#&luOHQCe_10dK&OUT^dDi4SgXg_HP zURNx@H9LH@!e<`#P`B8Z4l!2GU2JV%6x%%%UMSm%fzMKGrs@Qp-1CFYdV^z+jU=w#tWPaW=O6*Y;Ix%! z5rbjO>myGf=SHsGh}o#iPxfwHS)#|=16+l>PpIpVrs?#Ze@DciXdIfe8n?5GuCR`- zsb)`P1w>%D>(t3t3I}S<;LgwCT4`E`nV^Us*PA!hL-n*SQN5Um$_eSEAS&22l5_lU zk~IlZQnf{?r{fHB$g4?6CrC&yBDWhwt3E;O&u~7uGf(&Gi~Ci`Ss-p-qutUIJO*BA8JD8AB*4H5{bACQnxT&!;HCtQd=9C4c*ucs&k3>*XhMrg8vg ziWz50X->)&2`n;mIbMu<3S^&LCA>MrCe<&;8>7jOLqpzKX9T{TN{~n8%)pSxKbXW4kTJZ zb!9o#zFEY;j_!s>q>6?eyV;t-yk7zegVY~>Ya-47yaT!IV&Aiux_AiN3169r6kS1p zD6{yP_SPPLN!T$EgZH4%>Lw5MvJ9LD+eC44KIV`bBeI%7D#-iFr1ZfJ0Ss z5CynGShA`6mhl;$kMMo#$q=NQeh~LXB`#mm&z{-D@aMM_PDHtsvwl;%`2!IZL8_8n z>l|R-kAwLc)-wK!IIAr+8|gaSc#2SjbMhlhxSAe)T~48xi!EmTE%!r|x!R@W^uXR$ zlY4#Hb=@s@<2t>&03g&gJO53n>*m=SeMnh%z;XU`5{F=wgQtKzKd5mJ!Kx{Pct|7( z$Dx4A;-0#;Y#Vx;Vgb`SQZQq-*5X=Hb|%hE7T3ufgkEK9b#Pgmz&pfQR+#u)L+Ll< zz8Mm}G;zz;sDlQa<4!wLXdOm|t_+-2Q@lvkkKU1S(H)}=F2Go{=hxCo&#&dE^>sXr z>#P6)4d{R!ZT!+CW`g?b1ev)7dm8m>K?_%#)svis_L$^;;`3xK^r(_eY1}n!f)@-x z&3`qCY=yElgTqI*0La8K)G_IaY{SO>Jy)8Xi_o%j1b~1>6>nM zOvAgi0I=xZT4?NA9J<&52G!w3$rk`A$%8HxA42C*$41f1QmK{8QsuyGR-Q&%m!%tS zt^B39Wf~4WkU@FFHr?69U&(j)r^#6fBEwEs5adxZwRGPHuxT#_!CGqO6_!?1`i5HP zuc$2{DzW)$Z8I`O8!fn+(64Ll#C_nLHITA$06$cSf#dDLzrFc)U_%>V~=OH$8@TYxx#_Y@C_WpA1 zZ;$-5xK$e73Io3hlGR4>kY@7OC{DQj=1P$kNRtf7M%}MC_Itg}4V&>n=(9JE>p#N* zDB}4uVYL0nov+<(o4tr4iV(#wMDvmTc&FIyDaSB+Uelwy7iab8#n*NNR*_>;!jZ>6 zXvd`J$mv2MO-FTaG`WqdChKHN<=K2c=T8=CuDyQ# z+*^aBr}RH`Ze-ha{j7vPm`LJkpNYBv(Z`!Kq8rDXF~^4aAW^F{Ai7h62;@lxF%7d_ z-bhj zt=mb{@`Sxdf(ur+#Gvrf2p5&u=r4H&E_?H^x<_N~Vr1GVv!PKQ6*c>IDk$)rzvwV= z`_`9Bhy(yjAe#QU4C}#se<6G$4NZErzxe?1Uq3u~z11(5ENKrP(0-2DsvTooLOGi2>1$iFL0WF`5=v7O6 zPUdZ84l*nlIYMKFXWsUIa#{b4@*o^QYf;#*Ko)M;(Z6Y$kHCgGdRP#UE7!jq54;S6 zg>0hafrWtOnA$uMIEi(lo_Ux(DNyd6?0Y@UMB+jsm+{d2Ae&~Rs~(U;#SmS1A^eX+ 
z1Mn04Z~sos%y)^>|9M1+{P&0eU}NyGwK>mRby{sE7`%Kz#~!z_ovfnI0BtQlBNH&? z(Y9bsFYrFjCI3B>NYX;KztnOvztJ?CDj-9cV;@gR?GYyzKG#ERI>6#%tV{;;^{y`z z`3Zb4R&YIr&mlk01N$kH-v#2)|E5jY{>ae%LBNkoFwvVxL9gr3E1;SO;O$Pc@p3$n z?S~Nits44n!q4Z-O(%#9Hg@&AnOBLFQvgn6|J3%42)8Bt9aq1}`{#?a1<%b2w#dfU z7SS&CGs3Dm0CuU?pYZe}w8~L39m<59O+vSMt0{t57XwddlD?2y8cB&QfNVY8r5=>IZh?g296KMUpF7}m7hzi36A(-ECTy5n`ei* z<3&>rw!WU```e`v=i!s|=;V~EI>6W7jiI7zzzhAb^%AFOc~{jd&(>uI7oT z1zvsg{DkmYOjc>D#qp1J!sk|I(! z5b%okV^=D`V=Lt3ib7zlF-GkRVw)tINYZ-`uhGZ3Su6ACZeL;Yi**u1kz}V_tWVt6 z(}Yt`?J8EvX~3-i>51sQKOQ*N9cfc~Qa}!}$o@M4gcqq4lni*e-X#;Aj2Ppj9Ux~d zi-FM|se=gSzIRGd&81oJxknJB7>kh+z{~YrPwuE7<+Y63<;~g`>$jY+r{y)(h?ovv z93~o{p}@_#_%%@sN(R^X$dF0rL|vCor_YsiHW8)+1|XlZT45Qd41;Gfn>xW!N8)A- zpziDIZePrjrqTL6t#I}H1kNqqBAG{ie({fjJ6X9q-HZii)8-hLXs6n<4) zf|z878_?WJU1R8})6X-n#lG>J>_+f}5`&=O1JBnZ`C&1=9X|}q{W$0>NiVAQ4JSj= z8G+~{$a*28yk0`BAtZesmxheuwp$BO>3O0njWFuvrQMMpe zrz46#989KR?IvrDl7WRc)V_p54KddCqhiTM-iC^ui~~k}9!j{+DG2|^;m=y9+r9mw z6+!#5pw0?TC6bWf`Si*Y#KWSykhvp7j&jZvF1az%Z}$hor-+qDh|JZTItnR(T`mp# z#xhQow3+lY0~6EcbS^~y98c4V4L;9~(mS+#L>F|@6Cs+o`E=V8M3GZBbjBWWHHYm3 zN8&`{@xCv55ab>}$5bdR3uw}A_Hbc~CF1Hdl-tbGGU)|1`Cf6L3);wlin@#l7->7HkMJ+RCJ$6I|Ii zjke2Qwc^p%^a$moK~pwO*Fn~_a=G~g@*4WE7gBtOc42)uqTE!^hd-Gt?oOq(eaaOT znTC9>r-+ijytsBPcs-j+LfIObPJPKx#;`z0-=@nkLOJ}YA<5J~V3NTN=-1nlpZ1x^ zVAW#8Pd?BuXqoAYiN3D^ilsTqVW(eq5kWZJ$%Sw(Hc>BBP6E|ftQQWkGC4Dw2!36_ z1d{iw4fUntq8L(ykOq{C`pNe5&1#hR)Z4))BD%z;gR35^4Gr?lk9I4qizUv0)q$jp z%Z>pB-(d&+6vdbE!w%K}HMsfWcsTT9M0;@z7fn=n^u4H|KkP@Wh@VD&vd@B6%&sUG z4_G^xk2|M|Y^moGqMfNAwph%Zrp0=F17WP4;7Qi{+}XB#l#D=T25iIS zZ>!279M*oVi8j|+0OsK!SDlQj0;2k%SyH2&*XqG!tJSVu>$@1C);mpIT3D&J{Qix_ z`Ca3AJ)2pvc=X3s^egmGZz@7*gWJk;tmHl6FDQ#?1PfvITTWte8v+CBAT0_8DGz+DSc9K+)G$4Fxcu@(gM8>g_Dnl5VLoKLdPVz}Kf8@GCvyZ`h&<<{zGHIG?s(D78NiTU4k|B#j z+BfQ80!~{i4pC>bXMA>c->Qn~1$lcwZ(#t#=6H5dZs+lMc1dcdl}>7ul)StI9hRVV zQIl;^wCAx5fazx%9K{64H;d|vg)?;>*7aD9^b5*hQcBW9oL;$j^33i;tG8t>!W!Ek ze%3`DwA}o05*MTc1YXGTrCe;@NUOnkhdHG;8@#a@y(zs^eo6W9tn_KiRj5@jyt8mN 
zyA8hR!Mt1!E%F>Gc}Vopfyu_6#&sysH6_w0brGXEfEtWV2=?bYQ7M64K34v%=wO~$ ziaArjxZ~ojKa4j_(O|RVc8|{2_wj&G^M#WWx7F`U(3_I@)<`r`O!}-Unup0am}1@B z5mXH8M~*95Z3f>?v(|=Exy5`Y)NS<-eB4|VmhjD6tfe4Jw2-4{QWm^has_= zFB8T^z1*xMVL!LDjA#RO(R%*w(>cr>xqLKD|~3mTA3CXhYvzLO)u&f7HM~o2hpZQRsJh&abX_^2$=@^#!X+2)+?}Bz*t&wwT;WnMlOdm9@p)P{J(d$8V7}FAc zeZthP<9|IGFur8AAN3*vT83HhTWTo|$#gN0-cDZ)65|UzkKW$@v+(dA2&Mv#h5y&T z&A&Jd79f+b@`=OEe{q-qp#KjD<8)rC8nGqtq}tJC2zzNRUo)>tcce>aYMBT(}e=&v6bXpc7H2eklzS{T*TEL4hg`CMb0B4+USN3BLL8-tULJA+o&nT6QmJ%|0UhlTV{L$80!&M$gQT?}kkn7QmVG5=`izX=g8|4mq#AQ)jr5;P~P zE`A!-e~JzJ@@A}=^<({7YI!8Oe`Yd@VzH0Pszcd*;qB=0&f<%HzsgDe)`2a)Z`8)n z^1h+{#?U_u3{ZlH*pPDjgkZ;iAehw*`n=^VNF|&94C|sh){Zm;*oBoP$k_fBsJ=sazv65;x84Dtu&Cu zC_LH`4;{Hw3NuYC_@KU1Rs+A%8}B?X5A3DWC^N|b)5SqoHb*a>=ka-Vtq&c%zV|Y| zWvhSiz2P7O0=3|u0EqDlvg*uP_yq|;UV^rnoJ&UAq$81p*J8(f7h<=+5`g)EbRkbr11V z9cP3egyfd_Q9#0N-zajm3$c}f+PhHQ5T~cjIjj@rZs^!9box+iBvs{3sIhNrcWfX2 zlgE?vJNLs5bzmHgS47^9Yz!GEZ)dBZA5j+I4*c#_c*^R}pY=%)gj_Hq2gvRcqo_7@ z|56f+IIy@4@n4s&^RXTkN7Q#61ya1hh5?1(Io#Ep4~5Uy38r)j_!2^ngj~<(J6OTY z;@fb>D#~vJN?J)TTOCn`r=*#fE`4N>1pYlA@9+!^7`w;?&oWFf@q|6g#{<`lslf<< z(~}c|D;59d)>c65&4s?EHlBHKS0~&M&D$a)z~6BLyP#KbG4czF6eAQuAXY1#?UhqS z*)@3Bes`pIr`#8mhn}`YKWvF^U=LBphW*;tqXK>(C5PL2n@MF4{a!3^bh(I}<38-ek_0!Rh%sidO+o04z;L%_l-bapy z%~D&YQ|X20{#?Q0d6|Km39D?S>V49>mBcTeZ$AaRo8LOUq-MuFTo0^bD)yL~C;4!p zjP)$KyBN6@vp0SH7TMR(ir*cNEZ!2ZT&hJ2#agX0@{Mua(ZezGTfFgn=sl^!_N z{nZcotrm&=Cn%4iCfTiJhFo)IoEuNaZiH~~Pb%&Q&0iCSnG`9W0>^Q>m_Lm99@H#> z*BncMDD)1!c*d>i7+vCo$Tb)e~)jCopa?Z_^8^C9ZNT z*W1NDzzu`lPPTT7`h;RiyhXlc6f9H)x@8;SClZT&BJuKPrKO7~(>{!D>oR+5slq4O z7Bb#O7vW)PcP`UxI@m?nOCfF=)%oNf4LahFa<&a}*7tJu^KBaS0`&STI+qzXxnih0 zOH;`uLW;v#pMz8uzch!z={0*cqnyBTRNIGvOR1PVjzhmtL%amEGT!HMXpan#(BL)RfV;jaKyQRzm(U9 z<(jpzmOjo8^Wtd2HC#zOE#HwS9j2}?D zIhqgvezqaRa#WrApHsbZ)&2#Mhx8*NkxYU5M|Lt+#2$M*?Ast3R_7%%JM5oGymjUj z0FzlUHTU^1qHoy|X0SH~k1j1br)4LiyGRe!EWU9!WPdkgyp3eybO>QnpMTTUV?A)- zvMS4b&?t%hV`(n$!|Al{TTLcTPz(D`pA{KEAZ!Z-&90$doiMgaKq^`mmeZM9IjUug 
zWN}7D%}6S_IWZ0;<4V3~IpWv?%7Se(6V0-Om$9xMV|Eo2peQLvs@KH#&{kf8$}UeM z@CnEIN`(g&^LLH&Y#asu!0{rY9oZ)wSLI(!lv&XVe%1Gz>u4=qZcE2@ePTn`1Kg7* z<_$PV;9gXHo39V9L;Yl9cLhY(L$l<@?_R4rlVw)B-&;HLkZk}-ITR)|C8cZc6&V|i z8E$gJR?7K;9;9zQwGNcD-zzg3pLdn-Lsh{)*?1z*Ny-8BQV64#KH8lqt~YvsSBv{? zY)dcvk)&v+V}#Cj|1-kP*`KojhzLq$RnofI`kN7UlaoyA{RfNBzI`_K!eT4=AGo%d zxfCgW)SK?LGjoHw0*r6Mj)zvi`H7(ldC>Ei7=jY#QmMd zt7>doL*8C+W2QUt@}S(x($4D^gPoOAp0+k@PL0l_<&QRVhb^HP}54jp3oNa{19d_wL)p*Mpxtv`z*C|0T-FT_8`}p*)VY1~=nNbaPMm( z4#RQ0?Me5L#4b!*=tX?|UO>ct{@yry^BKOnc0=~?_VPm-EQ1C>o?ejFK(Mpd8dqrB zEYZP!5j%P{h>evDEHCn-gT26p)mY;mOXRgkhG51r@rzC*?~x?yQ4Bwe_02ItFz^S~ zo*9+_s?%q(Y9TP045R~k@w&pU6mH1mBaO^Y3I8;T`%9LzI~dC&gZtik zO9}5VITh&&SDV=LD^^MG`n25z8DC*vN?&qab%nqnMCot4#>XOoT#o90F2`?MJCQZt zUPtfAK!7aLZ&YWtIRwq1rc1F!3E~rCXa7fR56j6>We;?KDK0<(lQ51IHCQ+`SMt6G9(>_)>vS@NDd4 z=yWa?Ep>$HmZc$IJ(}#4?F(Y4pJ8Y8^KAHcT6?me0ak1TFODmo`7sqA)YwOhYyz?c zbF$m}ZDjB00J*V~1i)x4>=kqe(ux06`fr2c+04omjMIapF_A}w&vGUy+(XDw`CZRh ziyNNR*eXyR8&NE%<%VaaOJOV3+eVe!lXC;)gm6FgAtW`#-qb#0=(Q4CqaNZpdXKj6 zv^mxP6qoGPARU!|`7ADx3%{@xb&NN^G-U(z%3YK;6+Atf1ayo73bYMe48#EV5|Vx> zd)~l_2l>mlEs5$5rRo!hsxOMAL~377{kJZSS58axVyX;~YNvd@s8Tu}bbyE}1!a zpG0b_tYon!0UJb}-~!LJ2(LQ{WgD~0LEA+Gui>WRE3@)yn3k40Iq?rM?m!8D>-F{;JlJPZ1>R! zm2@>2IAJ%m(cDE%tZSibD!6tR4Y%B;K5P3g&`wzRj|yH8_Cd(}g(bEQH#}+P*KD(w z-Ax}pP++EhM5=`>Lc*O$QHfu7(ZxjldqNlgcpcvl@ot^zG5e7TBNHL@)|W5Z26-@? z1w zju5$B(-fZY^XU^WzA$nwV?Bjw8eE|0mA-8wRK_Fl^fN41|4>3<@D*2~Uv9rvR!%!H0!T;W!!JvyFNxCBG?-)3jphha8OQ1Nb1dln1WO@XP42Vu6IG! 
z0buVyFmp;q`i~X5PX*3vE^l*dYY6R&_#$F@OMWGhRe*H{K3IMm<`d*XfIcv}w^)ecH-X!A_aX5aam+sh97;<&H7;+`2NaU(TcEf3ZrWwqAQl zxP@$W(#AEgT3b}6^-pgv_q=?-^Qv-~1J2IWVsL-47TZ{~gYxboQTDH-Hq`yv>3?%O znES;F&2k_>$;)eUL0nNoVr{W7(fJZsZU5yGDYl%4chX)m_7TY+NVXw=9v17($wr+9 zCyz-YfGr$&MNnip!er}`MxEj)bp;9V)d0xcSG#1(7Wd{S3l-WE%J3mO1uqBQvmW}ZZC4a=N8e*EjEf2D7632#lw(-) z<~p@2k=u&HW709f=>_B1x_I~x1yM4+c-N#hEVH+u);3GT9|wj9r?ZolIoTzWf}zTx ztgB76ch;GH*vUtS9O7Mh6}Fp$54H=Zt8pt~gC$Xx2-d`Djx_zE;d+slkw?WMF2|_~ zYjdKRvcDR`8p8~br|SjsMgg&z97*>y8*WQiv(B)Bxr4om%_J>9PRW5>`C43aA=wpe zO_vvidtdBmTI$KH@%+3{w<*VHk2Eg>gfHM0gO?s`>E2d~vsLp5?IZlewO2c%&sQrJ zn~d*K>1j~(Z;82x!$zAwpN`8*7z2MuaJmag=RQgS6hOLQ#f0ao=$uNqF_hZ6&f+dU0=h| zp~={m=*NrI3rF&2_FGUVkI*X2?{k5^c&km?)%Lq{l5FKdY?Z;51vYEJRld&?EYFNa zF2~a3Gjjm;jKrS1}GK0 zq_btj?Z+gOYANwM9FuoX!tLghws<`WBy4rob4QeE*C(+x#Dzpf)RGoyvFq4GED6G1UKl7Wy6LDQ!(>T<^EBbn) zJw+h_Raj^PK(eJtS4|mu!=ah)Zsp&zpwG~$!{MO7V`X}xb#Tg3l-9yV zdBs@b^4r^z(Lsm8E^9d&QV?{_b=y)9HtLFk2+1<&_o6!J(xm{3B}X zmz54`Bb5k2CDg1MQZeZ7*YK~wog(4im9hA;%h)B%0TS*fSRz-dRlIb#)B4o%BIz;- z=&qEB{W3vIjEegaD7KCln-+PzD`^(*$+8L?3b<>mxF@)QR^pECp+VUvv}K9;Ir95d za1knRb}9nnJmmG43}n=U)VN;}%&PpfB&1WN25Q zHVS!S05HY3@mImIj3eD{RsI5(JRhATF3@yQ&qB z3a8gBRr6a)Urc~1iLrFu)ca>uppyqJ@3cI~=5TkH=tkQ1$h7h)*(3Eb#wO0-c@0(h zp4_r~6&6p_*eR#zWJ3n8r-fDyJB2cy&l>@ACgo$kx-Be;XzVvqd zlmPFFCT3d+I1R3}dh-B2*KWZgt8ykG(zB4bF@HgXGKlR8_u3U&8Hs|qu+AU;9htCT z?P=Pf=x5kLaUHy+M<2XsFaApXPIY&HyhsRn^{fEjAan$^X4SAVi#Tg?UJ-9#S*=dU zfFZq8VYI=Pl?v$GO1R@n17X_?r9qep*DKYDTb&=%$TM8e9~Oc*QUA&t#~#)r2j19S zxdz>c0FwmyD0KmOmnbZT7$&y>#;@I*gQC+a=50)cDgB%*391CIAbVx7qH?-^Y8cN( z%R`%|5jtIZ_@aFpZ?&p+`qK3MG|49^)R|PTG*C&V0ut~F2o9f(mq)J)c4>5A*H>as zXl7L?JU?b&75yHRU64Jan|-<)F)!Gm^Lw2P@H<~)2%&1OW!9Wz^RyiEgww=h{vNW~ z&9{YM2kF~vkRi>&LZjOC9Xkthk0j$kCzg{#-rWB1>LKRq#q)=PBnd9Ec^dMKGfXxu>T1Gr5G}JWd4P+hSK_&vL%x_2+x7k0N9#+R=00XZmt^x>^cuyR={n zQ(8HJxo<*N=0b&q3gXyy4BAf7puY-2Wv^|x{tTe!K#9f~=IaMBk%5fV1HILe?*iHR zcFCH9DaD^n3BA>4h5`nip?~tYb&AzOq3Z&oV*k@XDX9RWgK2+f-p>K06U2Uk2iP?! 
z<_Y5$-b?K9F%p}H-7HL|a*1q3M&QV_WA{;0Z+R?Ne>lx`b}PUITyf~jodnhXdD$Uu z%?$7?ldshIM7D)in_9@9%}JmI3n;}@(_8Cl*yG#|F@_%^W^bM$kfCnK z#%XuYlMdPMvhp32xvzSq&~dJ?yeCF1Bmvn3uAuZ{C+?(%FDRU6(Im^dM}>Ln$t35t z^-8?dTaathkeMghEbU5^hM#5n|@iDnl6JS(R$*I=}KOZ$V$Mjn*qFQ z5~TGf1z}&FO}Hx6Jksjv&A7IFO95ZaWzY3ffAy{FkD+UYQ1ix}fyzl!x-%7akTs28 zj4ho_Xi>WlQYB>8Q0`5j@HNuUHP;Ph&YzGeC^EEOzGGg#i!?VCO8T7L8_SfJ8IKgA5=IdTdqh)5u*h-q44{wwe}!CmJXGBqpOJ7SB4&t~ zL6#P47=>(O-y>veA(ZUtAypT2L_ zlAN#&U$RSD5O*PP8?;zV8@HW8m(~Y6=gyD|SCeJ1`Hz%!vh!Fv*XDiD-`O_T#e_HlObD*{hJ~NS>a$?_y){o3Zk)kK?Pgl*RTY4rY;p&ORNVKk7=Ei$*AHLQxR=%|>owVDg zqvF2W7Mq^0*3}|}X{bn)b>6;pd9w1 zt9>G)Rpa8^vfo9Jz0=a=Agi|XoM0bUuhP;$H7}ZZB!Kx1acpA9kSU8a!eV)#ueLUG zLdz(YN8Kn3<*GCPRB}D;HSxKg52Jprp19=rgf%# zkbN-4;>b78aGT(cN*$>PYLL3wmIO!$mr|N|CP5s_B$|>+$lPyO;+US1*;Z_&bmOn} zGZ`4O;2%&VQ%rqZ0&d2^kS8aEJ(r!&rLV)I<& z@#e$jTQ?g|NP-^+l*G1cS2hd1HP^MSR1W#-P$*e%pfTmKwo2&&CC}dLoke2F-8_Hp zHC8rF2MjG|(#{SuVb&H-w7smL%q?l^Z_4xAkkYl4aH;Eyhvbe{)>o{ZO4!Ymn_utt ziB_qT!^RfJC-GxVH_48Dv*wz5J5|shGdpwgmjU$Nd|e?hls9N|X{Tf93LzqmoiU#)t7)hG`GBr-%iG8r_b3mO&GMr3bVD z?FPI&AW4$dI?3SS-U3w4@b|WocKwd#`Nsq>OyNT^}Zidezzs<*a)hai2U7x#%HDkPOvoxYU_#_dL~6Zh&LxU z>1!xxo5@&=#S2t>E;h&O`d!by=6|y2ldhw%wrsMSRqEcC3VN>u-ze`k$$DyA9-=zDK8QGob|=*L z%uzR(ko?x(d7o35t~u|^$41d?{cKt?{v3CoyZ*@8ZR(A7Y(}-IkG7wQxjo)ZSnFz9 zl`%W2J)+~TV;yjAPON!m@Z%I;-V&CZnEz+0wmRW!6q6z_SAIl@80!0qWWMy_d@lyY#=r@)#r7TL>`a;ZYRkTiujrnkHIw6+m{K2hW@iS?1b zq-1Ifc~Z(y0RRWn0gpq12U${5;+pJPPDU-^JRM5!u=8~6p{g33r4~+lbBezONz!tj2E{xXLTE-iTPB!6+ zq&KFcxNas0QRGAuZw4f(-(kA6PQuVYAoA-wADlKW4br?<%w@;3oW#5)UCB;cJCr#i zspX1rlA+MM_##$G91lTld`kPrSmhVq)aL=)c}UGM?(*xPBKODhV(c4lM3RpmBNQYn zX?}Z=#EilkC37Z8Lmj?n=kMuHzcnewQAVkO5zIfnud>`KUFKsR{BEOLMuo~3ot7>A zLHm}bA6z8a?{u5o@uoM_|AxBcdAv-;nuM#YbNn|0%uid3SVgND^Z!n+}N5w6xFHRx(Xt4zIalKsv!%z?+A}c6LaQn6 zNrJae8LH6iMA_!2xhhbHWq9J_De)d_LXE>jUkJE5jS1l2oxYsgA^)UpCYRXiPw*cd zB6IMsE~4-`o!2d8i$2{{K;Q_-7u_Rp#C;pwNv(Cp@>@H+XK;tAj#iK@Pf5L)b2v00 z8=Kt~+?{*i^O-9Lata1?L?KtT#aN2%<`NH~7W5VwkJ|n5e$z$6+C>?wfVmfhAq5Wi 
zI6bQRz*;zPwdQWz1Lr#=kI(0(d=_0>y8`3(7Te8a>+dhM;GO4Txs^Q_xG@4V+&Xz2 zvh9!Ruv$1Kzg{ClS~ZEAHCwSVASJCocuSgE%HV02#KDJgTUlqc7KV)GR5bW1R<-mj zR76oy2+!;FX6FM zo|B>}Q8~o@`w81L>Y>MleIA489@3Ro+(((`*vyMxY*e553%q!=CARK;;=T~)_@E0oU9FvWyf z>TnrX;n}w~RGjO0s^m>(tuWnQvsGppsU|hdWVY4ZXMEWVHd_x)e-s`e$~BQNhWsUK z991qfa>?bQTbR%L;CX!I&3w;`7dZ0aI#h~=PP!9nqjkgPS9jPsw zi-6?r?_HV-de58uP}Vo;A$**lsoC6M@m(#3fY-Kq)nzIjyQzlgNxnoXs{HZKll)Tf zyU>1j=n29~vFro&tVZZ7ev-F=H$l*LH5!S;r?o17pcfy^hyvK!myS@8%u(kzsLRXnCTN| zIu{144Q1#Eg~BqrfDj6aA{b7z@dgkU3L$JA;2LeL4<3g?I}vA$Pg>aChlzr&bD&V7 ze>ihNIO8TkSV3N%_VdOy|gRr(uj0s_;1qHTj{tkrxXV9{M5e5y$4}h z5<6}b3jc=-HQNw%jfmgW?G@<8k`&m%`3Q!wYZ0)qy^4tR=|i3A5tcuyu zfM*P(f~eBCwPs->MsP#&{ZUqB2hiyRt>^!jTsR7n#`l7>SO~-9=mLCoAORqB4B|sd z7rV#6n5fN!YPVxru_nEM#W6?>5&Ff^wbJT4kbVpjM;Kpd^jnCLRn>(|06)uyr4ez_ z8$?e7$U0`AQI(bG6;_&oSS%!l0MJ;ufEny1+8VgULgGjbl42pjRWN-I9k2nuicI&z z)9BM+{%>6xi4VZX!MfK0A&#Nz5(n#!7qZZGVg4R=a3PM-1=rzg=mpRc2Z`Zn+s;oK zpMNCQyo(jMbg^Q1SFUp1p^u96fMJp!x()YuhJP{fF#KI`E}oIWUvxh!Gk6`miigCJ zG&4vVkZl9^M{&6U{x~FybO1u}0}}^{X{K}G^i)VST60ZiN{eQqYECUdx38Dic3_L+ z2260l4mOe@4x|(a%LTlUUGJ_mDGdzBzndHb_Qmz`7j}7CvJ9 z=Q}6O6|^Q0ISI)j=6^kKw-Q1bc%S^;yo@=+JbXl%zFq!0oxD<4+yXEqGS&tXyn+Km z89uy0SEilYn1%1cr^c*QmU0A{e<+(DVkpD6@Y%s~BE*42|LZ>fN(7=i1T;M03TU4C zy#lv=894CQXLgVRUqg6Cknhld`xL|{eZvnz`T7RR26%b8x&_c*(E%i?Q=tsN gpBwpwQ}9N9EDYQ^1>reOQKqO|xQ*;daGa?B0i5dpga7~l diff --git a/scripts/build_overview_slides.py b/scripts/build_overview_slides.py index 2eb0df5..8ccfded 100644 --- a/scripts/build_overview_slides.py +++ b/scripts/build_overview_slides.py @@ -114,23 +114,19 @@ SLIDES = [ "render": "hbm_topology", }, { - "title": "14. GEMM Utilization + Useful Pipeline Efficiency (load_ref)", - "render": "mac_utilization", + "title": "14. Formula vs Measured Useful Eff (load_ref) — sim validates model", + "render": "mac_utilization_formula_vs_measured", }, { - "title": "15. 
GEMM Utilization + Useful Pipeline Efficiency (ref_ref — both A & B via DMA_R)", - "render": "mac_utilization_ref_ref", - }, - { - "title": "16. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)", + "title": "15. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)", "render": "pipeline_eff_walkthrough", }, { - "title": "17. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)", + "title": "16. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)", "render": "pipeline_eff_walkthrough_largeK", }, { - "title": "18. Useful Pipelined Efficiency (ideal pipeline × GEMM util)", + "title": "17. Useful Pipelined Efficiency (ideal pipeline × GEMM util)", "render": "tflops_table", }, ] @@ -1715,12 +1711,12 @@ def _render_mac_utilization(slide): "Useful eff": "Useful eff %", } - _textbox(slide, 0.4, 1.0, 12.6, 0.70, - f"GEMM util = useful FLOPs ÷ (tile FLOPs × tile count) — pure " - f"shape-vs-tile metric. " - f"Useful eff = (N_tiles × T_stage × GEMM_util) ÷ wall — " - f"useful FLOPs delivered as a fraction of peak over the " - f"ideal-pipelined wall (head + K-loop + inter-(m,n) DMA_W).", + _textbox(slide, 0.4, 1.0, 12.6, 0.80, + "FORMULA-generated (analytical ideal-pipeline model — not " + "simulator data). GEMM util = useful FLOPs ÷ (tile FLOPs × " + "tile count). Useful eff = (N_tiles × T_stage × GEMM_util) " + "÷ wall, where wall = head + K-loop + inter-(m,n) DMA_W. " + "Slide 16 overlays this against measured pe_window_ns.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) _draw_native_bar_chart( @@ -1819,11 +1815,12 @@ def _render_mac_utilization_ref_ref(slide): "Useful eff": "Useful eff % (ref_ref)", } - _textbox(slide, 0.4, 1.0, 12.6, 0.75, - "ref_ref: scheduler issues DMA_R for BOTH A and B every tile. " - "Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH and GEMM stay " - "at 16 ns. 
Pipeline cycle is DMA-bound → useful eff caps near " - "50 % × GEMM_util, regardless of K-loop length.", + _textbox(slide, 0.4, 1.0, 12.6, 0.85, + "FORMULA-generated (ideal-pipeline model, ref_ref variant — " + "not simulator data). Scheduler issues DMA_R for BOTH A and B " + "every tile. Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH " + "and GEMM stay at 16 ns. Pipeline cycle is DMA-bound → useful " + "eff caps near 50 % × GEMM_util, regardless of K-loop length.", size=11, color=COL_MUTED, align=PP_ALIGN.LEFT) _draw_native_bar_chart( @@ -1846,6 +1843,136 @@ def _render_mac_utilization_ref_ref(slide): ) +def _render_mac_utilization_formula_vs_measured(slide): + """Overlay slide: formula useful_eff vs measured useful_eff (load_ref). + + Formula = (N_tiles × T_stage × GEMM_util) / wall_formula × 100 + wall_formula = head + N_tiles·T_stage + inter·DMA_W + Measured = (useful_FLOPs / pe_window_ns) / peak_FLOPs_per_ns × 100 + pe_window_ns from gemm_sweep.json (honest post Option B). + + Agreement validates the analytical model against the simulator. + """ + data = _load_sweep_data() + rows = data["rows"] + if not rows: + _textbox(slide, 0.4, 3.0, 12.6, 1.0, + "No sweep data. 
Run scripts/gemm_sweep.py first.", + size=14, color=COL_RED, align=PP_ALIGN.LEFT) + return + tile = data["tile_sizes"] + TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"] + tile_flops = 2 * TILE_M * TILE_K * TILE_N + + HBM_GBS = 256.0 + bpe = 2 + T_STAGE = 16.0 + D_STAGES = 3 + head_ns = (D_STAGES - 1) * T_STAGE + dma_w_per_pair_ns = (TILE_M * TILE_N * bpe) / HBM_GBS + peak_per_ns = tile_flops / T_STAGE # MAC peak throughput, flops/ns + + by_shape: dict = {} + for r in rows: + if r["variant"] != "load_ref": + continue + by_shape[(r["M"], r["K"], r["N"])] = r + shapes = list(by_shape.keys()) + + shape_labels = [_shape_label(by_shape[k]) for k in shapes] + flagged = [_under_tile(k[0], k[1], k[2], TILE_M, TILE_K, TILE_N) + for k in shapes] + tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes] + + gemm_util_formula: list[float] = [] + gemm_util_measured: list[float] = [] + formula_eff: list[float] = [] + measured_eff: list[float] = [] + for k in shapes: + r = by_shape[k] + M, K, N = r["M"], r["K"], r["N"] + useful = 2 * M * K * N + tiles = r["tile_count_expected"] + gu_formula = useful / (tile_flops * tiles) + gemm_util_formula.append(gu_formula * 100) + + # Measured GEMM util uses the GEMM stage record count from op_log + # (i.e. the actual number of GEMM tiles the simulator ran). 
+ gemm_record_count = ( + r.get("stages", {}).get("GEMM", {}).get("record_count", 0) + or tiles # fallback if stages dict missing for older sweep data + ) + gu_measured = useful / (tile_flops * gemm_record_count) \ + if gemm_record_count > 0 else 0.0 + gemm_util_measured.append(gu_measured * 100) + + m_tiles = (M + TILE_M - 1) // TILE_M + n_tiles = (N + TILE_N - 1) // TILE_N + n_mn = m_tiles * n_tiles + compute_total = tiles * T_STAGE + inter_dma_w = max(0, n_mn - 1) * dma_w_per_pair_ns + wall_formula = head_ns + compute_total + inter_dma_w + feff = (compute_total * gu_formula / wall_formula) * 100 \ + if wall_formula > 0 else 0.0 + formula_eff.append(feff) + + comp_window_ns = r.get("composite_window_ns", 0.0) or 0.0 + if comp_window_ns > 0: + meff = (useful / comp_window_ns / peak_per_ns) * 100 + else: + meff = 0.0 + measured_eff.append(meff) + + series = { + "GEMM util F": gemm_util_formula, + "GEMM util M": gemm_util_measured, + "Formula eff": formula_eff, + "Measured eff": measured_eff, + } + colors_map = { + "GEMM util F": COL_FS, # emerald (formula ceiling) + "GEMM util M": RGBColor(0x6E, 0xE7, 0xB7), # mint (measured ceiling) + "Formula eff": RGBColor(0xF5, 0x9E, 0x0B), # amber (formula eff) + "Measured eff": COL_DMA, # blue (measured eff) + } + display_map = { + "GEMM util F": "GEMM util % (formula)", + "GEMM util M": "GEMM util % (measured, op_log)", + "Formula eff": "Formula useful eff %", + "Measured eff": "Measured useful eff %", + } + + _textbox(slide, 0.4, 1.0, 12.6, 1.10, + "Four series per shape:\n" + " • GEMM util (formula, green): useful_FLOPs ÷ (tile_FLOPs × " + "tile_count_expected) — structural shape-vs-tile ceiling.\n" + " • GEMM util (measured, mint): useful_FLOPs ÷ (tile_FLOPs × " + "GEMM_record_count) — uses the actual GEMM ops the simulator " + "ran. 
Should equal formula → validates plan execution.\n" + " • Formula useful eff (amber): GEMM_util × ideal pipeline efficiency.\n" + " • Measured useful eff (blue): (useful_FLOPs ÷ " + "composite_window_ns) ÷ peak — composite_window_ns covers " + "only the tl.composite pipeline (excludes upfront tl.load).", + size=10, color=COL_MUTED, align=PP_ALIGN.LEFT) + + _draw_native_bar_chart( + slide, + plot_x=1.0, plot_y=2.25, plot_w=10.0, plot_h=3.85, + shape_labels=shape_labels, flagged=flagged, + tile_counts=tile_counts, + series=series, colors_map=colors_map, + display_map=display_map, + wall_clocks=None, + y_label="%", + legend_x=11.4, legend_w=1.85, + foot_note=("Reading: GEMM util F = GEMM util M → simulator ran the " + "expected tile plan. Measured eff uses composite-only " + "window (excludes upfront tl.load), so it isolates " + "pipeline efficiency. Variant: load_ref."), + threshold_line=100.0, + ) + + def _render_tflops_table(slide): """Ideal pipelined pipe_eff: assumes non-blocking tl.load + multi-channel HBM. 
@@ -2282,8 +2409,9 @@ _RENDERERS.update({ "variant_comparison": _render_variant_comparison, "hbm_topology": _render_hbm_topology, "per_op_dma": _render_per_op_dma, - "mac_utilization": _render_mac_utilization, - "mac_utilization_ref_ref": _render_mac_utilization_ref_ref, + "mac_utilization": _render_mac_utilization, + "mac_utilization_ref_ref": _render_mac_utilization_ref_ref, + "mac_utilization_formula_vs_measured": _render_mac_utilization_formula_vs_measured, "tflops_table": _render_tflops_table, "pipeline_eff_walkthrough": _render_pipeline_eff_walkthrough, "pipeline_eff_walkthrough_largeK": _render_pipeline_eff_walkthrough_largeK, diff --git a/scripts/gemm_sweep.py b/scripts/gemm_sweep.py index c991ed6..5400469 100644 --- a/scripts/gemm_sweep.py +++ b/scripts/gemm_sweep.py @@ -179,6 +179,13 @@ def _run_one(M: int, K: int, N: int, topology: str, variant: str = "ref_ref") -> - min(r.t_start for r in pe_records) else: row["pe_window_ns"] = 0.0 + stage_records = [r for r in op_log + if r.params.get("stage_type") in STAGES] + if stage_records: + row["composite_window_ns"] = max(r.t_end for r in stage_records) \ + - min(r.t_start for r in stage_records) + else: + row["composite_window_ns"] = 0.0 return row diff --git a/src/kernbench/components/base.py b/src/kernbench/components/base.py index 336b631..932b9e9 100644 --- a/src/kernbench/components/base.py +++ b/src/kernbench/components/base.py @@ -138,8 +138,16 @@ class PeEngineBase(ComponentBase): env.process(self._forward_txn(env, msg)) def _handle_with_hooks(self, env: simpy.Environment, pe_txn: Any) -> Generator: - """Wrap handle_command with op log hooks on the inner command.""" - self._on_process_start(env, pe_txn.command) + """Wrap handle_command with op log hooks on the inner command. + + Subclasses that need to defer record_start until after a resource + wait (e.g. 
pe_dma's DMA-channel acquire) set + ``_DEFER_RECORD_START = True`` and call + ``self._on_process_start(env, pe_txn.command)`` themselves at the + post-wait moment. record_end still fires here. + """ + if not getattr(self, "_DEFER_RECORD_START", False): + self._on_process_start(env, pe_txn.command) yield from self.handle_command(env, pe_txn) self._on_process_end(env, pe_txn.command) diff --git a/src/kernbench/components/builtin/pe_dma.py b/src/kernbench/components/builtin/pe_dma.py index 7e11b8f..ed3de9e 100644 --- a/src/kernbench/components/builtin/pe_dma.py +++ b/src/kernbench/components/builtin/pe_dma.py @@ -27,6 +27,12 @@ class PeDmaComponent(PeEngineBase): (DmaReadCmd → HBM read, DmaWriteCmd → HBM write) """ + # Defer op_log record_start until AFTER the DMA channel is acquired so + # t_start reflects the serve-start moment (post queueing) rather than + # the queue-enter moment. PeEngineBase._handle_with_hooks consults this + # flag. + _DEFER_RECORD_START = True + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: super().__init__(node, ctx) self._dma_read: simpy.Resource | None = None @@ -80,9 +86,16 @@ class PeDmaComponent(PeEngineBase): path = self.ctx.router.find_path(self._pe_prefix, dst_node) drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes) - # Acquire DMA channel (command issue serialization) + # Acquire DMA channel — held through the entire round-trip so the + # channel models "one DMA in flight per PE per direction" rather + # than just issue-time serialization. This is what makes Option B + # meaningful: t_start = serve-start covers the actual transfer. with dma_res.request() as req: yield req + # Option B: record_start fires AFTER channel acquired, so t_start + # = serve-start (excludes queue wait). _DEFER_RECORD_START=True + # suppresses the auto-start in PeEngineBase._handle_with_hooks. 
+ self._on_process_start(env, cmd) # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly) sub_done = env.event() sub_request = PeDmaMsg( @@ -99,10 +112,8 @@ class PeDmaComponent(PeEngineBase): # Send to next hop (path[0] is pe_dma itself, path[1] is router) if len(path) > 1: yield self.out_ports[path[1]].put(sub_txn.advance()) - # DMA channel released after issue - - # Wait for HBM transfer completion - yield sub_done + # Wait for HBM transfer completion BEFORE releasing the channel. + yield sub_done pe_txn.done.succeed() def _worker(self, env: simpy.Environment) -> Generator: @@ -293,15 +304,17 @@ class PeDmaComponent(PeEngineBase): txn.done.succeed() def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator: - """Pipeline mode: DMA read/write via fabric, then self-route.""" - self._on_process_start(env, token) + """Pipeline mode: DMA read/write via fabric, then self-route. + + Option B: record_start is fired *inside* _do_pipeline_dma, after the + DMA channel is acquired — record_end stays here. + """ yield from self._do_pipeline_dma(env, token) self._on_process_end(env, token) # Self-routing (handle same-component consecutive stages) next_stage = token.advance() while next_stage is not None and next_stage.component == self.node.id: - self._on_process_start(env, token) yield from self._do_pipeline_dma(env, token) self._on_process_end(env, token) next_stage = token.advance() @@ -340,8 +353,13 @@ class PeDmaComponent(PeEngineBase): path = self.ctx.router.find_path(self._pe_prefix, dst_node) drain_ns = self.ctx.compute_drain_ns(path, nbytes) + # Hold dma_res through the full round-trip — one DMA in flight + # per PE per direction — so Option B's t_start (post-acquire) + # bounds the actual transfer interval. with dma_res.request() as req: yield req + # Option B: t_start = post-acquire moment. 
+ self._on_process_start(env, token) sub_done = env.event() sub_request = PeDmaMsg( correlation_id="pipeline", @@ -356,8 +374,11 @@ class PeDmaComponent(PeEngineBase): ) if len(path) > 1: yield self.out_ports[path[1]].put(sub_txn.advance()) - - yield sub_done + yield sub_done + else: + # No-op (nbytes==0 or no ctx): no channel wait, but still record + # so _on_process_end has a matching pending entry to finalise. + self._on_process_start(env, token) def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator: """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition.""" diff --git a/tests/test_pe_pipeline.py b/tests/test_pe_pipeline.py index cb7d7a6..910f5c5 100644 --- a/tests/test_pe_pipeline.py +++ b/tests/test_pe_pipeline.py @@ -307,3 +307,95 @@ def test_pipeline_overlap_within_command(): assert stage_times[(1, "dma")] == 10.0 # tile1 gemm starts when tile0 gemm finishes (serialized at gemm queue) assert stage_times[(1, "gemm")] == 30.0 + + +# ── 6. Option B: pe_dma record_start fires post channel-acquire ──────── + + +def test_pe_dma_record_start_after_channel_acquire(): + """Three back-to-back DMA_READs serialise on pe_dma.cap=1. + + With ``_DEFER_RECORD_START = True`` on PeDmaComponent, each op's + ``t_start`` is captured right after ``yield req`` succeeds. Result: + + - op N's ``(t_end - t_start)`` is the *actual transfer time* — same + across all three ops (no queueing inflation). + - op N+1's ``t_start`` >= op N's ``t_end - epsilon`` (waited for the + previous holder to release the channel before being recorded). + + Counter-example (the bug this fix addresses): if ``record_start`` fired + on command entry, all three ops would share ``t_start == 0`` and the + second/third would show inflated ``t_end - t_start``. 
+ """ + from pathlib import Path + + from kernbench.common.pe_commands import DmaReadCmd, PeInternalTxn, TensorHandle + from kernbench.policy.address.phyaddr import PhysAddr + from kernbench.sim_engine.engine import GraphEngine + from kernbench.topology.builder import load_topology + + TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" + + def _hbm_pa() -> int: + slice_bytes = 48 * (1 << 30) // 8 + pa = PhysAddr.pe_hbm_addr( + sip_id=0, die_id=0, pe_id=0, + pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, + ) + return pa.encode() + + # enable_data=True wires the OpLogger into every component. + engine = GraphEngine(load_topology(TOPOLOGY_PATH), enable_data=True) + pe_dma_id = "sip0.cube0.pe0.pe_dma" + pe_dma = engine._components[pe_dma_id] + env = engine._env + + # Three back-to-back DMA_READ commands fed straight into pe_dma's inbox + # at t=0 so they all race for the cap=1 channel. + handles = [ + TensorHandle(id=f"r{i}", addr=0x1000 + i * 0x1000, + shape=(64, 32), dtype="f16", nbytes=4096) + for i in range(3) + ] + cmds = [ + DmaReadCmd(handle=h, src_addr=_hbm_pa(), nbytes=4096) + for h in handles + ] + txns = [PeInternalTxn(command=c, done=env.event()) for c in cmds] + + def submit_all(): + for txn in txns: + yield pe_dma._inbox.put(txn) + + env.process(submit_all()) + env.run() + + # Pull the three dma_read records out of the op log in order + dma_records = [ + r for r in engine.op_log + if r.op_name == "dma_read" and r.component_id == pe_dma_id + ] + assert len(dma_records) == 3, ( + f"expected 3 dma_read records, got {len(dma_records)}: {dma_records}" + ) + + durations = [r.t_end - r.t_start for r in dma_records] + # All three should have the same actual transfer time within ±1 ns. 
+ base = durations[0] + assert base > 0, f"first dma duration must be positive, got {base}" + for i, d in enumerate(durations): + assert abs(d - base) <= 1.0, ( + f"op {i} duration {d} differs from baseline {base} by >1 ns " + f"— record_start may still be including queue wait" + ) + + # Each subsequent op's t_start must be at or after the previous op's + # t_end (within the 1 ns tolerance asserted below) — i.e. the wait is + # *excluded* from the recorded interval, not folded into it. + for i in range(1, len(dma_records)): + prev_end = dma_records[i - 1].t_end + cur_start = dma_records[i].t_start + assert cur_start >= prev_end - 1.0, ( + f"op {i} t_start={cur_start} began before op {i-1} t_end={prev_end} " + f"— channel was not actually held, fix is incorrect" + )