a7fe785e5f
Extend tl.composite() with an ordered epilogue list. Each op carries
a scope flag: output_tile (the default; runs once per (m,n) before
STORE), k_tile (runs on every K-tile right after GEMM), or kernel.
The plan generator slots MATH stages by scope; pe_math reuses pe_dma's
local-loop pattern so chained epilogues (bias->relu) skip the port
hop. op_log captures per-stage params for telemetry. The topology
gains a gemm->math edge (snapshot test updated).
The API stays backward-compatible: `epilogue=` is opt-in.
Example:
    h = tl.composite(
        op="gemm", a=a, b=b, out_ptr=int(out),
        epilogue=[
            {"op": "dequant", "scale": s_per_k, "scope": "k_tile"},
            {"op": "bias", "bias": bias_vec},
            {"op": "relu"},
            {"op": "scale", "factor": 0.5},
        ],
    )
    tl.wait(h)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
68 lines
5.1 KiB
JSON
68 lines
5.1 KiB
JSON
{
|
|
"permissions": {
|
|
"allow": [
|
|
"Bash(python -c \":*)",
|
|
"Bash(xargs ls -la)",
|
|
"Bash(wc -l /c/Users/ywkang/kernbench/src/kernbench/policy/address/phyaddr.py /c/Users/ywkang/kernbench/src/kernbench/runtime_api/*.py)",
|
|
"Bash(python scripts/gen_placement_diagrams.py)",
|
|
"Bash(python -c:*)",
|
|
"Bash(python -m kernbench.cli.main probe --topology topology.yaml)",
|
|
"Bash(xargs grep -l \"class.*ComponentBase\\\\|class.*DefaultComponent\")",
|
|
"Bash(python -m pytest tests/test_probe.py -v)",
|
|
"Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py -v)",
|
|
"Bash(python -m pytest -o \"addopts=\" --no-header tests/test_intercube_root_center.py)",
|
|
"Bash(python -m pytest -o \"addopts=\" --no-header tests/test_tp_layers.py tests/test_tp_mlp.py)",
|
|
"Bash(git commit -m ' *)",
|
|
"Bash(git stash *)",
|
|
"Bash(python scripts/emit_overview_with_external_ref.py)",
|
|
"Bash(where inkscape *)",
|
|
"Bash(\"/c/Program Files \\(x86\\)/Microsoft/Edge/Application/msedge.exe\" --headless --disable-gpu --screenshot=\"$\\(pwd\\)/docs/diagrams/cube_mesh_view.png\" --window-size=1400,1300 \"file:///$\\(pwd)",
|
|
"Bash(python scripts/build_overview_slides.py)",
|
|
"Bash(git fetch *)",
|
|
"Bash(git pull *)",
|
|
"Bash(python -m pytest --no-header tests/test_allreduce_buffer_kind_sweep.py)",
|
|
"Bash(python -m pytest --no-header tests/test_pe_to_pe_latency.py)",
|
|
"Bash(python -m pytest --no-header tests/test_ipcq_buffer_kind_locations.py -v)",
|
|
"Bash(python -m pytest --no-header tests/test_ipcq_buffer_kind_locations.py tests/test_ipcq_buffer_kind_latency.py tests/test_allreduce_buffer_kind_sweep.py)",
|
|
"Bash(git checkout *)",
|
|
"Bash(python -m pytest --no-header tests/test_ipcq_buffer_kind_latency.py::test_slot_write_latency_orders_tcm_hbm_sram)",
|
|
"Bash(python scripts/emit_ipcq_send_recv_model_plots.py)",
|
|
"Bash(python -m pytest --no-header tests/test_pe_to_pe_latency.py -x)",
|
|
"Bash(python -m pytest --no-header tests/test_pe_to_pe_latency.py tests/test_ipcq_buffer_kind_locations.py tests/test_ipcq_buffer_kind_latency.py tests/test_allreduce_buffer_kind_sweep.py)",
|
|
"Bash(kill %1)",
|
|
"Bash(awk '{print $2}')",
|
|
"Bash(xargs -r kill)",
|
|
"Bash(python scripts/_debug_op_log.py)",
|
|
"Bash(SWEEP_SHAPES=\"16,32,64,128,256\" python scripts/gemm_sweep.py)",
|
|
"Bash(python scripts/plot_gemm_sweep.py)",
|
|
"Bash(python scripts/gemm_sweep.py)",
|
|
"Bash(python scripts/gen_pe_pipeline_diagram.py)",
|
|
"Bash(python scripts/gen_matmul_32x128x32_diagram.py)",
|
|
"Bash(python -m pytest tests/test_pe_pipeline.py -x --tb=short)",
|
|
"Bash(python -m pytest tests/test_pe_pipeline.py tests/test_e2e_pipeline.py tests/test_op_log.py -x --tb=short -q)",
|
|
"Bash(ls -la C:/Users/mukes/.claude/projects/c--Users-mukes-Mukesh-ywkang-git-kernbench2/ 2>&1 | head -20)",
|
|
"Read(//c/Users/mukes/.claude/projects/c--Users-mukes-Mukesh-ywkang-git-kernbench2/**)",
|
|
"Bash(awk 'NR==1812 || NR==1815' C:/Users/mukes/.claude/projects/c--Users-mukes-Mukesh-ywkang-git-kernbench2/e55237ed-5c1f-4a89-a3b9-9b74fec45366.jsonl)",
|
|
"Bash(awk 'NR==1058' C:/Users/mukes/.claude/projects/c--Users-mukes-Mukesh-ywkang-git-kernbench2/e55237ed-5c1f-4a89-a3b9-9b74fec45366.jsonl)",
|
|
"Bash(awk -F: '$1 > 1700 && $1 < 1815 {print $1}')",
|
|
"Bash(awk 'NR==1812' C:/Users/mukes/.claude/projects/c--Users-mukes-Mukesh-ywkang-git-kernbench2/e55237ed-5c1f-4a89-a3b9-9b74fec45366.jsonl)",
|
|
"Bash(awk 'NR>=1815 && NR<=1825' C:/Users/mukes/.claude/projects/c--Users-mukes-Mukesh-ywkang-git-kernbench2/e55237ed-5c1f-4a89-a3b9-9b74fec45366.jsonl)",
|
|
"Bash(awk 'NR>1815' C:/Users/mukes/.claude/projects/c--Users-mukes-Mukesh-ywkang-git-kernbench2/e55237ed-5c1f-4a89-a3b9-9b74fec45366.jsonl)",
|
|
"Bash(awk 'NR==1839' C:/Users/mukes/.claude/projects/c--Users-mukes-Mukesh-ywkang-git-kernbench2/e55237ed-5c1f-4a89-a3b9-9b74fec45366.jsonl)",
|
|
"Bash(git log *)",
|
|
"Bash(python -m pytest tests/test_op_log.py tests/test_pe_components.py tests/test_pe_pipeline.py -x --tb=short)",
|
|
"Bash(python -m pytest tests/test_pe_to_pe_latency.py tests/test_e2e_pipeline.py tests/test_e2e_data.py tests/test_data_executor.py tests/test_pe_dma_ipcq.py -x --tb=short)",
|
|
"Bash(python -m pytest tests/test_pe_pipeline.py::test_pe_dma_record_start_after_channel_acquire -x --tb=long)",
|
|
"Bash(python -m pytest tests/test_pe_pipeline.py::test_pe_dma_record_start_after_channel_acquire -x --tb=short)",
|
|
"Bash(python -m pytest tests/test_op_log.py tests/test_pe_components.py tests/test_pe_pipeline.py tests/test_pe_to_pe_latency.py tests/test_e2e_pipeline.py tests/test_e2e_data.py tests/test_data_executor.py tests/test_pe_dma_ipcq.py --tb=short)",
|
|
"Bash(python -m pytest tests/test_pe_pipeline.py -q)",
|
|
"Bash(python -m pytest tests/test_pe_pipeline.py tests/test_triton_emu.py -q)",
|
|
"Bash(python -m pytest tests/test_composite_epilogue.py -v)"
|
|
],
|
|
"additionalDirectories": [
|
|
"c:\\Users\\mukes\\Mukesh\\ywkang_git\\kernbench2\\tests",
|
|
"C:\\Users\\mukes\\Mukesh\\ywkang_git\\kernbench2\\tests\\pe2pe_latency_plots"
|
|
]
|
|
}
|
|
}
|