diff --git a/.gitignore b/.gitignore index 93a3b88..6b3e645 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ # Auto-generated mesh file cube_mesh.yaml +# Milestone bench output (regenerable: kernbench run --bench milestone-1h-*) +src/kernbench/benches/1H_milestone_output/ + # Python __pycache__/ *.py[cod] diff --git a/CLAUDE.md b/CLAUDE.md index 1b2fa6f..43a7612 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -371,6 +371,13 @@ Concrete forms that Part 1's *Verification Plan* MUST take in this repo: - `kernbench run --device ` runs the benchmark on a single device. - Omitting `--device` runs the benchmark on all devices discovered in the topology (logically parallel). - Device enumeration is handled by the CLI only; benchmarks MUST remain single-device. +- **Eval-bench exception (ADR-0054)**: a *milestone / eval bench* + (`milestone-1h-*`) may drive many configurations and build its own + per-config engines to regenerate a domain's full result + figure set; it + ignores `--device` and submits a sentinel tensor to satisfy the + "must submit ≥1 request" contract (ADR-0045 D4). This is the eval-harness + carve-out to the single-device rule, alongside the ADR-0024 multi-SIP CCL + exception. ## Derived Artifacts (Clarification) diff --git a/docs/adr-ko/ADR-0043-eval-allreduce-harness.md b/docs/adr-ko/ADR-0043-eval-allreduce-harness.md index e0722ac..2a15389 100644 --- a/docs/adr-ko/ADR-0043-eval-allreduce-harness.md +++ b/docs/adr-ko/ADR-0043-eval-allreduce-harness.md @@ -7,6 +7,11 @@ Accepted `tests/sccl/` 평가 하니스를 문서화한다; 구현과 대조 검증 완료 (상수, 파일 집합, 스윕 차원을 교차 확인). +**ADR-0054로 개정됨**: 드라이버 코어, sweep, renderer가 `milestone-1h-ccl` +bench(단일 home)로 이동했다; `tests/sccl/_allreduce_helpers.py`는 이제 거기서 +re-export한다(pytest 전용 param 빌더 + `_run_distributed` wrapper는 로컬 +유지). figure 테스트는 변경 없음. 
+ ## Context ADR-0032는 intercube all-reduce *알고리즘*을 정의하고, ADR-0023/0024/0027은 diff --git a/docs/adr-ko/ADR-0044-eval-gemm-harness.md b/docs/adr-ko/ADR-0044-eval-gemm-harness.md index db21a47..7152d23 100644 --- a/docs/adr-ko/ADR-0044-eval-gemm-harness.md +++ b/docs/adr-ko/ADR-0044-eval-gemm-harness.md @@ -8,6 +8,12 @@ GEMM 평가/특성화 하니스를 문서화한다; 구현과 대조 검증 완 (상수, tile 크기, figure 집합, script↔test 분할을 교차 확인). D5/D6 caveat은 부정확이 아니라 기록된 한계다. +**ADR-0054로 개정됨**: sweep + renderer가 `milestone-1h-gemm` bench(단일 +home)로 이동했다; `scripts/gemm_sweep.py`와 `tests/gemm/`는 이제 거기서 +re-export한다. D1/D2의 "데이터 생성은 수동 script / 무거운 작업은 opt-in"은 +평가-bench 패턴으로 대체된다(하나의 bench가 전부 재생성; +`MILESTONE_FAST=1`은 committed JSON 재사용). + ## Context ADR-0014(PE pipeline)와 ADR-0042(tile-plan generator)는 GEMM *구현*을 diff --git a/docs/adr-ko/ADR-0045-prog-bench-module-contract.md b/docs/adr-ko/ADR-0045-prog-bench-module-contract.md index 22d78fd..59817fc 100644 --- a/docs/adr-ko/ADR-0045-prog-bench-module-contract.md +++ b/docs/adr-ko/ADR-0045-prog-bench-module-contract.md @@ -10,6 +10,10 @@ Accepted (2026-05-21). **bench가 어떻게 등록되고 어떤 함수 시그너처를 따라야 하는가**는 ADR 레벨에 없었음. +**ADR-0054로 확장됨**: D5의 단일 구성 규칙에 세 번째 패턴이 추가된다 — +*평가 bench*(예: `milestone-1h-*`)는 여러 구성을 구동하고, 구성별 자체 엔진을 +빌드하며, D4를 만족시키기 위해 sentinel 텐서를 제출한다. + ## First action (제일 처음에 하는 일) `kernbench.benches` 패키지가 임포트되면 `__init__.py` 가 즉시 diff --git a/docs/adr-ko/ADR-0054-eval-milestone-benches.md b/docs/adr-ko/ADR-0054-eval-milestone-benches.md new file mode 100644 index 0000000..0fb1f26 --- /dev/null +++ b/docs/adr-ko/ADR-0054-eval-milestone-benches.md @@ -0,0 +1,137 @@ +# ADR-0054: 마일스톤 평가 bench — 자기완결적 sweep + figure bench + +## Status + +Accepted (2026-05-22). + +ADR-0044(D1/D2)와 ADR-0045(D5)를 개정하고, ADR-0043/0044의 "로직이 +`scripts/` + `tests/`에 산다" 배치를 대체한다: GEMM/allreduce 평가 +하니스가 이제 사용자가 실행하여 모든 결과 + figure를 재생성하는 +자기완결적 **bench**가 된다. 
+ +## Context + +ADR-0043(allreduce 평가)과 ADR-0044(GEMM 평가)는 각 하니스를 **sweep** +(수동 `scripts/` 드라이버, 또는 allreduce의 경우 parametrized 테스트 +자체) + committed 데이터를 렌더링하는 **figure 테스트**로 분리했다. +따라서 sweep/render 로직은 `scripts/gemm_sweep.py`, +`tests/gemm/_gemm_plot_helpers.py`, `tests/sccl/_allreduce_helpers.py`에 +존재했다. + +마일스톤 요구사항("사용자가 *하나의 bench*를 실행해 모든 결과와 플롯을 +생성하도록 allreduce + GEMM 평가를 리팩터")은 그 배치로는 충족 불가다: +bench는 production 코드이며 **`tests/`를 import할 수 없다**(ADR-0007 레이어 +방향). 평가 로직은 bench에서 닿을 수 있도록 production으로 이동해야 했다. + +선택한 home은 별도 `kernbench.eval` 패키지가 아니라 bench 모듈 자체다. +bench 파일은 임의의 모듈 레벨 코드를 가질 수 있으며, 하니스를 bench로 +합치면 도메인당 파일 하나가 유지되고 패키지 레이어가 하나 줄어든다. + +## Decision + +### D1. 두 마일스톤 bench가 평가 로직을 보유 + +- `src/kernbench/benches/milestone_1h_gemm.py` — GEMM shape×variant sweep + + 세 figure renderer(`scripts/gemm_sweep.py` + + `tests/gemm/_gemm_plot_helpers.py`에서 이동). +- `src/kernbench/benches/milestone_1h_ccl.py` — distributed allreduce + 드라이버, latency + buffer-kind sweep, topology diagram, FSIM 비교, 그리고 + direct-launch 패리티 레퍼런스(`tests/sccl/_allreduce_helpers.py`에서 이동). + +각 파일은 해당 도메인 평가 로직의 **단일 home**이다. + +### D2. "평가 bench" 패턴 (ADR-0045 D5 확장) + +ADR-0045 D5는 bench를 단일 구성(single-SIP, 또는 ADR-0024 multi-SIP CCL +예외)으로 고정했다. 본 ADR은 세 번째 패턴을 추가한다: + +- **평가 bench**는 *여러* 구성을 구동하고 figure를 렌더링할 수 있다. 외부 + `run_bench` 엔진 대신 sweep 지점마다 자체 `GraphEngine` / + `RuntimeContext`를 빌드한다. +- 그러면 외부 ctx에 제출된 handle이 없으므로, bench는 마지막에 + **sentinel 텐서**(`torch.zeros((1, 1), …)`)를 제출하여 `run_bench`의 + "최소 한 번 제출" 계약(ADR-0045 D4)을 만족시키고 CLI가 0으로 종료되게 + 한다. + +### D3. 출력 위치 + +두 bench 모두 `src/kernbench/benches/1H_milestone_output/{gemm,ccl}/`에 +쓴다(사용자 요청 — bench 옆 아티팩트). 디렉터리는 생성된 PNG/CSV/JSON만 +보유하며(`.py`/`__init__.py` 없음), 따라서 eager-import audit(ADR-0045 +첫 동작)이 무시한다 — `pkgutil.iter_modules`는 비-패키지 하위 디렉터리를 +yield하지 않는다. committed `docs/diagrams/` 아티팩트와 달리 +**git-ignore**된다(요청 시 재생성 가능). + +### D4. 
GEMM 무거운 sweep — 기본은 fresh, `MILESTONE_FAST`로 재사용 + +`milestone-1h-gemm`은 기본적으로 전체 24-sim sweep을 실행한다(분 단위; +한 shape는 2048 tile). `MILESTONE_FAST=1`은 committed +`docs/diagrams/gemm_sweep.json`을 재사용하고 렌더링만 한다(초 단위). 이는 +ADR-0044 D1/D2의 "무거운 sweep은 수동/`slow` 단계로 유지"를 뒤집는다: +bench 실행이 곧 재생성이다. 테스트에서 slow 경로는 `@pytest.mark.slow` bench +테스트로 검증되고, fast 경로는 기본 테스트 실행에서 검증된다. + +### D5. 테스트 + 스크립트는 thin re-export shim으로 재사용 (단일 home 유지) + +기존 figure 테스트와 `scripts/gemm_sweep.py` 진입점은 유지되며 이제 bench +모듈을 재사용한다: + +- `tests/gemm/_gemm_plot_helpers.py` → renderer + + `GEMM_SWEEP_JSON`/`GEMM_PLOTS_DIR`/`ROOT`를 + `kernbench.benches.milestone_1h_gemm`에서 re-export. +- `tests/sccl/_allreduce_helpers.py` → 드라이버 코어, config writer, sweep + 상수, renderer, disk aggregator를 `kernbench.benches.milestone_1h_ccl`에서 + re-export하고, **pytest 전용** 조각은 로컬 유지: `pytest.param` 행렬 + (`CONFIGS` / `_sweep_params` / `_bk_params`)과 fixture 결합 + `_run_distributed`(`monkeypatch.chdir` + `_drive_distributed`) wrapper. +- `scripts/gemm_sweep.py` → bench의 `run_sweep` 위 thin wrapper. + +테스트가 bench 모듈을 import하는 것은 허용된다(테스트는 production 위에 +위치, ADR-0007); 이는 전체 패키지 eager audit을 유발하며, 그것은 이미 매 +`kernbench` 실행 시 동작한다. matplotlib는 renderer 내부에서 lazy import로 +유지되어 audit의 startup 비용은 불변이다. + +### D6. 평면 모듈 네이밍 (`benches/` 하위 폴더 없음) + +`1H_milestone…`로 명명된 `benches/` 하위 패키지는 불가능하다 — Python +패키지 이름은 숫자로 시작할 수 없다. 따라서 bench는 평면 모듈 +`milestone_1h_gemm.py` / `milestone_1h_ccl.py`이며 bench 이름은 +`milestone-1h-gemm` / `milestone-1h-ccl`(kebab-case, ADR-0045 D1에 따라 +글자로 시작)이다. + +## Consequences + +### Positive + +- `kernbench run --bench milestone-1h-gemm`(또는 `…-ccl`)이 도메인의 모든 + 결과 + figure를 한 명령으로 재생성한다 — 마일스톤 요구사항. +- 평가 로직의 단일 소스(bench), shim을 통해 테스트와 스크립트가 재사용; + 중복 없음. +- figure 테스트와 `scripts/gemm_sweep.py`는 변경 없이 계속 동작. + +### Negative / limitations + +- 두 bench 파일이 크다(CCL 쪽은 distributed 드라이버, sweep, matplotlib + 드로잉을 섞는다). 대부분 평가 하니스인 "bench"는 이례적이며, 본 ADR이 + 이를 정당화한다.
+- 생성 아티팩트가 명시적 요청에 의해 source tree(`src/kernbench/benches/`) + 안에 산다; 커밋을 피하려 git-ignore. +- `milestone-1h-ccl`(및 기본 `milestone-1h-gemm`)은 분 단위 소요 — + on-demand 마일스톤 아티팩트에는 수용 가능, 일상 실행에는 아님. + +## Dependencies + +- **ADR-0007**: 레이어 방향(테스트는 production을 import할 수 있으나 bench는 + 테스트를 import할 수 없는 이유). +- **ADR-0043 / ADR-0044**: 본 ADR이 bench로 이전하는 allreduce / GEMM 평가 + 하니스. +- **ADR-0045**: bench 모듈 계약; 여기 D2가 그 D5(single-device 규칙)를 + 평가-bench 패턴으로 확장하고, sentinel을 위해 D4(NO_REQUESTS)에 의존. +- **ADR-0024**: allreduce sweep이 구동하는 rank = SIP launcher. + +## Open questions + +- GEMM theoretical 모델 상수(ADR-0044 D5)를 복사 대신 ADR-0033/0014에서 + 소싱해야 하는가? 본 ADR로는 불변. +- `build_overview_slides.py`가 GEMM 막대를 네이티브로 그리는 대신 마일스톤 + 출력 PNG를 소비해야 하는가? 여전히 open(ADR-0044 D6 / Negative). diff --git a/docs/adr-ko/INDEX.md b/docs/adr-ko/INDEX.md index 7ea6702..eadb4cd 100644 --- a/docs/adr-ko/INDEX.md +++ b/docs/adr-ko/INDEX.md @@ -1,6 +1,6 @@ # ADR Index -Auto-generated by `tools/generate_adr_index.py`. Total ADRs: **46**. +Auto-generated by `tools/generate_adr_index.py`. Total ADRs: **47**. Classification mirrors the `/report` skill's section assignment. When adding a new ADR, also add an entry to the `CLASSIFICATION` table in `tools/generate_adr_index.py`. @@ -152,6 +152,7 @@ One subsection per component file under `src/kernbench/components/builtin/`. 
- [ADR-0043](./ADR-0043-eval-allreduce-harness.md) — Allreduce 평가 하니스 — `tests/sccl/` - [ADR-0044](./ADR-0044-eval-gemm-harness.md) — GEMM 평가 하니스 — `scripts/gemm_sweep.py` + `tests/gemm/` +- [ADR-0054](./ADR-0054-eval-milestone-benches.md) — 마일스톤 평가 bench — 자기완결적 sweep + figure bench ### Bench Module Contract diff --git a/docs/adr/ADR-0043-eval-allreduce-harness.md b/docs/adr/ADR-0043-eval-allreduce-harness.md index 5ab0ebb..b6011d0 100644 --- a/docs/adr/ADR-0043-eval-allreduce-harness.md +++ b/docs/adr/ADR-0043-eval-allreduce-harness.md @@ -7,6 +7,11 @@ Accepted Documents the `tests/sccl/` evaluation harness; verified against the implementation (constants, file set, and sweep dimensions cross-checked). +**Amended by ADR-0054**: the driver core, sweeps, and renderers moved into +the `milestone-1h-ccl` bench (single home); `tests/sccl/_allreduce_helpers.py` +now re-exports from it (keeping the pytest-only param builders + +`_run_distributed` wrapper local). The figure tests are unchanged. + ## Context ADR-0032 defines the intercube all-reduce *algorithm*; ADR-0023/0024/0027 diff --git a/docs/adr/ADR-0044-eval-gemm-harness.md b/docs/adr/ADR-0044-eval-gemm-harness.md index 058c851..0a68278 100644 --- a/docs/adr/ADR-0044-eval-gemm-harness.md +++ b/docs/adr/ADR-0044-eval-gemm-harness.md @@ -9,6 +9,12 @@ implementation (constants, tile sizes, figure set, and the script↔test split cross-checked). The D5/D6 caveats are recorded limitations, not inaccuracies. +**Amended by ADR-0054**: the sweep + renderers moved into the +`milestone-1h-gemm` bench (single home); `scripts/gemm_sweep.py` and +`tests/gemm/` now re-export from it. D1/D2's "data generation stays a manual +script / heavy work is opt-in" is superseded by the eval-bench pattern (one +bench regenerates everything; `MILESTONE_FAST=1` reuses the committed JSON). 
+ ## Context ADR-0014 (PE pipeline) and ADR-0042 (tile-plan generators) define the GEMM diff --git a/docs/adr/ADR-0045-prog-bench-module-contract.md b/docs/adr/ADR-0045-prog-bench-module-contract.md index db6331a..826ea69 100644 --- a/docs/adr/ADR-0045-prog-bench-module-contract.md +++ b/docs/adr/ADR-0045-prog-bench-module-contract.md @@ -10,6 +10,10 @@ module must follow. ADR-0010 (CLI surface) specifies the `kernbench list/run` interface, but **how benches are registered and what signature they must follow** had no ADR-level coverage. +**Extended by ADR-0054**: D5's single-config rule gains a third pattern — +the *eval bench* (e.g. `milestone-1h-*`) drives many configs, builds its +own per-config engines, and submits a sentinel tensor to satisfy D4. + ## First action When `kernbench.benches` is imported, `__init__.py` immediately calls diff --git a/docs/adr/ADR-0054-eval-milestone-benches.md b/docs/adr/ADR-0054-eval-milestone-benches.md new file mode 100644 index 0000000..f816a61 --- /dev/null +++ b/docs/adr/ADR-0054-eval-milestone-benches.md @@ -0,0 +1,141 @@ +# ADR-0054: Milestone Eval Benches — self-contained sweep + figure benches + +## Status + +Accepted (2026-05-22). + +Amends ADR-0044 (D1/D2) and ADR-0045 (D5) and supersedes the "logic lives +in `scripts/` + `tests/`" arrangement of ADR-0043/0044: the GEMM and +allreduce evaluation harnesses are now self-contained **benches** that a +user runs to regenerate every result + figure. + +## Context + +ADR-0043 (allreduce eval) and ADR-0044 (GEMM eval) split each harness into +a **sweep** (a manual `scripts/` driver, or — for allreduce — the +parametrized tests themselves) plus **figure tests** that render committed +data. The sweep/render logic therefore lived under `scripts/gemm_sweep.py`, +`tests/gemm/_gemm_plot_helpers.py`, and `tests/sccl/_allreduce_helpers.py`. 
+ +A milestone requirement ("refactor allreduce + GEMM evaluation so a user +can run *one bench* to generate all the results and plots") cannot be met +by that layout: a bench is production code and **must not import from +`tests/`** (ADR-0007 layer direction). The eval logic had to move into +production, reachable from a bench. + +The chosen home is the bench module itself — not a separate +`kernbench.eval` package. A bench file may contain arbitrary module-level +code; collapsing the harness into the bench keeps one file per domain and +avoids an extra package layer. + +## Decision + +### D1. Two milestone benches own the eval logic + +- `src/kernbench/benches/milestone_1h_gemm.py` — GEMM shape×variant sweep + + the three figure renderers (moved from `scripts/gemm_sweep.py` + + `tests/gemm/_gemm_plot_helpers.py`). +- `src/kernbench/benches/milestone_1h_ccl.py` — the distributed allreduce + driver, latency + buffer-kind sweeps, topology diagram, FSIM comparison, + and the direct-launch parity reference (moved from + `tests/sccl/_allreduce_helpers.py`). + +Each file is the **single home** for its domain's eval logic. + +### D2. The "eval bench" pattern (extends ADR-0045 D5) + +ADR-0045 D5 fixed a bench to a single configuration (single-SIP, or the +ADR-0024 multi-SIP CCL exception). This ADR adds a third pattern: + +- An **eval bench** may drive *many* configurations and render figures. It + builds its own per-config `GraphEngine` / `RuntimeContext` instances + (one per sweep point) rather than using the outer `run_bench` engine. +- Because the outer ctx then has no submitted handles, the bench submits a + **sentinel tensor** (`torch.zeros((1, 1), …)`) at the end to satisfy + `run_bench`'s "must submit at least one request" contract (ADR-0045 D4), + so the CLI exits 0. + +### D3. Output location + +Both benches write to `src/kernbench/benches/1H_milestone_output/{gemm,ccl}/` +(per user request — artifacts beside the bench). 
The directory holds only +generated PNG/CSV/JSON (never a `.py`/`__init__.py`), so the eager-import +audit (ADR-0045 first action) ignores it — `pkgutil.iter_modules` does not +yield non-package subdirectories. It is **git-ignored** (regenerable on +demand), unlike the committed `docs/diagrams/` artifacts. + +### D4. GEMM heavy sweep — fresh by default, `MILESTONE_FAST` to reuse + +`milestone-1h-gemm` runs the full 24-sim sweep by default (minutes; one +shape is 2048 tiles). `MILESTONE_FAST=1` reuses the committed +`docs/diagrams/gemm_sweep.json` and only re-renders (seconds). This +reverses ADR-0044 D1/D2's "heavy sweep stays a manual/`slow`-marked step": +running the bench *is* the regeneration. In the test suite, the slow path is +exercised by a `@pytest.mark.slow` bench test; the fast path is tested by default. + +### D5. Tests + script reuse via thin re-export shims (single home kept) + +The pre-existing figure tests and the `scripts/gemm_sweep.py` entry point +are retained and now reuse the bench modules: + +- `tests/gemm/_gemm_plot_helpers.py` → re-exports the renderers + + `GEMM_SWEEP_JSON`/`GEMM_PLOTS_DIR`/`ROOT` from + `kernbench.benches.milestone_1h_gemm`. +- `tests/sccl/_allreduce_helpers.py` → re-exports the driver core, config + writers, sweep constants, renderers, and disk aggregators from + `kernbench.benches.milestone_1h_ccl`, and keeps the **pytest-only** pieces + local: the `pytest.param` matrices (`CONFIGS` / `_sweep_params` / + `_bk_params`) and the fixture-coupled `_run_distributed` + (`monkeypatch.chdir` + `_drive_distributed`) wrapper. +- `scripts/gemm_sweep.py` → thin wrapper over the bench's `run_sweep`. + +Tests importing a bench module is permitted (tests sit above production, +ADR-0007); it triggers the whole-package eager audit, which already runs on +every `kernbench` invocation. matplotlib stays lazily imported inside the +renderers, so the audit's startup cost is unchanged. + +### D6.
Flat module naming (no `benches/` subfolder) + +A `benches/` subpackage named `1H_milestone…` is impossible — a Python +package name cannot start with a digit. The benches are therefore flat +modules `milestone_1h_gemm.py` / `milestone_1h_ccl.py` with bench names +`milestone-1h-gemm` / `milestone-1h-ccl` (kebab-case, letter-first per +ADR-0045 D1). + +## Consequences + +### Positive + +- `kernbench run --bench milestone-1h-gemm` (or `…-ccl`) regenerates all of + a domain's results + figures in one command — the milestone requirement. +- Single source for the eval logic (the bench), reused by tests and the + script via shims; no duplication. +- The figure tests and `scripts/gemm_sweep.py` keep working unchanged. + +### Negative / limitations + +- The two bench files are large (the CCL one mixes the distributed driver, + sweeps, and matplotlib drawing). A "bench" that is mostly an eval harness + is unusual; this ADR legitimizes it. +- Generated artifacts live inside the source tree (`src/kernbench/benches/`) + by explicit request; git-ignored to avoid committing them. +- `milestone-1h-ccl` (and the default `milestone-1h-gemm`) take minutes — + acceptable for an on-demand milestone artifact, not for routine runs. + +## Dependencies + +- **ADR-0007**: layer direction (why tests may import production but a bench + may not import tests). +- **ADR-0043 / ADR-0044**: the allreduce / GEMM eval harnesses this ADR + relocates into benches. +- **ADR-0045**: bench module contract; D2 here extends its D5 (single-device + rule) with the eval-bench pattern, and relies on D4 (NO_REQUESTS) for the + sentinel. +- **ADR-0024**: rank = SIP launcher driven by the allreduce sweeps. + +## Open questions + +- Should the GEMM theoretical-model constants (ADR-0044 D5) be sourced from + ADR-0033/0014 rather than copied? Unchanged by this ADR. +- Should `build_overview_slides.py` consume the milestone output PNGs + instead of drawing GEMM bars natively? Still open (ADR-0044 D6 / Negative). 
diff --git a/docs/adr/INDEX.md b/docs/adr/INDEX.md index 7a6d610..81deee9 100644 --- a/docs/adr/INDEX.md +++ b/docs/adr/INDEX.md @@ -1,6 +1,6 @@ # ADR Index -Auto-generated by `tools/generate_adr_index.py`. Total ADRs: **46**. +Auto-generated by `tools/generate_adr_index.py`. Total ADRs: **47**. Classification mirrors the `/report` skill's section assignment. When adding a new ADR, also add an entry to the `CLASSIFICATION` table in `tools/generate_adr_index.py`. @@ -152,6 +152,7 @@ One subsection per component file under `src/kernbench/components/builtin/`. - [ADR-0043](./ADR-0043-eval-allreduce-harness.md) — Allreduce Evaluation Harness — `tests/sccl/` - [ADR-0044](./ADR-0044-eval-gemm-harness.md) — GEMM Evaluation Harness — `scripts/gemm_sweep.py` + `tests/gemm/` +- [ADR-0054](./ADR-0054-eval-milestone-benches.md) — Milestone Eval Benches — self-contained sweep + figure benches ### Bench Module Contract diff --git a/scripts/gemm_sweep.py b/scripts/gemm_sweep.py index c24909e..b4cfc13 100644 --- a/scripts/gemm_sweep.py +++ b/scripts/gemm_sweep.py @@ -1,237 +1,20 @@ """Sweep GEMM shapes through kernbench and dump PE_accelerator engine times. -For each shape: - - run benches.matmul_composite via the same run_bench path the CLI uses - - read result.engine.op_log - - filter to per-PE engines: pe_dma, pe_fetch_store, pe_gemm, pe_math - - record sum-of-durations (engine occupancy) AND wall-clock active interval +Thin wrapper: the sweep logic now lives in +``kernbench.benches.milestone_1h_gemm`` (the single home, ADR-0054, also the +``milestone-1h-gemm`` bench). This script remains the manual entry point for +regenerating ``docs/diagrams/gemm_sweep.json`` on demand and honors the same +``SWEEP_SHAPES`` / ``SWEEP_TOPOLOGY`` env overrides. 
-Output: docs/diagrams/gemm_sweep.json + python scripts/gemm_sweep.py """ from __future__ import annotations -import json -import os -import sys -import time -from pathlib import Path - -# Default sweep covering under-tile, single-tile, multi-tile, and asymmetric regimes. -# Each entry is either a single integer (square M=K=N=S) or "MxKxN". -# Override via env: SWEEP_SHAPES="16,32,16x2048x16,..." -DEFAULT_SHAPES = [ - "32x32x32", # 1 tile, K=32 < TILE_K=64 → under-tile in K - "32x64x32", # 1 tile, exact single-tile fit - "32x128x32", # 2 tiles, aligned - "32x128x128", # 8 tiles, aligned - "32x3072x32", # 48 tiles, all K-axis (tall-skinny) - "8x128x128", # 8 tiles, but M=8 < TILE_M=32 → MAC array under-fed - "128x8x128", # 16 tiles, but K=8 < TILE_K=64 → MAC array under-fed - "512", # 2048 tiles, fully aligned — "well-pipelined" reference -] - -# Operand-staging variants exercised per shape. -VARIANTS = ["ref_ref", "load_ref", "load_load"] - -# Engines whose timings we collect (component_id suffix match). -ENGINES = ["pe_dma", "pe_fetch_store", "pe_gemm", "pe_math"] - -# Per-stage breakdown labels (StageType enum names from pe_types.py). -STAGES = ["DMA_READ", "DMA_WRITE", "FETCH", "STORE", "GEMM", "MATH"] - -# Scheduler tile sizes (mirror of PeSchedulerComponent.TILE_M/K/N). -TILE_M, TILE_K, TILE_N = 32, 64, 32 - -OUT_PATH = Path(__file__).parent.parent / "docs" / "diagrams" / "gemm_sweep.json" - - -def _engine_wall_ns(records, suffix: str) -> float: - """Wall-clock interval the engine was active (union of overlapping ops).""" - intervals = [(r.t_start, r.t_end) for r in records - if r.component_id.endswith("." 
+ suffix)] - if not intervals: - return 0.0 - intervals.sort() - merged_end = intervals[0][1] - merged_start = intervals[0][0] - total = 0.0 - for s, e in intervals[1:]: - if s <= merged_end: - merged_end = max(merged_end, e) - else: - total += merged_end - merged_start - merged_start, merged_end = s, e - total += merged_end - merged_start - return total - - -def _engine_occupancy_ns(records, suffix: str) -> float: - return sum(r.t_end - r.t_start for r in records - if r.component_id.endswith("." + suffix)) - - -def _engine_count(records, suffix: str) -> int: - return sum(1 for r in records if r.component_id.endswith("." + suffix)) - - -def _stage_occupancy_ns(records, stage_type: str) -> float: - """Sum t_end - t_start over op_log records whose params.stage_type matches. - - Requires op_log records produced post the TileToken stage_type capture - (sim_engine/op_log.py). - """ - return sum( - r.t_end - r.t_start - for r in records - if r.params.get("stage_type") == stage_type - ) - - -def _stage_wall_ns(records, stage_type: str) -> float: - """Interval-union wall-clock for records whose stage_type matches.""" - intervals = sorted( - (r.t_start, r.t_end) for r in records - if r.params.get("stage_type") == stage_type - ) - if not intervals: - return 0.0 - total = 0.0 - cs, ce = intervals[0] - for s, e in intervals[1:]: - if s <= ce: - ce = max(ce, e) - else: - total += ce - cs - cs, ce = s, e - total += ce - cs - return total - - -def _stage_count(records, stage_type: str) -> int: - return sum(1 for r in records if r.params.get("stage_type") == stage_type) - - -def _run_one(M: int, K: int, N: int, topology: str, variant: str = "ref_ref") -> dict: - os.environ["MATMUL_M"] = str(M) - os.environ["MATMUL_K"] = str(K) - os.environ["MATMUL_N"] = str(N) - os.environ["MATMUL_VARIANT"] = variant - - # Late imports so env vars are read by matmul_composite at module load. - # Force re-import to pick up new env values. 
- for mod_name in [m for m in list(sys.modules) if m.startswith("kernbench.benches.matmul_composite")]: - del sys.modules[mod_name] - - from kernbench.benches.registry import resolve as resolve_bench - from kernbench.runtime_api.bench_runner import run_bench - from kernbench.runtime_api.types import resolve_device - from kernbench.sim_engine.engine import GraphEngine - from kernbench.topology.builder import resolve_topology - - topo = resolve_topology(topology) - bench = resolve_bench("matmul-composite").run - device = resolve_device(None) - - t0 = time.time() - result = run_bench( - topology=topo, bench_fn=bench, device=device, - engine_factory=lambda t, d: GraphEngine( - getattr(t, "topology_obj", t), enable_data=True, - ), - ) - wall = time.time() - t0 - - op_log = result.engine.op_log - if not result.completion.ok: - raise RuntimeError(f"bench failed at M={M},K={K},N={N}: {result.completion}") - - # Bytes touched at f16 (2 B): full A + full B + full out (each operand - # streamed once through HBM by the composite plan). - bytes_total = (M * K + K * N + M * N) * 2 - row = { - "M": M, "K": K, "N": N, - "variant": variant, - "flops": 2 * M * K * N, - "bytes_hbm": bytes_total, - "arith_intensity": (2 * M * K * N) / bytes_total, # flops/byte - "tile_count_expected": _ceil(M, TILE_M) * _ceil(N, TILE_N) * _ceil(K, TILE_K), - "sim_wall_clock_s": round(wall, 3), - "engines": {}, - } - for eng in ENGINES: - row["engines"][eng] = { - "occupancy_ns": _engine_occupancy_ns(op_log, eng), - "wall_ns": _engine_wall_ns(op_log, eng), - "record_count": _engine_count(op_log, eng), - } - row["stages"] = {} - for stage in STAGES: - row["stages"][stage] = { - "occupancy_ns": _stage_occupancy_ns(op_log, stage), - "wall_ns": _stage_wall_ns(op_log, stage), - "record_count": _stage_count(op_log, stage), - } - # Kernel-window wall-clock = max t_end - min t_start over PE engine records. - pe_records = [r for r in op_log - if any(r.component_id.endswith("." 
+ e) for e in ENGINES)] - if pe_records: - row["pe_window_ns"] = max(r.t_end for r in pe_records) \ - - min(r.t_start for r in pe_records) - else: - row["pe_window_ns"] = 0.0 - stage_records = [r for r in op_log - if r.params.get("stage_type") in STAGES] - if stage_records: - row["composite_window_ns"] = max(r.t_end for r in stage_records) \ - - min(r.t_start for r in stage_records) - else: - row["composite_window_ns"] = 0.0 - return row - - -def _ceil(a: int, b: int) -> int: - return (a + b - 1) // b +from kernbench.benches.milestone_1h_gemm import run_sweep def main() -> int: - shapes_env = os.environ.get("SWEEP_SHAPES") - raw = (shapes_env.split(",") if shapes_env else DEFAULT_SHAPES) - shapes: list[tuple[int, int, int]] = [] - for s in raw: - s = s.strip() - if not s: - continue - if "x" in s.lower(): - parts = s.lower().split("x") - shapes.append((int(parts[0]), int(parts[1]), int(parts[2]))) - else: - v = int(s) - shapes.append((v, v, v)) - topology = os.environ.get("SWEEP_TOPOLOGY", "topology.yaml") - - rows = [] - for M, K, N in shapes: - for variant in VARIANTS: - print(f"[sweep] M={M} K={K} N={N} variant={variant} ...", flush=True) - row = _run_one(M, K, N, topology, variant=variant) - rows.append(row) - eng_dma = row["engines"]["pe_dma"] - eng_gem = row["engines"]["pe_gemm"] - print(f" tiles={row['tile_count_expected']:>6} " - f"pe_window={row['pe_window_ns']:8.1f}ns " - f"dma_occ={eng_dma['occupancy_ns']:9.1f} " - f"gemm_occ={eng_gem['occupancy_ns']:8.1f} " - f"(sim {row['sim_wall_clock_s']:.1f}s)") - - OUT_PATH.parent.mkdir(parents=True, exist_ok=True) - OUT_PATH.write_text(json.dumps({ - "tile_sizes": {"M": TILE_M, "K": TILE_K, "N": TILE_N}, - "engines": ENGINES, - "stages": STAGES, - "variants": VARIANTS, - "rows": rows, - }, indent=2)) - print(f"\n[sweep] wrote {OUT_PATH}") + run_sweep() return 0 diff --git a/src/kernbench/benches/milestone_1h_ccl.py b/src/kernbench/benches/milestone_1h_ccl.py new file mode 100644 index 0000000..f417755 --- 
/dev/null +++ b/src/kernbench/benches/milestone_1h_ccl.py @@ -0,0 +1,1128 @@ +"""milestone-1h-ccl bench: allreduce evaluation harness (sweeps + figures). + +Self-contained milestone bench (ADR-0054). Holds the distributed driver +core, the direct-launch parity reference, config writers, the sweep / +buffer-kind constants, the plot renderers + disk aggregators (called from +``conftest.pytest_sessionfinish`` via the shim), the topology-diagram +emitter, and the broken-axis FSIM comparison. The ``run(torch)`` entry at +the bottom drives both sweeps in-process and writes every figure + CSV into +``benches/1H_milestone_output/ccl/``. + +This is the single home for the allreduce eval logic: the figure tests +import a thin re-export shim (``tests/sccl/_allreduce_helpers.py``), as do +the IPCQ micro-tests. The pytest-specific param builders (``CONFIGS``, +``_sweep_params``, ``_bk_params``) and the fixture-coupled +``_run_distributed`` wrapper stay in that shim; everything here is +pytest-free. +""" +from __future__ import annotations + +import csv +import importlib +import json +import math +import os +import tempfile +import textwrap +from pathlib import Path +from typing import Any + +import numpy as np +import yaml + +from kernbench.benches.registry import bench +from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config +from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip +from kernbench.policy.placement.dp import DPPolicy +from kernbench.runtime_api.context import RuntimeContext +from kernbench.runtime_api.types import DeviceSelector +from kernbench.sim_engine.engine import GraphEngine +from kernbench.topology.builder import resolve_topology + +_OUTPUT_DIR = Path(__file__).resolve().parent / "1H_milestone_output" / "ccl" + +ROOT = Path(__file__).resolve().parents[3] +TOPOLOGY_PATH = ROOT / "topology.yaml" + +DEFAULT_N_ELEM = 8 + +DEFAULT_OUT_DIR = ROOT / "docs" / "diagrams" / "allreduce_latency_plots" + + +# ── config writers 
def _write_ccl_yaml(tmp_path) -> str:
    """Write a minimal ``ccl.yaml`` into ``tmp_path`` and return that directory.

    The file selects ``lrab_hierarchical_allreduce`` with a TCM buffer and
    the small default slot geometry spelled out in the literal below.

    Returns:
        ``str(tmp_path)`` — the directory, not the file path.
    """
    body = textwrap.dedent("""\
        defaults:
          algorithm: lrab_hierarchical_allreduce
          buffer_kind: tcm
          backpressure: sleep
          n_slots: 4
          slot_size: 4096
          vc_chunk_size: 256
          ipcq_credit_size_bytes: 16

        algorithms:
          lrab_hierarchical_allreduce:
            module: kernbench.ccl.algorithms.lrab_hierarchical_allreduce
            topology: none
            buffer_kind: tcm
            n_elem: 8
            root_cube: 15
    """)
    (Path(tmp_path) / "ccl.yaml").write_text(body, encoding="utf-8")
    return str(tmp_path)


def _write_temp_configs(
    tmp_path, sip_topology, n_sips, algorithm, n_elem_override=None,
    sip_w=None, sip_h=None,
):
    """Write temp topology.yaml and ccl.yaml with the given overrides.

    The topology is loaded from ``TOPOLOGY_PATH`` and the CCL config from
    ``ROOT / "ccl.yaml"``; both are patched in memory and dumped into
    ``tmp_path``.

    Args:
        tmp_path: Directory receiving ``topology.yaml`` and ``ccl.yaml``.
        sip_topology: Value stored under ``system.sips.topology``.
        n_sips: Value stored under ``system.sips.count``.
        algorithm: Algorithm name stored under ``defaults.algorithm``.
        n_elem_override: When not ``None``, stored as the algorithm's
            ``n_elem``; ``defaults.slot_size`` is raised if it is smaller
            than ``n_elem_override * 2`` bytes.
        sip_w, sip_h: Explicit SIP grid dimensions. Written only when both
            are given; otherwise any existing ``w``/``h`` keys are removed.

    Returns:
        ``(topology_path, ccl_path)`` as strings.
    """
    with open(TOPOLOGY_PATH, encoding="utf-8") as f:
        topo_cfg = yaml.safe_load(f)
    sips = topo_cfg["system"]["sips"]
    sips["count"] = n_sips
    sips["topology"] = sip_topology
    if sip_w is not None and sip_h is not None:
        sips["w"] = int(sip_w)
        sips["h"] = int(sip_h)
    else:
        # Drop stale dimensions so they cannot contradict the new topology.
        sips.pop("w", None)
        sips.pop("h", None)
    topo_path = Path(tmp_path) / "topology.yaml"
    with open(topo_path, "w", encoding="utf-8") as f:
        yaml.dump(topo_cfg, f, default_flow_style=False)

    with open(ROOT / "ccl.yaml", encoding="utf-8") as f:
        ccl_cfg = yaml.safe_load(f) or {}
    # Tolerate a ccl.yaml without a ``defaults`` section instead of raising
    # KeyError.
    defaults = ccl_cfg.setdefault("defaults", {})
    defaults["algorithm"] = algorithm
    if n_elem_override is not None:
        n_elem = int(n_elem_override)
        ccl_cfg.setdefault("algorithms", {}).setdefault(
            algorithm, {},
        )["n_elem"] = n_elem
        # Ensure IPCQ slot is big enough for the per-message payload.
        per_msg_bytes = n_elem * 2  # f16
        if per_msg_bytes > int(defaults.get("slot_size", 4096)):
            defaults["slot_size"] = per_msg_bytes
    tmp_ccl = Path(tmp_path) / "ccl.yaml"
    with open(tmp_ccl, "w", encoding="utf-8") as f:
        yaml.dump(ccl_cfg, f, default_flow_style=False)

    return str(topo_path), str(tmp_ccl)


# ── distributed driver (init_process_group → mp.spawn → all_reduce) ────


def _worker(rank: int, n_cubes: int, n_elem: int, n_sips: int, torch) -> None:
    """Per-SIP worker: allocate, fill, all_reduce, verify.

    Each rank fills an ``(n_cubes, n_elem)`` f16 tensor with ``rank + 1``,
    runs a sum all-reduce, and asserts every cube row is close to
    ``n_cubes * (1 + 2 + ... + n_sips)``.

    Raises:
        AssertionError: If any cube's row differs from the expected value
            beyond the 1e-1 relative/absolute tolerance.
    """
    torch.ahbm.set_device(rank)

    dp = DPPolicy(
        cube="row_wise", pe="replicate",
        num_pes=1, num_cubes=n_cubes,
    )
    tensor = torch.zeros(
        (n_cubes, n_elem), dtype="f16", dp=dp,
        name=f"sip{rank}",
    )
    tensor.copy_(torch.from_numpy(
        np.full((n_cubes, n_elem), float(rank + 1), dtype=np.float16)
    ))

    torch.distributed.all_reduce(tensor, op="sum")

    arr = tensor.numpy()
    expected = float(n_cubes * sum(range(1, n_sips + 1)))
    for cube_id in range(n_cubes):
        assert np.allclose(arr[cube_id], expected, rtol=1e-1, atol=1e-1), (
            f"SIP{rank} cube {cube_id}: "
            f"got {arr[cube_id][:4]}, expected {expected}"
        )

    if rank == 0:
        print(f"\n lrab_hierarchical_allreduce (ws={n_sips}): "
              f"{n_sips * n_cubes} OK")


def _crit_ns(engine) -> float:
    """Critical-path latency = max per-result pe_exec_ns over engine results.

    Reads ``engine._results`` (values are 2-tuples whose second item is the
    trace). Non-dict traces are skipped and a missing/falsy ``pe_exec_ns``
    counts as 0.0. Returns 0.0 when no dict trace exists.
    """
    return max(
        (
            float(trace.get("pe_exec_ns", 0.0) or 0.0)
            for _, trace in engine._results.values()
            if isinstance(trace, dict)
        ),
        default=0.0,
    )


def _drive_distributed(topo_path, correlation_id, n_elem):
    """Build engine + run the collective via the full distributed path.

    Returns ``(engine, n_cubes)``. The caller MUST ensure the current
    working directory contains the ``ccl.yaml`` the backend should read
    (``load_ccl_config()`` looks in cwd). See ``_drive_in_dir`` and the
    test shim's ``_run_distributed`` for the two cwd strategies.
    """
    topo = resolve_topology(topo_path)
    engine = GraphEngine(topo.topology_obj, enable_data=True)
    spec = topo.topology_obj.spec
    n_sips = int(spec["system"]["sips"]["count"])
    cm = spec["sip"]["cube_mesh"]
    n_cubes = int(cm["w"]) * int(cm["h"])

    with RuntimeContext(
        engine=engine,
        target_device=DeviceSelector("all"),
        correlation_id=correlation_id,
        spec=spec,
    ) as ctx:
        ctx.distributed.init_process_group(backend="ahbm")
        # Sanity check: the process group must span every SIP in the spec.
        assert ctx.distributed.get_world_size() == n_sips
        ctx.multiprocessing.spawn(
            _worker, args=(n_cubes, n_elem, n_sips, ctx), nprocs=n_sips,
        )
    return engine, n_cubes


def _drive_in_dir(workdir, topo_path, correlation_id, n_elem):
    """``_drive_distributed`` with cwd switched to ``workdir`` (restored)."""
    prev = os.getcwd()
    os.chdir(str(workdir))
    try:
        return _drive_distributed(topo_path, correlation_id, n_elem)
    finally:
        # Always restore the caller's cwd, even when the run raises.
        os.chdir(prev)


# ── correctness config matrix (data only; pytest.param wrapper in shim) ─

# (algorithm, sip_topology, n_sips, sip_w, sip_h) for the correctness test.
+CORRECTNESS_TOPOLOGIES = [ + ("lrab_hierarchical_allreduce", "ring_1d", 6, None, None), + ("lrab_hierarchical_allreduce", "torus_2d", 6, 2, 3), + ("lrab_hierarchical_allreduce", "mesh_2d_no_wrap", 6, 2, 3), +] + + +# ── direct-launch helper (parity reference only) ─────────────────────── + + +def _sip_topo_dims( + sip_topo: str, n_sips: int, + spec_w: int | None = None, spec_h: int | None = None, +) -> tuple[int, int]: + if sip_topo == "ring_1d": + return (0, 0) + if spec_w is not None and spec_h is not None: + if spec_w * spec_h != n_sips: + raise ValueError( + f"sip layout {spec_w}x{spec_h} != n_sips ({n_sips})" + ) + return (spec_w, spec_h) + side = int(round(math.sqrt(n_sips))) + if side * side != n_sips: + raise ValueError( + f"SIP topology '{sip_topo}' requires square n_sips or " + f"explicit w/h in spec, got {n_sips}" + ) + return (side, side) + + +def run_allreduce( + ctx: Any, + engine: Any, + spec: dict, + *, + algorithm: str | None = None, + ccl_yaml: str | None = None, +) -> dict: + """Config-driven allreduce via direct ctx.launch (no distributed wrapper). + + Retained as the parity reference for the distributed path and reused by + the external buffer-kind / root-center micro-tests. 
+ """ + cfg_all = load_ccl_config(ccl_yaml) + cfg = resolve_algorithm_config(cfg_all, algorithm) + + algo_module = importlib.import_module(cfg["module"]) + kernel_fn = algo_module.kernel + topo_name_to_kind = algo_module.TOPO_NAME_TO_KIND + + n_elem = int(cfg.get("n_elem", 8)) + sips_cfg = spec.get("system", {}).get("sips", {}) + n_sips = int(sips_cfg.get("count", 1)) + sip_topo = str(sips_cfg.get("topology", "ring_1d")) + spec_sip_w = sips_cfg.get("w") + spec_sip_h = sips_cfg.get("h") + spec_sip_w = int(spec_sip_w) if spec_sip_w is not None else None + spec_sip_h = int(spec_sip_h) if spec_sip_h is not None else None + + cm = spec["sip"]["cube_mesh"] + cube_w = int(cm["w"]) + cube_h = int(cm["h"]) + n_cubes = cube_w * cube_h + + sip_topo_kind = topo_name_to_kind.get(sip_topo, 0) + sip_topo_w, sip_topo_h = _sip_topo_dims( + sip_topo, n_sips, spec_w=spec_sip_w, spec_h=spec_sip_h, + ) + + algo_name = cfg.get("algorithm", "allreduce") + + configure_sfr_intercube_multisip(engine, spec, cfg) + + dp = DPPolicy( + cube="row_wise", pe="replicate", + num_pes=1, num_cubes=n_cubes, + ) + + tensors = [] + for sip in range(n_sips): + ctx.ahbm.set_device(sip) + t = ctx.zeros( + (n_cubes, n_elem), dtype="f16", dp=dp, + name=f"sip{sip}", + ) + t.copy_(ctx.from_numpy( + np.full((n_cubes, n_elem), float(sip + 1), dtype=np.float16) + )) + tensors.append(t) + + t_start = engine._env.now + + all_pending = [] + for sip_rank, t in enumerate(tensors): + pending = ctx.launch( + algo_name, kernel_fn, t, + n_elem, cube_w, cube_h, n_sips, sip_rank, + sip_topo_kind, sip_topo_w, sip_topo_h, + _defer_wait=True, + ) + all_pending.extend(pending) + + for h, _sip_id, meta in all_pending: + ctx.wait(h, _meta=meta) + + t_end = engine._env.now + latency_ns = t_end - t_start + + expected = float(n_cubes * sum(range(1, n_sips + 1))) + + ok_cubes = 0 + for sip in range(n_sips): + arr = tensors[sip].numpy() + for cube_id in range(n_cubes): + assert np.allclose( + arr[cube_id], expected, rtol=1e-1, 
atol=1e-1, + ), ( + f"SIP{sip} cube {cube_id}: " + f"got {arr[cube_id][:4]}, expected {expected}" + ) + ok_cubes += 1 + + return { + "expected": expected, + "latency_ns": latency_ns, + "ok_cubes": ok_cubes, + } + + +# ── Latency sweep constants + renderer/aggregator ────────────────────── + +# avoid 16 (== n_cubes, dim_map collision). Goes up to 96 KB per PE: +# bytes_per_pe = n_elem * 2 (f16). 49152 elem * 2 = 96 KB / PE. +_SWEEP_N_ELEM = [ + 8, 32, 64, 128, 512, 1024, 2048, + 4096, 8192, 16384, 32768, 49152, +] +_ELEM_BYTES_F16 = 2 + +_SWEEP_TOPOLOGIES = [ + ("lrab_hierarchical_allreduce", "ring_1d", 6, None, None), + ("lrab_hierarchical_allreduce", "torus_2d", 6, 2, 3), + ("lrab_hierarchical_allreduce", "mesh_2d_no_wrap", 6, 2, 3), +] + +# Shared on-disk staging dir for parametrized sweep rows (test path). Each +# parametrized invocation writes one JSON file here; the aggregator (run +# from conftest.pytest_sessionfinish via the shim) reads them and emits the +# combined CSV + PNG plots. 
_SWEEP_OUT_DIR = DEFAULT_OUT_DIR
_SWEEP_ROWS_DIR = _SWEEP_OUT_DIR / "_rows"


def _fmt_bytes(x, _pos):
    """Matplotlib tick formatter: render a byte count as ``B``/``KB``/``MB``.

    Non-positive values render as ``"0"``. ``_pos`` is the tick position
    supplied by ``FuncFormatter`` and is unused.
    """
    if x <= 0:
        return "0"
    if x >= 1024 * 1024:
        return f"{x / (1024 * 1024):.0f} MB"
    if x >= 1024:
        return f"{x / 1024:.0f} KB"
    return f"{x:.0f} B"


def _render_latency_sweep(records: list[dict], out_dir: Path) -> None:
    """Write summary.csv + per-topology latency PNGs into out_dir.

    Args:
        records: One dict per (topology, n_elem) point carrying exactly the
            CSV columns listed below.
        out_dir: Output directory; created if missing.
    """
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter

    bytes_formatter = FuncFormatter(_fmt_bytes)

    # Built once, not per topology: figure titles and output file stems.
    titles = {
        "ring_1d": "AllReduce_LRAB_Ring1D_6SiP(1x6)",
        "torus_2d": "AllReduce_LRAB_2Dtorus_6SiP(2x3)",
        "mesh_2d_no_wrap": "AllReduce_LRAB_2DMesh_6SiP(2x3)",
    }
    # Descriptive output filenames (parens → underscores for
    # markdown/URL safety; topo key stays the summary.csv value).
    file_stems = {
        "ring_1d": "AllReduce_LRAB_Ring1D_6SiP_1x6",
        "torus_2d": "AllReduce_LRAB_2Dtorus_6SiP_2x3",
        "mesh_2d_no_wrap": "AllReduce_LRAB_2DMesh_6SiP_2x3",
    }

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "summary.csv", "w",
              newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "algorithm", "sip_topology", "n_sips", "n_elem",
            "bytes_per_pe", "bytes_per_sip", "latency_ns",
        ])
        writer.writeheader()
        writer.writerows(sorted(records, key=lambda r: (
            r["sip_topology"], r["bytes_per_pe"],
        )))

    # Topology names come from the records, so each group is non-empty.
    for topo_name in sorted({r["sip_topology"] for r in records}):
        rows = sorted(
            (r for r in records if r["sip_topology"] == topo_name),
            key=lambda r: r["bytes_per_pe"],
        )
        title = titles.get(topo_name, f"Allreduce latency — {topo_name}")
        out_stem = file_stems.get(topo_name, topo_name)
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.plot(
            [r["bytes_per_pe"] for r in rows],
            [r["latency_ns"] for r in rows],
            marker="o", color="tab:blue",
        )
        ax.set_xscale("log", base=2)
        ax.set_xlabel("Bytes per PE (log scale)")
        ax.set_ylabel("Time (ns)")
        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        ax.xaxis.set_major_formatter(bytes_formatter)
        fig.tight_layout()
        fig.savefig(out_dir / f"{out_stem}.png", dpi=120)
        plt.close(fig)


def _aggregate_sweep_plots(out_dir: Path | None = None,
                           rows_dir: Path | None = None) -> bool:
    """Read all per-config rows and emit CSV + PNG plots.

    Called by ``conftest.pytest_sessionfinish`` (via the shim, controller
    node only). When no staged row files exist, falls back to re-plotting
    from an existing ``summary.csv`` in ``out_dir``.

    Args:
        out_dir: Output directory; defaults to ``_SWEEP_OUT_DIR``.
        rows_dir: Staging directory of per-config ``*.json`` rows; defaults
            to ``_SWEEP_ROWS_DIR``.

    Returns:
        True if any rows were aggregated, False otherwise.
    """
    out_dir = Path(out_dir) if out_dir is not None else _SWEEP_OUT_DIR
    rows_dir = Path(rows_dir) if rows_dir is not None else _SWEEP_ROWS_DIR

    row_files = sorted(rows_dir.glob("*.json")) if rows_dir.exists() else []
    records: list[dict] = []
    if row_files:
        for p in row_files:
            with open(p, encoding="utf-8") as f:
                records.append(json.load(f))
    else:
        # Fallback: replot from existing summary.csv (skip sweep re-run).
        summary_path = out_dir / "summary.csv"
        if not summary_path.exists():
            return False
        with open(summary_path, encoding="utf-8") as f:
            # CSV values are strings; restore the numeric column types.
            for row in csv.DictReader(f):
                records.append({
                    "algorithm": row["algorithm"],
                    "sip_topology": row["sip_topology"],
                    "n_sips": int(row["n_sips"]),
                    "n_elem": int(row["n_elem"]),
                    "bytes_per_pe": int(row["bytes_per_pe"]),
                    "bytes_per_sip": int(row["bytes_per_sip"]),
                    "latency_ns": float(row["latency_ns"]),
                })
    if not records:
        return False

    _render_latency_sweep(records, out_dir)

    # Combined overview.png is no longer emitted — the broken-y-axis
    # comparison (emit_comparison_fsim_plot() below) supersedes it.
    # Per-topology plots above and summary.csv are still produced.

    # Cleanup row staging dir so a partial future run doesn't pick up
    # stale rows. Best-effort: a file or directory that cannot be removed
    # is left in place.
    for p in row_files:
        try:
            p.unlink()
        except OSError:
            pass
    try:
        rows_dir.rmdir()
    except OSError:
        pass

    print(f"\nWrote per-topology plots + summary.csv to {out_dir} "
          f"from {len(records)} rows")
    return True


# ── Buffer-kind sweep constants + renderer/aggregator ──────────────────
#
# Parametrized over (buffer_kind, n_elem) on torus_2d 6 SIPs (3×2). Pre
# slot-latency modeling the three lines overlap exactly (slot access is
# latency-free today); they spread out once tcm/sram/hbm carry distinct
# access costs.
#
# NOTE(review): the latency-sweep tuples use sip_w=2, sip_h=3 for torus_2d
# while this sweep is described as 3×2, yet both figures are labelled
# "2x3" — confirm the intended w/h orientation.

_BUFFER_KINDS = ["tcm", "sram", "hbm"]
_BK_N_ELEM_GRID = [128, 1024, 8192, 32768]  # 256 B → 64 KB per slot
_BK_ROWS_DIR = _SWEEP_OUT_DIR / "_buffer_kind_rows"
# Descriptive output stem (shared by the .png and .csv).
+_BK_OUT_STEM = "AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM" + + +def _render_buffer_kind(records: list[dict], out_dir: Path) -> None: + """Write the buffer-kind comparison .png + .csv (_BK_OUT_STEM) into out_dir.""" + import matplotlib.pyplot as plt + from matplotlib.ticker import FuncFormatter + + _bytes_fmt = FuncFormatter(_fmt_bytes) + + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + with open(out_dir / f"{_BK_OUT_STEM}.csv", "w", + newline="", encoding="utf-8") as f: + w = csv.DictWriter(f, fieldnames=[ + "buffer_kind", "sip_topology", "n_sips", "n_elem", + "bytes_per_pe", "latency_ns", + ]) + w.writeheader() + for r in sorted(records, key=lambda r: ( + r["buffer_kind"], r["bytes_per_pe"], + )): + w.writerow(r) + + colors = {"tcm": "tab:blue", "sram": "tab:orange", "hbm": "tab:red"} + fig, ax = plt.subplots(figsize=(10, 6)) + for bk in ["tcm", "sram", "hbm"]: + rs = sorted( + [r for r in records if r["buffer_kind"] == bk], + key=lambda r: r["bytes_per_pe"], + ) + if not rs: + continue + ax.plot( + [r["bytes_per_pe"] for r in rs], + [r["latency_ns"] for r in rs], + marker="o", lw=2.0, + color=colors[bk], label=f"buffer_kind = {bk}", + ) + ax.set_xscale("log", base=2) + ax.set_xlabel("Bytes per PE (log scale)") + ax.set_ylabel("Time (ns)") + ax.set_title( + "AllReduce_LRAB_2Dtorus_6SiP(2x3) — IPCQ memory (SRAM, TCM, HBM)" + ) + ax.grid(True, alpha=0.3) + ax.legend() + ax.xaxis.set_major_formatter(_bytes_fmt) + fig.tight_layout() + fig.savefig(out_dir / f"{_BK_OUT_STEM}.png", dpi=130) + plt.close(fig) + + +def aggregate_buffer_kind_plot(out_dir: Path | None = None, + rows_dir: Path | None = None) -> bool: + """Read per-config rows and emit the descriptive .png + .csv (_BK_OUT_STEM). + + Called from conftest.pytest_sessionfinish (via the shim, controller-only). + Returns True if rows were aggregated. 
+ """ + out_dir = Path(out_dir) if out_dir is not None else _SWEEP_OUT_DIR + rows_dir = Path(rows_dir) if rows_dir is not None else _BK_ROWS_DIR + + if not rows_dir.exists(): + return False + row_files = sorted(rows_dir.glob("*.json")) + if not row_files: + return False + + records = [] + for p in row_files: + with open(p, encoding="utf-8") as f: + records.append(json.load(f)) + + _render_buffer_kind(records, out_dir) + + for p in row_files: + try: + p.unlink() + except OSError: + pass + try: + rows_dir.rmdir() + except OSError: + pass + + print(f"\nWrote {out_dir / f'{_BK_OUT_STEM}.png'} " + f"from {len(records)} rows") + return True + + +# ── Topology diagram (device-level + cube-level reduction) ──────────── + +# Convention: "rows × cols" everywhere, row-major rank assignment +# (rank = row * n_cols + col). For the 2×3 inter-SIP grid, this means +# 2 rows × 3 columns: SIP 0 1 2 / SIP 3 4 5. + +_PALETTE_BG = "#fafbfd" +_PALETTE_FRAME = "#3a3f4a" +_PALETTE_BLUE = "#2c6fb6" +_PALETTE_GREEN = "#2e8a4e" +_PALETTE_TEXT = "#1f2530" +_PALETTE_BOX_FILL = "#eaf2fb" +_PALETTE_BOX_EDGE = "#2c4a78" +_PALETTE_ROOT_FILL = "#ffd9b8" +_PALETTE_ROOT_EDGE = "#bd5a14" + + +def _arrow(ax, xy_from, xy_to, color="black", lw=1.4, alpha=1.0, + style="-|>", curve=0.0): + from matplotlib.patches import FancyArrowPatch + arrow = FancyArrowPatch( + xy_from, xy_to, + arrowstyle=style, mutation_scale=12, + color=color, lw=lw, alpha=alpha, + connectionstyle=f"arc3,rad={curve}", + ) + ax.add_patch(arrow) + + +def _draw_sip_box(ax, cx, cy, w, h, label, *, fill=_PALETTE_BOX_FILL, + edge=_PALETTE_BOX_EDGE, text_color=_PALETTE_TEXT, + font=10): + from matplotlib.patches import FancyBboxPatch + box = FancyBboxPatch( + (cx - w / 2, cy - h / 2), w, h, + boxstyle="round,pad=0.02,rounding_size=0.10", + linewidth=1.4, edgecolor=edge, facecolor=fill, + ) + ax.add_patch(box) + ax.text(cx, cy, label, ha="center", va="center", + color=text_color, fontsize=font, fontweight="bold") + + +def _frame_panel(ax, 
title, lim_x=10.0, lim_y=6.0): + """Set up a square-ish panel with a visible outer border.""" + from matplotlib.patches import FancyBboxPatch + ax.set_xlim(0, lim_x) + ax.set_ylim(0, lim_y) + ax.set_aspect("equal") + ax.axis("off") + ax.set_facecolor(_PALETTE_BG) + border = FancyBboxPatch( + (0.05, 0.05), lim_x - 0.10, lim_y - 0.10, + boxstyle="round,pad=0.01,rounding_size=0.12", + linewidth=1.4, edgecolor=_PALETTE_FRAME, facecolor=_PALETTE_BG, + zorder=0, + ) + ax.add_patch(border) + ax.set_title(title, fontsize=12, fontweight="bold", + color=_PALETTE_TEXT, pad=8) + + +def _draw_ring_topology(ax): + _frame_panel(ax, "ring_1d (6 SIPs)", lim_x=10.0, lim_y=6.0) + + xs = [1.2, 2.7, 4.2, 5.7, 7.2, 8.7] + y = 3.1 + box_w, box_h = 1.05, 0.9 + for i, x in enumerate(xs): + _draw_sip_box(ax, x, y, box_w, box_h, f"SIP {i}") + # Forward ring (global_E) — adjacent neighbours, anchored to box edges. + for i in range(5): + _arrow(ax, (xs[i] + box_w / 2, y), + (xs[i + 1] - box_w / 2, y), + color=_PALETTE_BLUE, lw=1.6) + # Wrap (SIP 5 → SIP 0). Anchor at right-CENTER of SIP 5 and + # left-CENTER of SIP 0; arc OUTSIDE (above) the row so it does not + # overlap any of the SIP boxes in between. + _arrow( + ax, + (xs[5] + box_w / 2, y), + (xs[0] - box_w / 2, y), + color=_PALETTE_BLUE, lw=1.6, curve=-0.40, + ) + ax.text(5.0, y + 2.0, "global_E (ring)", ha="center", + color=_PALETTE_BLUE, fontsize=10, style="italic") + ax.text(5.0, y - 1.5, + "(global_W = reverse direction, used by the algorithm)", + ha="center", color="gray", fontsize=8, style="italic") + + +def _draw_grid_topology(ax, kind, *, n_rows=2, n_cols=3): + """kind ∈ {'torus', 'mesh'}. Lays out as n_rows × n_cols (row-major). 
+ + For the sweep we use 2 rows × 3 cols → SIP layout:: + + row 0: SIP 0 SIP 1 SIP 2 + row 1: SIP 3 SIP 4 SIP 5 + """ + title = f"torus_2d ({n_rows}×{n_cols}, 6 SIPs)" if kind == "torus" \ + else f"mesh_2d_no_wrap ({n_rows}×{n_cols}, 6 SIPs)" + _frame_panel(ax, title, lim_x=10.0, lim_y=6.0) + + col_xs = [2.0, 5.0, 8.0] # 3 cols + row_ys = [4.3, 1.8] # 2 rows + box_w, box_h = 1.3, 0.95 + pos: dict[tuple[int, int], tuple[float, float]] = {} + for r in range(n_rows): + for c in range(n_cols): + rank = r * n_cols + c + x, y = col_xs[c], row_ys[r] + pos[(r, c)] = (x, y) + _draw_sip_box(ax, x, y, box_w, box_h, f"SIP {rank}") + + # Row edges (E↔W) — between adjacent columns within each row. + for r in range(n_rows): + for c in range(n_cols - 1): + x0, y0 = pos[(r, c)] + x1, y1 = pos[(r, c + 1)] + _arrow(ax, (x0 + box_w / 2, y0 + 0.10), + (x1 - box_w / 2, y1 + 0.10), + color=_PALETTE_BLUE, lw=1.5) + _arrow(ax, (x1 - box_w / 2, y1 - 0.10), + (x0 + box_w / 2, y0 - 0.10), + color=_PALETTE_BLUE, lw=1.5) + # Col edges (N↔S) — between adjacent rows within each column. + for c in range(n_cols): + for r in range(n_rows - 1): + x0, y0 = pos[(r, c)] + x1, y1 = pos[(r + 1, c)] + _arrow(ax, (x0 - 0.12, y0 - box_h / 2), + (x1 - 0.12, y1 + box_h / 2), + color=_PALETTE_GREEN, lw=1.5) + _arrow(ax, (x1 + 0.12, y1 + box_h / 2), + (x0 + 0.12, y0 - box_h / 2), + color=_PALETTE_GREEN, lw=1.5) + # Wrap arrows for torus only — anchor to the centre of the OUTER + # edge of the end SIPs and arc OUTSIDE the row/column so they do + # not overlap the SIPs in between. + if kind == "torus": + # Row wrap: last col → first col. Top row arcs UP, bottom row + # arcs DOWN, so each wrap sits clearly outside its own row. + for r in range(n_rows): + x0, y0 = pos[(r, 0)] + x1, y1 = pos[(r, n_cols - 1)] + curve = -0.45 if r == 0 else 0.45 + _arrow( + ax, + (x1 + box_w / 2, y1), + (x0 - box_w / 2, y0), + color=_PALETTE_BLUE, lw=1.5, + curve=curve, alpha=0.9, + ) + # Col wrap: last row → first row. 
Leftmost col arcs LEFT, + # rightmost col arcs RIGHT. Middle col(s) get a small inline + # marker + legend note (drawing them through the panel would + # collide with the row arrows). + for c in range(n_cols): + x0, y0 = pos[(0, c)] + x1, y1 = pos[(n_rows - 1, c)] + if c == 0: + curve = 0.55 + elif c == n_cols - 1: + curve = -0.55 + else: + continue # skip middle col — see legend note + _arrow( + ax, + (x1, y1 - box_h / 2), + (x0, y0 + box_h / 2), + color=_PALETTE_GREEN, lw=1.5, + curve=curve, alpha=0.9, + ) + + ax.text(0.7, 5.6, "global_E/W (row)", color=_PALETTE_BLUE, + fontsize=9, style="italic", fontweight="bold") + ax.text(0.7, 5.25, "global_N/S (col)", color=_PALETTE_GREEN, + fontsize=9, style="italic", fontweight="bold") + ax.text(0.7, 4.92, + "wrap = torus" if kind == "torus" else "no wrap = mesh", + color="gray", fontsize=8, style="italic") + if kind == "torus" and n_cols > 2: + ax.text(0.7, 0.3, + "(middle-col wrap omitted for clarity — every row " + "and every column wraps)", + color="gray", fontsize=7.5, style="italic") + + +def _draw_cube_reduction(ax): + """4×4 cube grid inside SIP 0 — compact layout with phase legend.""" + from matplotlib.patches import Rectangle + _frame_panel(ax, "Cube-level reduction inside SIP 0 (4×4 cubes)", + lim_x=10.0, lim_y=6.0) + + cube_w = 0.65 + cube_gap = 0.18 + # Center the 4×4 grid in the left half of the panel. 
+ grid_total = 4 * cube_w + 3 * cube_gap + grid_x0 = 0.7 + grid_y0 = 0.7 + centers: dict[tuple[int, int], tuple[float, float]] = {} + for r in range(4): + for c in range(4): + cx = grid_x0 + c * (cube_w + cube_gap) + cube_w / 2 + cy = grid_y0 + (3 - r) * (cube_w + cube_gap) + cube_w / 2 + centers[(r, c)] = (cx, cy) + cube_id = r * 4 + c + is_root = (r == 3 and c == 3) + face = _PALETTE_ROOT_FILL if is_root else _PALETTE_BOX_FILL + edge = _PALETTE_ROOT_EDGE if is_root else _PALETTE_BOX_EDGE + rect = Rectangle( + (cx - cube_w / 2, cy - cube_w / 2), cube_w, cube_w, + linewidth=1.2, edgecolor=edge, facecolor=face, + ) + ax.add_patch(rect) + label = f"c{cube_id}" + ax.text(cx, cy, label, ha="center", va="center", + fontsize=7.5, fontweight="bold", + color=_PALETTE_ROOT_EDGE if is_root + else _PALETTE_TEXT) + + # Phase 1: row reduce W→E. + for r in range(4): + for c in range(3): + x0, y0 = centers[(r, c)] + x1, y1 = centers[(r, c + 1)] + _arrow(ax, (x0 + cube_w / 2, y0), (x1 - cube_w / 2, y1), + color=_PALETTE_BLUE, lw=1.5) + # Phase 2: col reduce N→S along rightmost column. + for r in range(3): + x0, y0 = centers[(r, 3)] + x1, y1 = centers[(r + 1, 3)] + _arrow(ax, (x0, y0 - cube_w / 2), (x1, y1 + cube_w / 2), + color=_PALETTE_GREEN, lw=1.7) + + # Phase legend on the right side. 
+ legend_x = grid_x0 + grid_total + 0.55 + ax.text(legend_x, 5.0, "Phase 1: row reduce (W → E)", + color=_PALETTE_BLUE, fontsize=10, fontweight="bold") + ax.text(legend_x, 4.55, "Phase 2: col reduce (N → S, rightmost col)", + color=_PALETTE_GREEN, fontsize=10, fontweight="bold") + ax.text(legend_x, 4.10, "Phase 3: inter-SIP exchange at root cube", + color=_PALETTE_ROOT_EDGE, fontsize=10, fontweight="bold") + ax.text(legend_x, 3.65, "Phase 4: col broadcast (S → N)", + color=_PALETTE_GREEN, fontsize=10, style="italic") + ax.text(legend_x, 3.20, "Phase 5: row broadcast (E → W)", + color=_PALETTE_BLUE, fontsize=10, style="italic") + ax.text(legend_x, 2.55, + "(broadcast phases reverse phases 2 & 1)", + color="gray", fontsize=8.5, style="italic") + ax.text(legend_x, 1.7, + "Root cube (c15, bottom-right) is the only\n" + "cube that performs the inter-SIP exchange.", + color=_PALETTE_ROOT_EDGE, fontsize=9, style="italic") + + +def emit_topology_diagram(out_dir: Path | None = None) -> str: + """Emit a 2×2-panel topology diagram into out_dir (default docs/diagrams). 
+ + Top row: ring_1d | torus_2d (2×3) + Bot row: mesh_2d_no_wrap (2×3) | cube-level reduction in SIP 0 + """ + import matplotlib.gridspec as gridspec + import matplotlib.pyplot as plt + + out_dir = Path(out_dir) if out_dir is not None else _SWEEP_OUT_DIR + out_dir.mkdir(parents=True, exist_ok=True) + fig = plt.figure(figsize=(16, 10), facecolor="white") + gs = gridspec.GridSpec(2, 2, figure=fig, hspace=0.30, wspace=0.10) + ax_ring = fig.add_subplot(gs[0, 0]) + ax_torus = fig.add_subplot(gs[0, 1]) + ax_mesh = fig.add_subplot(gs[1, 0]) + ax_cube = fig.add_subplot(gs[1, 1]) + + _draw_ring_topology(ax_ring) + _draw_grid_topology(ax_torus, "torus", n_rows=2, n_cols=3) + _draw_grid_topology(ax_mesh, "mesh", n_rows=2, n_cols=3) + _draw_cube_reduction(ax_cube) + + fig.suptitle( + "Allreduce topology — device-level (top: ring, torus, mesh) " + "and cube-level reduction in SIP 0", + fontsize=14, fontweight="bold", color=_PALETTE_TEXT, y=0.98, + ) + out_path = out_dir / "topology.png" + fig.savefig(out_path, dpi=130, bbox_inches="tight", + facecolor=fig.get_facecolor()) + plt.close(fig) + return str(out_path) + + +# ── Comparison vs FSIM (broken-y-axis) ──────────────────────────────── +# +# Post-processes summary.csv: today's three model curves + a hand-derived +# theoretical torus_2d line in the bottom panel, and a single external FSIM +# single-device reference marker in the top panel (hardcoded 366 µs; no +# external data file). Reads summary.csv written by _render_latency_sweep. + +_FSIM_EXT_LABEL = "FSIM (single device): 366 µs" +_FSIM_EXT_LATENCY_NS = 366_000.0 +_CMP_COLORS = { + "ring_1d": "tab:blue", + "torus_2d": "tab:orange", + "mesh_2d_no_wrap": "tab:green", +} +_CMP_DISPLAY = { + "ring_1d": "Ring 1x6 (6 devices)", + "torus_2d": "2D Torus 2x3 (6 devices)", + "mesh_2d_no_wrap": "2D Mesh 2x3 (6 devices)", +} +# Hand-derived theoretical model for torus_2d (6 SIPs): per-PE NOC-packet +# count fit to the simulated startup + per-packet tau. 
+_CMP_NOC_PACKET_BYTES = 128 +_CMP_PES_PER_CUBE = 8 +_CMP_T_STARTUP_NS = 1346.0 +_CMP_TAU_NS = (8741.0 - 1346.0) / (6144 - 1) + + +def emit_comparison_fsim_plot(out_dir: Path | None = None) -> str | None: + """Render comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png. + + Reads ``summary.csv`` from out_dir (default docs/diagrams). Returns the + output path, or ``None`` if summary.csv is absent / empty. + """ + out_dir = Path(out_dir) if out_dir is not None else _SWEEP_OUT_DIR + csv_path = out_dir / "summary.csv" + if not csv_path.exists(): + return None + records = [] + with open(csv_path, newline="", encoding="utf-8") as f: + for row in csv.DictReader(f): + records.append({ + "sip_topology": row["sip_topology"], + "bytes_per_pe": int(row["bytes_per_pe"]), + "latency_ns": float(row["latency_ns"]), + }) + if not records: + return None + + import matplotlib.pyplot as plt + import matplotlib.ticker as mticker + + def _theoretical_torus_2d_ns(bytes_per_pe: int) -> float: + bytes_per_cube = int(bytes_per_pe) * _CMP_PES_PER_CUBE + n_packets = max(1, -(-bytes_per_cube // _CMP_NOC_PACKET_BYTES)) + return _CMP_T_STARTUP_NS + (n_packets - 1) * _CMP_TAU_NS + + def _bytes_fmt2(x, _pos): + if x >= 1024 * 1024: + return f"{x / (1024 * 1024):.0f}M" + if x >= 1024: + return f"{x / 1024:.0f}K" + return f"{int(x)}" + + topologies = sorted({r["sip_topology"] for r in records}) + max_local = max(r["latency_ns"] for r in records) + ext_x = max(r["bytes_per_pe"] for r in records) + + fig, (ax_top, ax_bot) = plt.subplots( + 2, 1, sharex=True, + gridspec_kw={"height_ratios": [1, 4], "hspace": 0.05}, + figsize=(9, 6.5), + ) + + # Bottom panel: model curves + theoretical torus, linear y. 
+ for topo in topologies: + rs = sorted([r for r in records if r["sip_topology"] == topo], + key=lambda r: r["bytes_per_pe"]) + if not rs: + continue + ax_bot.plot( + [r["bytes_per_pe"] for r in rs], + [r["latency_ns"] for r in rs], + marker="o", label=_CMP_DISPLAY.get(topo, topo), + color=_CMP_COLORS.get(topo), + ) + torus_rs = sorted( + [r for r in records if r["sip_topology"] == "torus_2d"], + key=lambda r: r["bytes_per_pe"], + ) + if torus_rs: + ax_bot.plot( + [r["bytes_per_pe"] for r in torus_rs], + [_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs], + color="tab:red", linestyle="--", linewidth=1.6, marker="x", + label="Theoretical 2D Torus 2x3", + ) + ax_bot.set_ylim(0, max_local * 1.10) + + # Top panel: external FSIM single-device reference marker. + ax_top.scatter( + [ext_x], [_FSIM_EXT_LATENCY_NS], + marker="*", s=240, color="tab:red", zorder=5, + label=_FSIM_EXT_LABEL, + ) + ax_top.set_ylim(_FSIM_EXT_LATENCY_NS * 0.93, _FSIM_EXT_LATENCY_NS * 1.05) + + # Hide spine between panels; draw diagonal break ticks. 
+ ax_top.spines["bottom"].set_visible(False) + ax_bot.spines["top"].set_visible(False) + ax_top.tick_params(labeltop=False, bottom=False) + ax_bot.xaxis.tick_bottom() + d = 0.012 + kw = dict(transform=ax_top.transAxes, color="k", clip_on=False, lw=1) + ax_top.plot((-d, +d), (-d, +d), **kw) + ax_top.plot((1 - d, 1 + d), (-d, +d), **kw) + kw.update(transform=ax_bot.transAxes) + ax_bot.plot((-d, +d), (1 - d * 4, 1 + d * 4), **kw) + ax_bot.plot((1 - d, 1 + d), (1 - d * 4, 1 + d * 4), **kw) + + ax_bot.set_xscale("log", base=2) + ax_bot.set_xlabel("Bytes per PE (log scale)") + ax_bot.set_ylabel("Time (ns)") + ax_top.set_ylabel("Time (ns)") + ax_bot.grid(True, alpha=0.3) + ax_top.grid(True, alpha=0.3) + ax_bot.xaxis.set_major_formatter(mticker.FuncFormatter(_bytes_fmt2)) + + handles_bot, labels_bot = ax_bot.get_legend_handles_labels() + handles_top, labels_top = ax_top.get_legend_handles_labels() + ax_bot.legend(handles_bot + handles_top, labels_bot + labels_top, + loc="upper left") + + fig.suptitle("Multidevice allreduce (ring, Mesh, 2DTorus) vs FSIM latency") + fig.tight_layout() + out = out_dir / "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png" + fig.savefig(out, dpi=120) + plt.close(fig) + return str(out) + + +# ── full eval (bench entry point) ────────────────────────────────────── + + +def run_full_allreduce_eval(out_dir: Path | str, workdir: Path | str) -> dict: + """Drive both sweeps in-process and render every allreduce artifact. + + Writes per-topology latency PNGs + summary.csv, the buffer-kind PNG + + csv, the topology diagram, and the FSIM comparison into ``out_dir``. + ``workdir`` holds the per-config temp topology.yaml / ccl.yaml. Returns + a small dict of counts for reporting. + """ + out_dir = Path(out_dir) + workdir = Path(workdir) + out_dir.mkdir(parents=True, exist_ok=True) + workdir.mkdir(parents=True, exist_ok=True) + + # 1. Latency sweep (n_elem × topology). 
+ latency_records: list[dict] = [] + for algorithm, sip_topology, n_sips, sip_w, sip_h in _SWEEP_TOPOLOGIES: + for n_elem in _SWEEP_N_ELEM: + sub = workdir / f"lat_{sip_topology}_{n_elem}" + sub.mkdir(parents=True, exist_ok=True) + topo_path, _ = _write_temp_configs( + sub, sip_topology, n_sips, algorithm, + sip_w=sip_w, sip_h=sip_h, n_elem_override=n_elem, + ) + engine, n_cubes = _drive_in_dir( + sub, topo_path, + f"sweep_{algorithm}_{sip_topology}_{n_elem}", n_elem, + ) + crit_ns = _crit_ns(engine) + latency_records.append({ + "algorithm": algorithm, + "sip_topology": sip_topology, + "n_sips": n_sips, + "n_elem": n_elem, + "bytes_per_pe": n_elem * _ELEM_BYTES_F16, + "bytes_per_sip": n_cubes * n_elem * _ELEM_BYTES_F16, + "latency_ns": crit_ns, + }) + _render_latency_sweep(latency_records, out_dir) + + # 2. Buffer-kind sweep (tcm/sram/hbm × n_elem) on torus_2d 3×2. + bk_records: list[dict] = [] + for bk in _BUFFER_KINDS: + for n_elem in _BK_N_ELEM_GRID: + sub = workdir / f"bk_{bk}_{n_elem}" + sub.mkdir(parents=True, exist_ok=True) + topo_path, ccl_path = _write_temp_configs( + sub, "torus_2d", 6, "lrab_hierarchical_allreduce", + sip_w=3, sip_h=2, n_elem_override=n_elem, + ) + with open(ccl_path) as f: + ccl_cfg = yaml.safe_load(f) + ccl_cfg.setdefault("defaults", {})["buffer_kind"] = bk + ccl_cfg.setdefault("algorithms", {}).setdefault( + "lrab_hierarchical_allreduce", {}, + )["buffer_kind"] = bk + with open(ccl_path, "w") as f: + yaml.dump(ccl_cfg, f, default_flow_style=False) + engine, _ = _drive_in_dir( + sub, topo_path, f"bk_sweep_{bk}_{n_elem}", n_elem, + ) + bk_records.append({ + "buffer_kind": bk, + "sip_topology": "torus_2d", + "n_sips": 6, + "n_elem": n_elem, + "bytes_per_pe": n_elem * _ELEM_BYTES_F16, + "latency_ns": _crit_ns(engine), + }) + _render_buffer_kind(bk_records, out_dir) + + # 3. Topology diagram (pure matplotlib). + emit_topology_diagram(out_dir) + + # 4. FSIM comparison (reads the summary.csv just written). 
+ emit_comparison_fsim_plot(out_dir) + + return { + "latency_rows": len(latency_records), + "buffer_kind_rows": len(bk_records), + "out_dir": str(out_dir), + } + + +# ── bench entry ──────────────────────────────────────────────────────── + + +@bench( + name="milestone-1h-ccl", + description="1H milestone: regenerate all allreduce results + figures.", +) +def run(torch) -> None: + """Drive the full allreduce eval and write every figure to the output dir. + + Eval runs its own per-config engines (many topologies), so the outer + ``run_bench`` ctx has no handles; we submit a sentinel tensor at the end + to satisfy the "must submit at least one request" contract (ADR-0045 D4). + """ + with tempfile.TemporaryDirectory(prefix="milestone_ccl_") as tmp: + summary = run_full_allreduce_eval(_OUTPUT_DIR, tmp) + print(f" milestone-1h-ccl: {summary['latency_rows']} latency rows, " + f"{summary['buffer_kind_rows']} buffer-kind rows -> " + f"{summary['out_dir']}") + + torch.zeros( + (1, 1), dtype="f16", + dp=DPPolicy(cube="row_wise", pe="replicate", num_cubes=1, num_pes=1), + name="milestone_ccl_sentinel", + ) diff --git a/src/kernbench/benches/milestone_1h_gemm.py b/src/kernbench/benches/milestone_1h_gemm.py new file mode 100644 index 0000000..4efff31 --- /dev/null +++ b/src/kernbench/benches/milestone_1h_gemm.py @@ -0,0 +1,568 @@ +"""milestone-1h-gemm bench: GEMM evaluation harness (sweep + figures). + +Self-contained milestone bench (ADR-0054). Holds the shape×variant sweep +and the figure renderers; the ``run(torch)`` entry at the bottom runs the +sweep (or reuses the committed JSON when ``MILESTONE_FAST=1``) and writes +every figure into ``benches/1H_milestone_output/gemm/``. + +This is the single home for the GEMM eval logic: the figure tests import a +thin re-export shim (``tests/gemm/_gemm_plot_helpers.py``), as does the +``scripts/gemm_sweep.py`` wrapper. 
+ +The sweep drives ``matmul-composite`` across shapes×variants through the +same ``run_bench`` path the CLI uses, harvests ``result.engine.op_log``, +and writes the sweep JSON. The renderers read that JSON and emit matplotlib +PNGs. No simulation in the renderers — they are fast. + +Chart set (mirrors the GEMM MAC slides in scripts/build_overview_slides.py): + - stage breakdown (load_ref operand staging) + - MAC utilization — measured (load_ref) + - MAC utilization — theoretical vs measured (load_ref) +""" +from __future__ import annotations + +import json +import os +import sys +import time +from pathlib import Path + +from kernbench.benches.registry import bench +from kernbench.policy.placement.dp import DPPolicy + +ROOT = Path(__file__).resolve().parents[3] +DEFAULT_SWEEP_JSON = ROOT / "docs" / "diagrams" / "gemm_sweep.json" +DEFAULT_PLOTS_DIR = ROOT / "docs" / "diagrams" / "gemm_plots" +_OUTPUT_DIR = Path(__file__).resolve().parent / "1H_milestone_output" / "gemm" + +# ── sweep configuration ──────────────────────────────────────────────── + +# Default sweep covering under-tile, single-tile, multi-tile, and asymmetric +# regimes. Each entry is "MxKxN" or a single int (square M=K=N). +# Override via env: SWEEP_SHAPES="16,32,16x2048x16,..." +DEFAULT_SHAPES = [ + "32x32x32", # 1 tile, K=32 < TILE_K=64 → under-tile in K + "32x64x32", # 1 tile, exact single-tile fit + "32x128x32", # 2 tiles, aligned + "32x128x128", # 8 tiles, aligned + "32x3072x32", # 48 tiles, all K-axis (tall-skinny) + "8x128x128", # 8 tiles, but M=8 < TILE_M=32 → MAC array under-fed + "128x8x128", # 16 tiles, but K=8 < TILE_K=64 → MAC array under-fed + "512", # 2048 tiles, fully aligned — "well-pipelined" reference +] + +# Operand-staging variants exercised per shape. +VARIANTS = ["ref_ref", "load_ref", "load_load"] + +# Engines whose timings we collect (component_id suffix match). 
ENGINES = ["pe_dma", "pe_fetch_store", "pe_gemm", "pe_math"]

# Per-stage breakdown labels (StageType enum names from pe_types.py).
STAGES = ["DMA_READ", "DMA_WRITE", "FETCH", "STORE", "GEMM", "MATH"]

# Scheduler tile sizes (mirror of PeSchedulerComponent.TILE_M/K/N).
TILE_M, TILE_K, TILE_N = 32, 64, 32


def _ceil(a: int, b: int) -> int:
    """Return ``ceil(a / b)`` using integer arithmetic."""
    return (a + b - 1) // b


def _union_ns(intervals) -> float:
    """Return the total length covered by the union of ``(start, end)`` pairs.

    Overlapping or touching intervals are merged so shared time is counted
    once. An empty input yields ``0.0``.
    """
    total = 0.0
    cur_start = cur_end = None
    for start, end in sorted(intervals):
        if cur_end is None:
            cur_start, cur_end = start, end
        elif start <= cur_end:
            cur_end = max(cur_end, end)
        else:
            total += cur_end - cur_start
            cur_start, cur_end = start, end
    if cur_end is not None:
        total += cur_end - cur_start
    return total


def _window_ns(records) -> float:
    """Return ``max(t_end) - min(t_start)`` over *records* (0.0 if empty)."""
    if not records:
        return 0.0
    return max(r.t_end for r in records) - min(r.t_start for r in records)


def _engine_wall_ns(records, suffix: str) -> float:
    """Wall-clock interval the engine was active (union of overlapping ops)."""
    return _union_ns(
        (r.t_start, r.t_end) for r in records
        if r.component_id.endswith("." + suffix)
    )


def _engine_occupancy_ns(records, suffix: str) -> float:
    """Sum of op durations for the engine (overlaps counted repeatedly)."""
    return sum(r.t_end - r.t_start for r in records
               if r.component_id.endswith("." + suffix))


def _engine_count(records, suffix: str) -> int:
    """Number of records whose component_id ends with ``.<suffix>``."""
    return sum(1 for r in records if r.component_id.endswith("." + suffix))


def _stage_occupancy_ns(records, stage_type: str) -> float:
    """Sum of op durations for records whose stage_type matches."""
    return sum(
        r.t_end - r.t_start
        for r in records
        if r.params.get("stage_type") == stage_type
    )


def _stage_wall_ns(records, stage_type: str) -> float:
    """Interval-union wall-clock for records whose stage_type matches."""
    return _union_ns(
        (r.t_start, r.t_end) for r in records
        if r.params.get("stage_type") == stage_type
    )


def _stage_count(records, stage_type: str) -> int:
    """Number of records whose stage_type matches."""
    return sum(1 for r in records if r.params.get("stage_type") == stage_type)


def _run_one(M: int, K: int, N: int, topology: str, variant: str = "ref_ref") -> dict:
    """Run matmul-composite for one shape/variant and return its sweep row.

    The shape and variant are handed to the bench through ``MATMUL_*``
    environment variables; the previous values are restored afterwards so a
    sweep does not leak its overrides into the rest of the process.

    Raises:
        RuntimeError: if the bench run does not complete successfully.
    """
    overrides = {
        "MATMUL_M": str(M),
        "MATMUL_K": str(K),
        "MATMUL_N": str(N),
        "MATMUL_VARIANT": variant,
    }
    saved = {key: os.environ.get(key) for key in overrides}
    os.environ.update(overrides)
    try:
        # Late imports so env vars are read by matmul_composite at module load.
        # Force re-import to pick up new env values.
        for mod_name in [m for m in list(sys.modules)
                         if m.startswith("kernbench.benches.matmul_composite")]:
            del sys.modules[mod_name]

        from kernbench.benches.registry import resolve as resolve_bench
        from kernbench.runtime_api.bench_runner import run_bench
        from kernbench.runtime_api.types import resolve_device
        from kernbench.sim_engine.engine import GraphEngine
        from kernbench.topology.builder import resolve_topology

        topo = resolve_topology(topology)
        # Local name chosen so the module-level ``bench`` decorator is not shadowed.
        bench_fn = resolve_bench("matmul-composite").run
        device = resolve_device(None)

        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the system clock is adjusted mid-run.
        t0 = time.perf_counter()
        result = run_bench(
            topology=topo, bench_fn=bench_fn, device=device,
            engine_factory=lambda t, d: GraphEngine(
                getattr(t, "topology_obj", t), enable_data=True,
            ),
        )
        wall = time.perf_counter() - t0
    finally:
        for key, previous in saved.items():
            if previous is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = previous

    # Check completion before touching the engine so a failed run reports
    # the completion status rather than a secondary error.
    if not result.completion.ok:
        raise RuntimeError(f"bench failed at M={M},K={K},N={N}: {result.completion}")
    op_log = result.engine.op_log

    # Bytes touched at f16 (2 B): full A + full B + full out (each operand
    # streamed once through HBM by the composite plan).
    flops = 2 * M * K * N
    bytes_total = (M * K + K * N + M * N) * 2
    row = {
        "M": M, "K": K, "N": N,
        "variant": variant,
        "flops": flops,
        "bytes_hbm": bytes_total,
        "arith_intensity": flops / bytes_total if bytes_total else 0.0,  # flops/byte
        "tile_count_expected": _ceil(M, TILE_M) * _ceil(N, TILE_N) * _ceil(K, TILE_K),
        "sim_wall_clock_s": round(wall, 3),
        "engines": {
            eng: {
                "occupancy_ns": _engine_occupancy_ns(op_log, eng),
                "wall_ns": _engine_wall_ns(op_log, eng),
                "record_count": _engine_count(op_log, eng),
            }
            for eng in ENGINES
        },
    }
    row["stages"] = {
        stage: {
            "occupancy_ns": _stage_occupancy_ns(op_log, stage),
            "wall_ns": _stage_wall_ns(op_log, stage),
            "record_count": _stage_count(op_log, stage),
        }
        for stage in STAGES
    }
    # Kernel-window wall-clock = max t_end - min t_start over PE engine records.
    engine_suffixes = tuple("." + eng for eng in ENGINES)
    row["pe_window_ns"] = _window_ns(
        [r for r in op_log if r.component_id.endswith(engine_suffixes)]
    )
    row["composite_window_ns"] = _window_ns(
        [r for r in op_log if r.params.get("stage_type") in STAGES]
    )
    return row


def _parse_shapes(raw) -> list[tuple[int, int, int]]:
    """Parse shape specs into ``(M, K, N)`` tuples.

    Each entry is ``"MxKxN"`` or a single integer (square ``M=K=N``), given
    as a string or an int. Blank entries are skipped.

    Raises:
        ValueError: if an entry is neither a single integer nor exactly
            three ``x``-separated integers.
    """
    shapes: list[tuple[int, int, int]] = []
    for entry in raw:
        token = str(entry).strip().lower()
        if not token:
            continue
        parts = token.split("x")
        if len(parts) not in (1, 3):
            raise ValueError(
                f"invalid shape {entry!r}: expected 'MxKxN' or a single int"
            )
        try:
            dims = [int(p) for p in parts]
        except ValueError as err:
            raise ValueError(
                f"invalid shape {entry!r}: expected 'MxKxN' or a single int"
            ) from err
        if len(dims) == 1:
            dims = dims * 3
        shapes.append((dims[0], dims[1], dims[2]))
    return shapes


def run_sweep(out_json: Path | str = DEFAULT_SWEEP_JSON) -> Path:
    """Drive matmul-composite across shapes×variants; write the sweep JSON.

    Honors ``SWEEP_SHAPES`` / ``SWEEP_TOPOLOGY`` env overrides (same as the
    historical ``scripts/gemm_sweep.py``). Returns the JSON path written.
    """
    shapes_env = os.environ.get("SWEEP_SHAPES")
    raw = shapes_env.split(",") if shapes_env else DEFAULT_SHAPES
    shapes = _parse_shapes(raw)
    topology = os.environ.get("SWEEP_TOPOLOGY", "topology.yaml")

    rows = []
    for M, K, N in shapes:
        for variant in VARIANTS:
            print(f"[sweep] M={M} K={K} N={N} variant={variant} ...", flush=True)
            row = _run_one(M, K, N, topology, variant=variant)
            rows.append(row)
            eng_dma = row["engines"]["pe_dma"]
            eng_gem = row["engines"]["pe_gemm"]
            print(f" tiles={row['tile_count_expected']:>6} "
                  f"pe_window={row['pe_window_ns']:8.1f}ns "
                  f"dma_occ={eng_dma['occupancy_ns']:9.1f} "
                  f"gemm_occ={eng_gem['occupancy_ns']:8.1f} "
                  f"(sim {row['sim_wall_clock_s']:.1f}s)")

    out_json = Path(out_json)
    out_json.parent.mkdir(parents=True, exist_ok=True)
    out_json.write_text(json.dumps({
        "tile_sizes": {"M": TILE_M, "K": TILE_K, "N": TILE_N},
        "engines": ENGINES,
        "stages": STAGES,
        "variants": VARIANTS,
        "rows": rows,
    }, indent=2))
    print(f"\n[sweep] wrote {out_json}")
    return out_json


# ── figure rendering ───────────────────────────────────────────────────

# Shapes excluded from the figures (mirrors build_overview_slides).
EXCLUDED_SHAPES = {(512, 512, 512)}

# Stage bars shown (raw op_log stage_type keys) + display names + colors.
STAGE_KEYS = ["DMA_READ", "FETCH", "GEMM", "DMA_WRITE"]
STAGE_DISPLAY = {
    "DMA_READ": "DMA in",
    "FETCH": "Fetch",
    "GEMM": "GEMM",
    "DMA_WRITE": "DMA out",
}
STAGE_COLORS = {
    "DMA_READ": "#3B82F6",
    "FETCH": "#10B981",
    "GEMM": "#F59E0B",
    "DMA_WRITE": "#A855F7",
}

# MAC-utilization model constants (mirror build_overview_slides).
+_HBM_GBS = 256.0 +_BPE = 2 +_T_STAGE = 16.0 +_D_STAGES = 3 + +_PLOT_VARIANT = "load_ref" + + +def _load_sweep_data(sweep_json: Path | str = DEFAULT_SWEEP_JSON) -> dict: + sweep_json = Path(sweep_json) + if not sweep_json.exists(): + return {"rows": []} + data = json.loads(sweep_json.read_text()) + data["rows"] = [ + r for r in data.get("rows", []) + if (r["M"], r["K"], r["N"]) not in EXCLUDED_SHAPES + ] + return data + + +def _shape_label(r: dict) -> str: + if r["M"] == r["K"] == r["N"]: + return f"M=K=N={r['M']}" + return f"M={r['M']} K={r['K']} N={r['N']}" + + +def _under_tile(M, K, N, tile_M, tile_K, tile_N) -> bool: + return M < tile_M or K < tile_K or N < tile_N + + +def _xtick_labels(shape_labels, tile_counts, flagged) -> list[str]: + out = [] + for lbl, tc, fl in zip(shape_labels, tile_counts, flagged): + s = f"{lbl}\n({tc} tiles)" + if fl: + s += " *" + out.append(s) + return out + + +def _grouped_bar_png( + out_name: str, *, out_dir: Path, title: str, subtitle: str | None, + shape_labels, tile_counts, flagged, series: dict, colors: dict, + y_label: str, threshold: float | None = None, footnote: str | None = None, +) -> str: + """Render one grouped-bar chart to out_dir/out_name; return the path.""" + import matplotlib.pyplot as plt + import numpy as np + + n_groups = len(shape_labels) + n_series = max(1, len(series)) + x = np.arange(n_groups) + width = 0.8 / n_series + + fig, ax = plt.subplots(figsize=(11, 6)) + for i, (name, vals) in enumerate(series.items()): + offset = (i - (n_series - 1) / 2) * width + ax.bar(x + offset, vals, width, label=name, color=colors.get(name)) + + ax.set_xticks(x) + ax.set_xticklabels( + _xtick_labels(shape_labels, tile_counts, flagged), fontsize=8, + ) + ax.set_ylabel(y_label) + ax.set_title(title, fontsize=13, fontweight="bold") + if subtitle: + ax.text(0.5, 1.01, subtitle, transform=ax.transAxes, ha="center", + va="bottom", fontsize=8, color="#475569") + if threshold is not None: + ax.axhline(threshold, ls="--", 
color="gray", lw=1.0) + ax.legend(fontsize=8, loc="upper right") + ax.grid(True, axis="y", alpha=0.3) + + caption = "* = under-tile shape (M str | None: + """Per-stage engine wall-clock per shape (load_ref operand staging).""" + data = _load_sweep_data(sweep_json) + rows = [r for r in data["rows"] if r.get("variant") == _PLOT_VARIANT] + if not rows: + return None + tile = data["tile_sizes"] + shape_labels = [_shape_label(r) for r in rows] + flagged = [_under_tile(r["M"], r["K"], r["N"], tile["M"], tile["K"], tile["N"]) + for r in rows] + tile_counts = [r["tile_count_expected"] for r in rows] + series = { + STAGE_DISPLAY[s]: [r.get("stages", {}).get(s, {}).get("wall_ns", 0.0) + for r in rows] + for s in STAGE_KEYS + } + colors = {STAGE_DISPLAY[s]: STAGE_COLORS[s] for s in STAGE_KEYS} + return _grouped_bar_png( + "gemm_stage_breakdown.png", out_dir=Path(out_dir), + title="GEMM stage breakdown", + subtitle=(f"Per-stage engine wall-clock (DMA in / Fetch / GEMM / " + f"DMA out), {_PLOT_VARIANT} staging. 
" + f"Tile {tile['M']}x{tile['K']}x{tile['N']}."), + shape_labels=shape_labels, tile_counts=tile_counts, flagged=flagged, + series=series, colors=colors, y_label="ns", + footnote="Bars = engine wall-clock interval (merged overlaps).", + ) + + +def emit_mac_utilization_measured( + sweep_json: Path | str = DEFAULT_SWEEP_JSON, + out_dir: Path | str = DEFAULT_PLOTS_DIR, +) -> str | None: + """GEMM util % and useful pipeline-eff % (analytical model, load_ref).""" + data = _load_sweep_data(sweep_json) + rows = data["rows"] + if not rows: + return None + tile = data["tile_sizes"] + TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"] + tile_flops = 2 * TILE_M * TILE_K * TILE_N + dma_w_per_pair = (TILE_M * TILE_N * _BPE) / _HBM_GBS + head_ns = (_D_STAGES - 1) * _T_STAGE + + by_shape = {(r["M"], r["K"], r["N"]): r + for r in rows if r["variant"] == _PLOT_VARIANT} + shapes = list(by_shape) + if not shapes: + return None + shape_labels = [_shape_label(by_shape[k]) for k in shapes] + flagged = [_under_tile(*k, TILE_M, TILE_K, TILE_N) for k in shapes] + tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes] + + gemm_util, useful_eff = [], [] + for k in shapes: + r = by_shape[k] + M, K, N = r["M"], r["K"], r["N"] + useful = 2 * M * K * N + tiles = r["tile_count_expected"] + gu = useful / (tile_flops * tiles) * 100 + gemm_util.append(gu) + m_tiles = (M + TILE_M - 1) // TILE_M + n_tiles = (N + TILE_N - 1) // TILE_N + n_mn = m_tiles * n_tiles + compute_total = tiles * _T_STAGE + wall = head_ns + tiles * _T_STAGE + max(0, n_mn - 1) * dma_w_per_pair + ueff = (compute_total * (gu / 100.0) / wall) * 100 if wall > 0 else 0.0 + useful_eff.append(ueff) + + series = {"GEMM util %": gemm_util, "Useful eff %": useful_eff} + colors = {"GEMM util %": "#10B981", "Useful eff %": "#F59E0B"} + return _grouped_bar_png( + "gemm_mac_utilization_measured.png", out_dir=Path(out_dir), + title="GEMM MAC utilization — load_ref", + subtitle=("GEMM util = useful FLOPs / (tile FLOPs x tiles); 
" + "Useful eff = GEMM util x ideal pipeline efficiency."), + shape_labels=shape_labels, tile_counts=tile_counts, flagged=flagged, + series=series, colors=colors, y_label="%", threshold=100.0, + footnote="Theoretical ideal-pipeline model (not simulator data).", + ) + + +def emit_mac_utilization_theoretical_vs_measured( + sweep_json: Path | str = DEFAULT_SWEEP_JSON, + out_dir: Path | str = DEFAULT_PLOTS_DIR, +) -> str | None: + """Theoretical vs simulator-measured GEMM util / useful eff (load_ref).""" + data = _load_sweep_data(sweep_json) + rows = data["rows"] + if not rows: + return None + tile = data["tile_sizes"] + TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"] + tile_flops = 2 * TILE_M * TILE_K * TILE_N + dma_w_per_pair = (TILE_M * TILE_N * _BPE) / _HBM_GBS + head_ns = (_D_STAGES - 1) * _T_STAGE + peak_per_ns = tile_flops / _T_STAGE + + by_shape = {(r["M"], r["K"], r["N"]): r + for r in rows if r["variant"] == _PLOT_VARIANT} + shapes = list(by_shape) + if not shapes: + return None + shape_labels = [_shape_label(by_shape[k]) for k in shapes] + flagged = [_under_tile(*k, TILE_M, TILE_K, TILE_N) for k in shapes] + tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes] + + gu_t, gu_m, eff_t, eff_m = [], [], [], [] + for k in shapes: + r = by_shape[k] + M, K, N = r["M"], r["K"], r["N"] + useful = 2 * M * K * N + tiles = r["tile_count_expected"] + gut = useful / (tile_flops * tiles) + gu_t.append(gut * 100) + rec = r.get("stages", {}).get("GEMM", {}).get("record_count", 0) or tiles + gu_m.append((useful / (tile_flops * rec) * 100) if rec else 0.0) + m_tiles = (M + TILE_M - 1) // TILE_M + n_tiles = (N + TILE_N - 1) // TILE_N + n_mn = m_tiles * n_tiles + compute_total = tiles * _T_STAGE + wall_t = head_ns + compute_total + max(0, n_mn - 1) * dma_w_per_pair + eff_t.append((compute_total * gut / wall_t * 100) if wall_t > 0 else 0.0) + cw = r.get("composite_window_ns", 0.0) or 0.0 + eff_m.append((useful / cw / peak_per_ns * 100) if cw > 0 else 0.0) + 
+ series = { + "GEMM util % (theoretical)": gu_t, + "GEMM util % (measured)": gu_m, + "Theoretical eff %": eff_t, + "Measured eff %": eff_m, + } + colors = { + "GEMM util % (theoretical)": "#10B981", + "GEMM util % (measured)": "#6EE7B7", + "Theoretical eff %": "#F59E0B", + "Measured eff %": "#3B82F6", + } + return _grouped_bar_png( + "gemm_mac_utilization_theoretical_vs_measured.png", out_dir=Path(out_dir), + title="GEMM MAC utilization — theoretical vs measured (load_ref)", + subtitle=("theoretical model vs simulator op_log; agreement " + "validates the analytical pipeline model."), + shape_labels=shape_labels, tile_counts=tile_counts, flagged=flagged, + series=series, colors=colors, y_label="%", threshold=100.0, + ) + + +def emit_all_gemm_plots( + sweep_json: Path | str = DEFAULT_SWEEP_JSON, + out_dir: Path | str = DEFAULT_PLOTS_DIR, +) -> list[str]: + """Render every GEMM figure that has data; return the paths written.""" + paths = [] + for fn in (emit_stage_breakdown, + emit_mac_utilization_measured, + emit_mac_utilization_theoretical_vs_measured): + p = fn(sweep_json, out_dir) + if p: + paths.append(p) + return paths + + +# ── bench entry ──────────────────────────────────────────────────────── + + +@bench( + name="milestone-1h-gemm", + description="1H milestone: regenerate all GEMM results + figures.", +) +def run(torch) -> None: + """Run the GEMM sweep (or reuse committed JSON) and render every figure. + + ``MILESTONE_FAST=1`` reuses the committed ``DEFAULT_SWEEP_JSON`` (seconds); + otherwise the full sweep runs into ``out_dir/gemm_sweep.json`` (minutes). + The sweep drives its own engines, so a sentinel tensor is submitted at the + end to satisfy the run_bench contract (ADR-0045 D4). 
+ """ + _OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + fast = bool(os.environ.get("MILESTONE_FAST")) + if fast: + sweep_json = DEFAULT_SWEEP_JSON + else: + sweep_json = run_sweep(out_json=_OUTPUT_DIR / "gemm_sweep.json") + paths = emit_all_gemm_plots(sweep_json=sweep_json, out_dir=_OUTPUT_DIR) + print(f" milestone-1h-gemm: {len(paths)} figures -> {_OUTPUT_DIR} " + f"(fast={fast})") + + torch.zeros( + (1, 1), dtype="f16", + dp=DPPolicy(cube="row_wise", pe="replicate", num_cubes=1, num_pes=1), + name="milestone_gemm_sentinel", + ) diff --git a/tests/gemm/_gemm_plot_helpers.py b/tests/gemm/_gemm_plot_helpers.py index b7a97ce..da1a828 100644 --- a/tests/gemm/_gemm_plot_helpers.py +++ b/tests/gemm/_gemm_plot_helpers.py @@ -1,283 +1,31 @@ -"""Shared plotting plumbing for the GEMM figure tests. +"""Thin re-export shim for the GEMM figure tests. -Not a test module (no ``test_`` prefix -> pytest does not collect it). +Not a test module (no ``test_`` prefix → pytest does not collect it). -Reads the committed ``docs/diagrams/gemm_sweep.json`` (produced by the heavy -``scripts/gemm_sweep.py`` sim sweep) and renders matplotlib PNGs into -``docs/diagrams/gemm_plots/``. No simulation here -> the figure tests are fast -and run by default; regenerating the underlying data stays a manual script. - -Chart set (mirrors the GEMM MAC slides in scripts/build_overview_slides.py): - - stage breakdown (load_ref operand staging) - - MAC utilization — measured (load_ref) - - MAC utilization — theoretical vs measured (load_ref) +The sweep + renderer logic now lives in +``kernbench.benches.milestone_1h_gemm`` (production single home, ADR-0054, +also driven by ``scripts/gemm_sweep.py``). The figure tests import the same +names from here; behavior is unchanged (defaults still target +``docs/diagrams/gemm_plots/``). 
""" from __future__ import annotations -import json -from pathlib import Path +from kernbench.benches.milestone_1h_gemm import ( + DEFAULT_PLOTS_DIR as GEMM_PLOTS_DIR, + DEFAULT_SWEEP_JSON as GEMM_SWEEP_JSON, + ROOT, + emit_all_gemm_plots, + emit_mac_utilization_measured, + emit_mac_utilization_theoretical_vs_measured, + emit_stage_breakdown, +) -ROOT = Path(__file__).resolve().parent.parent.parent -GEMM_SWEEP_JSON = ROOT / "docs" / "diagrams" / "gemm_sweep.json" -GEMM_PLOTS_DIR = ROOT / "docs" / "diagrams" / "gemm_plots" - -# Shapes excluded from the figures (mirrors build_overview_slides). -EXCLUDED_SHAPES = {(512, 512, 512)} - -# Stage bars shown (raw op_log stage_type keys) + display names + colors. -STAGE_KEYS = ["DMA_READ", "FETCH", "GEMM", "DMA_WRITE"] -STAGE_DISPLAY = { - "DMA_READ": "DMA in", - "FETCH": "Fetch", - "GEMM": "GEMM", - "DMA_WRITE": "DMA out", -} -STAGE_COLORS = { - "DMA_READ": "#3B82F6", - "FETCH": "#10B981", - "GEMM": "#F59E0B", - "DMA_WRITE": "#A855F7", -} - -# MAC-utilization model constants (mirror build_overview_slides). 
-_HBM_GBS = 256.0 -_BPE = 2 -_T_STAGE = 16.0 -_D_STAGES = 3 - -_PLOT_VARIANT = "load_ref" - - -def _load_sweep_data() -> dict: - if not GEMM_SWEEP_JSON.exists(): - return {"rows": []} - data = json.loads(GEMM_SWEEP_JSON.read_text()) - data["rows"] = [ - r for r in data.get("rows", []) - if (r["M"], r["K"], r["N"]) not in EXCLUDED_SHAPES - ] - return data - - -def _shape_label(r: dict) -> str: - if r["M"] == r["K"] == r["N"]: - return f"M=K=N={r['M']}" - return f"M={r['M']} K={r['K']} N={r['N']}" - - -def _under_tile(M, K, N, tile_M, tile_K, tile_N) -> bool: - return M < tile_M or K < tile_K or N < tile_N - - -def _xtick_labels(shape_labels, tile_counts, flagged) -> list[str]: - out = [] - for lbl, tc, fl in zip(shape_labels, tile_counts, flagged): - s = f"{lbl}\n({tc} tiles)" - if fl: - s += " *" - out.append(s) - return out - - -def _grouped_bar_png( - out_name: str, *, title: str, subtitle: str | None, - shape_labels, tile_counts, flagged, series: dict, colors: dict, - y_label: str, threshold: float | None = None, footnote: str | None = None, -) -> str: - """Render one grouped-bar chart to GEMM_PLOTS_DIR/out_name; return the path.""" - import matplotlib.pyplot as plt - import numpy as np - - n_groups = len(shape_labels) - n_series = max(1, len(series)) - x = np.arange(n_groups) - width = 0.8 / n_series - - fig, ax = plt.subplots(figsize=(11, 6)) - for i, (name, vals) in enumerate(series.items()): - offset = (i - (n_series - 1) / 2) * width - ax.bar(x + offset, vals, width, label=name, color=colors.get(name)) - - ax.set_xticks(x) - ax.set_xticklabels( - _xtick_labels(shape_labels, tile_counts, flagged), fontsize=8, - ) - ax.set_ylabel(y_label) - ax.set_title(title, fontsize=13, fontweight="bold") - if subtitle: - ax.text(0.5, 1.01, subtitle, transform=ax.transAxes, ha="center", - va="bottom", fontsize=8, color="#475569") - if threshold is not None: - ax.axhline(threshold, ls="--", color="gray", lw=1.0) - ax.legend(fontsize=8, loc="upper right") - ax.grid(True, 
axis="y", alpha=0.3) - - caption = "* = under-tile shape (M str | None: - """Per-stage engine wall-clock per shape (load_ref operand staging).""" - data = _load_sweep_data() - rows = [r for r in data["rows"] if r.get("variant") == _PLOT_VARIANT] - if not rows: - return None - tile = data["tile_sizes"] - shape_labels = [_shape_label(r) for r in rows] - flagged = [_under_tile(r["M"], r["K"], r["N"], tile["M"], tile["K"], tile["N"]) - for r in rows] - tile_counts = [r["tile_count_expected"] for r in rows] - series = { - STAGE_DISPLAY[s]: [r.get("stages", {}).get(s, {}).get("wall_ns", 0.0) - for r in rows] - for s in STAGE_KEYS - } - colors = {STAGE_DISPLAY[s]: STAGE_COLORS[s] for s in STAGE_KEYS} - return _grouped_bar_png( - "gemm_stage_breakdown.png", - title="GEMM stage breakdown", - subtitle=(f"Per-stage engine wall-clock (DMA in / Fetch / GEMM / " - f"DMA out), {_PLOT_VARIANT} staging. " - f"Tile {tile['M']}x{tile['K']}x{tile['N']}."), - shape_labels=shape_labels, tile_counts=tile_counts, flagged=flagged, - series=series, colors=colors, y_label="ns", - footnote="Bars = engine wall-clock interval (merged overlaps).", - ) - - -def emit_mac_utilization_measured() -> str | None: - """GEMM util % and useful pipeline-eff % (analytical model, load_ref).""" - data = _load_sweep_data() - rows = data["rows"] - if not rows: - return None - tile = data["tile_sizes"] - TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"] - tile_flops = 2 * TILE_M * TILE_K * TILE_N - dma_w_per_pair = (TILE_M * TILE_N * _BPE) / _HBM_GBS - head_ns = (_D_STAGES - 1) * _T_STAGE - - by_shape = {(r["M"], r["K"], r["N"]): r - for r in rows if r["variant"] == _PLOT_VARIANT} - shapes = list(by_shape) - if not shapes: - return None - shape_labels = [_shape_label(by_shape[k]) for k in shapes] - flagged = [_under_tile(*k, TILE_M, TILE_K, TILE_N) for k in shapes] - tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes] - - gemm_util, useful_eff = [], [] - for k in shapes: - r = by_shape[k] - 
M, K, N = r["M"], r["K"], r["N"] - useful = 2 * M * K * N - tiles = r["tile_count_expected"] - gu = useful / (tile_flops * tiles) * 100 - gemm_util.append(gu) - m_tiles = (M + TILE_M - 1) // TILE_M - n_tiles = (N + TILE_N - 1) // TILE_N - n_mn = m_tiles * n_tiles - compute_total = tiles * _T_STAGE - wall = head_ns + tiles * _T_STAGE + max(0, n_mn - 1) * dma_w_per_pair - ueff = (compute_total * (gu / 100.0) / wall) * 100 if wall > 0 else 0.0 - useful_eff.append(ueff) - - series = {"GEMM util %": gemm_util, "Useful eff %": useful_eff} - colors = {"GEMM util %": "#10B981", "Useful eff %": "#F59E0B"} - return _grouped_bar_png( - "gemm_mac_utilization_measured.png", - title="GEMM MAC utilization — load_ref", - subtitle=("GEMM util = useful FLOPs / (tile FLOPs x tiles); " - "Useful eff = GEMM util x ideal pipeline efficiency."), - shape_labels=shape_labels, tile_counts=tile_counts, flagged=flagged, - series=series, colors=colors, y_label="%", threshold=100.0, - footnote="Theoretical ideal-pipeline model (not simulator data).", - ) - - -def emit_mac_utilization_theoretical_vs_measured() -> str | None: - """Theoretical vs simulator-measured GEMM util / useful eff (load_ref).""" - data = _load_sweep_data() - rows = data["rows"] - if not rows: - return None - tile = data["tile_sizes"] - TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"] - tile_flops = 2 * TILE_M * TILE_K * TILE_N - dma_w_per_pair = (TILE_M * TILE_N * _BPE) / _HBM_GBS - head_ns = (_D_STAGES - 1) * _T_STAGE - peak_per_ns = tile_flops / _T_STAGE - - by_shape = {(r["M"], r["K"], r["N"]): r - for r in rows if r["variant"] == _PLOT_VARIANT} - shapes = list(by_shape) - if not shapes: - return None - shape_labels = [_shape_label(by_shape[k]) for k in shapes] - flagged = [_under_tile(*k, TILE_M, TILE_K, TILE_N) for k in shapes] - tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes] - - gu_t, gu_m, eff_t, eff_m = [], [], [], [] - for k in shapes: - r = by_shape[k] - M, K, N = r["M"], r["K"], r["N"] 
- useful = 2 * M * K * N - tiles = r["tile_count_expected"] - gut = useful / (tile_flops * tiles) - gu_t.append(gut * 100) - rec = r.get("stages", {}).get("GEMM", {}).get("record_count", 0) or tiles - gu_m.append((useful / (tile_flops * rec) * 100) if rec else 0.0) - m_tiles = (M + TILE_M - 1) // TILE_M - n_tiles = (N + TILE_N - 1) // TILE_N - n_mn = m_tiles * n_tiles - compute_total = tiles * _T_STAGE - wall_t = head_ns + compute_total + max(0, n_mn - 1) * dma_w_per_pair - eff_t.append((compute_total * gut / wall_t * 100) if wall_t > 0 else 0.0) - cw = r.get("composite_window_ns", 0.0) or 0.0 - eff_m.append((useful / cw / peak_per_ns * 100) if cw > 0 else 0.0) - - series = { - "GEMM util % (theoretical)": gu_t, - "GEMM util % (measured)": gu_m, - "Theoretical eff %": eff_t, - "Measured eff %": eff_m, - } - colors = { - "GEMM util % (theoretical)": "#10B981", - "GEMM util % (measured)": "#6EE7B7", - "Theoretical eff %": "#F59E0B", - "Measured eff %": "#3B82F6", - } - return _grouped_bar_png( - "gemm_mac_utilization_theoretical_vs_measured.png", - title="GEMM MAC utilization — theoretical vs measured (load_ref)", - subtitle=("theoretical model vs simulator op_log; agreement " - "validates the analytical pipeline model."), - shape_labels=shape_labels, tile_counts=tile_counts, flagged=flagged, - series=series, colors=colors, y_label="%", threshold=100.0, - ) - - -def emit_all_gemm_plots() -> list[str]: - """Render every GEMM figure that has data; return the list of paths written.""" - paths = [] - for fn in (emit_stage_breakdown, - emit_mac_utilization_measured, - emit_mac_utilization_theoretical_vs_measured): - p = fn() - if p: - paths.append(p) - return paths +__all__ = [ + "GEMM_PLOTS_DIR", + "GEMM_SWEEP_JSON", + "ROOT", + "emit_all_gemm_plots", + "emit_mac_utilization_measured", + "emit_mac_utilization_theoretical_vs_measured", + "emit_stage_breakdown", +] diff --git a/tests/sccl/_allreduce_helpers.py b/tests/sccl/_allreduce_helpers.py index ea336eb..deaa8f2 
100644 --- a/tests/sccl/_allreduce_helpers.py +++ b/tests/sccl/_allreduce_helpers.py @@ -1,175 +1,81 @@ -"""Shared plumbing for the sccl allreduce tests. +"""Thin re-export shim for the sccl allreduce tests. Not a test module (no ``test_`` prefix → pytest does not collect it). -Holds the distributed driver, the direct-launch parity reference, the -config writers, the sweep/buffer-kind constants, the plot aggregators -(called from ``conftest.pytest_sessionfinish``), and the topology-diagram -emitter. The per-test files under ``tests/sccl/`` import from here, as do -the external buffer-kind / root-center tests under ``tests/``. + +The driver core, config writers, sweep constants, renderers, aggregators, +topology-diagram + FSIM-comparison emitters, and the direct-launch parity +reference now live in ``kernbench.benches.milestone_1h_ccl`` (production +single home, ADR-0054). This shim re-exports them and keeps +the pytest-specific pieces local: the ``pytest.param`` matrices +(``CONFIGS`` / ``_sweep_params`` / ``_bk_params``) and the fixture-coupled +``_run_distributed`` wrapper. Behavior is unchanged (defaults still target +``docs/diagrams/allreduce_latency_plots/``). 
""" from __future__ import annotations -import importlib -import math -import textwrap -from pathlib import Path -from typing import Any - -import numpy as np import pytest -import yaml -from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config -from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip -from kernbench.policy.placement.dp import DPPolicy -from kernbench.runtime_api.context import RuntimeContext -from kernbench.runtime_api.types import DeviceSelector -from kernbench.sim_engine.engine import GraphEngine -from kernbench.topology.builder import resolve_topology +from kernbench.benches.milestone_1h_ccl import ( + DEFAULT_N_ELEM, + TOPOLOGY_PATH, + _aggregate_sweep_plots, + _BK_N_ELEM_GRID, + _BK_ROWS_DIR, + _BUFFER_KINDS, + _crit_ns, + _drive_distributed, + _ELEM_BYTES_F16, + _SWEEP_N_ELEM, + _SWEEP_OUT_DIR, + _SWEEP_ROWS_DIR, + _SWEEP_TOPOLOGIES, + _worker, + _write_ccl_yaml, + _write_temp_configs, + aggregate_buffer_kind_plot, + emit_comparison_fsim_plot, + emit_topology_diagram, + run_allreduce, +) -TOPOLOGY_PATH = Path(__file__).parent.parent.parent / "topology.yaml" - -DEFAULT_N_ELEM = 8 +__all__ = [ + "CONFIGS", + "DEFAULT_N_ELEM", + "TOPOLOGY_PATH", + "_BK_ROWS_DIR", + "_ELEM_BYTES_F16", + "_SWEEP_OUT_DIR", + "_SWEEP_ROWS_DIR", + "_aggregate_sweep_plots", + "_bk_params", + "_crit_ns", + "_run_distributed", + "_sweep_params", + "_worker", + "_write_ccl_yaml", + "_write_temp_configs", + "aggregate_buffer_kind_plot", + "emit_comparison_fsim_plot", + "emit_topology_diagram", + "run_allreduce", +] -# ── config writers ──────────────────────────────────────────────────── - - -def _write_ccl_yaml(tmp_path) -> str: - body = textwrap.dedent("""\ - defaults: - algorithm: lrab_hierarchical_allreduce - buffer_kind: tcm - backpressure: sleep - n_slots: 4 - slot_size: 4096 - vc_chunk_size: 256 - ipcq_credit_size_bytes: 16 - - algorithms: - lrab_hierarchical_allreduce: - module: kernbench.ccl.algorithms.lrab_hierarchical_allreduce - 
topology: none - buffer_kind: tcm - n_elem: 8 - root_cube: 15 - """) - (tmp_path / "ccl.yaml").write_text(body) - return str(tmp_path) - - -def _write_temp_configs( - tmp_path, sip_topology, n_sips, algorithm, n_elem_override=None, - sip_w=None, sip_h=None, -): - """Write temp topology.yaml and ccl.yaml with the given overrides.""" - with open(TOPOLOGY_PATH) as f: - topo_cfg = yaml.safe_load(f) - topo_cfg["system"]["sips"]["count"] = n_sips - topo_cfg["system"]["sips"]["topology"] = sip_topology - if sip_w is not None and sip_h is not None: - topo_cfg["system"]["sips"]["w"] = int(sip_w) - topo_cfg["system"]["sips"]["h"] = int(sip_h) - else: - topo_cfg["system"]["sips"].pop("w", None) - topo_cfg["system"]["sips"].pop("h", None) - topo_path = tmp_path / "topology.yaml" - with open(topo_path, "w") as f: - yaml.dump(topo_cfg, f, default_flow_style=False) - - ccl_path = Path(__file__).parent.parent.parent / "ccl.yaml" - with open(ccl_path) as f: - ccl_cfg = yaml.safe_load(f) - ccl_cfg["defaults"]["algorithm"] = algorithm - if n_elem_override is not None: - ccl_cfg.setdefault("algorithms", {}).setdefault( - algorithm, {}, - )["n_elem"] = int(n_elem_override) - # Ensure IPCQ slot is big enough for the per-message payload. 
- per_msg_bytes = int(n_elem_override) * 2 # f16 - default_slot = int(ccl_cfg["defaults"].get("slot_size", 4096)) - if per_msg_bytes > default_slot: - ccl_cfg["defaults"]["slot_size"] = per_msg_bytes - tmp_ccl = tmp_path / "ccl.yaml" - with open(tmp_ccl, "w") as f: - yaml.dump(ccl_cfg, f, default_flow_style=False) - - return str(topo_path), str(tmp_ccl) - - -# ── distributed driver (init_process_group → mp.spawn → all_reduce) ──── - - -def _worker(rank: int, n_cubes: int, n_elem: int, n_sips: int, torch) -> None: - """Per-SIP worker: allocate, fill, all_reduce, verify.""" - torch.ahbm.set_device(rank) - - dp = DPPolicy( - cube="row_wise", pe="replicate", - num_pes=1, num_cubes=n_cubes, - ) - tensor = torch.zeros( - (n_cubes, n_elem), dtype="f16", dp=dp, - name=f"sip{rank}", - ) - tensor.copy_(torch.from_numpy( - np.full((n_cubes, n_elem), float(rank + 1), dtype=np.float16) - )) - - torch.distributed.all_reduce(tensor, op="sum") - - arr = tensor.numpy() - expected = float(n_cubes * sum(range(1, n_sips + 1))) - for cube_id in range(n_cubes): - assert np.allclose(arr[cube_id], expected, rtol=1e-1, atol=1e-1), ( - f"SIP{rank} cube {cube_id}: " - f"got {arr[cube_id][:4]}, expected {expected}" - ) - - if rank == 0: - print(f"\n lrab_hierarchical_allreduce (ws={n_sips}): " - f"{n_sips * n_cubes} OK") - - -def _crit_ns(engine) -> float: - """Critical-path latency = max per-result pe_exec_ns over engine results.""" - vals = [ - float(tr.get("pe_exec_ns", 0.0) or 0.0) - for _, (_, tr) in engine._results.items() - if isinstance(tr, dict) - ] - return max(vals) if vals else 0.0 +# ── pytest-coupled distributed driver wrapper ────────────────────────── def _run_distributed(tmp_path, monkeypatch, topo_path, correlation_id, n_elem): - """Build engine + run the collective via the full distributed path. + """Fixture-coupled wrapper: chdir via monkeypatch, then drive. - Returns ``(engine, n_cubes)``. 
``monkeypatch.chdir`` points the backend's - ``load_ccl_config()`` (cwd lookup) at the temp ``ccl.yaml``. + ``monkeypatch.chdir`` points the backend's ``load_ccl_config()`` (cwd + lookup) at the temp ``ccl.yaml`` and auto-restores cwd at test teardown. + Returns ``(engine, n_cubes)``. """ monkeypatch.chdir(tmp_path) - topo = resolve_topology(topo_path) - engine = GraphEngine(topo.topology_obj, enable_data=True) - spec = topo.topology_obj.spec - n_sips = int(spec["system"]["sips"]["count"]) - cm = spec["sip"]["cube_mesh"] - n_cubes = int(cm["w"]) * int(cm["h"]) - - with RuntimeContext( - engine=engine, - target_device=DeviceSelector("all"), - correlation_id=correlation_id, - spec=spec, - ) as ctx: - ctx.distributed.init_process_group(backend="ahbm") - assert ctx.distributed.get_world_size() == n_sips - ctx.multiprocessing.spawn( - _worker, args=(n_cubes, n_elem, n_sips, ctx), nprocs=n_sips, - ) - return engine, n_cubes + return _drive_distributed(topo_path, correlation_id, n_elem) -# ── correctness config matrix (used by test_allreduce) ───────────────── +# ── pytest.param matrices ────────────────────────────────────────────── CONFIGS = [ pytest.param( @@ -187,154 +93,6 @@ CONFIGS = [ ] -# ── direct-launch helper (parity reference only) ─────────────────────── - - -def _sip_topo_dims( - sip_topo: str, n_sips: int, - spec_w: int | None = None, spec_h: int | None = None, -) -> tuple[int, int]: - if sip_topo == "ring_1d": - return (0, 0) - if spec_w is not None and spec_h is not None: - if spec_w * spec_h != n_sips: - raise ValueError( - f"sip layout {spec_w}x{spec_h} != n_sips ({n_sips})" - ) - return (spec_w, spec_h) - side = int(round(math.sqrt(n_sips))) - if side * side != n_sips: - raise ValueError( - f"SIP topology '{sip_topo}' requires square n_sips or " - f"explicit w/h in spec, got {n_sips}" - ) - return (side, side) - - -def run_allreduce( - ctx: Any, - engine: Any, - spec: dict, - *, - algorithm: str | None = None, - ccl_yaml: str | None = None, -) -> dict: 
- """Config-driven allreduce via direct ctx.launch (no distributed wrapper). - - Retained as the parity reference for the distributed path and reused by - the external buffer-kind / root-center micro-tests. - """ - cfg_all = load_ccl_config(ccl_yaml) - cfg = resolve_algorithm_config(cfg_all, algorithm) - - algo_module = importlib.import_module(cfg["module"]) - kernel_fn = algo_module.kernel - topo_name_to_kind = algo_module.TOPO_NAME_TO_KIND - - n_elem = int(cfg.get("n_elem", 8)) - sips_cfg = spec.get("system", {}).get("sips", {}) - n_sips = int(sips_cfg.get("count", 1)) - sip_topo = str(sips_cfg.get("topology", "ring_1d")) - spec_sip_w = sips_cfg.get("w") - spec_sip_h = sips_cfg.get("h") - spec_sip_w = int(spec_sip_w) if spec_sip_w is not None else None - spec_sip_h = int(spec_sip_h) if spec_sip_h is not None else None - - cm = spec["sip"]["cube_mesh"] - cube_w = int(cm["w"]) - cube_h = int(cm["h"]) - n_cubes = cube_w * cube_h - - sip_topo_kind = topo_name_to_kind.get(sip_topo, 0) - sip_topo_w, sip_topo_h = _sip_topo_dims( - sip_topo, n_sips, spec_w=spec_sip_w, spec_h=spec_sip_h, - ) - - algo_name = cfg.get("algorithm", "allreduce") - - configure_sfr_intercube_multisip(engine, spec, cfg) - - dp = DPPolicy( - cube="row_wise", pe="replicate", - num_pes=1, num_cubes=n_cubes, - ) - - tensors = [] - for sip in range(n_sips): - ctx.ahbm.set_device(sip) - t = ctx.zeros( - (n_cubes, n_elem), dtype="f16", dp=dp, - name=f"sip{sip}", - ) - t.copy_(ctx.from_numpy( - np.full((n_cubes, n_elem), float(sip + 1), dtype=np.float16) - )) - tensors.append(t) - - t_start = engine._env.now - - all_pending = [] - for sip_rank, t in enumerate(tensors): - pending = ctx.launch( - algo_name, kernel_fn, t, - n_elem, cube_w, cube_h, n_sips, sip_rank, - sip_topo_kind, sip_topo_w, sip_topo_h, - _defer_wait=True, - ) - all_pending.extend(pending) - - for h, _sip_id, meta in all_pending: - ctx.wait(h, _meta=meta) - - t_end = engine._env.now - latency_ns = t_end - t_start - - expected = 
float(n_cubes * sum(range(1, n_sips + 1))) - - ok_cubes = 0 - for sip in range(n_sips): - arr = tensors[sip].numpy() - for cube_id in range(n_cubes): - assert np.allclose( - arr[cube_id], expected, rtol=1e-1, atol=1e-1, - ), ( - f"SIP{sip} cube {cube_id}: " - f"got {arr[cube_id][:4]}, expected {expected}" - ) - ok_cubes += 1 - - return { - "expected": expected, - "latency_ns": latency_ns, - "ok_cubes": ok_cubes, - } - - -# ── Latency sweep constants + aggregator ────────────────────────────── - -# avoid 16 (== n_cubes, dim_map collision). Goes up to 96 KB per PE: -# bytes_per_pe = n_elem * 2 (f16). 49152 elem * 2 = 96 KB / PE. -_SWEEP_N_ELEM = [ - 8, 32, 64, 128, 512, 1024, 2048, - 4096, 8192, 16384, 32768, 49152, -] -_ELEM_BYTES_F16 = 2 - -_SWEEP_TOPOLOGIES = [ - ("lrab_hierarchical_allreduce", "ring_1d", 6, None, None), - ("lrab_hierarchical_allreduce", "torus_2d", 6, 2, 3), - ("lrab_hierarchical_allreduce", "mesh_2d_no_wrap", 6, 2, 3), -] - -# Shared on-disk staging dir for parametrized sweep rows. Each -# parametrized invocation writes one JSON file here; the aggregator -# (run from conftest.pytest_sessionfinish) reads them and emits the -# combined CSV + PNG plots. -_SWEEP_OUT_DIR = (Path(__file__).parent.parent.parent / "docs" / "diagrams" - / "allreduce_latency_plots") -_SWEEP_ROWS_DIR = _SWEEP_OUT_DIR / "_rows" - - def _sweep_params(): out = [] for algorithm, sip_topology, n_sips, sip_w, sip_h in _SWEEP_TOPOLOGIES: @@ -346,668 +104,9 @@ def _sweep_params(): return out -def _aggregate_sweep_plots() -> bool: - """Read all per-config rows and emit CSV + PNG plots. - - Called by ``conftest.pytest_sessionfinish`` (controller node only). - Returns True if any rows were aggregated, False otherwise. 
- """ - import csv - import json - - row_files = sorted(_SWEEP_ROWS_DIR.glob("*.json")) \ - if _SWEEP_ROWS_DIR.exists() else [] - records: list[dict] = [] - if row_files: - for p in row_files: - with open(p, encoding="utf-8") as f: - records.append(json.load(f)) - else: - # Fallback: replot from existing summary.csv (skip sweep re-run). - summary_path = _SWEEP_OUT_DIR / "summary.csv" - if not summary_path.exists(): - return False - with open(summary_path, encoding="utf-8") as f: - for row in csv.DictReader(f): - records.append({ - "algorithm": row["algorithm"], - "sip_topology": row["sip_topology"], - "n_sips": int(row["n_sips"]), - "n_elem": int(row["n_elem"]), - "bytes_per_pe": int(row["bytes_per_pe"]), - "bytes_per_sip": int(row["bytes_per_sip"]), - "latency_ns": float(row["latency_ns"]), - }) - if not records: - return False - - import matplotlib.pyplot as plt - from matplotlib.ticker import FuncFormatter - - def _fmt_bytes(x, _pos): - if x <= 0: - return "0" - if x >= 1024 * 1024: - return f"{x / (1024 * 1024):.0f} MB" - if x >= 1024: - return f"{x / 1024:.0f} KB" - return f"{x:.0f} B" - - _bytes_fmt = FuncFormatter(_fmt_bytes) - - _SWEEP_OUT_DIR.mkdir(parents=True, exist_ok=True) - with open(_SWEEP_OUT_DIR / "summary.csv", "w", - newline="", encoding="utf-8") as f: - w = csv.DictWriter(f, fieldnames=[ - "algorithm", "sip_topology", "n_sips", "n_elem", - "bytes_per_pe", "bytes_per_sip", "latency_ns", - ]) - w.writeheader() - for r in sorted(records, key=lambda r: ( - r["sip_topology"], r["bytes_per_pe"], - )): - w.writerow(r) - - topologies = sorted({r["sip_topology"] for r in records}) - for topo_name in topologies: - rs = sorted( - [r for r in records if r["sip_topology"] == topo_name], - key=lambda r: r["bytes_per_pe"], - ) - if not rs: - continue - xs = [r["bytes_per_pe"] for r in rs] - ys = [r["latency_ns"] for r in rs] - _per_topo_titles = { - "ring_1d": "AllReduce_LRAB_Ring1D_6SiP(1x6)", - "torus_2d": "AllReduce_LRAB_2Dtorus_6SiP(2x3)", - 
"mesh_2d_no_wrap": "AllReduce_LRAB_2DMesh_6SiP(2x3)", - } - # Descriptive output filenames (parens → underscores for - # markdown/URL safety; topo key stays the summary.csv value). - _per_topo_files = { - "ring_1d": "AllReduce_LRAB_Ring1D_6SiP_1x6", - "torus_2d": "AllReduce_LRAB_2Dtorus_6SiP_2x3", - "mesh_2d_no_wrap": "AllReduce_LRAB_2DMesh_6SiP_2x3", - } - title = _per_topo_titles.get( - topo_name, f"Allreduce latency — {topo_name}" - ) - out_stem = _per_topo_files.get(topo_name, topo_name) - fig, ax = plt.subplots(figsize=(8, 5)) - ax.plot(xs, ys, marker="o", color="tab:blue") - ax.set_xscale("log", base=2) - ax.set_xlabel("Bytes per PE (log scale)") - ax.set_ylabel("Time (ns)") - ax.set_title(title) - ax.grid(True, alpha=0.3) - ax.xaxis.set_major_formatter(_bytes_fmt) - fig.tight_layout() - fig.savefig(_SWEEP_OUT_DIR / f"{out_stem}.png", dpi=120) - plt.close(fig) - - # Combined overview.png is no longer emitted — the broken-y-axis - # comparison (emit_comparison_fsim_plot() below → - # comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png) - # supersedes it. Per-topology plots above and summary.csv are still - # produced. - - # Cleanup row staging dir so a partial future run doesn't pick up - # stale rows. - for p in row_files: - try: - p.unlink() - except OSError: - pass - try: - _SWEEP_ROWS_DIR.rmdir() - except OSError: - pass - - print(f"\nWrote per-topology plots + summary.csv to {_SWEEP_OUT_DIR} " - f"from {len(records)} rows") - return True - - -# ── Buffer-kind sweep constants + aggregator ────────────────────────── -# -# Parametrized over (buffer_kind, n_elem) on torus_2d 6 SIPs (3×2). Pre -# slot-latency modeling the three lines overlap exactly (slot access is -# latency-free today); they spread out once tcm/sram/hbm carry distinct -# access costs. 
- -_BUFFER_KINDS = ["tcm", "sram", "hbm"] -_BK_N_ELEM_GRID = [128, 1024, 8192, 32768] # 256 B → 64 KB per slot -_BK_ROWS_DIR = _SWEEP_OUT_DIR / "_buffer_kind_rows" -# Descriptive output stem (shared by the .png and .csv). -_BK_OUT_STEM = "AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM" - - def _bk_params(): out = [] for bk in _BUFFER_KINDS: for n_elem in _BK_N_ELEM_GRID: out.append(pytest.param(bk, n_elem, id=f"{bk}-n_elem{n_elem}")) return out - - -def aggregate_buffer_kind_plot() -> bool: - """Read per-config rows and emit the descriptive .png + .csv (_BK_OUT_STEM). - - Called from conftest.pytest_sessionfinish (controller-only). - Returns True if rows were aggregated. - """ - import csv - import json - - if not _BK_ROWS_DIR.exists(): - return False - row_files = sorted(_BK_ROWS_DIR.glob("*.json")) - if not row_files: - return False - - records = [] - for p in row_files: - with open(p, encoding="utf-8") as f: - records.append(json.load(f)) - - import matplotlib.pyplot as plt - from matplotlib.ticker import FuncFormatter - - def _fmt_bytes(x, _pos): - if x <= 0: - return "0" - if x >= 1024 * 1024: - return f"{x / (1024 * 1024):.0f} MB" - if x >= 1024: - return f"{x / 1024:.0f} KB" - return f"{x:.0f} B" - - _bytes_fmt = FuncFormatter(_fmt_bytes) - - _SWEEP_OUT_DIR.mkdir(parents=True, exist_ok=True) - with open(_SWEEP_OUT_DIR / f"{_BK_OUT_STEM}.csv", "w", - newline="", encoding="utf-8") as f: - w = csv.DictWriter(f, fieldnames=[ - "buffer_kind", "sip_topology", "n_sips", "n_elem", - "bytes_per_pe", "latency_ns", - ]) - w.writeheader() - for r in sorted(records, key=lambda r: ( - r["buffer_kind"], r["bytes_per_pe"], - )): - w.writerow(r) - - colors = {"tcm": "tab:blue", "sram": "tab:orange", "hbm": "tab:red"} - fig, ax = plt.subplots(figsize=(10, 6)) - for bk in ["tcm", "sram", "hbm"]: - rs = sorted( - [r for r in records if r["buffer_kind"] == bk], - key=lambda r: r["bytes_per_pe"], - ) - if not rs: - continue - ax.plot( - [r["bytes_per_pe"] for r in rs], - 
[r["latency_ns"] for r in rs], - marker="o", lw=2.0, - color=colors[bk], label=f"buffer_kind = {bk}", - ) - ax.set_xscale("log", base=2) - ax.set_xlabel("Bytes per PE (log scale)") - ax.set_ylabel("Time (ns)") - ax.set_title( - "AllReduce_LRAB_2Dtorus_6SiP(2x3) — IPCQ memory (SRAM, TCM, HBM)" - ) - ax.grid(True, alpha=0.3) - ax.legend() - ax.xaxis.set_major_formatter(_bytes_fmt) - fig.tight_layout() - fig.savefig(_SWEEP_OUT_DIR / f"{_BK_OUT_STEM}.png", dpi=130) - plt.close(fig) - - for p in row_files: - try: - p.unlink() - except OSError: - pass - try: - _BK_ROWS_DIR.rmdir() - except OSError: - pass - - print(f"\nWrote {_SWEEP_OUT_DIR / f'{_BK_OUT_STEM}.png'} " - f"from {len(records)} rows") - return True - - -# ── Topology diagram (device-level + cube-level reduction) ──────────── - -# Convention: "rows × cols" everywhere, row-major rank assignment -# (rank = row * n_cols + col). For the 2×3 inter-SIP grid, this means -# 2 rows × 3 columns: SIP 0 1 2 / SIP 3 4 5. - -_PALETTE_BG = "#fafbfd" -_PALETTE_FRAME = "#3a3f4a" -_PALETTE_BLUE = "#2c6fb6" -_PALETTE_GREEN = "#2e8a4e" -_PALETTE_TEXT = "#1f2530" -_PALETTE_BOX_FILL = "#eaf2fb" -_PALETTE_BOX_EDGE = "#2c4a78" -_PALETTE_ROOT_FILL = "#ffd9b8" -_PALETTE_ROOT_EDGE = "#bd5a14" - - -def _arrow(ax, xy_from, xy_to, color="black", lw=1.4, alpha=1.0, - style="-|>", curve=0.0): - from matplotlib.patches import FancyArrowPatch - arrow = FancyArrowPatch( - xy_from, xy_to, - arrowstyle=style, mutation_scale=12, - color=color, lw=lw, alpha=alpha, - connectionstyle=f"arc3,rad={curve}", - ) - ax.add_patch(arrow) - - -def _draw_sip_box(ax, cx, cy, w, h, label, *, fill=_PALETTE_BOX_FILL, - edge=_PALETTE_BOX_EDGE, text_color=_PALETTE_TEXT, - font=10): - from matplotlib.patches import FancyBboxPatch - box = FancyBboxPatch( - (cx - w / 2, cy - h / 2), w, h, - boxstyle="round,pad=0.02,rounding_size=0.10", - linewidth=1.4, edgecolor=edge, facecolor=fill, - ) - ax.add_patch(box) - ax.text(cx, cy, label, ha="center", va="center", - 
color=text_color, fontsize=font, fontweight="bold") - - -def _frame_panel(ax, title, lim_x=10.0, lim_y=6.0): - """Set up a square-ish panel with a visible outer border.""" - from matplotlib.patches import FancyBboxPatch - ax.set_xlim(0, lim_x) - ax.set_ylim(0, lim_y) - ax.set_aspect("equal") - ax.axis("off") - ax.set_facecolor(_PALETTE_BG) - border = FancyBboxPatch( - (0.05, 0.05), lim_x - 0.10, lim_y - 0.10, - boxstyle="round,pad=0.01,rounding_size=0.12", - linewidth=1.4, edgecolor=_PALETTE_FRAME, facecolor=_PALETTE_BG, - zorder=0, - ) - ax.add_patch(border) - ax.set_title(title, fontsize=12, fontweight="bold", - color=_PALETTE_TEXT, pad=8) - - -def _draw_ring_topology(ax): - _frame_panel(ax, "ring_1d (6 SIPs)", lim_x=10.0, lim_y=6.0) - - xs = [1.2, 2.7, 4.2, 5.7, 7.2, 8.7] - y = 3.1 - box_w, box_h = 1.05, 0.9 - for i, x in enumerate(xs): - _draw_sip_box(ax, x, y, box_w, box_h, f"SIP {i}") - # Forward ring (global_E) — adjacent neighbours, anchored to box edges. - for i in range(5): - _arrow(ax, (xs[i] + box_w / 2, y), - (xs[i + 1] - box_w / 2, y), - color=_PALETTE_BLUE, lw=1.6) - # Wrap (SIP 5 → SIP 0). Anchor at right-CENTER of SIP 5 and - # left-CENTER of SIP 0; arc OUTSIDE (above) the row so it does not - # overlap any of the SIP boxes in between. - _arrow( - ax, - (xs[5] + box_w / 2, y), - (xs[0] - box_w / 2, y), - color=_PALETTE_BLUE, lw=1.6, curve=-0.40, - ) - ax.text(5.0, y + 2.0, "global_E (ring)", ha="center", - color=_PALETTE_BLUE, fontsize=10, style="italic") - ax.text(5.0, y - 1.5, - "(global_W = reverse direction, used by the algorithm)", - ha="center", color="gray", fontsize=8, style="italic") - - -def _draw_grid_topology(ax, kind, *, n_rows=2, n_cols=3): - """kind ∈ {'torus', 'mesh'}. Lays out as n_rows × n_cols (row-major). 
- - For the sweep we use 2 rows × 3 cols → SIP layout:: - - row 0: SIP 0 SIP 1 SIP 2 - row 1: SIP 3 SIP 4 SIP 5 - """ - title = f"torus_2d ({n_rows}×{n_cols}, 6 SIPs)" if kind == "torus" \ - else f"mesh_2d_no_wrap ({n_rows}×{n_cols}, 6 SIPs)" - _frame_panel(ax, title, lim_x=10.0, lim_y=6.0) - - col_xs = [2.0, 5.0, 8.0] # 3 cols - row_ys = [4.3, 1.8] # 2 rows - box_w, box_h = 1.3, 0.95 - pos: dict[tuple[int, int], tuple[float, float]] = {} - for r in range(n_rows): - for c in range(n_cols): - rank = r * n_cols + c - x, y = col_xs[c], row_ys[r] - pos[(r, c)] = (x, y) - _draw_sip_box(ax, x, y, box_w, box_h, f"SIP {rank}") - - # Row edges (E↔W) — between adjacent columns within each row. - for r in range(n_rows): - for c in range(n_cols - 1): - x0, y0 = pos[(r, c)] - x1, y1 = pos[(r, c + 1)] - _arrow(ax, (x0 + box_w / 2, y0 + 0.10), - (x1 - box_w / 2, y1 + 0.10), - color=_PALETTE_BLUE, lw=1.5) - _arrow(ax, (x1 - box_w / 2, y1 - 0.10), - (x0 + box_w / 2, y0 - 0.10), - color=_PALETTE_BLUE, lw=1.5) - # Col edges (N↔S) — between adjacent rows within each column. - for c in range(n_cols): - for r in range(n_rows - 1): - x0, y0 = pos[(r, c)] - x1, y1 = pos[(r + 1, c)] - _arrow(ax, (x0 - 0.12, y0 - box_h / 2), - (x1 - 0.12, y1 + box_h / 2), - color=_PALETTE_GREEN, lw=1.5) - _arrow(ax, (x1 + 0.12, y1 + box_h / 2), - (x0 + 0.12, y0 - box_h / 2), - color=_PALETTE_GREEN, lw=1.5) - # Wrap arrows for torus only — anchor to the centre of the OUTER - # edge of the end SIPs and arc OUTSIDE the row/column so they do - # not overlap the SIPs in between. - if kind == "torus": - # Row wrap: last col → first col. Top row arcs UP, bottom row - # arcs DOWN, so each wrap sits clearly outside its own row. - for r in range(n_rows): - x0, y0 = pos[(r, 0)] - x1, y1 = pos[(r, n_cols - 1)] - curve = -0.45 if r == 0 else 0.45 - _arrow( - ax, - (x1 + box_w / 2, y1), - (x0 - box_w / 2, y0), - color=_PALETTE_BLUE, lw=1.5, - curve=curve, alpha=0.9, - ) - # Col wrap: last row → first row. 
Leftmost col arcs LEFT, - # rightmost col arcs RIGHT. Middle col(s) get a small inline - # marker + legend note (drawing them through the panel would - # collide with the row arrows). - for c in range(n_cols): - x0, y0 = pos[(0, c)] - x1, y1 = pos[(n_rows - 1, c)] - if c == 0: - curve = 0.55 - elif c == n_cols - 1: - curve = -0.55 - else: - continue # skip middle col — see legend note - _arrow( - ax, - (x1, y1 - box_h / 2), - (x0, y0 + box_h / 2), - color=_PALETTE_GREEN, lw=1.5, - curve=curve, alpha=0.9, - ) - - ax.text(0.7, 5.6, "global_E/W (row)", color=_PALETTE_BLUE, - fontsize=9, style="italic", fontweight="bold") - ax.text(0.7, 5.25, "global_N/S (col)", color=_PALETTE_GREEN, - fontsize=9, style="italic", fontweight="bold") - ax.text(0.7, 4.92, - "wrap = torus" if kind == "torus" else "no wrap = mesh", - color="gray", fontsize=8, style="italic") - if kind == "torus" and n_cols > 2: - ax.text(0.7, 0.3, - "(middle-col wrap omitted for clarity — every row " - "and every column wraps)", - color="gray", fontsize=7.5, style="italic") - - -def _draw_cube_reduction(ax): - """4×4 cube grid inside SIP 0 — compact layout with phase legend.""" - from matplotlib.patches import Rectangle - _frame_panel(ax, "Cube-level reduction inside SIP 0 (4×4 cubes)", - lim_x=10.0, lim_y=6.0) - - cube_w = 0.65 - cube_gap = 0.18 - # Center the 4×4 grid in the left half of the panel. 
- grid_total = 4 * cube_w + 3 * cube_gap - grid_x0 = 0.7 - grid_y0 = 0.7 - centers: dict[tuple[int, int], tuple[float, float]] = {} - for r in range(4): - for c in range(4): - cx = grid_x0 + c * (cube_w + cube_gap) + cube_w / 2 - cy = grid_y0 + (3 - r) * (cube_w + cube_gap) + cube_w / 2 - centers[(r, c)] = (cx, cy) - cube_id = r * 4 + c - is_root = (r == 3 and c == 3) - face = _PALETTE_ROOT_FILL if is_root else _PALETTE_BOX_FILL - edge = _PALETTE_ROOT_EDGE if is_root else _PALETTE_BOX_EDGE - rect = Rectangle( - (cx - cube_w / 2, cy - cube_w / 2), cube_w, cube_w, - linewidth=1.2, edgecolor=edge, facecolor=face, - ) - ax.add_patch(rect) - label = f"c{cube_id}" - ax.text(cx, cy, label, ha="center", va="center", - fontsize=7.5, fontweight="bold", - color=_PALETTE_ROOT_EDGE if is_root - else _PALETTE_TEXT) - - # Phase 1: row reduce W→E. - for r in range(4): - for c in range(3): - x0, y0 = centers[(r, c)] - x1, y1 = centers[(r, c + 1)] - _arrow(ax, (x0 + cube_w / 2, y0), (x1 - cube_w / 2, y1), - color=_PALETTE_BLUE, lw=1.5) - # Phase 2: col reduce N→S along rightmost column. - for r in range(3): - x0, y0 = centers[(r, 3)] - x1, y1 = centers[(r + 1, 3)] - _arrow(ax, (x0, y0 - cube_w / 2), (x1, y1 + cube_w / 2), - color=_PALETTE_GREEN, lw=1.7) - - # Phase legend on the right side. 
- legend_x = grid_x0 + grid_total + 0.55 - ax.text(legend_x, 5.0, "Phase 1: row reduce (W → E)", - color=_PALETTE_BLUE, fontsize=10, fontweight="bold") - ax.text(legend_x, 4.55, "Phase 2: col reduce (N → S, rightmost col)", - color=_PALETTE_GREEN, fontsize=10, fontweight="bold") - ax.text(legend_x, 4.10, "Phase 3: inter-SIP exchange at root cube", - color=_PALETTE_ROOT_EDGE, fontsize=10, fontweight="bold") - ax.text(legend_x, 3.65, "Phase 4: col broadcast (S → N)", - color=_PALETTE_GREEN, fontsize=10, style="italic") - ax.text(legend_x, 3.20, "Phase 5: row broadcast (E → W)", - color=_PALETTE_BLUE, fontsize=10, style="italic") - ax.text(legend_x, 2.55, - "(broadcast phases reverse phases 2 & 1)", - color="gray", fontsize=8.5, style="italic") - ax.text(legend_x, 1.7, - "Root cube (c15, bottom-right) is the only\n" - "cube that performs the inter-SIP exchange.", - color=_PALETTE_ROOT_EDGE, fontsize=9, style="italic") - - -def emit_topology_diagram() -> str: - """Emit a 2×2-panel topology diagram into docs/diagrams/allreduce_latency_plots/. 
- - Top row: ring_1d | torus_2d (2×3) - Bot row: mesh_2d_no_wrap (2×3) | cube-level reduction in SIP 0 - """ - import matplotlib.gridspec as gridspec - import matplotlib.pyplot as plt - - _SWEEP_OUT_DIR.mkdir(parents=True, exist_ok=True) - fig = plt.figure(figsize=(16, 10), facecolor="white") - gs = gridspec.GridSpec(2, 2, figure=fig, hspace=0.30, wspace=0.10) - ax_ring = fig.add_subplot(gs[0, 0]) - ax_torus = fig.add_subplot(gs[0, 1]) - ax_mesh = fig.add_subplot(gs[1, 0]) - ax_cube = fig.add_subplot(gs[1, 1]) - - _draw_ring_topology(ax_ring) - _draw_grid_topology(ax_torus, "torus", n_rows=2, n_cols=3) - _draw_grid_topology(ax_mesh, "mesh", n_rows=2, n_cols=3) - _draw_cube_reduction(ax_cube) - - fig.suptitle( - "Allreduce topology — device-level (top: ring, torus, mesh) " - "and cube-level reduction in SIP 0", - fontsize=14, fontweight="bold", color=_PALETTE_TEXT, y=0.98, - ) - out_path = _SWEEP_OUT_DIR / "topology.png" - fig.savefig(out_path, dpi=130, bbox_inches="tight", - facecolor=fig.get_facecolor()) - plt.close(fig) - return str(out_path) - - -# ── Comparison vs FSIM (broken-y-axis) ──────────────────────────────── -# -# Post-processes summary.csv: today's three model curves + a hand-derived -# theoretical torus_2d line in the bottom panel, and a single external FSIM -# single-device reference marker in the top panel (hardcoded 366 µs; no -# external data file). Reads summary.csv written by _aggregate_sweep_plots. - -_FSIM_EXT_LABEL = "FSIM (single device): 366 µs" -_FSIM_EXT_LATENCY_NS = 366_000.0 -_CMP_COLORS = { - "ring_1d": "tab:blue", - "torus_2d": "tab:orange", - "mesh_2d_no_wrap": "tab:green", -} -_CMP_DISPLAY = { - "ring_1d": "Ring 1x6 (6 devices)", - "torus_2d": "2D Torus 2x3 (6 devices)", - "mesh_2d_no_wrap": "2D Mesh 2x3 (6 devices)", -} -# Hand-derived theoretical model for torus_2d (6 SIPs): per-PE NOC-packet -# count fit to the simulated startup + per-packet tau. 
-_CMP_NOC_PACKET_BYTES = 128 -_CMP_PES_PER_CUBE = 8 -_CMP_T_STARTUP_NS = 1346.0 -_CMP_TAU_NS = (8741.0 - 1346.0) / (6144 - 1) - - -def emit_comparison_fsim_plot() -> str | None: - """Render comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png. - - Reads ``summary.csv`` (written by ``_aggregate_sweep_plots``). Returns the - output path, or ``None`` if summary.csv is absent / empty. - """ - import csv - - csv_path = _SWEEP_OUT_DIR / "summary.csv" - if not csv_path.exists(): - return None - records = [] - with open(csv_path, newline="", encoding="utf-8") as f: - for row in csv.DictReader(f): - records.append({ - "sip_topology": row["sip_topology"], - "bytes_per_pe": int(row["bytes_per_pe"]), - "latency_ns": float(row["latency_ns"]), - }) - if not records: - return None - - import matplotlib.pyplot as plt - import matplotlib.ticker as mticker - - def _theoretical_torus_2d_ns(bytes_per_pe: int) -> float: - bytes_per_cube = int(bytes_per_pe) * _CMP_PES_PER_CUBE - n_packets = max(1, -(-bytes_per_cube // _CMP_NOC_PACKET_BYTES)) - return _CMP_T_STARTUP_NS + (n_packets - 1) * _CMP_TAU_NS - - def _bytes_fmt(x, _pos): - if x >= 1024 * 1024: - return f"{x / (1024 * 1024):.0f}M" - if x >= 1024: - return f"{x / 1024:.0f}K" - return f"{int(x)}" - - topologies = sorted({r["sip_topology"] for r in records}) - max_local = max(r["latency_ns"] for r in records) - ext_x = max(r["bytes_per_pe"] for r in records) - - fig, (ax_top, ax_bot) = plt.subplots( - 2, 1, sharex=True, - gridspec_kw={"height_ratios": [1, 4], "hspace": 0.05}, - figsize=(9, 6.5), - ) - - # Bottom panel: model curves + theoretical torus, linear y. 
- for topo in topologies: - rs = sorted([r for r in records if r["sip_topology"] == topo], - key=lambda r: r["bytes_per_pe"]) - if not rs: - continue - ax_bot.plot( - [r["bytes_per_pe"] for r in rs], - [r["latency_ns"] for r in rs], - marker="o", label=_CMP_DISPLAY.get(topo, topo), - color=_CMP_COLORS.get(topo), - ) - torus_rs = sorted( - [r for r in records if r["sip_topology"] == "torus_2d"], - key=lambda r: r["bytes_per_pe"], - ) - if torus_rs: - ax_bot.plot( - [r["bytes_per_pe"] for r in torus_rs], - [_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs], - color="tab:red", linestyle="--", linewidth=1.6, marker="x", - label="Theoretical 2D Torus 2x3", - ) - ax_bot.set_ylim(0, max_local * 1.10) - - # Top panel: external FSIM single-device reference marker. - ax_top.scatter( - [ext_x], [_FSIM_EXT_LATENCY_NS], - marker="*", s=240, color="tab:red", zorder=5, - label=_FSIM_EXT_LABEL, - ) - ax_top.set_ylim(_FSIM_EXT_LATENCY_NS * 0.93, _FSIM_EXT_LATENCY_NS * 1.05) - - # Hide spine between panels; draw diagonal break ticks. 
- ax_top.spines["bottom"].set_visible(False) - ax_bot.spines["top"].set_visible(False) - ax_top.tick_params(labeltop=False, bottom=False) - ax_bot.xaxis.tick_bottom() - d = 0.012 - kw = dict(transform=ax_top.transAxes, color="k", clip_on=False, lw=1) - ax_top.plot((-d, +d), (-d, +d), **kw) - ax_top.plot((1 - d, 1 + d), (-d, +d), **kw) - kw.update(transform=ax_bot.transAxes) - ax_bot.plot((-d, +d), (1 - d * 4, 1 + d * 4), **kw) - ax_bot.plot((1 - d, 1 + d), (1 - d * 4, 1 + d * 4), **kw) - - ax_bot.set_xscale("log", base=2) - ax_bot.set_xlabel("Bytes per PE (log scale)") - ax_bot.set_ylabel("Time (ns)") - ax_top.set_ylabel("Time (ns)") - ax_bot.grid(True, alpha=0.3) - ax_top.grid(True, alpha=0.3) - ax_bot.xaxis.set_major_formatter(mticker.FuncFormatter(_bytes_fmt)) - - handles_bot, labels_bot = ax_bot.get_legend_handles_labels() - handles_top, labels_top = ax_top.get_legend_handles_labels() - ax_bot.legend(handles_bot + handles_top, labels_bot + labels_top, - loc="upper left") - - fig.suptitle("Multidevice allreduce (ring, Mesh, 2DTorus) vs FSIM latency") - fig.tight_layout() - out = (_SWEEP_OUT_DIR - / "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png") - fig.savefig(out, dpi=120) - plt.close(fig) - return str(out) diff --git a/tests/test_milestone_benches.py b/tests/test_milestone_benches.py new file mode 100644 index 0000000..cf7fb5f --- /dev/null +++ b/tests/test_milestone_benches.py @@ -0,0 +1,77 @@ +"""Milestone benches: registration + figure/result generation (ADR-0054). + +``milestone-1h-gemm`` / ``milestone-1h-ccl`` are eval benches: run via the +normal ``run_bench`` path, they regenerate every GEMM / allreduce figure + +CSV into ``benches/1H_milestone_output/{gemm,ccl}/``. The GEMM bench in +``MILESTONE_FAST=1`` mode just re-renders the committed sweep JSON (fast, +default-run here); the CCL bench drives both full sweeps (slow, opt-in). 
+""" +from __future__ import annotations + +import re +from pathlib import Path + +import pytest + +from kernbench.benches.registry import resolve +from kernbench.runtime_api.bench_runner import run_bench +from kernbench.runtime_api.types import resolve_device +from kernbench.sim_engine.engine import GraphEngine +from kernbench.topology.builder import resolve_topology + +import kernbench.benches.milestone_1h_ccl as ccl_bench +import kernbench.benches.milestone_1h_gemm as gemm_bench + +_NAME_RE = re.compile(r"^[a-z][a-z0-9]*(-[a-z0-9]+)*$") + + +def _run(name: str): + topo = resolve_topology("topology.yaml") + return run_bench( + topology=topo, bench_fn=resolve(name).run, device=resolve_device(None), + engine_factory=lambda t, d: GraphEngine( + getattr(t, "topology_obj", t), enable_data=True, + ), + ) + + +def test_milestone_benches_registered(): + for name in ("milestone-1h-gemm", "milestone-1h-ccl"): + spec = resolve(name) + assert spec.name == name + assert _NAME_RE.match(spec.name) + assert spec.description.strip() + + +@pytest.mark.skipif( + not gemm_bench.DEFAULT_SWEEP_JSON.exists(), + reason="gemm_sweep.json absent; run scripts/gemm_sweep.py first", +) +def test_milestone_gemm_fast_generates_figures(monkeypatch): + monkeypatch.setenv("MILESTONE_FAST", "1") + result = _run("milestone-1h-gemm") + assert result.completion.ok, result.completion + out = gemm_bench._OUTPUT_DIR + for png in ( + "gemm_stage_breakdown.png", + "gemm_mac_utilization_measured.png", + "gemm_mac_utilization_theoretical_vs_measured.png", + ): + assert (out / png).exists(), f"missing {png}" + + +@pytest.mark.slow +def test_milestone_ccl_generates_figures(): + result = _run("milestone-1h-ccl") + assert result.completion.ok, result.completion + out = ccl_bench._OUTPUT_DIR + for artifact in ( + "summary.csv", + "topology.png", + "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png", + "AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.png", + "AllReduce_LRAB_Ring1D_6SiP_1x6.png", + 
"AllReduce_LRAB_2Dtorus_6SiP_2x3.png", + "AllReduce_LRAB_2DMesh_6SiP_2x3.png", + ): + assert (out / artifact).exists(), f"missing {artifact}" diff --git a/tools/generate_adr_index.py b/tools/generate_adr_index.py index 5a7df60..54e3b53 100644 --- a/tools/generate_adr_index.py +++ b/tools/generate_adr_index.py @@ -93,6 +93,7 @@ CLASSIFICATION: dict[int, tuple[str, str | None]] = { 51: (IMPL_DECISIONS, "Routing & Helper API"), 52: (IMPL_DECISIONS, "Sim-engine Op Log and Memory Store Schemas"), 53: (IMPL_DECISIONS, "Topology Compilation, Diagrams & Builder Algorithms"), + 54: (IMPL_DECISIONS, "Evaluation Harnesses"), } # Canonical component order for the Detailed Architecture section.