kernbench2/tests/test_allreduce_buffer_kind_sweep.py

"""Phase 1 buffer-kind allreduce sweep — torus_2d 6 SIPs.

Parametrized over (buffer_kind, n_elem). Each case runs the standard
config-driven allreduce app and writes a JSON row to a shared staging
dir; the conftest sessionfinish hook (added in Phase 1) aggregates the
rows into ``docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.png``
and a matching ``buffer_kind_sweep.csv`` in the same directory.

Pre-Phase-2: the three buffer-kind lines overlap exactly because slot
access is latency-free today. Post-Phase-2 they spread out (tcm
fastest, hbm slowest).
"""
from __future__ import annotations

import json
from pathlib import Path

import pytest
import yaml

from kernbench.runtime_api.context import RuntimeContext
from kernbench.runtime_api.types import DeviceSelector
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology

# Reuse the allreduce app helpers.
from tests.test_allreduce_multidevice import (
    _write_temp_configs,
    run_allreduce,
)


# ``buffer_kind`` values covered by the sweep; this is the outer axis of the
# parametrization grid built by ``_bk_params``.
_BUFFER_KINDS = ["tcm", "sram", "hbm"]
# Element counts covered by the sweep (inner axis of the grid). Multiplied by
# ``_ELEM_BYTES_F16`` this gives the ``bytes_per_pe`` value stored in each row.
_N_ELEM_GRID = [128, 1024, 8192, 32768]   # 256 B → 64 KB per slot
# Bytes per element used to convert ``n_elem`` into ``bytes_per_pe``.
_ELEM_BYTES_F16 = 2

# Directory that receives the aggregated PNG and CSV: the ``docs/diagrams``
# tree next to this file's parent directory.
_OUT_DIR = (Path(__file__).parent.parent / "docs" / "diagrams"
            / "allreduce_latency_plots")
# Staging directory for the per-case JSON rows; each test case writes one file
# here and ``aggregate_buffer_kind_plot`` reads them back and then removes them.
_ROWS_DIR = _OUT_DIR / "_buffer_kind_rows"


def _bk_params():
    """Return the pytest parameter grid for the sweep.

    The grid is the cross product of ``_BUFFER_KINDS`` and ``_N_ELEM_GRID``
    with the buffer kind as the outer (slowest-varying) axis. Each entry is a
    ``pytest.param`` carrying a readable id of the form ``<kind>-n_elem<N>``.
    """
    return [
        pytest.param(kind, count, id=f"{kind}-n_elem{count}")
        for kind in _BUFFER_KINDS
        for count in _N_ELEM_GRID
    ]


@pytest.mark.parametrize("buffer_kind,n_elem", _bk_params())
def test_buffer_kind_allreduce_one(tmp_path, buffer_kind, n_elem):
    """Run a single (buffer_kind, n_elem) point of the sweep and stage its row.

    The case generates temporary topology/ccl configs for a 3x2 ``torus_2d``
    of 6 SIPs, forces ``buffer_kind`` in the ccl config, runs the allreduce
    app, and writes one JSON record into ``_ROWS_DIR`` for later aggregation.
    Cases are independent of each other, so xdist may run them in parallel.
    """
    work_dir = tmp_path / f"{buffer_kind}_{n_elem}"
    work_dir.mkdir()
    topo_path, ccl_path = _write_temp_configs(
        work_dir,
        sip_topology="torus_2d",
        n_sips=6,
        algorithm="intercube_allreduce",
        sip_w=3, sip_h=2,
        n_elem_override=n_elem,
    )

    # Patch the generated ccl.yaml so that both the global defaults and the
    # algorithm-specific section select the buffer kind under test.
    with open(ccl_path) as src:
        ccl_cfg = yaml.safe_load(src)
    defaults_section = ccl_cfg.setdefault("defaults", {})
    defaults_section["buffer_kind"] = buffer_kind
    algorithms_section = ccl_cfg.setdefault("algorithms", {})
    algo_section = algorithms_section.setdefault("intercube_allreduce", {})
    algo_section["buffer_kind"] = buffer_kind
    with open(ccl_path, "w") as dst:
        yaml.dump(ccl_cfg, dst, default_flow_style=False)

    topo = resolve_topology(topo_path)
    topology_obj = topo.topology_obj
    engine = GraphEngine(topology_obj, enable_data=True)
    spec = topology_obj.spec

    with RuntimeContext(
        engine=engine,
        target_device=DeviceSelector("all"),
        correlation_id=f"bk_sweep_{buffer_kind}_{n_elem}",
        spec=spec,
    ) as ctx:
        result = run_allreduce(
            ctx, engine, spec,
            algorithm="intercube_allreduce", ccl_yaml=ccl_path,
        )
        assert result["ok_cubes"] > 0

    # The recorded latency is the largest ``pe_exec_ns`` among the engine's
    # dict-shaped results; missing or falsy values count as 0.0, and an empty
    # result set yields 0.0.
    crit_ns = max(
        (
            float(trace.get("pe_exec_ns", 0.0) or 0.0)
            for _first, trace in engine._results.values()
            if isinstance(trace, dict)
        ),
        default=0.0,
    )

    record = {
        "buffer_kind": buffer_kind,
        "sip_topology": "torus_2d",
        "n_sips": 6,
        "n_elem": n_elem,
        "bytes_per_pe": n_elem * _ELEM_BYTES_F16,
        "latency_ns": crit_ns,
    }
    _ROWS_DIR.mkdir(parents=True, exist_ok=True)
    row_path = _ROWS_DIR / f"{buffer_kind}_{n_elem}.json"
    row_path.write_text(json.dumps(record), encoding="utf-8")


def _fmt_bytes(x, _pos):
    """Format a byte count as a short axis-tick label (``B`` / ``KB`` / ``MB``)."""
    if x <= 0:
        return "0"
    if x >= 1024 * 1024:
        return f"{x / (1024 * 1024):.0f} MB"
    if x >= 1024:
        return f"{x / 1024:.0f} KB"
    return f"{x:.0f} B"


def _load_row_records(row_files):
    """Parse every staged JSON row file and return the records as a list."""
    records = []
    for path in row_files:
        with open(path, encoding="utf-8") as f:
            records.append(json.load(f))
    return records


def _write_buffer_kind_csv(records, csv_path):
    """Write ``records`` to ``csv_path`` sorted by (buffer_kind, bytes_per_pe)."""
    import csv

    # Sort before opening the file so that a malformed record cannot leave a
    # truncated, header-only CSV behind.
    ordered = sorted(
        records, key=lambda r: (r["buffer_kind"], r["bytes_per_pe"]),
    )
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "buffer_kind", "sip_topology", "n_sips", "n_elem",
            "bytes_per_pe", "latency_ns",
        ])
        writer.writeheader()
        writer.writerows(ordered)


def _remove_row_files(row_files):
    """Best-effort removal of the consumed row files and the staging dir."""
    for path in row_files:
        try:
            path.unlink()
        except OSError:
            # Cleanup is deliberately best-effort: a leftover row file must
            # not turn a successful aggregation into a failure.
            pass
    try:
        # Only succeeds when the directory is empty; anything left behind
        # (e.g. a file that could not be unlinked) keeps it in place.
        _ROWS_DIR.rmdir()
    except OSError:
        pass


def aggregate_buffer_kind_plot() -> bool:
    """Read per-config rows and emit buffer_kind_sweep.png + CSV.

    Called from conftest.pytest_sessionfinish (controller-only).

    Reads every ``*.json`` row in ``_ROWS_DIR``, writes
    ``buffer_kind_sweep.csv`` and ``buffer_kind_sweep.png`` into ``_OUT_DIR``
    (one line per buffer kind, latency versus bytes per PE), then removes the
    consumed row files.

    Returns:
        True if rows were aggregated, False when the staging directory is
        missing or contains no row files.
    """
    if not _ROWS_DIR.exists():
        return False
    row_files = sorted(_ROWS_DIR.glob("*.json"))
    if not row_files:
        return False

    records = _load_row_records(row_files)

    # Imported lazily so that merely importing this test module does not
    # require matplotlib, and before any output is written so that a missing
    # matplotlib fails before touching the output directory.
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter

    _OUT_DIR.mkdir(parents=True, exist_ok=True)
    _write_buffer_kind_csv(records, _OUT_DIR / "buffer_kind_sweep.csv")

    colors = {"tcm": "tab:blue", "sram": "tab:orange", "hbm": "tab:red"}
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        # Iterate the same list the sweep is parametrized over so that a kind
        # added to ``_BUFFER_KINDS`` shows up in the plot as well as the CSV.
        for bk in _BUFFER_KINDS:
            rs = sorted(
                (r for r in records if r["buffer_kind"] == bk),
                key=lambda r: r["bytes_per_pe"],
            )
            if not rs:
                continue
            line_style = {
                "marker": "o", "lw": 2.0, "label": f"buffer_kind = {bk}",
            }
            # Kinds without a fixed colour fall back to matplotlib's cycle.
            if bk in colors:
                line_style["color"] = colors[bk]
            ax.plot(
                [r["bytes_per_pe"] for r in rs],
                [r["latency_ns"] for r in rs],
                **line_style,
            )
        ax.set_xscale("log", base=2)
        ax.set_xlabel("Bytes per PE (log scale)")
        ax.set_ylabel("Time (ns)")
        ax.set_title(
            "Allreduce torus_2d (6 SIPs, 3×2) — IPCQ slot memory tier"
        )
        ax.grid(True, alpha=0.3)
        ax.legend()
        ax.xaxis.set_major_formatter(FuncFormatter(_fmt_bytes))
        fig.tight_layout()
        fig.savefig(_OUT_DIR / "buffer_kind_sweep.png", dpi=130)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

    _remove_row_files(row_files)

    print(f"\nWrote {_OUT_DIR / 'buffer_kind_sweep.png'} "
          f"from {len(records)} rows")
    return True