# kernbench2/tools/generate_adr_index.py

"""Generate docs/adr/INDEX.md (and docs/adr-ko/INDEX.md) from the ADR corpus.

Auto-derives a section-based index following the same classification as
the /report skill — Design Principles / High-level Architecture /
Detailed Architecture (by component) / Implementation Decisions
(by topic). Run before publishing to refresh INDEX.md.

The classification table below is the single source of truth. When a new
ADR is added under docs/adr/, append an entry to ``CLASSIFICATION``. The
script exits 1 if any ADR file is missing from the table or any title
cannot be parsed, so omissions surface in CI.

Usage:
    python tools/generate_adr_index.py [--root <repo-root>] [--check]

  --check : exit 1 if the generated INDEX differs from the on-disk file
            (used by CI to detect un-regenerated indexes).
"""

from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path

# Matches ADR file names such as "ADR-0007-some-slug.md". Group 1 is the
# four-digit ADR number; group 2 is the slug (lowercase letters, digits,
# "_" and "-" only). Files that do not match are ignored by _find_adrs.
ADR_FILENAME_RE = re.compile(r"^ADR-(\d{4})-([a-z0-9_-]+)\.md$")
# Title separator may be ":" (most ADRs) or "—" (em-dash; ADR-0033 uses
# this). The verifier (tools/verify_adr_lang_pairs.py) only checks the
# number, so both styles already coexist in the corpus.
# Group 1 is the four-digit ADR number; group 2 is the title text with
# surrounding whitespace excluded.
TITLE_RE = re.compile(r"^# ADR-(\d{4})\s*[:—]\s*(.+?)\s*$")

# Section identifiers used as the first element of each CLASSIFICATION value.
DESIGN_PRINCIPLES = "Design Principles"
HIGH_LEVEL = "High-level Architecture"
DETAILED = "Detailed Architecture"
IMPL_DECISIONS = "Implementation Decisions"


# (section, subgroup) per ADR number. How _build_index uses the subgroup
# depends on the section:
#   - Design Principles: unused (None).
#   - High-level Architecture: appended to the entry as an italic note.
#   - Detailed Architecture: informational only; that section is rendered
#     from DETAILED_COMPONENTS, not from this subgroup.
#   - Implementation Decisions: the topic heading the ADR is grouped under.
# Add a line here when introducing a new ADR.
CLASSIFICATION: dict[int, tuple[str, str | None]] = {
    # Design Principles
    13: (DESIGN_PRINCIPLES, None),
    33: (DESIGN_PRINCIPLES, None),

    # High-level Architecture
    3:  (HIGH_LEVEL, "System hierarchy (Tray / SIP / CUBE / PE)"),
    7:  (HIGH_LEVEL, "Runtime API ↔ sim_engine boundaries"),
    16: (HIGH_LEVEL, "IOChiplet NOC and memory data path"),
    17: (HIGH_LEVEL, "Cube NOC and HBM connectivity"),

    # Detailed Architecture (subgroup names the component; ADR 14 uses the
    # umbrella label "pe_pipeline", which is not itself a DETAILED_COMPONENTS
    # entry)
    14: (DETAILED, "pe_pipeline"),  # covers pe_cpu/pe_dma/pe_fetch_store/pe_gemm/pe_math/pe_scheduler
    23: (DETAILED, "pe_ipcq"),
    34: (DETAILED, "hbm_ctrl"),
    35: (DETAILED, "m_cpu"),
    36: (DETAILED, "io_cpu"),
    37: (DETAILED, "forwarding"),
    38: (DETAILED, "pcie_ep"),
    39: (DETAILED, "pe_mmu"),
    40: (DETAILED, "pe_tcm"),
    41: (DETAILED, "sram"),
    42: (DETAILED, "tiling"),

    # Implementation Decisions (ADRs sharing a subgroup string are listed
    # under one topic heading)
    1:  (IMPL_DECISIONS, "Address Scheme"),
    2:  (IMPL_DECISIONS, "Routing & Helper API"),
    4:  (IMPL_DECISIONS, "Memory Semantics & Local-HBM Bandwidth"),
    5:  (IMPL_DECISIONS, "Topology Compilation, Diagrams & Builder Algorithms"),
    6:  (IMPL_DECISIONS, "Topology Compilation, Diagrams & Builder Algorithms"),
    8:  (IMPL_DECISIONS, "Tensor Deployment and Allocation"),
    9:  (IMPL_DECISIONS, "Kernel Execution and Host-Device Messaging"),
    10: (IMPL_DECISIONS, "CLI Surface and Semantics"),
    11: (IMPL_DECISIONS, "Address Scheme"),
    12: (IMPL_DECISIONS, "Kernel Execution and Host-Device Messaging"),
    15: (IMPL_DECISIONS, "Component Port/Wire Fabric Model"),
    20: (IMPL_DECISIONS, "Two-Pass Data Execution"),
    22: (IMPL_DECISIONS, "2D Grid Program Identity"),
    24: (IMPL_DECISIONS, "Parallelism (Launcher, DP, TP, AHBM backend, CCL algorithm)"),
    25: (IMPL_DECISIONS, "IPCQ Direction Addressing"),
    26: (IMPL_DECISIONS, "Parallelism (Launcher, DP, TP, AHBM backend, CCL algorithm)"),
    27: (IMPL_DECISIONS, "Parallelism (Launcher, DP, TP, AHBM backend, CCL algorithm)"),
    32: (IMPL_DECISIONS, "Intercube All-Reduce"),
    43: (IMPL_DECISIONS, "Evaluation Harnesses"),
    44: (IMPL_DECISIONS, "Evaluation Harnesses"),
    45: (IMPL_DECISIONS, "Bench Module Contract"),
    46: (IMPL_DECISIONS, "Kernel-side tl.* API (TLContext)"),
    47: (IMPL_DECISIONS, "Parallelism (Launcher, DP, TP, AHBM backend, CCL algorithm)"),
    48: (IMPL_DECISIONS, "Memory Allocator Algorithms"),
    49: (IMPL_DECISIONS, "Probe Subcommand"),
    50: (IMPL_DECISIONS, "Parallelism (Launcher, DP, TP, AHBM backend, CCL algorithm)"),
    51: (IMPL_DECISIONS, "Routing & Helper API"),
    52: (IMPL_DECISIONS, "Sim-engine Op Log and Memory Store Schemas"),
    53: (IMPL_DECISIONS, "Topology Compilation, Diagrams & Builder Algorithms"),
}

# Canonical component order for the Detailed Architecture section.
# Each entry: (component_name, list[ADR-numbers that cover it]).
# Order matches src/kernbench/components/builtin/*.py alphabetical
# (the same order /report uses).
# An ADR may be listed under several components (e.g. 14 and 23 below).
# An empty list renders as "_(no ADR coverage)_". Every number listed here
# must have a matching ADR file, otherwise _build_index raises RuntimeError.
DETAILED_COMPONENTS: list[tuple[str, list[int]]] = [
    ("forwarding",      [37]),
    ("hbm_ctrl",        [34]),
    ("io_cpu",          [36]),
    ("m_cpu",           [35]),
    ("pcie_ep",         [38]),
    ("pe_cpu",          [14]),
    ("pe_dma",          [14, 23]),
    ("pe_fetch_store",  [14]),
    ("pe_gemm",         [14]),
    ("pe_ipcq",         [23]),
    ("pe_math",         [14]),
    ("pe_mmu",          [39]),
    ("pe_scheduler",    [14]),
    ("pe_tcm",          [40]),
    ("sram",            [41]),
    ("tiling",          [42]),
]


def _strip_bom(text: str) -> str:
    """Strip leading UTF-8 BOM if present."""
    if text and ord(text[0]) == 0xFEFF:
        return text[1:]
    return text


def _find_adrs(adr_dir: Path) -> list[tuple[int, str, Path]]:
    """Collect the ADR files that live directly inside ``adr_dir``.

    Only regular files whose name matches ``ADR_FILENAME_RE`` are kept;
    sub-directories and any other file are skipped.

    Returns:
        ``(number, slug, path)`` tuples ordered by ADR number. Files that
        share a number keep their path-sorted relative order, because the
        directory listing is sorted first and the final sort is stable.
    """
    found: list[tuple[int, str, Path]] = []
    for entry in sorted(adr_dir.iterdir()):
        match = ADR_FILENAME_RE.match(entry.name) if entry.is_file() else None
        if match is None:
            continue
        number, slug = match.groups()
        found.append((int(number), slug, entry))
    return sorted(found, key=lambda item: item[0])


def _extract_title(path: Path) -> str:
    """Return the title found on the first line of the ADR file at ``path``.

    The file is read as UTF-8, a leading BOM is dropped, and the first line
    must match ``TITLE_RE`` (``# ADR-NNNN: <title>`` or the em-dash form).

    Raises:
        ValueError: if the first line does not match ``TITLE_RE``.
    """
    contents = _strip_bom(path.read_text(encoding="utf-8"))
    # Everything before the first newline; the whole text if there is none.
    heading, _newline, _remainder = contents.partition("\n")
    parsed = TITLE_RE.match(heading)
    if parsed is None:
        raise ValueError(
            f"{path.name}: cannot parse title from first line: {heading!r}"
        )
    return parsed.group(2)


def _build_index(adr_dir: Path, link_prefix: str) -> str:
    """Build the INDEX.md text for adr_dir.

    link_prefix is the relative href used for ADR links (e.g., ``./``
    so links resolve relative to the INDEX file location).

    The index has four sections, driven by ``CLASSIFICATION`` and
    ``DETAILED_COMPONENTS``:

    * Design Principles and High-level Architecture list their ADRs in
      ascending ADR-number order (High-level entries carry their subgroup
      as an italic note).
    * Detailed Architecture has one subsection per ``DETAILED_COMPONENTS``
      entry, in that table's order.
    * Implementation Decisions groups ADRs by subgroup ("Uncategorized"
      when the subgroup is empty); topics are ordered by their smallest
      ADR number and ADRs are ascending within a topic.

    Returns:
        The full INDEX.md content, ending with exactly one newline.

    Raises:
        RuntimeError: if no ADR files are found, an ADR file has no
            ``CLASSIFICATION`` entry, two files share an ADR number,
            ``DETAILED_COMPONENTS`` references an ADR with no file, or an
            ADR file would not appear anywhere in the generated index.
        ValueError: propagated from ``_extract_title`` when a title line
            cannot be parsed.
    """
    adrs = _find_adrs(adr_dir)
    if not adrs:
        raise RuntimeError(f"No ADR files found under {adr_dir}")

    # Validate every ADR is classified.
    missing = sorted(
        {num for num, _slug, _path in adrs if num not in CLASSIFICATION}
    )
    if missing:
        raise RuntimeError(
            "ADR(s) missing from CLASSIFICATION table in "
            "tools/generate_adr_index.py: "
            + ", ".join(f"ADR-{n:04d}" for n in missing)
            + ". Add an entry for each."
        )

    # Two files with the same number would silently overwrite each other in
    # the number -> metadata map below (and inflate the total), so reject them.
    files_by_num: dict[int, list[str]] = {}
    for num, _slug, path in adrs:
        files_by_num.setdefault(num, []).append(path.name)
    duplicates = [
        (num, names)
        for num, names in sorted(files_by_num.items())
        if len(names) > 1
    ]
    if duplicates:
        raise RuntimeError(
            f"Duplicate ADR number(s) under {adr_dir}: "
            + "; ".join(
                f"ADR-{num:04d} ({', '.join(names)})" for num, names in duplicates
            )
            + ". Each ADR number must belong to exactly one file."
        )

    # Map: num → (filename, title)
    num_to_meta: dict[int, tuple[str, str]] = {
        num: (path.name, _extract_title(path)) for num, _slug, path in adrs
    }

    # ADR numbers that made it into the output; checked against num_to_meta
    # at the end so that no ADR file can be dropped without an error.
    emitted: set[int] = set()

    def fmt_entry(num: int, note: str = "") -> str:
        """Render one bullet for ADR ``num``, with an optional italic note."""
        fname, title = num_to_meta[num]
        emitted.add(num)
        entry = f"- [ADR-{num:04d}]({link_prefix}{fname}) — {title}"
        return f"{entry}  _({note})_" if note else entry

    def nums_in(section: str) -> list[int]:
        """ADR numbers classified under ``section`` that exist on disk, ascending."""
        return sorted(
            num
            for num, (sec, _sub) in CLASSIFICATION.items()
            if sec == section and num in num_to_meta
        )

    # ── Section assembly ────────────────────────────────────────────
    lines: list[str] = [
        "# ADR Index",
        "",
        f"Auto-generated by `tools/generate_adr_index.py`. "
        f"Total ADRs: **{len(adrs)}**.",
        "",
        "Classification mirrors the `/report` skill's section assignment. "
        "When adding a new ADR, also add an entry to the "
        "`CLASSIFICATION` table in `tools/generate_adr_index.py`.",
        "",
    ]

    # Design Principles (ascending ADR number)
    lines.append("## Design Principles")
    lines.append("")
    lines.extend(fmt_entry(n) for n in nums_in(DESIGN_PRINCIPLES))
    lines.append("")

    # High-level Architecture (ascending ADR number; subgroup shown as a note)
    lines.append("## High-level Architecture")
    lines.append("")
    lines.extend(
        fmt_entry(n, CLASSIFICATION[n][1] or "") for n in nums_in(HIGH_LEVEL)
    )
    lines.append("")

    # Detailed Architecture (canonical component order)
    lines.append("## Detailed Architecture")
    lines.append("")
    lines.append("One subsection per component file under `src/kernbench/components/builtin/`.")
    lines.append("")
    for comp, adr_nums in DETAILED_COMPONENTS:
        lines.append(f"### {comp}")
        lines.append("")
        if adr_nums:
            for n in adr_nums:
                if n not in num_to_meta:
                    raise RuntimeError(
                        f"DETAILED_COMPONENTS references ADR-{n:04d} for "
                        f"'{comp}' but no such ADR file exists."
                    )
                lines.append(fmt_entry(n))
        else:
            lines.append("_(no ADR coverage)_")
        lines.append("")

    # Implementation Decisions — grouped by topic. ADRs are visited in
    # ascending order, so each topic is first inserted at its smallest ADR
    # number: dict order is then "topics by smallest ADR number" and every
    # per-topic list is already ascending.
    lines.append("## Implementation Decisions")
    lines.append("")
    topic_to_nums: dict[str, list[int]] = {}
    for n in nums_in(IMPL_DECISIONS):
        topic = CLASSIFICATION[n][1] or "Uncategorized"
        topic_to_nums.setdefault(topic, []).append(n)
    for topic, topic_nums in topic_to_nums.items():
        lines.append(f"### {topic}")
        lines.append("")
        lines.extend(fmt_entry(n) for n in topic_nums)
        lines.append("")

    # An ADR can be classified and still never rendered, e.g. a Detailed
    # Architecture ADR that is absent from DETAILED_COMPONENTS, or an entry
    # whose section is not one of the four known sections. Fail loudly
    # instead of publishing an index that silently omits it.
    orphans = sorted(set(num_to_meta) - emitted)
    if orphans:
        raise RuntimeError(
            "ADR(s) not emitted into any INDEX section: "
            + ", ".join(f"ADR-{n:04d}" for n in orphans)
            + ". Check the section in CLASSIFICATION; Detailed Architecture "
            "ADRs must also be listed in DETAILED_COMPONENTS."
        )

    return "\n".join(lines).rstrip() + "\n"


def _check_or_write(path: Path, content: str, check: bool) -> bool:
    """Write content to path, or compare in --check mode. Returns True on diff."""
    existing = path.read_text(encoding="utf-8") if path.exists() else ""
    if check:
        if existing != content:
            print(f"[diff] {path} would change.")
            return True
        return False
    path.write_text(content, encoding="utf-8")
    if existing != content:
        print(f"[wrote] {path}")
    else:
        print(f"[unchanged] {path}")
    return False


def main(argv: list[str] | None = None) -> int:
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument(
        "--root", type=Path, default=Path.cwd(),
        help="Repository root (default: cwd)",
    )
    p.add_argument(
        "--check", action="store_true",
        help="Exit 1 if generated INDEX would differ from disk",
    )
    args = p.parse_args(argv)

    en_dir = args.root / "docs" / "adr"
    ko_dir = args.root / "docs" / "adr-ko"

    if not en_dir.is_dir():
        print(f"error: {en_dir} does not exist", file=sys.stderr)
        return 1

    any_diff = False
    try:
        en_index = _build_index(en_dir, link_prefix="./")
    except (RuntimeError, ValueError) as e:
        print(f"error (EN): {e}", file=sys.stderr)
        return 1
    any_diff |= _check_or_write(en_dir / "INDEX.md", en_index, args.check)

    if ko_dir.is_dir():
        try:
            ko_index = _build_index(ko_dir, link_prefix="./")
        except (RuntimeError, ValueError) as e:
            print(f"error (KO): {e}", file=sys.stderr)
            return 1
        any_diff |= _check_or_write(ko_dir / "INDEX.md", ko_index, args.check)

    if args.check and any_diff:
        print(
            "INDEX.md is out of date. "
            "Run `python tools/generate_adr_index.py` to refresh.",
            file=sys.stderr,
        )
        return 1
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())