# kernbench2/tests/test_bw_occupancy.py

"""Tests for per-link BW occupancy (available_at) model.

Verifies that:
  - Single transactions see no extra delay from BW tracking.
  - Back-to-back transactions on the same link see BW contention.
  - Transactions on independent paths see no contention.
  - Response messages (nbytes=0) do not occupy BW.
"""
from pathlib import Path

from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology

TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"


def _engine():
    """Build a new GraphEngine over the topology loaded from TOPOLOGY_PATH.

    Each call loads the topology again and constructs a separate engine
    instance, so tests that call this do not share an engine object.
    """
    topology = load_topology(TOPOLOGY_PATH)
    return GraphEngine(topology)


def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return the encoded physical address of a PE-local HBM location.

    The address is produced by ``PhysAddr.pe_hbm_addr`` for rack 0 with a
    fixed PE-local offset of 0x1000 and then converted with ``encode()``.

    Args:
        sip: Value passed as ``sip_id``.
        cube: Value passed as ``cube_id``.
        pe_id: Value passed as ``pe_id``.
    """
    gib = 1 << 30
    # 48 GiB split into 8 equal slices.
    # NOTE(review): presumably one HBM slice per PE — confirm against topology.
    per_slice_bytes = 48 * gib // 8
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    )
    return address.encode()


# ── 1. Single transaction: available_at does not add delay ────────


def test_single_txn_latency_unchanged():
    """A lone H2D write reports the same positive latency on two engines.

    One 4 KiB write message is submitted, by itself, to two independently
    built engines. The two ``total_ns`` values must be exactly equal and
    greater than zero. The intent is that with nothing else in flight the
    per-link BW tracking contributes no wait of its own, so the result is
    fully deterministic.
    """
    write = MemoryWriteMsg(
        correlation_id="bw",
        request_id="single",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
        pattern="zero",
        target_pe=0,
    )

    def _isolated_trace():
        # Run the shared message to completion on an engine of its own.
        engine = _engine()
        handle = engine.submit(write)
        engine.wait(handle)
        _, trace = engine.get_completion(handle)
        return trace

    t1 = _isolated_trace()
    t2 = _isolated_trace()

    assert t1["total_ns"] == t2["total_ns"], (
        f"Single txn must be deterministic: {t1['total_ns']} vs {t2['total_ns']}"
    )
    assert t1["total_ns"] > 0


# ── 2. Back-to-back transactions: BW contention on shared links ──


def test_back_to_back_same_cube_bw_contention():
    """Two overlapping H2D writes to one cube finish later than a lone write.

    A baseline engine runs a single 4 KiB write to cube 0 / PE 0. A second
    engine receives two 4 KiB writes to cube 0 (PE 0 and PE 1), both
    submitted before either is waited on. The larger of the two
    ``total_ns`` values must exceed the baseline; the intent is that the
    pair contends for bandwidth on the links they have in common.
    """

    def _cube0_write(request_id, pe):
        # 4 KiB zero-pattern write aimed at the given PE of sip 0 / cube 0.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=4096,
            pattern="zero",
            target_pe=pe,
        )

    # Reference point: the write alone on its own engine.
    solo_engine = _engine()
    solo_handle = solo_engine.submit(_cube0_write("baseline", 0))
    solo_engine.wait(solo_handle)
    _, solo_trace = solo_engine.get_completion(solo_handle)
    single_ns = solo_trace["total_ns"]

    # Contended run: same cube, different PEs, both in flight together.
    shared_engine = _engine()
    first = _cube0_write("conc-a", 0)
    second = _cube0_write("conc-b", 1)
    handles = [shared_engine.submit(first), shared_engine.submit(second)]
    for handle in handles:
        shared_engine.wait(handle)

    totals = []
    for handle in handles:
        _, trace = shared_engine.get_completion(handle)
        totals.append(trace["total_ns"])

    max_ns = max(totals)
    assert max_ns > single_ns, (
        f"BW contention: concurrent max ({max_ns:.2f}ns) must > "
        f"single ({single_ns:.2f}ns) due to link BW occupancy"
    )


def test_back_to_back_bw_delay_magnitude():
    """The extra latency under contention has a minimum plausible size.

    Compares a lone 4 KiB write with the slower of two overlapping 4 KiB
    writes to the same cube (PE 0 and PE 1). The difference must be
    positive and at least half of ``nbytes / 768`` ns. Only a lower bound
    is checked; no upper bound is asserted.
    """
    nbytes = 4096

    def _cube0_write(request_id, pe):
        # Zero-pattern write of ``nbytes`` to the given PE of sip 0 / cube 0.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=nbytes,
            pattern="zero",
            target_pe=pe,
        )

    # Lone write on its own engine.
    solo_engine = _engine()
    solo_handle = solo_engine.submit(_cube0_write("mag-single", 0))
    solo_engine.wait(solo_handle)
    _, solo_trace = solo_engine.get_completion(solo_handle)
    single_ns = solo_trace["total_ns"]

    # Overlapping pair on a second engine.
    pair_engine = _engine()
    first = _cube0_write("mag-a", 0)
    second = _cube0_write("mag-b", 1)
    handles = [pair_engine.submit(first), pair_engine.submit(second)]
    for handle in handles:
        pair_engine.wait(handle)

    totals = []
    for handle in handles:
        _, trace = pair_engine.get_completion(handle)
        totals.append(trace["total_ns"])

    delta = max(totals) - single_ns

    # Any contention at all shows up as a strictly positive difference.
    assert delta > 0, f"Expected positive BW contention delta, got {delta:.4f}"

    # Floor for the delay: payload divided by 768 (about 5.3 for 4096 bytes).
    # NOTE(review): 768 is taken to be the fastest link's bandwidth in GB/s
    # (PCIe, per the original note) — confirm against the topology file.
    # The 0.5 factor leaves slack below that estimate.
    min_expected_delta = nbytes / 768.0
    assert delta >= min_expected_delta * 0.5, (
        f"BW contention delta ({delta:.2f}ns) too small, "
        f"expected >= ~{min_expected_delta:.2f}ns"
    )


# ── 3. Independent paths: no cross-contention ────────────────────


def test_independent_paths_no_contention():
    """A write to cube 0 is barely slowed by a concurrent write to cube 3.

    A baseline engine runs one 4 KiB write to cube 0. A second engine runs
    that same kind of write together with a 4 KiB write to cube 3. The
    cube-0 latency in the concurrent run may exceed its baseline by less
    than 50% of the baseline.

    Only the cube-0 latency is asserted; the cube-3 completion is fetched
    but its timing is not checked.

    NOTE(review): the premise is that cube 0 and cube 3 sit in different
    columns of a 4x4 mesh so their paths diverge after the IO CPU — confirm
    against the topology file.
    """

    def _write_to(cube, request_id):
        # 4 KiB zero-pattern write to PE 0 of the given cube on sip 0.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=cube,
            dst_pe=0,
            dst_pa=_hbm_pa(sip=0, cube=cube, pe_id=0),
            nbytes=4096,
            pattern="zero",
            target_pe=0,
        )

    # Baseline: cube 0 alone.
    solo_engine = _engine()
    solo_handle = solo_engine.submit(_write_to(0, "indep-single"))
    solo_engine.wait(solo_handle)
    _, solo_trace = solo_engine.get_completion(solo_handle)
    single_ns_cube0 = solo_trace["total_ns"]

    # Concurrent: cube 0 and cube 3 in flight on the same engine.
    pair_engine = _engine()
    to_cube0 = _write_to(0, "indep-a")
    to_cube3 = _write_to(3, "indep-b")
    handle_cube0 = pair_engine.submit(to_cube0)
    handle_cube3 = pair_engine.submit(to_cube3)
    pair_engine.wait(handle_cube0)
    pair_engine.wait(handle_cube3)
    _, trace_cube0 = pair_engine.get_completion(handle_cube0)
    _, _trace_cube3 = pair_engine.get_completion(handle_cube3)

    # Early links may still be shared, so a bounded slowdown is tolerated
    # rather than requiring an exact match with the baseline.
    delta_cube0 = trace_cube0["total_ns"] - single_ns_cube0
    assert delta_cube0 < single_ns_cube0 * 0.5, (
        f"Independent path cube0 delta ({delta_cube0:.2f}ns) too large; "
        f"paths should mostly be independent"
    )


# ── 4. Response messages: no BW occupancy ────────────────────────


def test_response_no_bw_occupancy():
    """Two overlapping H2D writes to one cube both complete successfully.

    Two 4 KiB writes (cube 0, PE 0 and PE 2) are submitted to one engine
    before either is waited on. Each completion must report ``ok is True``.

    The motivation is that response messages carry ``nbytes=0`` and should
    not occupy link bandwidth. This test only checks successful completion;
    it does not measure latency on the response path.
    """
    engine = _engine()

    def _cube0_write(request_id, pe):
        # 4 KiB zero-pattern write aimed at the given PE of sip 0 / cube 0.
        return MemoryWriteMsg(
            correlation_id="bw",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=4096,
            pattern="zero",
            target_pe=pe,
        )

    first = _cube0_write("resp-a", 0)
    second = _cube0_write("resp-b", 2)
    handles = [engine.submit(first), engine.submit(second)]
    for handle in handles:
        engine.wait(handle)

    completions = []
    for handle in handles:
        completion, _ = engine.get_completion(handle)
        completions.append(completion)

    for completion in completions:
        assert completion.ok is True


# ── 5. PE DMA: formula is a lower bound of actual for single txn ──


def test_pe_dma_formula_lower_bound():
    """PE DMA single transaction: formula latency never exceeds actual latency.

    A single 4 KiB ``PeDmaMsg`` from sip 0 / cube 0 / PE 0 to that PE's HBM
    address is run on a fresh engine. If the trace carries ``formula_ns``,
    it must not exceed ``total_ns`` (with a 0.01 ns tolerance). The actual
    latency must always be positive.

    Rationale carried over from the original test: PE DMA routes through the
    NOC, whose internal mesh traversal latency is not captured by the
    formula, so the formula is only a lower bound.

    NOTE: when the trace has no ``formula_ns`` key the bound check is
    skipped and only ``actual > 0`` is verified.
    """
    # Use the shared helper so engine construction matches the other tests.
    engine = _engine()

    pa = _hbm_pa(sip=0, cube=0, pe_id=0)
    msg = PeDmaMsg(
        correlation_id="bw", request_id="formula-check",
        src_sip=0, src_cube=0, src_pe=0,
        dst_pa=pa, nbytes=4096,
    )
    h = engine.submit(msg)
    engine.wait(h)
    _, trace = engine.get_completion(h)
    actual = trace["total_ns"]
    formula = trace.get("formula_ns")

    if formula is not None:
        # Small absolute tolerance absorbs floating-point rounding.
        assert formula <= actual + 0.01, (
            f"PE DMA formula ({formula:.4f}) must be <= actual ({actual:.4f})"
        )
    assert actual > 0


# ── 6. Existing probe invariants preserved ───────────────────────


def test_h2d_monotonicity_preserved():
    """Isolated H2D write latency strictly increases across cubes 0, 4, 8.

    Each cube gets its own fresh engine and a single 4 KiB write, so no run
    can be affected by link occupancy left behind by another. The measured
    ``total_ns`` values must be strictly increasing in the listed order.

    NOTE(review): cubes 0, 4 and 8 are expected to be 1, 2 and 3 hops away
    respectively — confirm against the topology file.
    """
    cubes = [0, 4, 8]

    def _isolated_latency(cube):
        # One write to PE 0 of ``cube`` on an engine of its own.
        engine = _engine()
        msg = MemoryWriteMsg(
            correlation_id="bw",
            request_id=f"mono-c{cube}",
            dst_sip=0,
            dst_cube=cube,
            dst_pe=0,
            dst_pa=_hbm_pa(sip=0, cube=cube, pe_id=0),
            nbytes=4096,
            pattern="zero",
            target_pe=0,
        )
        handle = engine.submit(msg)
        engine.wait(handle)
        _, trace = engine.get_completion(handle)
        return trace["total_ns"]

    latencies = [_isolated_latency(cube) for cube in cubes]

    # Compare each latency with its successor.
    for i, (nearer, farther) in enumerate(zip(latencies, latencies[1:])):
        assert nearer < farther, (
            f"Monotonicity: cube{cubes[i]}({latencies[i]:.2f}) "
            f"must < cube{cubes[i+1]}({latencies[i+1]:.2f})"
        )


# ── 7. BW contention scales with payload size ───────────────────


def test_bw_contention_scales_with_nbytes():
    """The contention penalty grows with payload size.

    For both 4 KiB and 64 KiB payloads, the penalty is the slower of two
    overlapping same-cube writes minus a lone write of the same size. The
    64 KiB penalty must be more than twice the 4 KiB penalty.

    The intent is to validate link-level BW modeling specifically: link
    occupancy is proportional to ``nbytes``, whereas contention that comes
    only from fixed component resources would not grow with payload size.

    NOTE: per the original author, this test is expected to fail without
    the ``available_at`` link-occupancy implementation and pass with it.
    """

    def _single_and_contended_ns(nbytes: int) -> tuple[float, float]:
        """Return (lone-write ns, slower-of-pair ns) for the given payload."""

        def _cube0_write(tag, pe):
            # Zero-pattern write of ``nbytes`` to the given PE of cube 0.
            return MemoryWriteMsg(
                correlation_id="bw",
                request_id=f"scale-{tag}-{nbytes}",
                dst_sip=0,
                dst_cube=0,
                dst_pe=pe,
                dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
                nbytes=nbytes,
                pattern="zero",
                target_pe=pe,
            )

        # Lone write on its own engine.
        solo_engine = _engine()
        solo_handle = solo_engine.submit(_cube0_write("s", 0))
        solo_engine.wait(solo_handle)
        _, solo_trace = solo_engine.get_completion(solo_handle)

        # Overlapping pair on a second engine.
        pair_engine = _engine()
        first = _cube0_write("a", 0)
        second = _cube0_write("b", 1)
        handles = [pair_engine.submit(first), pair_engine.submit(second)]
        for handle in handles:
            pair_engine.wait(handle)

        pair_totals = []
        for handle in handles:
            _, trace = pair_engine.get_completion(handle)
            pair_totals.append(trace["total_ns"])

        return solo_trace["total_ns"], max(pair_totals)

    single_4k, max_4k = _single_and_contended_ns(4096)
    single_64k, max_64k = _single_and_contended_ns(65536)

    delta_4k = max_4k - single_4k
    delta_64k = max_64k - single_64k

    # 64 KiB is 16x the 4 KiB payload; requiring only a 2x larger penalty
    # leaves room for fixed, size-independent contention.
    assert delta_64k > delta_4k * 2.0, (
        f"BW contention must scale with payload: "
        f"delta_64k ({delta_64k:.2f}ns) should be >> delta_4k ({delta_4k:.2f}ns)"
    )