Honest measured pipeline efficiency: two timing fixes

Two related issues made measured pipeline efficiency look worse than the simulator's actual behavior:

1. DMA timing was recorded too early. The op-log start timestamp for a DMA op fired when the request entered the queue, and the DMA channel was released as soon as the request was issued. Back-to-back DMAs therefore appeared to grab the channel simultaneously, with per-op duration drifting upward as queue depth grew — an artifact, not real cost. Fix: defer the start timestamp until after the channel is acquired, and hold the channel through the full HBM round-trip until the response returns. Per-op duration is now constant and equal to the actual transfer interval; serialization is visible as queue wait, not as inflated service time.

2. The sweep timing window folded in pre-composite work. The PE timing window spanned every PE engine record, which included the upfront pinned-operand DMA issued before the composite GEMM begins. For large-K shapes that one-shot load can be nearly half of the window, conflating operand-staging cost with composite-pipeline behavior. Fix: add a second window scoped to the composite pipeline by filtering op_log records to those tagged with a tile-pipeline stage; the legacy operand-load path is untagged and naturally excluded. For the 32x3072x32 load_ref shape the window drops from 1765 ns to 992 ns, and measured efficiency lines up with the steady-state DMA-bound stage limit instead of being penalized for the one-time load.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 14:19:17 -07:00
parent 83ea97b05f
commit f6d262e359
7 changed files with 543 additions and 263 deletions
@@ -114,23 +114,19 @@ SLIDES = [
        "render": "hbm_topology",
    },
    {
-        "title": "14. GEMM Utilization + Useful Pipeline Efficiency (load_ref)",
+        "title": "14. Formula vs Measured Useful Eff (load_ref) — sim validates model",
-        "render": "mac_utilization",
+        "render": "mac_utilization_formula_vs_measured",
    },
    {
-        "title": "15. GEMM Utilization + Useful Pipeline Efficiency (ref_ref — both A & B via DMA_R)",
+        "title": "15. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)",
        "render": "mac_utilization_ref_ref",
    },
    {
        "title": "16. Pipeline Efficiency Walkthrough — 32×128×128 (with inter flushes)",
        "render": "pipeline_eff_walkthrough",
    },
    {
-        "title": "17. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)",
+        "title": "16. Pipeline Efficiency Walkthrough — 32×3072×32 (large K, no flushes)",
        "render": "pipeline_eff_walkthrough_largeK",
    },
    {
-        "title": "18. Useful Pipelined Efficiency (ideal pipeline × GEMM util)",
+        "title": "17. Useful Pipelined Efficiency (ideal pipeline × GEMM util)",
        "render": "tflops_table",
    },
 ]
@@ -1715,12 +1711,12 @@ def _render_mac_utilization(slide):
        "Useful eff":  "Useful eff %",
    }
-    _textbox(slide, 0.4, 1.0, 12.6, 0.70,
+    _textbox(slide, 0.4, 1.0, 12.6, 0.80,
-             f"GEMM util = useful FLOPs ÷ (tile FLOPs × tile count) — pure "
+             "FORMULA-generated (analytical ideal-pipeline model — not "
-             f"shape-vs-tile metric. "
+             "simulator data). GEMM util = useful FLOPs ÷ (tile FLOPs × "
-             f"Useful eff = (N_tiles × T_stage × GEMM_util) ÷ wall — "
+             "tile count). Useful eff = (N_tiles × T_stage × GEMM_util) "
-             f"useful FLOPs delivered as a fraction of peak over the "
+             "÷ wall, where wall = head + K-loop + inter-(m,n) DMA_W. "
-             f"ideal-pipelined wall (head + K-loop + inter-(m,n) DMA_W).",
+             "Slide 16 overlays this against measured pe_window_ns.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
    _draw_native_bar_chart(
@@ -1819,11 +1815,12 @@ def _render_mac_utilization_ref_ref(slide):
        "Useful eff":  "Useful eff % (ref_ref)",
    }
-    _textbox(slide, 0.4, 1.0, 12.6, 0.75,
+    _textbox(slide, 0.4, 1.0, 12.6, 0.85,
-             "ref_ref: scheduler issues DMA_R for BOTH A and B every tile. "
+             "FORMULA-generated (ideal-pipeline model, ref_ref variant — "
-             "Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH and GEMM stay "
+             "not simulator data). Scheduler issues DMA_R for BOTH A and B "
-             "at 16 ns. Pipeline cycle is DMA-bound → useful eff caps near "
+             "every tile. Per-tile DMA cost = 2 × T_stage = 32 ns; FETCH "
-             "50 % × GEMM_util, regardless of K-loop length.",
+             "and GEMM stay at 16 ns. Pipeline cycle is DMA-bound → useful "
             "eff caps near 50 % × GEMM_util, regardless of K-loop length.",
             size=11, color=COL_MUTED, align=PP_ALIGN.LEFT)
    _draw_native_bar_chart(
@@ -1846,6 +1843,136 @@ def _render_mac_utilization_ref_ref(slide):
    )
 def _render_mac_utilization_formula_vs_measured(slide):
    """Overlay slide: formula useful_eff vs measured useful_eff (load_ref).

    Formula  = (N_tiles × T_stage × GEMM_util) / wall_formula × 100
               wall_formula = head + N_tiles·T_stage + inter·DMA_W
    Measured = (useful_FLOPs / composite_window_ns) / peak_FLOPs_per_ns × 100
               composite_window_ns is read from each sweep row.

    Agreement validates the analytical model against the simulator.

    Only rows whose ``variant`` is ``"load_ref"`` are plotted; if several
    rows share an (M, K, N) shape, the last one wins. Degenerate rows do
    not raise: a row with zero expected tiles plots 0 % utilisation, a row
    with a missing or zero ``composite_window_ns`` plots 0 % measured
    efficiency, and a missing or null ``stages`` / ``GEMM`` entry falls
    back to ``tile_count_expected`` for the measured GEMM record count.

    Args:
        slide: Slide object forwarded to ``_textbox`` and
            ``_draw_native_bar_chart``.
    """
    data = _load_sweep_data()
    rows = data["rows"]
    if not rows:
        _textbox(slide, 0.4, 3.0, 12.6, 1.0,
                 "No sweep data. Run scripts/gemm_sweep.py first.",
                 size=14, color=COL_RED, align=PP_ALIGN.LEFT)
        return

    tile = data["tile_sizes"]
    TILE_M, TILE_K, TILE_N = tile["M"], tile["K"], tile["N"]
    tile_flops = 2 * TILE_M * TILE_K * TILE_N

    # Analytical-model constants, hard-coded for this slide.
    # NOTE(review): names suggest HBM bandwidth in GB/s (== bytes/ns) and
    # bpe = bytes per element — confirm against the simulator config.
    HBM_GBS = 256.0
    bpe = 2
    T_STAGE = 16.0
    D_STAGES = 3
    head_ns = (D_STAGES - 1) * T_STAGE
    dma_w_per_pair_ns = (TILE_M * TILE_N * bpe) / HBM_GBS
    peak_per_ns = tile_flops / T_STAGE  # MAC peak throughput, flops/ns

    # Keep one load_ref row per shape; later duplicates overwrite earlier
    # ones while the first-seen ordering of shapes is preserved.
    by_shape: dict = {
        (r["M"], r["K"], r["N"]): r
        for r in rows
        if r["variant"] == "load_ref"
    }

    shapes = list(by_shape)
    shape_labels = [_shape_label(by_shape[k]) for k in shapes]
    flagged = [_under_tile(m, k, n, TILE_M, TILE_K, TILE_N)
               for (m, k, n) in shapes]
    tile_counts = [by_shape[k]["tile_count_expected"] for k in shapes]

    gemm_util_formula: list[float] = []
    gemm_util_measured: list[float] = []
    formula_eff: list[float] = []
    measured_eff: list[float] = []

    for key in shapes:
        r = by_shape[key]
        M, K, N = r["M"], r["K"], r["N"]
        useful = 2 * M * K * N
        tiles = r["tile_count_expected"]

        # Guard the division: a row with zero expected tiles would
        # otherwise raise ZeroDivisionError and abort the whole slide.
        planned_flops = tile_flops * tiles
        gu_formula = useful / planned_flops if planned_flops > 0 else 0.0
        gemm_util_formula.append(gu_formula * 100)

        # Measured GEMM util uses the GEMM stage record count from op_log
        # (i.e. the actual number of GEMM tiles the simulator ran).
        # ``or {}`` tolerates a "stages"/"GEMM" key that is present but
        # null; ``or tiles`` is the fallback for older sweep data that has
        # no stages dict.
        gemm_stage = (r.get("stages") or {}).get("GEMM") or {}
        gemm_record_count = gemm_stage.get("record_count", 0) or tiles
        executed_flops = tile_flops * gemm_record_count
        gu_measured = useful / executed_flops if executed_flops > 0 else 0.0
        gemm_util_measured.append(gu_measured * 100)

        # Ideal-pipeline wall: pipeline fill + one stage per tile + one
        # DMA_W flush between consecutive (m, n) output tiles.
        m_tiles = (M + TILE_M - 1) // TILE_M
        n_tiles = (N + TILE_N - 1) // TILE_N
        n_mn = m_tiles * n_tiles
        compute_total = tiles * T_STAGE
        inter_dma_w = max(0, n_mn - 1) * dma_w_per_pair_ns
        wall_formula = head_ns + compute_total + inter_dma_w
        if wall_formula > 0:
            feff = (compute_total * gu_formula / wall_formula) * 100
        else:
            feff = 0.0
        formula_eff.append(feff)

        # Rows without a composite window (e.g. older sweep output) plot
        # as 0 % rather than dividing by zero.
        comp_window_ns = r.get("composite_window_ns", 0.0) or 0.0
        if comp_window_ns > 0 and peak_per_ns > 0:
            meff = (useful / comp_window_ns / peak_per_ns) * 100
        else:
            meff = 0.0
        measured_eff.append(meff)

    series = {
        "GEMM util F":   gemm_util_formula,
        "GEMM util M":   gemm_util_measured,
        "Formula eff":   formula_eff,
        "Measured eff":  measured_eff,
    }
    colors_map = {
        "GEMM util F":  COL_FS,                       # emerald (formula ceiling)
        "GEMM util M":  RGBColor(0x6E, 0xE7, 0xB7),   # mint (measured ceiling)
        "Formula eff":  RGBColor(0xF5, 0x9E, 0x0B),   # amber (formula eff)
        "Measured eff": COL_DMA,                      # blue  (measured eff)
    }
    display_map = {
        "GEMM util F":  "GEMM util % (formula)",
        "GEMM util M":  "GEMM util % (measured, op_log)",
        "Formula eff":  "Formula useful eff %",
        "Measured eff": "Measured useful eff %",
    }

    _textbox(slide, 0.4, 1.0, 12.6, 1.10,
             "Four series per shape:\n"
             "  • GEMM util (formula, green): useful_FLOPs ÷ (tile_FLOPs × "
             "tile_count_expected) — structural shape-vs-tile ceiling.\n"
             "  • GEMM util (measured, mint): useful_FLOPs ÷ (tile_FLOPs × "
             "GEMM_record_count) — uses the actual GEMM ops the simulator "
             "ran. Should equal formula → validates plan execution.\n"
             "  • Formula useful eff (amber): GEMM_util × ideal pipeline efficiency.\n"
             "  • Measured useful eff (blue): (useful_FLOPs ÷ "
             "composite_window_ns) ÷ peak — composite_window_ns covers "
             "only the tl.composite pipeline (excludes upfront tl.load).",
             size=10, color=COL_MUTED, align=PP_ALIGN.LEFT)

    _draw_native_bar_chart(
        slide,
        plot_x=1.0, plot_y=2.25, plot_w=10.0, plot_h=3.85,
        shape_labels=shape_labels, flagged=flagged,
        tile_counts=tile_counts,
        series=series, colors_map=colors_map,
        display_map=display_map,
        wall_clocks=None,
        y_label="%",
        legend_x=11.4, legend_w=1.85,
        foot_note=("Reading: GEMM util F = GEMM util M → simulator ran the "
                   "expected tile plan. Measured eff uses composite-only "
                   "window (excludes upfront tl.load), so it isolates "
                   "pipeline efficiency. Variant: load_ref."),
        threshold_line=100.0,
    )
 def _render_tflops_table(slide):
    """Ideal pipelined pipe_eff: assumes non-blocking tl.load + multi-channel HBM.
@@ -2282,8 +2409,9 @@ _RENDERERS.update({
    "variant_comparison":        _render_variant_comparison,
    "hbm_topology":              _render_hbm_topology,
    "per_op_dma":                _render_per_op_dma,
-    "mac_utilization":           _render_mac_utilization,
+    "mac_utilization":                       _render_mac_utilization,
-    "mac_utilization_ref_ref":   _render_mac_utilization_ref_ref,
+    "mac_utilization_ref_ref":               _render_mac_utilization_ref_ref,
    "mac_utilization_formula_vs_measured":   _render_mac_utilization_formula_vs_measured,
    "tflops_table":              _render_tflops_table,
    "pipeline_eff_walkthrough":  _render_pipeline_eff_walkthrough,
    "pipeline_eff_walkthrough_largeK":  _render_pipeline_eff_walkthrough_largeK,
@@ -179,6 +179,13 @@ def _run_one(M: int, K: int, N: int, topology: str, variant: str = "ref_ref") ->
                              - min(r.t_start for r in pe_records)
    else:
        row["pe_window_ns"] = 0.0
    stage_records = [r for r in op_log
                     if r.params.get("stage_type") in STAGES]
    if stage_records:
        row["composite_window_ns"] = max(r.t_end for r in stage_records) \
                                     - min(r.t_start for r in stage_records)
    else:
        row["composite_window_ns"] = 0.0
    return row
@@ -138,8 +138,16 @@ class PeEngineBase(ComponentBase):
                env.process(self._forward_txn(env, msg))
    def _handle_with_hooks(self, env: simpy.Environment, pe_txn: Any) -> Generator:
-        """Wrap handle_command with op log hooks on the inner command."""
+        """Wrap handle_command with op log hooks on the inner command.
-        self._on_process_start(env, pe_txn.command)
+
        Subclasses that need to defer record_start until after a resource
        wait (e.g. pe_dma's DMA-channel acquire) set
        ``_DEFER_RECORD_START = True`` and call
        ``self._on_process_start(env, pe_txn.command)`` themselves at the
        post-wait moment. record_end still fires here.
        """
        if not getattr(self, "_DEFER_RECORD_START", False):
            self._on_process_start(env, pe_txn.command)
        yield from self.handle_command(env, pe_txn)
        self._on_process_end(env, pe_txn.command)
@@ -27,6 +27,12 @@ class PeDmaComponent(PeEngineBase):
        (DmaReadCmd → HBM read, DmaWriteCmd → HBM write)
    """
    # Defer op_log record_start until AFTER the DMA channel is acquired so
    # t_start reflects the serve-start moment (post queueing) rather than
    # the queue-enter moment. PeEngineBase._handle_with_hooks consults this
    # flag.
    _DEFER_RECORD_START = True
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._dma_read: simpy.Resource | None = None
@@ -80,9 +86,16 @@ class PeDmaComponent(PeEngineBase):
        path = self.ctx.router.find_path(self._pe_prefix, dst_node)
        drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)
-        # Acquire DMA channel (command issue serialization)
+        # Acquire DMA channel — held through the entire round-trip so the
        # channel models "one DMA in flight per PE per direction" rather
        # than just issue-time serialization. This is what makes Option B
        # meaningful: t_start = serve-start covers the actual transfer.
        with dma_res.request() as req:
            yield req
            # Option B: record_start fires AFTER channel acquired, so t_start
            # = serve-start (excludes queue wait). _DEFER_RECORD_START=True
            # suppresses the auto-start in PeEngineBase._handle_with_hooks.
            self._on_process_start(env, cmd)
            # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly)
            sub_done = env.event()
            sub_request = PeDmaMsg(
@@ -99,10 +112,8 @@ class PeDmaComponent(PeEngineBase):
            # Send to next hop (path[0] is pe_dma itself, path[1] is router)
            if len(path) > 1:
                yield self.out_ports[path[1]].put(sub_txn.advance())
-        # DMA channel released after issue
+            # Wait for HBM transfer completion BEFORE releasing the channel.
-
+            yield sub_done
        # Wait for HBM transfer completion
        yield sub_done
        pe_txn.done.succeed()
    def _worker(self, env: simpy.Environment) -> Generator:
@@ -293,15 +304,17 @@ class PeDmaComponent(PeEngineBase):
            txn.done.succeed()
    def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
-        """Pipeline mode: DMA read/write via fabric, then self-route."""
+        """Pipeline mode: DMA read/write via fabric, then self-route.
-        self._on_process_start(env, token)
+
        Option B: record_start is fired *inside* _do_pipeline_dma, after the
        DMA channel is acquired — record_end stays here.
        """
        yield from self._do_pipeline_dma(env, token)
        self._on_process_end(env, token)
        # Self-routing (handle same-component consecutive stages)
        next_stage = token.advance()
        while next_stage is not None and next_stage.component == self.node.id:
            self._on_process_start(env, token)
            yield from self._do_pipeline_dma(env, token)
            self._on_process_end(env, token)
            next_stage = token.advance()
@@ -340,8 +353,13 @@ class PeDmaComponent(PeEngineBase):
            path = self.ctx.router.find_path(self._pe_prefix, dst_node)
            drain_ns = self.ctx.compute_drain_ns(path, nbytes)
            # Hold dma_res through the full round-trip — one DMA in flight
            # per PE per direction — so Option B's t_start (post-acquire)
            # bounds the actual transfer interval.
            with dma_res.request() as req:
                yield req
                # Option B: t_start = post-acquire moment.
                self._on_process_start(env, token)
                sub_done = env.event()
                sub_request = PeDmaMsg(
                    correlation_id="pipeline",
@@ -356,8 +374,11 @@ class PeDmaComponent(PeEngineBase):
                )
                if len(path) > 1:
                    yield self.out_ports[path[1]].put(sub_txn.advance())
-
+                yield sub_done
-            yield sub_done
+        else:
            # No-op (nbytes==0 or no ctx): no channel wait, but still record
            # so _on_process_end has a matching pending entry to finalise.
            self._on_process_start(env, token)
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
@@ -307,3 +307,95 @@ def test_pipeline_overlap_within_command():
    assert stage_times[(1, "dma")] == 10.0
    # tile1 gemm starts when tile0 gemm finishes (serialized at gemm queue)
    assert stage_times[(1, "gemm")] == 30.0
 # ── 6. Option B: pe_dma record_start fires post channel-acquire ────────
 def test_pe_dma_record_start_after_channel_acquire():
    """pe_dma op-log intervals must exclude DMA-channel queue wait.

    Three DMA_READ commands are pushed into pe_dma's inbox at t=0, so
    they contend for the same DMA channel. Because PeDmaComponent sets
    ``_DEFER_RECORD_START = True`` and records the start only after
    ``yield req`` returns, the test expects:
      - every op's ``t_end - t_start`` to match the first op's within
        1 ns (pure transfer time, no queueing inflation);
      - each op's ``t_start`` to be no earlier than the previous op's
        ``t_end`` minus 1 ns (the channel was held until completion).

    If ``record_start`` fired on command entry instead, all three ops
    would share ``t_start == 0`` and the later ones would report
    inflated durations.
    """
    from pathlib import Path
    from kernbench.common.pe_commands import DmaReadCmd, PeInternalTxn, TensorHandle
    from kernbench.policy.address.phyaddr import PhysAddr
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import load_topology

    topology_yaml = Path(__file__).parents[1] / "topology.yaml"

    def _encoded_hbm_source() -> int:
        # Same HBM source address for every command: PE 0's local slice.
        per_slice = (48 << 30) // 8
        return PhysAddr.pe_hbm_addr(
            sip_id=0, die_id=0, pe_id=0,
            pe_local_hbm_offset=0x1000, slice_size_bytes=per_slice,
        ).encode()

    # enable_data=True wires the OpLogger into every component.
    engine = GraphEngine(load_topology(topology_yaml), enable_data=True)
    pe_dma_id = "sip0.cube0.pe0.pe_dma"
    pe_dma = engine._components[pe_dma_id]
    env = engine._env

    # Build the three read transactions; they are all queued at t=0 and
    # therefore race for the single DMA channel.
    txns = []
    for idx in range(3):
        handle = TensorHandle(id=f"r{idx}", addr=0x1000 + idx * 0x1000,
                              shape=(64, 32), dtype="f16", nbytes=4096)
        read_cmd = DmaReadCmd(handle=handle, src_addr=_encoded_hbm_source(),
                              nbytes=4096)
        txns.append(PeInternalTxn(command=read_cmd, done=env.event()))

    def _feed_inbox():
        for pending in txns:
            yield pe_dma._inbox.put(pending)

    env.process(_feed_inbox())
    env.run()

    # Collect pe_dma's dma_read records in op-log order.
    dma_records = []
    for rec in engine.op_log:
        if rec.op_name == "dma_read" and rec.component_id == pe_dma_id:
            dma_records.append(rec)
    assert len(dma_records) == 3, (
        f"expected 3 dma_read records, got {len(dma_records)}: {dma_records}"
    )

    # Recorded service time must be the same for all three ops (±1 ns).
    durations = [rec.t_end - rec.t_start for rec in dma_records]
    base = durations[0]
    assert base > 0, f"first dma duration must be positive, got {base}"
    for i, d in enumerate(durations):
        assert abs(d - base) <= 1.0, (
            f"op {i} duration {d} differs from baseline {base} by >1 ns "
            f"— record_start may still be including queue wait"
        )

    # Consecutive ops must not overlap by more than 1 ns: the wait for the
    # channel is excluded from the recorded interval, not folded into it.
    for i, (earlier, later) in enumerate(zip(dma_records, dma_records[1:]),
                                         start=1):
        prev_end = earlier.t_end
        cur_start = later.t_start
        assert cur_start >= prev_end - 1.0, (
            f"op {i} t_start={cur_start} began before op {i-1} t_end={prev_end} "
            f"— channel was not actually held, fix is incorrect"
        )