Add tl.recv_no_consume diagnostic API for apples-to-apples pe2pe plot

The pe2pe overview compared IPCQ (tl.send + tl.recv) against raw DMA (tl.load + tl.store), but DMA is one-sided — DST never reads — while tl.recv pays a slot-read on DST. The comparison was unfair: IPCQ looked slower partly because it does more work. Adds tl.recv_no_consume() — a separate, diagnostic-only entry point that blocks for slot arrival but skips the slot-read (and bank-hop) charge on DST. Production tl.recv is unchanged (no `consume` kwarg on the public API), so the diagnostic flag can never accidentally leak into real workloads. Updates test_pe_to_pe_latency to call tl.recv_no_consume so the overview.png shows IPCQ no-consume vs raw DMA on equal footing. Also fixes PLOT_DIR back to docs/diagrams/pe2pe_latency_plots/ (was lost in a merge). Adds scripts/replot_pe2pe.py for label-only re-renders without re-measuring. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 18:20:44 -07:00
parent 9c129d6131
commit a563169e89
9 changed files with 245 additions and 48 deletions
@@ -1,7 +1,12 @@
 """PE-to-PE latency sweep across hop types and data sizes.

 Compares IPCQ send/recv vs raw-DMA (tl.load + tl.store) latency for four
-hop types:
+hop types. The IPCQ path uses ``tl.recv_no_consume(...)`` so that DST
+does not pay the slot-read latency — apples-to-apples with the DMA
+path, which is a one-sided write that has no read on DST.
+
+``tl.recv_no_consume`` is a DIAGNOSTIC-only entry point that exists
+solely to draw this graph; production kernels use ``tl.recv``.

  H1 Intra-cube horizontal   pe0 → pe1
  H2 Intra-cube vertical     pe0 → pe4
@@ -28,7 +33,9 @@ from kernbench.sim_engine.engine import GraphEngine
 from kernbench.topology.builder import resolve_topology

 TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
-PLOT_DIR = Path(__file__).parent / "pe2pe_latency_plots"
+PLOT_DIR = (
+    Path(__file__).parent.parent / "docs" / "diagrams" / "pe2pe_latency_plots"
+)

 SIZES = [128, 256, 384, 512, 768, 1024, 2048, 4096, 8192, 10240]

@@ -101,7 +108,12 @@ def _measure_ipcq(hop: Hop, nbytes: int) -> float:
                data = tl.load(t_ptr, shape=(n_elem,), dtype="f16")
                tl.send(dir=send_dir, src=data)
            elif cube_id == dst_cube and pe_id == dst_pe:
-                tl.recv(dir=recv_dir, shape=(n_elem,), dtype="f16")
+                # tl.recv_no_consume: DST blocks until bytes land in
+                # slot but skips slot-read latency. Apples-to-apples
+                # with the raw-DMA path below, which has no DST read.
+                # Diagnostic-only — production kernels use tl.recv.
+                tl.recv_no_consume(dir=recv_dir,
+                                   shape=(n_elem,), dtype="f16")

        tensors = []
        for s in sorted({src_sip, dst_sip}):
@@ -238,7 +250,8 @@ def _plot_per_hop(records, hop: Hop, path: Path) -> None:
        ax.plot(
            [r["size_bytes"] for r in ipcq],
            [r["total_ns"] for r in ipcq],
-            marker="o", label="IPCQ (send/recv)", color="tab:blue",
+            marker="o", label="IPCQ no-consume (send/recv, no slot read)",
+            color="tab:blue",
        )
    if raw:
        ax.plot(
@@ -275,13 +288,13 @@ def _plot_overview(records, path: Path) -> None:
            ax.plot(
                [r["size_bytes"] for r in ipcq],
                [r["total_ns"] for r in ipcq],
-                marker="o", label="IPCQ", color="tab:blue",
+                marker="o", label="IPCQ no-consume", color="tab:blue",
            )
        if raw:
            ax.plot(
                [r["size_bytes"] for r in raw],
                [r["total_ns"] for r in raw],
-                marker="s", label="Raw", color="tab:orange",
+                marker="s", label="Raw DMA", color="tab:orange",
            )
        ax.set_title(hop.label, fontsize=10)
        ax.set_xlabel("bytes")
@@ -291,7 +304,7 @@ def _plot_overview(records, path: Path) -> None:
    for j in range(len(HOPS), len(axes)):
        axes[j].axis("off")
    fig.suptitle(
-        "PE-to-PE latency: IPCQ vs raw DMA",
+        "PE-to-PE latency: IPCQ no-consume vs raw DMA",
        fontsize=14,
    )
    fig.tight_layout()
@@ -307,7 +320,8 @@ def test_pe_to_pe_latency_sweep():

    for hop in HOPS:
        for size in SIZES:
-            # IPCQ path
+            # IPCQ path uses tl.recv_no_consume — apples-to-apples
+            # with the raw-DMA path, which has no DST read either.
            ipcq_ns = _measure_ipcq(hop, size)
            records.append({
                "hop": hop.id, "label": hop.label,