diff --git a/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png b/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png index 26a54be..848db8e 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png and b/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png b/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png index 62f5f44..396bb0d 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png and b/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png b/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png index bce23ea..ab29c2f 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png and b/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png b/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png index 5c77c81..2a8212d 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png and b/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/overview.png b/docs/diagrams/pe2pe_latency_plots/overview.png index f9a8191..6193f96 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/overview.png and b/docs/diagrams/pe2pe_latency_plots/overview.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/summary.csv b/docs/diagrams/pe2pe_latency_plots/summary.csv index 288abec..ee95166 100644 --- a/docs/diagrams/pe2pe_latency_plots/summary.csv +++ b/docs/diagrams/pe2pe_latency_plots/summary.csv @@ -1,81 +1,81 @@ hop,label,size_bytes,path,total_ns -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,ipcq,31.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,ipcq,31.3899999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,raw,12.019999999996799 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,ipcq,33.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,ipcq,33.1399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,raw,13.019999999996799 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,ipcq,35.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,ipcq,34.8899999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,raw,14.019999999996799 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,ipcq,37.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,ipcq,36.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,raw,15.019999999996799 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,ipcq,41.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,ipcq,40.1399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,raw,17.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,ipcq,45.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,ipcq,43.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,raw,19.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,ipcq,61.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,ipcq,57.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,raw,27.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,ipcq,93.6399999999976 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,ipcq,85.6399999999976 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,raw,43.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,ipcq,157.64000000000306 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,ipcq,141.64000000000306 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,raw,75.02000000000407 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,ipcq,189.64000000000306 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,ipcq,169.64000000000306 h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,raw,91.02000000000407 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,ipcq,31.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,ipcq,31.3899999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,raw,12.019999999996799 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,ipcq,33.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,ipcq,33.1399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,raw,13.019999999996799 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,ipcq,35.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,ipcq,34.8899999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,raw,14.019999999996799 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,ipcq,37.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,ipcq,36.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,raw,15.019999999996799 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,ipcq,41.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,ipcq,40.1399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,raw,17.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,ipcq,45.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,ipcq,43.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,raw,19.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,ipcq,61.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,ipcq,57.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,raw,27.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,ipcq,93.6399999999976 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,ipcq,85.6399999999976 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,raw,43.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,ipcq,157.64000000000306 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,ipcq,141.64000000000306 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,raw,75.02000000000407 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,ipcq,189.64000000000306 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,ipcq,169.64000000000306 h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,raw,91.02000000000407 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,ipcq,67.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,ipcq,67.40999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,raw,68.53999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,ipcq,69.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,ipcq,69.15999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,raw,70.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,ipcq,71.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,ipcq,70.90999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),384,raw,71.53999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,ipcq,73.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,ipcq,72.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),512,raw,73.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,ipcq,77.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,ipcq,76.15999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),768,raw,76.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,ipcq,81.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,ipcq,79.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,raw,79.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,ipcq,97.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,ipcq,93.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,raw,91.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,ipcq,129.65999999999804 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,ipcq,121.65999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,raw,115.03999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,ipcq,193.65999999999985 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,ipcq,177.65999999999985 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,raw,163.04000000000087 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,ipcq,225.65999999999985 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,ipcq,205.65999999999985 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,raw,187.04000000000087 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,ipcq,87.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,ipcq,87.40999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),128,raw,88.53999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,ipcq,89.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,ipcq,89.15999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),256,raw,90.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,ipcq,91.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,ipcq,90.90999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),384,raw,91.53999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,ipcq,93.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,ipcq,92.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),512,raw,93.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,ipcq,97.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,ipcq,96.15999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),768,raw,96.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,ipcq,101.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,ipcq,99.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,raw,99.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,ipcq,117.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,ipcq,113.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,raw,111.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,ipcq,149.65999999999804 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,ipcq,141.65999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,raw,135.03999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,ipcq,213.65999999999985 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,ipcq,197.65999999999985 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,raw,183.04000000000087 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,ipcq,245.65999999999985 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,ipcq,225.65999999999985 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,raw,207.04000000000087 diff --git a/scripts/replot_pe2pe.py b/scripts/replot_pe2pe.py new file mode 100644 index 0000000..3260790 --- /dev/null +++ b/scripts/replot_pe2pe.py @@ -0,0 +1,141 @@ +"""Re-render pe2pe latency PNGs from the existing summary.csv with the +current (no-consume) labels. Used after a label-only test edit to avoid +re-measuring (~5 min) when the data on disk is already correct. + +Reads docs/diagrams/pe2pe_latency_plots/summary.csv. Plots 2 curves: +"IPCQ no-consume" (from the ipcq_no_consume rows if present, else from +the ipcq rows) and "Raw DMA" (raw rows). +""" +from __future__ import annotations + +import csv +from pathlib import Path + +import matplotlib.pyplot as plt + +ROOT = Path(__file__).resolve().parent.parent +PLOT_DIR = ROOT / "docs" / "diagrams" / "pe2pe_latency_plots" +CSV_PATH = PLOT_DIR / "summary.csv" + + +def _load_records(): + rows = [] + with open(CSV_PATH, newline="") as f: + for r in csv.DictReader(f): + rows.append({ + "hop": r["hop"], + "label": r["label"], + "size_bytes": int(r["size_bytes"]), + "path": r["path"], + "total_ns": float(r["total_ns"]), + }) + return rows + + +def _ipcq_rows(records, hop): + # Prefer ipcq_no_consume if present (older 3-path CSV); fall back to ipcq + # (current single-path CSV where ipcq IS no-consume). + nc = [r for r in records + if r["hop"] == hop and r["path"] == "ipcq_no_consume"] + if nc: + return sorted(nc, key=lambda r: r["size_bytes"]) + return sorted( + [r for r in records if r["hop"] == hop and r["path"] == "ipcq"], + key=lambda r: r["size_bytes"], + ) + + +def _raw_rows(records, hop): + return sorted( + [r for r in records if r["hop"] == hop and r["path"] == "raw"], + key=lambda r: r["size_bytes"], + ) + + +def _hops(records): + seen = [] + for r in records: + if r["hop"] not in {h["id"] for h in seen}: + seen.append({"id": r["hop"], "label": r["label"]}) + return seen + + +def _plot_per_hop(records, hop, path): + ipcq = _ipcq_rows(records, hop["id"]) + raw = _raw_rows(records, hop["id"]) + fig, ax = plt.subplots(figsize=(8, 5)) + if ipcq: + ax.plot( + [r["size_bytes"] for r in ipcq], + [r["total_ns"] for r in ipcq], + marker="o", color="tab:blue", + label="IPCQ no-consume (send/recv, no slot read)", + ) + if raw: + ax.plot( + [r["size_bytes"] for r in raw], + [r["total_ns"] for r in raw], + marker="s", color="tab:orange", + label="Raw DMA (load+store)", + ) + ax.set_xlabel("Data size (bytes)") + ax.set_ylabel("Latency (ns)") + ax.set_title(hop["label"]) + ax.grid(True, alpha=0.3) + ax.legend() + fig.tight_layout() + fig.savefig(path, dpi=120) + plt.close(fig) + + +def _plot_overview(records, hops, path): + fig, axes = plt.subplots(2, 2, figsize=(13, 9)) + axes = axes.flatten() + for i, hop in enumerate(hops): + ax = axes[i] + ipcq = _ipcq_rows(records, hop["id"]) + raw = _raw_rows(records, hop["id"]) + if ipcq: + ax.plot( + [r["size_bytes"] for r in ipcq], + [r["total_ns"] for r in ipcq], + marker="o", color="tab:blue", + label="IPCQ no-consume", + ) + if raw: + ax.plot( + [r["size_bytes"] for r in raw], + [r["total_ns"] for r in raw], + marker="s", color="tab:orange", + label="Raw DMA", + ) + ax.set_title(hop["label"], fontsize=10) + ax.set_xlabel("bytes") + ax.set_ylabel("ns") + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8) + for j in range(len(hops), len(axes)): + axes[j].axis("off") + fig.suptitle( + "PE-to-PE latency: IPCQ no-consume vs raw DMA", + fontsize=14, + ) + fig.tight_layout() + fig.savefig(path, dpi=120) + plt.close(fig) + + +def main(): + records = _load_records() + hops = _hops(records) + for hop in hops: + out = PLOT_DIR / f"{hop['id']}.png" + _plot_per_hop(records, hop, out) + print(f"wrote {out}") + overview = PLOT_DIR / "overview.png" + _plot_overview(records, hops, overview) + print(f"wrote {overview}") + + +if __name__ == "__main__": + main() diff --git a/src/kernbench/triton_emu/tl_context.py b/src/kernbench/triton_emu/tl_context.py index 64b65a8..e35ccbe 100644 --- a/src/kernbench/triton_emu/tl_context.py +++ b/src/kernbench/triton_emu/tl_context.py @@ -492,6 +492,48 @@ class TLContext: ) return self._make_handle(addr=0, shape=shape, dtype=dtype) + def recv_no_consume( + self, + dir: str | None = None, + shape: tuple[int, ...] = (), + dtype: str = "f16", + ) -> TensorHandle: + """DIAGNOSTIC ONLY — recv that blocks for arrival but skips slot read. + + Same blocking semantics as ``tl.recv``: the kernel waits until + the payload has landed in the IPCQ slot. Differs from ``tl.recv`` + by skipping the slot-read latency charge (slot-IO + PE↔bank + fabric drain) on DST. + + This entry point exists solely so the pe2pe overview plot can + draw an apples-to-apples comparison against ``tl.store`` (a + one-sided fabric write that pays no read on DST). Production + kernels MUST use ``tl.recv`` — they need to consume the data + they receive. This API is segregated from ``tl.recv`` so the + diagnostic flag can never accidentally be set in real workloads. + """ + self._emit_dispatch_overhead() + cmd = IpcqRecvCmd( + direction=dir, + shape=shape, dtype=dtype, + handle_id=self._next_handle_id(), + consume=False, + ) + result = self._emit(cmd) # type: ignore[arg-type] + if isinstance(result, dict): + slot_addr = int(result.get("src_addr", 0)) + slot_space = str(result.get("src_space", "tcm")) + return TensorHandle( + id=self._next_handle_id(), + addr=slot_addr, + shape=shape, + dtype=dtype, + nbytes=self._nbytes(shape, dtype), + data=None, + space=slot_space, + ) + return self._make_handle(addr=0, shape=shape, dtype=dtype) + def recv_async( self, dir: str, diff --git a/tests/test_pe_to_pe_latency.py b/tests/test_pe_to_pe_latency.py index 622ce7c..8af2896 100644 --- a/tests/test_pe_to_pe_latency.py +++ b/tests/test_pe_to_pe_latency.py @@ -1,7 +1,12 @@ """PE-to-PE latency sweep across hop types and data sizes. Compares IPCQ send/recv vs raw-DMA (tl.load + tl.store) latency for four -hop types: +hop types. The IPCQ path uses ``tl.recv_no_consume(...)`` so that DST +does not pay the slot-read latency — apples-to-apples with the DMA +path, which is a one-sided write that has no read on DST. + +``tl.recv_no_consume`` is a DIAGNOSTIC-only entry point that exists +solely to draw this graph; production kernels use ``tl.recv``. H1 Intra-cube horizontal pe0 → pe1 H2 Intra-cube vertical pe0 → pe4 @@ -28,7 +33,9 @@ from kernbench.sim_engine.engine import GraphEngine from kernbench.topology.builder import resolve_topology TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" -PLOT_DIR = Path(__file__).parent / "pe2pe_latency_plots" +PLOT_DIR = ( + Path(__file__).parent.parent / "docs" / "diagrams" / "pe2pe_latency_plots" +) SIZES = [128, 256, 384, 512, 768, 1024, 2048, 4096, 8192, 10240] @@ -101,7 +108,12 @@ def _measure_ipcq(hop: Hop, nbytes: int) -> float: data = tl.load(t_ptr, shape=(n_elem,), dtype="f16") tl.send(dir=send_dir, src=data) elif cube_id == dst_cube and pe_id == dst_pe: - tl.recv(dir=recv_dir, shape=(n_elem,), dtype="f16") + # tl.recv_no_consume: DST blocks until bytes land in + # slot but skips slot-read latency. Apples-to-apples + # with the raw-DMA path below, which has no DST read. + # Diagnostic-only — production kernels use tl.recv. + tl.recv_no_consume(dir=recv_dir, + shape=(n_elem,), dtype="f16") tensors = [] for s in sorted({src_sip, dst_sip}): @@ -238,7 +250,8 @@ def _plot_per_hop(records, hop: Hop, path: Path) -> None: ax.plot( [r["size_bytes"] for r in ipcq], [r["total_ns"] for r in ipcq], - marker="o", label="IPCQ (send/recv)", color="tab:blue", + marker="o", label="IPCQ no-consume (send/recv, no slot read)", + color="tab:blue", ) if raw: ax.plot( @@ -275,13 +288,13 @@ def _plot_overview(records, path: Path) -> None: ax.plot( [r["size_bytes"] for r in ipcq], [r["total_ns"] for r in ipcq], - marker="o", label="IPCQ", color="tab:blue", + marker="o", label="IPCQ no-consume", color="tab:blue", ) if raw: ax.plot( [r["size_bytes"] for r in raw], [r["total_ns"] for r in raw], - marker="s", label="Raw", color="tab:orange", + marker="s", label="Raw DMA", color="tab:orange", ) ax.set_title(hop.label, fontsize=10) ax.set_xlabel("bytes") @@ -291,7 +304,7 @@ def _plot_overview(records, path: Path) -> None: for j in range(len(HOPS), len(axes)): axes[j].axis("off") fig.suptitle( - "PE-to-PE latency: IPCQ vs raw DMA", + "PE-to-PE latency: IPCQ no-consume vs raw DMA", fontsize=14, ) fig.tight_layout() @@ -307,7 +320,8 @@ def test_pe_to_pe_latency_sweep(): for hop in HOPS: for size in SIZES: - # IPCQ path + # IPCQ path uses tl.recv(consume=False) — apples-to-apples + # with the raw-DMA path, which has no DST read either. ipcq_ns = _measure_ipcq(hop, size) records.append({ "hop": hop.id, "label": hop.label,