From 2f69930162283e2c0c1712addf46abdf7d0498e5 Mon Sep 17 00:00:00 2001
From: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
Date: Sun, 28 Jun 2026 17:04:07 +0000
Subject: [PATCH] eigh_py: reject physically-impossible benchmark times
 (roofline floor)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The leaderboard score is a geometric mean of per-shape times with no lower
bound. A submission can drive one shape's reported time toward zero (e.g. by
tampering with the stats reducer) and collapse the whole geomean to 0.000000 —
an unbeatable 'infinite speedup' — while still returning correct results and
passing every test. This was confirmed live on the eigh B200 leaderboard.

Add an output-side sanity check on the emitted per-shape mean: derive a loose
physical lower bound (peak HBM bandwidth to read A + write Q/L, and peak FLOP
rate for the n^3 work) and reject any reported time below it. The bound uses the
highest published B200 ceilings and a 1e-3 slack factor, so it only rejects the
physically impossible and never false-positives an honest kernel (the tiny
launch-bound shapes keep orders of magnitude of headroom).

This is a backstop against the unbounded-aggregator gap, independent of how a
too-low number was produced. It does not catch a merely-implausible time (that
needs the per-iteration input regeneration / out-of-process timing fixes); it
catches the impossible.
---
 problems/linalg/eigh_py/eval.py | 82 ++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/problems/linalg/eigh_py/eval.py b/problems/linalg/eigh_py/eval.py
index c0dd353a..4cfe6df9 100644
--- a/problems/linalg/eigh_py/eval.py
+++ b/problems/linalg/eigh_py/eval.py
@@ -23,6 +23,68 @@
 BENCHMARK_INPUT_BYTES_TARGET = 256 * 1024 * 1024
 
 
+# ---------------------------------------------------------------------------
+# OUTPUT-SIDE ROOFLINE FLOOR (red-team hardening, brief-13).
+#
+# The mechanism-side defenses (parent-side timing, reject-Tensor-subclass,
+# recheck accounting, per-iteration regenerate) each close ONE way a number is
+# forged. This is the complementary OUTPUT-SIDE check: it sanity-checks WHAT
+# number the timed loop emitted, independent of HOW it was produced, so it also
+# rejects forging mechanisms not yet discovered (clock-divide, stats-patch,
+# function-replacement, the mode-gated apex, single-shape aggregation-collapse).
+#
+# `_run_single_benchmark` reports `stats.mean` as the per-shape time, in
+# NANOSECONDS for ONE (batch, n, n) batched-eigh call (it is
+# `Event.elapsed_time(ms) * 1e6 / len(data_list)`, and benchmark.sh divides by
+# 1000 to get microseconds). We derive a PHYSICAL lower bound on that time for a
+# batched real-symmetric eigendecomposition on this GPU and ERROR any shape
+# whose reported mean falls below it.
+#
+# The bound is derived PURELY from hardware physics and the irreducible work of
+# the operation -- it is NEVER back-fit to any observed honest time:
+#
+#   * Bandwidth floor: an honest eigh must, at minimum, READ the input A
+#     (batch*n*n*4 B) and WRITE the eigenvectors Q (batch*n*n*4 B) and
+#     eigenvalues L (batch*n*4 B). That irreducible traffic divided by the
+#     GPU's PEAK HBM bandwidth is a hard lower bound on the time -- no algorithm
+#     can move the bytes faster than the memory system.
+#   * Compute floor: a dense symmetric eigendecomposition with eigenvectors is
+#     Theta(n^3) flops per matrix (tridiagonalization ~4/3 n^3 + back-transform
+#     ~2 n^3); batch * n^3 (coefficient 1.0) is a conservative lower bound on the
+#     essential flops. Divided by the GPU's PEAK flop rate it bounds the time.
+#
+# Both ceilings are the HIGHEST published B200 figures (8 TB/s; 9 PFLOP/s, the
+# FP4 dense tensor-core rate -- the fastest ANY precision can run). Using the
+# highest ceilings makes the physical bound as SMALL as possible, so it is a
+# guaranteed lower bound on the honest time even for a mixed-precision tensor-core
+# solver. We then multiply by a deliberately LOOSE fraction (1e-3) so every
+# honest shape -- including the tiny launch-bound shape whose real microseconds
+# sit orders of magnitude above any roofline -- keeps enormous headroom and is
+# NEVER false-positived. The floor only rejects the physically IMPOSSIBLE.
+# ---------------------------------------------------------------------------
+# Peak B200 (Blackwell, sm_100) hardware ceilings, from the NVIDIA datasheet.
+_ROOFLINE_PEAK_BW_BYTES_PER_S = 8.0e12   # 8 TB/s HBM3e (peak)
+_ROOFLINE_PEAK_FLOP_PER_S = 9.0e15       # 9 PFLOP/s, FP4 dense tensor (highest any precision reaches)
+# Deliberately loose safety margin below the (already-too-optimistic) physical
+# limit. NEVER tuned to observed honest times.
+_ROOFLINE_LOOSE_FRACTION = 1.0e-3
+
+
+def _roofline_floor_ns(batch: int, n: int) -> float:
+    """Physical lower bound (ns) on a batched (batch x n x n) real-symmetric
+    eigendecomposition's per-call time on this GPU. Pure physics; loose."""
+    if batch <= 0 or n <= 0:
+        return 0.0
+    # Irreducible HBM traffic: read A + write Q + write L (FP32, 4 bytes each).
+    bytes_min = (2.0 * batch * n * n + batch * n) * 4.0
+    t_bw_ns = bytes_min / _ROOFLINE_PEAK_BW_BYTES_PER_S * 1e9
+    # Irreducible flops: batch * n^3 (conservative vs the ~3.3 n^3 real cost).
+    flops_min = float(batch) * (float(n) ** 3)
+    t_flop_ns = flops_min / _ROOFLINE_PEAK_FLOP_PER_S * 1e9
+    # The binding (larger) lower bound, scaled down by the loose fraction.
+    return _ROOFLINE_LOOSE_FRACTION * max(t_bw_ns, t_flop_ns)
+
+
 class PopcornOutput:
     def __init__(self, fd: int):
         self.file = os.fdopen(fd, "w")
@@ -219,7 +281,25 @@ def _run_single_benchmark(
             ):
                 break
 
-    return calculate_stats(durations)
+    stats = calculate_stats(durations)
+
+    # OUTPUT-SIDE ROOFLINE FLOOR: reject a physically-impossible reported time,
+    # independent of how it was forged. stats.mean is ns for one (batch,n,n)
+    # batched-eigh call -- the same quantity benchmark.sh converts to us.
+    batch = int(test.args.get("batch", 1))
+    n = int(test.args.get("n", 1))
+    floor_ns = _roofline_floor_ns(batch, n)
+    if stats.mean < floor_ns:
+        return (
+            "reported time below physical roofline floor: "
+            f"batch={batch}, n={n}, reported_mean={stats.mean:.6g} ns "
+            f"({stats.mean / 1000.0:.6g} us), floor={floor_ns:.6g} ns "
+            f"({floor_ns / 1000.0:.6g} us); a batched real-symmetric eigh "
+            "cannot complete this fast on this GPU (read A + write Q exceeds "
+            "peak HBM bandwidth at this size)"
+        )
+
+    return stats
 
 
 def run_single_benchmark(