From 2f69930162283e2c0c1712addf46abdf7d0498e5 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach Date: Sun, 28 Jun 2026 17:04:07 +0000 Subject: [PATCH] eigh_py: reject physically-impossible benchmark times (roofline floor) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The leaderboard score is a geometric mean of per-shape times with no lower bound. A submission can drive one shape's reported time toward zero (e.g. by tampering with the stats reducer) and collapse the whole geomean to 0.000000 — an unbeatable 'infinite speedup' — while still returning correct results and passing every test. This was confirmed live on the eigh B200 leaderboard. Add an output-side sanity check on the emitted per-shape mean: derive a loose physical lower bound (peak HBM bandwidth to read A + write Q/L, and peak FLOP rate for the n^3 work) and reject any reported time below it. The bound uses the highest published B200 ceilings and a 1e-3 slack factor, so it only rejects the physically impossible and never false-positives an honest kernel (the tiny launch-bound shapes keep orders of magnitude of headroom). This is a backstop against the unbounded-aggregator gap, independent of how a too-low number was produced. It does not catch a merely-implausible time (that needs the per-iteration input regeneration / out-of-process timing fixes); it catches the impossible. --- problems/linalg/eigh_py/eval.py | 82 ++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/problems/linalg/eigh_py/eval.py b/problems/linalg/eigh_py/eval.py index c0dd353a..4cfe6df9 100644 --- a/problems/linalg/eigh_py/eval.py +++ b/problems/linalg/eigh_py/eval.py @@ -23,6 +23,68 @@ BENCHMARK_INPUT_BYTES_TARGET = 256 * 1024 * 1024 +# --------------------------------------------------------------------------- +# OUTPUT-SIDE ROOFLINE FLOOR (red-team hardening, brief-13). 
+# +# The mechanism-side defenses (parent-side timing, reject-Tensor-subclass, +# recheck accounting, per-iteration regenerate) each close ONE way a number is +# forged. This is the complementary OUTPUT-SIDE check: it sanity-checks WHAT +# number the timed loop emitted, independent of HOW it was produced, so it also +# rejects times forged by any other mechanism, known (clock-divide, stats-patch, +# function-replacement, the mode-gated apex, single-shape aggregation-collapse) or not yet discovered, whenever the forged time falls below the floor. +# +# `_run_single_benchmark` reports `stats.mean` as the per-shape time, in +# NANOSECONDS for ONE (batch, n, n) batched-eigh call (it is +# `Event.elapsed_time(ms) * 1e6 / len(data_list)`, and benchmark.sh divides by +# 1000 to get microseconds). We derive a PHYSICAL lower bound on that time for a +# batched real-symmetric eigendecomposition on this GPU and return an error for any shape +# whose reported mean falls below it. +# +# The bound is derived PURELY from hardware physics and the irreducible work of +# the operation -- it is NEVER back-fit to any observed honest time: +# +# * Bandwidth floor: an honest eigh must, at minimum, READ the input A +# (batch*n*n*4 B) and WRITE the eigenvectors Q (batch*n*n*4 B) and +# eigenvalues L (batch*n*4 B). That irreducible traffic divided by the +# GPU's PEAK HBM bandwidth is a hard lower bound on the time -- no algorithm +# can move the bytes faster than the memory system. +# * Compute floor: a dense symmetric eigendecomposition with eigenvectors is +# Theta(n^3) flops per matrix (tridiagonalization ~4/3 n^3 + back-transform +# ~2 n^3); batch * n^3 (coefficient 1.0) is a conservative lower bound on the +# essential flops. Divided by the GPU's PEAK flop rate it bounds the time. +# +# Both ceilings are the HIGHEST published B200 figures (8 TB/s; 9 PFLOP/s, the +# FP4 dense tensor-core rate -- the fastest ANY precision can run). 
Using the +# highest ceilings makes the physical bound as SMALL as possible, so it is a +# guaranteed lower bound on the honest time even for a mixed-precision tensor-core +# solver. We then multiply by a deliberately LOOSE fraction (1e-3) so every +# honest shape -- including the tiny launch-bound shape whose real microseconds +# sit orders of magnitude above any roofline -- keeps enormous headroom and is +# NEVER false-positived. The floor only rejects the physically IMPOSSIBLE. +# --------------------------------------------------------------------------- +# Peak B200 (Blackwell, sm_100) hardware ceilings, from the NVIDIA datasheet. +_ROOFLINE_PEAK_BW_BYTES_PER_S = 8.0e12 # 8 TB/s HBM3e (peak) +_ROOFLINE_PEAK_FLOP_PER_S = 9.0e15 # 9 PFLOP/s, FP4 dense tensor (highest any precision reaches) +# Deliberately loose safety margin below the (already-too-optimistic) physical +# limit. NEVER tuned to observed honest times. +_ROOFLINE_LOOSE_FRACTION = 1.0e-3 + + +def _roofline_floor_ns(batch: int, n: int) -> float: + """Physical lower bound (ns) on a batched (batch x n x n) real-symmetric + eigendecomposition's per-call time on this GPU. Pure physics; loose.""" + if batch <= 0 or n <= 0: + return 0.0 + # Irreducible HBM traffic: read A + write Q + write L (FP32, 4 bytes each). + bytes_min = (2.0 * batch * n * n + batch * n) * 4.0 + t_bw_ns = bytes_min / _ROOFLINE_PEAK_BW_BYTES_PER_S * 1e9 + # Irreducible flops: batch * n^3 (conservative vs the ~3.3 n^3 real cost). + flops_min = float(batch) * (float(n) ** 3) + t_flop_ns = flops_min / _ROOFLINE_PEAK_FLOP_PER_S * 1e9 + # The binding (larger) lower bound, scaled down by the loose fraction. 
+ return _ROOFLINE_LOOSE_FRACTION * max(t_bw_ns, t_flop_ns) + + class PopcornOutput: def __init__(self, fd: int): self.file = os.fdopen(fd, "w") @@ -219,7 +281,25 @@ def _run_single_benchmark( ): break - return calculate_stats(durations) + stats = calculate_stats(durations) + + # OUTPUT-SIDE ROOFLINE FLOOR: reject a physically-impossible reported time, + # independent of how it was forged. stats.mean is ns for one (batch,n,n) + # batched-eigh call -- the same quantity benchmark.sh converts to us. The negated >= below also rejects a NaN mean (NaN < floor is False, so a plain < would let it through). + batch = int(test.args.get("batch", 1)) + n = int(test.args.get("n", 1)) + floor_ns = _roofline_floor_ns(batch, n) + if not (stats.mean >= floor_ns): + return ( + "reported time below physical roofline floor: " + f"batch={batch}, n={n}, reported_mean={stats.mean:.6g} ns " + f"({stats.mean / 1000.0:.6g} us), floor={floor_ns:.6g} ns " + f"({floor_ns / 1000.0:.6g} us); a batched real-symmetric eigh " + "cannot complete this fast on this GPU (time is NaN or below the " + "loose peak-HBM-bandwidth / peak-FLOP roofline)" + ) + + return stats def run_single_benchmark(