From 2574ea30ba59304922037e207c77232e46cdc5ef Mon Sep 17 00:00:00 2001
From: Ryan Shatford <ev1lm0nk3y@users.noreply.github.com>
Date: Thu, 4 Jun 2026 01:12:22 -0700
Subject: [PATCH 1/2] feat: add --strict-deterministic flag to strictly
 evaluate deterministic metrics

This flag allows benchdiff to bypass benchstat's statistical significance
checks for allocs/op and B/op, ensuring that any regression in these
metrics exceeding the tolerance threshold triggers a failure.
---
 cmd/benchdiff/benchdiff.go              |  5 +-
 cmd/benchdiff/internal/benchdiff.go     | 21 ++++--
 cmd/benchdiff/internal/degraded_test.go | 96 +++++++++++++++++++++++++
 3 files changed, 114 insertions(+), 8 deletions(-)
 create mode 100644 cmd/benchdiff/internal/degraded_test.go

diff --git a/cmd/benchdiff/benchdiff.go b/cmd/benchdiff/benchdiff.go
index dcd13b6..6254913 100644
--- a/cmd/benchdiff/benchdiff.go
+++ b/cmd/benchdiff/benchdiff.go
@@ -81,6 +81,7 @@ var benchVars = kong.Vars{
 	"WarmupCountHelp":      `Run benchmarks with -count=n as a warmup`,
 	"WarmupTimeHelp":       `When warmups are run, set -benchtime=n`,
 	"TagsHelp":             `Set the -tags flag on the go test command`,
+	"StrictDeterministicHelp": `Strictly evaluate deterministic metrics (allocs/op, B/op) for regressions, ignoring statistical significance.`,
 }
 
 var groupHelp = kong.Vars{
@@ -100,6 +101,7 @@ var cli struct {
 	JSON      bool          `kong:"help=${JSONHelp},group='x'"`
 	OnDegrade int           `kong:"name=on-degrade,default=0,help=${OnDegradeHelp},group='x'"`
 	Tolerance float64       `kong:"default='10.0',help=${ToleranceHelp},group='x'"`
+	StrictDeterministic bool `kong:"help=${StrictDeterministicHelp},group='x'"`
 
 	Bench            string               `kong:"default='.',help=${BenchHelp},group='gotest'"`
 	BenchmarkArgs    string               `kong:"placeholder='args',help=${BenchmarkArgsHelp},group='gotest'"`
@@ -289,9 +291,10 @@ func main() {
 		BenchstatFormatter: bStat.OutputFormatter,
 		OutputFormat:       outputFormat,
 		Tolerance:          cli.Tolerance,
+		StrictDeterministic: cli.StrictDeterministic,
 	})
 	kctx.FatalIfErrorf(err)
-	if result.HasDegradedResult(cli.Tolerance) {
+	if result.HasDegradedResult(cli.Tolerance, cli.StrictDeterministic) {
 		os.Exit(cli.OnDegrade)
 	}
 }
diff --git a/cmd/benchdiff/internal/benchdiff.go b/cmd/benchdiff/internal/benchdiff.go
index 5d4a46d..ef12fce 100644
--- a/cmd/benchdiff/internal/benchdiff.go
+++ b/cmd/benchdiff/internal/benchdiff.go
@@ -251,6 +251,7 @@ type RunResultOutputOptions struct {
 	BenchstatFormatter benchstatter.OutputFormatter // default benchstatter.TextFormatter(nil)
 	OutputFormat       string                       // one of json or human. default: human
 	Tolerance          float64
+	StrictDeterministic bool
 }
 
 // WriteOutput outputs the result
@@ -262,6 +263,7 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error
 		BenchstatFormatter: benchstatter.TextFormatter(nil),
 		OutputFormat:       "human",
 		Tolerance:          opts.Tolerance,
+		StrictDeterministic: opts.StrictDeterministic,
 	}
 	if opts.BenchstatFormatter != nil {
 		finalOpts.BenchstatFormatter = opts.BenchstatFormatter
@@ -281,13 +283,13 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error
 	case "human":
 		return r.writeHumanResult(w, benchstatBuf.String())
 	case "json":
-		return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance)
+		return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance, finalOpts.StrictDeterministic)
 	default:
 		return fmt.Errorf("unknown OutputFormat")
 	}
 }
 
-func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance float64) error {
+func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance float64, strictDeterministic bool) error {
 	type runResultJSON struct {
 		BenchCommand    string `json:"bench_command,omitempty"`
 		HeadSHA         string `json:"head_sha,omitempty"`
@@ -302,7 +304,7 @@ func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, toleran
 		BenchstatOutput: benchstatResult,
 		HeadSHA:         r.headSHA,
 		BaseSHA:         r.baseSHA,
-		DegradedResult:  r.HasDegradedResult(tolerance),
+		DegradedResult:  r.HasDegradedResult(tolerance, strictDeterministic),
 	})
 }
 
@@ -329,15 +331,20 @@ func (r *RunResult) writeHumanResult(w io.Writer, benchstatResult string) error
 }
 
 // HasDegradedResult returns true if there are any rows with DegradingChange and PctDelta over tolerance
-func (r *RunResult) HasDegradedResult(tolerance float64) bool {
-	return r.maxDegradedPct() > tolerance
+func (r *RunResult) HasDegradedResult(tolerance float64, strictDeterministic bool) bool {
+	return r.maxDegradedPct(strictDeterministic) > tolerance
 }
 
-func (r *RunResult) maxDegradedPct() float64 {
+func (r *RunResult) maxDegradedPct(strictDeterministic bool) float64 {
 	max := 0.0
 	for _, table := range r.tables {
+		isDeterministic := table.Metric == "B/op" || table.Metric == "allocs/op"
 		for _, row := range table.Rows {
-			if row.Change != DegradingChange {
+			isDegraded := row.Change == DegradingChange
+			if strictDeterministic && isDeterministic && row.PctDelta > 0 {
+				isDegraded = true
+			}
+			if !isDegraded {
 				continue
 			}
 			if row.PctDelta > max {
diff --git a/cmd/benchdiff/internal/degraded_test.go b/cmd/benchdiff/internal/degraded_test.go
new file mode 100644
index 0000000..0552a56
--- /dev/null
+++ b/cmd/benchdiff/internal/degraded_test.go
@@ -0,0 +1,96 @@
+package internal
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	"golang.org/x/perf/benchstat"
+)
+
+func TestRunResult_HasDegradedResult(t *testing.T) {
+	t.Run("time-based degradation", func(t *testing.T) {
+		r := &RunResult{
+			tables: []*benchstat.Table{
+				{
+					Metric: "ns/op",
+					Rows: []*benchstat.Row{
+						{
+							PctDelta: 15.0,
+							Change:   DegradingChange,
+						},
+					},
+				},
+			},
+		}
+		require.True(t, r.HasDegradedResult(10.0, false))
+		require.False(t, r.HasDegradedResult(20.0, false))
+	})
+
+	t.Run("deterministic degradation with strict flag", func(t *testing.T) {
+		r := &RunResult{
+			tables: []*benchstat.Table{
+				{
+					Metric: "allocs/op",
+					Rows: []*benchstat.Row{
+						{
+							PctDelta: 5.0,
+							Change:   InsignificantChange, // benchstat didn't mark it
+						},
+					},
+				},
+			},
+		}
+		// Without strict flag, it should not be degraded because Change != DegradingChange
+		require.False(t, r.HasDegradedResult(0.0, false))
+		// With strict flag, it should be degraded because PctDelta > tolerance (0.0)
+		require.True(t, r.HasDegradedResult(0.0, true))
+		require.False(t, r.HasDegradedResult(10.0, true))
+	})
+
+	t.Run("deterministic degradation with B/op and strict flag", func(t *testing.T) {
+		r := &RunResult{
+			tables: []*benchstat.Table{
+				{
+					Metric: "B/op",
+					Rows: []*benchstat.Row{
+						{
+							PctDelta: 20.0,
+							Change:   InsignificantChange,
+						},
+					},
+				},
+			},
+		}
+		require.False(t, r.HasDegradedResult(10.0, false))
+		require.True(t, r.HasDegradedResult(10.0, true))
+	})
+
+	t.Run("mixed metrics", func(t *testing.T) {
+		r := &RunResult{
+			tables: []*benchstat.Table{
+				{
+					Metric: "ns/op",
+					Rows: []*benchstat.Row{
+						{
+							PctDelta: 5.0,
+							Change:   InsignificantChange,
+						},
+					},
+				},
+				{
+					Metric: "allocs/op",
+					Rows: []*benchstat.Row{
+						{
+							PctDelta: 15.0,
+							Change:   InsignificantChange,
+						},
+					},
+				},
+			},
+		}
+		// ns/op is below tolerance, and allocs/op is ignored without strict
+		require.False(t, r.HasDegradedResult(10.0, false))
+		// with strict, allocs/op is over tolerance
+		require.True(t, r.HasDegradedResult(10.0, true))
+	})
+}

From 43d62382dc405881d66794598ce69b57d6317fa2 Mon Sep 17 00:00:00 2001
From: Ryan Shatford <ev1lm0nk3y@users.noreply.github.com>
Date: Thu, 4 Jun 2026 01:43:31 -0700
Subject: [PATCH 2/2] feat: add --deterministic-tolerance flag

This allows users to define a separate threshold for regressions in
allocs/op and B/op. It defaults to 0.0 as requested.
---
 cmd/benchdiff/benchdiff.go              |  5 ++++-
 cmd/benchdiff/internal/benchdiff.go     | 28 ++++++++++++-------------
 cmd/benchdiff/internal/degraded_test.go | 27 ++++++++++++------------
 3 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/cmd/benchdiff/benchdiff.go b/cmd/benchdiff/benchdiff.go
index 6254913..a16085d 100644
--- a/cmd/benchdiff/benchdiff.go
+++ b/cmd/benchdiff/benchdiff.go
@@ -82,6 +82,7 @@ var benchVars = kong.Vars{
 	"WarmupTimeHelp":       `When warmups are run, set -benchtime=n`,
 	"TagsHelp":             `Set the -tags flag on the go test command`,
 	"StrictDeterministicHelp": `Strictly evaluate deterministic metrics (allocs/op, B/op) for regressions, ignoring statistical significance.`,
+	"DeterministicToleranceHelp": `The minimum percent change before a deterministic metric (allocs/op, B/op) is considered degraded.`,
 }
 
 var groupHelp = kong.Vars{
@@ -102,6 +103,7 @@ var cli struct {
 	OnDegrade int           `kong:"name=on-degrade,default=0,help=${OnDegradeHelp},group='x'"`
 	Tolerance float64       `kong:"default='10.0',help=${ToleranceHelp},group='x'"`
 	StrictDeterministic bool `kong:"help=${StrictDeterministicHelp},group='x'"`
+	DeterministicTolerance float64 `kong:"name=deterministic-tolerance,default='0.0',help=${DeterministicToleranceHelp},group='x'"`
 
 	Bench            string               `kong:"default='.',help=${BenchHelp},group='gotest'"`
 	BenchmarkArgs    string               `kong:"placeholder='args',help=${BenchmarkArgsHelp},group='gotest'"`
@@ -292,9 +294,10 @@ func main() {
 		OutputFormat:       outputFormat,
 		Tolerance:          cli.Tolerance,
 		StrictDeterministic: cli.StrictDeterministic,
+		DeterministicTolerance: cli.DeterministicTolerance,
 	})
 	kctx.FatalIfErrorf(err)
-	if result.HasDegradedResult(cli.Tolerance, cli.StrictDeterministic) {
+	if result.HasDegradedResult(cli.Tolerance, cli.DeterministicTolerance, cli.StrictDeterministic) {
 		os.Exit(cli.OnDegrade)
 	}
 }
diff --git a/cmd/benchdiff/internal/benchdiff.go b/cmd/benchdiff/internal/benchdiff.go
index ef12fce..1671dab 100644
--- a/cmd/benchdiff/internal/benchdiff.go
+++ b/cmd/benchdiff/internal/benchdiff.go
@@ -252,6 +252,7 @@ type RunResultOutputOptions struct {
 	OutputFormat       string                       // one of json or human. default: human
 	Tolerance          float64
 	StrictDeterministic bool
+	DeterministicTolerance float64
 }
 
 // WriteOutput outputs the result
@@ -264,6 +265,7 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error
 		OutputFormat:       "human",
 		Tolerance:          opts.Tolerance,
 		StrictDeterministic: opts.StrictDeterministic,
+		DeterministicTolerance: opts.DeterministicTolerance,
 	}
 	if opts.BenchstatFormatter != nil {
 		finalOpts.BenchstatFormatter = opts.BenchstatFormatter
@@ -283,13 +285,13 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error
 	case "human":
 		return r.writeHumanResult(w, benchstatBuf.String())
 	case "json":
-		return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance, finalOpts.StrictDeterministic)
+		return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance, finalOpts.DeterministicTolerance, finalOpts.StrictDeterministic)
 	default:
 		return fmt.Errorf("unknown OutputFormat")
 	}
 }
 
-func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance float64, strictDeterministic bool) error {
+func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance, detTolerance float64, strictDeterministic bool) error {
 	type runResultJSON struct {
 		BenchCommand    string `json:"bench_command,omitempty"`
 		HeadSHA         string `json:"head_sha,omitempty"`
@@ -304,7 +306,7 @@ func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, toleran
 		BenchstatOutput: benchstatResult,
 		HeadSHA:         r.headSHA,
 		BaseSHA:         r.baseSHA,
-		DegradedResult:  r.HasDegradedResult(tolerance, strictDeterministic),
+		DegradedResult:  r.HasDegradedResult(tolerance, detTolerance, strictDeterministic),
 	})
 }
 
@@ -331,28 +333,24 @@ func (r *RunResult) writeHumanResult(w io.Writer, benchstatResult string) error
 }
 
 // HasDegradedResult returns true if there are any rows with DegradingChange and PctDelta over tolerance
-func (r *RunResult) HasDegradedResult(tolerance float64, strictDeterministic bool) bool {
-	return r.maxDegradedPct(strictDeterministic) > tolerance
-}
-
-func (r *RunResult) maxDegradedPct(strictDeterministic bool) float64 {
-	max := 0.0
+func (r *RunResult) HasDegradedResult(tolerance, detTolerance float64, strictDeterministic bool) bool {
 	for _, table := range r.tables {
 		isDeterministic := table.Metric == "B/op" || table.Metric == "allocs/op"
+		thresh := tolerance
+		if isDeterministic {
+			thresh = detTolerance
+		}
 		for _, row := range table.Rows {
 			isDegraded := row.Change == DegradingChange
 			if strictDeterministic && isDeterministic && row.PctDelta > 0 {
 				isDegraded = true
 			}
-			if !isDegraded {
-				continue
-			}
-			if row.PctDelta > max {
-				max = row.PctDelta
+			if isDegraded && row.PctDelta > thresh {
+				return true
 			}
 		}
 	}
-	return max
+	return false
 }
 
 // BenchmarkChangeType is whether a change is an improvement or degradation
diff --git a/cmd/benchdiff/internal/degraded_test.go b/cmd/benchdiff/internal/degraded_test.go
index 0552a56..8daedc2 100644
--- a/cmd/benchdiff/internal/degraded_test.go
+++ b/cmd/benchdiff/internal/degraded_test.go
@@ -22,8 +22,8 @@ func TestRunResult_HasDegradedResult(t *testing.T) {
 				},
 			},
 		}
-		require.True(t, r.HasDegradedResult(10.0, false))
-		require.False(t, r.HasDegradedResult(20.0, false))
+		require.True(t, r.HasDegradedResult(10.0, 10.0, false))
+		require.False(t, r.HasDegradedResult(20.0, 20.0, false))
 	})
 
 	t.Run("deterministic degradation with strict flag", func(t *testing.T) {
@@ -41,10 +41,10 @@ func TestRunResult_HasDegradedResult(t *testing.T) {
 			},
 		}
 		// Without strict flag, it should not be degraded because Change != DegradingChange
-		require.False(t, r.HasDegradedResult(0.0, false))
-		// With strict flag, it should be degraded because PctDelta > tolerance (0.0)
-		require.True(t, r.HasDegradedResult(0.0, true))
-		require.False(t, r.HasDegradedResult(10.0, true))
+		require.False(t, r.HasDegradedResult(0.0, 0.0, false))
+		// With strict flag, it should be degraded because PctDelta > detTolerance (0.0)
+		require.True(t, r.HasDegradedResult(0.0, 0.0, true))
+		require.False(t, r.HasDegradedResult(0.0, 10.0, true))
 	})
 
 	t.Run("deterministic degradation with B/op and strict flag", func(t *testing.T) {
@@ -61,11 +61,12 @@ func TestRunResult_HasDegradedResult(t *testing.T) {
 				},
 			},
 		}
-		require.False(t, r.HasDegradedResult(10.0, false))
-		require.True(t, r.HasDegradedResult(10.0, true))
+		require.False(t, r.HasDegradedResult(10.0, 10.0, false))
+		require.True(t, r.HasDegradedResult(10.0, 10.0, true))
+		require.False(t, r.HasDegradedResult(10.0, 25.0, true))
 	})
 
-	t.Run("mixed metrics", func(t *testing.T) {
+	t.Run("mixed metrics and tolerances", func(t *testing.T) {
 		r := &RunResult{
 			tables: []*benchstat.Table{
 				{
@@ -88,9 +89,9 @@ func TestRunResult_HasDegradedResult(t *testing.T) {
 				},
 			},
 		}
-		// ns/op is below tolerance, and allocs/op is ignored without strict
-		require.False(t, r.HasDegradedResult(10.0, false))
-		// with strict, allocs/op is over tolerance
-		require.True(t, r.HasDegradedResult(10.0, true))
+		// ns/op is below tolerance (10), and allocs/op is below detTolerance (20)
+		require.False(t, r.HasDegradedResult(10.0, 20.0, true))
+		// ns/op is below tolerance (10), but allocs/op is ABOVE detTolerance (10)
+		require.True(t, r.HasDegradedResult(10.0, 10.0, true))
 	})
 }