WillAbides · ev1lm0nk3y · Jun 4, 2026 · Jun 4, 2026
diff --git a/cmd/benchdiff/benchdiff.go b/cmd/benchdiff/benchdiff.go
@@ -81,6 +81,8 @@ var benchVars = kong.Vars{
 	"WarmupCountHelp":      `Run benchmarks with -count=n as a warmup`,
 	"WarmupTimeHelp":       `When warmups are run, set -benchtime=n`,
 	"TagsHelp":             `Set the -tags flag on the go test command`,
+	"StrictDeterministicHelp": `Strictly evaluate deterministic metrics (allocs/op, B/op) for regressions, ignoring statistical significance.`,
+	"DeterministicToleranceHelp": `The minimum percent change before a deterministic metric (allocs/op, B/op) is considered degraded.`,
 }
 
 var groupHelp = kong.Vars{
@@ -100,6 +102,8 @@ var cli struct {
 	JSON      bool          `kong:"help=${JSONHelp},group='x'"`
 	OnDegrade int           `kong:"name=on-degrade,default=0,help=${OnDegradeHelp},group='x'"`
 	Tolerance float64       `kong:"default='10.0',help=${ToleranceHelp},group='x'"`
+	StrictDeterministic bool `kong:"help=${StrictDeterministicHelp},group='x'"`
+	DeterministicTolerance float64 `kong:"name=deterministic-tolerance,default='0.0',help=${DeterministicToleranceHelp},group='x'"`
 
 	Bench            string               `kong:"default='.',help=${BenchHelp},group='gotest'"`
 	BenchmarkArgs    string               `kong:"placeholder='args',help=${BenchmarkArgsHelp},group='gotest'"`
@@ -289,9 +293,11 @@ func main() {
 		BenchstatFormatter: bStat.OutputFormatter,
 		OutputFormat:       outputFormat,
 		Tolerance:          cli.Tolerance,
+		StrictDeterministic: cli.StrictDeterministic,
+		DeterministicTolerance: cli.DeterministicTolerance,
 	})
 	kctx.FatalIfErrorf(err)
-	if result.HasDegradedResult(cli.Tolerance) {
+	if result.HasDegradedResult(cli.Tolerance, cli.DeterministicTolerance, cli.StrictDeterministic) {
 		os.Exit(cli.OnDegrade)
 	}
 }

diff --git a/cmd/benchdiff/internal/benchdiff.go b/cmd/benchdiff/internal/benchdiff.go
@@ -251,6 +251,8 @@ type RunResultOutputOptions struct {
 	BenchstatFormatter benchstatter.OutputFormatter // default benchstatter.TextFormatter(nil)
 	OutputFormat       string                       // one of json or human. default: human
 	Tolerance          float64
+	StrictDeterministic bool
+	DeterministicTolerance float64
 }
 
 // WriteOutput outputs the result
@@ -262,6 +264,8 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error
 		BenchstatFormatter: benchstatter.TextFormatter(nil),
 		OutputFormat:       "human",
 		Tolerance:          opts.Tolerance,
+		StrictDeterministic: opts.StrictDeterministic,
+		DeterministicTolerance: opts.DeterministicTolerance,
 	}
 	if opts.BenchstatFormatter != nil {
 		finalOpts.BenchstatFormatter = opts.BenchstatFormatter
@@ -281,13 +285,13 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error
 	case "human":
 		return r.writeHumanResult(w, benchstatBuf.String())
 	case "json":
-		return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance)
+		return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance, finalOpts.DeterministicTolerance, finalOpts.StrictDeterministic)
 	default:
 		return fmt.Errorf("unknown OutputFormat")
 	}
 }
 
-func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance float64) error {
+func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance, detTolerance float64, strictDeterministic bool) error {
 	type runResultJSON struct {
 		BenchCommand    string `json:"bench_command,omitempty"`
 		HeadSHA         string `json:"head_sha,omitempty"`
@@ -302,7 +306,7 @@ func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, toleran
 		BenchstatOutput: benchstatResult,
 		HeadSHA:         r.headSHA,
 		BaseSHA:         r.baseSHA,
-		DegradedResult:  r.HasDegradedResult(tolerance),
+		DegradedResult:  r.HasDegradedResult(tolerance, detTolerance, strictDeterministic),
 	})
 }
 
@@ -329,23 +333,24 @@ func (r *RunResult) writeHumanResult(w io.Writer, benchstatResult string) error
 }
 
 // HasDegradedResult returns true if there are any rows with DegradingChange and PctDelta over tolerance
-func (r *RunResult) HasDegradedResult(tolerance float64) bool {
-	return r.maxDegradedPct() > tolerance
-}
-
-func (r *RunResult) maxDegradedPct() float64 {
-	max := 0.0
+func (r *RunResult) HasDegradedResult(tolerance, detTolerance float64, strictDeterministic bool) bool {
 	for _, table := range r.tables {
+		isDeterministic := table.Metric == "B/op" || table.Metric == "allocs/op"
+		thresh := tolerance
+		if isDeterministic {
+			thresh = detTolerance
+		}
 		for _, row := range table.Rows {
-			if row.Change != DegradingChange {
-				continue
+			isDegraded := row.Change == DegradingChange
+			if strictDeterministic && isDeterministic && row.PctDelta > 0 {
+				isDegraded = true
 			}
-			if row.PctDelta > max {
-				max = row.PctDelta
+			if isDegraded && row.PctDelta > thresh {
+				return true
 			}
 		}
 	}
-	return max
+	return false
 }
 
 // BenchmarkChangeType is whether a change is an improvement or degradation

diff --git a/cmd/benchdiff/internal/degraded_test.go b/cmd/benchdiff/internal/degraded_test.go
@@ -0,0 +1,97 @@
+package internal
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	"golang.org/x/perf/benchstat"
+)
+
+func TestRunResult_HasDegradedResult(t *testing.T) {
+	t.Run("time-based degradation", func(t *testing.T) {
+		r := &RunResult{
+			tables: []*benchstat.Table{
+				{
+					Metric: "ns/op",
+					Rows: []*benchstat.Row{
+						{
+							PctDelta: 15.0,
+							Change:   DegradingChange,
+						},
+					},
+				},
+			},
+		}
+		require.True(t, r.HasDegradedResult(10.0, 10.0, false))
+		require.False(t, r.HasDegradedResult(20.0, 20.0, false))
+	})
+
+	t.Run("deterministic degradation with strict flag", func(t *testing.T) {
+		r := &RunResult{
+			tables: []*benchstat.Table{
+				{
+					Metric: "allocs/op",
+					Rows: []*benchstat.Row{
+						{
+							PctDelta: 5.0,
+							Change:   InsignificantChange, // benchstat didn't mark it
+						},
+					},
+				},
+			},
+		}
+		// Without strict flag, it should not be degraded because Change != DegradingChange
+		require.False(t, r.HasDegradedResult(0.0, 0.0, false))
+		// With strict flag, it should be degraded because PctDelta > detTolerance (0.0)
+		require.True(t, r.HasDegradedResult(0.0, 0.0, true))
+		require.False(t, r.HasDegradedResult(0.0, 10.0, true))
+	})
+
+	t.Run("deterministic degradation with B/op and strict flag", func(t *testing.T) {
+		r := &RunResult{
+			tables: []*benchstat.Table{
+				{
+					Metric: "B/op",
+					Rows: []*benchstat.Row{
+						{
+							PctDelta: 20.0,
+							Change:   InsignificantChange,
+						},
+					},
+				},
+			},
+		}
+		require.False(t, r.HasDegradedResult(10.0, 10.0, false))
+		require.True(t, r.HasDegradedResult(10.0, 10.0, true))
+		require.False(t, r.HasDegradedResult(10.0, 25.0, true))
+	})
+
+	t.Run("mixed metrics and tolerances", func(t *testing.T) {
+		r := &RunResult{
+			tables: []*benchstat.Table{
+				{
+					Metric: "ns/op",
+					Rows: []*benchstat.Row{
+						{
+							PctDelta: 5.0,
+							Change:   InsignificantChange,
+						},
+					},
+				},
+				{
+					Metric: "allocs/op",
+					Rows: []*benchstat.Row{
+						{
+							PctDelta: 15.0,
+							Change:   InsignificantChange,
+						},
+					},
+				},
+			},
+		}
+		// ns/op is below tolerance (10), and allocs/op is below detTolerance (20)
+		require.False(t, r.HasDegradedResult(10.0, 20.0, true))
+		// ns/op is below tolerance (10), but allocs/op is ABOVE detTolerance (10)
+		require.True(t, r.HasDegradedResult(10.0, 10.0, true))
+	})
+}