From 2574ea30ba59304922037e207c77232e46cdc5ef Mon Sep 17 00:00:00 2001 From: Ryan Shatford Date: Thu, 4 Jun 2026 01:12:22 -0700 Subject: [PATCH 1/2] feat: add --strict-deterministic flag to strictly evaluate deterministic metrics This flag allows benchdiff to bypass benchstat's statistical significance checks for allocs/op and B/op, ensuring that any regression in these metrics exceeding the tolerance threshold triggers a failure. --- cmd/benchdiff/benchdiff.go | 5 +- cmd/benchdiff/internal/benchdiff.go | 21 ++++-- cmd/benchdiff/internal/degraded_test.go | 96 +++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 8 deletions(-) create mode 100644 cmd/benchdiff/internal/degraded_test.go diff --git a/cmd/benchdiff/benchdiff.go b/cmd/benchdiff/benchdiff.go index dcd13b6..6254913 100644 --- a/cmd/benchdiff/benchdiff.go +++ b/cmd/benchdiff/benchdiff.go @@ -81,6 +81,7 @@ var benchVars = kong.Vars{ "WarmupCountHelp": `Run benchmarks with -count=n as a warmup`, "WarmupTimeHelp": `When warmups are run, set -benchtime=n`, "TagsHelp": `Set the -tags flag on the go test command`, + "StrictDeterministicHelp": `Strictly evaluate deterministic metrics (allocs/op, B/op) for regressions, ignoring statistical significance.`, } var groupHelp = kong.Vars{ @@ -100,6 +101,7 @@ var cli struct { JSON bool `kong:"help=${JSONHelp},group='x'"` OnDegrade int `kong:"name=on-degrade,default=0,help=${OnDegradeHelp},group='x'"` Tolerance float64 `kong:"default='10.0',help=${ToleranceHelp},group='x'"` + StrictDeterministic bool `kong:"help=${StrictDeterministicHelp},group='x'"` Bench string `kong:"default='.',help=${BenchHelp},group='gotest'"` BenchmarkArgs string `kong:"placeholder='args',help=${BenchmarkArgsHelp},group='gotest'"` @@ -289,9 +291,10 @@ func main() { BenchstatFormatter: bStat.OutputFormatter, OutputFormat: outputFormat, Tolerance: cli.Tolerance, + StrictDeterministic: cli.StrictDeterministic, }) kctx.FatalIfErrorf(err) - if result.HasDegradedResult(cli.Tolerance) { + if result.HasDegradedResult(cli.Tolerance, cli.StrictDeterministic) { os.Exit(cli.OnDegrade) } } diff --git a/cmd/benchdiff/internal/benchdiff.go b/cmd/benchdiff/internal/benchdiff.go index 5d4a46d..ef12fce 100644 --- a/cmd/benchdiff/internal/benchdiff.go +++ b/cmd/benchdiff/internal/benchdiff.go @@ -251,6 +251,7 @@ type RunResultOutputOptions struct { BenchstatFormatter benchstatter.OutputFormatter // default benchstatter.TextFormatter(nil) OutputFormat string // one of json or human. default: human Tolerance float64 + StrictDeterministic bool } // WriteOutput outputs the result @@ -262,6 +263,7 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error BenchstatFormatter: benchstatter.TextFormatter(nil), OutputFormat: "human", Tolerance: opts.Tolerance, + StrictDeterministic: opts.StrictDeterministic, } if opts.BenchstatFormatter != nil { finalOpts.BenchstatFormatter = opts.BenchstatFormatter @@ -281,13 +283,13 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error case "human": return r.writeHumanResult(w, benchstatBuf.String()) case "json": - return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance) + return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance, finalOpts.StrictDeterministic) default: return fmt.Errorf("unknown OutputFormat") } } -func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance float64) error { +func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance float64, strictDeterministic bool) error { type runResultJSON struct { BenchCommand string `json:"bench_command,omitempty"` HeadSHA string `json:"head_sha,omitempty"` @@ -302,7 +304,7 @@ func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, toleran BenchstatOutput: benchstatResult, HeadSHA: r.headSHA, BaseSHA: r.baseSHA, - DegradedResult: r.HasDegradedResult(tolerance), + DegradedResult: r.HasDegradedResult(tolerance, strictDeterministic), }) } @@ -329,15 +331,20 @@ func (r *RunResult) writeHumanResult(w io.Writer, benchstatResult string) error } // HasDegradedResult returns true if there are any rows with DegradingChange and PctDelta over tolerance -func (r *RunResult) HasDegradedResult(tolerance float64) bool { - return r.maxDegradedPct() > tolerance +func (r *RunResult) HasDegradedResult(tolerance float64, strictDeterministic bool) bool { + return r.maxDegradedPct(strictDeterministic) > tolerance } -func (r *RunResult) maxDegradedPct() float64 { +func (r *RunResult) maxDegradedPct(strictDeterministic bool) float64 { max := 0.0 for _, table := range r.tables { + isDeterministic := table.Metric == "B/op" || table.Metric == "allocs/op" for _, row := range table.Rows { - if row.Change != DegradingChange { + isDegraded := row.Change == DegradingChange + if strictDeterministic && isDeterministic && row.PctDelta > 0 { + isDegraded = true + } + if !isDegraded { continue } if row.PctDelta > max { diff --git a/cmd/benchdiff/internal/degraded_test.go b/cmd/benchdiff/internal/degraded_test.go new file mode 100644 index 0000000..0552a56 --- /dev/null +++ b/cmd/benchdiff/internal/degraded_test.go @@ -0,0 +1,96 @@ +package internal + +import ( + "testing" + + "github.com/stretchr/testify/require" + "golang.org/x/perf/benchstat" +) + +func TestRunResult_HasDegradedResult(t *testing.T) { + t.Run("time-based degradation", func(t *testing.T) { + r := &RunResult{ + tables: []*benchstat.Table{ + { + Metric: "ns/op", + Rows: []*benchstat.Row{ + { + PctDelta: 15.0, + Change: DegradingChange, + }, + }, + }, + }, + } + require.True(t, r.HasDegradedResult(10.0, false)) + require.False(t, r.HasDegradedResult(20.0, false)) + }) + + t.Run("deterministic degradation with strict flag", func(t *testing.T) { + r := &RunResult{ + tables: []*benchstat.Table{ + { + Metric: "allocs/op", + Rows: []*benchstat.Row{ + { + PctDelta: 5.0, + Change: InsignificantChange, // benchstat didn't mark it + }, + }, + }, + }, + } + // Without strict flag, it should not be degraded because Change != DegradingChange + require.False(t, r.HasDegradedResult(0.0, false)) + // With strict flag, it should be degraded because PctDelta > tolerance (0.0) + require.True(t, r.HasDegradedResult(0.0, true)) + require.False(t, r.HasDegradedResult(10.0, true)) + }) + + t.Run("deterministic degradation with B/op and strict flag", func(t *testing.T) { + r := &RunResult{ + tables: []*benchstat.Table{ + { + Metric: "B/op", + Rows: []*benchstat.Row{ + { + PctDelta: 20.0, + Change: InsignificantChange, + }, + }, + }, + }, + } + require.False(t, r.HasDegradedResult(10.0, false)) + require.True(t, r.HasDegradedResult(10.0, true)) + }) + + t.Run("mixed metrics", func(t *testing.T) { + r := &RunResult{ + tables: []*benchstat.Table{ + { + Metric: "ns/op", + Rows: []*benchstat.Row{ + { + PctDelta: 5.0, + Change: InsignificantChange, + }, + }, + }, + { + Metric: "allocs/op", + Rows: []*benchstat.Row{ + { + PctDelta: 15.0, + Change: InsignificantChange, + }, + }, + }, + }, + } + // ns/op is below tolerance, and allocs/op is ignored without strict + require.False(t, r.HasDegradedResult(10.0, false)) + // with strict, allocs/op is over tolerance + require.True(t, r.HasDegradedResult(10.0, true)) + }) +} From 43d62382dc405881d66794598ce69b57d6317fa2 Mon Sep 17 00:00:00 2001 From: Ryan Shatford Date: Thu, 4 Jun 2026 01:43:31 -0700 Subject: [PATCH 2/2] feat: add --deterministic-tolerance flag This allows users to define a separate threshold for regressions in allocs/op and B/op. It defaults to 0.0 as requested. --- cmd/benchdiff/benchdiff.go | 5 ++++- cmd/benchdiff/internal/benchdiff.go | 28 ++++++++++++------------- cmd/benchdiff/internal/degraded_test.go | 27 ++++++++++++------------ 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/cmd/benchdiff/benchdiff.go b/cmd/benchdiff/benchdiff.go index 6254913..a16085d 100644 --- a/cmd/benchdiff/benchdiff.go +++ b/cmd/benchdiff/benchdiff.go @@ -82,6 +82,7 @@ var benchVars = kong.Vars{ "WarmupTimeHelp": `When warmups are run, set -benchtime=n`, "TagsHelp": `Set the -tags flag on the go test command`, "StrictDeterministicHelp": `Strictly evaluate deterministic metrics (allocs/op, B/op) for regressions, ignoring statistical significance.`, + "DeterministicToleranceHelp": `The minimum percent change before a deterministic metric (allocs/op, B/op) is considered degraded.`, } var groupHelp = kong.Vars{ @@ -102,6 +103,7 @@ var cli struct { OnDegrade int `kong:"name=on-degrade,default=0,help=${OnDegradeHelp},group='x'"` Tolerance float64 `kong:"default='10.0',help=${ToleranceHelp},group='x'"` StrictDeterministic bool `kong:"help=${StrictDeterministicHelp},group='x'"` + DeterministicTolerance float64 `kong:"name=deterministic-tolerance,default='0.0',help=${DeterministicToleranceHelp},group='x'"` Bench string `kong:"default='.',help=${BenchHelp},group='gotest'"` BenchmarkArgs string `kong:"placeholder='args',help=${BenchmarkArgsHelp},group='gotest'"` @@ -292,9 +294,10 @@ func main() { OutputFormat: outputFormat, Tolerance: cli.Tolerance, StrictDeterministic: cli.StrictDeterministic, + DeterministicTolerance: cli.DeterministicTolerance, }) kctx.FatalIfErrorf(err) - if result.HasDegradedResult(cli.Tolerance, cli.StrictDeterministic) { + if result.HasDegradedResult(cli.Tolerance, cli.DeterministicTolerance, cli.StrictDeterministic) { os.Exit(cli.OnDegrade) } } diff --git a/cmd/benchdiff/internal/benchdiff.go b/cmd/benchdiff/internal/benchdiff.go index ef12fce..1671dab 100644 --- a/cmd/benchdiff/internal/benchdiff.go +++ b/cmd/benchdiff/internal/benchdiff.go @@ -252,6 +252,7 @@ type RunResultOutputOptions struct { OutputFormat string // one of json or human. default: human Tolerance float64 StrictDeterministic bool + DeterministicTolerance float64 } // WriteOutput outputs the result @@ -264,6 +265,7 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error OutputFormat: "human", Tolerance: opts.Tolerance, StrictDeterministic: opts.StrictDeterministic, + DeterministicTolerance: opts.DeterministicTolerance, } if opts.BenchstatFormatter != nil { finalOpts.BenchstatFormatter = opts.BenchstatFormatter @@ -283,13 +285,13 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error case "human": return r.writeHumanResult(w, benchstatBuf.String()) case "json": - return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance, finalOpts.StrictDeterministic) + return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance, finalOpts.DeterministicTolerance, finalOpts.StrictDeterministic) default: return fmt.Errorf("unknown OutputFormat") } } -func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance float64, strictDeterministic bool) error { +func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance, detTolerance float64, strictDeterministic bool) error { type runResultJSON struct { BenchCommand string `json:"bench_command,omitempty"` HeadSHA string `json:"head_sha,omitempty"` @@ -304,7 +306,7 @@ func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, toleran BenchstatOutput: benchstatResult, HeadSHA: r.headSHA, BaseSHA: r.baseSHA, - DegradedResult: r.HasDegradedResult(tolerance, strictDeterministic), + DegradedResult: r.HasDegradedResult(tolerance, detTolerance, strictDeterministic), }) } @@ -331,28 +333,24 @@ func (r *RunResult) writeHumanResult(w io.Writer, benchstatResult string) error } // HasDegradedResult returns true if there are any rows with DegradingChange and PctDelta over tolerance -func (r *RunResult) HasDegradedResult(tolerance float64, strictDeterministic bool) bool { - return r.maxDegradedPct(strictDeterministic) > tolerance -} - -func (r *RunResult) maxDegradedPct(strictDeterministic bool) float64 { - max := 0.0 +func (r *RunResult) HasDegradedResult(tolerance, detTolerance float64, strictDeterministic bool) bool { for _, table := range r.tables { isDeterministic := table.Metric == "B/op" || table.Metric == "allocs/op" + thresh := tolerance + if isDeterministic { + thresh = detTolerance + } for _, row := range table.Rows { isDegraded := row.Change == DegradingChange if strictDeterministic && isDeterministic && row.PctDelta > 0 { isDegraded = true } - if !isDegraded { - continue - } - if row.PctDelta > max { - max = row.PctDelta + if isDegraded && row.PctDelta > thresh { + return true } } } - return max + return false } // BenchmarkChangeType is whether a change is an improvement or degradation diff --git a/cmd/benchdiff/internal/degraded_test.go b/cmd/benchdiff/internal/degraded_test.go index 0552a56..8daedc2 100644 --- a/cmd/benchdiff/internal/degraded_test.go +++ b/cmd/benchdiff/internal/degraded_test.go @@ -22,8 +22,8 @@ func TestRunResult_HasDegradedResult(t *testing.T) { }, }, } - require.True(t, r.HasDegradedResult(10.0, false)) - require.False(t, r.HasDegradedResult(20.0, false)) + require.True(t, r.HasDegradedResult(10.0, 10.0, false)) + require.False(t, r.HasDegradedResult(20.0, 20.0, false)) }) t.Run("deterministic degradation with strict flag", func(t *testing.T) { @@ -41,10 +41,10 @@ func TestRunResult_HasDegradedResult(t *testing.T) { }, } // Without strict flag, it should not be degraded because Change != DegradingChange - require.False(t, r.HasDegradedResult(0.0, false)) - // With strict flag, it should be degraded because PctDelta > tolerance (0.0) - require.True(t, r.HasDegradedResult(0.0, true)) - require.False(t, r.HasDegradedResult(10.0, true)) + require.False(t, r.HasDegradedResult(0.0, 0.0, false)) + // With strict flag, it should be degraded because PctDelta > detTolerance (0.0) + require.True(t, r.HasDegradedResult(0.0, 0.0, true)) + require.False(t, r.HasDegradedResult(0.0, 10.0, true)) }) t.Run("deterministic degradation with B/op and strict flag", func(t *testing.T) { @@ -61,11 +61,12 @@ func TestRunResult_HasDegradedResult(t *testing.T) { }, }, } - require.False(t, r.HasDegradedResult(10.0, false)) - require.True(t, r.HasDegradedResult(10.0, true)) + require.False(t, r.HasDegradedResult(10.0, 10.0, false)) + require.True(t, r.HasDegradedResult(10.0, 10.0, true)) + require.False(t, r.HasDegradedResult(10.0, 25.0, true)) }) - t.Run("mixed metrics", func(t *testing.T) { + t.Run("mixed metrics and tolerances", func(t *testing.T) { r := &RunResult{ tables: []*benchstat.Table{ { @@ -88,9 +89,9 @@ func TestRunResult_HasDegradedResult(t *testing.T) { }, }, } - // ns/op is below tolerance, and allocs/op is ignored without strict - require.False(t, r.HasDegradedResult(10.0, false)) - // with strict, allocs/op is over tolerance - require.True(t, r.HasDegradedResult(10.0, true)) + // ns/op is below tolerance (10), and allocs/op is below detTolerance (20) + require.False(t, r.HasDegradedResult(10.0, 20.0, true)) + // ns/op is below tolerance (10), but allocs/op is ABOVE detTolerance (10) + require.True(t, r.HasDegradedResult(10.0, 10.0, true)) }) }