diff --git a/cmd/benchdiff/benchdiff.go b/cmd/benchdiff/benchdiff.go index dcd13b6..a16085d 100644 --- a/cmd/benchdiff/benchdiff.go +++ b/cmd/benchdiff/benchdiff.go @@ -81,6 +81,8 @@ var benchVars = kong.Vars{ "WarmupCountHelp": `Run benchmarks with -count=n as a warmup`, "WarmupTimeHelp": `When warmups are run, set -benchtime=n`, "TagsHelp": `Set the -tags flag on the go test command`, + "StrictDeterministicHelp": `Strictly evaluate deterministic metrics (allocs/op, B/op) for regressions, ignoring statistical significance.`, + "DeterministicToleranceHelp": `The minimum percent change before a deterministic metric (allocs/op, B/op) is considered degraded.`, } var groupHelp = kong.Vars{ @@ -100,6 +102,8 @@ var cli struct { JSON bool `kong:"help=${JSONHelp},group='x'"` OnDegrade int `kong:"name=on-degrade,default=0,help=${OnDegradeHelp},group='x'"` Tolerance float64 `kong:"default='10.0',help=${ToleranceHelp},group='x'"` + StrictDeterministic bool `kong:"help=${StrictDeterministicHelp},group='x'"` + DeterministicTolerance float64 `kong:"name=deterministic-tolerance,default='0.0',help=${DeterministicToleranceHelp},group='x'"` Bench string `kong:"default='.',help=${BenchHelp},group='gotest'"` BenchmarkArgs string `kong:"placeholder='args',help=${BenchmarkArgsHelp},group='gotest'"` @@ -289,9 +293,11 @@ func main() { BenchstatFormatter: bStat.OutputFormatter, OutputFormat: outputFormat, Tolerance: cli.Tolerance, + StrictDeterministic: cli.StrictDeterministic, + DeterministicTolerance: cli.DeterministicTolerance, }) kctx.FatalIfErrorf(err) - if result.HasDegradedResult(cli.Tolerance) { + if result.HasDegradedResult(cli.Tolerance, cli.DeterministicTolerance, cli.StrictDeterministic) { os.Exit(cli.OnDegrade) } } diff --git a/cmd/benchdiff/internal/benchdiff.go b/cmd/benchdiff/internal/benchdiff.go index 5d4a46d..1671dab 100644 --- a/cmd/benchdiff/internal/benchdiff.go +++ b/cmd/benchdiff/internal/benchdiff.go @@ -251,6 +251,8 @@ type RunResultOutputOptions struct { BenchstatFormatter benchstatter.OutputFormatter // default benchstatter.TextFormatter(nil) OutputFormat string // one of json or human. default: human Tolerance float64 + StrictDeterministic bool + DeterministicTolerance float64 } // WriteOutput outputs the result @@ -262,6 +264,8 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error BenchstatFormatter: benchstatter.TextFormatter(nil), OutputFormat: "human", Tolerance: opts.Tolerance, + StrictDeterministic: opts.StrictDeterministic, + DeterministicTolerance: opts.DeterministicTolerance, } if opts.BenchstatFormatter != nil { finalOpts.BenchstatFormatter = opts.BenchstatFormatter @@ -281,13 +285,13 @@ func (r *RunResult) WriteOutput(w io.Writer, opts *RunResultOutputOptions) error case "human": return r.writeHumanResult(w, benchstatBuf.String()) case "json": - return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance) + return r.writeJSONResult(w, benchstatBuf.String(), finalOpts.Tolerance, finalOpts.DeterministicTolerance, finalOpts.StrictDeterministic) default: return fmt.Errorf("unknown OutputFormat") } } -func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance float64) error { +func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, tolerance, detTolerance float64, strictDeterministic bool) error { type runResultJSON struct { BenchCommand string `json:"bench_command,omitempty"` HeadSHA string `json:"head_sha,omitempty"` @@ -302,7 +306,7 @@ func (r *RunResult) writeJSONResult(w io.Writer, benchstatResult string, toleran BenchstatOutput: benchstatResult, HeadSHA: r.headSHA, BaseSHA: r.baseSHA, - DegradedResult: r.HasDegradedResult(tolerance), + DegradedResult: r.HasDegradedResult(tolerance, detTolerance, strictDeterministic), }) } @@ -329,23 +333,24 @@ func (r *RunResult) writeHumanResult(w io.Writer, benchstatResult string) error } // HasDegradedResult returns true if there are any rows with DegradingChange and PctDelta over tolerance -func (r *RunResult) HasDegradedResult(tolerance float64) bool { - return r.maxDegradedPct() > tolerance -} - -func (r *RunResult) maxDegradedPct() float64 { - max := 0.0 +func (r *RunResult) HasDegradedResult(tolerance, detTolerance float64, strictDeterministic bool) bool { for _, table := range r.tables { + isDeterministic := table.Metric == "B/op" || table.Metric == "allocs/op" + thresh := tolerance + if isDeterministic { + thresh = detTolerance + } for _, row := range table.Rows { - if row.Change != DegradingChange { - continue + isDegraded := row.Change == DegradingChange + if strictDeterministic && isDeterministic && row.PctDelta > 0 { + isDegraded = true } - if row.PctDelta > max { - max = row.PctDelta + if isDegraded && row.PctDelta > thresh { + return true } } } - return max + return false } // BenchmarkChangeType is whether a change is an improvement or degradation diff --git a/cmd/benchdiff/internal/degraded_test.go b/cmd/benchdiff/internal/degraded_test.go new file mode 100644 index 0000000..8daedc2 --- /dev/null +++ b/cmd/benchdiff/internal/degraded_test.go @@ -0,0 +1,97 @@ +package internal + +import ( + "testing" + + "github.com/stretchr/testify/require" + "golang.org/x/perf/benchstat" +) + +func TestRunResult_HasDegradedResult(t *testing.T) { + t.Run("time-based degradation", func(t *testing.T) { + r := &RunResult{ + tables: []*benchstat.Table{ + { + Metric: "ns/op", + Rows: []*benchstat.Row{ + { + PctDelta: 15.0, + Change: DegradingChange, + }, + }, + }, + }, + } + require.True(t, r.HasDegradedResult(10.0, 10.0, false)) + require.False(t, r.HasDegradedResult(20.0, 20.0, false)) + }) + + t.Run("deterministic degradation with strict flag", func(t *testing.T) { + r := &RunResult{ + tables: []*benchstat.Table{ + { + Metric: "allocs/op", + Rows: []*benchstat.Row{ + { + PctDelta: 5.0, + Change: InsignificantChange, // benchstat didn't mark it + }, + }, + }, + }, + } + // Without strict flag, it should not be degraded because Change != DegradingChange + require.False(t, r.HasDegradedResult(0.0, 0.0, false)) + // With strict flag, it should be degraded because PctDelta > detTolerance (0.0) + require.True(t, r.HasDegradedResult(0.0, 0.0, true)) + require.False(t, r.HasDegradedResult(0.0, 10.0, true)) + }) + + t.Run("deterministic degradation with B/op and strict flag", func(t *testing.T) { + r := &RunResult{ + tables: []*benchstat.Table{ + { + Metric: "B/op", + Rows: []*benchstat.Row{ + { + PctDelta: 20.0, + Change: InsignificantChange, + }, + }, + }, + }, + } + require.False(t, r.HasDegradedResult(10.0, 10.0, false)) + require.True(t, r.HasDegradedResult(10.0, 10.0, true)) + require.False(t, r.HasDegradedResult(10.0, 25.0, true)) + }) + + t.Run("mixed metrics and tolerances", func(t *testing.T) { + r := &RunResult{ + tables: []*benchstat.Table{ + { + Metric: "ns/op", + Rows: []*benchstat.Row{ + { + PctDelta: 5.0, + Change: InsignificantChange, + }, + }, + }, + { + Metric: "allocs/op", + Rows: []*benchstat.Row{ + { + PctDelta: 15.0, + Change: InsignificantChange, + }, + }, + }, + }, + } + // ns/op is below tolerance (10), and allocs/op is below detTolerance (20) + require.False(t, r.HasDegradedResult(10.0, 20.0, true)) + // ns/op is below tolerance (10), but allocs/op is ABOVE detTolerance (10) + require.True(t, r.HasDegradedResult(10.0, 10.0, true)) + }) +}