Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,95 @@ jobs:
run: |
go tool -modfile=tools/task/go.mod task test-sandbox

test-fuzz:
  name: "task test-fuzz"

  # Scheduled (nightly) runs only. Each seed of the terraform/direct
  # create-payload parity tests performs two real `bundle deploy` invocations,
  # which is too slow for every PR and too noisy to gate the merge queue. For
  # the same reason this job is deliberately left out of test-result; its
  # purpose is to catch engine drift over time.
  if: github.event_name == 'schedule'

  needs:
    - cleanups

  runs-on:
    group: databricks-protected-runner-group-large
    labels: linux-ubuntu-latest-large

  permissions:
    id-token: write
    contents: read
    # Lets the "Report failure" step create or comment on the tracking issue.
    issues: write

  defaults:
    run:
      shell: bash

  steps:
    - name: Checkout repository and submodules
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

    - name: Setup build environment
      uses: ./.github/actions/setup-build-environment
      with:
        cache-key: test-fuzz

    - name: Run tests
      env:
        # Number of seeds checked per run. Kept small because every seed costs
        # two real deploys; coverage comes from moving the window each night
        # rather than from widening it. Raise once nightly timings are known.
        FUZZ_SEEDS: "25"
      run: |
        # Seed window offset = GITHUB_RUN_NUMBER * FUZZ_SEEDS.
        # GITHUB_RUN_NUMBER is a built-in integer that increases with every run
        # and is unique per run, so with a constant FUZZ_SEEDS the nightly
        # windows never overlap and each run explores configs CI has not seen.
        # Gaps caused by non-schedule runs are acceptable: fresh seeds are
        # required, exhaustive coverage is not. A divergence prints
        # FUZZ_SEED=<n> so it can be reproduced with one command.
        FUZZ_SEED_OFFSET=$(( GITHUB_RUN_NUMBER * FUZZ_SEEDS ))
        export FUZZ_SEED_OFFSET
        go tool -modfile=tools/task/go.mod task test-fuzz

    # Because test-result does not include this job, a red run would go
    # unnoticed unless somebody checks the Actions tab. File a GitHub issue
    # instead, and keep reusing the one open issue carrying the fuzz-nightly
    # label so a recurring divergence adds comments rather than a new issue
    # every night.
    - name: Report failure
      if: failure()
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
      run: |
        # Best effort: errors from label creation (for example when the label
        # already exists) are silenced and ignored.
        gh label create fuzz-nightly \
          --description "Nightly terraform/direct create-payload parity failures" \
          --color FBCA04 2>/dev/null || true

        issue_body=$(cat <<EOF
        The nightly terraform/direct create-payload parity job (\`task test-fuzz\`) failed.

        Run: $RUN_URL

        The failing seed(s) are printed in the job log as \`reproduce with: FUZZ_SEED=<n>\`.
        Reproduce locally with:

        \`\`\`
        FUZZ_SEED=<seed> task test-fuzz
        \`\`\`

        Once fixed, add the seed to \`regressionSeeds\` in \`bundle/fuzz/fuzz_test.go\`
        in the same PR so the divergence can never silently regress.
        EOF
        )

        # Number of the first open issue with the label; empty when none is open.
        open_issue=$(gh issue list --state open --label fuzz-nightly --json number --jq '.[0].number')
        if [ -z "$open_issue" ]; then
          gh issue create --title "Nightly fuzz parity failure" --label fuzz-nightly --body "$issue_body"
        else
          gh issue comment "$open_issue" --body "$issue_body"
        fi

# This job groups the result of all the above test jobs.
# It is a required check, so it blocks auto-merge and the merge queue.
#
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ tools/testmask/testmask
# Release artifacts
dist/

# Terraform binary + provider mirror provisioned by acceptance/install_terraform.py
# for the bundle/fuzz parity tests (see Taskfile `test-fuzz`).
/build/

# Local development notes, tmp
/pr-*
/tmp/
Expand Down
23 changes: 23 additions & 0 deletions Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,29 @@ tasks:
--packages ./acceptance/... \
-- -timeout=${LOCAL_TIMEOUT:-30m} -run "TestAccept/cmd/sandbox"

test-fuzz:
  desc: 'Run terraform/direct create-payload parity fuzz tests (provisions terraform)'
  # Deliberately has no `sources:` fingerprint. Which seeds get checked depends
  # on the FUZZ_SEED, FUZZ_SEEDS and FUZZ_SEED_OFFSET environment variables, and
  # Task cannot see those. Skipping on an unchanged source checksum would turn
  # a FUZZ_SEED=<n> reproduction run, or a shifted nightly window, into a
  # silent no-op, so this task always runs.
  cmds:
    # The parity harness looks for terraform and the provider mirror under
    # <repo>/build, and requireTerraform skips the tests when they are missing,
    # so provision them before running anything.
    - python3 acceptance/install_terraform.py --targetdir build
    - |
      {{.GO_TOOL}} gotestsum \
        --format ${GOTESTSUM_FORMAT:-pkgname-and-test-fails} \
        --no-summary=skipped \
        --packages ./bundle/fuzz/... \
        -- -timeout=${LOCAL_TIMEOUT:-30m}
  env:
    # The terraform parity tests are opt-in (see requireFuzzOptIn) and skip
    # unless some FUZZ_* variable is set; that way a leftover build/ directory
    # never pulls them into a plain `task test`. This fixed flag opts this
    # target in while leaving the FUZZ_SEED(S)/OFFSET tuning knobs untouched.
    FUZZ_PARITY: '1'

# --- Integration tests ---

integration:
Expand Down
2 changes: 2 additions & 0 deletions acceptance/bundle/deploy/wal/chain-3-jobs/output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Exit code: [KILLED]
{
"new_cluster": {
"node_type_id": "[NODE_TYPE_ID]",
"num_workers": 0,
"spark_version": "15.4.x-scala2.12"
},
"spark_python_task": {
Expand Down Expand Up @@ -73,6 +74,7 @@ Exit code: [KILLED]
{
"new_cluster": {
"node_type_id": "[NODE_TYPE_ID]",
"num_workers": 0,
"spark_version": "15.4.x-scala2.12"
},
"spark_python_task": {
Expand Down
1 change: 1 addition & 0 deletions acceptance/bundle/deploy/wal/crash-after-create/output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Exit code: [KILLED]
{
"new_cluster": {
"node_type_id": "[NODE_TYPE_ID]",
"num_workers": 0,
"spark_version": "15.4.x-scala2.12"
},
"spark_python_task": {
Expand Down
2 changes: 2 additions & 0 deletions acceptance/bundle/override/job_tasks/output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
},
{
"new_cluster": {
"num_workers": 0,
"spark_version": "13.3.x-scala2.12"
},
"spark_python_task": {
Expand All @@ -42,6 +43,7 @@ Exit code: 1
"tasks": [
{
"new_cluster": {
"num_workers": 0,
"spark_version": "13.3.x-scala2.12"
},
"spark_python_task": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
"new_cluster": {
"custom_tags": {
"ResourceClass": "SingleNode"
}
},
"num_workers": 0
},
"task_key": "test-task"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
"new_cluster": {
"custom_tags": {
"ResourceClass": "SingleNode"
}
},
"num_workers": 0
},
"task_key": "test-task"
}
Expand Down
1 change: 1 addition & 0 deletions bundle/config/mutator/resourcemutator/cluster_fixups.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ func prepareJobSettingsForUpdate(js *jobs.JobSettings) {
for _, task := range js.Tasks {
if task.NewCluster != nil {
ModifyRequestOnInstancePool(task.NewCluster)
initializeNumWorkers(task.NewCluster)
}
}
for ind := range js.JobClusters {
Expand Down
92 changes: 92 additions & 0 deletions bundle/config/mutator/resourcemutator/cluster_fixups_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package resourcemutator

import (
"testing"

"github.com/databricks/databricks-sdk-go/service/compute"
"github.com/databricks/databricks-sdk-go/service/jobs"
"github.com/stretchr/testify/assert"
)

// TestInitializeNumWorkers is a table-driven check of initializeNumWorkers.
// Each case passes a copy of its ClusterSpec to the function and then counts
// how often "NumWorkers" occurs in the resulting ForceSendFields: exactly once
// when the case expects a force-send, and never otherwise.
func TestInitializeNumWorkers(t *testing.T) {
	type testCase struct {
		name          string
		spec          compute.ClusterSpec
		wantForceSend bool
	}

	cases := []testCase{
		{
			name: "single-node cluster force-sends num_workers",
			spec: compute.ClusterSpec{
				SparkVersion: "15.4.x-scala2.12",
				NodeTypeId:   "i3.xlarge",
			},
			wantForceSend: true,
		},
		{
			name: "autoscale cluster does not force-send",
			spec: compute.ClusterSpec{
				Autoscale: &compute.AutoScale{MinWorkers: 1, MaxWorkers: 4},
			},
			wantForceSend: false,
		},
		{
			name:          "multi-node cluster does not force-send",
			spec:          compute.ClusterSpec{NumWorkers: 3},
			wantForceSend: false,
		},
		{
			name:          "already force-sent stays force-sent without duplicating",
			spec:          compute.ClusterSpec{ForceSendFields: []string{"NumWorkers"}},
			wantForceSend: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Operate on a copy so the table entry itself is not reassigned.
			cluster := tc.spec
			initializeNumWorkers(&cluster)

			// Count rather than just test membership so that a duplicated
			// entry is detected as well as a missing one.
			occurrences := 0
			for _, field := range cluster.ForceSendFields {
				if field == "NumWorkers" {
					occurrences++
				}
			}

			if !tc.wantForceSend {
				assert.Equal(t, 0, occurrences, "NumWorkers must not be in ForceSendFields")
				return
			}
			assert.Equal(t, 1, occurrences, "NumWorkers must appear in ForceSendFields exactly once")
		})
	}
}

// TestPrepareJobSettingsForUpdateForcesNumWorkers locks the DECO-25361 fix: a
// single-node new_cluster must force-send num_workers on task-level clusters too,
// not just shared job_clusters. The terraform provider always sends num_workers:0
// for such clusters, so missing it on the task side made the direct engine
// produce a divergent create payload.
func TestPrepareJobSettingsForUpdateForcesNumWorkers(t *testing.T) {
js := &jobs.JobSettings{
Tasks: []jobs.Task{
{
TaskKey: "single_node_task",
NewCluster: &compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"},
},
{
TaskKey: "autoscale_task",
NewCluster: &compute.ClusterSpec{Autoscale: &compute.AutoScale{MinWorkers: 1, MaxWorkers: 4}},
},
},
JobClusters: []jobs.JobCluster{
{
JobClusterKey: "single_node_cluster",
NewCluster: compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"},
},
},
}

prepareJobSettingsForUpdate(js)

assert.Contains(t, js.Tasks[0].NewCluster.ForceSendFields, "NumWorkers",
"single-node task cluster must force-send num_workers")
assert.NotContains(t, js.Tasks[1].NewCluster.ForceSendFields, "NumWorkers",
"autoscale task cluster must not force-send num_workers")
assert.Contains(t, js.JobClusters[0].NewCluster.ForceSendFields, "NumWorkers",
"single-node job cluster must force-send num_workers")
}
Loading
Loading