From 5e153412058ef1813ad922278dc7db776f5bd501 Mon Sep 17 00:00:00 2001 From: "roshanmaind194@bitgo.com" Date: Thu, 4 Jun 2026 10:36:11 +0000 Subject: [PATCH 1/2] ci: retry and alert on GitHub release creation failure The Create GitHub release step in npmjs-release.yml runs after npm publish, which is irreversible. Previously it was marked continue-on-error: true, so a failure (e.g. the GitHub API rate limit seen in VL-5474) silently turned the job yellow with no alert and no GitHub release. Wrap the gh release create call in a 3-attempt retry loop with 30s, 60s, 90s backoff and drop continue-on-error so a final failure fails the job. Add a follow-up step that fires only when the release step fails, posting a Slack notification (via SLACK_RELEASE_WEBHOOK_URL webhook secret) that identifies the run and version requiring manual remediation. Ticket: VL-6353 Session-Id: b035fd16-81c0-4327-b03e-fb4e0dce6501 Task-Id: f5755602-d6f6-4284-b891-f02b996e3188 --- .github/workflows/npmjs-release.yml | 43 +++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/.github/workflows/npmjs-release.yml b/.github/workflows/npmjs-release.yml index 59054057f1..2e84a89df1 100644 --- a/.github/workflows/npmjs-release.yml +++ b/.github/workflows/npmjs-release.yml @@ -305,16 +305,49 @@ jobs: echo "New version: $NEW_VERSION" echo "new-version=$NEW_VERSION" >> "$GITHUB_OUTPUT" + # NPM publish has already happened by this point, so a failure here + # requires manual remediation (see VL-5474). Retry transient failures, + # then surface as a job failure — no continue-on-error. - name: Create GitHub release if: inputs.dry-run == false && steps.version-bump-summary.outcome == 'success' - continue-on-error: true + id: create-github-release env: GH_TOKEN: ${{ secrets.BITGOBOT_PAT_TOKEN || github.token }} run: | - gh release create "v${{ steps.extract-version.outputs.new-version }}" \ - --latest \ - --title "v${{ steps.extract-version.outputs.new-version }}" \ - --notes-file "${{ steps.version-bump-summary.outputs.text-file }}" + for attempt in 1 2 3; do + if gh release create "v${{ steps.extract-version.outputs.new-version }}" \ + --latest \ + --title "v${{ steps.extract-version.outputs.new-version }}" \ + --notes-file "${{ steps.version-bump-summary.outputs.text-file }}"; then + echo "GitHub release created on attempt $attempt" + exit 0 + fi + delay=$((attempt * 30)) + echo "Attempt $attempt failed. Retrying in ${delay}s..." + sleep "$delay" + done + echo "::error::All retry attempts exhausted creating GitHub release v${{ steps.extract-version.outputs.new-version }}." + exit 1 + + - name: Notify on GitHub release failure + if: ${{ always() && steps.create-github-release.outcome == 'failure' }} + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_RELEASE_WEBHOOK_URL }} + VERSION: ${{ steps.extract-version.outputs.new-version }} + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + echo "::error::GitHub release creation failed after 3 attempts for v${VERSION}. NPM publish succeeded; create the GitHub release manually. Run: ${RUN_URL}" + if [ -z "$SLACK_WEBHOOK_URL" ]; then + echo "::warning::SLACK_RELEASE_WEBHOOK_URL is not configured; skipping Slack notification." + exit 0 + fi + payload=$(jq -nc \ + --arg text ":rotating_light: BitGoJS GitHub release creation failed for v${VERSION} after 3 retries. NPM publish already succeeded — manual GitHub release required. Run: ${RUN_URL}" \ + '{text: $text}') + curl -sS -X POST -H 'Content-Type: application/json' \ + --data "$payload" \ + --retry 3 --retry-delay 5 --max-time 30 \ + "$SLACK_WEBHOOK_URL" || echo "::warning::Slack notification POST failed." get-express-release-context: name: Get Express release context From 659778bd56f2a0df206739c127bbc317609eaa50 Mon Sep 17 00:00:00 2001 From: "roshanmaind194@bitgo.com" Date: Thu, 4 Jun 2026 10:48:49 +0000 Subject: [PATCH 2/2] ci: drop slack webhook step and skip last-attempt sleep Existing org-level Slack notifications already fire on job failure for this repo, so a dedicated SLACK_RELEASE_WEBHOOK_URL step was redundant. Removing it eliminates the unprovisioned secret and the silent no-op fallback. The job still fails non-zero on exhausted retries (no continue-on-error), which is what the existing notifier hooks into. Also guard the retry-loop sleep with `attempt -lt 3` so the loop no longer sleeps 90s after the final failed attempt before exit 1. Ticket: VL-6353 Session-Id: 171773d2-746b-4f79-8b23-5ad8c11f2e5e Task-Id: d6793c61-de39-41c3-80f1-5352025fb58c --- .github/workflows/npmjs-release.yml | 33 +++++++---------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/.github/workflows/npmjs-release.yml b/.github/workflows/npmjs-release.yml index 2e84a89df1..1ab784c04c 100644 --- a/.github/workflows/npmjs-release.yml +++ b/.github/workflows/npmjs-release.yml @@ -307,7 +307,8 @@ jobs: # NPM publish has already happened by this point, so a failure here # requires manual remediation (see VL-5474). Retry transient failures, - # then surface as a job failure — no continue-on-error. + # then surface as a job failure — no continue-on-error. Existing + # org-level Slack notifications fire on the resulting job failure. - name: Create GitHub release if: inputs.dry-run == false && steps.version-bump-summary.outcome == 'success' id: create-github-release @@ -322,33 +323,15 @@ jobs: echo "GitHub release created on attempt $attempt" exit 0 fi - delay=$((attempt * 30)) - echo "Attempt $attempt failed. Retrying in ${delay}s..." - sleep "$delay" + if [ "$attempt" -lt 3 ]; then + delay=$((attempt * 30)) + echo "Attempt $attempt failed. Retrying in ${delay}s..." + sleep "$delay" + fi done - echo "::error::All retry attempts exhausted creating GitHub release v${{ steps.extract-version.outputs.new-version }}." + echo "::error::All retry attempts exhausted creating GitHub release v${{ steps.extract-version.outputs.new-version }}. NPM publish succeeded; manual GitHub release required." exit 1 - - name: Notify on GitHub release failure - if: ${{ always() && steps.create-github-release.outcome == 'failure' }} - env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_RELEASE_WEBHOOK_URL }} - VERSION: ${{ steps.extract-version.outputs.new-version }} - RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - run: | - echo "::error::GitHub release creation failed after 3 attempts for v${VERSION}. NPM publish succeeded; create the GitHub release manually. Run: ${RUN_URL}" - if [ -z "$SLACK_WEBHOOK_URL" ]; then - echo "::warning::SLACK_RELEASE_WEBHOOK_URL is not configured; skipping Slack notification." - exit 0 - fi - payload=$(jq -nc \ - --arg text ":rotating_light: BitGoJS GitHub release creation failed for v${VERSION} after 3 retries. NPM publish already succeeded — manual GitHub release required. Run: ${RUN_URL}" \ - '{text: $text}') - curl -sS -X POST -H 'Content-Type: application/json' \ - --data "$payload" \ - --retry 3 --retry-delay 5 --max-time 30 \ - "$SLACK_WEBHOOK_URL" || echo "::warning::Slack notification POST failed." - get-express-release-context: name: Get Express release context if: ${{ always() && inputs.dry-run == false && needs.release-bitgojs.result == 'success' }}