Auto rerun transient CI failures #11482
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Analyzes failed CI PR runs for retry-safe transient failures and requests reruns for
# the matched jobs through GitHub's job-rerun API, which also reruns dependent jobs.
# For the supported behaviors and safety rails, see
# docs/ci/auto-rerun-transient-ci-failures.md.
name: Auto rerun transient CI failures

on:
  # Fires whenever a run of the "CI" workflow completes; the job-level `if`
  # narrows this to failed pull_request runs.
  workflow_run:
    workflows: ["CI"]
    types:
      - completed
  # Manual entry point for inspecting one specific CI run.
  workflow_dispatch:
    inputs:
      run_id:
        description: 'CI workflow run ID to inspect'
        required: true
        type: number
      dry_run:
        description: 'Inspect and summarize without requesting reruns'
        required: false
        default: false
        type: boolean

# One concurrency group per inspected CI run (manual run_id or the triggering
# workflow_run id). In-progress analyses are never cancelled.
concurrency:
  group: ${{ github.workflow }}-${{ github.event_name == 'workflow_dispatch' && inputs.run_id || github.event.workflow_run.id }}
  cancel-in-progress: false
jobs:
  analyze-transient-failures:
    name: Analyze transient CI failures
    # Manual dispatches always run. Automatic runs require a failed
    # pull_request-triggered CI run that is at most on its third attempt.
    if: >-
      ${{
      github.repository_owner == 'microsoft' &&
      (github.event_name == 'workflow_dispatch' ||
      (github.event.workflow_run.event == 'pull_request' &&
      github.event.workflow_run.conclusion == 'failure' &&
      github.event.workflow_run.run_attempt <= 3))
      }}
    runs-on: ubuntu-latest
    # Read-only: this job only inspects; the rerun job holds the write scopes.
    permissions:
      actions: read
      checks: read
      contents: read
    outputs:
      source_run_id: ${{ steps.analyze.outputs.source_run_id }}
      source_run_attempt: ${{ steps.analyze.outputs.source_run_attempt }}
      source_run_url: ${{ steps.analyze.outputs.source_run_url }}
      retryable_jobs: ${{ steps.analyze.outputs.retryable_jobs }}
      pull_request_numbers: ${{ steps.analyze.outputs.pull_request_numbers }}
      retryable_count: ${{ steps.analyze.outputs.retryable_count }}
      skipped_count: ${{ steps.analyze.outputs.skipped_count }}
      rerun_eligible: ${{ steps.analyze.outputs.rerun_eligible }}
      rerun_execution_eligible: ${{ steps.analyze.outputs.rerun_execution_eligible }}
      dry_run: ${{ steps.analyze.outputs.dry_run }}
      max_retryable_jobs: ${{ steps.analyze.outputs.max_retryable_jobs }}
      test_pattern_matched_tests: ${{ steps.analyze.outputs.test_pattern_matched_tests }}
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Analyze failed jobs
        id: analyze
        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
        env:
          GITHUB_TOKEN: ${{ github.token }}
          MANUAL_RUN_ID: ${{ inputs.run_id }}
          MANUAL_DRY_RUN: ${{ inputs.dry_run }}
          # TEMPORARY — FORCE_RERUN_ALL (revert when no longer needed):
          # Short-circuit the analysis: if a CI run failed and has a currently-open
          # associated PR, rerun its failed jobs — no job is fetched or classified, and
          # there is no job-count cap. The "CI failed" and "max 3 reruns" gates come from
          # the job-level `if` above; the open-PR requirement and the manual-dispatch
          # attempt cap are preserved. This is a for-now measure until CI auto-rerun
          # patterns are improved; disable by setting this to 'false' (or removing it) on
          # both jobs. See docs/ci/auto-rerun-transient-ci-failures.md.
          FORCE_RERUN_ALL: 'true'
        with:
          script: |
const { execFileSync, execSync } = require('node:child_process');
const fs = require('node:fs');
const os = require('node:os');
const path = require('node:path');
const rerunWorkflow = require('./.github/workflows/auto-rerun-transient-ci-failures.js');
// Repository coordinates of the workflow that is executing this script.
const { owner, repo } = context.repo;
// Manual dispatches carry their own run id / dry-run inputs.
const isWorkflowDispatch = context.eventName === 'workflow_dispatch';
// Job-count cap used by the (non-force) eligibility checks.
const maxRetryableJobs = rerunWorkflow.defaultMaxRetryableJobs;
// Only the tail of each job log (256 KiB) is handed to the classifier.
const maxJobLogInspectionBytes = 256 * 1024;
// FORCE_RERUN_ALL: temporary switch set in this step's `env`; anything other
// than the string 'true' (case-insensitive) leaves force mode off.
const forceRerunAll = String(process.env.FORCE_RERUN_ALL).toLowerCase() === 'true';
/**
 * Collects every item from a paginated GitHub REST endpoint.
 *
 * Requests pages of 100 starting at page 1 and keeps going while the response's
 * `Link` header advertises a `rel="next"` page.
 *
 * @param {string} route - Route template passed to `github.request`.
 * @param {object} parameters - Route parameters; `per_page` and `page` are added here.
 * @param {(data: any) => Iterable<any>} selectItems - Extracts the items from one page's payload.
 * @returns {Promise<any[]>} All items, in page order.
 */
async function paginate(route, parameters, selectItems) {
  const items = [];
  for (let page = 1; ; page++) {
    const response = await github.request(route, {
      ...parameters,
      per_page: 100,
      page,
    });
    items.push(...selectItems(response.data));

    // A missing headers object or Link header means there is no further page.
    const linkHeader = response.headers?.link ?? '';
    if (!linkHeader.includes('rel="next"')) {
      return items;
    }
  }
}
/**
 * Resolves the CI workflow run to inspect.
 *
 * For `workflow_run` triggers this is the run carried in the event payload.
 * For manual dispatches the run is fetched by the `MANUAL_RUN_ID` env value.
 *
 * @returns {Promise<object>} The workflow run object.
 * @throws {Error} When a manual dispatch supplies a run id that is not a positive,
 *   exactly-representable integer.
 */
async function getWorkflowRun() {
  if (!isWorkflowDispatch) {
    return context.payload.workflow_run;
  }

  const runId = Number(process.env.MANUAL_RUN_ID);
  // isSafeInteger also rejects ids too large to be represented exactly, which
  // would otherwise be rounded to a different run id.
  if (!Number.isSafeInteger(runId) || runId <= 0) {
    throw new Error('workflow_dispatch requires a valid run_id input.');
  }

  const response = await github.rest.actions.getWorkflowRun({
    owner,
    repo,
    run_id: runId,
  });
  return response.data;
}
/**
 * Reports whether this invocation is a manual dry run.
 *
 * Dry-run is only available on manual dispatches; every other trigger yields
 * `false` regardless of the `MANUAL_DRY_RUN` env value.
 *
 * @returns {boolean} `true` only for a manual dispatch whose `MANUAL_DRY_RUN`
 *   is the string 'true' (case-insensitive).
 */
function parseManualDryRun() {
  return isWorkflowDispatch && String(process.env.MANUAL_DRY_RUN).toLowerCase() === 'true';
}
/**
 * Lists every job that belongs to one attempt of a workflow run.
 *
 * @param {number} runId - Workflow run id.
 * @param {number} attemptNumber - Attempt of that run whose jobs are listed.
 * @returns {Promise<object[]>} Jobs from all pages; a page without `jobs` contributes nothing.
 */
async function listJobsForAttempt(runId, attemptNumber) {
  const routeParameters = {
    owner,
    repo,
    run_id: runId,
    attempt_number: attemptNumber,
  };
  return paginate(
    'GET /repos/{owner}/{repo}/actions/runs/{run_id}/attempts/{attempt_number}/jobs',
    routeParameters,
    data => data.jobs || []);
}
/**
 * Lists the check-run annotations attached to a job (best effort).
 *
 * The check run id is resolved through `rerunWorkflow.getCheckRunIdForJob`, which
 * is given a callback for fetching the full job record. Any failure is reported
 * as a workflow warning and yields an empty list, so annotation problems never
 * abort the analysis.
 *
 * @param {object} job - Job object; `job.id` is used for lookups and messages.
 * @returns {Promise<object[]>} Annotations, or `[]` when none can be retrieved.
 */
async function listAnnotations(job) {
  try {
    const checkRunId = await rerunWorkflow.getCheckRunIdForJob({
      job,
      getJobForWorkflowRun: async jobId => {
        const response = await github.rest.actions.getJobForWorkflowRun({
          owner,
          repo,
          job_id: jobId,
        });
        return response.data;
      },
    });

    if (!checkRunId) {
      core.warning(`Unable to resolve a check run id for job ${job.id}.`);
      return [];
    }

    return await paginate(
      'GET /repos/{owner}/{repo}/check-runs/{check_run_id}/annotations',
      {
        owner,
        repo,
        check_run_id: checkRunId,
      },
      // Anything other than an array payload is treated as "no annotations".
      data => (Array.isArray(data) ? data : []));
  } catch (error) {
    core.warning(`Failed to list annotations for job ${job.id}: ${error.message}`);
    return [];
  }
}
/**
 * Downloads a job's log and returns its tail for inspection (best effort).
 *
 * At most `maxJobLogInspectionBytes` bytes from the end of the log are kept.
 * The cut is made on the raw bytes, so the limit is a true byte limit, and is
 * moved forward to the next UTF-8 character boundary so the text never starts
 * in the middle of a multi-byte character.
 *
 * @param {number} jobId - Id of the job whose log is fetched.
 * @returns {Promise<string>} The decoded log tail, or `''` when the log cannot
 *   be fetched (a workflow warning is emitted in that case).
 */
async function getJobLogText(jobId) {
  try {
    const response = await fetch(`https://api.github.com/repos/${owner}/${repo}/actions/jobs/${jobId}/logs`, {
      headers: {
        authorization: `Bearer ${process.env.GITHUB_TOKEN}`,
        accept: 'application/vnd.github+json',
        'x-github-api-version': '2022-11-28',
      },
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const logBytes = new Uint8Array(await response.arrayBuffer());
    let tailStart = Math.max(0, logBytes.length - maxJobLogInspectionBytes);
    // 0b10xxxxxx marks a UTF-8 continuation byte: skip forward to a character start.
    while (tailStart > 0 && tailStart < logBytes.length && (logBytes[tailStart] & 0xc0) === 0x80) {
      tailStart++;
    }
    // TextDecoder drops a leading BOM, matching what `response.text()` produced.
    return new TextDecoder('utf-8').decode(logBytes.subarray(tailStart));
  } catch (error) {
    core.warning(`Failed to fetch logs for job ${jobId}: ${error.message}`);
    return '';
  }
}
/**
 * Reads the contents of `.trx` files found under `rootDir`.
 *
 * Symbolic links are never followed, and any entry whose real path resolves
 * outside `rootDir` is ignored. The walk stops as soon as `maxFiles` files have
 * been collected; files larger than `maxFileBytes` are skipped.
 *
 * @param {string} rootDir - Directory to search recursively.
 * @param {{ maxFiles: number, maxFileBytes: number }} limits - Collection caps.
 * @returns {{ fileName: string, content: string }[]} Collected file contents.
 */
function collectTrxFiles(rootDir, { maxFiles, maxFileBytes }) {
  const trxFileContents = [];
  const resolvedRoot = fs.realpathSync(rootDir);

  const visit = dir => {
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      if (trxFileContents.length >= maxFiles) {
        return;
      }
      if (entry.isSymbolicLink()) {
        continue;
      }
      const fullPath = path.join(dir, entry.name);
      const resolvedPath = fs.realpathSync(fullPath);
      const isInsideRoot = resolvedPath === resolvedRoot || resolvedPath.startsWith(resolvedRoot + path.sep);
      if (!isInsideRoot) {
        continue;
      }
      if (entry.isDirectory()) {
        visit(fullPath);
      } else if (entry.name.endsWith('.trx') && fs.statSync(fullPath).size <= maxFileBytes) {
        trxFileContents.push({
          fileName: entry.name,
          content: fs.readFileSync(fullPath, 'utf8'),
        });
      }
    }
  };

  visit(rootDir);
  return trxFileContents;
}

/**
 * Downloads the run's test-results artifact and returns the `.trx` contents in it.
 *
 * The artifact is chosen by `rerunWorkflow.selectTestResultsArtifact`. The zip is
 * extracted into a temporary directory that is always removed afterwards.
 *
 * @param {number} runId - Workflow run whose artifacts are listed.
 * @returns {Promise<{ fileName: string, content: string }[]>} Collected files;
 *   empty when no artifact is selected.
 */
async function downloadTrxFileContents(runId) {
  const artifacts = await paginate(
    'GET /repos/{owner}/{repo}/actions/runs/{run_id}/artifacts',
    { owner, repo, run_id: runId },
    data => data.artifacts || []);
  const testArtifact = rerunWorkflow.selectTestResultsArtifact(artifacts);
  if (!testArtifact) {
    return [];
  }

  console.log(`Downloading test results artifact '${testArtifact.name}' (${testArtifact.size_in_bytes} bytes)...`);
  const download = await github.rest.actions.downloadArtifact({
    owner,
    repo,
    artifact_id: testArtifact.id,
    archive_format: 'zip',
  });

  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-results-'));
  try {
    const zipPath = path.join(tmpDir, 'test-results.zip');
    fs.writeFileSync(zipPath, Buffer.from(download.data));
    const trxDir = path.join(tmpDir, 'trx');
    fs.mkdirSync(trxDir, { recursive: true });
    // Argument array instead of a shell string: paths are never shell-interpreted.
    execFileSync('unzip', ['-qo', zipPath, '-d', trxDir], { timeout: 30_000 });
    return collectTrxFiles(trxDir, {
      maxFiles: 200,
      maxFileBytes: 50 * 1024 * 1024, // 50MB per file cap
    });
  } finally {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  }
}

/**
 * Entry point of the analyze step: inspects the source CI run and publishes the
 * step outputs that decide whether the rerun job executes.
 *
 * @returns {Promise<void>}
 */
async function analyzeRun() {
  const workflowRun = await getWorkflowRun();
  const dryRun = parseManualDryRun();
  const sourceRunUrl = workflowRun.html_url || `https://github.com/${owner}/${repo}/actions/runs/${workflowRun.id}`;

  // Publish identifying outputs plus "nothing to rerun" defaults first, so every
  // early exit below leaves the outputs in a consistent state.
  const initialOutputs = [
    ['source_run_id', String(workflowRun.id)],
    ['source_run_attempt', String(workflowRun.run_attempt || '')],
    ['source_run_url', sourceRunUrl],
    ['dry_run', String(dryRun)],
    ['max_retryable_jobs', String(maxRetryableJobs)],
    ['retryable_jobs', '[]'],
    ['pull_request_numbers', '[]'],
    ['retryable_count', '0'],
    ['skipped_count', '0'],
    ['rerun_eligible', 'false'],
    ['rerun_execution_eligible', 'false'],
    ['test_pattern_matched_tests', '[]'],
  ];
  for (const [outputName, outputValue] of initialOutputs) {
    core.setOutput(outputName, outputValue);
  }

  if (workflowRun.name && workflowRun.name !== 'CI') {
    console.log(`Workflow run ${workflowRun.id} is '${workflowRun.name}', not 'CI'. Skipping.`);
    return;
  }

  const pullRequestNumbers = await rerunWorkflow.getAssociatedPullRequestNumbers({
    github,
    owner,
    repo,
    workflowRun,
    warn: message => core.warning(message),
  });
  core.setOutput('pull_request_numbers', JSON.stringify(pullRequestNumbers));

  // The open-PR requirement applies in all modes: skip runs with no
  // associated PR. Force mode does not bypass this — there is no value in
  // spending CI on a run that has no open PR behind it.
  if (pullRequestNumbers.length === 0) {
    console.log('No associated pull request could be resolved for this workflow run. Skipping.');
    return;
  }

  // TEMPORARY — FORCE_RERUN_ALL short-circuit (revert when no longer needed):
  // The run failed (job-level `if`) and has an associated PR (checked above),
  // so request a rerun without fetching or classifying any jobs. Only the
  // attempt cap is re-checked here (it matters for the manual-dispatch path,
  // which bypasses the trigger's run_attempt gate). The retryable_jobs output
  // stays '[]' on purpose: the rerun job uses GitHub's rerun-failed-jobs API,
  // which reruns every failed job regardless of this list, and the final
  // open-PR state is re-checked there. See the JS file-level comment.
  if (forceRerunAll) {
    const forceRunAttempt = workflowRun.run_attempt;
    const forceRerunEligible = rerunWorkflow.computeRerunEligibility({
      runAttempt: forceRunAttempt,
      forceRerunAll: true,
    });
    const forceRerunExecutionEligible = rerunWorkflow.computeRerunExecutionEligibility({
      dryRun,
      runAttempt: forceRunAttempt,
      forceRerunAll: true,
    });
    core.setOutput('rerun_eligible', String(forceRerunEligible));
    core.setOutput('rerun_execution_eligible', String(forceRerunExecutionEligible));
    await rerunWorkflow.writeForceRerunSummary({
      summary: core.summary,
      rerunEligible: forceRerunEligible,
      dryRun,
      sourceRunUrl,
      sourceRunAttempt: forceRunAttempt,
      runAttempt: forceRunAttempt,
      openPullRequestNumbers: pullRequestNumbers,
    });
    if (!forceRerunEligible) {
      console.log(`Force-rerun mode: attempt cap reached (attempt ${forceRunAttempt}). Skipping.`);
    }
    return;
  }

  const runId = workflowRun.id;
  const runAttempt = workflowRun.run_attempt;
  const jobs = await listJobsForAttempt(runId, runAttempt);

  // Load test retry patterns config
  const configPath = path.join(process.env.GITHUB_WORKSPACE, 'eng', 'test-retry-patterns.json');
  const { config: retryPatternsConfig, errors: configErrors } = rerunWorkflow.loadRetryPatternsConfig(configPath);
  if (configErrors.length > 0) {
    core.warning(`Test retry patterns config has errors: ${configErrors.join('; ')}`);
  }

  let { failedJobs, retryableJobs, skippedJobs } = await rerunWorkflow.analyzeFailedJobs({
    jobs,
    getAnnotationsForJob: async job => listAnnotations(job),
    getJobLogTextForJob: async job => getJobLogText(job.id),
    maxRetryableJobs,
    retryPatternsConfig,
  });

  // TRX-based analysis: check test output for transient patterns. It only runs
  // when a skipped job failed in a test-execution step and patterns are configured.
  let testPatternMatchedTests = [];
  const hasSkippedTestExecJobs = skippedJobs.some(job =>
    rerunWorkflow.hasTestExecutionFailureStep(job.failedSteps));
  const testFailurePatterns = retryPatternsConfig?.testFailurePatterns;
  if (hasSkippedTestExecJobs && Array.isArray(testFailurePatterns) && testFailurePatterns.length > 0) {
    try {
      const trxFileContents = await downloadTrxFileContents(runId);
      if (trxFileContents.length > 0) {
        const { allMatchedTests } = rerunWorkflow.analyzeTrxFiles(trxFileContents, testFailurePatterns);
        if (allMatchedTests.length > 0) {
          console.log(`Found ${allMatchedTests.length} test(s) matching transient failure patterns.`);
          const promoted = rerunWorkflow.promoteTestExecutionFailureJobs(retryableJobs, skippedJobs, allMatchedTests);
          retryableJobs = promoted.retryableJobs;
          skippedJobs = promoted.skippedJobs;
          testPatternMatchedTests = allMatchedTests;
        }
      }
    } catch (trxError) {
      // TRX inspection is an optional refinement; the job-level result stands.
      core.warning(`TRX analysis failed (non-fatal): ${trxError.message}`);
    }
  }

  core.setOutput('retryable_jobs', JSON.stringify(retryableJobs.map(job => ({
    id: job.id,
    name: job.name,
    htmlUrl: job.htmlUrl,
    reason: job.reason,
  }))));
  core.setOutput('retryable_count', String(retryableJobs.length));
  core.setOutput('skipped_count', String(skippedJobs.length));

  const rerunEligible = rerunWorkflow.computeRerunEligibility({
    retryableCount: retryableJobs.length,
    maxRetryableJobs,
    runAttempt,
  });
  const rerunExecutionEligible = rerunWorkflow.computeRerunExecutionEligibility({
    dryRun,
    retryableCount: retryableJobs.length,
    maxRetryableJobs,
    runAttempt,
  });
  core.setOutput('rerun_eligible', String(rerunEligible));
  core.setOutput('rerun_execution_eligible', String(rerunExecutionEligible));
  // Only the first 50 matched tests are forwarded in the output.
  core.setOutput('test_pattern_matched_tests', JSON.stringify(testPatternMatchedTests.slice(0, 50).map(t => ({
    testName: t.testName,
    reason: t.reason,
  }))));

  await rerunWorkflow.writeAnalysisSummary({
    summary: core.summary,
    failedJobs,
    retryableJobs,
    skippedJobs,
    maxRetryableJobs,
    dryRun,
    rerunEligible,
    sourceRunUrl,
    sourceRunAttempt: runAttempt,
    testPatternMatchedTests,
  });

  if (retryableJobs.length === 0) {
    console.log('No retryable failed jobs were detected.');
  }
}

await analyzeRun();
rerun-transient-failures:
  name: Rerun transient CI failures
  needs: [analyze-transient-failures]
  # Runs only when the analyze job marked the run as eligible for an actual rerun.
  if: ${{ needs.analyze-transient-failures.outputs.rerun_execution_eligible == 'true' }}
  runs-on: ubuntu-latest
  permissions:
    actions: write
    contents: read
    issues: write
    pull-requests: write
  steps:
    - name: Checkout code
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
    - name: Rerun matched jobs
      uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
      env:
        RETRYABLE_JOBS: ${{ needs.analyze-transient-failures.outputs.retryable_jobs }}
        PULL_REQUEST_NUMBERS: ${{ needs.analyze-transient-failures.outputs.pull_request_numbers }}
        SOURCE_RUN_ID: ${{ needs.analyze-transient-failures.outputs.source_run_id }}
        SOURCE_RUN_ATTEMPT: ${{ needs.analyze-transient-failures.outputs.source_run_attempt }}
        SOURCE_RUN_URL: ${{ needs.analyze-transient-failures.outputs.source_run_url }}
        TEST_PATTERN_MATCHED_TESTS: ${{ needs.analyze-transient-failures.outputs.test_pattern_matched_tests }}
        # TEMPORARY — FORCE_RERUN_ALL (revert when no longer needed): in force mode the
        # analyze step does not enumerate jobs, so RETRYABLE_JOBS is '[]'. This flag tells
        # the rerun step to proceed anyway (GitHub's rerun-failed-jobs API reruns every
        # failed job) and to use the short force-mode PR comment / summary wording. Keep
        # in sync with the analyze step's FORCE_RERUN_ALL.
        FORCE_RERUN_ALL: 'true'
      with:
        script: |
// Entry script of the rerun job: forwards the analyze job's results to the
// shared helper module, which requests the rerun.
const rerunWorkflow = require('./.github/workflows/auto-rerun-transient-ci-failures.js');
const { owner, repo } = context.repo;

// Values handed over from the analyze job through this step's `env`.
const retryableJobs = JSON.parse(process.env.RETRYABLE_JOBS || '[]');
const pullRequestNumbers = JSON.parse(process.env.PULL_REQUEST_NUMBERS || '[]');
const sourceRunId = Number(process.env.SOURCE_RUN_ID);
const sourceRunAttempt = Number(process.env.SOURCE_RUN_ATTEMPT);
const sourceRunUrl = process.env.SOURCE_RUN_URL;
const testPatternMatchedTests = JSON.parse(process.env.TEST_PATTERN_MATCHED_TESTS || '[]');
// FORCE_RERUN_ALL: temporary switch set in this step's `env`.
const forceRerunAll = String(process.env.FORCE_RERUN_ALL).toLowerCase() === 'true';

// Normal mode passes the retry-safe job list; an empty list means nothing to
// rerun. Force mode intentionally passes an empty list (no enumeration), so it
// must proceed to the rerun.
if (!forceRerunAll && retryableJobs.length === 0) {
  console.log('No retryable jobs were provided to the rerun job.');
} else {
  await rerunWorkflow.rerunMatchedJobs({
    github,
    owner,
    repo,
    retryableJobs,
    pullRequestNumbers,
    summary: core.summary,
    sourceRunId,
    sourceRunAttempt,
    sourceRunUrl,
    testPatternMatchedTests,
    forceRerunAll,
  });
}