Skip to content

Commit 9879d5f

Browse files
authored
Support retryable_step_names in retryBot for per-repo retryable step config (#7876)
Allow repos to declare step names (e.g. "CUDA Compute Check") that should always trigger a retry when they fail, regardless of whether test steps also failed. This enables retrying Helion jobs on GPU health check failures. The corresponding PR on the Helion side is pytorch/helion#1808. Signed-off-by: Huy Do <huydhn@gmail.com>
1 parent 1972edf commit 9879d5f

File tree

2 files changed

+130
-2
lines changed

2 files changed

+130
-2
lines changed

torchci/lib/bot/retryBot.ts

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ async function retryCurrentWorkflow(
7272
defaultBranch: string,
7373
workflowName: string,
7474
workflowJobs: any[],
75-
runId: number
75+
runId: number,
76+
retryableStepNames: string[] = []
7677
) {
7778
const failedJobs = workflowJobs.filter((job) =>
7879
FAILURE_CONCLUSIONS.includes(job.conclusion!)
@@ -135,6 +136,22 @@ async function retryCurrentWorkflow(
135136
}
136137
}
137138

139+
// If a retryable step name failed, always retry (e.g. CUDA Compute Check
140+
// indicates a bad runner, not a code problem)
141+
if (retryableStepNames.length > 0) {
142+
const hasRetryableStepFailure = job.steps?.some(
143+
(step: any) =>
144+
step.conclusion !== null &&
145+
FAILURE_CONCLUSIONS.includes(step.conclusion) &&
146+
retryableStepNames.some(
147+
(name) => step.name.toLowerCase() === name.toLowerCase()
148+
)
149+
);
150+
if (hasRetryableStepFailure) {
151+
return true;
152+
}
153+
}
154+
138155
// if no test steps failed, can rerun
139156
return !doesLookLikeUserFailure(job, (step) =>
140157
step.name.toLowerCase().includes("test")
@@ -181,6 +198,8 @@ function retryBot(app: Probot): void {
181198
const config: any = await tracker.loadConfig(ctx);
182199
const allowedWorkflowPrefixes: string[] | undefined =
183200
config != null ? config["retryable_workflows"] : undefined;
201+
const retryableStepNames: string[] =
202+
config != null ? config["retryable_step_names"] ?? [] : [];
184203

185204
if (allowedWorkflowPrefixes === undefined) {
186205
return;
@@ -225,7 +244,8 @@ function retryBot(app: Probot): void {
225244
defaultBranch,
226245
workflowName,
227246
workflowJobs,
228-
runId
247+
runId,
248+
retryableStepNames
229249
);
230250
}
231251

torchci/test/retryBot.test.ts

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,114 @@ describe("retry-bot", () => {
440440
handleScope(scope);
441441
});
442442

443+
test("rerun when a retryable step name fails", async () => {
444+
mockIsPytorchbotSupportedOrg(true);
445+
const event = requireDeepCopy("./fixtures/workflow_run.completed.json");
446+
event.payload.workflow_run.name = "test";
447+
const workflow_jobs = requireDeepCopy("./fixtures/workflow_jobs.json");
448+
// Job fails on a retryable step (CUDA Compute Check)
449+
workflow_jobs.jobs[4].conclusion = "failure";
450+
workflow_jobs.jobs[4].steps = [
451+
{
452+
name: "CUDA Compute Check",
453+
status: "completed",
454+
conclusion: "failure",
455+
number: 1,
456+
started_at: "2022-10-06T18:09:54.000-07:00",
457+
completed_at: "2022-10-06T18:09:54.000-07:00",
458+
},
459+
];
460+
461+
const owner = event.payload.repository.owner.login;
462+
const repo = event.payload.repository.name;
463+
const attempt_number = event.payload.workflow_run.run_attempt;
464+
const run_id = event.payload.workflow_run.id;
465+
466+
const scope = nock("https://api.github.com")
467+
.get(
468+
`/repos/${owner}/${repo}/actions/runs/${run_id}/attempts/${attempt_number}/jobs?page=1&per_page=100`
469+
)
470+
.reply(200, workflow_jobs)
471+
.get(
472+
`/repos/${owner}/${repo}/contents/${encodeURIComponent(
473+
".github/pytorch-probot.yml"
474+
)}`
475+
)
476+
.reply(
477+
200,
478+
'{retryable_workflows: ["test", "benchmark"], retryable_step_names: ["CUDA Compute Check"]}'
479+
)
480+
.post(
481+
`/repos/${owner}/${repo}/actions/jobs/${workflow_jobs.jobs[4].id}/rerun`
482+
)
483+
.reply(200);
484+
485+
const mock = jest.spyOn(clickhouse, "queryClickhouseSaved");
486+
mock.mockImplementation(() => Promise.resolve([]));
487+
488+
await probot.receive(event);
489+
490+
handleScope(scope);
491+
});
492+
493+
test("rerun when retryable step fails even if test step also failed", async () => {
494+
mockIsPytorchbotSupportedOrg(true);
495+
const event = requireDeepCopy("./fixtures/workflow_run.completed.json");
496+
event.payload.workflow_run.name = "test";
497+
const workflow_jobs = requireDeepCopy("./fixtures/workflow_jobs.json");
498+
// Job fails on both a retryable step and a test step
499+
workflow_jobs.jobs[4].conclusion = "failure";
500+
workflow_jobs.jobs[4].steps = [
501+
{
502+
name: "CUDA Compute Check",
503+
status: "completed",
504+
conclusion: "failure",
505+
number: 1,
506+
started_at: "2022-10-06T18:09:54.000-07:00",
507+
completed_at: "2022-10-06T18:09:54.000-07:00",
508+
},
509+
{
510+
name: "test",
511+
status: "completed",
512+
conclusion: "failure",
513+
number: 2,
514+
started_at: "2022-10-06T18:10:54.000-07:00",
515+
completed_at: "2022-10-06T18:10:54.000-07:00",
516+
},
517+
];
518+
519+
const owner = event.payload.repository.owner.login;
520+
const repo = event.payload.repository.name;
521+
const attempt_number = event.payload.workflow_run.run_attempt;
522+
const run_id = event.payload.workflow_run.id;
523+
524+
const scope = nock("https://api.github.com")
525+
.get(
526+
`/repos/${owner}/${repo}/actions/runs/${run_id}/attempts/${attempt_number}/jobs?page=1&per_page=100`
527+
)
528+
.reply(200, workflow_jobs)
529+
.get(
530+
`/repos/${owner}/${repo}/contents/${encodeURIComponent(
531+
".github/pytorch-probot.yml"
532+
)}`
533+
)
534+
.reply(
535+
200,
536+
'{retryable_workflows: ["test", "benchmark"], retryable_step_names: ["CUDA Compute Check"]}'
537+
)
538+
.post(
539+
`/repos/${owner}/${repo}/actions/jobs/${workflow_jobs.jobs[4].id}/rerun`
540+
)
541+
.reply(200);
542+
543+
const mock = jest.spyOn(clickhouse, "queryClickhouseSaved");
544+
mock.mockImplementation(() => Promise.resolve([]));
545+
546+
await probot.receive(event);
547+
548+
handleScope(scope);
549+
});
550+
443551
test("dont re-run unless retryable_workflows is specified in .github/pytorch-probot.yml", async () => {
444552
mockIsPytorchbotSupportedOrg(true);
445553
const event = requireDeepCopy("./fixtures/workflow_run.completed.json");

0 commit comments

Comments
 (0)