Skip to content

Commit 76927c2

Browse files
authored
Fix Google Batch exit code when spot claim is successfully retried (#6926)
1 parent 4d36d22 commit 76927c2

2 files changed

Lines changed: 11 additions & 5 deletions

File tree

plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,10 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
688688
task.stderr = executor.logging.stderr(uid, taskId) ?: errorFile
689689
}
690690
else {
691+
// Retried spot instances can keep the 500xx exit code event when the automatic retry succeeds. In this case, we need to read the exit code from the .exitcode file
692+
// https://github.com/nextflow-io/nextflow/issues/6779
693+
if( task.exitStatus >= 50000 )
694+
task.exitStatus = readExitFile()
691695
task.stdout = outputFile
692696
task.stderr = errorFile
693697
}

plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -796,7 +796,7 @@ class GoogleBatchTaskHandlerTest extends Specification {
796796
def client = Mock(BatchClient){
797797
getTaskInArrayStatus(jobId, taskId) >> { TASK_STATE ? makeTaskStatus(TASK_STATE, DESC, EXIT_CODE): null }
798798
getTaskStatus(jobId, taskId) >> { TASK_STATE ? makeTaskStatus(TASK_STATE, DESC, EXIT_CODE): null }
799-
getJobStatus(jobId ) >> makeJobStatus(JobStatus.State.FAILED,DESC)
799+
getJobStatus(jobId ) >> makeJobStatus(JOB_STATE,DESC)
800800
}
801801
def logging = Mock(BatchLogging)
802802
def executor = Mock(GoogleBatchExecutor){
@@ -808,16 +808,18 @@ class GoogleBatchTaskHandlerTest extends Specification {
808808
when:
809809
def result = handler.checkIfCompleted()
810810
then:
811-
0 * handler.readExitFile() >> EXIT_STATUS
811+
NUM_READ_EXIT * handler.readExitFile() >> EXIT_STATUS
812812
handler.status == TASK_STATUS
813813
handler.task.exitStatus == EXIT_STATUS
814814
handler.task.error?.message == TASK_ERROR
815815
result == RESULT
816816

817817
where:
818-
TASK_STATE | DESC | EXIT_CODE | ARRAY_CHILD | TASK_STATUS | EXIT_STATUS | RESULT | TASK_ERROR
819-
TaskStatus.State.FAILED | 'Task failed due to Spot VM preemption with exit code 50001.' | 50001 | true | nextflow.processor.TaskStatus.COMPLETED | 50001 | true | 'Task failed due to Spot VM preemption with exit code 50001.'
820-
TaskStatus.State.FAILED | 'Task failed due to Spot VM preemption with exit code 50001.' | 50001 | false | nextflow.processor.TaskStatus.COMPLETED | 50001 | true | 'Task failed due to Spot VM preemption with exit code 50001.'
818+
TASK_STATE | JOB_STATE | NUM_READ_EXIT | DESC | EXIT_CODE | ARRAY_CHILD | TASK_STATUS | EXIT_STATUS | RESULT | TASK_ERROR
819+
TaskStatus.State.FAILED | JobStatus.State.FAILED | 0 | 'Task failed due to Spot VM preemption with exit code 50001.' | 50001 | true | nextflow.processor.TaskStatus.COMPLETED | 50001 | true | 'Task failed due to Spot VM preemption with exit code 50001.'
820+
TaskStatus.State.FAILED | JobStatus.State.FAILED | 0 | 'Task failed due to Spot VM preemption with exit code 50001.' | 50001 | false | nextflow.processor.TaskStatus.COMPLETED | 50001 | true | 'Task failed due to Spot VM preemption with exit code 50001.'
821+
TaskStatus.State.SUCCEEDED | JobStatus.State.SUCCEEDED | 1 | null | 50001 | true | nextflow.processor.TaskStatus.COMPLETED | 0 | true | null
822+
TaskStatus.State.SUCCEEDED | JobStatus.State.SUCCEEDED | 1 | null | 50001 | false | nextflow.processor.TaskStatus.COMPLETED | 0 | true | null
821823
}
822824

823825
StatusEvent makeStatusEventWithTime(long seconds, Integer exitCode) {

0 commit comments

Comments
 (0)