|
43 | 43 | produce_k8s_job_prefix, |
44 | 44 | pull_policy, |
45 | 45 | pykube_client_from_dict, |
| 46 | + reload_job, |
46 | 47 | Service, |
47 | 48 | service_object_dict, |
48 | 49 | ) |
|
53 | 54 |
|
54 | 55 | __all__ = ("KubernetesJobRunner",) |
55 | 56 |
|
56 | | - |
57 | 57 | @dataclass |
58 | | -class RetryableDeleteJob: |
59 | | - k8s_job: Job |
60 | | - retries: int = 5 # Max number of retries |
61 | | - attempts: int = 0 # Current number of attempts |
62 | | - |
| 58 | +class RetryableDeleteJobState(JobState): |
| 59 | + def __init__(self, job_state, k8s_job, max_retries=5, attempts=0):
| 60 | + self.__dict__ = job_state.__dict__.copy() |
| 61 | + self.init_retryable_job(max_retries, attempts) |
| 62 | + self.k8s_job = k8s_job |
| 63 | + |
| 64 | + def init_retryable_job(self, max_retries, attempts): |
| 65 | + self.max_retries: int = max_retries |
| 66 | + self.attempts: int = attempts |
63 | 67 |
|
64 | 68 | class KubernetesJobRunner(AsynchronousJobRunner): |
65 | 69 | """ |
@@ -839,7 +843,7 @@ def _handle_unschedulable_job(self, k8s_job, job_state): |
839 | 843 | if self.__has_guest_ports(job_state.job_wrapper): |
840 | 844 | self.__cleanup_k8s_guest_ports(job_state.job_wrapper, k8s_job) |
841 | 845 | # Wrap the k8s job before we put it in the work queue so it can be retried a few times |
842 | | - self.work_queue.put((self.__cleanup_k8s_job, RetryableDeleteJob(k8s_job=k8s_job))) |
| 846 | + self.work_queue.put((self.__cleanup_k8s_job, RetryableDeleteJobState(job_state=job_state, k8s_job=k8s_job))) |
843 | 847 | except Exception: |
844 | 848 | log.exception("Could not clean up an unschedulable k8s batch job. Ignoring...") |
845 | 849 | return None |
@@ -879,33 +883,40 @@ def _handle_job_failure(self, k8s_job, job_state): |
879 | 883 | if self.__has_guest_ports(job_state.job_wrapper): |
880 | 884 | self.__cleanup_k8s_guest_ports(job_state.job_wrapper, k8s_job) |
881 | 885 | # Wrap the k8s job before we put it in the work queue so it can be retried a few times |
882 | | - self.work_queue.put((self.__cleanup_k8s_job, RetryableDeleteJob(k8s_job=k8s_job))) |
| 886 | + self.work_queue.put((self.__cleanup_k8s_job, RetryableDeleteJobState(job_state=job_state, k8s_job=k8s_job))) |
883 | 887 | except Exception: |
884 | 888 | log.exception("Could not clean up a failed k8s batch job. Ignoring...") |
885 | 889 | return mark_failed |
886 | 890 |
|
887 | | - def __cleanup_k8s_job(self, retryable_delete_k8s_job: RetryableDeleteJob): |
888 | | - k8s_job = retryable_delete_k8s_job.k8s_job |
889 | | - log.debug(f"Cleaning up job with K8s id {k8s_job.name} (attempt {retryable_delete_k8s_job.attempts + 1}).") |
| 891 | + def __cleanup_k8s_job(self, retryable_delete_k8s_job_state: RetryableDeleteJobState): |
| 892 | + k8s_job = retryable_delete_k8s_job_state.k8s_job |
| 893 | + log.debug(f"Cleaning up job with K8s id {k8s_job.name} (attempt {retryable_delete_k8s_job_state.attempts + 1}).") |
890 | 894 | k8s_cleanup_job = self.runner_params["k8s_cleanup_job"] |
891 | 895 | try: |
892 | 896 | delete_job(k8s_job, k8s_cleanup_job) |
893 | 897 | except HTTPError as exc: |
894 | | - if retryable_delete_k8s_job.retries < 1: |
| 898 | + # If job not found, then previous deletion was successful |
| 899 | + if exc.code == 404 and retryable_delete_k8s_job_state.attempts >= 1: |
| 900 | + log.warning( |
| 901 | + f"Cleanup job with K8s id {k8s_job.name} skipped as it is no longer available (404) and a previous deletion was triggered." |
| 902 | + ) |
| 903 | + return |
| 904 | + if retryable_delete_k8s_job_state.max_retries <= retryable_delete_k8s_job_state.attempts: |
895 | 905 | log.error( |
896 | | - f"Failed to cleanup job with K8s id {k8s_job.name} after {retryable_delete_k8s_job.attempts} attempts; giving up." |
| 906 | + f"Failed to cleanup job with K8s id {k8s_job.name} after {retryable_delete_k8s_job_state.attempts} of {retryable_delete_k8s_job_state.max_retries} attempts; giving up." |
897 | 907 | ) |
898 | 908 | raise exc |
899 | 909 | else: |
900 | 910 | # Refresh the job to resolve object & cluster conflicts |
901 | | - k8s_job.reload() |
| 911 | + reload_job(k8s_job) |
902 | 912 | # Try the cleanup again |
903 | | - new_retryable_job = RetryableDeleteJob( |
| 913 | + new_retryable_job_state = RetryableDeleteJobState( |
| 914 | + job_state=retryable_delete_k8s_job_state, |
904 | 915 | k8s_job=k8s_job, |
905 | | - retries=retryable_delete_k8s_job.retries - 1, |
906 | | - attempts=retryable_delete_k8s_job.attempts + 1, |
| 916 | + max_retries=retryable_delete_k8s_job_state.max_retries, |
| 917 | + attempts=retryable_delete_k8s_job_state.attempts + 1, |
907 | 918 | ) |
908 | | - self.work_queue.put((self.__cleanup_k8s_job, new_retryable_job)) |
| 919 | + self.work_queue.put((self.__cleanup_k8s_job, new_retryable_job_state)) |
909 | 920 |
|
910 | 921 | def __cleanup_k8s_ingress(self, ingress, job_failed=False): |
911 | 922 | k8s_cleanup_job = self.runner_params["k8s_cleanup_job"] |
@@ -1023,7 +1034,7 @@ def stop_job(self, job_wrapper): |
1023 | 1034 | log.debug(f"Job {gxy_job.id} ({gxy_job.job_runner_external_id}) has guest ports, cleaning them up") |
1024 | 1035 | self.__cleanup_k8s_guest_ports(job_wrapper, k8s_job) |
1025 | 1036 | # Wrap the k8s job before we put it in the work queue so it can be retried a few times |
1026 | | - self.work_queue.put((self.__cleanup_k8s_job, RetryableDeleteJob(k8s_job=k8s_job))) |
| 1037 | + self.work_queue.put((self.__cleanup_k8s_job, RetryableDeleteJobState(job_state=JobState(job_wrapper=job_wrapper, job_destination=job_wrapper.job_destination), k8s_job=k8s_job)))
1027 | 1038 | else: |
1028 | 1039 | log.debug(f"Could not find job with id {gxy_job.get_job_runner_external_id()} to delete") |
1029 | 1040 | # TODO assert whether job parallelism == 0 |
@@ -1156,4 +1167,4 @@ def finish_job(self, job_state): |
1156 | 1167 | if self.__has_guest_ports(job_state.job_wrapper): |
1157 | 1168 | self.__cleanup_k8s_guest_ports(job_state.job_wrapper, k8s_job) |
1158 | 1169 | # Wrap the k8s job before we put it in the work queue so it can be retried a few times |
1159 | | - self.work_queue.put((self.__cleanup_k8s_job, RetryableDeleteJob(k8s_job=k8s_job))) |
| 1170 | + self.work_queue.put((self.__cleanup_k8s_job, RetryableDeleteJobState(job_state=job_state, k8s_job=k8s_job))) |
0 commit comments