Commit ecc4b47

Merge pull request galaxyproject#19824 from mvdbeek/fix_limit_bypass
[24.2] Fix various job concurrency limit issues
2 parents b8c902a + c088f9c

5 files changed

Lines changed: 295 additions & 30 deletions

lib/galaxy/jobs/__init__.py

Lines changed: 156 additions & 14 deletions
@@ -14,6 +14,10 @@
 import sys
 import time
 import traceback
+from dataclasses import (
+    dataclass,
+    field,
+)
 from json import loads
 from typing import (
     Any,
@@ -27,7 +31,12 @@
 import yaml
 from packaging.version import Version
 from pulsar.client.staging import COMMAND_VERSION_FILENAME
-from sqlalchemy import select
+from sqlalchemy import (
+    and_,
+    func,
+    select,
+    update,
+)
 
 from galaxy import (
     model,
@@ -296,6 +305,18 @@ def job_config_xml_to_dict(config, root):
     return config_dict
 
 
+@dataclass
+class JobConfigurationLimits:
+    registered_user_concurrent_jobs: Optional[int] = None
+    anonymous_user_concurrent_jobs: Optional[int] = None
+    walltime: Optional[str] = None
+    walltime_delta: Optional[datetime.timedelta] = None
+    total_walltime: Dict[str, Any] = field(default_factory=dict)
+    output_size: Optional[int] = None
+    destination_user_concurrent_jobs: Dict[str, int] = field(default_factory=dict)
+    destination_total_concurrent_jobs: Dict[str, int] = field(default_factory=dict)
+
+
 class JobConfiguration(ConfiguresHandlers):
     """A parser and interface to advanced job management features.
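A quick standalone illustration (not Galaxy code; LimitsDemo mirrors three of the fields above) of what the dataclass buys over the ad-hoc Bunch it replaces: the limit fields and their defaults are declared and type-annotated in one place, and the per-destination dicts are created per instance via field(default_factory=dict) rather than shared:

import datetime
from dataclasses import dataclass, field
from typing import Dict, Optional


@dataclass
class LimitsDemo:  # trimmed stand-in for JobConfigurationLimits
    registered_user_concurrent_jobs: Optional[int] = None
    walltime_delta: Optional[datetime.timedelta] = None
    destination_user_concurrent_jobs: Dict[str, int] = field(default_factory=dict)


a, b = LimitsDemo(), LimitsDemo()
a.destination_user_concurrent_jobs["slurm_cluster"] = 5
assert b.destination_user_concurrent_jobs == {}  # each instance gets its own dict
assert b.registered_user_concurrent_jobs is None  # scalar limits default to "no limit"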
@@ -344,16 +365,7 @@ def __init__(self, app: MinimalManagerApp):
         self.resource_groups = {}
         self.default_resource_group = None
         self.resource_parameters = {}
-        self.limits = Bunch(
-            registered_user_concurrent_jobs=None,
-            anonymous_user_concurrent_jobs=None,
-            walltime=None,
-            walltime_delta=None,
-            total_walltime={},
-            output_size=None,
-            destination_user_concurrent_jobs={},
-            destination_total_concurrent_jobs={},
-        )
+        self.limits = JobConfigurationLimits()
 
         default_resubmits = []
         default_resubmit_condition = self.app.config.default_job_resubmission_condition
@@ -1610,12 +1622,142 @@ def get_destination_configuration(self, key, default=None):
         dest_params = self.job_destination.params
         return self.get_job().get_destination_configuration(dest_params, self.app.config, key, default)
 
+    def queue_with_limit(self, job: Job, job_destination: JobDestination):
+        anonymous_user_concurrent_jobs = self.app.job_config.limits.anonymous_user_concurrent_jobs
+        registered_user_concurrent_jobs = self.app.job_config.limits.registered_user_concurrent_jobs
+        destination_total_concurrent_jobs = self.app.job_config.limits.destination_total_concurrent_jobs
+        destination_total_limit = self.app.job_config.limits.destination_total_concurrent_jobs.get(job_destination.id)
+        destination_user_limit = self.app.job_config.limits.destination_user_concurrent_jobs.get(job_destination.id)
+        destination_tag_limits = {}
+        if job_destination.tags:
+            for tag in job_destination.tags:
+                if tag_limit := destination_total_concurrent_jobs.get(tag):
+                    destination_tag_limits[tag] = tag_limit
+
+        conditions = [model.Job.table.c.id == job.id]
+
+        if job.user_id:
+            user_job_count = (
+                select(func.count(model.Job.table.c.id))
+                .where(
+                    and_(
+                        model.Job.table.c.state.in_(
+                            [
+                                model.Job.states.QUEUED,
+                                model.Job.states.RUNNING,
+                                model.Job.states.RESUBMITTED,
+                            ]
+                        ),
+                        model.Job.table.c.user_id == job.user_id,
+                    )
+                )
+                .scalar_subquery()
+            )
+
+            if registered_user_concurrent_jobs is not None:
+                conditions.append(user_job_count < registered_user_concurrent_jobs)
+            if destination_user_limit is not None:
+                destination_job_count = (
+                    select(func.count(model.Job.table.c.id))
+                    .where(
+                        and_(
+                            model.Job.table.c.state.in_(
+                                [
+                                    model.Job.states.QUEUED,
+                                    model.Job.states.RUNNING,
+                                    model.Job.states.RESUBMITTED,
+                                ]
+                            ),
+                            model.Job.table.c.destination_id == job_destination.id,
+                            model.Job.table.c.user_id == job.user_id,
+                        )
+                    )
+                    .scalar_subquery()
+                )
+                conditions.append(destination_job_count < destination_user_limit)
+
+        elif anonymous_user_concurrent_jobs and job.galaxy_session and job.galaxy_session.id:
+            anon_job_count = (
+                select(func.count(model.Job.table.c.id))
+                .where(
+                    and_(
+                        model.Job.table.c.state.in_(
+                            [
+                                model.Job.states.QUEUED,
+                                model.Job.states.RUNNING,
+                                model.Job.states.RESUBMITTED,
+                            ]
+                        ),
+                        model.Job.table.c.session_id == job.galaxy_session.id,
+                    )
+                )
+                .scalar_subquery()
+            )
+            conditions.append(anon_job_count < anonymous_user_concurrent_jobs)
+
+        if destination_total_limit is not None:
+            destination_total_count = (
+                select(func.count(model.Job.table.c.id))
+                .where(
+                    and_(
+                        model.Job.table.c.state.in_(
+                            [
+                                model.Job.states.QUEUED,
+                                model.Job.states.RUNNING,
+                                model.Job.states.RESUBMITTED,
+                            ]
+                        ),
+                        model.Job.table.c.destination_id == job_destination.id,
+                    )
+                )
+                .scalar_subquery()
+            )
+            conditions.append(destination_total_count < destination_total_limit)
+
+        if destination_tag_limits:
+            for tag, limit in destination_tag_limits.items():
+                destination_ids = {destination.id for destination in self.app.job_config.get_destinations(tag)}
+                tag_count = (
+                    select(func.count(model.Job.table.c.id))
+                    .where(
+                        and_(
+                            model.Job.table.c.state.in_(
+                                [
+                                    model.Job.states.QUEUED,
+                                    model.Job.states.RUNNING,
+                                    model.Job.states.RESUBMITTED,
+                                ]
+                            ),
+                            model.Job.table.c.destination_id.in_(destination_ids),
+                        )
+                    )
+                    .scalar_subquery()
+                )
+                conditions.append(tag_count < limit)
+
+        update_stmt = (
+            update(model.Job)
+            .where(*conditions)
+            .values(
+                state=model.Job.states.QUEUED,
+                destination_id=job_destination.id,
+                destination_params=job_destination.params,
+                job_runner_name=job_destination.runner,
+            )
+        )
+
+        result = self.sa_session.execute(update_stmt)
+        self.sa_session.commit()
+
+        return result.rowcount > 0
+
     def enqueue(self):
         job = self.get_job()
         # Change to queued state before handing to worker thread so the runner won't pick it up again
-        self.change_state(model.Job.states.QUEUED, flush=False, job=job)
-        # Persist the destination so that the job will be included in counts if using concurrency limits
-        self.set_job_destination(self.job_destination, None, flush=False, job=job)
+        if self.is_task:
+            self.change_state(model.Job.states.QUEUED, flush=False, job=job)
+        elif not self.queue_with_limit(job, self.job_destination):
+            return False
         # Set object store after job destination so can leverage parameters...
         self._set_object_store_ids(job)
         # Now that we have the object store id, check if we are over the limit
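The core of the fix is queue_with_limit() above: instead of the old check-then-set sequence (count jobs, then mark QUEUED and persist the destination in separate steps, which two handler processes could interleave), every applicable limit becomes a condition on a single UPDATE, so the database flips the job to QUEUED only if the counts still leave room, and enqueue() returns False when rowcount shows the job lost the race. A minimal self-contained sketch of the same conditional-UPDATE pattern, assuming SQLAlchemy 2.0 and an illustrative one-table schema (Job, try_queue, and USER_LIMIT are stand-ins, not Galaxy's model):

from sqlalchemy import Column, Integer, String, create_engine, func, select, update
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Job(Base):
    __tablename__ = "job"
    id = Column(Integer, primary_key=True)
    user_id = Column(Integer)
    state = Column(String, default="new")


USER_LIMIT = 2  # stand-in for registered_user_concurrent_jobs


def try_queue(session, job_id, user_id):
    # Count of the user's active jobs, evaluated by the database at UPDATE time.
    active = (
        select(func.count(Job.id))
        .where(Job.state.in_(["queued", "running"]), Job.user_id == user_id)
        .scalar_subquery()
    )
    # The row flips to "queued" only if the limit still holds when the
    # statement runs; a losing racer's UPDATE simply matches zero rows.
    result = session.execute(
        update(Job).where(Job.id == job_id, active < USER_LIMIT).values(state="queued")
    )
    session.commit()
    return result.rowcount > 0


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add_all([Job(id=1, user_id=7, state="queued"), Job(id=2, user_id=7), Job(id=3, user_id=7)])
    session.commit()
    print(try_queue(session, 2, 7))  # True: one active job, the limit is two
    print(try_queue(session, 3, 7))  # False: both slots are now taken

Because the count subquery is evaluated inside the UPDATE itself, two handlers racing for the last slot cannot both succeed: whichever statement executes second already sees the first job in QUEUED state and matches no rows.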

lib/galaxy/jobs/handler.py

Lines changed: 3 additions & 3 deletions
@@ -518,6 +518,9 @@ def __handle_waiting_jobs(self):
                 pass
         # Ensure that we get new job counts on each iteration
         self.__clear_job_count()
+        self.__cache_total_job_count_per_destination()
+        self.__cache_user_job_count_per_destination()
+        self.__cache_user_job_count()
         # Check resubmit jobs first so that limits of new jobs will still be enforced
         for job in resubmit_jobs:
             log.debug("(%s) Job was resubmitted and is being dispatched immediately", job.id)
@@ -824,7 +827,6 @@ def __clear_job_count(self):
         self.total_job_count_per_destination = None
 
     def get_user_job_count(self, user_id):
-        self.__cache_user_job_count()
         # This could have been incremented by a previous job dispatched on this iteration, even if we're not caching
         rval = self.user_job_count.get(user_id, 0)
         if not self.app.config.cache_user_job_count:
@@ -865,7 +867,6 @@ def __cache_user_job_count(self):
         self.user_job_count = {}
 
     def get_user_job_count_per_destination(self, user_id):
-        self.__cache_user_job_count_per_destination()
         cached = self.user_job_count_per_destination.get(user_id, {})
         if self.app.config.cache_user_job_count:
             rval = cached
@@ -1006,7 +1007,6 @@ def __cache_total_job_count_per_destination(self):
             self.total_job_count_per_destination[row["destination_id"]] = row["job_count"]
 
     def get_total_job_count_per_destination(self):
-        self.__cache_total_job_count_per_destination()
         # Always use caching (at worst a job will have to wait one iteration,
         # and this would be more fair anyway as it ensures FIFO scheduling,
         # insofar as FIFO would be fair...)
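Taken together, the hunks above move the cache refreshes out of the getters and run them exactly once at the top of each scheduling iteration, so every job considered in a pass shares one snapshot and increments recorded for already-dispatched jobs are not wiped out by a mid-pass re-fetch. A toy sketch of that prime-once-per-iteration pattern (PerIterationCounts and fetch_counts are illustrative names, not Galaxy's API):

class PerIterationCounts:
    """Toy version of the handler's per-iteration job-count cache."""

    def __init__(self, fetch_counts):
        self._fetch_counts = fetch_counts  # callable that queries the database
        self.user_job_count = None

    def clear(self):
        self.user_job_count = None

    def prime(self):
        # Now done once at the top of the scheduling loop.
        self.user_job_count = dict(self._fetch_counts())

    def get(self, user_id):
        # No lazy re-fetch here: a mid-pass refresh would discard increments
        # recorded for jobs already dispatched in this iteration.
        return self.user_job_count.get(user_id, 0)

    def increment(self, user_id):
        self.user_job_count[user_id] = self.get(user_id) + 1


counts = PerIterationCounts(lambda: {42: 1})
counts.clear()              # start of iteration
counts.prime()              # snapshot the database once
counts.increment(42)        # a job for user 42 is dispatched this pass
assert counts.get(42) == 2  # later jobs in the same pass see it

The real handler keeps three such caches (per user, per user per destination, and per destination), which is why three __cache_* calls were added to __handle_waiting_jobs above.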

lib/galaxy/jobs/runners/pulsar.py

Lines changed: 1 addition & 3 deletions
@@ -451,9 +451,7 @@ def queue_job(self, job_wrapper):
             job = job_wrapper.get_job()
             # Set the job destination here (unlike other runners) because there are likely additional job destination
             # params from the Pulsar client.
-            # Flush with change_state.
-            job_wrapper.set_job_destination(job_destination, external_id=external_job_id, flush=False, job=job)
-            job_wrapper.change_state(model.Job.states.QUEUED, job=job)
+            job_wrapper.set_job_destination(job_destination, external_id=external_job_id, flush=True, job=job)
         except Exception:
             job_wrapper.fail("failure running job", exception=True)
             log.exception("failure running job %d", job_wrapper.job_id)

lib/galaxy/jobs/runners/slurm.py

Lines changed: 2 additions & 10 deletions
@@ -138,16 +138,8 @@ def _get_slurm_state():
                         ajs.job_wrapper.get_id_tag(),
                         ajs.job_id,
                     )
-                    ajs.job_wrapper.change_state(
-                        model.Job.states.QUEUED, info="Job was resubmitted due to node failure"
-                    )
-                    try:
-                        self.queue_job(ajs.job_wrapper)
-                        return
-                    except Exception:
-                        ajs.fail_message = (
-                            "This job failed due to a cluster node failure, and an attempt to resubmit the job failed."
-                        )
+                    self.mark_as_resubmitted(ajs, info="Job was resubmitted due to node failure")
+                    return
                 elif slurm_state == "OUT_OF_MEMORY":
                     log.info(
                         "(%s/%s) Job hit memory limit (SLURM state: OUT_OF_MEMORY)",
