Merge pull request #22548 from mvdbeek/fix-expensive-workflow-search-query

mvdbeek · web-flow · commit b2e43248170f · 2026-04-27T21:55:59.000+02:00
[26.0] Replace per-term joins in workflow search with EXISTS subqueries
diff --git a/lib/galaxy/managers/workflows.py b/lib/galaxy/managers/workflows.py
@@ -38,7 +38,6 @@
     true,
 )
 from sqlalchemy.orm import (
-    aliased,
     joinedload,
     subqueryload,
 )
@@ -74,10 +73,10 @@
 )
 from galaxy.model.base import ensure_object_added_to_session
 from galaxy.model.index_filter_util import (
-    append_user_filter,
     raw_text_column_filter,
-    tag_filter,
+    tag_exists_filter,
     text_column_filter,
+    user_exists_filter,
 )
 from galaxy.model.item_attrs import UsesAnnotations
 from galaxy.schema.invocation import InvocationCancellationUserRequest
@@ -103,6 +102,7 @@
 )
 from galaxy.util.sanitize_html import sanitize_html
 from galaxy.util.search import (
+    filter_terms,
     FilteredTerm,
     parse_filters_structured,
     RawTextTerm,
@@ -211,13 +211,16 @@ def index_query(
         stmt = stmt.where(StoredWorkflow.hidden == (true() if show_hidden else false()))
         if payload.search:
             search_query = payload.search
-            parsed_search = parse_filters_structured(search_query, INDEX_SEARCH_FILTERS)
-
-            def w_tag_filter(term_text: str, quoted: bool):
-                nonlocal stmt
-                alias = aliased(StoredWorkflowTagAssociation)
-                stmt = stmt.outerjoin(StoredWorkflow.tags.of_type(alias))
-                return tag_filter(alias, term_text, quoted)
+            parsed_search = filter_terms(parse_filters_structured(search_query, INDEX_SEARCH_FILTERS))
+
+            def w_tag_exists(term_text: str, quoted: bool):
+                return tag_exists_filter(
+                    StoredWorkflowTagAssociation,
+                    StoredWorkflowTagAssociation.stored_workflow_id,
+                    StoredWorkflow.id,
+                    term_text,
+                    quoted,
+                )
 
             def name_filter(term):
                 return text_column_filter(StoredWorkflow.name, term)
@@ -227,12 +230,11 @@ def name_filter(term):
                     key = term.filter
                     q = term.text
                     if key == "tag":
-                        tf = w_tag_filter(term.text, term.quoted)
-                        stmt = stmt.where(tf)
+                        stmt = stmt.where(w_tag_exists(term.text, term.quoted))
                     elif key == "name":
                         stmt = stmt.where(name_filter(term))
                     elif key == "user":
-                        stmt = append_user_filter(stmt, StoredWorkflow, term)
+                        stmt = stmt.where(user_exists_filter(StoredWorkflow.user_id, term.text))
                     elif key == "is":
                         if q == "published":
                             stmt = stmt.where(StoredWorkflow.published == true())
@@ -260,15 +262,12 @@ def name_filter(term):
                                 model.StoredWorkflowMenuEntry.stored_workflow_id == StoredWorkflow.id,
                             ).where(model.StoredWorkflowMenuEntry.user_id == user.id)
                 elif isinstance(term, RawTextTerm):
-                    tf = w_tag_filter(term.text, False)
-                    alias = aliased(User)
-                    stmt = stmt.outerjoin(StoredWorkflow.user.of_type(alias))
                     stmt = stmt.where(
                         raw_text_column_filter(
                             [
                                 StoredWorkflow.name,
-                                tf,
-                                alias.username,
+                                w_tag_exists(term.text, False),
+                                user_exists_filter(StoredWorkflow.user_id, term.text),
                             ],
                             term,
                         )
diff --git a/lib/galaxy/model/index_filter_util.py b/lib/galaxy/model/index_filter_util.py
@@ -7,6 +7,7 @@
 from sqlalchemy import (
     and_,
     or_,
+    select,
 )
 from sqlalchemy.orm import (
     aliased,
@@ -68,3 +69,32 @@ def append_user_filter(query, model_class, term: FilteredTerm):
     query = query.outerjoin(model_class.user.of_type(alias))
     query = query.filter(text_column_filter(alias.username, term))
     return query
+
+
+def tag_exists_filter(association_model_class, fk_column, parent_id_column, term_text, quoted: bool = False):
+    """Build an EXISTS clause that is true when the parent row has a tag matching ``term_text``.
+
+    The subquery selects from ``association_model_class``, ties ``fk_column`` to
+    ``parent_id_column`` and applies ``tag_filter`` with ``term_text``/``quoted``.
+    Filtering this way needs no extra outer join per search term, so parent rows
+    are not multiplied and no DISTINCT is needed to collapse them again.
+    """
+    links_parent = fk_column == parent_id_column
+    matches_tag = tag_filter(association_model_class, term_text, quoted)
+    tag_rows = select(1).select_from(association_model_class)
+    tag_rows = tag_rows.where(links_parent)
+    tag_rows = tag_rows.where(matches_tag)
+    # Keep only the association table in the subquery's own FROM list.
+    return tag_rows.correlate_except(association_model_class).exists()
+
+
+def user_exists_filter(owner_id_column, term_text: str):
+    """Build an EXISTS clause matching rows whose owner's username contains ``term_text``."""
+    # NOTE(review): the match is always a case-insensitive substring test; there is no
+    # ``quoted`` switch as on tag_exists_filter, and any % or _ in term_text is passed
+    # to LIKE unescaped -- confirm both are intended.
+    owned_by = model.User.id == owner_id_column
+    username_like = model.User.username.ilike(f"%{term_text}%")
+    owner_rows = select(1).select_from(model.User)
+    owner_rows = owner_rows.where(owned_by).where(username_like)
+    return owner_rows.correlate_except(model.User).exists()
diff --git a/lib/galaxy/util/search.py b/lib/galaxy/util/search.py
@@ -12,6 +12,12 @@
 ParseFilterResultT = Tuple[Optional[List["FilteredTerm"]], Optional[str]]
 QUOTE_PATTERN = re.compile(r"\'(.*?)\'")
 
+# Defaults for `filter_terms` used by index-search callers. A whitespace-rich
+# query turns into one WHERE clause (and, pre-trigram-index, one seq scan per
+# matching table) per raw term, so both limits are there to bound query cost.
+DEFAULT_MIN_RAW_TERM_LENGTH = 4
+DEFAULT_MAX_RAW_TERMS = 7
+
 
 def parse_filters(search_term: str, filters: Optional[Dict[str, str]] = None) -> ParseFilterResultT:
     """Support github-like filters for narrowing the results.
@@ -110,7 +116,39 @@ def simple_result(self) -> ParseFilterResultT:
         return None if len(self.filter_terms) == 0 else self.filter_terms, " ".join([t.text for t in self.text_terms])
 
 
+def filter_terms(
+    parsed: "ParsedSearch",
+    min_raw_term_length: int = DEFAULT_MIN_RAW_TERM_LENGTH,
+    max_raw_terms: Optional[int] = DEFAULT_MAX_RAW_TERMS,
+) -> "ParsedSearch":
+    """Copy ``parsed`` into a new ParsedSearch, leaving out short and surplus raw terms.
+
+    Keyed terms (``key:value``) and quoted raw terms are always copied. An
+    unquoted raw term is skipped when its text is shorter than
+    ``min_raw_term_length``, or when ``max_raw_terms`` unquoted raw terms have
+    already been copied (``None`` disables that cap). Terms are re-added in
+    iteration order and ``parsed`` is only read, never changed.
+    """
+    pruned = ParsedSearch()
+    unquoted_kept = 0
+    for item in parsed.terms:
+        if not isinstance(item, RawTextTerm):
+            # Keyed terms are explicit user intent, so they are never pruned.
+            pruned.add_keyed_term(item.filter, item.text, item.quoted)
+            continue
+        if not item.quoted:
+            too_short = len(item.text) < min_raw_term_length
+            if too_short or (max_raw_terms is not None and unquoted_kept >= max_raw_terms):
+                continue
+            unquoted_kept += 1
+        pruned.add_unfiltered_text(item.text, item.quoted)
+    return pruned
+
+
 __all__ = (
+    "DEFAULT_MAX_RAW_TERMS",
+    "DEFAULT_MIN_RAW_TERM_LENGTH",
+    "filter_terms",
     "parse_filters",
     "parse_filters_structured",
 )
diff --git a/lib/galaxy_test/api/test_workflows.py b/lib/galaxy_test/api/test_workflows.py
@@ -683,6 +683,17 @@ def test_index_search_tags_multiple(self):
             assert workflow_id_2 not in index_ids
             assert workflow_id_3 not in index_ids
 
+    def test_index_search_many_terms(self):
+        # Regression guard for multi-word searches: each whitespace-separated term
+        # used to add an outer join on stored_workflow_tag_association and another
+        # on galaxy_user, which made long search strings unusably expensive.
+        populator = self.workflow_populator
+        name = f"Copy of Genomic Assembly and analysis - RDH shared by user {uuid4()}"
+        workflow_id = populator.simple_workflow(name)
+        populator.set_tags(workflow_id, [f"manyterms-{uuid4()}"])
+        search = "Copy of Genomic Assembly and analysis - RDH shared by user"
+        assert workflow_id in populator.index_ids(search=search)
+
     def test_search_casing(self):
         name1, name2 = (
             self.dataset_populator.get_random_name().upper(),
diff --git a/test/unit/util/test_search.py b/test/unit/util/test_search.py
@@ -1,6 +1,9 @@
 from galaxy.util.search import (
+    filter_terms,
+    FilteredTerm,
     parse_filters,
     parse_filters_structured,
+    RawTextTerm,
 )
 
 
@@ -94,3 +97,55 @@ def test_parse_filters_structured():
     assert text_terms[1].quoted is True
     assert text_terms[2].text == "foo"
     assert text_terms[2].quoted is False
+
+
+def test_filter_terms_drops_short_raw_terms():
+    """Raw terms under four characters ("of", "and") are removed; longer ones stay in order."""
+    query = parse_filters_structured("Copy of Genomic Assembly and analysis", {})
+    survivors = filter_terms(query, min_raw_term_length=4, max_raw_terms=None).terms
+    assert [term.text for term in survivors] == ["Copy", "Genomic", "Assembly", "analysis"]
+
+
+def test_filter_terms_preserves_quoted_raw_terms():
+    """Quoted raw terms survive the length floor even when shorter than it."""
+    query = parse_filters_structured("'ab' Copy 'de'", {})
+    result = filter_terms(query, min_raw_term_length=4, max_raw_terms=None)
+    expected = [("ab", True), ("Copy", False), ("de", True)]
+    observed = [(term.text, term.quoted) for term in result.terms]
+    # Order of the surviving terms matches the order in the query string.
+    assert observed == expected
+
+
+def test_filter_terms_preserves_filtered_terms_of_any_length():
+    """Keyed ``tag:``/``user:`` terms bypass the length floor that applies to raw terms."""
+    query = parse_filters_structured("tag:ab user:cd Copy", {"tag": "tag", "user": "user"})
+    result = filter_terms(query, min_raw_term_length=4, max_raw_terms=None)
+    seen = {(type(term).__name__, term.text) for term in result.terms}
+    # The two-character keyed values must be kept, and "Copy" sits exactly on the
+    # four-character floor so it must be kept as well.
+    wanted = {("FilteredTerm", "ab"), ("FilteredTerm", "cd"), ("RawTextTerm", "Copy")}
+    assert wanted <= seen
+
+
+def test_filter_terms_caps_raw_terms_only():
+    """The cap keeps only the first three raw terms and leaves the keyed term alone."""
+    search = "tag:foo Copy Genomic Assembly analysis shared user nedflanders extra1 extra2"
+    query = parse_filters_structured(search, {"tag": "tag"})
+    terms = filter_terms(query, min_raw_term_length=4, max_raw_terms=3).terms
+    raw_texts = [term.text for term in terms if isinstance(term, RawTextTerm)]
+    keyed_texts = [term.text for term in terms if isinstance(term, FilteredTerm)]
+    # Nine raw terms go in; everything after the third is discarded.
+    assert keyed_texts == ["foo"]
+    assert raw_texts == ["Copy", "Genomic", "Assembly"]
+
+
+def test_filter_terms_defaults():
+    """With no overrides the module defaults apply: 4-character minimum, 7 raw terms at most."""
+    search = "Copy of Genomic Assembly and analysis - RDH shared by user nedflanders"
+    result = filter_terms(parse_filters_structured(search, {}))
+    # "of", "and", "-", "RDH" and "by" fall below the minimum length and are dropped.
+    # The seven remaining terms exactly fill the default cap, so the cap removes
+    # nothing here.
+    expected = ["Copy", "Genomic", "Assembly", "analysis", "shared", "user", "nedflanders"]
+    observed = [term.text for term in result.terms]
+    assert observed == expected