Skip to content

Commit b2e4324

Browse files
authored
Merge pull request #22548 from mvdbeek/fix-expensive-workflow-search-query
[26.0] Replace per-term joins in workflow search with EXISTS subqueries
2 parents df879c6 + ac5d775 commit b2e4324

5 files changed

Lines changed: 151 additions & 18 deletions

File tree

lib/galaxy/managers/workflows.py

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
true,
3939
)
4040
from sqlalchemy.orm import (
41-
aliased,
4241
joinedload,
4342
subqueryload,
4443
)
@@ -74,10 +73,10 @@
7473
)
7574
from galaxy.model.base import ensure_object_added_to_session
7675
from galaxy.model.index_filter_util import (
77-
append_user_filter,
7876
raw_text_column_filter,
79-
tag_filter,
77+
tag_exists_filter,
8078
text_column_filter,
79+
user_exists_filter,
8180
)
8281
from galaxy.model.item_attrs import UsesAnnotations
8382
from galaxy.schema.invocation import InvocationCancellationUserRequest
@@ -103,6 +102,7 @@
103102
)
104103
from galaxy.util.sanitize_html import sanitize_html
105104
from galaxy.util.search import (
105+
filter_terms,
106106
FilteredTerm,
107107
parse_filters_structured,
108108
RawTextTerm,
@@ -211,13 +211,16 @@ def index_query(
211211
stmt = stmt.where(StoredWorkflow.hidden == (true() if show_hidden else false()))
212212
if payload.search:
213213
search_query = payload.search
214-
parsed_search = parse_filters_structured(search_query, INDEX_SEARCH_FILTERS)
215-
216-
def w_tag_filter(term_text: str, quoted: bool):
217-
nonlocal stmt
218-
alias = aliased(StoredWorkflowTagAssociation)
219-
stmt = stmt.outerjoin(StoredWorkflow.tags.of_type(alias))
220-
return tag_filter(alias, term_text, quoted)
214+
parsed_search = filter_terms(parse_filters_structured(search_query, INDEX_SEARCH_FILTERS))
215+
216+
def w_tag_exists(term_text: str, quoted: bool):
217+
return tag_exists_filter(
218+
StoredWorkflowTagAssociation,
219+
StoredWorkflowTagAssociation.stored_workflow_id,
220+
StoredWorkflow.id,
221+
term_text,
222+
quoted,
223+
)
221224

222225
def name_filter(term):
223226
return text_column_filter(StoredWorkflow.name, term)
@@ -227,12 +230,11 @@ def name_filter(term):
227230
key = term.filter
228231
q = term.text
229232
if key == "tag":
230-
tf = w_tag_filter(term.text, term.quoted)
231-
stmt = stmt.where(tf)
233+
stmt = stmt.where(w_tag_exists(term.text, term.quoted))
232234
elif key == "name":
233235
stmt = stmt.where(name_filter(term))
234236
elif key == "user":
235-
stmt = append_user_filter(stmt, StoredWorkflow, term)
237+
stmt = stmt.where(user_exists_filter(StoredWorkflow.user_id, term.text))
236238
elif key == "is":
237239
if q == "published":
238240
stmt = stmt.where(StoredWorkflow.published == true())
@@ -260,15 +262,12 @@ def name_filter(term):
260262
model.StoredWorkflowMenuEntry.stored_workflow_id == StoredWorkflow.id,
261263
).where(model.StoredWorkflowMenuEntry.user_id == user.id)
262264
elif isinstance(term, RawTextTerm):
263-
tf = w_tag_filter(term.text, False)
264-
alias = aliased(User)
265-
stmt = stmt.outerjoin(StoredWorkflow.user.of_type(alias))
266265
stmt = stmt.where(
267266
raw_text_column_filter(
268267
[
269268
StoredWorkflow.name,
270-
tf,
271-
alias.username,
269+
w_tag_exists(term.text, False),
270+
user_exists_filter(StoredWorkflow.user_id, term.text),
272271
],
273272
term,
274273
)

lib/galaxy/model/index_filter_util.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from sqlalchemy import (
88
and_,
99
or_,
10+
select,
1011
)
1112
from sqlalchemy.orm import (
1213
aliased,
@@ -68,3 +69,32 @@ def append_user_filter(query, model_class, term: FilteredTerm):
6869
query = query.outerjoin(model_class.user.of_type(alias))
6970
query = query.filter(text_column_filter(alias.username, term))
7071
return query
72+
73+
74+
def tag_exists_filter(association_model_class, fk_column, parent_id_column, term_text, quoted: bool = False):
75+
"""Correlated EXISTS subquery that matches any tag on the parent row against term_text.
76+
77+
Prefer this over adding a per-term outer join on the tag-association table: each
78+
extra outer join multiplies rows (forcing an expensive DISTINCT) and in free-text
79+
search N whitespace-separated terms produce N such joins.
80+
"""
81+
return (
82+
select(1)
83+
.select_from(association_model_class)
84+
.where(fk_column == parent_id_column)
85+
.where(tag_filter(association_model_class, term_text, quoted))
86+
.correlate_except(association_model_class)
87+
.exists()
88+
)
89+
90+
91+
def user_exists_filter(owner_id_column, term_text: str):
92+
"""Correlated EXISTS subquery that matches the owning user's username."""
93+
return (
94+
select(1)
95+
.select_from(model.User)
96+
.where(model.User.id == owner_id_column)
97+
.where(model.User.username.ilike(f"%{term_text}%"))
98+
.correlate_except(model.User)
99+
.exists()
100+
)

lib/galaxy/util/search.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@
1212
ParseFilterResultT = Tuple[Optional[List["FilteredTerm"]], Optional[str]]
1313
QUOTE_PATTERN = re.compile(r"\'(.*?)\'")
1414

15+
# Defaults for `filter_terms` used by index-search callers. A whitespace-rich
16+
# query turns into one WHERE clause (and, pre-trigram-index, one seq scan per
17+
# matching table) per raw term, so both floors are there to bound query cost.
18+
DEFAULT_MIN_RAW_TERM_LENGTH = 4
19+
DEFAULT_MAX_RAW_TERMS = 7
20+
1521

1622
def parse_filters(search_term: str, filters: Optional[Dict[str, str]] = None) -> ParseFilterResultT:
1723
"""Support github-like filters for narrowing the results.
@@ -110,7 +116,39 @@ def simple_result(self) -> ParseFilterResultT:
110116
return None if len(self.filter_terms) == 0 else self.filter_terms, " ".join([t.text for t in self.text_terms])
111117

112118

119+
def filter_terms(
120+
parsed: "ParsedSearch",
121+
min_raw_term_length: int = DEFAULT_MIN_RAW_TERM_LENGTH,
122+
max_raw_terms: Optional[int] = DEFAULT_MAX_RAW_TERMS,
123+
) -> "ParsedSearch":
124+
"""Return a new ParsedSearch with short / excess raw text terms dropped.
125+
126+
Raw (unquoted, non-keyed) terms shorter than ``min_raw_term_length`` are
127+
dropped, and the surviving raw terms are capped at ``max_raw_terms``.
128+
Filtered terms (``key:value``) and quoted raw terms ('foo bar') are
129+
always kept — those are explicit user intent.
130+
"""
131+
out = ParsedSearch()
132+
raw_kept = 0
133+
for term in parsed.terms:
134+
if isinstance(term, RawTextTerm) and not term.quoted:
135+
if len(term.text) < min_raw_term_length:
136+
continue
137+
if max_raw_terms is not None and raw_kept >= max_raw_terms:
138+
continue
139+
raw_kept += 1
140+
out.add_unfiltered_text(term.text, term.quoted)
141+
elif isinstance(term, RawTextTerm):
142+
out.add_unfiltered_text(term.text, term.quoted)
143+
else:
144+
out.add_keyed_term(term.filter, term.text, term.quoted)
145+
return out
146+
147+
113148
__all__ = (
149+
"DEFAULT_MAX_RAW_TERMS",
150+
"DEFAULT_MIN_RAW_TERM_LENGTH",
151+
"filter_terms",
114152
"parse_filters",
115153
"parse_filters_structured",
116154
)

lib/galaxy_test/api/test_workflows.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,17 @@ def test_index_search_tags_multiple(self):
683683
assert workflow_id_2 not in index_ids
684684
assert workflow_id_3 not in index_ids
685685

686+
def test_index_search_many_terms(self):
687+
# Regression: a whitespace-rich search string used to add one outer join
688+
# on stored_workflow_tag_association and one on galaxy_user per term,
689+
# producing an unusably expensive query for long searches.
690+
name = f"Copy of Genomic Assembly and analysis - RDH shared by user {uuid4()}"
691+
workflow_id = self.workflow_populator.simple_workflow(name)
692+
self.workflow_populator.set_tags(workflow_id, [f"manyterms-{uuid4()}"])
693+
search = "Copy of Genomic Assembly and analysis - RDH shared by user"
694+
index_ids = self.workflow_populator.index_ids(search=search)
695+
assert workflow_id in index_ids
696+
686697
def test_search_casing(self):
687698
name1, name2 = (
688699
self.dataset_populator.get_random_name().upper(),

test/unit/util/test_search.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
from galaxy.util.search import (
2+
filter_terms,
3+
FilteredTerm,
24
parse_filters,
35
parse_filters_structured,
6+
RawTextTerm,
47
)
58

69

@@ -94,3 +97,55 @@ def test_parse_filters_structured():
9497
assert text_terms[1].quoted is True
9598
assert text_terms[2].text == "foo"
9699
assert text_terms[2].quoted is False
100+
101+
102+
def test_filter_terms_drops_short_raw_terms():
103+
parsed = parse_filters_structured("Copy of Genomic Assembly and analysis", {})
104+
filtered = filter_terms(parsed, min_raw_term_length=4, max_raw_terms=None)
105+
kept = [t.text for t in filtered.terms]
106+
assert kept == ["Copy", "Genomic", "Assembly", "analysis"]
107+
108+
109+
def test_filter_terms_preserves_quoted_raw_terms():
110+
parsed = parse_filters_structured("'ab' Copy 'de'", {})
111+
filtered = filter_terms(parsed, min_raw_term_length=4, max_raw_terms=None)
112+
assert [(t.text, t.quoted) for t in filtered.terms] == [
113+
("ab", True),
114+
("Copy", False),
115+
("de", True),
116+
]
117+
118+
119+
def test_filter_terms_preserves_filtered_terms_of_any_length():
120+
parsed = parse_filters_structured("tag:ab user:cd Copy", {"tag": "tag", "user": "user"})
121+
filtered = filter_terms(parsed, min_raw_term_length=4, max_raw_terms=None)
122+
kinds = [(t.__class__.__name__, t.text) for t in filtered.terms]
123+
# Both filtered terms are preserved even though their text is shorter than 4;
124+
# "Copy" (4) is preserved too.
125+
assert ("FilteredTerm", "ab") in kinds
126+
assert ("FilteredTerm", "cd") in kinds
127+
assert ("RawTextTerm", "Copy") in kinds
128+
129+
130+
def test_filter_terms_caps_raw_terms_only():
131+
parsed = parse_filters_structured(
132+
"tag:foo Copy Genomic Assembly analysis shared user nedflanders extra1 extra2",
133+
{"tag": "tag"},
134+
)
135+
filtered = filter_terms(parsed, min_raw_term_length=4, max_raw_terms=3)
136+
raw = [t.text for t in filtered.terms if isinstance(t, RawTextTerm)]
137+
filt = [t.text for t in filtered.terms if isinstance(t, FilteredTerm)]
138+
assert raw == ["Copy", "Genomic", "Assembly"]
139+
assert filt == ["foo"]
140+
141+
142+
def test_filter_terms_defaults():
143+
# Default behaviour: 4-char min length, 7-term cap on raw terms.
144+
parsed = parse_filters_structured(
145+
"Copy of Genomic Assembly and analysis - RDH shared by user nedflanders",
146+
{},
147+
)
148+
filtered = filter_terms(parsed)
149+
kept = [t.text for t in filtered.terms]
150+
# of, and, -, RDH, by dropped for length; everything else survives.
151+
assert kept == ["Copy", "Genomic", "Assembly", "analysis", "shared", "user", "nedflanders"]

0 commit comments

Comments
 (0)