Skip to content

Commit cc72559

Browse files
committed
Replace per-term joins in workflow search with EXISTS subqueries
Free-text search added one outer join on stored_workflow_tag_association and one on galaxy_user per whitespace-separated term, producing ~2N joins over a DISTINCT that is unusable for long queries (observed: 12-word search generating 26 joins).
1 parent c7c4f7a commit cc72559

3 files changed

Lines changed: 55 additions & 16 deletions

File tree

lib/galaxy/managers/workflows.py

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
true,
3939
)
4040
from sqlalchemy.orm import (
41-
aliased,
4241
joinedload,
4342
subqueryload,
4443
)
@@ -74,10 +73,10 @@
7473
)
7574
from galaxy.model.base import ensure_object_added_to_session
7675
from galaxy.model.index_filter_util import (
77-
append_user_filter,
7876
raw_text_column_filter,
79-
tag_filter,
77+
tag_exists_filter,
8078
text_column_filter,
79+
user_exists_filter,
8180
)
8281
from galaxy.model.item_attrs import UsesAnnotations
8382
from galaxy.schema.invocation import InvocationCancellationUserRequest
@@ -213,11 +212,14 @@ def index_query(
213212
search_query = payload.search
214213
parsed_search = parse_filters_structured(search_query, INDEX_SEARCH_FILTERS)
215214

216-
def w_tag_filter(term_text: str, quoted: bool):
217-
nonlocal stmt
218-
alias = aliased(StoredWorkflowTagAssociation)
219-
stmt = stmt.outerjoin(StoredWorkflow.tags.of_type(alias))
220-
return tag_filter(alias, term_text, quoted)
215+
def w_tag_exists(term_text: str, quoted: bool):
216+
return tag_exists_filter(
217+
StoredWorkflowTagAssociation,
218+
StoredWorkflowTagAssociation.stored_workflow_id,
219+
StoredWorkflow.id,
220+
term_text,
221+
quoted,
222+
)
221223

222224
def name_filter(term):
223225
return text_column_filter(StoredWorkflow.name, term)
@@ -227,12 +229,11 @@ def name_filter(term):
227229
key = term.filter
228230
q = term.text
229231
if key == "tag":
230-
tf = w_tag_filter(term.text, term.quoted)
231-
stmt = stmt.where(tf)
232+
stmt = stmt.where(w_tag_exists(term.text, term.quoted))
232233
elif key == "name":
233234
stmt = stmt.where(name_filter(term))
234235
elif key == "user":
235-
stmt = append_user_filter(stmt, StoredWorkflow, term)
236+
stmt = stmt.where(user_exists_filter(StoredWorkflow.user_id, term.text))
236237
elif key == "is":
237238
if q == "published":
238239
stmt = stmt.where(StoredWorkflow.published == true())
@@ -260,15 +261,12 @@ def name_filter(term):
260261
model.StoredWorkflowMenuEntry.stored_workflow_id == StoredWorkflow.id,
261262
).where(model.StoredWorkflowMenuEntry.user_id == user.id)
262263
elif isinstance(term, RawTextTerm):
263-
tf = w_tag_filter(term.text, False)
264-
alias = aliased(User)
265-
stmt = stmt.outerjoin(StoredWorkflow.user.of_type(alias))
266264
stmt = stmt.where(
267265
raw_text_column_filter(
268266
[
269267
StoredWorkflow.name,
270-
tf,
271-
alias.username,
268+
w_tag_exists(term.text, False),
269+
user_exists_filter(StoredWorkflow.user_id, term.text),
272270
],
273271
term,
274272
)

lib/galaxy/model/index_filter_util.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from sqlalchemy import (
88
and_,
99
or_,
10+
select,
1011
)
1112
from sqlalchemy.orm import (
1213
aliased,
@@ -68,3 +69,32 @@ def append_user_filter(query, model_class, term: FilteredTerm):
6869
query = query.outerjoin(model_class.user.of_type(alias))
6970
query = query.filter(text_column_filter(alias.username, term))
7071
return query
72+
73+
74+
def tag_exists_filter(association_model_class, fk_column, parent_id_column, term_text, quoted: bool = False):
75+
"""Correlated EXISTS subquery that matches any tag on the parent row against term_text.
76+
77+
Prefer this over adding a per-term outer join on the tag-association table: each
78+
extra outer join multiplies rows (forcing an expensive DISTINCT) and in free-text
79+
search N whitespace-separated terms produce N such joins.
80+
"""
81+
return (
82+
select(1)
83+
.select_from(association_model_class)
84+
.where(fk_column == parent_id_column)
85+
.where(tag_filter(association_model_class, term_text, quoted))
86+
.correlate_except(association_model_class)
87+
.exists()
88+
)
89+
90+
91+
def user_exists_filter(owner_id_column, term_text: str):
92+
"""Correlated EXISTS subquery that matches the owning user's username."""
93+
return (
94+
select(1)
95+
.select_from(model.User)
96+
.where(model.User.id == owner_id_column)
97+
.where(model.User.username.ilike(f"%{term_text}%"))
98+
.correlate_except(model.User)
99+
.exists()
100+
)

lib/galaxy_test/api/test_workflows.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,17 @@ def test_index_search_tags_multiple(self):
683683
assert workflow_id_2 not in index_ids
684684
assert workflow_id_3 not in index_ids
685685

686+
def test_index_search_many_terms(self):
687+
# Regression: a whitespace-rich search string used to add one outer join
688+
# on stored_workflow_tag_association and one on galaxy_user per term,
689+
# producing an unusably expensive query for long searches.
690+
name = f"Copy of Genomic Assembly and analysis - RDH shared by user {uuid4()}"
691+
workflow_id = self.workflow_populator.simple_workflow(name)
692+
self.workflow_populator.set_tags(workflow_id, [f"manyterms-{uuid4()}"])
693+
search = "Copy of Genomic Assembly and analysis - RDH shared by user"
694+
index_ids = self.workflow_populator.index_ids(search=search)
695+
assert workflow_id in index_ids
696+
686697
def test_search_casing(self):
687698
name1, name2 = (
688699
self.dataset_populator.get_random_name().upper(),

0 commit comments

Comments
 (0)