Skip to content

Commit ac5d775

Browse files
committed
Drop short / excess raw terms in index-search parser
ILIKE '%term%' on unindexed columns is linear in table size, so every raw term in a whitespace-split search string is one full scan. Terms shorter than 4 characters ('of', 'and', 'by', ...) match too much to be useful and, once joined with all the others, dominate query time. Cap raw terms at 7 and drop any shorter than 4; filtered terms (tag:foo) and quoted raw terms ('ab') are preserved as explicit user intent.
1 parent 52dbfaf commit ac5d775

3 files changed

Lines changed: 95 additions & 1 deletion

File tree

lib/galaxy/managers/workflows.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@
102102
)
103103
from galaxy.util.sanitize_html import sanitize_html
104104
from galaxy.util.search import (
105+
filter_terms,
105106
FilteredTerm,
106107
parse_filters_structured,
107108
RawTextTerm,
@@ -210,7 +211,7 @@ def index_query(
210211
stmt = stmt.where(StoredWorkflow.hidden == (true() if show_hidden else false()))
211212
if payload.search:
212213
search_query = payload.search
213-
parsed_search = parse_filters_structured(search_query, INDEX_SEARCH_FILTERS)
214+
parsed_search = filter_terms(parse_filters_structured(search_query, INDEX_SEARCH_FILTERS))
214215

215216
def w_tag_exists(term_text: str, quoted: bool):
216217
return tag_exists_filter(

lib/galaxy/util/search.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@
1212
ParseFilterResultT = Tuple[Optional[List["FilteredTerm"]], Optional[str]]
1313
QUOTE_PATTERN = re.compile(r"\'(.*?)\'")
1414

15+
# Defaults for `filter_terms` used by index-search callers. A whitespace-rich
16+
# query turns into one WHERE clause (and, pre-trigram-index, one seq scan per
17+
# matching table) per raw term, so both limits are there to bound query cost.
18+
DEFAULT_MIN_RAW_TERM_LENGTH = 4
19+
DEFAULT_MAX_RAW_TERMS = 7
20+
1521

1622
def parse_filters(search_term: str, filters: Optional[Dict[str, str]] = None) -> ParseFilterResultT:
1723
"""Support github-like filters for narrowing the results.
@@ -110,7 +116,39 @@ def simple_result(self) -> ParseFilterResultT:
110116
return None if len(self.filter_terms) == 0 else self.filter_terms, " ".join([t.text for t in self.text_terms])
111117

112118

119+
def filter_terms(
    parsed: "ParsedSearch",
    min_raw_term_length: int = DEFAULT_MIN_RAW_TERM_LENGTH,
    max_raw_terms: Optional[int] = DEFAULT_MAX_RAW_TERMS,
) -> "ParsedSearch":
    """Build a pruned copy of ``parsed`` with cheap-to-drop raw terms removed.

    Terms are visited in their original order and copied into a fresh
    ``ParsedSearch``; ``parsed`` itself is only read.

    * Keyed terms (``key:value``) are always copied.
    * Quoted raw terms are always copied and do not count towards the cap.
    * Unquoted raw terms are skipped when their text is shorter than
      ``min_raw_term_length``, or when ``max_raw_terms`` unquoted raw terms
      have already been copied. Passing ``max_raw_terms=None`` disables the
      cap.

    :param parsed: structured search to prune.
    :param min_raw_term_length: minimum text length for an unquoted raw term.
    :param max_raw_terms: maximum number of unquoted raw terms to keep, or
        ``None`` for no limit.
    :returns: a new ``ParsedSearch`` holding the surviving terms.
    """
    pruned = ParsedSearch()
    unquoted_kept = 0
    for item in parsed.terms:
        # Anything that is not a raw text term is treated as a keyed term.
        if not isinstance(item, RawTextTerm):
            pruned.add_keyed_term(item.filter, item.text, item.quoted)
            continue
        if not item.quoted:
            # Only unquoted raw terms are subject to the length and count limits.
            if len(item.text) < min_raw_term_length or (
                max_raw_terms is not None and unquoted_kept >= max_raw_terms
            ):
                continue
            unquoted_kept += 1
        pruned.add_unfiltered_text(item.text, item.quoted)
    return pruned
146+
147+
113148
__all__ = (
149+
"DEFAULT_MAX_RAW_TERMS",
150+
"DEFAULT_MIN_RAW_TERM_LENGTH",
151+
"filter_terms",
114152
"parse_filters",
115153
"parse_filters_structured",
116154
)

test/unit/util/test_search.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
from galaxy.util.search import (
2+
filter_terms,
3+
FilteredTerm,
24
parse_filters,
35
parse_filters_structured,
6+
RawTextTerm,
47
)
58

69

@@ -94,3 +97,55 @@ def test_parse_filters_structured():
9497
assert text_terms[1].quoted is True
9598
assert text_terms[2].text == "foo"
9699
assert text_terms[2].quoted is False
100+
101+
102+
def test_filter_terms_drops_short_raw_terms():
    """Unquoted raw terms under the minimum length are removed, order is kept."""
    result = filter_terms(
        parse_filters_structured("Copy of Genomic Assembly and analysis", {}),
        min_raw_term_length=4,
        max_raw_terms=None,
    )
    surviving = [term.text for term in result.terms]
    # "of" and "and" fall below the four-character minimum.
    assert surviving == ["Copy", "Genomic", "Assembly", "analysis"]
107+
108+
109+
def test_filter_terms_preserves_quoted_raw_terms():
    """Quoted raw terms survive even when shorter than the minimum length."""
    result = filter_terms(
        parse_filters_structured("'ab' Copy 'de'", {}),
        min_raw_term_length=4,
        max_raw_terms=None,
    )
    observed = [(term.text, term.quoted) for term in result.terms]
    expected = [("ab", True), ("Copy", False), ("de", True)]
    assert observed == expected
117+
118+
119+
def test_filter_terms_preserves_filtered_terms_of_any_length():
    """Keyed terms are never dropped for length; only raw terms are length-checked."""
    known_filters = {"tag": "tag", "user": "user"}
    result = filter_terms(
        parse_filters_structured("tag:ab user:cd Copy", known_filters),
        min_raw_term_length=4,
        max_raw_terms=None,
    )
    observed = [(type(term).__name__, term.text) for term in result.terms]
    # The two-character keyed values survive, and so does "Copy", which sits
    # exactly at the four-character minimum.
    for expected in (
        ("FilteredTerm", "ab"),
        ("FilteredTerm", "cd"),
        ("RawTextTerm", "Copy"),
    ):
        assert expected in observed
128+
129+
130+
def test_filter_terms_caps_raw_terms_only():
    """The cap limits raw terms to the first N; keyed terms are not counted."""
    query = "tag:foo Copy Genomic Assembly analysis shared user nedflanders extra1 extra2"
    result = filter_terms(
        parse_filters_structured(query, {"tag": "tag"}),
        min_raw_term_length=4,
        max_raw_terms=3,
    )
    raw_texts = [term.text for term in result.terms if isinstance(term, RawTextTerm)]
    keyed_texts = [term.text for term in result.terms if isinstance(term, FilteredTerm)]
    assert raw_texts == ["Copy", "Genomic", "Assembly"]
    assert keyed_texts == ["foo"]
140+
141+
142+
def test_filter_terms_defaults():
    """With no overrides, both the 4-character minimum and the 7-term cap apply."""
    # Eight terms meet the length minimum here, one more than the default cap,
    # so this test fails if either default is loosened.
    parsed = parse_filters_structured(
        "Copy of Genomic Assembly and analysis - RDH shared by user nedflanders overflow",
        {},
    )
    filtered = filter_terms(parsed)
    kept = [t.text for t in filtered.terms]
    # of, and, -, RDH, by are dropped for length; "overflow" is long enough but
    # is the eighth qualifying raw term, so the cap drops it.
    assert kept == ["Copy", "Genomic", "Assembly", "analysis", "shared", "user", "nedflanders"]

0 commit comments

Comments
 (0)