Skip to content

Commit 88256f1

Browse files
committed
fix(data): OR-join content words in entity FTS so NL questions match
The eval harness surfaced that FTS context enrichment was barely firing in practice: every question on the benchmark had entities=0/N with identical FTS-on and FTS-off rubric scores.

Root cause: SearchEntities reused prepareFTSQuery, which AND-joins every whitespace token. For a conversational question like "what's the status of the kitchen project?" that expands to "whats" AND "the" AND "status" AND ..., which zero-matches because "the" and "whats" don't appear in entity names/notes.

Add prepareFTSEntityQuery for short-record entity search:

- lowercase each token, strip punctuation (so "project?" becomes "project"), drop tokens shorter than 2 runes;
- drop a small English stopword list so "the", "of", "what", etc. don't count as content words;
- OR-join the survivors as quoted prefix phrases so any content word hit surfaces the row, and BM25 + the existing rank ceiling order and trim the result set.

Document-text search keeps the AND-join default (prepareFTSQuery unchanged) since document bodies have enough vocabulary that every query word should contribute.

Tests cover the query builder directly (stopword dropping, 1-char filtering, OR-join formatting) plus an end-to-end regression: "what's the status of the kitchen project?" and variants now surface the Kitchen Remodel project, while a stopword-only query like "what is the" returns empty instead of matching every row.
1 parent 768bb7f commit 88256f1

2 files changed

Lines changed: 175 additions & 1 deletion

File tree

internal/data/fts.go

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"fmt"
99
"strings"
1010
"time"
11+
"unicode"
1112

1213
"gorm.io/gorm"
1314
)
@@ -171,6 +172,11 @@ func (s *Store) SearchDocuments(query string) ([]DocumentSearchResult, error) {
171172
// literal text, not operators -- the search box is type-as-you-go and
172173
// partial operator syntax mid-keystroke would otherwise error.
173174
//
175+
// This is the right default for long-form search (documents): every
176+
// word in a multi-word query should contribute. For short, structured
177+
// entity records see prepareFTSEntityQuery, which OR-joins content
178+
// words after stripping stopwords.
179+
//
174180
// See https://sqlite.org/forum/info/82344cab7c5806980b287ce008975c6585d510e95ac7199de398ff9051ae0907
175181
func prepareFTSQuery(query string) string {
176182
fields := strings.Fields(query)
@@ -181,6 +187,75 @@ func prepareFTSQuery(query string) string {
181187
return strings.Join(out, " ")
182188
}
183189

190+
// entityFTSStopwords is the set of high-frequency English words that
// prepareFTSEntityQuery discards before building its MATCH expression.
// Keys are lowercase with punctuation already removed (hence "whats"
// rather than "what's"). The list is kept deliberately short: the
// surviving tokens are OR-joined, so only words that carry essentially
// no meaning against entity names and notes belong here.
var entityFTSStopwords = func() map[string]bool {
	words := []string{
		// Articles and prepositions.
		"a", "an", "the", "of", "in",
		"on", "at", "to", "for", "by",
		// Auxiliary and modal verbs.
		"is", "are", "was", "were", "be",
		"been", "being", "have", "has", "had",
		"do", "does", "did", "will", "would",
		"should", "could",
		// Question words.
		"what", "whats",
		"when", "where", "who", "why", "how",
		// Demonstratives and conjunctions.
		"that", "this", "these", "those",
		"and", "or", "but", "not", "no",
		// Pronouns and possessives.
		"it", "its", "i", "my", "me",
		"you", "your", "we", "our",
		// Quantifiers and leftovers.
		"any", "some", "all", "with", "about",
	}
	set := make(map[string]bool, len(words))
	for _, w := range words {
		set[w] = true
	}
	return set
}()
209+
210+
// prepareFTSEntityQuery builds an FTS5 MATCH expression suited to
211+
// short entity records and natural-language user questions. Unlike
212+
// prepareFTSQuery (which AND-joins), this OR-joins the survivors so a
213+
// question like "what's the status of the kitchen project?" isn't
214+
// zero-matched just because "what's" and "the" don't appear in any
215+
// entity's indexed text.
216+
//
217+
// Filtering steps, applied per token:
218+
// 1. Lowercase.
219+
// 2. Strip non-letter/digit runes so "project?" becomes "project".
220+
// 3. Drop tokens shorter than 2 runes (removes stray punctuation and
221+
// single-char noise).
222+
// 4. Drop entityFTSStopwords.
223+
//
224+
// Survivors become quoted prefix phrases joined with ` OR `. Ranking
225+
// is BM25 via FTS5's default, and the ftsEntityRankCeiling caller-side
226+
// threshold trims low-quality matches.
227+
//
228+
// Returns "" when no content words survive; callers treat that the
229+
// same way as an empty user query.
230+
func prepareFTSEntityQuery(query string) string {
231+
fields := strings.Fields(query)
232+
var tokens []string
233+
for _, w := range fields {
234+
normalized := stripNonAlphaNum(strings.ToLower(w))
235+
if len(normalized) < 2 {
236+
continue
237+
}
238+
if entityFTSStopwords[normalized] {
239+
continue
240+
}
241+
tokens = append(tokens, `"`+normalized+`"*`)
242+
}
243+
return strings.Join(tokens, " OR ")
244+
}
245+
246+
// stripNonAlphaNum returns s with every rune that is neither a Unicode
// letter nor a Unicode digit removed. prepareFTSEntityQuery relies on
// it to peel off punctuation attached to words ("project?" -> "project",
// "what's" -> "whats").
func stripNonAlphaNum(s string) string {
	keepAlphaNum := func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			return r
		}
		// A negative result tells strings.Map to drop the rune.
		return -1
	}
	return strings.Map(keepAlphaNum, s)
}
258+
184259
// RebuildFTSIndex forces a full rebuild of the FTS5 index. Useful after
185260
// bulk imports or data recovery.
186261
func (s *Store) RebuildFTSIndex() error {
@@ -604,7 +679,15 @@ func (s *Store) SearchEntities(query string) ([]EntitySearchResult, error) {
604679
return nil, nil
605680
}
606681

607-
safeQuery := prepareFTSQuery(query)
682+
// Entity records are short and user questions are conversational.
683+
// prepareFTSEntityQuery drops stopwords and OR-joins content words
684+
// so "what's the status of the kitchen project?" actually matches
685+
// rows containing "kitchen" even though "what's" and "the" don't.
686+
// Returns "" when every token is a stopword or punctuation.
687+
safeQuery := prepareFTSEntityQuery(query)
688+
if safeQuery == "" {
689+
return nil, nil
690+
}
608691

609692
// Per-type quota and BM25 threshold via window functions. Prevents
610693
// one noisy entity type from crowding out the others, and drops

internal/data/fts_test.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,97 @@ func TestPrepareFTSQuery(t *testing.T) {
321321
}
322322
}
323323

324+
func TestPrepareFTSEntityQuery(t *testing.T) {
325+
t.Parallel()
326+
tests := []struct {
327+
name, in, want string
328+
}{
329+
{"empty", "", ""},
330+
{"all stopwords", "what is the of", ""},
331+
{"single content word", "plumber", `"plumber"*`},
332+
{
333+
"strips stopwords and punctuation",
334+
"what's the status of the kitchen project?",
335+
`"status"* OR "kitchen"* OR "project"*`,
336+
},
337+
{
338+
"or-joins multiple content words",
339+
"kitchen remodel budget",
340+
`"kitchen"* OR "remodel"* OR "budget"*`,
341+
},
342+
{
343+
"drops 1-char tokens",
344+
"a b c kitchen",
345+
`"kitchen"*`,
346+
},
347+
{
348+
"drops pure punctuation",
349+
"- ? kitchen !",
350+
`"kitchen"*`,
351+
},
352+
{
353+
"lowercases",
354+
"Kitchen REMODEL",
355+
`"kitchen"* OR "remodel"*`,
356+
},
357+
}
358+
for _, tt := range tests {
359+
t.Run(tt.name, func(t *testing.T) {
360+
assert.Equal(t, tt.want, prepareFTSEntityQuery(tt.in))
361+
})
362+
}
363+
}
364+
365+
// TestSearchEntitiesMatchesNaturalLanguageQuestions is the end-to-end
366+
// regression for the stopword-AND bug: before the prepareFTSEntityQuery
367+
// fix, asking a conversational question like "what's the status of the
368+
// kitchen project?" produced zero results because every word had to
369+
// match. Now the content words OR-match and the kitchen project
370+
// surfaces.
371+
func TestSearchEntitiesMatchesNaturalLanguageQuestions(t *testing.T) {
372+
t.Parallel()
373+
store := newTestStore(t)
374+
375+
types, _ := store.ProjectTypes()
376+
require.NoError(t, store.CreateProject(&Project{
377+
Title: "Kitchen Remodel",
378+
ProjectTypeID: types[0].ID,
379+
Status: ProjectStatusInProgress,
380+
}))
381+
382+
for _, q := range []string{
383+
"what's the status of the kitchen project?",
384+
"how's the kitchen going",
385+
"kitchen",
386+
} {
387+
t.Run(q, func(t *testing.T) {
388+
results, err := store.SearchEntities(q)
389+
require.NoError(t, err)
390+
require.NotEmpty(t, results, "expected a match for %q", q)
391+
assert.Equal(t, "Kitchen Remodel", results[0].EntityName)
392+
})
393+
}
394+
}
395+
396+
// TestSearchEntitiesStopwordOnlyQueryReturnsEmpty covers the fast
397+
// path where every user token is a stopword. The expected behavior
398+
// is "no results" rather than "match everything".
399+
func TestSearchEntitiesStopwordOnlyQueryReturnsEmpty(t *testing.T) {
400+
t.Parallel()
401+
store := newTestStore(t)
402+
403+
types, _ := store.ProjectTypes()
404+
require.NoError(t, store.CreateProject(&Project{
405+
Title: "Kitchen Remodel",
406+
ProjectTypeID: types[0].ID,
407+
Status: ProjectStatusInProgress,
408+
}))
409+
410+
results, err := store.SearchEntities("what is the")
411+
require.NoError(t, err)
412+
assert.Empty(t, results, "stopword-only query must not match every row")
413+
}
414+
324415
func TestRebuildFTSIndex(t *testing.T) {
325416
t.Parallel()
326417
store := newTestStore(t)

0 commit comments

Comments
 (0)