fix(data): OR-join content words in entity FTS so NL questions match

cpcloud · cpcloud · commit 88256f1e977c · 2026-04-20T07:28:24.000-04:00
The eval harness surfaced that FTS context enrichment was barely
firing in practice: every question on the benchmark had
entities=0/N with identical FTS-on and FTS-off rubric scores. Root
cause: SearchEntities reused prepareFTSQuery, which AND-joins every
whitespace token. For a conversational question like "what's the
status of the kitchen project?" that expands to "whats" AND "the"
AND "status" AND ... which zero-matches because "the" and "whats"
don't appear in entity names/notes.

Add prepareFTSEntityQuery for short-record entity search:
- lowercase each token, strip punctuation (so "project?" becomes
  "project"), drop tokens shorter than 2 runes;
- drop a small English stopword list so "the", "of", "what", etc.
  don't count as content words;
- OR-join the survivors as quoted prefix phrases so any content
  word hit surfaces the row; BM25 ranking and the existing rank
  ceiling then order and trim the result set.

Document-text search keeps the AND-join default (prepareFTSQuery
unchanged) since document bodies have enough vocabulary that every
query word should contribute.

Tests cover the query builder directly (stopword dropping, 1-char
filtering, OR-join formatting) plus an end-to-end regression:
"what's the status of the kitchen project?" and variants now
surface the Kitchen Remodel project, while a stopword-only query
like "what is the" returns empty instead of matching every row.
diff --git a/internal/data/fts.go b/internal/data/fts.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"strings"
 	"time"
+	"unicode"
 
 	"gorm.io/gorm"
 )
@@ -171,6 +172,11 @@ func (s *Store) SearchDocuments(query string) ([]DocumentSearchResult, error) {
 // literal text, not operators -- the search box is type-as-you-go and
 // partial operator syntax mid-keystroke would otherwise error.
 //
+// This is the right default for long-form search (documents): every
+// word in a multi-word query should contribute. For short, structured
+// entity records see prepareFTSEntityQuery, which OR-joins content
+// words after stripping stopwords.
+//
 // See https://sqlite.org/forum/info/82344cab7c5806980b287ce008975c6585d510e95ac7199de398ff9051ae0907
 func prepareFTSQuery(query string) string {
 	fields := strings.Fields(query)
@@ -181,6 +187,75 @@ func prepareFTSQuery(query string) string {
 	return strings.Join(out, " ")
 }
 
+// entityFTSStopwords is the set of high-frequency English words that
+// prepareFTSEntityQuery discards before OR-joining the remaining
+// tokens. The list is deliberately short so that content words such
+// as "kitchen" survive a question like "what's the status of the
+// kitchen project?". Keys are lowercase with punctuation removed.
+var entityFTSStopwords = map[string]bool{
+	"a": true, "an": true, "the": true, "of": true, "in": true, "on": true,
+	"at": true, "to": true, "for": true, "by": true, "with": true, "about": true,
+	"is": true, "are": true, "was": true, "were": true, "be": true, "been": true,
+	"being": true, "have": true, "has": true, "had": true, "do": true, "does": true,
+	"did": true, "will": true, "would": true, "should": true, "could": true,
+	"what": true, "whats": true, "when": true, "where": true,
+	"who": true, "why": true, "how": true,
+	"that": true, "this": true, "these": true, "those": true,
+	"and": true, "or": true, "but": true, "not": true, "no": true,
+	"it": true, "its": true, "i": true, "my": true, "me": true,
+	"you": true, "your": true, "we": true, "our": true,
+	"any": true, "some": true, "all": true,
+}
+
+// prepareFTSEntityQuery builds an FTS5 MATCH expression suited to
+// short entity records and natural-language user questions. Unlike
+// prepareFTSQuery (which AND-joins), this OR-joins the survivors so a
+// question like "what's the status of the kitchen project?" isn't
+// zero-matched just because "what's" and "the" don't appear in any
+// entity's indexed text.
+//
+// Filtering steps, applied per whitespace-separated token:
+//  1. Lowercase.
+//  2. Strip non-letter/digit runes so "project?" becomes "project".
+//  3. Drop tokens shorter than 2 runes, counted as runes rather than
+//     bytes so a lone multi-byte letter such as "é" is also dropped.
+//  4. Drop entityFTSStopwords.
+//
+// Survivors become quoted prefix phrases joined with ` OR `. Ranking
+// is BM25 via FTS5's default, and the ftsEntityRankCeiling caller-side
+// threshold trims low-quality matches.
+//
+// Returns "" when no content words survive; callers treat that the
+// same way as an empty user query.
+func prepareFTSEntityQuery(query string) string {
+	fields := strings.Fields(query)
+	var tokens []string
+	for _, w := range fields {
+		normalized := stripNonAlphaNum(strings.ToLower(w))
+		if len([]rune(normalized)) < 2 {
+			continue
+		}
+		if entityFTSStopwords[normalized] {
+			continue
+		}
+		tokens = append(tokens, `"`+normalized+`"*`)
+	}
+	return strings.Join(tokens, " OR ")
+}
+
+// stripNonAlphaNum returns s with every rune that is neither a
+// Unicode letter nor a Unicode digit removed. prepareFTSEntityQuery
+// relies on it to shed punctuation glued to words, e.g. "project?"
+// becomes "project".
+func stripNonAlphaNum(s string) string {
+	return strings.Map(func(r rune) rune {
+		if unicode.IsLetter(r) || unicode.IsDigit(r) {
+			return r
+		}
+		return -1 // a negative rune tells strings.Map to drop it
+	}, s)
+}
+
 // RebuildFTSIndex forces a full rebuild of the FTS5 index. Useful after
 // bulk imports or data recovery.
 func (s *Store) RebuildFTSIndex() error {
@@ -604,7 +679,15 @@ func (s *Store) SearchEntities(query string) ([]EntitySearchResult, error) {
 		return nil, nil
 	}
 
-	safeQuery := prepareFTSQuery(query)
+	// Entity records are short and user questions are conversational.
+	// prepareFTSEntityQuery drops stopwords and OR-joins content words
+	// so "what's the status of the kitchen project?" actually matches
+	// rows containing "kitchen" even though "what's" and "the" don't.
+	// Returns "" when every token is a stopword or punctuation.
+	safeQuery := prepareFTSEntityQuery(query)
+	if safeQuery == "" {
+		return nil, nil
+	}
 
 	// Per-type quota and BM25 threshold via window functions. Prevents
 	// one noisy entity type from crowding out the others, and drops
diff --git a/internal/data/fts_test.go b/internal/data/fts_test.go
@@ -321,6 +321,97 @@ func TestPrepareFTSQuery(t *testing.T) {
 	}
 }
 
+// TestPrepareFTSEntityQuery pins the MATCH expression produced for
+// representative inputs: empty and stopword-only queries collapse to
+// "", and surviving content words are lowercased, stripped of
+// punctuation, and OR-joined as quoted prefix phrases.
+func TestPrepareFTSEntityQuery(t *testing.T) {
+	t.Parallel()
+	type testCase struct {
+		name  string
+		query string
+		want  string
+	}
+	cases := []testCase{
+		{name: "empty", query: "", want: ""},
+		{name: "all stopwords", query: "what is the of", want: ""},
+		{name: "single content word", query: "plumber", want: `"plumber"*`},
+		{
+			name:  "strips stopwords and punctuation",
+			query: "what's the status of the kitchen project?",
+			want:  `"status"* OR "kitchen"* OR "project"*`,
+		},
+		{
+			name:  "or-joins multiple content words",
+			query: "kitchen remodel budget",
+			want:  `"kitchen"* OR "remodel"* OR "budget"*`,
+		},
+		{name: "drops 1-char tokens", query: "a b c kitchen", want: `"kitchen"*`},
+		{name: "drops pure punctuation", query: "- ? kitchen !", want: `"kitchen"*`},
+		{
+			name:  "lowercases",
+			query: "Kitchen REMODEL",
+			want:  `"kitchen"* OR "remodel"*`,
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := prepareFTSEntityQuery(tc.query)
+			assert.Equal(t, tc.want, got)
+		})
+	}
+}
+
+// TestSearchEntitiesMatchesNaturalLanguageQuestions is the end-to-end
+// regression for the stopword-AND bug: conversational questions used
+// to return nothing because every word, stopwords included, had to
+// match. Content words now OR-match, so the kitchen project surfaces.
+func TestSearchEntitiesMatchesNaturalLanguageQuestions(t *testing.T) {
+	t.Parallel()
+	store := newTestStore(t)
+
+	types, err := store.ProjectTypes()
+	require.NoError(t, err)
+	require.NotEmpty(t, types, "need at least one project type")
+	require.NoError(t, store.CreateProject(&Project{
+		Title:         "Kitchen Remodel",
+		ProjectTypeID: types[0].ID,
+		Status:        ProjectStatusInProgress,
+	}))
+
+	for _, q := range []string{
+		"what's the status of the kitchen project?",
+		"how's the kitchen going",
+		"kitchen",
+	} {
+		t.Run(q, func(t *testing.T) {
+			results, err := store.SearchEntities(q)
+			require.NoError(t, err)
+			require.NotEmpty(t, results, "expected a match for %q", q)
+			assert.Equal(t, "Kitchen Remodel", results[0].EntityName)
+		})
+	}
+}
+
+// TestSearchEntitiesStopwordOnlyQueryReturnsEmpty covers the fast path
+// where every token is a stopword: expect no results, not every row.
+func TestSearchEntitiesStopwordOnlyQueryReturnsEmpty(t *testing.T) {
+	t.Parallel()
+	store := newTestStore(t)
+	types, err := store.ProjectTypes()
+	require.NoError(t, err)
+	require.NotEmpty(t, types, "need at least one project type")
+	require.NoError(t, store.CreateProject(&Project{
+		Title:         "Kitchen Remodel",
+		ProjectTypeID: types[0].ID,
+		Status:        ProjectStatusInProgress,
+	}))
+
+	results, err := store.SearchEntities("what is the")
+	require.NoError(t, err)
+	assert.Empty(t, results, "stopword-only query must not match every row")
+}
+
 func TestRebuildFTSIndex(t *testing.T) {
 	t.Parallel()
 	store := newTestStore(t)