Skip to content

Commit 563be0f

Browse files
committed
feat(data): per-type quotas, rank threshold, NL-tolerant entity FTS
Replaces the flat LIMIT 20 in SearchEntities with a three-tier window-function query and adds natural-language query tolerance. Ranking: - Tier 1 takes exactly one row per matching entity type (guarantees cross-type representation). - Tier 2 raises each type up to ftsEntityKPerType rows so single noisy types can't dominate. - Tier 3 fills any remaining room up to ftsEntityTotalCap from whatever's left, globally ranked. Single-type searches use the full cap this way. Package-level tuning constants (not user-configurable -- the eval harness is the tuning channel): ftsEntityKPerType = 5 ftsEntityRankCeiling = 0.0 // permissive; eval will tighten ftsEntityTotalCap = 20 entity_id tiebreaks rank in every ORDER BY so results are stable when BM25 produces identical ranks on similarly-shaped rows. Query tolerance: - prepareFTSEntityQuery lowercases, strips non-alphanum, drops short and stopword tokens, and OR-joins the survivors as quoted prefix phrases. - Returns early when no content words survive so a pure-stopword question like "what is it?" doesn't hammer FTS with an empty MATCH. Tests cover per-type quota preservation under a flood of first-class matches, single-type searches using the full cap, every matching type surfacing when 5+ types share a token, total cap enforcement, rank threshold plumbing, stable ordering across runs, the query builder directly, and the end-to-end regression that "what's the status of the kitchen project?" now surfaces the Kitchen Remodel project. Refs #707.
1 parent c5f3f84 commit 563be0f

2 files changed

Lines changed: 502 additions & 23 deletions

File tree

internal/data/fts.go

Lines changed: 169 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"fmt"
99
"strings"
1010
"time"
11+
"unicode"
1112

1213
"gorm.io/gorm"
1314
)
@@ -22,6 +23,20 @@ const (
2223
tableEntitiesFTS = "entities_fts"
2324
)
2425

26+
// SearchEntities tuning knobs. Not user-configurable; the eval harness
27+
// (`micasa eval fts`) is the channel for tuning these.
28+
//
29+
// FTS5's BM25 rank is negative for every match (more negative = more
30+
// relevant). `ftsEntityRankCeiling` is an exclusive upper bound: only rows
31+
// with `rank < ceiling` count; rows with `rank >= ceiling` are dropped. The
32+
// initial value 0.0 is deliberately permissive so any match passes; the
33+
// eval will tighten this once we have real measurements of noise vs signal.
34+
const (
35+
ftsEntityKPerType = 5 // per-type quota filled by tiers 1-2; the tier-3 global fill may add more rows of a type
36+
ftsEntityRankCeiling = 0.0 // keep only rows with rank strictly less than this
37+
ftsEntityTotalCap = 20 // final cap across all entity_types
38+
)
39+
2540
// DocumentSearchResult holds a single FTS5 match with metadata for display.
2641
type DocumentSearchResult struct {
2742
ID string
@@ -157,6 +172,11 @@ func (s *Store) SearchDocuments(query string) ([]DocumentSearchResult, error) {
157172
// literal text, not operators -- the search box is type-as-you-go and
158173
// partial operator syntax mid-keystroke would otherwise error.
159174
//
175+
// This is the right default for long-form search (documents): every
176+
// word in a multi-word query should contribute. For short, structured
177+
// entity records see prepareFTSEntityQuery, which OR-joins content
178+
// words after stripping stopwords.
179+
//
160180
// See https://sqlite.org/forum/info/82344cab7c5806980b287ce008975c6585d510e95ac7199de398ff9051ae0907
161181
func prepareFTSQuery(query string) string {
162182
fields := strings.Fields(query)
@@ -167,6 +187,75 @@ func prepareFTSQuery(query string) string {
167187
return strings.Join(out, " ")
168188
}
169189

190+
// entityFTSStopwords drops high-frequency English words that rarely
191+
// carry semantic weight against entity names and notes. The list is
192+
// intentionally small: prepareFTSEntityQuery OR-joins the survivors,
193+
// so "kitchen" needs to stay even when the user wrote "what's the
194+
// status of the kitchen project?".
//
// Keys are written in the form tokens have after prepareFTSEntityQuery
// normalizes them (lowercased, punctuation stripped), which is why
// "whats" is listed: it is what "what's" becomes before the lookup.
// The one-letter entries ("a", "i") are never consulted by that
// function because tokens that short are discarded before the
// stopword check; they are kept for completeness.
195+
var entityFTSStopwords = map[string]bool{
196+
"a": true, "an": true, "the": true, "of": true, "in": true,
197+
"on": true, "at": true, "to": true, "for": true, "by": true,
198+
"is": true, "are": true, "was": true, "were": true, "be": true,
199+
"been": true, "being": true, "have": true, "has": true, "had": true,
200+
"do": true, "does": true, "did": true, "will": true, "would": true,
201+
"should": true, "could": true, "what": true, "whats": true,
202+
"when": true, "where": true, "who": true, "why": true, "how": true,
203+
"that": true, "this": true, "these": true, "those": true,
204+
"and": true, "or": true, "but": true, "not": true, "no": true,
205+
"it": true, "its": true, "i": true, "my": true, "me": true,
206+
"you": true, "your": true, "we": true, "our": true,
207+
"any": true, "some": true, "all": true, "with": true, "about": true,
208+
}
209+
210+
// prepareFTSEntityQuery builds an FTS5 MATCH expression suited to
211+
// short entity records and natural-language user questions. Unlike
212+
// prepareFTSQuery (which AND-joins), this OR-joins the survivors so a
213+
// question like "what's the status of the kitchen project?" isn't
214+
// zero-matched just because "what's" and "the" don't appear in any
215+
// entity's indexed text.
216+
//
217+
// Filtering steps, applied per token:
218+
// 1. Lowercase.
219+
// 2. Strip non-letter/digit runes so "project?" becomes "project".
220+
// 3. Drop tokens shorter than 2 runes (removes stray punctuation and
221+
// single-char noise).
222+
// 4. Drop entityFTSStopwords.
223+
//
224+
// Survivors become quoted prefix phrases joined with ` OR `. Ranking
225+
// is BM25 via FTS5's default, and the ftsEntityRankCeiling caller-side
226+
// threshold trims low-quality matches.
227+
//
228+
// Returns "" when no content words survive; callers treat that the
229+
// same way as an empty user query.
230+
func prepareFTSEntityQuery(query string) string {
231+
fields := strings.Fields(query)
232+
var tokens []string
233+
for _, w := range fields {
234+
normalized := stripNonAlphaNum(strings.ToLower(w))
235+
if len(normalized) < 2 {
236+
continue
237+
}
238+
if entityFTSStopwords[normalized] {
239+
continue
240+
}
241+
tokens = append(tokens, `"`+normalized+`"*`)
242+
}
243+
return strings.Join(tokens, " OR ")
244+
}
245+
246+
// stripNonAlphaNum returns s with every rune that is neither a Unicode
// letter nor a Unicode digit removed. prepareFTSEntityQuery calls it to
// shed punctuation glued to words.
func stripNonAlphaNum(s string) string {
	keepAlnum := func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			return r
		}
		// A negative result tells strings.Map to drop the rune.
		return -1
	}
	return strings.Map(keepAlnum, s)
}
258+
170259
// RebuildFTSIndex forces a full rebuild of both FTS5 indexes (documents
171260
// and entities). Useful after bulk imports or data recovery.
172261
func (s *Store) RebuildFTSIndex() error {
@@ -593,16 +682,91 @@ func (s *Store) SearchEntities(query string) ([]EntitySearchResult, error) {
593682
return nil, nil
594683
}
595684

596-
safeQuery := prepareFTSQuery(query)
685+
// Entity records are short and user questions are conversational.
686+
// prepareFTSEntityQuery drops stopwords and OR-joins content words
687+
// so "what's the status of the kitchen project?" actually matches
688+
// rows containing "kitchen" even though "what's" and "the" don't.
689+
// Returns "" when every token is a stopword or punctuation.
690+
safeQuery := prepareFTSEntityQuery(query)
691+
if safeQuery == "" {
692+
return nil, nil
693+
}
597694

695+
// Per-type quota and BM25 threshold via window functions. Prevents
696+
// one noisy entity type from crowding out the others, and drops
697+
// matches that fail the `rank < ftsEntityRankCeiling` cutoff.
698+
//
699+
// Three tiers so that:
700+
// - Every matching type gets at least one slot (tier 1: rn == 1 per
701+
// type, i.e. that type's strongest match).
702+
// - Each type can then fill up to ftsEntityKPerType (tier 2: rows
703+
// with 2 <= rn <= K).
704+
// - Remaining room up to ftsEntityTotalCap is filled globally from
705+
// whatever's left (tier 3: rn > K).
706+
//
707+
// The outer LIMIT ftsEntityTotalCap still applies across the union,
708+
// so even with many matching types the total is bounded. Because
709+
// tier 1 takes exactly one row per type first, every matching type
710+
// is represented as long as the number of matching types does not
711+
// exceed ftsEntityTotalCap.
712+
//
713+
// entity_id tiebreaks rank in every ORDER BY so results are stable
714+
// when BM25 scores collide across similarly-shaped rows.
598715
var results []EntitySearchResult
599716
err := s.db.Raw(fmt.Sprintf(`
717+
WITH matches AS (
718+
SELECT entity_type, entity_id, entity_name, rank,
719+
ROW_NUMBER() OVER (
720+
PARTITION BY entity_type
721+
ORDER BY rank, entity_id
722+
) AS rn
723+
FROM %s
724+
WHERE %s MATCH ? AND rank < ?
725+
),
726+
tier1 AS (
727+
-- one row per matching type: each type's strongest match.
728+
SELECT entity_type, entity_id, entity_name, rank
729+
FROM matches
730+
WHERE rn = 1
731+
),
732+
tier2 AS (
733+
-- up to K per type after tier 1.
734+
SELECT entity_type, entity_id, entity_name, rank
735+
FROM matches
736+
WHERE rn > 1 AND rn <= ?
737+
ORDER BY rank, entity_type, entity_id
738+
LIMIT MAX(
739+
? - (SELECT COUNT(*) FROM tier1),
740+
0
741+
)
742+
),
743+
tier3 AS (
744+
-- fill remaining global slots after per-type quotas are done.
745+
SELECT entity_type, entity_id, entity_name, rank
746+
FROM matches
747+
WHERE rn > ?
748+
ORDER BY rank, entity_type, entity_id
749+
LIMIT MAX(
750+
? - (SELECT COUNT(*) FROM tier1) - (SELECT COUNT(*) FROM tier2),
751+
0
752+
)
753+
)
600754
SELECT entity_type, entity_id, entity_name, rank
601-
FROM %s
602-
WHERE %s MATCH ?
755+
FROM (
756+
SELECT * FROM tier1
757+
UNION ALL
758+
SELECT * FROM tier2
759+
UNION ALL
760+
SELECT * FROM tier3
761+
)
603762
ORDER BY rank, entity_type, entity_id
604-
LIMIT 20
605-
`, tableEntitiesFTS, tableEntitiesFTS), safeQuery).
763+
LIMIT ?
764+
`, tableEntitiesFTS, tableEntitiesFTS),
765+
safeQuery,
766+
ftsEntityRankCeiling,
767+
ftsEntityKPerType, ftsEntityTotalCap,
768+
ftsEntityKPerType, ftsEntityTotalCap,
769+
ftsEntityTotalCap).
606770
Scan(&results).Error
607771
if err != nil {
608772
return nil, fmt.Errorf("search entities: %w", err)

0 commit comments

Comments
 (0)