88 "fmt"
99 "strings"
1010 "time"
11+ "unicode"
1112
1213 "gorm.io/gorm"
1314)
@@ -22,6 +23,20 @@ const (
2223 tableEntitiesFTS = "entities_fts"
2324)
2425
// Tuning knobs for SearchEntities. These are not user-configurable; the
// eval harness (`micasa eval fts`) is the channel for tuning them.
//
// FTS5's BM25 rank is negative for every match, and a more negative rank
// means a more relevant row. ftsEntityRankCeiling is an exclusive upper
// bound on rank: a match is kept only when rank < ceiling, and rows with
// rank >= ceiling are dropped. The initial value 0.0 is deliberately
// permissive so that any match passes; the eval will tighten it once
// there are real measurements of noise versus signal.
const (
	// ftsEntityRankCeiling keeps only rows with rank strictly less than it.
	ftsEntityRankCeiling = 0.0

	// ftsEntityKPerType is the maximum number of rows returned per
	// entity_type.
	ftsEntityKPerType = 5

	// ftsEntityTotalCap is the final cap on rows across all entity_types.
	ftsEntityTotalCap = 20
)
39+
2540// DocumentSearchResult holds a single FTS5 match with metadata for display.
2641type DocumentSearchResult struct {
2742 ID string
@@ -157,6 +172,11 @@ func (s *Store) SearchDocuments(query string) ([]DocumentSearchResult, error) {
157172// literal text, not operators -- the search box is type-as-you-go and
158173// partial operator syntax mid-keystroke would otherwise error.
159174//
175+ // This is the right default for long-form search (documents): every
176+ // word in a multi-word query should contribute. For short, structured
177+ // entity records see prepareFTSEntityQuery, which OR-joins content
178+ // words after stripping stopwords.
179+ //
160180// See https://sqlite.org/forum/info/82344cab7c5806980b287ce008975c6585d510e95ac7199de398ff9051ae0907
161181func prepareFTSQuery (query string ) string {
162182 fields := strings .Fields (query )
@@ -167,6 +187,72 @@ func prepareFTSQuery(query string) string {
167187 return strings .Join (out , " " )
168188}
169189
// entityFTSStopwords is the set of high-frequency English words that are
// discarded from entity search queries because they rarely carry semantic
// weight against entity names and notes. The set is kept small on
// purpose: prepareFTSEntityQuery OR-joins whatever survives, so a content
// word such as "kitchen" must remain when the user types "what's the
// status of the kitchen project?".
//
// Keys are lowercase with punctuation already removed (hence "whats").
var entityFTSStopwords = map[string]bool{
	// Articles.
	"a": true, "an": true, "the": true,

	// Prepositions.
	"of": true, "in": true, "on": true, "at": true, "to": true,
	"for": true, "by": true, "with": true, "about": true,

	// Forms of "be", "have", "do", and modal verbs.
	"is": true, "are": true, "was": true, "were": true, "be": true,
	"been": true, "being": true, "have": true, "has": true, "had": true,
	"do": true, "does": true, "did": true, "will": true, "would": true,
	"should": true, "could": true,

	// Question words.
	"what": true, "whats": true, "when": true, "where": true,
	"who": true, "why": true, "how": true,

	// Demonstratives.
	"that": true, "this": true, "these": true, "those": true,

	// Conjunctions and negations.
	"and": true, "or": true, "but": true, "not": true, "no": true,

	// Pronouns and possessives.
	"it": true, "its": true, "i": true, "my": true, "me": true,
	"you": true, "your": true, "we": true, "our": true,

	// Quantifiers.
	"any": true, "some": true, "all": true,
}
209+
210+ // prepareFTSEntityQuery builds an FTS5 MATCH expression suited to
211+ // short entity records and natural-language user questions. Unlike
212+ // prepareFTSQuery (which AND-joins), this OR-joins the survivors so a
213+ // question like "what's the status of the kitchen project?" isn't
214+ // zero-matched just because "what's" and "the" don't appear in any
215+ // entity's indexed text.
216+ //
217+ // Filtering steps, applied per token:
218+ // 1. Lowercase.
219+ // 2. Strip non-letter/digit runes so "project?" becomes "project".
220+ // 3. Drop tokens shorter than 2 runes (removes stray punctuation and
221+ // single-char noise).
222+ // 4. Drop entityFTSStopwords.
223+ //
224+ // Survivors become quoted prefix phrases joined with ` OR `. Ranking
225+ // is BM25 via FTS5's default, and the ftsEntityRankCeiling caller-side
226+ // threshold trims low-quality matches.
227+ //
228+ // Returns "" when no content words survive; callers treat that the
229+ // same way as an empty user query.
230+ func prepareFTSEntityQuery (query string ) string {
231+ fields := strings .Fields (query )
232+ var tokens []string
233+ for _ , w := range fields {
234+ normalized := strings .Map (keepAlphaNum , strings .ToLower (w ))
235+ if len (normalized ) < 2 {
236+ continue
237+ }
238+ if entityFTSStopwords [normalized ] {
239+ continue
240+ }
241+ tokens = append (tokens , `"` + normalized + `"*` )
242+ }
243+ return strings .Join (tokens , " OR " )
244+ }
245+
// keepAlphaNum is a mapping function for strings.Map: it passes Unicode
// letters and digits through unchanged and returns -1 for every other
// rune, which makes strings.Map drop that rune. prepareFTSEntityQuery
// uses it to remove punctuation attached to words ("project?" ->
// "project").
func keepAlphaNum(r rune) rune {
	switch {
	case unicode.IsLetter(r), unicode.IsDigit(r):
		return r
	default:
		return -1
	}
}
255+
170256// RebuildFTSIndex forces a full rebuild of both FTS5 indexes (documents
171257// and entities). Useful after bulk imports or data recovery.
172258func (s * Store ) RebuildFTSIndex () error {
@@ -593,16 +679,91 @@ func (s *Store) SearchEntities(query string) ([]EntitySearchResult, error) {
593679 return nil , nil
594680 }
595681
596- safeQuery := prepareFTSQuery (query )
682+ // Entity records are short and user questions are conversational.
683+ // prepareFTSEntityQuery drops stopwords and OR-joins content words
684+ // so "what's the status of the kitchen project?" actually matches
685+ // rows containing "kitchen" even though "what's" and "the" don't.
686+ // Returns "" when every token is a stopword or punctuation.
687+ safeQuery := prepareFTSEntityQuery (query )
688+ if safeQuery == "" {
689+ return nil , nil
690+ }
597691
692+ // Per-type quota and BM25 threshold via window functions. Prevents
693+ // one noisy entity type from crowding out the others, and drops
694+ // matches whose rank is not strictly less than ftsEntityRankCeiling.
695+ //
696+ // Three tiers so that:
697+ // - Every matching type gets at least one slot (tier 1: rn == 1 per
698+ // type, i.e. that type's strongest match).
699+ // - Each type can then fill up to ftsEntityKPerType (tier 2: rows
700+ // with 2 <= rn <= K).
701+ // - Remaining room up to ftsEntityTotalCap is filled globally from
702+ // whatever's left (tier 3: rn > K).
703+ //
704+ // The outer LIMIT ftsEntityTotalCap still applies across the union,
705+ // so even with many matching types the total is bounded. Because
706+ // tier 1 takes exactly one row per type first, every matching type
707+ // is represented as long as the number of matching types does not
708+ // exceed ftsEntityTotalCap.
709+ //
710+ // entity_id tiebreaks rank in every ORDER BY so results are stable
711+ // when BM25 scores collide across similarly-shaped rows.
598712 var results []EntitySearchResult
599713 err := s .db .Raw (fmt .Sprintf (`
714+ WITH matches AS (
715+ SELECT entity_type, entity_id, entity_name, rank,
716+ ROW_NUMBER() OVER (
717+ PARTITION BY entity_type
718+ ORDER BY rank, entity_id
719+ ) AS rn
720+ FROM %s
721+ WHERE %s MATCH ? AND rank < ?
722+ ),
723+ tier1 AS (
724+ -- one row per matching type: each type's strongest match.
725+ SELECT entity_type, entity_id, entity_name, rank
726+ FROM matches
727+ WHERE rn = 1
728+ ),
729+ tier2 AS (
730+ -- up to K per type after tier 1.
731+ SELECT entity_type, entity_id, entity_name, rank
732+ FROM matches
733+ WHERE rn > 1 AND rn <= ?
734+ ORDER BY rank, entity_type, entity_id
735+ LIMIT MAX(
736+ ? - (SELECT COUNT(*) FROM tier1),
737+ 0
738+ )
739+ ),
740+ tier3 AS (
741+ -- fill remaining global slots after per-type quotas are done.
742+ SELECT entity_type, entity_id, entity_name, rank
743+ FROM matches
744+ WHERE rn > ?
745+ ORDER BY rank, entity_type, entity_id
746+ LIMIT MAX(
747+ ? - (SELECT COUNT(*) FROM tier1) - (SELECT COUNT(*) FROM tier2),
748+ 0
749+ )
750+ )
600751 SELECT entity_type, entity_id, entity_name, rank
601- FROM %s
602- WHERE %s MATCH ?
752+ FROM (
753+ SELECT * FROM tier1
754+ UNION ALL
755+ SELECT * FROM tier2
756+ UNION ALL
757+ SELECT * FROM tier3
758+ )
603759 ORDER BY rank, entity_type, entity_id
604- LIMIT 20
605- ` , tableEntitiesFTS , tableEntitiesFTS ), safeQuery ).
760+ LIMIT ?
761+ ` , tableEntitiesFTS , tableEntitiesFTS ),
762+ safeQuery ,
763+ ftsEntityRankCeiling ,
764+ ftsEntityKPerType , ftsEntityTotalCap ,
765+ ftsEntityKPerType , ftsEntityTotalCap ,
766+ ftsEntityTotalCap ).
606767 Scan (& results ).Error
607768 if err != nil {
608769 return nil , fmt .Errorf ("search entities: %w" , err )
0 commit comments