88 "fmt"
99 "strings"
1010 "time"
11+ "unicode"
1112
1213 "gorm.io/gorm"
1314)
@@ -22,6 +23,20 @@ const (
2223 tableEntitiesFTS = "entities_fts"
2324)
2425
// Tuning knobs for SearchEntities. These are intentionally not exposed as
// user configuration; the eval harness (`micasa eval fts`) is the channel
// for adjusting them.

// ftsEntityKPerType is the maximum number of rows returned per entity_type.
const ftsEntityKPerType = 5

// ftsEntityRankCeiling is the exclusive upper bound on a match's rank: only
// rows with rank strictly less than this value are kept, and rows with
// rank >= ftsEntityRankCeiling are dropped.
//
// FTS5's BM25 rank is negative for every match (more negative = more
// relevant), so the initial value of 0.0 is deliberately permissive and
// lets any match pass. The eval will tighten it once there are real
// measurements of noise vs signal.
const ftsEntityRankCeiling = 0.0

// ftsEntityTotalCap is the final cap on rows returned across all
// entity_types.
const ftsEntityTotalCap = 20
39+
2540// DocumentSearchResult holds a single FTS5 match with metadata for display.
2641type DocumentSearchResult struct {
2742 ID string
@@ -157,6 +172,11 @@ func (s *Store) SearchDocuments(query string) ([]DocumentSearchResult, error) {
157172// literal text, not operators -- the search box is type-as-you-go and
158173// partial operator syntax mid-keystroke would otherwise error.
159174//
175+ // This is the right default for long-form search (documents): every
176+ // word in a multi-word query should contribute. For short, structured
177+ // entity records see prepareFTSEntityQuery, which OR-joins content
178+ // words after stripping stopwords.
179+ //
160180// See https://sqlite.org/forum/info/82344cab7c5806980b287ce008975c6585d510e95ac7199de398ff9051ae0907
161181func prepareFTSQuery (query string ) string {
162182 fields := strings .Fields (query )
@@ -167,6 +187,75 @@ func prepareFTSQuery(query string) string {
167187 return strings .Join (out , " " )
168188}
169189
// entityFTSStopwords is the set of high-frequency English words that are
// discarded from an entity search query because they rarely carry
// semantic weight against entity names and notes.
//
// The set is kept small on purpose: prepareFTSEntityQuery OR-joins
// whatever survives, so a content word such as "kitchen" must remain even
// when the user typed "what's the status of the kitchen project?".
var entityFTSStopwords = map[string]bool{
	// Articles and demonstratives.
	"a": true, "an": true, "the": true,
	"that": true, "this": true, "these": true, "those": true,

	// Prepositions.
	"of": true, "in": true, "on": true, "at": true, "to": true,
	"for": true, "by": true, "with": true, "about": true,

	// Forms of "be", "have", and "do".
	"is": true, "are": true, "was": true, "were": true,
	"be": true, "been": true, "being": true,
	"have": true, "has": true, "had": true,
	"do": true, "does": true, "did": true,

	// Modal verbs.
	"will": true, "would": true, "should": true, "could": true,

	// Question words.
	"what": true, "whats": true, "when": true, "where": true,
	"who": true, "why": true, "how": true,

	// Conjunctions and negations.
	"and": true, "or": true, "but": true, "not": true, "no": true,

	// Pronouns and possessives.
	"it": true, "its": true, "i": true, "my": true, "me": true,
	"you": true, "your": true, "we": true, "our": true,

	// Quantifiers.
	"any": true, "some": true, "all": true,
}
209+
210+ // prepareFTSEntityQuery builds an FTS5 MATCH expression suited to
211+ // short entity records and natural-language user questions. Unlike
212+ // prepareFTSQuery (which AND-joins), this OR-joins the survivors so a
213+ // question like "what's the status of the kitchen project?" isn't
214+ // zero-matched just because "what's" and "the" don't appear in any
215+ // entity's indexed text.
216+ //
217+ // Filtering steps, applied per token:
218+ // 1. Lowercase.
219+ // 2. Strip non-letter/digit runes so "project?" becomes "project".
220+ // 3. Drop tokens shorter than 2 runes (removes stray punctuation and
221+ // single-char noise).
222+ // 4. Drop entityFTSStopwords.
223+ //
224+ // Survivors become quoted prefix phrases joined with ` OR `. Ranking
225+ // is BM25 via FTS5's default, and the ftsEntityRankCeiling caller-side
226+ // threshold trims low-quality matches.
227+ //
228+ // Returns "" when no content words survive; callers treat that the
229+ // same way as an empty user query.
230+ func prepareFTSEntityQuery (query string ) string {
231+ fields := strings .Fields (query )
232+ var tokens []string
233+ for _ , w := range fields {
234+ normalized := stripNonAlphaNum (strings .ToLower (w ))
235+ if len (normalized ) < 2 {
236+ continue
237+ }
238+ if entityFTSStopwords [normalized ] {
239+ continue
240+ }
241+ tokens = append (tokens , `"` + normalized + `"*` )
242+ }
243+ return strings .Join (tokens , " OR " )
244+ }
245+
// stripNonAlphaNum returns s with every rune that is neither a Unicode
// letter nor a Unicode digit removed. prepareFTSEntityQuery relies on it
// to peel off punctuation glued to words.
func stripNonAlphaNum(s string) string {
	keepAlphaNum := func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			return r
		}
		// A negative result tells strings.Map to drop the rune.
		return -1
	}
	return strings.Map(keepAlphaNum, s)
}
258+
170259// RebuildFTSIndex forces a full rebuild of both FTS5 indexes (documents
171260// and entities). Useful after bulk imports or data recovery.
172261func (s * Store ) RebuildFTSIndex () error {
@@ -593,16 +682,91 @@ func (s *Store) SearchEntities(query string) ([]EntitySearchResult, error) {
593682 return nil , nil
594683 }
595684
596- safeQuery := prepareFTSQuery (query )
685+ // Entity records are short and user questions are conversational.
686+ // prepareFTSEntityQuery drops stopwords and OR-joins content words
687+ // so "what's the status of the kitchen project?" actually matches
688+ // rows containing "kitchen" even though "what's" and "the" don't.
689+ // Returns "" when every token is a stopword or punctuation.
690+ safeQuery := prepareFTSEntityQuery (query )
691+ if safeQuery == "" {
692+ return nil , nil
693+ }
597694
695+ // Per-type quota and BM25 threshold via window functions. Prevents
696+ // one noisy entity type from crowding out the others, and drops
697+ // matches below the rank ceiling.
698+ //
699+ // Three tiers so that:
700+ // - Every matching type gets at least one slot (tier 1: rn == 1 per
701+ // type, i.e. that type's strongest match).
702+ // - Each type can then fill up to ftsEntityKPerType (tier 2: rows
703+ // with 2 <= rn <= K).
704+ // - Remaining room up to ftsEntityTotalCap is filled globally from
705+ // whatever's left (tier 3: rn > K).
706+ //
707+ // The outer LIMIT ftsEntityTotalCap still applies across the union,
708+ // so even with many matching types the total is bounded. Because
709+ // tier 1 takes exactly one row per type first, every matching type
710+ // is represented as long as the number of matching types does not
711+ // exceed ftsEntityTotalCap.
712+ //
713+ // entity_id tiebreaks rank in every ORDER BY so results are stable
714+ // when BM25 scores collide across similarly-shaped rows.
598715 var results []EntitySearchResult
599716 err := s .db .Raw (fmt .Sprintf (`
717+ WITH matches AS (
718+ SELECT entity_type, entity_id, entity_name, rank,
719+ ROW_NUMBER() OVER (
720+ PARTITION BY entity_type
721+ ORDER BY rank, entity_id
722+ ) AS rn
723+ FROM %s
724+ WHERE %s MATCH ? AND rank < ?
725+ ),
726+ tier1 AS (
727+ -- one row per matching type: each type's strongest match.
728+ SELECT entity_type, entity_id, entity_name, rank
729+ FROM matches
730+ WHERE rn = 1
731+ ),
732+ tier2 AS (
733+ -- up to K per type after tier 1.
734+ SELECT entity_type, entity_id, entity_name, rank
735+ FROM matches
736+ WHERE rn > 1 AND rn <= ?
737+ ORDER BY rank, entity_type, entity_id
738+ LIMIT MAX(
739+ ? - (SELECT COUNT(*) FROM tier1),
740+ 0
741+ )
742+ ),
743+ tier3 AS (
744+ -- fill remaining global slots after per-type quotas are done.
745+ SELECT entity_type, entity_id, entity_name, rank
746+ FROM matches
747+ WHERE rn > ?
748+ ORDER BY rank, entity_type, entity_id
749+ LIMIT MAX(
750+ ? - (SELECT COUNT(*) FROM tier1) - (SELECT COUNT(*) FROM tier2),
751+ 0
752+ )
753+ )
600754 SELECT entity_type, entity_id, entity_name, rank
601- FROM %s
602- WHERE %s MATCH ?
755+ FROM (
756+ SELECT * FROM tier1
757+ UNION ALL
758+ SELECT * FROM tier2
759+ UNION ALL
760+ SELECT * FROM tier3
761+ )
603762 ORDER BY rank, entity_type, entity_id
604- LIMIT 20
605- ` , tableEntitiesFTS , tableEntitiesFTS ), safeQuery ).
763+ LIMIT ?
764+ ` , tableEntitiesFTS , tableEntitiesFTS ),
765+ safeQuery ,
766+ ftsEntityRankCeiling ,
767+ ftsEntityKPerType , ftsEntityTotalCap ,
768+ ftsEntityKPerType , ftsEntityTotalCap ,
769+ ftsEntityTotalCap ).
606770 Scan (& results ).Error
607771 if err != nil {
608772 return nil , fmt .Errorf ("search entities: %w" , err )
0 commit comments