88 "fmt"
99 "strings"
1010 "time"
11+ "unicode"
1112
1213 "gorm.io/gorm"
1314)
@@ -22,6 +23,20 @@ const (
2223 tableEntitiesFTS = "entities_fts"
2324)
2425
// Tuning knobs for SearchEntities. They are not user-configurable; the
// eval harness (`micasa eval fts`) is the channel for tuning them.
//
// FTS5's BM25 rank is negative for every match, and a more negative rank
// means a more relevant row. A row is kept only when its rank is strictly
// less than ftsEntityRankCeiling; rows with rank >= the ceiling are
// dropped. The initial ceiling of 0.0 is deliberately permissive so that
// any match passes; the eval is expected to tighten it once there are
// real measurements of noise versus signal.

// ftsEntityKPerType is the maximum number of rows returned per entity_type.
const ftsEntityKPerType = 5

// ftsEntityRankCeiling is the exclusive upper bound on an accepted rank.
const ftsEntityRankCeiling = 0.0

// ftsEntityTotalCap is the final cap on rows across all entity_types.
const ftsEntityTotalCap = 20
39+
2540// DocumentSearchResult holds a single FTS5 match with metadata for display.
2641type DocumentSearchResult struct {
2742 ID string
@@ -157,6 +172,11 @@ func (s *Store) SearchDocuments(query string) ([]DocumentSearchResult, error) {
157172// literal text, not operators -- the search box is type-as-you-go and
158173// partial operator syntax mid-keystroke would otherwise error.
159174//
175+ // This is the right default for long-form search (documents): every
176+ // word in a multi-word query should contribute. For short, structured
177+ // entity records see prepareFTSEntityQuery, which OR-joins content
178+ // words after stripping stopwords.
179+ //
160180// See https://sqlite.org/forum/info/82344cab7c5806980b287ce008975c6585d510e95ac7199de398ff9051ae0907
161181func prepareFTSQuery (query string ) string {
162182 fields := strings .Fields (query )
@@ -167,6 +187,74 @@ func prepareFTSQuery(query string) string {
167187 return strings .Join (out , " " )
168188}
169189
// entityFTSStopwords is the set of high-frequency English words that
// prepareFTSEntityQuery discards before building a MATCH expression,
// because they rarely carry semantic weight against entity names and
// notes. The set is kept small on purpose: the surviving words are
// OR-joined, so a content word such as "kitchen" must remain when the
// user writes "what's the status of the kitchen project?".
//
// Keys are lowercase with punctuation already removed (hence "whats"
// rather than "what's"), matching the normalization applied to query
// tokens before lookup.
var entityFTSStopwords = func() map[string]bool {
	words := []string{
		// articles and prepositions
		"a", "an", "the", "of", "in", "on", "at", "to", "for", "by",
		"with", "about",
		// auxiliary and modal verbs
		"is", "are", "was", "were", "be", "been", "being",
		"have", "has", "had", "do", "does", "did",
		"will", "would", "should", "could",
		// question words
		"what", "whats", "when", "where", "who", "why", "how",
		// demonstratives
		"that", "this", "these", "those",
		// conjunctions and negation
		"and", "or", "but", "not", "no",
		// pronouns and possessives
		"it", "its", "i", "my", "me", "you", "your", "we", "our",
		// quantifiers
		"any", "some", "all",
	}
	set := make(map[string]bool, len(words))
	for _, w := range words {
		set[w] = true
	}
	return set
}()
209+
210+ // prepareFTSEntityQuery builds an FTS5 MATCH expression suited to
211+ // short entity records and natural-language user questions. Unlike
212+ // prepareFTSQuery (which AND-joins), this OR-joins the survivors so a
213+ // question like "what's the status of the kitchen project?" isn't
214+ // zero-matched just because "what's" and "the" don't appear in any
215+ // entity's indexed text.
216+ //
217+ // Filtering steps, applied per token:
218+ // 1. Lowercase.
219+ // 2. Strip non-letter/digit runes so "project?" becomes "project".
220+ // 3. Drop tokens shorter than 2 runes (removes stray punctuation and
221+ // single-char noise).
222+ // 4. Drop entityFTSStopwords.
223+ //
224+ // Survivors become quoted prefix phrases joined with ` OR `. Ranking
225+ // is BM25 via FTS5's default, and the ftsEntityRankCeiling caller-side
226+ // threshold trims low-quality matches.
227+ //
228+ // Returns "" when no content words survive; callers treat that the
229+ // same way as an empty user query.
230+ func prepareFTSEntityQuery (query string ) string {
231+ fields := strings .Fields (query )
232+ var tokens []string
233+ for _ , w := range fields {
234+ normalized := strings .Map (keepAlphaNum , strings .ToLower (w ))
235+ if len (normalized ) < 2 {
236+ continue
237+ }
238+ if entityFTSStopwords [normalized ] {
239+ continue
240+ }
241+ tokens = append (tokens , `"` + normalized + `"*` )
242+ }
243+ return strings .Join (tokens , " OR " )
244+ }
245+
// keepAlphaNum is a mapping function for strings.Map: it returns r
// unchanged when r is a Unicode letter or digit, and -1 (which tells
// strings.Map to drop the rune) for anything else. prepareFTSEntityQuery
// uses it to strip punctuation glued to words ("project?" -> "project").
// It is a named package-level function so it can be passed to
// strings.Map directly instead of building a function literal per call.
func keepAlphaNum(r rune) rune {
	switch {
	case unicode.IsLetter(r), unicode.IsDigit(r):
		return r
	default:
		return -1
	}
}
257+
170258// RebuildFTSIndex forces a full rebuild of both FTS5 indexes (documents
171259// and entities). Useful after bulk imports or data recovery.
172260func (s * Store ) RebuildFTSIndex () error {
@@ -593,16 +681,91 @@ func (s *Store) SearchEntities(query string) ([]EntitySearchResult, error) {
593681 return nil , nil
594682 }
595683
596- safeQuery := prepareFTSQuery (query )
684+ // Entity records are short and user questions are conversational.
685+ // prepareFTSEntityQuery drops stopwords and OR-joins content words
686+ // so "what's the status of the kitchen project?" actually matches
687+ // rows containing "kitchen" even though "what's" and "the" don't.
688+ // Returns "" when every token is a stopword or punctuation.
689+ safeQuery := prepareFTSEntityQuery (query )
690+ if safeQuery == "" {
691+ return nil , nil
692+ }
597693
694+ // Per-type quota and BM25 threshold via window functions. Prevents
695+ // one noisy entity type from crowding out the others, and drops
696+ // matches below the rank ceiling.
697+ //
698+ // Three tiers so that:
699+ // - Every matching type gets at least one slot (tier 1: rn == 1 per
700+ // type, i.e. that type's strongest match).
701+ // - Each type can then fill up to ftsEntityKPerType (tier 2: rows
702+ // with 2 <= rn <= K).
703+ // - Remaining room up to ftsEntityTotalCap is filled globally from
704+ // whatever's left (tier 3: rn > K).
705+ //
706+ // The outer LIMIT ftsEntityTotalCap still applies across the union,
707+ // so even with many matching types the total is bounded. Because
708+ // tier 1 takes exactly one row per type first, every matching type
709+ // is represented as long as the number of matching types does not
710+ // exceed ftsEntityTotalCap.
711+ //
712+ // entity_id tiebreaks rank in every ORDER BY so results are stable
713+ // when BM25 scores collide across similarly-shaped rows.
598714 var results []EntitySearchResult
599715 err := s .db .Raw (fmt .Sprintf (`
716+ WITH matches AS (
717+ SELECT entity_type, entity_id, entity_name, rank,
718+ ROW_NUMBER() OVER (
719+ PARTITION BY entity_type
720+ ORDER BY rank, entity_id
721+ ) AS rn
722+ FROM %s
723+ WHERE %s MATCH ? AND rank < ?
724+ ),
725+ tier1 AS (
726+ -- one row per matching type: each type's strongest match.
727+ SELECT entity_type, entity_id, entity_name, rank
728+ FROM matches
729+ WHERE rn = 1
730+ ),
731+ tier2 AS (
732+ -- up to K per type after tier 1.
733+ SELECT entity_type, entity_id, entity_name, rank
734+ FROM matches
735+ WHERE rn > 1 AND rn <= ?
736+ ORDER BY rank, entity_type, entity_id
737+ LIMIT MAX(
738+ ? - (SELECT COUNT(*) FROM tier1),
739+ 0
740+ )
741+ ),
742+ tier3 AS (
743+ -- fill remaining global slots after per-type quotas are done.
744+ SELECT entity_type, entity_id, entity_name, rank
745+ FROM matches
746+ WHERE rn > ?
747+ ORDER BY rank, entity_type, entity_id
748+ LIMIT MAX(
749+ ? - (SELECT COUNT(*) FROM tier1) - (SELECT COUNT(*) FROM tier2),
750+ 0
751+ )
752+ )
600753 SELECT entity_type, entity_id, entity_name, rank
601- FROM %s
602- WHERE %s MATCH ?
754+ FROM (
755+ SELECT * FROM tier1
756+ UNION ALL
757+ SELECT * FROM tier2
758+ UNION ALL
759+ SELECT * FROM tier3
760+ )
603761 ORDER BY rank, entity_type, entity_id
604- LIMIT 20
605- ` , tableEntitiesFTS , tableEntitiesFTS ), safeQuery ).
762+ LIMIT ?
763+ ` , tableEntitiesFTS , tableEntitiesFTS ),
764+ safeQuery ,
765+ ftsEntityRankCeiling ,
766+ ftsEntityKPerType , ftsEntityTotalCap ,
767+ ftsEntityKPerType , ftsEntityTotalCap ,
768+ ftsEntityTotalCap ).
606769 Scan (& results ).Error
607770 if err != nil {
608771 return nil , fmt .Errorf ("search entities: %w" , err )
0 commit comments