Skip to content

Commit ffd7cda

Browse files
committed
feat(data): per-type quotas, rank threshold, NL-tolerant entity FTS
Replaces the flat LIMIT 20 in SearchEntities with a three-tier window-function query and adds natural-language query tolerance.

Ranking:
- Tier 1 takes exactly one row per matching entity type (guarantees cross-type representation).
- Tier 2 raises each type up to ftsEntityKPerType rows so single noisy types can't dominate.
- Tier 3 fills any remaining room up to ftsEntityTotalCap from whatever's left, globally ranked. Single-type searches use the full cap this way.

Package-level tuning constants (not user-configurable -- the eval harness is the tuning channel):

    ftsEntityKPerType    = 5
    ftsEntityRankCeiling = 0.0 // permissive; eval will tighten
    ftsEntityTotalCap    = 20

entity_id tiebreaks rank in every ORDER BY so results are stable when BM25 produces identical ranks on similarly-shaped rows.

Query tolerance:
- prepareFTSEntityQuery lowercases, strips non-alphanum, drops short and stopword tokens, and OR-joins the survivors as quoted prefix phrases.
- Returns early when no content words survive so a pure-stopword question like "what is it?" doesn't hammer FTS with an empty MATCH.

Tests cover per-type quota preservation under a flood of first-class matches, single-type searches using the full cap, every matching type surfacing when 5+ types share a token, total cap enforcement, rank threshold plumbing, stable ordering across runs, the query builder directly, and the end-to-end regression that "what's the status of the kitchen project?" now surfaces the Kitchen Remodel project.

Refs #707.
1 parent c5f3f84 commit ffd7cda

2 files changed

Lines changed: 499 additions & 23 deletions

File tree

internal/data/fts.go

Lines changed: 166 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"fmt"
99
"strings"
1010
"time"
11+
"unicode"
1112

1213
"gorm.io/gorm"
1314
)
@@ -22,6 +23,20 @@ const (
2223
tableEntitiesFTS = "entities_fts"
2324
)
2425

26+
// Tuning constants for SearchEntities. They are deliberately not exposed
// as user configuration; the eval harness (`micasa eval fts`) is the
// channel for adjusting them.
//
// FTS5's BM25 rank is negative for every match, and a more negative rank
// means a more relevant row. A row is kept only when its rank is strictly
// less than ftsEntityRankCeiling; rows with `rank >= ceiling` are dropped.
// The initial value of 0.0 is intentionally permissive so that any match
// passes; the eval will tighten it once there are real measurements of
// noise vs signal.
const (
	// ftsEntityKPerType is the maximum number of rows returned per
	// entity_type.
	ftsEntityKPerType = 5

	// ftsEntityRankCeiling is the exclusive upper bound on rank: only
	// rows with rank strictly less than this value are kept.
	ftsEntityRankCeiling = 0.0

	// ftsEntityTotalCap is the final cap on rows across all entity_types.
	ftsEntityTotalCap = 20
)
39+
2540
// DocumentSearchResult holds a single FTS5 match with metadata for display.
2641
type DocumentSearchResult struct {
2742
ID string
@@ -157,6 +172,11 @@ func (s *Store) SearchDocuments(query string) ([]DocumentSearchResult, error) {
157172
// literal text, not operators -- the search box is type-as-you-go and
158173
// partial operator syntax mid-keystroke would otherwise error.
159174
//
175+
// This is the right default for long-form search (documents): every
176+
// word in a multi-word query should contribute. For short, structured
177+
// entity records see prepareFTSEntityQuery, which OR-joins content
178+
// words after stripping stopwords.
179+
//
160180
// See https://sqlite.org/forum/info/82344cab7c5806980b287ce008975c6585d510e95ac7199de398ff9051ae0907
161181
func prepareFTSQuery(query string) string {
162182
fields := strings.Fields(query)
@@ -167,6 +187,72 @@ func prepareFTSQuery(query string) string {
167187
return strings.Join(out, " ")
168188
}
169189

190+
// entityFTSStopwords is the set of high-frequency English words that are
// removed from entity search queries because they rarely carry semantic
// weight against entity names and notes. Keys are lowercase with
// punctuation already removed (hence "whats" rather than "what's").
//
// The set is kept small on purpose: prepareFTSEntityQuery OR-joins
// whatever survives, so a content word such as "kitchen" must remain
// when the user writes "what's the status of the kitchen project?".
var entityFTSStopwords = map[string]bool{
	// Articles.
	"a":   true,
	"an":  true,
	"the": true,

	// Prepositions.
	"of":    true,
	"in":    true,
	"on":    true,
	"at":    true,
	"to":    true,
	"for":   true,
	"by":    true,
	"with":  true,
	"about": true,

	// Forms of "to be".
	"is":    true,
	"are":   true,
	"was":   true,
	"were":  true,
	"be":    true,
	"been":  true,
	"being": true,

	// Auxiliary and modal verbs.
	"have":   true,
	"has":    true,
	"had":    true,
	"do":     true,
	"does":   true,
	"did":    true,
	"will":   true,
	"would":  true,
	"should": true,
	"could":  true,

	// Question words.
	"what":  true,
	"whats": true,
	"when":  true,
	"where": true,
	"who":   true,
	"why":   true,
	"how":   true,

	// Demonstratives.
	"that":  true,
	"this":  true,
	"these": true,
	"those": true,

	// Conjunctions and negation.
	"and": true,
	"or":  true,
	"but": true,
	"not": true,
	"no":  true,

	// Pronouns and possessives.
	"it":   true,
	"its":  true,
	"i":    true,
	"my":   true,
	"me":   true,
	"you":  true,
	"your": true,
	"we":   true,
	"our":  true,

	// Quantifiers.
	"any":  true,
	"some": true,
	"all":  true,
}
209+
210+
// prepareFTSEntityQuery builds an FTS5 MATCH expression suited to
211+
// short entity records and natural-language user questions. Unlike
212+
// prepareFTSQuery (which AND-joins), this OR-joins the survivors so a
213+
// question like "what's the status of the kitchen project?" isn't
214+
// zero-matched just because "what's" and "the" don't appear in any
215+
// entity's indexed text.
216+
//
217+
// Filtering steps, applied per token:
218+
// 1. Lowercase.
219+
// 2. Strip non-letter/digit runes so "project?" becomes "project".
220+
// 3. Drop tokens shorter than 2 runes (removes stray punctuation and
221+
// single-char noise).
222+
// 4. Drop entityFTSStopwords.
223+
//
224+
// Survivors become quoted prefix phrases joined with ` OR `. Ranking
225+
// is BM25 via FTS5's default, and the ftsEntityRankCeiling caller-side
226+
// threshold trims low-quality matches.
227+
//
228+
// Returns "" when no content words survive; callers treat that the
229+
// same way as an empty user query.
230+
func prepareFTSEntityQuery(query string) string {
231+
fields := strings.Fields(query)
232+
var tokens []string
233+
for _, w := range fields {
234+
normalized := strings.Map(keepAlphaNum, strings.ToLower(w))
235+
if len(normalized) < 2 {
236+
continue
237+
}
238+
if entityFTSStopwords[normalized] {
239+
continue
240+
}
241+
tokens = append(tokens, `"`+normalized+`"*`)
242+
}
243+
return strings.Join(tokens, " OR ")
244+
}
245+
246+
// keepAlphaNum is a mapping function for strings.Map: letters and digits
// pass through unchanged, and every other rune maps to -1, which tells
// strings.Map to drop it. prepareFTSEntityQuery uses it to peel
// punctuation off words ("project?" -> "project").
func keepAlphaNum(r rune) rune {
	switch {
	case unicode.IsLetter(r), unicode.IsDigit(r):
		return r
	default:
		return -1
	}
}
255+
170256
// RebuildFTSIndex forces a full rebuild of both FTS5 indexes (documents
171257
// and entities). Useful after bulk imports or data recovery.
172258
func (s *Store) RebuildFTSIndex() error {
@@ -593,16 +679,91 @@ func (s *Store) SearchEntities(query string) ([]EntitySearchResult, error) {
593679
return nil, nil
594680
}
595681

596-
safeQuery := prepareFTSQuery(query)
682+
// Entity records are short and user questions are conversational.
683+
// prepareFTSEntityQuery drops stopwords and OR-joins content words
684+
// so "what's the status of the kitchen project?" actually matches
685+
// rows containing "kitchen" even though "what's" and "the" don't.
686+
// Returns "" when every token is a stopword or punctuation.
687+
safeQuery := prepareFTSEntityQuery(query)
688+
if safeQuery == "" {
689+
return nil, nil
690+
}
597691

692+
// Per-type quota and BM25 threshold via window functions. Prevents
693+
// one noisy entity type from crowding out the others, and drops
694+
// matches at or above the rank ceiling.
695+
//
696+
// Three tiers so that:
697+
// - Every matching type gets at least one slot (tier 1: rn == 1 per
698+
// type, i.e. that type's strongest match).
699+
// - Each type can then fill up to ftsEntityKPerType (tier 2: rows
700+
// with 2 <= rn <= K).
701+
// - Remaining room up to ftsEntityTotalCap is filled globally from
702+
// whatever's left (tier 3: rn > K).
703+
//
704+
// The outer LIMIT ftsEntityTotalCap still applies across the union,
705+
// so even with many matching types the total is bounded. Because
706+
// tier 1 takes exactly one row per type first, every matching type
707+
// is represented as long as the number of matching types does not
708+
// exceed ftsEntityTotalCap.
709+
//
710+
// entity_id tiebreaks rank in every ORDER BY so results are stable
711+
// when BM25 scores collide across similarly-shaped rows.
598712
var results []EntitySearchResult
599713
err := s.db.Raw(fmt.Sprintf(`
714+
WITH matches AS (
715+
SELECT entity_type, entity_id, entity_name, rank,
716+
ROW_NUMBER() OVER (
717+
PARTITION BY entity_type
718+
ORDER BY rank, entity_id
719+
) AS rn
720+
FROM %s
721+
WHERE %s MATCH ? AND rank < ?
722+
),
723+
tier1 AS (
724+
-- one row per matching type: each type's strongest match.
725+
SELECT entity_type, entity_id, entity_name, rank
726+
FROM matches
727+
WHERE rn = 1
728+
),
729+
tier2 AS (
730+
-- up to K per type after tier 1.
731+
SELECT entity_type, entity_id, entity_name, rank
732+
FROM matches
733+
WHERE rn > 1 AND rn <= ?
734+
ORDER BY rank, entity_type, entity_id
735+
LIMIT MAX(
736+
? - (SELECT COUNT(*) FROM tier1),
737+
0
738+
)
739+
),
740+
tier3 AS (
741+
-- fill remaining global slots after per-type quotas are done.
742+
SELECT entity_type, entity_id, entity_name, rank
743+
FROM matches
744+
WHERE rn > ?
745+
ORDER BY rank, entity_type, entity_id
746+
LIMIT MAX(
747+
? - (SELECT COUNT(*) FROM tier1) - (SELECT COUNT(*) FROM tier2),
748+
0
749+
)
750+
)
600751
SELECT entity_type, entity_id, entity_name, rank
601-
FROM %s
602-
WHERE %s MATCH ?
752+
FROM (
753+
SELECT * FROM tier1
754+
UNION ALL
755+
SELECT * FROM tier2
756+
UNION ALL
757+
SELECT * FROM tier3
758+
)
603759
ORDER BY rank, entity_type, entity_id
604-
LIMIT 20
605-
`, tableEntitiesFTS, tableEntitiesFTS), safeQuery).
760+
LIMIT ?
761+
`, tableEntitiesFTS, tableEntitiesFTS),
762+
safeQuery,
763+
ftsEntityRankCeiling,
764+
ftsEntityKPerType, ftsEntityTotalCap,
765+
ftsEntityKPerType, ftsEntityTotalCap,
766+
ftsEntityTotalCap).
606767
Scan(&results).Error
607768
if err != nil {
608769
return nil, fmt.Errorf("search entities: %w", err)

0 commit comments

Comments
 (0)