Skip to content

Commit 1a143bd

Browse files
committed
feat(data): per-type quotas and rank threshold in SearchEntities
Replaces the flat LIMIT 20 with a three-tier window-function query:

- Tier 1 takes exactly one row per matching entity type (guarantees cross-type representation: every type with a match surfaces at least once).
- Tier 2 raises each type up to ftsEntityKPerType rows, so single noisy types can't dominate.
- Tier 3 fills any remaining room up to ftsEntityTotalCap from whatever's left, globally ranked. Single-type searches use the full cap this way.

Package-level tuning constants (not user-configurable -- the eval harness is the tuning channel):

    ftsEntityKPerType    = 5
    ftsEntityRankCeiling = 0.0 // permissive; eval will tighten
    ftsEntityTotalCap    = 20

entity_id tiebreaks rank in every ORDER BY so results are stable when BM25 produces identical ranks on similarly-shaped rows.

Tests cover: per-type quota guarantees a second-class match survives a flood of first-class matches; single-type searches return more than ftsEntityKPerType rows; every matching type appears when 5+ types share a token; total cap holds; rank threshold plumbing is wired; same query returns stable IDs across runs.

Refs #707.
1 parent 6cca13a commit 1a143bd

2 files changed

Lines changed: 327 additions & 22 deletions

File tree

internal/data/fts.go

Lines changed: 85 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,20 @@ const (
2222
tableEntitiesFTS = "entities_fts"
2323
)
2424

25+
// SearchEntities tuning knobs. Not user-configurable; the eval harness
26+
// (`micasa eval fts`) is the channel for tuning these.
27+
//
28+
// FTS5's BM25 rank is negative for every match (more negative = more
29+
// relevant). `ftsEntityRankCeiling` is the ceiling a match must fall
30+
// strictly below; rows with `rank >= ceiling` are dropped. Initial value 0.0 is
31+
// deliberately permissive so any match passes; the eval will tighten
32+
// this once we have real measurements of noise vs signal.
33+
const (
34+
ftsEntityKPerType = 5 // per-type quota filled before the global fill; a type may exceed it when total-cap slots remain
35+
ftsEntityRankCeiling = 0.0 // keep only rows with rank strictly less than this
36+
ftsEntityTotalCap = 20 // final cap across all entity_types
37+
)
38+
2539
// DocumentSearchResult holds a single FTS5 match with metadata for display.
2640
type DocumentSearchResult struct {
2741
ID string
@@ -592,14 +606,81 @@ func (s *Store) SearchEntities(query string) ([]EntitySearchResult, error) {
592606

593607
safeQuery := prepareFTSQuery(query)
594608

609+
// Per-type quota and BM25 threshold via window functions. Prevents
610+
// one noisy entity type from crowding out the others, and drops
611+
// matches below the rank ceiling.
612+
//
613+
// Three tiers so that:
614+
// - Every matching type gets at least one slot (tier 1: rn == 1 per
615+
// type, i.e. that type's strongest match).
616+
// - Each type can then fill up to ftsEntityKPerType (tier 2: rows
617+
// with 2 <= rn <= K).
618+
// - Remaining room up to ftsEntityTotalCap is filled globally from
619+
// whatever's left (tier 3: rn > K).
620+
//
621+
// The outer LIMIT ftsEntityTotalCap still applies across the union,
622+
// so even with many matching types the total is bounded. Because
623+
// tier 1 takes exactly one row per type first, every matching type
624+
// is represented as long as the number of matching types does not
625+
// exceed ftsEntityTotalCap.
626+
//
627+
// entity_id tiebreaks rank in every ORDER BY so results are stable
628+
// when BM25 scores collide across similarly-shaped rows.
595629
var results []EntitySearchResult
596630
err := s.db.Raw(fmt.Sprintf(`
631+
WITH matches AS (
632+
SELECT entity_type, entity_id, entity_name, rank,
633+
ROW_NUMBER() OVER (
634+
PARTITION BY entity_type
635+
ORDER BY rank, entity_id
636+
) AS rn
637+
FROM %s
638+
WHERE %s MATCH ? AND rank < ?
639+
),
640+
tier1 AS (
641+
-- one row per matching type: each type's strongest match.
642+
SELECT entity_type, entity_id, entity_name, rank
643+
FROM matches
644+
WHERE rn = 1
645+
),
646+
tier2 AS (
647+
-- up to K per type after tier 1.
648+
SELECT entity_type, entity_id, entity_name, rank
649+
FROM matches
650+
WHERE rn > 1 AND rn <= ?
651+
ORDER BY rank, entity_type, entity_id
652+
LIMIT MAX(
653+
? - (SELECT COUNT(*) FROM tier1),
654+
0
655+
)
656+
),
657+
tier3 AS (
658+
-- fill remaining global slots after per-type quotas are done.
659+
SELECT entity_type, entity_id, entity_name, rank
660+
FROM matches
661+
WHERE rn > ?
662+
ORDER BY rank, entity_type, entity_id
663+
LIMIT MAX(
664+
? - (SELECT COUNT(*) FROM tier1) - (SELECT COUNT(*) FROM tier2),
665+
0
666+
)
667+
)
597668
SELECT entity_type, entity_id, entity_name, rank
598-
FROM %s
599-
WHERE %s MATCH ?
669+
FROM (
670+
SELECT * FROM tier1
671+
UNION ALL
672+
SELECT * FROM tier2
673+
UNION ALL
674+
SELECT * FROM tier3
675+
)
600676
ORDER BY rank, entity_type, entity_id
601-
LIMIT 20
602-
`, tableEntitiesFTS, tableEntitiesFTS), safeQuery).
677+
LIMIT ?
678+
`, tableEntitiesFTS, tableEntitiesFTS),
679+
safeQuery,
680+
ftsEntityRankCeiling,
681+
ftsEntityKPerType, ftsEntityTotalCap,
682+
ftsEntityKPerType, ftsEntityTotalCap,
683+
ftsEntityTotalCap).
603684
Scan(&results).Error
604685
if err != nil {
605686
return nil, fmt.Errorf("search entities: %w", err)

internal/data/fts_test.go

Lines changed: 242 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
package data
55

66
import (
7+
"fmt"
78
"strings"
89
"testing"
910
"time"
@@ -923,37 +924,205 @@ func TestFTSTriggerCascadeOnProjectSoftDelete(t *testing.T) {
923924
assert.Empty(t, attic, "soft-deleted project title should not surface via any entity")
924925
}
925926

926-
func TestFTSTriggerHardDeleteMaintenanceCascadesSLE(t *testing.T) {
927+
// ---------------------------------------------------------------------------
928+
// Per-type quota and rank threshold tests (ftsEntityKPerType,
929+
// ftsEntityRankCeiling, ftsEntityTotalCap).
930+
// ---------------------------------------------------------------------------
931+
932+
func TestFTSSearchEntitiesPerTypeQuotaGuaranteesRepresentation(t *testing.T) {
927933
t.Parallel()
928934
store := newTestStore(t)
929935

930-
cats, err := store.MaintenanceCategories()
936+
// Insert many projects that all share a strong matching token. With
937+
// no per-type quota, the lone matching vendor below would drop off
938+
// the bottom as projects dominate the top of the ranking. The quota
939+
// guarantees at least one vendor slot; remaining space is still
940+
// filled from the global top.
941+
types, _ := store.ProjectTypes()
942+
const projectCount = 10
943+
for i := range projectCount {
944+
require.NoError(t, store.CreateProject(&Project{
945+
Title: fmt.Sprintf("Sawmill Project %d", i),
946+
ProjectTypeID: types[0].ID,
947+
Status: ProjectStatusPlanned,
948+
}))
949+
}
950+
951+
// Single vendor matching the same token. Without the quota this
952+
// would be at rank position 11 behind all 10 projects; with the
953+
// quota tier 1 forces it into the result set.
954+
require.NoError(t, store.CreateVendor(&Vendor{Name: "Sawmill Supplies Co"}))
955+
956+
results, err := store.SearchEntities("sawmill")
931957
require.NoError(t, err)
932-
require.NotEmpty(t, cats)
933958

934-
m := &MaintenanceItem{
935-
Name: "Gutter Cleaning",
936-
CategoryID: cats[0].ID,
937-
IntervalMonths: 12,
959+
var vendorHits int
960+
for _, r := range results {
961+
if r.EntityType == DeletionEntityVendor {
962+
vendorHits++
963+
}
938964
}
939-
require.NoError(t, store.CreateMaintenance(m))
965+
assert.Equal(t, 1, vendorHits,
966+
"vendor must survive the project flood thanks to the per-type quota; got %d", vendorHits)
967+
assert.LessOrEqual(t, len(results), ftsEntityTotalCap,
968+
"total cap must still hold")
969+
}
940970

941-
sle := &ServiceLogEntry{
942-
MaintenanceItemID: m.ID,
943-
ServicedAt: time.Now(),
944-
Notes: "fall cleanup",
971+
func TestFTSSearchEntitiesTotalCap(t *testing.T) {
972+
t.Parallel()
973+
store := newTestStore(t)
974+
975+
// Insert many entities across multiple types so (ftsEntityKPerType *
976+
// number_of_types) > ftsEntityTotalCap. The overall LIMIT must still
977+
// apply.
978+
types, _ := store.ProjectTypes()
979+
for i := range ftsEntityKPerType + 2 {
980+
require.NoError(t, store.CreateProject(&Project{
981+
Title: fmt.Sprintf("Overflow Project %d", i),
982+
ProjectTypeID: types[0].ID,
983+
Status: ProjectStatusPlanned,
984+
}))
985+
}
986+
for i := range ftsEntityKPerType + 2 {
987+
require.NoError(t, store.CreateVendor(&Vendor{
988+
Name: fmt.Sprintf("Overflow Vendor %d", i),
989+
}))
990+
}
991+
for i := range ftsEntityKPerType + 2 {
992+
require.NoError(t, store.CreateAppliance(&Appliance{
993+
Name: fmt.Sprintf("Overflow Appliance %d", i),
994+
}))
995+
}
996+
for i := range ftsEntityKPerType + 2 {
997+
require.NoError(t, store.CreateIncident(&Incident{
998+
Title: fmt.Sprintf("Overflow Incident %d", i),
999+
Status: "open",
1000+
Severity: "low",
1001+
}))
9451002
}
946-
require.NoError(t, store.CreateServiceLog(sle, Vendor{}))
9471003

948-
require.NoError(t, store.HardDeleteMaintenance(m.ID))
1004+
results, err := store.SearchEntities("overflow")
1005+
require.NoError(t, err)
1006+
assert.LessOrEqual(t, len(results), ftsEntityTotalCap,
1007+
"total cap should limit results to %d; got %d", ftsEntityTotalCap, len(results))
1008+
}
9491009

950-
gutterResults, err := store.SearchEntities("gutter")
1010+
func TestFTSSearchEntitiesSingleTypeUsesFullCap(t *testing.T) {
1011+
t.Parallel()
1012+
store := newTestStore(t)
1013+
1014+
// Insert more projects than the per-type quota, with NO other type
1015+
// matching. The earlier flat-quota implementation would clip at
1016+
// ftsEntityKPerType even though 15 other slots were unused; the
1017+
// two-tier implementation should fill up to ftsEntityTotalCap.
1018+
types, _ := store.ProjectTypes()
1019+
const projectCount = ftsEntityKPerType + 3
1020+
for i := range projectCount {
1021+
require.NoError(t, store.CreateProject(&Project{
1022+
Title: fmt.Sprintf("Lakeside Project %d", i),
1023+
ProjectTypeID: types[0].ID,
1024+
Status: ProjectStatusPlanned,
1025+
}))
1026+
}
1027+
1028+
results, err := store.SearchEntities("lakeside")
9511029
require.NoError(t, err)
952-
assert.Empty(t, gutterResults, "maintenance item FTS row should be gone after hard delete")
1030+
assert.GreaterOrEqual(
1031+
t,
1032+
len(results),
1033+
projectCount,
1034+
"single-type search must not be capped at ftsEntityKPerType when no other type competes; got %d",
1035+
len(results),
1036+
)
1037+
assert.LessOrEqual(t, len(results), ftsEntityTotalCap,
1038+
"total cap should still apply")
1039+
}
9531040

954-
fallResults, err := store.SearchEntities("fall")
1041+
func TestFTSSearchEntitiesTiebreakerIsDeterministic(t *testing.T) {
1042+
t.Parallel()
1043+
store := newTestStore(t)
1044+
1045+
// Insert several projects with identical text so BM25 assigns them
1046+
// all the same rank. Run the search twice and assert results come
1047+
// back in the same order -- the window ORDER BY has an entity_id
1048+
// tiebreaker to guarantee this.
1049+
types, _ := store.ProjectTypes()
1050+
const count = ftsEntityKPerType + 2
1051+
for i := range count {
1052+
require.NoError(t, store.CreateProject(&Project{
1053+
Title: fmt.Sprintf("Identical Widget %d", i),
1054+
ProjectTypeID: types[0].ID,
1055+
Status: ProjectStatusPlanned,
1056+
}))
1057+
}
1058+
1059+
first, err := store.SearchEntities("widget")
9551060
require.NoError(t, err)
956-
assert.Empty(t, fallResults, "child SLE FTS row should be gone via FK cascade + _ad trigger")
1061+
require.NotEmpty(t, first)
1062+
second, err := store.SearchEntities("widget")
1063+
require.NoError(t, err)
1064+
require.Equal(t, len(first), len(second), "same query should return same count")
1065+
for i := range first {
1066+
assert.Equal(t, first[i].EntityID, second[i].EntityID,
1067+
"position %d should be stable across runs", i)
1068+
}
1069+
}
1070+
1071+
func TestFTSSearchEntitiesRepresentsEveryMatchingType(t *testing.T) {
1072+
t.Parallel()
1073+
store := newTestStore(t)
1074+
1075+
// Insert enough entities per type that each type's match count
1076+
// would otherwise exceed the per-type quota, so the tier-3 fill
1077+
// has a chance to shadow late types. With the "one row per matching
1078+
// type first" tier, every matching type should still surface.
1079+
types, _ := store.ProjectTypes()
1080+
cats, err := store.MaintenanceCategories()
1081+
require.NoError(t, err)
1082+
require.NotEmpty(t, cats)
1083+
1084+
const perType = ftsEntityKPerType + 3
1085+
for i := range perType {
1086+
require.NoError(t, store.CreateProject(&Project{
1087+
Title: fmt.Sprintf("Signal Project %d", i),
1088+
ProjectTypeID: types[0].ID,
1089+
Status: ProjectStatusPlanned,
1090+
}))
1091+
require.NoError(t, store.CreateVendor(&Vendor{
1092+
Name: fmt.Sprintf("Signal Vendor %d", i),
1093+
}))
1094+
require.NoError(t, store.CreateAppliance(&Appliance{
1095+
Name: fmt.Sprintf("Signal Appliance %d", i),
1096+
}))
1097+
require.NoError(t, store.CreateMaintenance(&MaintenanceItem{
1098+
Name: fmt.Sprintf("Signal Maintenance %d", i),
1099+
CategoryID: cats[0].ID,
1100+
IntervalMonths: 6,
1101+
}))
1102+
require.NoError(t, store.CreateIncident(&Incident{
1103+
Title: fmt.Sprintf("Signal Incident %d", i),
1104+
Status: "open",
1105+
Severity: "low",
1106+
}))
1107+
}
1108+
1109+
results, err := store.SearchEntities("signal")
1110+
require.NoError(t, err)
1111+
1112+
seen := map[string]bool{}
1113+
for _, r := range results {
1114+
seen[r.EntityType] = true
1115+
}
1116+
for _, entity := range []string{
1117+
DeletionEntityProject,
1118+
DeletionEntityVendor,
1119+
DeletionEntityAppliance,
1120+
DeletionEntityMaintenance,
1121+
DeletionEntityIncident,
1122+
} {
1123+
assert.True(t, seen[entity],
1124+
"every matching type must appear at least once; %s missing", entity)
1125+
}
9571126
}
9581127

9591128
func TestFTSPopulateFiltersSoftDeletedMaintenanceInSLEJoin(t *testing.T) {
@@ -1039,3 +1208,58 @@ func TestFTSPopulateFiltersSoftDeletedParentsInQuoteJoin(t *testing.T) {
10391208
}
10401209
}
10411210
}
1211+
1212+
func TestFTSSearchEntitiesRankThresholdFiltersAboveCeiling(t *testing.T) {
1213+
t.Parallel()
1214+
store := newTestStore(t)
1215+
1216+
// Prove the rank threshold infrastructure works: insert a vendor with
1217+
// a known searchable name, then verify that every returned row has
1218+
// `rank < ftsEntityRankCeiling`. The initial ceiling is permissive
1219+
// (0.0) — every BM25 match is negative, so every result passes. Once
1220+
// the eval tightens the ceiling, this test continues to hold.
1221+
require.NoError(t, store.CreateVendor(&Vendor{Name: "Rank Threshold Test Co"}))
1222+
1223+
results, err := store.SearchEntities("threshold")
1224+
require.NoError(t, err)
1225+
require.NotEmpty(t, results, "vendor name should match")
1226+
1227+
for _, r := range results {
1228+
assert.Less(t, r.Rank, ftsEntityRankCeiling,
1229+
"every returned row must have rank < %v; got %q with rank %v",
1230+
ftsEntityRankCeiling, r.EntityName, r.Rank)
1231+
}
1232+
}
1233+
1234+
func TestFTSTriggerHardDeleteMaintenanceCascadesSLE(t *testing.T) {
1235+
t.Parallel()
1236+
store := newTestStore(t)
1237+
1238+
cats, err := store.MaintenanceCategories()
1239+
require.NoError(t, err)
1240+
require.NotEmpty(t, cats)
1241+
1242+
m := &MaintenanceItem{
1243+
Name: "Gutter Cleaning",
1244+
CategoryID: cats[0].ID,
1245+
IntervalMonths: 12,
1246+
}
1247+
require.NoError(t, store.CreateMaintenance(m))
1248+
1249+
sle := &ServiceLogEntry{
1250+
MaintenanceItemID: m.ID,
1251+
ServicedAt: time.Now(),
1252+
Notes: "fall cleanup",
1253+
}
1254+
require.NoError(t, store.CreateServiceLog(sle, Vendor{}))
1255+
1256+
require.NoError(t, store.HardDeleteMaintenance(m.ID))
1257+
1258+
gutterResults, err := store.SearchEntities("gutter")
1259+
require.NoError(t, err)
1260+
assert.Empty(t, gutterResults, "maintenance item FTS row should be gone after hard delete")
1261+
1262+
fallResults, err := store.SearchEntities("fall")
1263+
require.NoError(t, err)
1264+
assert.Empty(t, fallResults, "child SLE FTS row should be gone via FK cascade + _ad trigger")
1265+
}

0 commit comments

Comments
 (0)