feat(data): per-type quotas and rank threshold in SearchEntities

cpcloud · cpcloud · commit 1a143bdfdc0b · 2026-04-19T07:14:43.000-04:00
Replaces the flat LIMIT 20 with a three-tier window-function query: - Tier 1 takes exactly one row per matching entity type (guarantees cross-type representation: every type with a match surfaces at least once). - Tier 2 raises each type up to ftsEntityKPerType rows, so single noisy types can't dominate. - Tier 3 fills any remaining room up to ftsEntityTotalCap from whatever's left, globally ranked. Single-type searches use the full cap this way. Package-level tuning constants (not user-configurable -- the eval harness is the tuning channel): ftsEntityKPerType = 5 ftsEntityRankCeiling = 0.0 // permissive; eval will tighten ftsEntityTotalCap = 20 entity_id tiebreaks rank in every ORDER BY so results are stable when BM25 produces identical ranks on similarly-shaped rows. Tests cover: per-type quota guarantees a second-class match survives a flood of first-class matches; single-type searches return more than ftsEntityKPerType rows; every matching type appears when 5+ types share a token; total cap holds; rank threshold plumbing is wired; same query returns stable IDs across runs. Refs #707.
diff --git a/internal/data/fts.go b/internal/data/fts.go
@@ -22,6 +22,20 @@ const (
 	tableEntitiesFTS = "entities_fts"
 )
 
+// SearchEntities tuning knobs. Not user-configurable; the eval harness
+// (`micasa eval fts`) is the channel for tuning these.
+//
+// FTS5's BM25 rank is negative for every match (more negative = more
+// relevant). `ftsEntityRankCeiling` is an exclusive upper bound: a match
+// counts only if `rank < ceiling`, and rows with `rank >= ceiling` are
+// dropped. The initial value 0.0 is deliberately permissive so any match
+// passes; the eval will tighten this once noise vs signal is measured.
+const (
+	ftsEntityKPerType    = 5   // per-type quota (tiers 1-2); tier 3 may add more rows of a type
+	ftsEntityRankCeiling = 0.0 // keep only rows with rank strictly less than this
+	ftsEntityTotalCap    = 20  // final cap across all entity_types
+)
+
 // DocumentSearchResult holds a single FTS5 match with metadata for display.
 type DocumentSearchResult struct {
 	ID         string
@@ -592,14 +606,81 @@ func (s *Store) SearchEntities(query string) ([]EntitySearchResult, error) {
 
 	safeQuery := prepareFTSQuery(query)
 
+	// Per-type quota and BM25 threshold via window functions. Prevents
+	// one noisy entity type from crowding out the others, and drops
+	// matches whose rank is not strictly below the rank ceiling.
+	//
+	// Three tiers so that:
+	//   - Every matching type gets at least one slot (tier 1: rn == 1 per
+	//     type, i.e. that type's strongest match).
+	//   - Each type can then fill up to ftsEntityKPerType (tier 2: rows
+	//     with 2 <= rn <= K).
+	//   - Remaining room up to ftsEntityTotalCap is filled globally from
+	//     whatever's left (tier 3: rn > K).
+	//
+	// The outer LIMIT ftsEntityTotalCap still applies across the union,
+	// so even with many matching types the total is bounded. Because
+	// tier 1 takes exactly one row per type first, every matching type
+	// is represented as long as the number of matching types does not
+	// exceed ftsEntityTotalCap.
+	//
+	// entity_id tiebreaks rank in every ORDER BY so results are stable
+	// when BM25 scores collide across similarly-shaped rows.
 	var results []EntitySearchResult
 	err := s.db.Raw(fmt.Sprintf(`
+		WITH matches AS (
+			SELECT entity_type, entity_id, entity_name, rank,
+			       ROW_NUMBER() OVER (
+			           PARTITION BY entity_type
+			           ORDER BY rank, entity_id
+			       ) AS rn
+			FROM %s
+			WHERE %s MATCH ? AND rank < ?
+		),
+		tier1 AS (
+			-- one row per matching type: each type's strongest match.
+			SELECT entity_type, entity_id, entity_name, rank
+			FROM matches
+			WHERE rn = 1
+		),
+		tier2 AS (
+			-- up to K per type after tier 1.
+			SELECT entity_type, entity_id, entity_name, rank
+			FROM matches
+			WHERE rn > 1 AND rn <= ?
+			ORDER BY rank, entity_type, entity_id
+			LIMIT MAX(
+			    ? - (SELECT COUNT(*) FROM tier1),
+			    0
+			)
+		),
+		tier3 AS (
+			-- fill remaining global slots after per-type quotas are done.
+			SELECT entity_type, entity_id, entity_name, rank
+			FROM matches
+			WHERE rn > ?
+			ORDER BY rank, entity_type, entity_id
+			LIMIT MAX(
+			    ? - (SELECT COUNT(*) FROM tier1) - (SELECT COUNT(*) FROM tier2),
+			    0
+			)
+		)
 		SELECT entity_type, entity_id, entity_name, rank
-		FROM %s
-		WHERE %s MATCH ?
+		FROM (
+			SELECT * FROM tier1
+			UNION ALL
+			SELECT * FROM tier2
+			UNION ALL
+			SELECT * FROM tier3
+		)
 		ORDER BY rank, entity_type, entity_id
-		LIMIT 20
-	`, tableEntitiesFTS, tableEntitiesFTS), safeQuery).
+		LIMIT ?
+	`, tableEntitiesFTS, tableEntitiesFTS),
+		safeQuery,
+		ftsEntityRankCeiling,
+		ftsEntityKPerType, ftsEntityTotalCap,
+		ftsEntityKPerType, ftsEntityTotalCap,
+		ftsEntityTotalCap).
 		Scan(&results).Error
 	if err != nil {
 		return nil, fmt.Errorf("search entities: %w", err)
diff --git a/internal/data/fts_test.go b/internal/data/fts_test.go
@@ -4,6 +4,7 @@
 package data
 
 import (
+	"fmt"
 	"strings"
 	"testing"
 	"time"
@@ -923,37 +924,205 @@ func TestFTSTriggerCascadeOnProjectSoftDelete(t *testing.T) {
 	assert.Empty(t, attic, "soft-deleted project title should not surface via any entity")
 }
 
-func TestFTSTriggerHardDeleteMaintenanceCascadesSLE(t *testing.T) {
+// ---------------------------------------------------------------------------
+// Per-type quota and rank threshold tests (ftsEntityKPerType,
+// ftsEntityRankCeiling, ftsEntityTotalCap).
+// ---------------------------------------------------------------------------
+
+func TestFTSSearchEntitiesPerTypeQuotaGuaranteesRepresentation(t *testing.T) {
 	t.Parallel()
 	store := newTestStore(t)
 
-	cats, err := store.MaintenanceCategories()
+	// Insert many projects that all share a strong matching token. With
+	// no per-type quota, the lone matching vendor below would drop off
+	// the bottom as projects dominate the top of the ranking. The quota
+	// guarantees at least one vendor slot; remaining space is still
+	// filled from the global top.
+	types, _ := store.ProjectTypes()
+	const projectCount = 10
+	for i := range projectCount {
+		require.NoError(t, store.CreateProject(&Project{
+			Title:         fmt.Sprintf("Sawmill Project %d", i),
+			ProjectTypeID: types[0].ID,
+			Status:        ProjectStatusPlanned,
+		}))
+	}
+
+	// Single vendor matching the same token; tier 1 forces it into the
+	// result set. NOTE(review): 10 projects + 1 vendor also fit under the
+	// old flat LIMIT 20; use projectCount >= ftsEntityTotalCap to prove it.
+	require.NoError(t, store.CreateVendor(&Vendor{Name: "Sawmill Supplies Co"}))
+
+	results, err := store.SearchEntities("sawmill")
 	require.NoError(t, err)
-	require.NotEmpty(t, cats)
 
-	m := &MaintenanceItem{
-		Name:           "Gutter Cleaning",
-		CategoryID:     cats[0].ID,
-		IntervalMonths: 12,
+	var vendorHits int
+	for _, r := range results {
+		if r.EntityType == DeletionEntityVendor {
+			vendorHits++
+		}
 	}
-	require.NoError(t, store.CreateMaintenance(m))
+	assert.Equal(t, 1, vendorHits,
+		"vendor must survive the project flood thanks to the per-type quota; got %d", vendorHits)
+	assert.LessOrEqual(t, len(results), ftsEntityTotalCap,
+		"total cap must still hold")
+}
 
-	sle := &ServiceLogEntry{
-		MaintenanceItemID: m.ID,
-		ServicedAt:        time.Now(),
-		Notes:             "fall cleanup",
+func TestFTSSearchEntitiesTotalCap(t *testing.T) {
+	t.Parallel()
+	store := newTestStore(t)
+
+	// Insert ftsEntityKPerType+2 entities for each of four types so the
+	// total number of matches (28 with the current constants) exceeds
+	// ftsEntityTotalCap. The overall LIMIT must still apply.
+	types, _ := store.ProjectTypes()
+	for i := range ftsEntityKPerType + 2 {
+		require.NoError(t, store.CreateProject(&Project{
+			Title:         fmt.Sprintf("Overflow Project %d", i),
+			ProjectTypeID: types[0].ID,
+			Status:        ProjectStatusPlanned,
+		}))
+	}
+	for i := range ftsEntityKPerType + 2 {
+		require.NoError(t, store.CreateVendor(&Vendor{
+			Name: fmt.Sprintf("Overflow Vendor %d", i),
+		}))
+	}
+	for i := range ftsEntityKPerType + 2 {
+		require.NoError(t, store.CreateAppliance(&Appliance{
+			Name: fmt.Sprintf("Overflow Appliance %d", i),
+		}))
+	}
+	for i := range ftsEntityKPerType + 2 {
+		require.NoError(t, store.CreateIncident(&Incident{
+			Title:    fmt.Sprintf("Overflow Incident %d", i),
+			Status:   "open",
+			Severity: "low",
+		}))
 	}
-	require.NoError(t, store.CreateServiceLog(sle, Vendor{}))
 
-	require.NoError(t, store.HardDeleteMaintenance(m.ID))
+	results, err := store.SearchEntities("overflow")
+	require.NoError(t, err)
+	assert.LessOrEqual(t, len(results), ftsEntityTotalCap,
+		"total cap should limit results to %d; got %d", ftsEntityTotalCap, len(results))
+}
 
-	gutterResults, err := store.SearchEntities("gutter")
+func TestFTSSearchEntitiesSingleTypeUsesFullCap(t *testing.T) {
+	t.Parallel()
+	store := newTestStore(t)
+
+	// Insert more projects than the per-type quota, with NO other type
+	// matching. The earlier flat-quota implementation would clip at
+	// ftsEntityKPerType even though 15 other slots were unused; the
+	// three-tier implementation should fill up to ftsEntityTotalCap.
+	types, _ := store.ProjectTypes()
+	const projectCount = ftsEntityKPerType + 3
+	for i := range projectCount {
+		require.NoError(t, store.CreateProject(&Project{
+			Title:         fmt.Sprintf("Lakeside Project %d", i),
+			ProjectTypeID: types[0].ID,
+			Status:        ProjectStatusPlanned,
+		}))
+	}
+
+	results, err := store.SearchEntities("lakeside")
 	require.NoError(t, err)
-	assert.Empty(t, gutterResults, "maintenance item FTS row should be gone after hard delete")
+	assert.GreaterOrEqual(
+		t,
+		len(results),
+		projectCount,
+		"single-type search must not be capped at ftsEntityKPerType when no other type competes; got %d",
+		len(results),
+	)
+	assert.LessOrEqual(t, len(results), ftsEntityTotalCap,
+		"total cap should still apply")
+}
 
-	fallResults, err := store.SearchEntities("fall")
+func TestFTSSearchEntitiesTiebreakerIsDeterministic(t *testing.T) {
+	t.Parallel()
+	store := newTestStore(t)
+
+	// Insert several projects whose titles differ only by a numeric
+	// suffix, so BM25 should give them near-identical ranks. Run the
+	// search twice and assert results come back in the same order --
+	// every ORDER BY has an entity_id tiebreaker to guarantee this.
+	types, _ := store.ProjectTypes()
+	const count = ftsEntityKPerType + 2
+	for i := range count {
+		require.NoError(t, store.CreateProject(&Project{
+			Title:         fmt.Sprintf("Identical Widget %d", i),
+			ProjectTypeID: types[0].ID,
+			Status:        ProjectStatusPlanned,
+		}))
+	}
+
+	first, err := store.SearchEntities("widget")
 	require.NoError(t, err)
-	assert.Empty(t, fallResults, "child SLE FTS row should be gone via FK cascade + _ad trigger")
+	require.NotEmpty(t, first)
+	second, err := store.SearchEntities("widget")
+	require.NoError(t, err)
+	require.Equal(t, len(first), len(second), "same query should return same count")
+	for i := range first {
+		assert.Equal(t, first[i].EntityID, second[i].EntityID,
+			"position %d should be stable across runs", i)
+	}
+}
+
+func TestFTSSearchEntitiesRepresentsEveryMatchingType(t *testing.T) {
+	t.Parallel()
+	store := newTestStore(t)
+
+	// Insert enough entities per type that each type's match count
+	// would otherwise exceed the per-type quota, so the tier-3 fill
+	// has a chance to shadow late types. With the "one row per matching
+	// type first" tier, every matching type should still surface.
+	types, _ := store.ProjectTypes()
+	cats, err := store.MaintenanceCategories()
+	require.NoError(t, err)
+	require.NotEmpty(t, cats)
+
+	const perType = ftsEntityKPerType + 3
+	for i := range perType {
+		require.NoError(t, store.CreateProject(&Project{
+			Title:         fmt.Sprintf("Signal Project %d", i),
+			ProjectTypeID: types[0].ID,
+			Status:        ProjectStatusPlanned,
+		}))
+		require.NoError(t, store.CreateVendor(&Vendor{
+			Name: fmt.Sprintf("Signal Vendor %d", i),
+		}))
+		require.NoError(t, store.CreateAppliance(&Appliance{
+			Name: fmt.Sprintf("Signal Appliance %d", i),
+		}))
+		require.NoError(t, store.CreateMaintenance(&MaintenanceItem{
+			Name:           fmt.Sprintf("Signal Maintenance %d", i),
+			CategoryID:     cats[0].ID,
+			IntervalMonths: 6,
+		}))
+		require.NoError(t, store.CreateIncident(&Incident{
+			Title:    fmt.Sprintf("Signal Incident %d", i),
+			Status:   "open",
+			Severity: "low",
+		}))
+	}
+
+	results, err := store.SearchEntities("signal")
+	require.NoError(t, err)
+
+	seen := map[string]bool{}
+	for _, r := range results {
+		seen[r.EntityType] = true
+	}
+	for _, entity := range []string{
+		DeletionEntityProject,
+		DeletionEntityVendor,
+		DeletionEntityAppliance,
+		DeletionEntityMaintenance,
+		DeletionEntityIncident,
+	} {
+		assert.True(t, seen[entity],
+			"every matching type must appear at least once; %s missing", entity)
+	}
 }
 
 func TestFTSPopulateFiltersSoftDeletedMaintenanceInSLEJoin(t *testing.T) {
@@ -1039,3 +1208,58 @@ func TestFTSPopulateFiltersSoftDeletedParentsInQuoteJoin(t *testing.T) {
 		}
 	}
 }
+
+func TestFTSSearchEntitiesRankThresholdFiltersAboveCeiling(t *testing.T) {
+	t.Parallel()
+	store := newTestStore(t)
+
+	// Prove the rank threshold infrastructure works: insert a vendor with
+	// a known searchable name, then verify that every returned row has
+	// `rank < ftsEntityRankCeiling`. The initial ceiling is permissive
+	// (0.0) — every BM25 match is negative, so every result passes. Once
+	// the eval tightens the ceiling, this test continues to hold.
+	require.NoError(t, store.CreateVendor(&Vendor{Name: "Rank Threshold Test Co"}))
+
+	results, err := store.SearchEntities("threshold")
+	require.NoError(t, err)
+	require.NotEmpty(t, results, "vendor name should match")
+
+	for _, r := range results {
+		assert.Less(t, r.Rank, ftsEntityRankCeiling,
+			"every returned row must have rank < %v; got %q with rank %v",
+			ftsEntityRankCeiling, r.EntityName, r.Rank)
+	}
+}
+
+func TestFTSTriggerHardDeleteMaintenanceCascadesSLE(t *testing.T) {
+	t.Parallel()
+	store := newTestStore(t)
+
+	cats, err := store.MaintenanceCategories()
+	require.NoError(t, err)
+	require.NotEmpty(t, cats)
+
+	m := &MaintenanceItem{
+		Name:           "Gutter Cleaning",
+		CategoryID:     cats[0].ID,
+		IntervalMonths: 12,
+	}
+	require.NoError(t, store.CreateMaintenance(m))
+
+	sle := &ServiceLogEntry{
+		MaintenanceItemID: m.ID,
+		ServicedAt:        time.Now(),
+		Notes:             "fall cleanup",
+	}
+	require.NoError(t, store.CreateServiceLog(sle, Vendor{}))
+
+	require.NoError(t, store.HardDeleteMaintenance(m.ID))
+
+	gutterResults, err := store.SearchEntities("gutter")
+	require.NoError(t, err)
+	assert.Empty(t, gutterResults, "maintenance item FTS row should be gone after hard delete")
+
+	fallResults, err := store.SearchEntities("fall")
+	require.NoError(t, err)
+	assert.Empty(t, fallResults, "child SLE FTS row should be gone via FK cascade + _ad trigger")
+}