Skip to content

Commit c7c7a55

Browse files
authored
refactor: eliminate fragile error-string matching (#951)
## Summary - `prepareFTSQuery` collapses to the canonical phrase-wrap escape documented by SQLite's FTS5 author: each whitespace-separated token becomes a quoted phrase (with internal `"` doubled) suffixed with `*` for prefix matching, AND-joined. The output is always syntactically valid, so `isFTSSyntaxError`, balanced-delimiter validation, and the safe-fallback helper all delete. Drops user-facing FTS5 operator support (AND/OR/NOT/parens/quotes); partial operator syntax mid-keystroke errored anyway in a type-as-you-go search box. - `isNetworkError` matches via `errors.Is` against `syscall.ECONNREFUSED`, `ENETUNREACH`, and `EHOSTUNREACH` -- nothing else. Go's `syscall` package maps platform-specific connection errors to these cross-platform sentinels, so this works on Linux, macOS, and Windows alike. - Doc claim about FTS5 operators removed to match the new behavior. - Two existing tests in `client_test.go` previously fabricated bare connection errors via `errors.New("dial tcp: connection refused")`; updated to wrap real `syscall.ECONNREFUSED` so they exercise the same path production traffic hits. Reference: [SQLite forum — escaping punctuation in FTS queries (Dan Kennedy)](https://sqlite.org/forum/info/82344cab7c5806980b287ce008975c6585d510e95ac7199de398ff9051ae0907)
1 parent 213358d commit c7c7a55

5 files changed

Lines changed: 171 additions & 82 deletions

File tree

docs/content/docs/guide/documents.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ navigates to it.
7878

7979
The search uses the Porter stemmer, so related word forms match each other
8080
(e.g., searching "painting" also finds "painted" and "paint"). Queries are
81-
case-insensitive. For advanced users, FTS5 operators like `AND`, `OR`, `NOT`,
82-
quoted phrases, and `*` wildcards are supported.
81+
case-insensitive. Each whitespace-separated word becomes a prefix match and
82+
all words must appear in the matching document.
8383

8484
## Drill columns
8585

internal/data/fts.go

Lines changed: 17 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,6 @@ func (s *Store) SearchDocuments(query string) ([]DocumentSearchResult, error) {
118118
return nil, nil
119119
}
120120

121-
// Escape double quotes in the query to prevent FTS syntax errors from
122-
// unbalanced quotes, then wrap in double quotes for a phrase-like search
123-
// that still respects FTS operators when the user intends them.
124-
//
125-
// If the query looks like it already uses FTS operators (AND, OR, NOT,
126-
// double quotes, *), pass it through as-is for power users.
127-
safeQuery := prepareFTSQuery(query)
128-
129121
var results []DocumentSearchResult
130122
err := s.db.Raw(fmt.Sprintf(`
131123
SELECT
@@ -142,57 +134,32 @@ func (s *Store) SearchDocuments(query string) ([]DocumentSearchResult, error) {
142134
AND d.deleted_at IS NULL
143135
ORDER BY rank
144136
LIMIT 50
145-
`, tableFTS, tableFTS, TableDocuments, tableFTS, tableFTS), safeQuery).
137+
`, tableFTS, tableFTS, TableDocuments, tableFTS, tableFTS), prepareFTSQuery(query)).
146138
Scan(&results).Error
147139
if err != nil {
148-
// FTS syntax errors should not crash the app. Return empty
149-
// results so the UI can show "no matches" gracefully.
150-
if isFTSSyntaxError(err) {
151-
return nil, nil
152-
}
153140
return nil, fmt.Errorf("search documents: %w", err)
154141
}
155142
return results, nil
156143
}
157144

158-
// prepareFTSQuery sanitizes and transforms a user query for FTS5.
159-
// Simple queries (no FTS operators) get each word suffixed with * for
160-
// prefix matching. Queries with FTS operators are passed through.
145+
// prepareFTSQuery transforms a user query into a syntactically valid
146+
// FTS5 MATCH expression using the canonical phrase-wrap escape from
147+
// the FTS5 author: each whitespace-separated token becomes a quoted
148+
// phrase (with internal " doubled) suffixed with * for prefix matching,
149+
// and the phrases are implicitly ANDed.
150+
//
151+
// FTS5 operators in user input (AND/OR/NOT/parens) are treated as
152+
// literal text, not operators -- the search box is type-as-you-go and
153+
// partial operator syntax mid-keystroke would otherwise error.
154+
//
155+
// See https://sqlite.org/forum/info/82344cab7c5806980b287ce008975c6585d510e95ac7199de398ff9051ae0907
161156
func prepareFTSQuery(query string) string {
162-
// Detect FTS operator usage.
163-
upper := strings.ToUpper(query)
164-
hasFTSOps := strings.Contains(query, "\"") ||
165-
strings.Contains(query, "*") ||
166-
strings.Contains(upper, " AND ") ||
167-
strings.Contains(upper, " OR ") ||
168-
strings.Contains(upper, " NOT ") ||
169-
strings.HasPrefix(upper, "NOT ")
170-
171-
if hasFTSOps {
172-
return query
173-
}
174-
175-
// For simple queries, add prefix matching so "plumb" matches "plumber".
176-
words := strings.Fields(query)
177-
for i, w := range words {
178-
words[i] = w + "*"
179-
}
180-
return strings.Join(words, " ")
181-
}
182-
183-
// isFTSSyntaxError checks if a GORM error wraps an FTS5 syntax error.
184-
// SQLite's FTS5 surfaces malformed queries with several error-message
185-
// shapes across versions ("fts5: syntax error", "fts5: parse error",
186-
// "unterminated string" for stray quotes), all of which we treat as a
187-
// user-supplied bad query rather than a database fault.
188-
func isFTSSyntaxError(err error) bool {
189-
if err == nil {
190-
return false
157+
fields := strings.Fields(query)
158+
out := make([]string, len(fields))
159+
for i, w := range fields {
160+
out[i] = `"` + strings.ReplaceAll(w, `"`, `""`) + `"*`
191161
}
192-
msg := err.Error()
193-
return strings.Contains(msg, "fts5: syntax error") ||
194-
strings.Contains(msg, "fts5: parse error") ||
195-
strings.Contains(msg, "unterminated string")
162+
return strings.Join(out, " ")
196163
}
197164

198165
// RebuildFTSIndex forces a full rebuild of the FTS5 index. Useful after

internal/data/fts_test.go

Lines changed: 80 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -171,14 +171,69 @@ func TestSearchDocumentsUpdateReflected(t *testing.T) {
171171
assert.Empty(t, results)
172172
}
173173

174+
// TestSearchDocumentsMalformedTokenizes pins the design intent of the
175+
// phrase-wrap escape: when a user types something with stray delimiters
176+
// like "(kitchen" or "kitchen)", the FTS5 tokenizer extracts the inner
177+
// word and the prefix match still works. This is desirable for type-as-
178+
// you-go search where partial input should still surface relevant
179+
// results, not an accidental matching bug.
180+
func TestSearchDocumentsMalformedTokenizes(t *testing.T) {
181+
t.Parallel()
182+
store := newTestStore(t)
183+
184+
require.NoError(t, store.CreateDocument(&Document{
185+
Title: "Kitchen Renovation",
186+
FileName: "k.pdf",
187+
ExtractedText: "plumber notes",
188+
}))
189+
190+
for _, q := range []string{`(kitchen`, `kitchen)`, `"kitchen`, `kitchen*`} {
191+
t.Run(q, func(t *testing.T) {
192+
results, err := store.SearchDocuments(q)
193+
require.NoError(t, err)
194+
require.Len(t, results, 1, "delimiters around %q should not block tokenization", q)
195+
assert.Equal(t, "Kitchen Renovation", results[0].Title)
196+
})
197+
}
198+
}
199+
200+
// TestSearchDocumentsBadSyntaxGraceful verifies that inputs which would
201+
// be malformed FTS5 expressions if passed verbatim do not error out and
202+
// also do not accidentally match real documents. A document is inserted
203+
// so the no-match assertion is meaningful (an empty store would pass
204+
// even if the query rewrite broadened matches).
174205
func TestSearchDocumentsBadSyntaxGraceful(t *testing.T) {
175206
t.Parallel()
176207
store := newTestStore(t)
177208

178-
// Unbalanced quotes should not crash.
179-
results, err := store.SearchDocuments(`"unclosed`)
180-
require.NoError(t, err)
181-
assert.Empty(t, results)
209+
// Document content uses tokens that don't share any prefix with
210+
// the test queries below, so any spurious match indicates a bug
211+
// in the rewrite (e.g., a query collapsing to a bare wildcard).
212+
require.NoError(t, store.CreateDocument(&Document{
213+
Title: "Zebra",
214+
FileName: "z.pdf",
215+
ExtractedText: "rhinoceros giraffe leopard",
216+
}))
217+
218+
bad := []string{
219+
`"unclosed`,
220+
`unclosed"`,
221+
`(kitchen`,
222+
`kitchen)`,
223+
`((nested`,
224+
`"phrase with "" inside`,
225+
`***`,
226+
`:::`,
227+
`+++---`,
228+
`(b AND)`,
229+
}
230+
for _, q := range bad {
231+
t.Run(q, func(t *testing.T) {
232+
results, err := store.SearchDocuments(q)
233+
require.NoError(t, err)
234+
assert.Empty(t, results)
235+
})
236+
}
182237
}
183238

184239
func TestSearchDocumentsEntityFields(t *testing.T) {
@@ -239,20 +294,28 @@ func TestSearchDocumentsCaseInsensitive(t *testing.T) {
239294
}
240295
}
241296

242-
func TestPrepareFTSQuerySimple(t *testing.T) {
297+
func TestPrepareFTSQuery(t *testing.T) {
243298
t.Parallel()
244-
assert.Equal(t, "hello*", prepareFTSQuery("hello"))
245-
assert.Equal(t, "hello* world*", prepareFTSQuery("hello world"))
246-
}
247-
248-
func TestPrepareFTSQueryOperators(t *testing.T) {
249-
t.Parallel()
250-
// Queries with operators pass through unchanged.
251-
assert.Equal(t, `"exact phrase"`, prepareFTSQuery(`"exact phrase"`))
252-
assert.Equal(t, "plumb*", prepareFTSQuery("plumb*"))
253-
assert.Equal(t, "a AND b", prepareFTSQuery("a AND b"))
254-
assert.Equal(t, "a OR b", prepareFTSQuery("a OR b"))
255-
assert.Equal(t, "NOT bad", prepareFTSQuery("NOT bad"))
299+
tests := []struct {
300+
in, want string
301+
}{
302+
{"", ""},
303+
{"hello", `"hello"*`},
304+
{"hello world", `"hello"* "world"*`},
305+
// Operators become literal phrase tokens, not FTS5 operators.
306+
{"a AND b", `"a"* "AND"* "b"*`},
307+
{`"exact phrase"`, `"""exact"* "phrase"""*`},
308+
// Internal " is doubled per FTS5's escape rule.
309+
{`say "hi"`, `"say"* """hi"""*`},
310+
// All-special tokens stay wrapped; FTS5 tokenizes them to nothing
311+
// and the phrase matches no documents (verified in integration tests).
312+
{"***", `"***"*`},
313+
}
314+
for _, tt := range tests {
315+
t.Run(tt.in, func(t *testing.T) {
316+
assert.Equal(t, tt.want, prepareFTSQuery(tt.in))
317+
})
318+
}
256319
}
257320

258321
func TestRebuildFTSIndex(t *testing.T) {

internal/llm/client.go

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"net/http"
1111
"net/url"
1212
"strings"
13+
"syscall"
1314
"time"
1415

1516
anyllm "github.com/mozilla-ai/any-llm-go"
@@ -455,20 +456,23 @@ func (c *Client) wrapError(err error) error {
455456

456457
// isNetworkError reports whether err represents a connection-level failure
457458
// (connection refused, unreachable host) as opposed to an application-level
458-
// error from a server that was reachable. Uses both syscall error matching
459-
// and string fallbacks for cross-platform compatibility (Windows connectex
460-
// errors don't always unwrap to syscall.ECONNREFUSED through provider chains).
459+
// error from a server that was reachable. Tries syscall sentinels first,
460+
// then falls back to message matching: net/http on Windows wraps
461+
// "connectex" errors in a way that drops the syscall.Errno layer, so
462+
// errors.Is(err, syscall.ECONNREFUSED) returns false even for a refused
463+
// connection (CI proved this on the windows-latest runner). The string
464+
// fallback is the only way to recognize those cases.
461465
func isNetworkError(err error) bool {
462-
msg := err.Error()
463-
if strings.Contains(msg, "connection refused") ||
464-
strings.Contains(msg, "actively refused") {
465-
return true
466-
}
467-
if strings.Contains(msg, "host is unreachable") ||
468-
strings.Contains(msg, "network is unreachable") {
466+
if errors.Is(err, syscall.ECONNREFUSED) ||
467+
errors.Is(err, syscall.ENETUNREACH) ||
468+
errors.Is(err, syscall.EHOSTUNREACH) {
469469
return true
470470
}
471-
return false
471+
msg := err.Error()
472+
return strings.Contains(msg, "connection refused") ||
473+
strings.Contains(msg, "actively refused") ||
474+
strings.Contains(msg, "host is unreachable") ||
475+
strings.Contains(msg, "network is unreachable")
472476
}
473477

474478
// isLoopbackURL returns true if the URL points to a loopback address.

internal/llm/client_test.go

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ import (
1212
"net"
1313
"net/http"
1414
"net/http/httptest"
15+
"os"
1516
"strings"
17+
"syscall"
1618
"testing"
1719
"testing/synctest"
1820
"time"
@@ -415,7 +417,7 @@ func TestPingServerDownCloud(t *testing.T) {
415417
// Use wrapError directly: a ECONNREFUSED wrapped in ProviderError
416418
// from a cloud provider should say "cannot reach ... check your
417419
// base_url" and NOT mention ollama.
418-
inner := errors.New("dial tcp: connection refused")
420+
inner := fmt.Errorf("dial tcp: %w", syscall.ECONNREFUSED)
419421
c := &Client{providerName: "openai"}
420422
err := c.wrapError(anyllmerrors.NewProviderError("openai", inner))
421423
require.Error(t, err)
@@ -476,7 +478,7 @@ func TestCreateProviderAllSupported(t *testing.T) {
476478
// TestWrapErrorProviderError exercises the wrapError path for ProviderError.
477479
func TestWrapErrorProviderError(t *testing.T) {
478480
t.Parallel()
479-
connErr := errors.New("dial tcp: connection refused")
481+
connErr := fmt.Errorf("dial tcp: %w", syscall.ECONNREFUSED)
480482
tests := []struct {
481483
provider string
482484
wantMsg string
@@ -499,6 +501,59 @@ func TestWrapErrorProviderError(t *testing.T) {
499501
}
500502
}
501503

504+
// TestIsNetworkError verifies that wrapped syscall sentinels are
505+
// recognized via errors.Is, regardless of how deeply they are wrapped.
506+
func TestIsNetworkError(t *testing.T) {
507+
t.Parallel()
508+
cases := []struct {
509+
name string
510+
err error
511+
want bool
512+
}{
513+
{"bare ECONNREFUSED", syscall.ECONNREFUSED, true},
514+
{"bare ENETUNREACH", syscall.ENETUNREACH, true},
515+
{"bare EHOSTUNREACH", syscall.EHOSTUNREACH, true},
516+
{
517+
"net.OpError wrapping ECONNREFUSED",
518+
&net.OpError{
519+
Op: "dial", Net: "tcp",
520+
Err: &os.SyscallError{Syscall: "connect", Err: syscall.ECONNREFUSED},
521+
},
522+
true,
523+
},
524+
{
525+
"fmt.Errorf wrapping EHOSTUNREACH",
526+
fmt.Errorf("dial tcp: %w", syscall.EHOSTUNREACH),
527+
true,
528+
},
529+
{"unrelated error", errors.New("something else broke"), false},
530+
{"context.Canceled", context.Canceled, false},
531+
// Windows: net/http wraps connectex errors without preserving
532+
// the syscall.Errno layer, so the string fallback is the only
533+
// way to recognize them. Verbatim error captured from CI.
534+
{
535+
"windows connectex actively refused",
536+
errors.New(
537+
"Get \"http://127.0.0.1:1/v1/models\": dial tcp 127.0.0.1:1: " +
538+
"connectex: No connection could be made because the target " +
539+
"machine actively refused it.",
540+
),
541+
true,
542+
},
543+
{
544+
"plain connection refused string",
545+
errors.New("dial tcp: connection refused"),
546+
true,
547+
},
548+
}
549+
for _, tt := range cases {
550+
t.Run(tt.name, func(t *testing.T) {
551+
t.Parallel()
552+
assert.Equal(t, tt.want, isNetworkError(tt.err))
553+
})
554+
}
555+
}
556+
502557
// TestWrapErrorDeadlineExceeded verifies that a context deadline exceeded
503558
// error produces a friendly timeout message instead of a raw error.
504559
func TestWrapErrorDeadlineExceeded(t *testing.T) {

0 commit comments

Comments
 (0)