Skip to content

Commit d14769a

Browse files
authored
fix(llm): use Windows WSA errnos instead of string matching (#953)
## Summary - Follow-up to #951. The Windows CI failure resolved there with a string-matching fallback has a deeper, fully typed fix. - `syscall.ECONNREFUSED` on Windows is defined as \`APPLICATION_ERROR | iota\` (an invented value, not the real WSA error code), so \`errors.Is(err, syscall.ECONNREFUSED)\` *always* returns false on Windows for real network errors. Same for \`ENETUNREACH\` and \`EHOSTUNREACH\`. - Split \`connectionErrnos\` across build-tagged files: POSIX uses the standard \`syscall.*\` constants; Windows uses \`golang.org/x/sys/windows.WSAECONNREFUSED\` / \`WSAENETUNREACH\` / \`WSAEHOSTUNREACH\` (already a direct dep, exposed as \`syscall.Errno\` so \`errors.Is\` works). - \`isNetworkError\` collapses to a simple loop over the platform-specific list; the substring fallback comes out. Zero string matching on either platform. - \`AGENTS.md\`: codify "no \`err.Error()\` substring classification" with the Windows errno caveat documented inline so the next agent doesn't fall into the same trap.
1 parent d4ff6ad commit d14769a

5 files changed

Lines changed: 83 additions & 42 deletions

File tree

AGENTS.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,19 @@ details; do not duplicate that detail here.
344344
(e.g. `delete(m, "id"); delete(m, "ID")`) -- that papers over an
345345
inconsistency instead of fixing it. If you see ambiguity, fix the
346346
source struct's tags.
347+
- **No string matching to classify errors**: Never inspect `err.Error()`
348+
substrings to decide whether an error is e.g. a network failure or a
349+
syntax error. Substring checks break silently when an upstream wrapper
350+
changes its message format. Use `errors.Is` against typed sentinels or
351+
`errors.As` against typed error structs. **Windows socket gotcha**:
352+
`syscall.ECONNREFUSED`/`ENETUNREACH`/`EHOSTUNREACH` are NOT portable.
353+
On Windows they are `APPLICATION_ERROR | iota` invented constants that
354+
never match the WSA error codes (10061/10051/10065) `net/http` actually
355+
returns -- so `errors.Is(err, syscall.ECONNREFUSED)` always fails on
356+
Windows. Define platform-specific sentinel lists in build-tagged files
357+
using `golang.org/x/sys/windows.WSA*` (already typed as `syscall.Errno`)
358+
for the Windows variant. See `internal/llm/network_errors_{windows,other}.go`
359+
for the pattern.
347360
- **Deterministic ordering requires tiebreakers**: Every `ORDER BY` that
348361
could tie MUST include a tiebreaker (typically `id DESC`).
349362
- **Audit new deps before adding**: Review source for security issues

internal/llm/client.go

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"net/http"
1111
"net/url"
1212
"strings"
13-
"syscall"
1413
"time"
1514

1615
anyllm "github.com/mozilla-ai/any-llm-go"
@@ -456,23 +455,18 @@ func (c *Client) wrapError(err error) error {
456455

457456
// isNetworkError reports whether err represents a connection-level failure
458457
// (connection refused, unreachable host) as opposed to an application-level
459-
// error from a server that was reachable. Tries syscall sentinels first,
460-
// then falls back to message matching: net/http on Windows wraps
461-
// "connectex" errors in a way that drops the syscall.Errno layer, so
462-
// errors.Is(err, syscall.ECONNREFUSED) returns false even for a refused
463-
// connection (CI proved this on the windows-latest runner). The string
464-
// fallback is the only way to recognize those cases.
458+
// error from a server that was reachable. The set of errno sentinels is
459+
// platform-specific because Windows' net stack returns WSA error codes
460+
// (e.g., WSAECONNREFUSED = 10061) and syscall.ECONNREFUSED on Windows is
461+
// an invented APPLICATION_ERROR value that never matches them. See
462+
// connectionErrnos in network_errors_{windows,other}.go.
465463
func isNetworkError(err error) bool {
466-
if errors.Is(err, syscall.ECONNREFUSED) ||
467-
errors.Is(err, syscall.ENETUNREACH) ||
468-
errors.Is(err, syscall.EHOSTUNREACH) {
469-
return true
464+
for _, errno := range connectionErrnos {
465+
if errors.Is(err, errno) {
466+
return true
467+
}
470468
}
471-
msg := err.Error()
472-
return strings.Contains(msg, "connection refused") ||
473-
strings.Contains(msg, "actively refused") ||
474-
strings.Contains(msg, "host is unreachable") ||
475-
strings.Contains(msg, "network is unreachable")
469+
return false
476470
}
477471

478472
// isLoopbackURL returns true if the URL points to a loopback address.

internal/llm/client_test.go

Lines changed: 19 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ import (
1414
"net/http/httptest"
1515
"os"
1616
"strings"
17-
"syscall"
1817
"testing"
1918
"testing/synctest"
2019
"time"
@@ -417,7 +416,7 @@ func TestPingServerDownCloud(t *testing.T) {
417416
// Use wrapError directly: a ECONNREFUSED wrapped in ProviderError
418417
// from a cloud provider should say "cannot reach ... check your
419418
// base_url" and NOT mention ollama.
420-
inner := fmt.Errorf("dial tcp: %w", syscall.ECONNREFUSED)
419+
inner := fmt.Errorf("dial tcp: %w", connectionErrnos[0])
421420
c := &Client{providerName: "openai"}
422421
err := c.wrapError(anyllmerrors.NewProviderError("openai", inner))
423422
require.Error(t, err)
@@ -478,7 +477,7 @@ func TestCreateProviderAllSupported(t *testing.T) {
478477
// TestWrapErrorProviderError exercises the wrapError path for ProviderError.
479478
func TestWrapErrorProviderError(t *testing.T) {
480479
t.Parallel()
481-
connErr := fmt.Errorf("dial tcp: %w", syscall.ECONNREFUSED)
480+
connErr := fmt.Errorf("dial tcp: %w", connectionErrnos[0])
482481
tests := []struct {
483482
provider string
484483
wantMsg string
@@ -501,49 +500,43 @@ func TestWrapErrorProviderError(t *testing.T) {
501500
}
502501
}
503502

504-
// TestIsNetworkError verifies that wrapped syscall sentinels are
505-
// recognized via errors.Is, regardless of how deeply they are wrapped.
503+
// TestIsNetworkError verifies that wrapped platform-correct syscall
504+
// sentinels are recognized via errors.Is, regardless of how deeply
505+
// they are wrapped. Uses connectionErrnos to stay correct across
506+
// Linux/macOS (POSIX errnos) and Windows (WSA errnos), since
507+
// syscall.ECONNREFUSED on Windows is an invented APPLICATION_ERROR
508+
// constant that doesn't match the WSA value.
506509
func TestIsNetworkError(t *testing.T) {
507510
t.Parallel()
511+
refused := connectionErrnos[0]
508512
cases := []struct {
509513
name string
510514
err error
511515
want bool
512516
}{
513-
{"bare ECONNREFUSED", syscall.ECONNREFUSED, true},
514-
{"bare ENETUNREACH", syscall.ENETUNREACH, true},
515-
{"bare EHOSTUNREACH", syscall.EHOSTUNREACH, true},
517+
{"bare connection-refused errno", refused, true},
516518
{
517-
"net.OpError wrapping ECONNREFUSED",
519+
"net.OpError wrapping connection-refused errno",
518520
&net.OpError{
519521
Op: "dial", Net: "tcp",
520-
Err: &os.SyscallError{Syscall: "connect", Err: syscall.ECONNREFUSED},
522+
Err: &os.SyscallError{Syscall: "connect", Err: refused},
521523
},
522524
true,
523525
},
524526
{
525-
"fmt.Errorf wrapping EHOSTUNREACH",
526-
fmt.Errorf("dial tcp: %w", syscall.EHOSTUNREACH),
527+
"fmt.Errorf wrapping connection-refused errno",
528+
fmt.Errorf("dial tcp: %w", refused),
527529
true,
528530
},
529531
{"unrelated error", errors.New("something else broke"), false},
530532
{"context.Canceled", context.Canceled, false},
531-
// Windows: net/http wraps connectex errors without preserving
532-
// the syscall.Errno layer, so the string fallback is the only
533-
// way to recognize them. Verbatim error captured from CI.
533+
// Bare strings without a wrapped syscall sentinel must NOT be
534+
// classified as network errors. Pin this so we don't slide back
535+
// into substring matching.
534536
{
535-
"windows connectex actively refused",
536-
errors.New(
537-
"Get \"http://127.0.0.1:1/v1/models\": dial tcp 127.0.0.1:1: " +
538-
"connectex: No connection could be made because the target " +
539-
"machine actively refused it.",
540-
),
541-
true,
542-
},
543-
{
544-
"plain connection refused string",
537+
"bare string says connection refused",
545538
errors.New("dial tcp: connection refused"),
546-
true,
539+
false,
547540
},
548541
}
549542
for _, tt := range cases {
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Copyright 2026 Phillip Cloud
2+
// Licensed under the Apache License, Version 2.0
3+
4+
//go:build !windows
5+
6+
package llm
7+
8+
import "syscall"
9+
10+
// connectionErrnos lists the syscall errors that mean "I could not reach
11+
// the server" on POSIX platforms. errors.Is unwraps net.OpError ->
12+
// os.SyscallError -> syscall.Errno, so these match the values net/http
13+
// surfaces when a connect(2) fails.
14+
var connectionErrnos = []syscall.Errno{
15+
syscall.ECONNREFUSED,
16+
syscall.ENETUNREACH,
17+
syscall.EHOSTUNREACH,
18+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// Copyright 2026 Phillip Cloud
2+
// Licensed under the Apache License, Version 2.0
3+
4+
package llm
5+
6+
import (
7+
"syscall"
8+
9+
"golang.org/x/sys/windows"
10+
)
11+
12+
// connectionErrnos lists the WSA error values that mean "I could not
13+
// reach the server" on Windows. The standard syscall.ECONNREFUSED on
14+
// Windows is an APPLICATION_ERROR-prefixed invented constant that
15+
// never matches what ConnectEx returns; the real values come from
16+
// winsock and are exposed via golang.org/x/sys/windows. errors.Is
17+
// unwraps net.OpError -> os.SyscallError{Syscall: "connectex"} ->
18+
// syscall.Errno, so these match what net/http surfaces.
19+
var connectionErrnos = []syscall.Errno{
20+
windows.WSAECONNREFUSED,
21+
windows.WSAENETUNREACH,
22+
windows.WSAEHOSTUNREACH,
23+
}

0 commit comments

Comments
 (0)