
Commit bd6f3dc

Authored by cpcloud and claude
refactor(config): fix LLM timeout architecture (#746)
## Summary

Closes #732.

- **Clarify timeout semantics**: `ResolvedLLM.Timeout` is now the inference context deadline (was the HTTP client timeout). Per-pipeline overrides (`llm.chat.timeout`, `llm.extraction.timeout`) inherit from the base `llm.timeout`.
- **Derive HTTP client timeout**: `NewClient` computes `max(timeout, QuickOpTimeout)` so quick ops (ping, model listing) aren't killed by short inference timeouts. `QuickOpTimeout` stays as a non-configurable 30s constant.
- **Add chat inference deadline**: Chat streaming now uses `context.WithTimeout` (was `WithCancel` with no deadline), enforcing the configured `llm.chat.timeout`.
- **Deprecate `extraction.llm_timeout`**: Migrated to `llm.extraction.timeout` with TOML key migration and env var rename (`MICASA_EXTRACTION_LLM_TIMEOUT` -> `MICASA_LLM_EXTRACTION_TIMEOUT`).
- **Remove redundant `LLMInferenceTimeout`**: `extractionConfig.Timeout` now serves as the inference deadline directly, eliminating the extra field.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 847153e commit bd6f3dc

12 files changed: 252 additions & 76 deletions


cmd/micasa/main.go

Lines changed: 0 additions & 1 deletion
```diff
@@ -173,7 +173,6 @@ func (cmd *runCmd) Run() error {
 		exCfg.Thinking,
 		extractors,
 		cfg.Extraction.IsEnabled(),
-		cfg.Extraction.LLMTimeoutDuration(),
 		cfg.Extraction.IsOCRTSV(),
 		cfg.Extraction.OCRConfThreshold(),
 	)
```

docs/content/docs/reference/configuration.md

Lines changed: 11 additions & 8 deletions
```diff
@@ -111,7 +111,7 @@ You can always infer the env var name from the config key.
 | `MICASA_LLM_MODEL` | `qwen3` | `llm.model` | LLM model name |
 | `MICASA_LLM_API_KEY` | (empty) | `llm.api_key` | LLM API key for cloud providers |
 | `MICASA_LLM_EXTRA_CONTEXT` | (empty) | `llm.extra_context` | Custom context appended to LLM system prompts |
-| `MICASA_LLM_TIMEOUT` | `5m` | `llm.timeout` | Max time for a single LLM response |
+| `MICASA_LLM_TIMEOUT` | `5m` | `llm.timeout` | Base inference timeout for LLM responses |
 | `MICASA_LLM_THINKING` | (unset) | `llm.thinking` | Enable model thinking for chat |
 | `MICASA_DOCUMENTS_MAX_FILE_SIZE` | `50 MiB` | `documents.max_file_size` | Max document import size |
 | `MICASA_DOCUMENTS_CACHE_TTL` | `30d` | `documents.cache_ttl` | Document cache lifetime |
@@ -121,7 +121,7 @@ You can always infer the env var name from the config key.
 | `MICASA_EXTRACTION_ENABLE` | `true` | `extraction.enable` | Enable/disable LLM extraction |
 | `MICASA_EXTRACTION_THINKING` | `false` | `extraction.thinking` | Enable model thinking for extraction |
 | `MICASA_EXTRACTION_MAX_PAGES` | `0` | `extraction.max_pages` | Max pages to OCR per document (0 = no limit) |
-| `MICASA_EXTRACTION_LLM_TIMEOUT` | `5m` | `extraction.llm_timeout` | LLM extraction timeout |
+| `MICASA_LLM_EXTRACTION_TIMEOUT` | `5m` | `llm.extraction.timeout` | Extraction inference timeout |
 | `MICASA_EXTRACTION_OCR_ENABLE` | `true` | `extraction.ocr.enable` | Enable/disable OCR on documents |
 | `MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD` | `0` | `extraction.ocr.confidence_threshold` | Min tesseract confidence (0-100) |
 | `MICASA_LOCALE_CURRENCY` | (auto-detect) | `locale.currency` | ISO 4217 currency code (e.g. `USD`, `EUR`, `GBP`) |
@@ -144,6 +144,7 @@ warning. They will be removed in a future release.
 | `MICASA_EXTRACTION_ENABLED` | `MICASA_EXTRACTION_ENABLE` |
 | `MICASA_EXTRACTION_MODEL` | `MICASA_LLM_EXTRACTION_MODEL` |
 | `MICASA_EXTRACTION_THINKING` | `MICASA_LLM_EXTRACTION_THINKING` |
+| `MICASA_EXTRACTION_LLM_TIMEOUT` | `MICASA_LLM_EXTRACTION_TIMEOUT` |
 
 {{% /details %}}
 
@@ -279,9 +280,9 @@ model = "qwen3"
 # Use this to inject domain-specific details about your house, region, etc.
 # extra_context = "My house is a 1920s craftsman in Portland, OR."
 
-# Max time for a single LLM response (including streaming).
+# Base inference timeout for LLM responses (including streaming).
+# Per-pipeline overrides: llm.chat.timeout and llm.extraction.timeout.
 # Go duration syntax: "5m", "10m", etc. Default: "5m".
-# Increase for slow models or complex queries.
 # timeout = "5m"
 
 # Enable model thinking mode for chat (e.g. qwen3 <think> blocks).
@@ -335,7 +336,7 @@ set in `[llm.chat]` and `[llm.extraction]`.
 | `model` | string | `qwen3` | Model identifier sent in chat requests. Must be available on the server. |
 | `api_key` | string | (empty) | Authentication credential. Required for cloud providers (Anthropic, OpenAI, etc.). Leave empty for local servers. |
 | `extra_context` | string | (empty) | Free-form text appended to all LLM system prompts. Useful for telling the model about your house or regional conventions. Currency is handled automatically via `[locale]`. |
-| `timeout` | string | `"5m"` | Max time for a single LLM response (including streaming). Go duration syntax, e.g. `"10m"`. Increase for slow models. |
+| `timeout` | string | `"5m"` | Base inference timeout for LLM responses (including streaming). Per-pipeline overrides: `llm.chat.timeout` and `llm.extraction.timeout`. Go duration syntax, e.g. `"10m"`. |
 | `thinking` | bool | (unset) | Enable model thinking mode (e.g. qwen3 `<think>` blocks). Unset = don't send the option (server default). |
 
 ### `[llm.chat]` section
@@ -350,22 +351,23 @@ than the default.
 | `base_url` | string | (inherits) | Override API base URL for chat. |
 | `model` | string | (inherits) | Override model for chat. |
 | `api_key` | string | (inherits) | Override API key for chat. |
-| `timeout` | string | (inherits) | Override timeout for chat. |
+| `timeout` | string | (inherits) | Chat inference context deadline. Inherits from `llm.timeout` when not set. |
 | `thinking` | string | (inherits) | Override thinking mode for chat. |
 
 ### `[llm.extraction]` section
 
 Per-pipeline LLM overrides for document extraction. Empty fields inherit
 from `[llm]`. Use this to run extraction on a smaller, faster model while
-keeping a more capable model for chat.
+keeping a more capable model for chat. The `timeout` field replaces the
+deprecated `extraction.llm_timeout`.
 
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
 | `provider` | string | (inherits) | Override LLM provider for extraction. |
 | `base_url` | string | (inherits) | Override API base URL for extraction. |
 | `model` | string | (inherits) | Override model for extraction. |
 | `api_key` | string | (inherits) | Override API key for extraction. |
-| `timeout` | string | (inherits) | Override timeout for extraction. |
+| `timeout` | string | (inherits) | Extraction inference context deadline. Replaces `extraction.llm_timeout`. Inherits from `llm.timeout` when not set. |
 | `thinking` | string | (inherits) | Override thinking mode for extraction. |
 
 ### `[documents]` section
@@ -391,6 +393,7 @@ dates, vendor matching) from uploaded documents.
 | `max_pages` | int | `0` | Maximum pages to OCR per scanned document. 0 means no limit. |
 | `enable` | bool | `true` | Set to `false` to disable LLM-powered structured extraction. OCR and pdftotext still run (see `[extraction.ocr]`). |
 | `enabled` | bool | -- | **Deprecated.** Use `enable` instead. |
+| `llm_timeout` | string | `"5m"` | **Deprecated.** Use `[llm.extraction] timeout` instead. |
 | `thinking` | bool | `false` | **Deprecated.** Use `[llm.extraction] thinking` instead. |
 
 ### `[extraction.ocr]` section
```

internal/app/chat.go

Lines changed: 14 additions & 4 deletions
```diff
@@ -15,6 +15,7 @@ import (
 	tea "github.com/charmbracelet/bubbletea"
 	"github.com/charmbracelet/lipgloss"
 	"github.com/charmbracelet/x/ansi"
+	"github.com/cpcloud/micasa/internal/config"
 	"github.com/cpcloud/micasa/internal/data"
 	"github.com/cpcloud/micasa/internal/llm"
 	ollamaPull "github.com/cpcloud/micasa/internal/ollama"
@@ -318,11 +319,20 @@ func (m *Model) submitChat() tea.Cmd {
 	return tea.Batch(m.startSQLStream(query), m.chat.Spinner.Tick)
 }
 
+// chatInferenceTimeout returns the configured chat inference timeout.
+func (m *Model) chatInferenceTimeout() time.Duration {
+	if m.llmConfig != nil && m.llmConfig.Timeout > 0 {
+		return m.llmConfig.Timeout
+	}
+	return config.DefaultLLMTimeout
+}
+
 // startSQLStream initiates streaming SQL generation (stage 1).
 func (m *Model) startSQLStream(query string) tea.Cmd {
 	client := m.llmClient
 	store := m.store
 	extraContext := m.llmExtraContext
+	chatTimeout := m.chatInferenceTimeout()
 	// Capture conversation history on the main goroutine before the closure
 	// runs in a background goroutine -- m.chat.Messages is mutated by the
 	// Bubble Tea event loop and is not safe to read concurrently.
@@ -346,8 +356,8 @@ func (m *Model) startSQLStream(query string) tea.Cmd {
 	messages = append(messages, llm.Message{Role: roleUser, Content: query})
 
 	//nolint:gosec // cancel stored in CancelFn, called on ctrl+c
-	ctx, cancel := context.WithCancel(
-		context.Background(),
+	ctx, cancel := context.WithTimeout(
+		context.Background(), chatTimeout,
 	)
 	streamCh, err := client.ChatStream(ctx, messages)
 	if err != nil {
@@ -763,7 +773,7 @@ func (m *Model) handleSQLResult(msg sqlResultMsg) tea.Cmd {
 		{Role: roleUser, Content: "Summarize these results."},
 	}
 
-	ctx, cancel := context.WithCancel(context.Background())
+	ctx, cancel := context.WithTimeout(context.Background(), m.chatInferenceTimeout())
 	ch, err := m.llmClient.ChatStream(ctx, messages)
 	if err != nil {
 		cancel()
@@ -788,7 +798,7 @@
 func (m *Model) startFallbackStream(question string) tea.Cmd {
 	messages := m.buildFallbackMessages(question)
 
-	ctx, cancel := context.WithCancel(context.Background())
+	ctx, cancel := context.WithTimeout(context.Background(), m.chatInferenceTimeout())
 	ch, err := m.llmClient.ChatStream(ctx, messages)
 	if err != nil {
 		cancel()
```

internal/app/extraction.go

Lines changed: 4 additions & 3 deletions
```diff
@@ -492,8 +492,9 @@ func (m *Model) llmPingCmd(state *extractionLogState) tea.Cmd {
 		return nil
 	}
 	id := state.ID
+	quickOpTimeout := client.Timeout()
 	return func() tea.Msg {
-		ctx, cancel := context.WithTimeout(context.Background(), llm.QuickOpTimeout)
+		ctx, cancel := context.WithTimeout(context.Background(), quickOpTimeout)
 		defer cancel()
 		err := client.Ping(ctx)
 		return extractionLLMPingMsg{ID: id, Err: err}
@@ -508,7 +509,7 @@ func (m *Model) llmExtractCmd(ctx context.Context, ex *extractionLogState) tea.C
 	}
 	schemaCtx := m.buildSchemaContext()
 	id := ex.ID
-	timeout := m.ex.llmInferenceTimeout
+	timeout := m.ex.extractionTimeout
 	return func() tea.Msg {
 		llmCtx := ctx
 		if timeout > 0 {
@@ -743,7 +744,7 @@ func (m *Model) handleExtractionLLMChunk(msg extractionLLMChunkMsg) tea.Cmd {
 	errMsg := msg.Err.Error()
 	if errors.Is(msg.Err, context.DeadlineExceeded) {
 		errMsg = fmt.Sprintf(
-			"timed out after %s -- increase extraction.llm_timeout in config",
+			"timed out after %s -- increase llm.extraction.timeout in config",
 			step.Elapsed.Truncate(time.Second),
 		)
 	}
```

internal/app/extraction_test.go

Lines changed: 1 addition & 1 deletion
```diff
@@ -1016,7 +1016,7 @@ func TestLLMExtraction_TimeoutError(t *testing.T) {
 	assert.Equal(t, stepFailed, step.Status)
 	require.NotEmpty(t, step.Logs)
 	assert.Contains(t, step.Logs[0], "timed out")
-	assert.Contains(t, step.Logs[0], "extraction.llm_timeout")
+	assert.Contains(t, step.Logs[0], "llm.extraction.timeout")
 }
 
 func TestLLMExtraction_TimeoutError_NonDeadlinePreservesOriginal(t *testing.T) {
```

internal/app/model.go

Lines changed: 10 additions & 11 deletions
```diff
@@ -266,17 +266,16 @@ func NewModel(store *data.Store, options Options) (*Model, error) {
 		llmExtraContext: extraContext,
 		filePickerDir:   options.FilePickerDir,
 		ex: extractState{
-			extractionProvider:  options.ExtractionConfig.Provider,
-			extractionBaseURL:   options.ExtractionConfig.BaseURL,
-			extractionModel:     options.ExtractionConfig.Model,
-			extractionAPIKey:    options.ExtractionConfig.APIKey,
-			extractionTimeout:   options.ExtractionConfig.Timeout,
-			extractionThinking:  options.ExtractionConfig.Thinking,
-			extractionEnabled:   options.ExtractionConfig.Enabled,
-			ocrTSV:              options.ExtractionConfig.OCRTSV,
-			ocrConfThreshold:    options.ExtractionConfig.OCRConfThreshold,
-			extractors:          options.ExtractionConfig.Extractors,
-			llmInferenceTimeout: options.ExtractionConfig.LLMInferenceTimeout,
+			extractionProvider: options.ExtractionConfig.Provider,
+			extractionBaseURL:  options.ExtractionConfig.BaseURL,
+			extractionModel:    options.ExtractionConfig.Model,
+			extractionAPIKey:   options.ExtractionConfig.APIKey,
+			extractionTimeout:  options.ExtractionConfig.Timeout,
+			extractionThinking: options.ExtractionConfig.Thinking,
+			extractionEnabled:  options.ExtractionConfig.Enabled,
+			ocrTSV:             options.ExtractionConfig.OCRTSV,
+			ocrConfThreshold:   options.ExtractionConfig.OCRConfThreshold,
+			extractors:         options.ExtractionConfig.Extractors,
 		},
 		pull:   pullState{progress: pprog},
 		styles: appStyles,
```

internal/app/types.go

Lines changed: 30 additions & 34 deletions
```diff
@@ -87,19 +87,18 @@ func (fs *formState) formKind() FormKind {
 type extractState struct {
 	// Extraction-specific LLM connection settings. When extractionProvider
 	// differs from the chat provider, an independent client is created.
-	extractionProvider  string
-	extractionBaseURL   string
-	extractionModel     string
-	extractionAPIKey    string
-	extractionTimeout   time.Duration
-	extractionThinking  string
-	extractionEnabled   bool
-	ocrTSV              bool
-	ocrConfThreshold    int
-	extractionClient    *llm.Client
-	extractors          []extract.Extractor
-	extractionReady     bool
-	llmInferenceTimeout time.Duration
+	extractionProvider string
+	extractionBaseURL  string
+	extractionModel    string
+	extractionAPIKey   string
+	extractionTimeout  time.Duration // inference context deadline
+	extractionThinking string
+	extractionEnabled  bool
+	ocrTSV             bool
+	ocrConfThreshold   int
+	extractionClient   *llm.Client
+	extractors         []extract.Extractor
+	extractionReady    bool
 
 	pendingExtractionDocID *uint
 	extraction             *extractionLogState
@@ -277,22 +276,21 @@ type llmConfig struct {
 	Model        string
 	APIKey       string //nolint:gosec // G101 false positive: field name triggers heuristic, not a hardcoded credential
 	ExtraContext string
-	Timeout      time.Duration
-	Thinking     string // reasoning effort: none|low|medium|high|auto
+	Timeout      time.Duration // inference context deadline
+	Thinking     string        // reasoning effort: none|low|medium|high|auto
 }
 
 // extractionConfig holds resolved extraction pipeline settings.
 type extractionConfig struct {
 	// LLM connection settings for extraction. When Provider is non-empty,
 	// the extraction pipeline creates its own LLM client independent of
 	// the chat client. When empty, falls back to the chat client.
-	Provider            string
-	BaseURL             string
-	Model               string
-	APIKey              string //nolint:gosec // G117 false positive: field name, not a hardcoded credential
-	Timeout             time.Duration
-	Thinking            string // reasoning effort level
-	LLMInferenceTimeout time.Duration
+	Provider string
+	BaseURL  string
+	Model    string
+	APIKey   string //nolint:gosec // G117 false positive: field name, not a hardcoded credential
+	Timeout  time.Duration // inference context deadline
+	Thinking string        // reasoning effort level
 
 	Extractors []extract.Extractor // configured extractors; nil = defaults
 	Enabled    bool                // LLM extraction enabled
@@ -307,22 +305,20 @@ func (o *Options) SetExtraction(
 	thinking string,
 	extractors []extract.Extractor,
 	enabled bool,
-	llmInferenceTimeout time.Duration,
 	ocrTSV bool,
 	ocrConfThreshold int,
 ) {
 	o.ExtractionConfig = extractionConfig{
-		Provider:            provider,
-		BaseURL:             baseURL,
-		Model:               model,
-		APIKey:              apiKey,
-		Timeout:             timeout,
-		Thinking:            thinking,
-		LLMInferenceTimeout: llmInferenceTimeout,
-		Extractors:          extractors,
-		Enabled:             enabled,
-		OCRTSV:              ocrTSV,
-		OCRConfThreshold:    ocrConfThreshold,
+		Provider:         provider,
+		BaseURL:          baseURL,
+		Model:            model,
+		APIKey:           apiKey,
+		Timeout:          timeout,
+		Thinking:         thinking,
+		Extractors:       extractors,
+		Enabled:          enabled,
+		OCRTSV:           ocrTSV,
+		OCRConfThreshold: ocrConfThreshold,
 	}
 }
 
```

internal/config/config.go

Lines changed: 19 additions & 9 deletions
```diff
@@ -66,9 +66,9 @@ type LLM struct {
 	// Currency is handled by [locale] section. Optional; defaults to empty.
 	ExtraContext string `toml:"extra_context"`
 
-	// Timeout is the maximum time for a single LLM response (including
+	// Timeout is the base inference timeout for LLM responses (including
 	// streaming). Go duration string, e.g. "5m", "10m". Default: "5m".
-	// Quick operations (ping, model listing) use a shorter fixed deadline.
+	// Per-pipeline overrides: llm.chat.timeout and llm.extraction.timeout.
 	Timeout string `toml:"timeout" default:"5m"`
 
 	// Thinking controls the model's reasoning effort level. Supported values:
@@ -114,7 +114,7 @@ type ResolvedLLM struct {
 	Model        string
 	APIKey       string //nolint:gosec // resolved config field, not a hardcoded credential
 	ExtraContext string
-	Timeout      time.Duration
+	Timeout      time.Duration // inference context deadline for this pipeline
 	Thinking     string
 }
 
@@ -965,12 +965,23 @@ func migrateRenamedKeys(cfg *Config, md toml.MetaData, path string) {
 			"extraction.thinking is deprecated -- use llm.extraction.thinking instead",
 		)
 	}
+
+	// extraction.llm_timeout -> llm.extraction.timeout (v1.80)
+	if md.IsDefined("extraction", "llm_timeout") && !md.IsDefined("llm", "extraction", "timeout") {
+		cfg.LLM.Extraction.Timeout = cfg.Extraction.LLMTimeout
+		cfg.Warnings = append(cfg.Warnings,
+			"extraction.llm_timeout is deprecated -- use llm.extraction.timeout instead",
+		)
+	}
 }
 
 // envRenames maps deprecated environment variable names to their canonical
 // replacements. Processed newest-first so that the most recent intermediate
 // name wins when multiple generations of the same variable are set.
 var envRenames = []struct{ old, canonical string }{
+	// v1.80: extraction.llm_timeout -> llm.extraction.timeout.
+	{"MICASA_EXTRACTION_LLM_TIMEOUT", "MICASA_LLM_EXTRACTION_TIMEOUT"},
+
 	// v1.78: extraction.enabled -> extraction.enable.
 	{"MICASA_EXTRACTION_ENABLED", "MICASA_EXTRACTION_ENABLE"},
 
@@ -1105,9 +1116,9 @@ model = "` + DefaultModel + `"
 # Use this to inject domain-specific details about your house, region, etc.
 # extra_context = "My house is a 1920s craftsman in Portland, OR."
 
-# Maximum time for a single LLM response (including streaming).
+# Base inference timeout for LLM responses (including streaming).
 # Go duration syntax: "5m", "10m", etc. Default: "5m".
-# Increase for very slow models or complex queries.
+# Per-pipeline overrides: llm.chat.timeout and llm.extraction.timeout.
 # timeout = "5m"
 
 # Model reasoning effort level. Supported: none, low, medium, high, auto.
@@ -1121,7 +1132,7 @@ model = "` + DefaultModel + `"
 # base_url = "https://api.anthropic.com"
 # model = "claude-sonnet-4-5-20250929"
 # api_key = "sk-ant-..."
-# timeout = "10s"
+# timeout = "5m" # inference context deadline (default: 5m)
 # thinking = "medium"
 
 # [llm.extraction]
@@ -1131,7 +1142,7 @@ model = "` + DefaultModel + `"
 # base_url = "https://api.anthropic.com"
 # model = "claude-haiku-3-5-20241022"
 # api_key = "sk-ant-..."
-# timeout = "15s"
+# timeout = "5m" # inference context deadline (default: 5m)
 # thinking = "low"
 
 [documents]
@@ -1153,8 +1164,7 @@ model = "` + DefaultModel + `"
 # still run (see [extraction.ocr]) to populate document text for search/display.
 # enable = true
 
-# Timeout for LLM extraction inference. Go duration syntax: "5m", "90s", etc.
-# Default: "5m". Increase for slow local models or complex documents.
+# Deprecated: use [llm.extraction] timeout instead.
 # llm_timeout = "5m"
 
 # Maximum pages for async extraction of scanned documents. 0 = no limit. Default: 0.
```
