vdavid
diff --git a/‎.github/workflows/slow-checks.yml‎
Lines changed: 9 additions & 0 deletions b/‎.github/workflows/slow-checks.yml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎apps/desktop/src-tauri/src/ai/CLAUDE.md‎
Lines changed: 3 additions & 2 deletions b/‎apps/desktop/src-tauri/src/ai/CLAUDE.md‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎apps/desktop/src-tauri/src/ai/client.rs‎
Lines changed: 134 additions & 6 deletions b/‎apps/desktop/src-tauri/src/ai/client.rs‎
Lines changed: 134 additions & 6 deletions
diff --git a/‎apps/desktop/src-tauri/src/ai/client_real_groq_test.rs‎
Lines changed: 61 additions & 0 deletions b/‎apps/desktop/src-tauri/src/ai/client_real_groq_test.rs‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎apps/desktop/src-tauri/src/ai/mod.rs‎
Lines changed: 2 additions & 0 deletions b/‎apps/desktop/src-tauri/src/ai/mod.rs‎
Lines changed: 2 additions & 0 deletions
@@ -78,6 +78,15 @@ jobs:
       - name: Run govulncheck
         run: ./scripts/check/check --check scripts-go-govulncheck --ci
 
+      # Real-API smoke against Groq (OpenAI-compatible, free tier). Self-skips when the
+      # GROQ_API_KEY secret is absent, so this never fails the workflow before the secret
+      # is added in repo settings. Catches adapter-routing / auth / parse regressions that
+      # the wiremock tests can't (e.g. the genai Ollama-fallback bug for `llama-*` models).
+      - name: Run Groq smoke (real API)
+        run: ./scripts/check/check --check desktop-rust-groq-smoke --ci
+        env:
+          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
+
   # ===========================================
   # Type-aware ESLint (too slow for per-push CI)
   # ===========================================
 
@@ -19,7 +19,8 @@ Three provider modes:
 | `download.rs` | HTTP streaming download with Range-based resume. Emits `ai-download-progress` events (200ms throttle). Cooperative cancellation via function parameter (`Fn() -> bool`). |
 | `extract.rs` | Copies bundled `llama-server` binary + dylibs from `resources/ai/` to the AI data dir. Sets Unix permissions, handles symlinks. |
 | `process.rs` | Spawns child process with `DYLD_LIBRARY_PATH` set. Instant SIGKILL to stop (llama-server is stateless; macOS reclaims all GPU/mmap resources). `kill_process` for fire-and-forget (quit, orphans), `kill_and_reap_in_background` for normal operation (reaps zombie in bg thread). `kill_stale_llama_servers` for belt-and-suspenders orphan cleanup by process name. Port discovery via `bind(:0)`. |
-| `client.rs` | `genai`-backed chat client. `AiBackend` is a struct bundling a long-lived `genai::Client` with a model name; built via `AiBackend::local(port)` or `AiBackend::remote(api_key, base_url, model)`. The model name picks the adapter (`claude-*` → Anthropic native, `gemini-*` → Gemini native, `gpt-5*`/`*-pro`/`*-codex` → OpenAI Responses API, etc.). Auto-omits `temperature`/`top_p` for OpenAI Responses adapter and for chat-completions reasoning models (`o1*`, `o3*`, `o4*`, `chatgpt-*`, `gpt-5*` defense-in-depth) and substitutes `ReasoningEffort::Low`. Local backend forces the OpenAI adapter via a `ServiceTargetResolver` pinning endpoint to `http://127.0.0.1:<port>/v1/`. Exposes both `chat_completion` (full response) and `chat_completion_stream` (returns a `BoxStream<Result<String, AiError>>` of content chunks; reasoning/thought-signature/tool-call chunks filtered out). `AiError` is typed by HTTP status via the pure `ai_error_for_status` (401/403 → `AuthFailed`, 429 → `RateLimited`, else `ServerError`); a `None` `first_text()` → `EmptyResponse`. |
+| `client.rs` | `genai`-backed chat client. `AiBackend` is a struct bundling a long-lived `genai::Client` with a model name; built via `AiBackend::local(port)` or `AiBackend::remote(api_key, base_url, model)`. For `remote`, the model name picks the adapter via the pure `remote_model_iden`: `claude-*` → Anthropic native, `gemini-*` → Gemini native, `gpt-*`/`o1*`/`o3*`/`o4*`/`chatgpt-*` → OpenAI (with `genai`'s `gpt-5*`/`*-codex`/`*-pro` → Responses-API auto-routing), and EVERYTHING ELSE is forced onto the OpenAI chat-completions adapter via the `openai::` namespace. That last rule is load-bearing: `genai` falls back to its **Ollama** adapter for unrecognized model names, so a bare `llama-3.1-8b-instant` (Groq), `deepseek-chat`, or `google/gemma-…:free` (OpenRouter) would POST to Ollama's `/api/chat` against an OpenAI endpoint and 404 — every BYOK provider except Anthropic/Gemini speaks OpenAI chat-completions. Auto-omits `temperature`/`top_p` for the OpenAI Responses adapter and for chat-completions reasoning models (`o1*`, `o3*`, `o4*`, `chatgpt-*`, `gpt-5*` defense-in-depth) and substitutes `ReasoningEffort::Low`. Local backend forces the OpenAI adapter via a `ServiceTargetResolver` pinning endpoint to `http://127.0.0.1:<port>/v1/`. Exposes `chat_completion` (full response), `chat_completion_with_empty_retry` (retries once with 4× the token budget on `EmptyResponse` — the translate commands use this), and `chat_completion_stream` (returns a `BoxStream<Result<String, AiError>>` of content chunks; reasoning/thought-signature/tool-call chunks filtered out). `AiError` is typed by HTTP status via the pure `ai_error_for_status` (401/403 → `AuthFailed`, 429 → `RateLimited`, else `ServerError`); a `None` `first_text()` → `EmptyResponse`. |
+| `client_real_groq_test.rs` | `#[ignore]`-gated real-API smoke against Groq (OpenAI-compatible, free tier) through `AiBackend::remote` + `chat_completion_with_empty_retry`. The cheap always-available real-provider gate — catches adapter-routing / auth / parse regressions the wiremock tests can't (it's what caught the Ollama-fallback bug above). The `groq-smoke` check (Go runner) resolves `GROQ_API_KEY` from env or the macOS Keychain and runs it with `--run-ignored only`, self-skipping when no key. CI: `slow-checks.yml` passes the `GROQ_API_KEY` secret. |
 | `translate_error.rs` | `AiTranslateError { kind, message }` + `AiTranslateErrorKind` enum, the typed error the two translate IPC commands return so the frontend branches on `kind` (not the message string). `From<AiError>` maps transport variants; the commands map `BackendResolution` non-ready cases. Mirror enum: `lib/ai/translate-error-toast.ts`. |
 | `client_integration_test.rs` | `wiremock`-based tests covering request shape per adapter (chat completions vs Responses API), parsing, error mapping. Always run in CI. |
 | `client_streaming_test.rs` | `axum`-based SSE mock server tests for `chat_completion_stream`: chunks arrive in order, empty streams end cleanly, drop-mid-stream closes the connection, HTTP 5xx maps to `ServerError`. Always run in CI. (Wiremock can't chunk-deliver SSE bodies. See Gotchas.) |
@@ -149,7 +150,7 @@ privacy-focused users. The architecture doesn't fight this switch: it's just a d
 
 **Gotcha**: `genai 0.6` auto-routes `gpt-5*`, `*-codex`, `*-pro` to the Responses API, but `o1*`/`o3*`/`o4*`/`chatgpt-*` stay on Chat Completions even though they also reject custom `temperature`. We layer `is_openai_chat_reasoning_model()` on top to strip `temperature`/`top_p` and substitute `ReasoningEffort::Low` for those. The heuristic also matches `gpt-5*` as defense-in-depth in case `genai`'s routing rule changes.
 
-**Gotcha**: For reasoning models, `max_tokens` (`max_output_tokens` on Responses API) covers reasoning + visible answer combined. Real-world finding: at `ReasoningEffort::Low`, `gpt-5-mini` consumed all 40 tokens thinking and emitted no `output_text`, so `first_text()` returned `None`. Both translate commands now request `max_tokens=300` (search bumped from 200; selection already 300) to give reasoning room before the visible answer. When `first_text()` is still `None`, `chat_completion` returns the typed `AiError::EmptyResponse` (not a generic parse error), which surfaces as a specific "the AI came back empty, try a faster model" toast. `suggestions.rs` (`max_tokens=150`) stays graceful-empty since folder suggestions are nice-to-have. Picking a non-reasoning model (the default `gpt-4.1-mini`) sidesteps this entirely.
+**Gotcha**: For reasoning models, `max_tokens` (`max_output_tokens` on Responses API) covers reasoning + visible answer combined. Real-world finding: at `ReasoningEffort::Low`, `gpt-5-mini` consumed all 40 tokens thinking and emitted no `output_text`, so `first_text()` returned `None`. Both translate commands now request `max_tokens=300` (search bumped from 200; selection already 300) AND call `chat_completion_with_empty_retry`, which on an `EmptyResponse` retries ONCE with 4× the budget (capped at 2000) — a provider-agnostic guard that reacts to the symptom instead of maintaining a never-complete reasoning-model name list. When the retry is still empty, `chat_completion` surfaces the typed `AiError::EmptyResponse`, which becomes a specific "the AI came back empty, try a faster model" toast. `suggestions.rs` (`max_tokens=150`) stays graceful-empty since folder suggestions are nice-to-have and don't use the retry helper. Picking a non-reasoning model (the default `gpt-4.1-mini`) sidesteps this entirely.
 
 **Gotcha**: `tauri::async_runtime::spawn` is used in `configure_ai` and `start_ai_server` instead of `tokio::spawn`.
 **Why**: These may run during Tauri setup before the tokio runtime is fully available. `tauri::async_runtime::spawn` uses Tauri's own runtime which is always ready at that point.
 
@@ -5,9 +5,11 @@
 //!
 //! Two backend constructors:
 //! - [`AiBackend::local`]: forces the OpenAI adapter at `http://127.0.0.1:<port>/v1/`.
-//! - [`AiBackend::remote`]: BYOK. The model name picks the adapter (e.g. `claude-*` → Anthropic
-//!   native, `gemini-*` → Gemini native, `gpt-5*`/`*-pro`/`*-codex` → OpenAI Responses), and
-//!   `base_url` overrides the endpoint.
+//! - [`AiBackend::remote`]: BYOK. `base_url` overrides the endpoint; the adapter comes from the
+//!   model name via [`remote_model_iden`] — `claude-*`/`gemini-*` keep their native protocols,
+//!   `gpt-*`/`o1*`/`o3*`/`o4*`/`chatgpt-*` use OpenAI (incl. the Responses-API auto-routing),
+//!   and every other OpenAI-compatible provider (Groq, OpenRouter, DeepSeek, …) is forced onto
+//!   OpenAI chat-completions so `genai` doesn't mis-route it to Ollama.
 
 use std::sync::Arc;
 use std::time::Duration;
@@ -46,8 +48,10 @@ impl AiBackend {
         }
     }
 
-    /// Remote / cloud provider. Adapter is chosen from the model name prefix
-    /// (e.g. `claude-3-5-sonnet-latest` → Anthropic, `gemini-2.0-flash` → Gemini).
+    /// Remote / cloud provider. The adapter is chosen from the model name: `claude-*` and
+    /// `gemini-*` use their native protocols, `gpt-*` / `o1*` / `o3*` / `o4*` / `chatgpt-*` use
+    /// OpenAI (with `genai`'s `gpt-5*`/`*-codex`/`*-pro` → Responses-API auto-routing), and
+    /// EVERYTHING ELSE is forced onto the OpenAI chat-completions adapter (see
+    /// [`remote_model_iden`]).
     pub fn remote(api_key: String, base_url: String, model: String) -> Self {
         // Without a trailing `/` the `Url::join` quirk above silently drops `/v1`.
         let endpoint = if base_url.ends_with('/') {
@@ -57,7 +61,37 @@ impl AiBackend {
         };
         let resolver = make_resolver(endpoint, AuthData::from_single(api_key), ForceAdapter::None);
         let client = Client::builder().with_service_target_resolver(resolver).build();
-        Self { client, model }
+        // The resolver only overrides endpoint + auth; adapter dispatch happens from the model
+        // name BEFORE the resolver runs (same reason `local` uses the `openai::` namespace).
+        Self {
+            client,
+            model: remote_model_iden(&model),
+        }
+    }
+}
+
+/// Maps a BYOK model name to the `genai` model identifier whose namespace picks the right adapter.
+///
+/// `genai` infers the adapter from the model name and falls back to **Ollama** for anything it
+/// doesn't recognize — so a bare `llama-3.1-8b-instant` (Groq), `deepseek-chat`, or
+/// `google/gemma-…:free` (OpenRouter) would POST to Ollama's `/api/chat` against an OpenAI endpoint
+/// and 404. Every cloud provider we support except Anthropic and Gemini speaks the OpenAI
+/// chat-completions wire format, so we force the `openai::` namespace for all of them. Anthropic
+/// (`claude-*`) and Gemini (`gemini-*`) keep their native adapters; real OpenAI model families
+/// (`gpt-*` / `o1*` / `o3*` / `o4*` / `chatgpt-*`) are left alone so `genai` can auto-route the
+/// `gpt-5*` / `*-codex` / `*-pro` Responses-API models.
+///
+/// A name that already carries an explicit `namespace::` prefix is returned unchanged: its adapter
+/// is already pinned, and prepending a second namespace would produce `openai::openai::…`, which
+/// would put the inner namespace into the model name sent to the provider.
+fn remote_model_iden(model: &str) -> String {
+    // Bare-name prefixes that are returned unchanged so `genai` infers the adapter itself.
+    const INFERRED_PREFIXES: [&str; 7] =
+        ["claude-", "gemini-", "gpt-", "o1", "o3", "o4", "chatgpt-"];
+
+    let keeps_name = model.contains("::")
+        || INFERRED_PREFIXES
+            .iter()
+            .any(|prefix| model.starts_with(prefix));
+    if keeps_name {
+        model.to_string()
+    } else {
+        format!("openai::{model}")
     }
 }
 
@@ -143,6 +177,55 @@ pub async fn chat_completion(
     Ok(text)
 }
 
+/// Hard ceiling for the empty-response retry's token budget, so a pathological model can't make
+/// us request an unbounded (and expensive) completion.
+const EMPTY_RETRY_TOKEN_CEILING: u32 = 2000;
+/// Multiplier applied to `max_tokens` on the empty-response retry.
+const EMPTY_RETRY_TOKEN_FACTOR: u32 = 4;
+
+/// Builds the options for the empty-response retry: a copy of `options` whose `max_tokens` is
+/// the original budget times `factor`, never exceeding `ceiling`.
+///
+/// Pure, so the budget math can be unit-tested. When `options` has no `max_tokens`, the retry
+/// budget is `ceiling` itself — an uncapped first attempt already had room, so there is no base
+/// to scale. The multiplication saturates instead of wrapping. Note that the cap is applied
+/// unconditionally: a budget that was already above `ceiling` comes back as `ceiling`.
+fn with_bumped_max_tokens(options: &ChatOptions, factor: u32, ceiling: u32) -> ChatOptions {
+    let retry_budget = options
+        .max_tokens
+        .map_or(ceiling, |current| current.saturating_mul(factor).min(ceiling));
+
+    let mut retry_options = options.clone();
+    retry_options.max_tokens = Some(retry_budget);
+    retry_options
+}
+
+/// [`chat_completion`] with a single safety net: when the first attempt yields no visible text
+/// ([`AiError::EmptyResponse`]), the call is repeated ONCE with a larger token budget.
+///
+/// Reasoning models (`gpt-5*`, `o*`, DeepSeek `*-reasoner`, Qwen `qwq`, …) can burn the entire
+/// budget on hidden reasoning and never reach the answer. Instead of keeping a model-name list
+/// that can never be complete, this reacts to the symptom and retries with room to think AND
+/// answer. The retry budget comes from [`with_bumped_max_tokens`] using
+/// [`EMPTY_RETRY_TOKEN_FACTOR`] and [`EMPTY_RETRY_TOKEN_CEILING`].
+///
+/// There is exactly one retry: if it is empty too, the budget is not the problem and the
+/// `EmptyResponse` is returned so the UI can suggest a faster model. A success or any other
+/// error from the first attempt is returned as-is without a second request.
+pub async fn chat_completion_with_empty_retry(
+    backend: &AiBackend,
+    system_prompt: &str,
+    user_prompt: &str,
+    options: &ChatOptions,
+) -> Result<String, AiError> {
+    let first_attempt = chat_completion(backend, system_prompt, user_prompt, options).await;
+    if !matches!(first_attempt, Err(AiError::EmptyResponse)) {
+        // Success or a non-empty-response error: nothing a bigger budget would fix.
+        return first_attempt;
+    }
+
+    let retry_options =
+        with_bumped_max_tokens(options, EMPTY_RETRY_TOKEN_FACTOR, EMPTY_RETRY_TOKEN_CEILING);
+    log::info!(
+        "AI chat_completion: empty response, retrying once with max_tokens={:?} (was {:?})",
+        retry_options.max_tokens,
+        options.max_tokens
+    );
+    chat_completion(backend, system_prompt, user_prompt, &retry_options).await
+}
+
 /// Streams a chat completion. Returns a boxed stream of content chunks.
 ///
 /// Same per-model option fixups as [`chat_completion`] (reasoning models get
@@ -359,6 +442,51 @@ mod tests {
         );
     }
 
+    /// `remote_model_iden` keeps names whose adapter `genai` infers correctly and prefixes
+    /// everything else with `openai::`.
+    #[test]
+    fn remote_model_iden_forces_openai_for_compatible_providers() {
+        // Native protocols + real OpenAI families must come back unchanged.
+        let untouched = [
+            "claude-sonnet-4-5",
+            "gemini-2.5-flash",
+            "gpt-4.1-mini",
+            "gpt-5.5",
+            "o3-mini",
+            "chatgpt-4o-latest",
+        ];
+        for name in untouched {
+            assert_eq!(remote_model_iden(name), name, "{name} should keep its inferred adapter");
+        }
+
+        // OpenAI-compatible BYOK models that genai would mis-route to Ollama: (input, expected).
+        let forced = [
+            ("llama-3.1-8b-instant", "openai::llama-3.1-8b-instant"),
+            ("deepseek-chat", "openai::deepseek-chat"),
+            ("google/gemma-4-31b-it:free", "openai::google/gemma-4-31b-it:free"),
+            ("mistral-small-latest", "openai::mistral-small-latest"),
+        ];
+        for (input, expected) in forced {
+            assert_eq!(remote_model_iden(input), expected);
+        }
+    }
+
+    /// Pins the retry-budget math: multiply, cap at the ceiling, saturate, and default to the
+    /// ceiling when the caller set no budget.
+    #[test]
+    fn with_bumped_max_tokens_multiplies_and_caps() {
+        const CEILING: u32 = 2000;
+        let retry_budget = |opts: &ChatOptions, factor: u32| {
+            with_bumped_max_tokens(opts, factor, CEILING).max_tokens
+        };
+
+        let capped_300 = ChatOptions::default().with_max_tokens(300);
+        // 300 × 4 stays under the ceiling, so it is a plain multiplication.
+        assert_eq!(retry_budget(&capped_300, 4), Some(1200));
+        // 300 × 100 would exceed the ceiling, so the ceiling wins.
+        assert_eq!(retry_budget(&capped_300, 100), Some(2000));
+
+        // Saturating multiply can't overflow into a tiny value.
+        let at_u32_max = ChatOptions::default().with_max_tokens(u32::MAX);
+        assert_eq!(retry_budget(&at_u32_max, 4), Some(2000));
+
+        // No prior cap → jump straight to the ceiling on retry.
+        let uncapped = ChatOptions::default();
+        assert_eq!(retry_budget(&uncapped, 4), Some(2000));
+    }
+
     #[test]
     fn ai_error_for_status_classifies_by_code() {
         assert!(matches!(ai_error_for_status(401, "x".into()), AiError::AuthFailed(_)));
 
@@ -0,0 +1,61 @@
+//! Real-API smoke test against Groq (OpenAI-compatible, free tier). This is the cheap
+//! always-available real-provider gate for the AI translate pipeline: it exercises OUR
+//! `AiBackend::remote` + `chat_completion` code against a live OpenAI-compatible endpoint, so a
+//! regression in adapter routing, auth, or response parsing fails here instead of silently in
+//! production (the wiremock tests can't catch a real-API contract drift).
+//!
+//! `#[ignore]`-gated: needs a valid `GROQ_API_KEY`. The `groq-smoke` check in the Go check runner
+//! resolves the key (env var, else the macOS Keychain) and runs this with `--run-ignored only`,
+//! skipping cleanly when no key is available (contributors without a key, CI without the secret).
+//!
+//! Run manually:
+//! ```sh
+//! GROQ_API_KEY=$(security find-generic-password -a "$USER" -s "GROQ_API_KEY" -w) \
+//!   cargo nextest run --lib --run-ignored only ai::client_real_groq_test
+//! ```
+
+use genai::chat::ChatOptions;
+
+use super::client::{AiBackend, chat_completion_with_empty_retry};
+
+/// Groq's OpenAI-compatible base. Trailing slash required (see the `genai` `Url::join` gotcha).
+const BASE_URL: &str = "https://api.groq.com/openai/v1/";
+/// Smallest/fastest Groq model — cheapest smoke, non-reasoning so a tight budget is safe.
+const MODEL: &str = "llama-3.1-8b-instant";
+
+/// Reads `GROQ_API_KEY` from the environment.
+///
+/// Returns `None` when the variable is unset, is not valid Unicode, or contains only whitespace;
+/// otherwise returns the value exactly as stored (not trimmed).
+fn api_key_or_skip() -> Option<String> {
+    std::env::var("GROQ_API_KEY")
+        .ok()
+        .filter(|key| !key.trim().is_empty())
+}
+
+/// Sends one translate-shaped request to Groq's live OpenAI-compatible endpoint and asserts that
+/// a non-empty completion comes back.
+///
+/// Skips (returns without failing) when `GROQ_API_KEY` is unset or blank, so running every
+/// ignored test on a machine without a key does not report a spurious failure.
+#[tokio::test]
+#[ignore = "real API call: set GROQ_API_KEY to run"]
+async fn smoke_groq_translate_shaped_completion() {
+    let Some(api_key) = api_key_or_skip() else {
+        // No key means "skip", not "fail": the check runner and CI treat a missing key as a
+        // clean skip, and this test should agree with them.
+        eprintln!("GROQ_API_KEY not set; skipping Groq smoke test");
+        return;
+    };
+
+    let backend = AiBackend::remote(api_key, BASE_URL.to_string(), MODEL.to_string());
+
+    // Mirror the translate commands' option shape (temperature + capped tokens + the empty-retry
+    // wrapper), so this exercises the same path Search/Selection use.
+    let options = ChatOptions::default()
+        .with_temperature(0.3)
+        .with_max_tokens(50)
+        .with_top_p(0.9);
+
+    let system = "You output one line in the form `keyword: value`. No prose.";
+    let user = "files named report from last week";
+
+    let response = chat_completion_with_empty_retry(&backend, system, user, &options)
+        .await
+        .expect("Groq chat completion should succeed");
+
+    assert!(
+        !response.trim().is_empty(),
+        "Groq returned an empty completion: {response:?}"
+    );
+}
@@ -25,6 +25,8 @@ mod client_local_llama_test;
 #[cfg(test)]
 mod client_real_anthropic_test;
 #[cfg(test)]
+mod client_real_groq_test;
+#[cfg(test)]
 mod client_real_openai_test;
 #[cfg(test)]
 mod client_streaming_test;