fix(workflows-mcp): migrate Ollama executor to /api/chat with reliable structured output

ibacalu · ibacalu · commit 3fcb9417477b · 2026-02-24T01:15:40.000+01:00
Switch from /api/generate to /api/chat for native structured output support.
Ollama's JSON Schema grammar constraint is only available on the chat endpoint.

Key changes:
- Use /api/chat with messages array instead of /api/generate with flat prompt
- Pass format parameter (inputs.response_schema) for grammar-enforced JSON output
- Force temperature=0 via options when a schema is requested for determinism
- Append explicit field hint to user message: 'Respond with JSON containing fields: <required>'
  (falls back to 'Respond with JSON' when the schema lists no required fields)
- Introduce validation_schema split: OpenAI validates against prepared_schema (strict);
  all other providers validate against inputs.response_schema, so the format parameter,
  client-side validation, and retry prompt all use the same schema
- Add robust regex markdown extractor in _validate_response to handle chatty models
- Add Gemini proxy mode (api_url-based routing) with extra_headers support
- Log Ollama request body and response content for observability
- Move import re to module level (was incorrectly placed inside staticmethod)
- Remove dead prepared_schema parameter from _call_ollama
- Fix test api_url to include /api/chat path
diff --git a/src/workflows_mcp/engine/executors_llm.py b/src/workflows_mcp/engine/executors_llm.py
@@ -11,6 +11,7 @@
 import json
 import logging
 import os
+import re
 from enum import Enum
 from typing import Any, ClassVar, cast
 
@@ -409,13 +410,27 @@ async def execute(  # type: ignore[override]
         # Determine if we need schema validation
         needs_validation = effective_inputs.response_schema is not None
 
-        # Prepare schema ONCE before retry loop - this is the schema LLM receives
-        # CRITICAL: Use the SAME prepared schema for validation to avoid mismatch
-        # where LLM generates null (allowed by prepared schema) but validation
-        # uses original schema (which doesn't allow null)
+        # Prepare schema ONCE before retry loop
+        # prepared_schema: strict OpenAI-compatible schema (for OpenAI's response_format param)
+        # validation_schema: schema used for client-side validation AND retry error prompts
+        #
+        # For OpenAI: both are prepared_schema (strict, with anyOf/additionalProperties)
+        # For all other providers (Ollama, Anthropic, Gemini): validation_schema stays
+        # as inputs.response_schema so the retry prompt matches what format sends
         prepared_schema: dict[str, Any] | None = None
+        validation_schema: dict[str, Any] | None = None
         if needs_validation:
             prepared_schema = self._prepare_schema_for_openai(effective_inputs.response_schema)
+            # Resolve effective provider to determine which schema to validate against
+            _provider = resolve_interpolatable_enum(
+                effective_inputs.provider, LLMProvider, "provider"
+            )
+            if _provider == LLMProvider.OPENAI:
+                # OpenAI sends prepared_schema to the API, validate against same schema
+                validation_schema = prepared_schema
+            else:
+                # Other providers use inputs.response_schema as format — validate consistently
+                validation_schema = effective_inputs.response_schema
 
         for attempt in range(max_retries):
             attempts += 1
@@ -431,7 +446,7 @@ async def execute(  # type: ignore[override]
                         + "\n\n"
                         + effective_inputs.validation_prompt_template.format(
                             validation_error=validation_error,
-                            schema=json.dumps(prepared_schema, indent=2),
+                            schema=json.dumps(validation_schema, indent=2),
                         )
                     )
 
@@ -449,10 +464,10 @@ async def execute(  # type: ignore[override]
                 # Validate response if schema provided (client-side for all providers)
                 if needs_validation:
                     try:
-                        # Use the SAME prepared schema that was sent to the LLM
+                        # Validate against the schema matching what the provider was given
                         validated_response = self._validate_response(
                             response_text=response_text,
-                            schema=cast(dict[str, Any], prepared_schema),
+                            schema=cast(dict[str, Any], validation_schema),
                         )
 
                         # Success - return validated JSON structure directly
@@ -1114,15 +1129,24 @@ async def _call_gemini(
     ) -> tuple[str, dict[str, Any]]:
         """Call Google Gemini API with null safety.
 
+        Supports two modes:
+        - Direct: api_key required, constructs googleapis.com URL with ?key= param
+        - Proxy: api_url is set (via profile), proxy handles auth — no api_key needed
+
         Raises:
-            ValueError: Missing API key, empty content, or null text
+            ValueError: Missing API key (direct mode), empty content, or null text
             httpx.*: Network/API errors
         """
-        if not inputs.api_key:
-            raise ValueError("api_key is required for Gemini provider")
-
-        base_url = "https://generativelanguage.googleapis.com/v1beta"
-        url = f"{base_url}/models/{inputs.model}:generateContent?key={inputs.api_key}"
+        if inputs.api_url:
+            # Proxy mode — proxy manages the API key, use api_url as base
+            base_url = inputs.api_url.rstrip("/")
+            url = f"{base_url}/v1beta/models/{inputs.model}:generateContent"
+        else:
+            # Direct mode — api_key required
+            if not inputs.api_key:
+                raise ValueError("api_key is required for Gemini provider")
+            base_url = "https://generativelanguage.googleapis.com/v1beta"
+            url = f"{base_url}/models/{inputs.model}:generateContent?key={inputs.api_key}"
 
         contents = [{"parts": [{"text": prompt}], "role": "user"}]
 
@@ -1142,12 +1166,14 @@ async def _call_gemini(
         if generation_config:
             body["generationConfig"] = generation_config
 
+        headers: dict[str, str] = {"Content-Type": "application/json"}
+
+        # Merge extra_headers (e.g., X-Org-Id, X-User-Id for proxy routing)
+        if inputs.extra_headers:
+            headers.update(_resolve_header_env_vars(inputs.extra_headers))
+
         async with httpx.AsyncClient(timeout=timeout) as client:
-            response = await client.post(
-                url,
-                json=body,
-                headers={"Content-Type": "application/json"},
-            )
+            response = await client.post(url, json=body, headers=headers)
             response.raise_for_status()
 
             data = response.json()
@@ -1190,42 +1216,73 @@ async def _call_ollama(
         temperature: float | None,
         max_tokens: int | None,
     ) -> tuple[str, dict[str, Any]]:
-        """Call Ollama local API with null safety.
+        """Call Ollama API via /api/chat with native structured output.
+
+        Uses /api/chat (messages format) instead of /api/generate because
+        Ollama's structured output (``format`` parameter with JSON schema)
+        is only supported on the chat endpoint.
 
         Raises:
             ValueError: Null response
             httpx.*: Network/API errors
         """
-        url = inputs.api_url or "http://localhost:11434/api/generate"
+        url = inputs.api_url or "http://localhost:11434/api/chat"
 
-        # Combine system instructions and prompt for Ollama
-        full_prompt = prompt
+        # Build messages list (Ollama /api/chat uses messages array)
+        messages: list[dict[str, str]] = []
         if inputs.system_instructions:
-            full_prompt = f"{inputs.system_instructions}\n\n{prompt}"
+            messages.append({"role": "system", "content": inputs.system_instructions})
+
+        # Ollama tip: append JSON instruction to the user message when a schema is defined.
+        # Explicitly name the required fields so models without strict grammar enforcement
+        # still know what keys to include.
+        user_content = prompt
+        if inputs.response_schema:
+            required_fields = inputs.response_schema.get("required", [])
+            if required_fields:
+                fields_hint = ", ".join(required_fields)
+                user_content = f"{prompt}\n\nRespond with JSON containing fields: {fields_hint}"
+            else:
+                user_content = f"{prompt}\n\nRespond with JSON"
+        messages.append({"role": "user", "content": user_content})
 
         body: dict[str, Any] = {
             "model": inputs.model,
-            "prompt": full_prompt,
+            "messages": messages,
             "stream": False,
         }
 
-        if temperature is not None:
+        # Native structured output — Ollama uses 'format' parameter on /api/chat
+        # Use raw inputs.response_schema (NOT prepared_schema) because llama.cpp's grammar
+        # engine can't reliably handle OpenAI-specific patterns like anyOf, additionalProperties.
+        # execute() validates non-OpenAI responses against this same raw schema.
+        if inputs.response_schema:
+            body["format"] = inputs.response_schema
+            # Ollama tip: force temperature=0 for deterministic structured output
+            body["options"] = {"temperature": 0}
+        elif temperature is not None:
             body["options"] = {"temperature": temperature}
 
+        headers: dict[str, str] = {"Content-Type": "application/json"}
+
+        # Merge extra_headers (e.g., X-Org-Id, X-User-Id for proxy routing)
+        if inputs.extra_headers:
+            headers.update(_resolve_header_env_vars(inputs.extra_headers))
+
+        logger.info(f"Engine sending Ollama request to {url} with body: {json.dumps(body)}")
+
         async with httpx.AsyncClient(timeout=timeout) as client:
-            response = await client.post(
-                url,
-                json=body,
-                headers={"Content-Type": "application/json"},
-            )
+            response = await client.post(url, json=body, headers=headers)
             response.raise_for_status()
 
             data = response.json()
 
-            # Extract content with null safety
-            response_text = data.get("response")
+            # Extract content from /api/chat response format
+            message = data.get("message", {})
+            response_text = message.get("content")
             if response_text is None:
-                raise ValueError("Ollama returned null response")
+                raise ValueError("Ollama returned null content")
+            logger.info(f"Engine received Ollama response content: {response_text!r}")
 
             provider_metadata = {
                 "model": data.get("model"),
@@ -1246,9 +1303,23 @@ def _validate_response(response_text: str, schema: dict[str, Any]) -> dict[str,
         Raises:
             ValueError: Invalid JSON, non-dict response, or schema validation failure
         """
+        # Robust extraction: find markdown json block anywhere in text
+        text = response_text.strip()
+        block_match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL | re.IGNORECASE)
+
+        if block_match:
+            text = block_match.group(1).strip()
+        else:
+            # Fallback for plain text containing JSON or conversational wrappers
+            # Find the outermost curly braces
+            start_idx = text.find("{")
+            end_idx = text.rfind("}")
+            if start_idx != -1 and end_idx != -1 and end_idx >= start_idx:
+                text = text[start_idx : end_idx + 1]
+
         # Try to parse as JSON
         try:
-            response = json.loads(response_text)
+            response = json.loads(text)
         except json.JSONDecodeError as e:
             raise ValueError(f"Response is not valid JSON: {e}")
 
diff --git a/tests/test_llm_executor.py b/tests/test_llm_executor.py
@@ -149,20 +149,20 @@ async def test_gemini_basic_call(self, executor, mock_context):
 
     @pytest.mark.asyncio
     async def test_ollama_basic_call(self, executor, mock_context):
-        """Test basic Ollama local API call."""
+        """Test basic Ollama API call via /api/chat."""
         inputs = LLMCallInput(
             provider="ollama",
             model="llama2",
             prompt="Hello",
-            api_url="http://localhost:11434/api/generate",
+            api_url="http://localhost:11434/api/chat",
             timeout=60,
         )
 
         mock_response = Mock()
         mock_response.status_code = 200
         mock_response.json.return_value = {
             "model": "llama2",
-            "response": "Hello! How are you?",
+            "message": {"role": "assistant", "content": "Hello! How are you?"},
             "total_duration": 1234567890,
             "load_duration": 123456,
             "prompt_eval_count": 5,