99 ABC ,
1010 abstractmethod ,
1111)
12- from collections .abc import Callable
12+ from collections .abc import (
13+ Callable ,
14+ Sequence ,
15+ )
1316from dataclasses import dataclass
1417from typing import (
1518 Any ,
3740
3841from pydantic_ai import Agent
3942from pydantic_ai .exceptions import UnexpectedModelBehavior
43+ from pydantic_ai .messages import (
44+ ModelMessage ,
45+ ModelRequest ,
46+ ModelResponse ,
47+ SystemPromptPart ,
48+ TextPart ,
49+ UserPromptPart ,
50+ )
4051from pydantic_ai .models .openai import OpenAIChatModel
4152from pydantic_ai .providers .openai import OpenAIProvider
4253from pydantic_ai .settings import ModelSettings
# Literal inlines enum values in JSON schema, avoiding $defs that vLLM can't handle
ConfidenceLiteral = Literal["low", "medium", "high"]

MAX_HISTORY_MESSAGES = 40
"""Cap on prior messages passed as pydantic-ai ``message_history``.

~20 turn-pairs; tool-heavy turns produce 3-5 messages each. Bounds total token
load while preserving enough recent context to keep multi-turn conversations
coherent.
"""

TOOL_HELPER_HISTORY_MESSAGES = 8
"""Tighter cap when a sub-agent is invoked from inside a `@agent.tool` call.

Tool-context turns burn token budget faster (tool call + tool return +
follow-up), so we hand the sub-agent a smaller window.
"""
94+
6995__all__ = [
7096 "ActionSuggestion" ,
7197 "ActionType" ,
78104 "extract_structured_output" ,
79105 "extract_usage_info" ,
80106 "GalaxyAgentDependencies" ,
107+ "MAX_HISTORY_MESSAGES" ,
81108 "normalize_llm_text" ,
82109 "SimpleGalaxyAgent" ,
110+ "TOOL_HELPER_HISTORY_MESSAGES" ,
111+ "truncate_message_history" ,
83112]
84113
85114
def truncate_message_history(history: list[ModelMessage], limit: int = MAX_HISTORY_MESSAGES) -> list[ModelMessage]:
    """Cap conversation history at ``limit`` recent messages, preserving the first one.

    Keeps ``history[0]`` -- typically the user's original request, which anchors
    intent across long conversations -- and the most recent ``limit`` messages.

    Args:
        history: Chronologically ordered conversation messages.
        limit: Maximum number of trailing messages to keep in addition to the
            first one. A non-positive limit keeps only the first message.

    Returns:
        ``history`` itself when it already fits; otherwise a new list of
        ``history[0]`` followed by the last ``limit`` messages.
    """
    # max(limit, 0) also makes an empty history with a negative limit return
    # early instead of hitting history[0] below (IndexError).
    if len(history) <= max(limit, 0):
        return history
    # Guard the slice explicitly: ``history[-0:]`` is the WHOLE list, so a
    # non-positive limit would otherwise duplicate history[0] and not truncate.
    tail = history[-limit:] if limit > 0 else []
    log.info(
        "Truncating conversation history from %d to %d messages (first + last %d)",
        len(history),
        len(tail) + 1,
        limit,
    )
    return [history[0]] + tail
130+
131+
def _coerce_message_history(history: Sequence[Any]) -> list[ModelMessage]:
    """Normalize API-formatted and legacy role/content chat history.

    Pre-built ``ModelRequest``/``ModelResponse`` objects pass through as-is;
    ``{"role": ..., "content": ...}`` dicts are converted; anything else is
    counted and dropped with a single summary warning.
    """
    converted: list[ModelMessage] = []
    dropped = 0

    for entry in history:
        if isinstance(entry, (ModelRequest, ModelResponse)):
            converted.append(entry)
        elif isinstance(entry, dict) and entry.get("content") is not None:
            role = str(entry.get("role", "")).lower()
            text = str(entry.get("content"))
            if role == "assistant":
                converted.append(ModelResponse(parts=[TextPart(content=text)]))
            elif role == "user":
                converted.append(ModelRequest(parts=[UserPromptPart(content=text)]))
            elif role == "system":
                converted.append(ModelRequest(parts=[SystemPromptPart(content=text)]))
            else:
                # Unknown role -- don't guess at a message type.
                dropped += 1
        else:
            # Non-dict item, or a dict whose content is missing/None.
            dropped += 1

    if dropped:
        log.warning("Ignored %d unsupported conversation_history message(s)", dropped)

    return converted
165+
166+
86167def extract_result_content (result : Any ) -> str :
87168 """Extract text content from a pydantic-ai result (.output or .data)."""
88169 if hasattr (result , "output" ):
@@ -259,15 +340,56 @@ async def process(self, query: str, context: Optional[dict[str, Any]] = None) ->
259340 return self ._validation_error_response (validation_error )
260341
261342 try :
262- full_prompt = self ._prepare_prompt (query , context or {})
263- result = await self ._run_with_retry (full_prompt )
264- return self ._format_response (result , query , context or {})
343+ ctx = context or {}
344+ message_history = self ._extract_message_history (ctx )
345+ full_prompt = self ._prepare_prompt (query , self ._strip_history_from_context (ctx ))
346+ result = await self ._run_with_retry (full_prompt , message_history = message_history )
347+ return self ._format_response (result , query , ctx )
265348
266349 except (UnexpectedModelBehavior , OSError , ValueError ) as e :
267350 log .warning (f"Error in { self .agent_type } agent: { e } " )
268351 return self ._get_fallback_response (query , str (e ))
269352
270- async def _run_with_retry (self , prompt : str , max_retries : int = 3 , base_delay : float = 1.0 ):
353+ @staticmethod
354+ def _extract_message_history (
355+ context : Optional [dict [str , Any ]],
356+ limit : int = MAX_HISTORY_MESSAGES ,
357+ ) -> Optional [list [ModelMessage ]]:
358+ """Pull ``conversation_history`` out of context, normalize it, and truncate it.
359+
360+ Returns None when history is missing/empty so callers can pass it
361+ straight to ``agent.run(..., message_history=...)`` without branching.
362+ """
363+ if not context :
364+ return None
365+ history = context .get ("conversation_history" )
366+ if not history :
367+ return None
368+ if isinstance (history , (str , bytes )) or not isinstance (history , Sequence ):
369+ log .warning ("Ignoring unsupported conversation_history value of type %s" , type (history ).__name__ )
370+ return None
371+ messages = _coerce_message_history (history )
372+ if not messages :
373+ return None
374+ return truncate_message_history (messages , limit = limit )
375+
376+ @staticmethod
377+ def _strip_history_from_context (context : dict [str , Any ]) -> dict [str , Any ]:
378+ """Drop ``conversation_history`` before rendering context as text.
379+
380+ ``_prepare_prompt`` stringifies whatever's in the context dict; the raw
381+ ``ModelMessage`` repr is noise once we're passing the history through
382+ the structured ``message_history`` channel.
383+ """
384+ return {k : v for k , v in context .items () if k != "conversation_history" }
385+
386+ async def _run_with_retry (
387+ self ,
388+ prompt : str ,
389+ max_retries : int = 3 ,
390+ base_delay : float = 1.0 ,
391+ message_history : Optional [list [ModelMessage ]] = None ,
392+ ):
271393 """Run the agent with exponential backoff for retryable errors."""
272394 last_exception = None
273395
@@ -278,7 +400,12 @@ async def _run_with_retry(self, prompt: str, max_retries: int = 3, base_delay: f
278400
279401 for attempt in range (max_retries + 1 ):
280402 try :
281- return await self .agent .run (prompt , deps = self .deps , model_settings = model_settings )
403+ return await self .agent .run (
404+ prompt ,
405+ deps = self .deps ,
406+ model_settings = model_settings ,
407+ message_history = message_history ,
408+ )
282409
283410 except Exception as e :
284411 last_exception = e
@@ -551,27 +678,19 @@ async def _call_agent_from_tool(
551678
552679 target_agent = ctx .deps .get_agent (agent_type , ctx .deps )
553680
554- full_query = query
555- if context and "conversation_history" in context :
556- history = context ["conversation_history" ]
557- if history and len (history ) > 0 :
558- history_text = "Previous conversation:\n "
559- for msg in history [- 4 :]:
560- role = msg .get ("role" , "unknown" )
561- content = msg .get ("content" , "" )[:200 ]
562- history_text += f"{ role } : { content } \n "
563- full_query = f"{ history_text } \n Current request: { query } "
681+ message_history = self ._extract_message_history (context , limit = TOOL_HELPER_HISTORY_MESSAGES )
564682
565683 target_model_settings = {
566684 "temperature" : target_agent ._get_temperature (),
567685 "max_tokens" : target_agent ._get_max_tokens (),
568686 }
569687
570688 result = await target_agent .agent .run (
571- full_query ,
689+ query ,
572690 deps = ctx .deps ,
573691 usage = usage or ctx .usage ,
574692 model_settings = target_model_settings ,
693+ message_history = message_history ,
575694 )
576695
577696 response_data = extract_result_content (result )
0 commit comments