Kiln-AI
diff --git a/libs/core/kiln_ai/adapters/chat/chat_formatter.py b/libs/core/kiln_ai/adapters/chat/chat_formatter.py
Lines changed: 39 additions & 5 deletions
diff --git a/libs/core/kiln_ai/adapters/chat/test_chat_formatter.py b/libs/core/kiln_ai/adapters/chat/test_chat_formatter.py
Lines changed: 56 additions & 0 deletions
diff --git a/libs/core/kiln_ai/adapters/model_adapters/adapter_stream.py b/libs/core/kiln_ai/adapters/model_adapters/adapter_stream.py
Lines changed: 45 additions & 4 deletions
@@ -256,6 +256,10 @@ class MultiturnFormatter(ChatFormatter):
     Takes prior_trace (existing conversation) and appends the new user message.
     Produces a single turn: the new user message. Tool calls and multi-turn
     model responses are handled by _run_model_turn's internal loop.
+
+    When user_input is a dict or list with tool_call_id keys, the input is
+    treated as tool call results (role "tool") rather than a user message.
+    This supports resuming after a return_on_tool_call interrupt.
     """
 
     def __init__(
@@ -274,14 +278,44 @@ def initial_messages(self) -> list[ChatCompletionMessageIncludingLiteLLM]:
         """Messages to seed the conversation (prior trace)."""
         return list(self._prior_trace)
 
+    @property
+    def _is_tool_result(self) -> bool:
+        """Return True if user_input is a tool-result dict or non-empty list of them."""
+        candidate = self.user_input  # not named `input`: avoid shadowing the builtin
+        if isinstance(candidate, dict):
+            return "tool_call_id" in candidate
+        if isinstance(candidate, list):
+            return bool(candidate) and all(
+                isinstance(item, dict) and "tool_call_id" in item for item in candidate
+            )
+        return False
+
     def next_turn(self, previous_output: str | None = None) -> Optional[ChatTurn]:
         if self._state == "start":
-            # prior trace is already in the messages list and contains system and so on, we only need
-            # to append the latest new user message
-            user_msg = BasicChatMessage("user", format_user_message(self.user_input))
             self._state = "awaiting_final"
-            self._messages.append(user_msg)
-            return ChatTurn(messages=[user_msg], final_call=True)
+            if self._is_tool_result:
+                if isinstance(self.user_input, dict):
+                    raw_items: list[dict] = [self.user_input]
+                else:
+                    raw_items = list(self.user_input)  # type: ignore[arg-type]
+                msgs: list[ChatMessage] = [
+                    ToolResponseMessage(
+                        role="tool",
+                        content=str(item.get("content", "")),
+                        tool_call_id=item["tool_call_id"],
+                    )
+                    for item in raw_items
+                ]
+                self._messages.extend(msgs)
+                return ChatTurn(messages=msgs, final_call=True)
+            else:
+                # prior trace is already in the messages list and contains system and so on, we only need
+                # to append the latest new user message
+                user_msg = BasicChatMessage(
+                    "user", format_user_message(self.user_input)
+                )
+                self._messages.append(user_msg)
+                return ChatTurn(messages=[user_msg], final_call=True)
 
         if self._state == "awaiting_final":
             if previous_output is None:
 
@@ -190,6 +190,62 @@ def test_multiturn_formatter_preserves_tool_call_messages():
     assert first.final_call
 
 
+def test_multiturn_formatter_single_tool_result():
+    """A lone dict carrying tool_call_id becomes one role="tool" message."""
+    assistant_call = {
+        "role": "assistant",
+        "content": None,
+        "tool_calls": [{"id": "call_1"}],
+    }
+    tool_result = {"tool_call_id": "call_1", "content": "42"}
+    fmt = MultiturnFormatter(prior_trace=[assistant_call], user_input=tool_result)
+
+    turn = fmt.next_turn()
+    assert turn is not None
+    assert len(turn.messages) == 1
+    (only_msg,) = turn.messages
+    assert only_msg.role == "tool"
+    assert only_msg.content == "42"
+    assert only_msg.tool_call_id == "call_1"
+    assert turn.final_call
+
+
+def test_multiturn_formatter_multiple_tool_results():
+    """Each dict in a tool-result list yields its own role="tool" message."""
+    results = [
+        {"tool_call_id": "call_1", "content": "15"},
+        {"tool_call_id": "call_2", "content": "36"},
+    ]
+    fmt = MultiturnFormatter(
+        prior_trace=[{"role": "assistant", "content": None, "tool_calls": []}],
+        user_input=results,
+    )
+
+    turn = fmt.next_turn()
+    assert turn is not None
+    assert len(turn.messages) == 2
+    # Messages must come back in input order, one per supplied result.
+    expected = [("call_1", "15"), ("call_2", "36")]
+    for msg, (call_id, content) in zip(turn.messages, expected):
+        assert msg.role == "tool"
+        assert msg.tool_call_id == call_id
+        assert msg.content == content
+    assert turn.final_call
+
+
+def test_multiturn_formatter_user_input_not_confused_with_tool_result():
+    """A dict lacking tool_call_id still yields a single role="user" message."""
+    plain_input = {"question": "what is 2+2?"}
+    system_only = [{"role": "system", "content": "sys"}]
+    fmt = MultiturnFormatter(prior_trace=system_only, user_input=plain_input)
+
+    turn = fmt.next_turn()
+    assert turn is not None
+    assert len(turn.messages) == 1
+    (only_msg,) = turn.messages
+    assert only_msg.role == "user"
+
+
 def test_format_user_message():
     # String
     assert format_user_message("test input") == "test input"
 
@@ -13,7 +13,7 @@
 )
 
 from kiln_ai.adapters.chat import ChatCompletionMessageIncludingLiteLLM
-from kiln_ai.adapters.chat.chat_formatter import ChatFormatter
+from kiln_ai.adapters.chat.chat_formatter import ChatFormatter, ToolResponseMessage
 from kiln_ai.adapters.litellm_utils.litellm_streaming import StreamingCompletion
 from kiln_ai.adapters.ml_model_list import KilnModelProvider
 from kiln_ai.adapters.model_adapters.stream_events import (
@@ -101,23 +101,30 @@ async def __aiter__(self) -> AsyncIterator[AdapterStreamEvent]:
             for message in turn.messages:
                 if message.content is None:
                     raise ValueError("Empty message content isn't allowed")
-                self._messages.append(
-                    {"role": message.role, "content": message.content}  # type: ignore[arg-type]
-                )
+                msg_dict: dict = {"role": message.role, "content": message.content}
+                if isinstance(message, ToolResponseMessage):
+                    msg_dict["tool_call_id"] = message.tool_call_id
+                self._messages.append(msg_dict)  # type: ignore[arg-type]
 
             skip_response_format = not turn.final_call
             turn_top_logprobs = self._top_logprobs if turn.final_call else None
 
+            interrupted = False
             async for event in self._stream_model_turn(
                 skip_response_format, turn_top_logprobs
             ):
                 if isinstance(event, _ModelTurnComplete):
                     usage += event.usage
                     prior_output = event.assistant_message
                     final_choice = event.model_choice
+                    if event.interrupted_by_tool_calls:
+                        interrupted = True
                 else:
                     yield event
 
+            if interrupted:
+                break
+
             if not prior_output:
                 raise RuntimeError("No assistant message/output returned from model")
 
@@ -176,6 +183,39 @@ async def _stream_model_turn(
             self._messages.append(response_choice.message)
 
             if tool_calls and len(tool_calls) > 0:
+                # Check for return_on_tool_call BEFORE processing
+                if self._adapter.base_adapter_config.return_on_tool_call:
+                    real_tool_calls = [
+                        tc for tc in tool_calls if tc.function.name != "task_response"
+                    ]
+                    if real_tool_calls:
+                        # Yield INPUT_AVAILABLE events for each tool call
+                        for tc in real_tool_calls:
+                            try:
+                                parsed_args = json.loads(tc.function.arguments)
+                            except (json.JSONDecodeError, TypeError):
+                                parsed_args = None
+                            yield ToolCallEvent(
+                                event_type=ToolCallEventType.INPUT_AVAILABLE,
+                                tool_call_id=tc.id,
+                                tool_name=tc.function.name or "unknown",
+                                arguments=parsed_args,
+                                error=(
+                                    f"Failed to parse arguments: {tc.function.arguments}"
+                                    if parsed_args is None
+                                    else None
+                                ),
+                            )
+
+                        yield _ModelTurnComplete(
+                            assistant_message="",
+                            model_choice=response_choice,
+                            usage=usage,
+                            interrupted_by_tool_calls=True,
+                        )
+                        return
+
+                # Existing flow: handle tool calls internally
                 async for event in self._handle_tool_calls(tool_calls):
                     yield event
 
@@ -265,6 +305,7 @@ class _ModelTurnComplete:
     assistant_message: str
     model_choice: Choices | None
     usage: Usage
+    interrupted_by_tool_calls: bool = False
 
 
 def _validate_response(