@@ -236,6 +236,27 @@ func (sm *ServerManager) getLLMClient() (*llm.LLMClient, error) {
 	return llm.NewLLMClient(sm.config, context.Background())
 }

+// chatCompletionClient returns an LLM client for a /v1/chat/completions request.
+// When schema is non-nil, a fresh client is built from a per-request config so
+// that structured-output overrides never mutate the REPL's shared client or
+// the server-level config.
+func (sm *ServerManager) chatCompletionClient(schema map[string]interface{}) (*llm.LLMClient, error) {
+	if schema == nil {
+		return sm.getLLMClient()
+	}
+	var cfg *llm.Config
+	ctx := context.Background()
+	if sm.repl != nil {
+		cfg = sm.repl.buildLLMConfig()
+		ctx = sm.repl.ctx
+	} else {
+		c := *sm.config
+		cfg = &c
+	}
+	cfg.Schema = schema
+	return llm.NewLLMClient(cfg, ctx)
+}
+
 // executeInputWithCapture runs a plain user input through the REPL and captures output
 func (sm *ServerManager) executeInputWithCapture(input string, stream bool, system string) (string, error) {
 	if sm.repl == nil {
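Note on the else branch above: c := *sm.config takes a shallow value copy, which is what keeps a request-scoped schema from leaking into the shared config; that is safe here because Schema is replaced on the copy, never mutated in place. A minimal standalone sketch of the same copy-on-write pattern, using a hypothetical Config type in place of llm.Config:

package main

import "fmt"

// Config stands in for llm.Config; only the Schema field matters here.
type Config struct {
	Model  string
	Schema map[string]interface{}
}

// perRequestConfig shallow-copies the shared config and sets the
// request-scoped schema on the copy, leaving the original untouched.
func perRequestConfig(shared *Config, schema map[string]interface{}) *Config {
	c := *shared
	c.Schema = schema
	return &c
}

func main() {
	shared := &Config{Model: "example-model"}
	perReq := perRequestConfig(shared, map[string]interface{}{"type": "object"})
	fmt.Println(shared.Schema == nil, perReq.Schema != nil) // prints: true true
}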
@@ -491,25 +512,21 @@ func (sm *ServerManager) handleChatCompletions(w http.ResponseWriter, r *http.Re
 		}
 	}

-	// Handle schema if provided
+	// Capture the optional per-request JSON schema without mutating shared state.
+	var schema map[string]interface{}
 	if req.ResponseFormat != nil && req.ResponseFormat.Type == "json_schema" {
-		sm.config.Schema = req.ResponseFormat.JSONSchema
-	} else {
-		sm.config.Schema = nil
+		schema = req.ResponseFormat.JSONSchema
 	}

-	// Set streaming based on request
-	stream := req.Stream
-
-	if stream {
-		sm.handleStreamingResponse(w, r, messages, req.Model)
+	if req.Stream {
+		sm.handleStreamingResponse(w, r, messages, req.Model, schema)
 	} else {
-		sm.handleNonStreamingResponse(w, r, messages, req.Model)
+		sm.handleNonStreamingResponse(w, r, messages, req.Model, schema)
 	}
 }

 // handleStreamingResponse handles streaming chat completions
-func (sm *ServerManager) handleStreamingResponse(w http.ResponseWriter, r *http.Request, messages []llm.Message, model string) {
+func (sm *ServerManager) handleStreamingResponse(w http.ResponseWriter, r *http.Request, messages []llm.Message, model string, schema map[string]interface{}) {
 	w.Header().Set("Content-Type", "text/plain")
 	w.Header().Set("Cache-Control", "no-cache")
 	w.Header().Set("Connection", "keep-alive")
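For context, a request that exercises the json_schema branch above might look like the following. This is a sketch: the wire format assumes the OpenAI-style response_format convention, the port is hypothetical, and the exact inner shape of json_schema follows whatever this server's ResponseFormat type decodes into the map:

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// response_format.type == "json_schema" routes the decoded schema map
	// through handleChatCompletions into chatCompletionClient.
	body := []byte(`{
	  "model": "example-model",
	  "stream": false,
	  "messages": [{"role": "user", "content": "Name a color"}],
	  "response_format": {
	    "type": "json_schema",
	    "json_schema": {"type": "object",
	      "properties": {"color": {"type": "string"}}}
	  }
	}`)
	resp, err := http.Post("http://localhost:8080/v1/chat/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}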
@@ -521,7 +538,7 @@ func (sm *ServerManager) handleStreamingResponse(w http.ResponseWriter, r *http.
 	}

 	// For now, use non-streaming and simulate streaming
-	client, err := sm.getLLMClient()
+	client, err := sm.chatCompletionClient(schema)
 	if err != nil {
 		_, _ = fmt.Fprintf(w, "data: [ERROR] %v\n\n", err)
 		return
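Per the comment above, this handler fetches one full completion and simulates streaming, presumably by chunking the result into data: frames. The emission loop sits outside this hunk, so the following is only a generic sketch of that pattern under those assumptions, not the code in this file:

package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
)

// emitAsSSE writes an already-complete response as a sequence of
// "data:" frames, approximating token streaming for SSE clients.
func emitAsSSE(w http.ResponseWriter, full string, chunkSize int) {
	flusher, _ := w.(http.Flusher)
	for start := 0; start < len(full); start += chunkSize {
		end := start + chunkSize
		if end > len(full) {
			end = len(full)
		}
		fmt.Fprintf(w, "data: %s\n\n", full[start:end])
		if flusher != nil {
			flusher.Flush() // push each frame to the client immediately
		}
	}
}

func main() {
	rec := httptest.NewRecorder() // implements both ResponseWriter and Flusher
	emitAsSSE(rec, "hello world", 4)
	fmt.Print(rec.Body.String())
}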
@@ -591,9 +608,9 @@ func (sm *ServerManager) handleStreamingResponse(w http.ResponseWriter, r *http.
 }

 // handleNonStreamingResponse handles non-streaming chat completions
-func (sm *ServerManager) handleNonStreamingResponse(w http.ResponseWriter, r *http.Request, messages []llm.Message, model string) {
-	// Send message using REPL-configured client
-	client, err := sm.getLLMClient()
+func (sm *ServerManager) handleNonStreamingResponse(w http.ResponseWriter, r *http.Request, messages []llm.Message, model string, schema map[string]interface{}) {
+	// Send message using a per-request client so structured-output schemas reach the provider
+	client, err := sm.chatCompletionClient(schema)
 	if err != nil {
 		http.Error(w, fmt.Sprintf("LLM init error: %v", err), http.StatusInternalServerError)
 		return
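A cheap regression test for the no-leak property would slot into this package's existing _test file, which supplies the package clause and imports. This is a sketch: newTestServerManager is a hypothetical fixture that builds a ServerManager with a non-nil config, not something in this diff:

func TestSchemaDoesNotLeakIntoSharedConfig(t *testing.T) {
	sm := newTestServerManager(t) // hypothetical fixture, not in this diff
	schema := map[string]interface{}{"type": "object"}
	if _, err := sm.chatCompletionClient(schema); err != nil {
		t.Fatalf("chatCompletionClient: %v", err)
	}
	if sm.config.Schema != nil {
		t.Fatal("per-request schema leaked into the shared server config")
	}
}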