@@ -504,8 +504,9 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
504504 : result. completionTokens
505505
506506 var content : [ AnthropicContentBlock ] = [ ]
507- // Parse thinking tags from raw output
508- let thinkingParser = ThinkingParser ( )
507+ // Parse thinking tags from raw output — match streaming's model-aware detection
508+ let isAnthropicImplicit = ModelContainer . isImplicitThinkingModel ( for: anthropicReq. model)
509+ let thinkingParser = ThinkingParser ( expectImplicitThinking: isAnthropicImplicit)
509510 _ = thinkingParser. feed ( result. text)
510511 let finalResult = thinkingParser. finalize ( )
511512 if !finalResult. thinking. isEmpty {
@@ -1682,8 +1683,9 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
16821683 let finishReason : String
16831684 let message : OpenAIChatMessage
16841685
1685- // Parse thinking tags from raw output
1686- let thinkingParser = ThinkingParser ( )
1686+ // Parse thinking tags from raw output — match streaming's model-aware detection
1687+ let isImplicitModel = ModelContainer . isImplicitThinkingModel ( for: openAIReq. model)
1688+ let thinkingParser = ThinkingParser ( expectImplicitThinking: isImplicitModel)
16871689 _ = thinkingParser. feed ( result. text)
16881690 let finalResult = thinkingParser. finalize ( )
16891691 let thinkingText = finalResult. thinking. isEmpty ? nil : finalResult. thinking
@@ -1794,9 +1796,8 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
17941796 try await writer. write ( ByteBuffer ( string: " data: \( String ( data: roleData, encoding: . utf8) ?? " " ) \n \n " ) )
17951797
17961798 var completionTokenCount = 0
1797- let modelId = openAIReq. model. lowercased ( )
1798- let isThinkingModel = ModelContainer . detectThinkingModel ( for: openAIReq. model)
1799- let thinkingParser = ThinkingParser ( expectImplicitThinking: isThinkingModel)
1799+ let isImplicitModel = ModelContainer . isImplicitThinkingModel ( for: openAIReq. model)
1800+ let thinkingParser = ThinkingParser ( expectImplicitThinking: isImplicitModel)
18001801 for try await event in keepAliveStream {
18011802 switch event {
18021803 case . token( let token) :
@@ -1816,6 +1817,15 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
18161817 let data = try JSONEncoder ( ) . encode ( chunk)
18171818 try await writer. write ( ByteBuffer ( string: " data: \( String ( data: data, encoding: . utf8) ?? " " ) \n \n " ) )
18181819 } else if let finish = token. finishReason {
1820+ // Flush ThinkingParser before emitting stop chunk
1821+ let finalParsed = thinkingParser. finalize ( )
1822+ if !finalParsed. response. isEmpty {
1823+ completionTokenCount += 1
1824+ let respDelta = OpenAIDelta ( content: finalParsed. response)
1825+ let respChunk = OpenAIStreamChunk ( id: chunkId, model: openAIReq. model, choices: [ OpenAIStreamChoice ( index: 0 , delta: respDelta) ] )
1826+ let respData = try JSONEncoder ( ) . encode ( respChunk)
1827+ try await writer. write ( ByteBuffer ( string: " data: \( String ( data: respData, encoding: . utf8) ?? " " ) \n \n " ) )
1828+ }
18191829 let finalChunk = OpenAIStreamChunk (
18201830 id: chunkId,
18211831 model: openAIReq. model,
@@ -1932,9 +1942,8 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
19321942
19331943 NovaMLXLog . info ( " [SSE: \( reqTag) ] Waiting for first token from inference stream... " )
19341944
1935- let anthropicModelId = ( anthropicReq. model) . lowercased ( )
1936- let isAnthropicThinkingModel = ModelContainer . detectThinkingModel ( for: anthropicReq. model)
1937- let thinkingParser = ThinkingParser ( expectImplicitThinking: isAnthropicThinkingModel)
1945+ let isAnthropicImplicitModel = ModelContainer . isImplicitThinkingModel ( for: anthropicReq. model)
1946+ let thinkingParser = ThinkingParser ( expectImplicitThinking: isAnthropicImplicitModel)
19381947 var currentBlockIndex = 0
19391948 var isInThinkingBlock = false
19401949 var hasStartedTextBlock = false
@@ -1964,6 +1973,20 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
19641973 switch event {
19651974 case . token( let token) :
19661975 if token. finishReason != nil {
1976+ // Flush ThinkingParser before closing blocks
1977+ let finalParsed = thinkingParser. finalize ( )
1978+ if !finalParsed. response. isEmpty {
1979+ if isInThinkingBlock {
1980+ try await endCurrentBlock ( )
1981+ }
1982+ if !hasStartedTextBlock {
1983+ try await startTextBlock ( )
1984+ }
1985+ tokenCount += 1
1986+ let deltaEvent = AnthropicStreamEvent . textDelta ( finalParsed. response)
1987+ let deltaData = try JSONEncoder ( ) . encode ( deltaEvent)
1988+ try await writer. write ( ByteBuffer ( string: " event: content_block_delta \n data: \( String ( data: deltaData, encoding: . utf8) ?? " {} " ) \n \n " ) )
1989+ }
19671990 // Close current block if open
19681991 if isInThinkingBlock || hasStartedTextBlock {
19691992 try await writer. write ( ByteBuffer ( string: " event: content_block_stop \n data: {} \n \n " ) )
0 commit comments