Skip to content

Commit 128de6f

Browse files
author
lucasliu
committed
feat: audio transcription, image generation, modelfiles, keep_alive, harmony streaming
Major feature additions: - Audio transcription (/v1/audio/transcriptions): NovaMLXAudio module with Qwen3-ASR encoder-decoder, multipart upload parser, SSE streaming with raw BPE token deltas, admin load/unload routing. Base64 and multipart form-data both supported. - Image generation (/v1/images/generations): NovaMLXImage module vendored from mlx-swift-examples/StableDiffusion (MIT). SDPipeline facade wraps UNet+VAE+CLIP+Tokenizer. Intermediate eval() calls prevent GPU command buffer timeout on macOS. SDXL-Turbo verified at ~0.78s per 512x512 image. - Modelfile system: user-authored model recipes (system prompt, sampling params, tools) stored under ~/.nova/modelfiles/. CLI create/list/show commands. API resolves modelfile name → base model + overrides at request time. LRU keys on base model to avoid double-loading. - Per-request keep_alive: parse on all chat/completions/messages/embeddings endpoints, plumb to EnginePool to update eviction deadline atomically. - Harmony streaming: detect GPT-OSS <|channel|>TYPE<|message|> patterns, tag tokens with channel, emit nova.channels array in SSE chunks. - Model discovery: audio and image architecture detection, nova.capabilities on /v1/models includes type/vision/audio/imageGeneration fields. - /v1/models shows all loaded service types (engine, embeddings, audio, image).
1 parent d45369c commit 128de6f

47 files changed

Lines changed: 8459 additions & 55 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Package.swift

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,35 @@ let package = Package(
5858
],
5959
swiftSettings: concurrencySettings
6060
),
61+
.target(
62+
name: "NovaMLXAudio",
63+
dependencies: [
64+
"NovaMLXCore",
65+
.product(name: "MLX", package: "mlx-swift"),
66+
.product(name: "MLXNN", package: "mlx-swift"),
67+
.product(name: "MLXLMCommon", package: "mlx-swift-lm"),
68+
.product(name: "Tokenizers", package: "swift-transformers"),
69+
.product(name: "Hub", package: "swift-transformers"),
70+
],
71+
swiftSettings: concurrencySettings
72+
),
73+
.target(
74+
name: "NovaMLXImage",
75+
dependencies: [
76+
"NovaMLXCore",
77+
.product(name: "MLX", package: "mlx-swift"),
78+
.product(name: "MLXNN", package: "mlx-swift"),
79+
],
80+
swiftSettings: concurrencySettings
81+
),
6182
.target(
6283
name: "NovaMLXEngine",
6384
dependencies: [
6485
"NovaMLXCore",
6586
"NovaMLXUtils",
6687
"NovaMLXPrefixCache",
88+
"NovaMLXAudio",
89+
"NovaMLXImage",
6790
.product(name: "MLX", package: "mlx-swift"),
6891
.product(name: "MLXNN", package: "mlx-swift"),
6992
.product(name: "MLXRandom", package: "mlx-swift"),
@@ -219,6 +242,11 @@ let package = Package(
219242
],
220243
swiftSettings: concurrencySettings
221244
),
245+
.testTarget(
246+
name: "NovaMLXImageTests",
247+
dependencies: ["NovaMLXImage"],
248+
swiftSettings: concurrencySettings
249+
),
222250
.testTarget(
223251
name: "NovaMLXE2ETests",
224252
dependencies: [],

Sources/NovaMLXAPI/APIServer.swift

Lines changed: 455 additions & 21 deletions
Large diffs are not rendered by default.

Sources/NovaMLXAPI/AnthropicTypes.swift

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ public struct AnthropicRequest: Codable, Sendable {
3636
public let preserveThinking: Bool?
3737
public let chatTemplateKwargs: [String: AnyCodable]?
3838
public let reasoningEffort: String?
39+
public let keepAlive: KeepAliveValue?
3940

4041
private enum CodingKeys: String, CodingKey {
4142
case model, messages, maxTokens = "max_tokens", system, temperature
@@ -46,6 +47,7 @@ public struct AnthropicRequest: Codable, Sendable {
4647
case preserveThinking = "preserve_thinking"
4748
case chatTemplateKwargs = "chat_template_kwargs"
4849
case reasoningEffort = "reasoning_effort"
50+
case keepAlive = "keep_alive"
4951
}
5052

5153
public init(
@@ -64,7 +66,8 @@ public struct AnthropicRequest: Codable, Sendable {
6466
enableThinking: Bool? = nil,
6567
preserveThinking: Bool? = nil,
6668
chatTemplateKwargs: [String: AnyCodable]? = nil,
67-
reasoningEffort: String? = nil
69+
reasoningEffort: String? = nil,
70+
keepAlive: KeepAliveValue? = nil
6871
) {
6972
self.model = model
7073
self.messages = messages
@@ -82,6 +85,7 @@ public struct AnthropicRequest: Codable, Sendable {
8285
self.preserveThinking = preserveThinking
8386
self.chatTemplateKwargs = chatTemplateKwargs
8487
self.reasoningEffort = reasoningEffort
88+
self.keepAlive = keepAlive
8589
}
8690

8791
/// Resolve thinking toggle from multiple client formats, mirrors OpenAI pattern
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import Foundation
2+
import Hummingbird
3+
import NovaMLXCore
4+
5+
/// Builds Server-Sent Events (SSE) response bodies for streamed transcription text.
struct AudioSSEStream {
    /// Wraps a stream of text fragments in an SSE response body.
    ///
    /// Each fragment is written as a `transcript.delta` event whose data is the
    /// JSON object `{"text": "<fragment>"}`. When the stream finishes, a `done`
    /// event carrying `[DONE]` is written. If the stream throws, a single `error`
    /// event carrying `{"message": "<description>"}` is written instead
    /// (best effort: write failures on the error path are ignored).
    ///
    /// - Parameter stream: Text fragments to forward, in order.
    /// - Returns: A response body that writes the events as the stream yields.
    static func body(
        from stream: AsyncThrowingStream<String, Error>
    ) -> ResponseBody {
        ResponseBody { writer in
            do {
                for try await tokenText in stream {
                    let escaped = AudioSSEStream.jsonEscaped(tokenText)
                    try await writer.write(ByteBuffer(string: "event: transcript.delta\ndata: {\"text\": \"\(escaped)\"}\n\n"))
                }
                try await writer.write(ByteBuffer(string: "event: done\ndata: [DONE]\n\n"))
                try await writer.finish(nil)
            } catch {
                // The description may span several lines; it must be escaped just
                // like token text or the event framing breaks.
                let msg = AudioSSEStream.jsonEscaped(String(describing: error))
                try? await writer.write(ByteBuffer(string: "event: error\ndata: {\"message\": \"\(msg)\"}\n\n"))
                try? await writer.finish(nil)
            }
        }
    }

    /// Escapes `text` so it can be placed between double quotes in a JSON document.
    ///
    /// Backslash, double quote and every control character below U+0020 are
    /// escaped. This matters for SSE as well as for JSON validity: a raw CR or LF
    /// inside a `data:` line would terminate the line and split the event.
    /// Iterating unicode scalars (rather than characters) ensures a CR LF pair is
    /// escaped as two separate escapes.
    private static func jsonEscaped(_ text: String) -> String {
        var out = ""
        out.reserveCapacity(text.utf8.count)
        for scalar in text.unicodeScalars {
            switch scalar {
            case "\\": out += "\\\\"
            case "\"": out += "\\\""
            case "\n": out += "\\n"
            case "\r": out += "\\r"
            case "\t": out += "\\t"
            default:
                if scalar.value < 0x20 {
                    // Remaining control characters use the generic \u00XX form.
                    let hex = String(scalar.value, radix: 16)
                    out += "\\u" + String(repeating: "0", count: 4 - hex.count) + hex
                } else {
                    out.unicodeScalars.append(scalar)
                }
            }
        }
        return out
    }
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import Foundation
2+
3+
/// JSON request body for an audio transcription call.
///
/// Keys are snake_case on the wire (`response_format`); all other keys match
/// the property names.
struct TranscriptionRequest: Codable, Sendable {
    /// Identifier of the model to run.
    let model: String
    /// Base64-encoded audio data.
    let file: String
    /// Optional language value supplied by the client.
    let language: String?
    /// Optional output format requested by the client.
    let responseFormat: String?
    /// Optional sampling temperature.
    let temperature: Double?
    /// Optional streaming flag.
    let stream: Bool?

    enum CodingKeys: String, CodingKey {
        case model
        case file
        case language
        case responseFormat = "response_format"
        case temperature
        case stream
    }

    /// The requested response format, or `"json"` when the client sent none.
    var resolvedResponseFormat: String {
        if let requested = responseFormat {
            return requested
        }
        return "json"
    }
}
20+
21+
/// JSON response body for a transcription call.
struct TranscriptionResponse: Codable, Sendable {
    /// The transcribed text.
    let text: String

    /// Language value reported with the transcript, if any.
    let language: String?

    /// Duration value reported with the transcript, if any.
    let duration: Double?
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import Foundation
2+
3+
/// JSON request body for image generation.
///
/// `response_format` and `negative_prompt` are snake_case on the wire; all
/// other keys match the property names.
struct ImageGenerationRequest: Codable, Sendable {
    let prompt: String
    let model: String
    let n: Int?
    let size: String?
    let responseFormat: String?
    let quality: String?
    let style: String?
    let seed: Int?
    let negativePrompt: String?

    enum CodingKeys: String, CodingKey {
        case prompt
        case model
        case n
        case size
        case responseFormat = "response_format"
        case quality
        case style
        case seed
        case negativePrompt = "negative_prompt"
    }

    /// Requested image count: defaults to 1 and is clamped to the range 1...4.
    var resolvedN: Int {
        let requested = n ?? 1
        if requested < 1 { return 1 }
        if requested > 4 { return 4 }
        return requested
    }

    /// Pixel dimensions for the requested `size` string.
    ///
    /// Only `"256x256"`, `"512x512"` and `"1024x1024"` are recognised; a missing
    /// or unrecognised value yields 1024 x 1024.
    var resolvedSize: (width: Int, height: Int) {
        let edge: Int
        switch size {
        case "256x256"?: edge = 256
        case "512x512"?: edge = 512
        default: edge = 1024
        }
        return (edge, edge)
    }

    /// The requested response format, or `"b64_json"` when none was sent.
    var resolvedResponseFormat: String {
        if let requested = responseFormat {
            return requested
        }
        return "b64_json"
    }
}
31+
32+
/// JSON response body for image generation: a `created` integer, the
/// generated image entries, and the model name.
struct ImageGenerationResponse: Codable, Sendable {
    /// Creation value reported to the client.
    let created: Int

    /// One entry per generated image.
    let data: [ImageData]

    /// Name of the model reported in the response.
    let model: String
}
37+
38+
/// A single image entry in an image response.
///
/// Encoded with the snake_case keys `b64_json`, `url` and `revised_prompt`.
struct ImageData: Codable, Sendable {
    /// Image payload carried under the `b64_json` key, if present.
    let b64Json: String?
    /// Image location carried under the `url` key, if present.
    let url: String?
    /// Prompt text carried under the `revised_prompt` key, if present.
    let revisedPrompt: String?

    enum CodingKeys: String, CodingKey {
        case b64Json = "b64_json"
        case url = "url"
        case revisedPrompt = "revised_prompt"
    }
}
49+
50+
// MARK: - Image Edit
51+
52+
/// OpenAI-compatible request type for POST /v1/images/edits.
/// Parsed from multipart/form-data fields.
struct ImageEditRequest {
    let image: Data
    let mask: Data?
    let prompt: String
    let model: String
    let n: Int?
    let size: String?
    let responseFormat: String?

    /// Requested image count: defaults to 1 and is clamped to the range 1...4.
    var resolvedN: Int {
        let lowerBounded = max(n ?? 1, 1)
        return min(lowerBounded, 4)
    }

    /// Pixel dimensions for the requested `size` string; a missing or
    /// unrecognised value yields 1024 x 1024.
    var resolvedSize: (width: Int, height: Int) {
        let supported: [String: Int] = ["256x256": 256, "512x512": 512, "1024x1024": 1024]
        let edge = supported[size ?? "1024x1024"] ?? 1024
        return (edge, edge)
    }

    /// The requested response format, or `"b64_json"` when none was sent.
    var resolvedResponseFormat: String {
        guard let requested = responseFormat else { return "b64_json" }
        return requested
    }
}
74+
75+
// MARK: - Image Variation
76+
77+
/// OpenAI-compatible request type for POST /v1/images/variations.
/// Parsed from multipart/form-data fields.
struct ImageVariationRequest {
    let image: Data
    let model: String
    let n: Int?
    let size: String?
    let responseFormat: String?

    /// Requested image count: defaults to 1 and is clamped to the range 1...4.
    var resolvedN: Int {
        guard let requested = n else { return 1 }
        return requested < 1 ? 1 : (requested > 4 ? 4 : requested)
    }

    /// Pixel dimensions for the requested `size` string; a missing or
    /// unrecognised value yields 1024 x 1024.
    var resolvedSize: (width: Int, height: Int) {
        if size == "256x256" {
            return (256, 256)
        }
        if size == "512x512" {
            return (512, 512)
        }
        // "1024x1024", nil and anything unrecognised all resolve to the default.
        return (1024, 1024)
    }

    /// The requested response format, or `"b64_json"` when none was sent.
    var resolvedResponseFormat: String {
        responseFormat == nil ? "b64_json" : responseFormat ?? "b64_json"
    }
}

Sources/NovaMLXAPI/ModelCapabilitiesDetector.swift

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,16 @@ final class ModelCapabilitiesDetector: @unchecked Sendable {
3939
let tools = Self.detectTools(template: template)
4040
let thinking = Self.detectImplicitThinking(template: template)
4141
let reasoning = Self.detectReasoning(template: template, thinking: thinking)
42+
let audio = modelType == .audio
43+
let imageGeneration = modelType == .image
4244

4345
return ModelCapabilities(
4446
reasoning: reasoning,
4547
thinking: thinking,
4648
tools: tools,
47-
vision: vision
49+
vision: vision,
50+
audio: audio,
51+
imageGeneration: imageGeneration
4852
)
4953
}
5054

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import Foundation
2+
import Hummingbird
3+
import NovaMLXCore
4+
5+
/// One decoded part of a multipart body.
struct MultipartPart {
    /// Value of the part's `name` parameter.
    let name: String

    /// Value of the part's `filename` parameter, when one was supplied.
    let filename: String?

    /// Value of the part's own Content-Type header, when one was supplied.
    let contentType: String?

    /// Raw bytes of the part's payload.
    let body: Data
}
11+
12+
/// Minimal `multipart/form-data` parser operating on a fully buffered body.
struct MultipartParser {
    /// Splits a multipart body into its parts, keyed by each part's `name` parameter.
    ///
    /// Parts without a `name` are dropped. If a name occurs more than once, the
    /// last occurrence wins.
    ///
    /// - Parameters:
    ///   - body: The complete request body. May be a `Data` slice.
    ///   - contentType: The request's Content-Type header value, which must carry
    ///     the `boundary` parameter.
    /// - Returns: The parsed parts keyed by field name.
    /// - Throws: `NovaMLXError.apiError` when the boundary parameter is missing or
    ///   the body contains no boundary delimiter.
    static func parse(body: Data, contentType: String) throws -> [String: MultipartPart] {
        guard let boundary = extractBoundary(from: contentType) else {
            throw NovaMLXError.apiError("Missing or invalid boundary in Content-Type header")
        }

        let dashBoundary = Data("--\(boundary)".utf8)
        let partTerminator = Data("\r\n--\(boundary)".utf8)
        let headerTerminator = Data("\r\n\r\n".utf8)
        let crlf = Data("\r\n".utf8)
        let closeMarker = Data("--".utf8)
        let looseClose = Data("--\r\n".utf8)

        // A `Data` slice keeps the indices of its parent buffer, so all positions
        // are taken relative to startIndex/endIndex, never 0/count.
        let end = body.endIndex
        var parts: [String: MultipartPart] = [:]

        // Skip preamble until the first boundary.
        guard let firstBoundary = body.range(of: dashBoundary, in: body.startIndex..<end) else {
            throw NovaMLXError.apiError("No boundary found in multipart body")
        }
        var pos = firstBoundary.upperBound

        while pos < end {
            // "--" directly after a delimiter is the closing delimiter; anything
            // that follows is epilogue and must not be parsed as a part.
            if body[pos...].starts(with: closeMarker) { break }

            // Skip the CRLF that ends the delimiter line.
            if body[pos...].starts(with: crlf) {
                pos += crlf.count
            }

            // The header section ends at the first blank line.
            guard let headerEnd = body.range(of: headerTerminator, in: pos..<end) else { break }
            let headers = parseHeaders(body[pos..<headerEnd.lowerBound])
            pos = headerEnd.upperBound

            guard let terminator = body.range(of: partTerminator, in: pos..<end) else {
                // Lenient handling of a body with no further delimiter: accept the
                // payload up to a bare "--\r\n" if one exists, then stop.
                if let close = body.range(of: looseClose, in: pos..<end) {
                    store(body[pos..<close.lowerBound], headers: headers, into: &parts)
                }
                break
            }

            store(body[pos..<terminator.lowerBound], headers: headers, into: &parts)
            pos = terminator.upperBound
        }

        return parts
    }

    /// Extracts the `boundary` parameter from a `multipart/form-data` Content-Type value.
    ///
    /// The media type and the parameter name are matched case-insensitively;
    /// surrounding double quotes are removed from the value.
    ///
    /// - Parameter contentType: For example
    ///   `multipart/form-data; boundary=----WebKitFormBoundaryXYZ`.
    /// - Returns: The boundary string, or `nil` when the media type is not
    ///   `multipart/form-data` or the boundary is missing or empty.
    static func extractBoundary(from contentType: String) -> String? {
        guard contentType.lowercased().contains("multipart/form-data") else { return nil }

        let key = "boundary="
        for component in contentType.split(separator: ";") {
            let trimmed = component.trimmingCharacters(in: .whitespaces)
            guard trimmed.prefix(key.count).lowercased() == key else { continue }
            let boundary = String(trimmed.dropFirst(key.count))
                .trimmingCharacters(in: CharacterSet(charactersIn: "\""))
            return boundary.isEmpty ? nil : boundary
        }
        return nil
    }

    /// Records `payload` under the part's field name; parts without a name are ignored.
    private static func store(
        _ payload: Data,
        headers: [String: String],
        into parts: inout [String: MultipartPart]
    ) {
        guard let name = headers["name"] else { return }
        parts[name] = MultipartPart(
            name: name, filename: headers["filename"],
            // Copy so the stored body does not share indices with the request buffer.
            contentType: headers["content-type"], body: Data(payload)
        )
    }

    /// Parses a part's header section into the keys `name`, `filename` and `content-type`.
    ///
    /// Header field names and parameter names are matched case-insensitively.
    /// Returns an empty dictionary when the section is not valid UTF-8.
    private static func parseHeaders(_ data: Data) -> [String: String] {
        guard let headerText = String(data: data, encoding: .utf8) else { return [:] }
        var result: [String: String] = [:]

        for line in headerText.components(separatedBy: "\r\n") {
            guard let colon = line.firstIndex(of: ":") else { continue }
            let field = line[..<colon].trimmingCharacters(in: .whitespaces).lowercased()
            let value = String(line[line.index(after: colon)...])

            switch field {
            case "content-disposition":
                for parameter in splitParameters(value) {
                    let trimmed = parameter.trimmingCharacters(in: .whitespaces)
                    guard let eq = trimmed.firstIndex(of: "=") else { continue }
                    let key = trimmed[..<eq].trimmingCharacters(in: .whitespaces).lowercased()
                    guard key == "name" || key == "filename" else { continue }
                    result[key] = String(trimmed[trimmed.index(after: eq)...])
                        .trimmingCharacters(in: CharacterSet(charactersIn: " \""))
                }
            case "content-type":
                result["content-type"] = value.trimmingCharacters(in: .whitespaces)
            default:
                break
            }
        }
        return result
    }

    /// Splits a header value on `;`, ignoring semicolons inside double quotes so
    /// that a quoted filename such as `"a;b.wav"` stays intact.
    private static func splitParameters(_ value: String) -> [String] {
        var pieces: [String] = []
        var current = ""
        var insideQuotes = false

        for character in value {
            if character == "\"" {
                insideQuotes.toggle()
            }
            if character == ";" && !insideQuotes {
                pieces.append(current)
                current = ""
            } else {
                current.append(character)
            }
        }
        pieces.append(current)
        return pieces
    }
}

0 commit comments

Comments
 (0)