Skip to content

Commit c7ae895

Browse files
author
lucasliu
committed
feat: safe inference defaults, accumulated batch decode, UI overhaul
- Lower default maxTokens from 4096 to 2048 to prevent runaway generation
- Fix temperature=0 override bug (was forcing 0.6, blocking deterministic mode)
- Add default frequencyPenalty=0.5 to prevent repetition collapse in small models
- Wire frequency penalty into FusedBatchScheduler decode loop via scatter_add
- Fix whitespace stripping in FusedBatchScheduler with accumulated batch decode
- Fix <|end|> token leaking into output (scrubControlTokens in fused path)
- Remove AudioService (unused), restructure E2E tests, UI improvements
1 parent de14bcb commit c7ae895

33 files changed

Lines changed: 1150 additions & 1207 deletions

Sources/NovaMLXAPI/APIServer.swift

Lines changed: 3 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ extension NovaMLXError {
260260
case .downloadFailed: .badGateway
261261
case .unsupportedModel: .badRequest
262262
case .contextWindowExceeded: .badRequest
263+
case .insufficientMemory: .serviceUnavailable
263264
}
264265
}
265266

@@ -274,6 +275,7 @@ extension NovaMLXError {
274275
case .downloadFailed: "server_error"
275276
case .unsupportedModel: "invalid_request_error"
276277
case .contextWindowExceeded: "invalid_request_error"
278+
case .insufficientMemory: "server_error"
277279
}
278280
}
279281

@@ -288,6 +290,7 @@ extension NovaMLXError {
288290
case .downloadFailed: "download_failed"
289291
case .unsupportedModel: "unsupported_model"
290292
case .contextWindowExceeded: "context_window_exceeded"
293+
case .insufficientMemory: "insufficient_memory"
291294
}
292295
}
293296
}
@@ -302,7 +305,6 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
302305
private let perplexityService: PerplexityService
303306
private let updateChecker: UpdateChecker
304307
private let hfService: HuggingFaceService
305-
private let audioService: AudioService
306308
private let config: ServerConfig
307309

308310
public init(
@@ -322,7 +324,6 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
322324
self.perplexityService = PerplexityService(inferenceService: inferenceService)
323325
self.updateChecker = UpdateChecker()
324326
self.hfService = HuggingFaceService(modelDirectory: modelManager.modelsDirectory)
325-
self.audioService = AudioService()
326327
self.config = config
327328
// When HF download completes, re-run discovery so model appears in registry
328329
self.hfService.onModelDownloaded = { [weak self] repoId in
@@ -342,7 +343,6 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
342343
let updater = self.updateChecker
343344
let cfg = self.config
344345
let hf = self.hfService
345-
let audio = self.audioService
346346

347347
let rateLimiter = RateLimiter(config: RateLimitConfig())
348348
let securityHeaders = SecurityHeadersMiddleware()
@@ -634,64 +634,6 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
634634
store.delete(responseId)
635635
return Response(status: .ok, body: .init(byteBuffer: ByteBuffer(string: "{\"status\":\"deleted\"}")))
636636
}
637-
Post("/v1/audio/transcriptions") { request, _ in
638-
let body = try await request.body.collect(upTo: .max)
639-
let data = Data(buffer: body)
640-
let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any]
641-
let language = json?["language"] as? String
642-
let audioData: Data
643-
if let file = json?["file"] as? String, let fileData = Data(base64Encoded: file) {
644-
audioData = fileData
645-
} else if let file = json?["audio"] as? String, let fileData = Data(base64Encoded: file) {
646-
audioData = fileData
647-
} else {
648-
audioData = data
649-
}
650-
let result = try await audio.transcribe(audioData: audioData, language: language)
651-
return try Self.jsonResponse(result)
652-
}
653-
Post("/v1/audio/speech") { request, _ in
654-
let body = try await request.body.collect(upTo: .max)
655-
let json = try JSONSerialization.jsonObject(with: body) as? [String: Any] ?? [:]
656-
let text = json["input"] as? String ?? json["text"] as? String ?? ""
657-
let voice = json["voice"] as? String
658-
let speed = json["speed"] as? Double
659-
let language = json["language"] as? String
660-
let stream = json["stream"] as? Bool ?? false
661-
guard !text.isEmpty else {
662-
return Response(status: .badRequest, body: .init(byteBuffer: ByteBuffer(string: "{\"error\":\"input text required\"}")))
663-
}
664-
let synthesisRequest = AudioService.SynthesisRequest(text: text, voice: voice, speed: speed, language: language)
665-
666-
if stream {
667-
let audioStream = audio.synthesizeStream(request: synthesisRequest)
668-
let responseBody: ResponseBody = .init { writer in
669-
do {
670-
for try await chunk in audioStream {
671-
try await writer.write(ByteBuffer(data: chunk.data))
672-
}
673-
} catch {
674-
NovaMLXLog.error("Audio stream error: \(error)")
675-
}
676-
}
677-
var headers = HTTPFields()
678-
headers[.contentType] = "audio/wav"
679-
return Response(status: .ok, headers: headers, body: responseBody)
680-
} else {
681-
let wavData = try await audio.synthesize(request: synthesisRequest)
682-
var headers = HTTPFields()
683-
headers[.contentType] = "audio/wav"
684-
return Response(status: .ok, headers: headers, body: .init(byteBuffer: ByteBuffer(data: wavData)))
685-
}
686-
}
687-
Get("/v1/audio/voices") { _, _ in
688-
let voices = AudioService.supportedVoices()
689-
return try Self.jsonResponse(["voices": voices])
690-
}
691-
Get("/v1/audio/languages") { _, _ in
692-
let languages = AudioService.supportedLanguages()
693-
return try Self.jsonResponse(["languages": languages])
694-
}
695637
Get("/health") { _, _ in
696638
let stats = inference.stats
697639
let mcpStatuses = mcp.getServerStatuses()

Sources/NovaMLXApp/main.swift

Lines changed: 26 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,6 @@ final class AppDelegate: NSObject, NSApplicationDelegate {
184184
engine.cleanupOrphanedCacheDirs(downloadedModelIds: downloadedIds)
185185
Self.cleanupLegacyAppSupportDir()
186186

187-
// Restore previously loaded models and detect interrupted downloads
188187
if workerMode {
189188
do {
190189
try inferenceService.startWorker()
@@ -193,28 +192,6 @@ final class AppDelegate: NSObject, NSApplicationDelegate {
193192
NovaMLXLog.error("Failed to start worker: \(error)")
194193
}
195194
}
196-
await inferenceService.restoreModels(modelManager: modelManager)
197-
appState.detectIncompleteDownloads(modelsDirectory: modelManager.modelsDirectory)
198-
appState.resumeIncompleteDownloads()
199-
200-
appState.startStatsMonitoring(inferenceService: inferenceService)
201-
202-
// Discover cloud models from remote endpoint
203-
Task {
204-
let _ = await CloudBackend.shared.fetchModels()
205-
appState.cloudModels = await inferenceService.listCloudModels()
206-
NovaMLXLog.info("Cloud models discovered: \(appState.cloudModels.count)")
207-
}
208-
209-
let memHandler = MemoryPressureHandler(engine: engine, settingsManager: settingsManager)
210-
memHandler.start()
211-
memoryPressureHandler = memHandler
212-
213-
// Start ProcessMemoryEnforcer (1s polling, configurable limits)
214-
await engine.startMemoryEnforcer()
215-
await engine.configureEnforcerSettings { [settingsManager] modelId in
216-
settingsManager.getSettings(modelId)
217-
}
218195

219196
let serverConfig = await config.serverConfig
220197
apiServer = NovaMLXAPIServer(
@@ -235,6 +212,32 @@ final class AppDelegate: NSObject, NSApplicationDelegate {
235212
try? await apiServer.start()
236213
}
237214
}
215+
216+
// Restore models in background after API is live
217+
Task {
218+
await inferenceService.restoreModels(modelManager: modelManager)
219+
appState.detectIncompleteDownloads(modelsDirectory: modelManager.modelsDirectory)
220+
appState.resumeIncompleteDownloads()
221+
}
222+
223+
appState.startStatsMonitoring(inferenceService: inferenceService)
224+
225+
// Discover cloud models from remote endpoint
226+
Task {
227+
let _ = await CloudBackend.shared.fetchModels()
228+
appState.cloudModels = await inferenceService.listCloudModels()
229+
NovaMLXLog.info("Cloud models discovered: \(appState.cloudModels.count)")
230+
}
231+
232+
let memHandler = MemoryPressureHandler(engine: engine, settingsManager: settingsManager)
233+
memHandler.start()
234+
memoryPressureHandler = memHandler
235+
236+
// Start ProcessMemoryEnforcer (1s polling, configurable limits)
237+
await engine.startMemoryEnforcer()
238+
await engine.configureEnforcerSettings { [settingsManager] modelId in
239+
settingsManager.getSettings(modelId)
240+
}
238241
}
239242
}
240243

Sources/NovaMLXCore/LocalizationStrings.swift

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ enum LocalizationStrings {
3434
"status.serverOnline": "Server Online",
3535
"status.serverOffline": "Server Offline",
3636
"status.tokensPerSec": "tokens/sec",
37-
"status.inferenceSpeed": "Inference Speed",
37+
"status.peakTokensPerSec": "peak tok/s",
38+
"status.realtimeInferenceSpeed": "Real-time Inference Speed",
3839
"status.noActivity": "No inference activity yet",
3940
"status.time": "Time",
4041
"status.tokPerSec": "tok/s",
@@ -284,7 +285,8 @@ enum LocalizationStrings {
284285
"status.serverOnline": "服务器在线",
285286
"status.serverOffline": "服务器离线",
286287
"status.tokensPerSec": "token/秒",
287-
"status.inferenceSpeed": "推理速度",
288+
"status.peakTokensPerSec": "峰值 tok/s",
289+
"status.realtimeInferenceSpeed": "实时推理速度",
288290
"status.noActivity": "暂无推理活动",
289291
"status.time": "时间",
290292
"status.tokPerSec": "token/秒",
@@ -527,7 +529,8 @@ enum LocalizationStrings {
527529
"status.serverOnline": "伺服器在線",
528530
"status.serverOffline": "伺服器離線",
529531
"status.tokensPerSec": "token/秒",
530-
"status.inferenceSpeed": "推理速度",
532+
"status.peakTokensPerSec": "峰值 tok/s",
533+
"status.realtimeInferenceSpeed": "實時推理速度",
531534
"status.noActivity": "暫無推理活動",
532535
"status.time": "時間",
533536
"status.tokPerSec": "token/秒",
@@ -770,7 +773,8 @@ enum LocalizationStrings {
770773
"status.serverOnline": "伺服器上線",
771774
"status.serverOffline": "伺服器離線",
772775
"status.tokensPerSec": "token/秒",
773-
"status.inferenceSpeed": "推論速度",
776+
"status.peakTokensPerSec": "峰值 tok/s",
777+
"status.realtimeInferenceSpeed": "即時推論速度",
774778
"status.noActivity": "尚無推論活動",
775779
"status.time": "時間",
776780
"status.tokPerSec": "token/秒",
@@ -1013,7 +1017,8 @@ enum LocalizationStrings {
10131017
"status.serverOnline": "サーバーオンライン",
10141018
"status.serverOffline": "サーバーオフライン",
10151019
"status.tokensPerSec": "token/秒",
1016-
"status.inferenceSpeed": "推論速度",
1020+
"status.peakTokensPerSec": "ピーク tok/s",
1021+
"status.realtimeInferenceSpeed": "リアルタイム推論速度",
10171022
"status.noActivity": "推論アクティビティなし",
10181023
"status.time": "時間",
10191024
"status.tokPerSec": "token/秒",
@@ -1256,7 +1261,8 @@ enum LocalizationStrings {
12561261
"status.serverOnline": "서버 온라인",
12571262
"status.serverOffline": "서버 오프라인",
12581263
"status.tokensPerSec": "토큰/초",
1259-
"status.inferenceSpeed": "추론 속도",
1264+
"status.peakTokensPerSec": "최고 tok/s",
1265+
"status.realtimeInferenceSpeed": "실시간 추론 속도",
12601266
"status.noActivity": "추론 활동 없음",
12611267
"status.time": "시간",
12621268
"status.tokPerSec": "토큰/초",
@@ -1499,7 +1505,8 @@ enum LocalizationStrings {
14991505
"status.serverOnline": "Serveur en ligne",
15001506
"status.serverOffline": "Serveur hors ligne",
15011507
"status.tokensPerSec": "tokens/sec",
1502-
"status.inferenceSpeed": "Vitesse d'inférence",
1508+
"status.peakTokensPerSec": "peak tok/s",
1509+
"status.realtimeInferenceSpeed": "Vitesse d'inférence en temps réel",
15031510
"status.noActivity": "Aucune activité d'inférence",
15041511
"status.time": "Temps",
15051512
"status.tokPerSec": "tok/s",
@@ -1742,7 +1749,8 @@ enum LocalizationStrings {
17421749
"status.serverOnline": "Server online",
17431750
"status.serverOffline": "Server offline",
17441751
"status.tokensPerSec": "Tokens/Sek",
1745-
"status.inferenceSpeed": "Inferenzgeschwindigkeit",
1752+
"status.peakTokensPerSec": "Spitze tok/s",
1753+
"status.realtimeInferenceSpeed": "Echtzeit-Inferenzgeschwindigkeit",
17461754
"status.noActivity": "Noch keine Inferenzaktivität",
17471755
"status.time": "Zeit",
17481756
"status.tokPerSec": "tok/s",
@@ -1985,7 +1993,8 @@ enum LocalizationStrings {
19851993
"status.serverOnline": "Сервер онлайн",
19861994
"status.serverOffline": "Сервер офлайн",
19871995
"status.tokensPerSec": "токенов/сек",
1988-
"status.inferenceSpeed": "Скорость вывода",
1996+
"status.peakTokensPerSec": "пик tok/s",
1997+
"status.realtimeInferenceSpeed": "Скорость вывода в реальном времени",
19891998
"status.noActivity": "Нет активности вывода",
19901999
"status.time": "Время",
19912000
"status.tokPerSec": "ток/с",

Sources/NovaMLXCore/Types.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ public enum NovaMLXError: Error, LocalizedError {
2525
case downloadFailed(String, underlying: Error)
2626
case unsupportedModel(String)
2727
case contextWindowExceeded(promptTokens: Int, maxTokens: Int, contextLength: Int)
28+
case insufficientMemory(neededMB: UInt64, availableMB: UInt64, modelId: String)
2829

2930
public var errorDescription: String? {
3031
switch self {
@@ -38,6 +39,8 @@ public enum NovaMLXError: Error, LocalizedError {
3839
case .unsupportedModel(let name): "Unsupported model: \(name)"
3940
case .contextWindowExceeded(let promptTokens, let maxTokens, let contextLength):
4041
"Context window exceeded: prompt has \(promptTokens) tokens + max_tokens \(maxTokens) = \(promptTokens + maxTokens), but model context length is \(contextLength). Reduce your prompt or max_tokens."
42+
case .insufficientMemory(let neededMB, let availableMB, let modelId):
43+
"Insufficient memory to load '\(modelId)': need \(neededMB)MB but only \(availableMB)MB available under the current memory limit. Unload unused models, pin important ones, or increase maxProcessMemory."
4144
}
4245
}
4346
}
@@ -100,7 +103,7 @@ public struct ModelConfig: Codable, Sendable {
100103
modelType: ModelType = .llm,
101104
hasLinearAttention: Bool = false,
102105
contextLength: Int = 4096,
103-
maxTokens: Int = 4096,
106+
maxTokens: Int = 2048,
104107
temperature: Double = 0.7,
105108
topP: Double = 0.9,
106109
repeatPenalty: Float = 1.0,

0 commit comments

Comments
 (0)