Skip to content

Commit 075a344

Browse files
author
lucasliu
committed
feat(api): add pre-emptive memory feasibility to model list
GET /admin/models now includes a `memoryFeasibility` field for downloaded-but-not-loaded models, showing canLoad, modelSizeMB, availableMB, gpuBudgetMB, and an actionable reason if loading would fail (e.g. "exceeds GPU budget, try sysctl iogpu.wired_limit_mb=..."). Works in both direct and worker mode — uses Metal device info directly instead of requiring engine access. Reuses the same safety margin logic as the pre-load gate (20-30% or 5GB for large models). Also: make FinishGuard + estimateModelWeightSize public for cross-module use.
1 parent 7da3a46 commit 075a344

5 files changed

Lines changed: 170 additions & 9 deletions

File tree

Sources/NovaMLXAPI/APIServer.swift

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -941,15 +941,27 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
941941
NovaMLXErrorMiddleware()
942942
Get("/admin/models") { request, context in
943943
let records = models.allRegisteredModels()
944-
let statuses = records.map { record -> AdminModelStatus in
945-
AdminModelStatus(
944+
var statuses: [AdminModelStatus] = []
945+
statuses.reserveCapacity(records.count)
946+
for record in records {
947+
let isDownloaded = models.isDownloaded(record.id)
948+
let isLoaded = inference.isModelLoaded(record.id) || embeddings.isLoaded(record.id)
949+
// Only check feasibility for downloaded, non-loaded, non-embedding models
950+
var feasibility: MemoryFeasibility? = nil
951+
if isDownloaded && !isLoaded && record.modelType != .embedding {
952+
feasibility = await inference.checkMemoryFeasibility(
953+
modelId: record.id, sizeBytes: record.sizeBytes, localURL: record.localURL
954+
)
955+
}
956+
statuses.append(AdminModelStatus(
946957
id: record.id,
947958
family: record.family.rawValue,
948-
downloaded: models.isDownloaded(record.id),
949-
loaded: inference.isModelLoaded(record.id) || embeddings.isLoaded(record.id),
959+
downloaded: isDownloaded,
960+
loaded: isLoaded,
950961
sizeBytes: record.sizeBytes,
951-
downloadedAt: record.downloadedAt
952-
)
962+
downloadedAt: record.downloadedAt,
963+
memoryFeasibility: feasibility
964+
))
953965
}
954966
return try Self.jsonResponse(statuses)
955967
}

Sources/NovaMLXAPI/OpenAITypes.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -620,20 +620,23 @@ public struct AdminModelStatus: Codable, Sendable {
620620
public let loaded: Bool
621621
public let sizeBytes: UInt64
622622
public let downloadedAt: String?
623+
public let memoryFeasibility: MemoryFeasibility?
623624

624625
public init(
625626
id: String,
626627
family: String,
627628
downloaded: Bool,
628629
loaded: Bool,
629630
sizeBytes: UInt64,
630-
downloadedAt: Date?
631+
downloadedAt: Date?,
632+
memoryFeasibility: MemoryFeasibility? = nil
631633
) {
632634
self.id = id
633635
self.family = family
634636
self.downloaded = downloaded
635637
self.loaded = loaded
636638
self.sizeBytes = sizeBytes
639+
self.memoryFeasibility = memoryFeasibility
637640
if let date = downloadedAt {
638641
let formatter = ISO8601DateFormatter()
639642
formatter.formatOptions = [.withInternetDateTime, .withFractionalSeconds]

Sources/NovaMLXCore/Types.swift

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import Logging
33

44
public enum NovaMLX {}
55

6-
public let version = "1.0.7"
6+
public let version = "1.0.8"
77

88
public var buildTimestamp: String {
99
guard let execURL = Bundle.main.executableURL,
@@ -351,6 +351,24 @@ public struct Token: Codable, Sendable {
351351
}
352352
}
353353

354+
/// Result of a pre-emptive check on whether a model is expected to fit in memory.
///
/// Included in the `GET /admin/models` response so the GUI can show badges
/// before a load is attempted.
public struct MemoryFeasibility: Codable, Sendable {
    /// Whether the check expects the load to succeed.
    public let canLoad: Bool

    /// Estimated size of the model, in MB.
    public let modelSizeMB: UInt64

    /// Memory reported as available at the time of the check, in MB.
    public let availableMB: UInt64

    /// GPU memory budget used for the check, in MB.
    public let gpuBudgetMB: UInt64

    /// Optional human-readable explanation or hint; `nil` when there is nothing to add.
    public let reason: String?

    /// Creates a feasibility result.
    ///
    /// - Parameters:
    ///   - canLoad: Whether the model is expected to load.
    ///   - modelSizeMB: Estimated model size in MB.
    ///   - availableMB: Available memory in MB.
    ///   - gpuBudgetMB: GPU budget in MB.
    ///   - reason: Optional explanation; defaults to `nil`.
    public init(
        canLoad: Bool,
        modelSizeMB: UInt64,
        availableMB: UInt64,
        gpuBudgetMB: UInt64,
        reason: String? = nil
    ) {
        self.reason = reason
        self.gpuBudgetMB = gpuBudgetMB
        self.availableMB = availableMB
        self.modelSizeMB = modelSizeMB
        self.canLoad = canLoad
    }
}
371+
354372
public protocol TokenizerProtocol: Sendable {
355373
func encode(text: String) -> [Int]
356374
func decode(tokens: [Int]) -> String

Sources/NovaMLXEngine/MLXEngine.swift

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1225,7 +1225,7 @@ public final class MLXEngine: InferenceEngineProtocol, @unchecked Sendable {
12251225

12261226
/// Estimate model weight size from config.json params or directory size.
12271227
/// Used by the pre-load memory gate to decide whether we need LRU eviction.
1228-
static func estimateModelWeightSize(at url: URL) -> UInt64? {
1228+
public static func estimateModelWeightSize(at url: URL) -> UInt64? {
12291229
let configFile = url.appendingPathComponent("config.json")
12301230
guard let data = try? Data(contentsOf: configFile),
12311231
let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
@@ -1254,6 +1254,85 @@ public final class MLXEngine: InferenceEngineProtocol, @unchecked Sendable {
12541254
return UInt64(MLX.Memory.activeMemory) + neededBytes <= softLimitBytes
12551255
}
12561256

1257+
/// Pre-emptive memory feasibility check — can this model be loaded?
///
/// The estimate is compared first against the GPU working-set budget and then,
/// when a memory enforcer with a non-zero soft limit is active, against that
/// soft limit.
///
/// - Parameters:
///   - modelId: Identifier of the model to evaluate.
///   - sizeBytes: Fallback size used when `estimateModelWeightSize(at:)` yields nothing.
///   - localURL: Local model directory handed to `estimateModelWeightSize(at:)`.
/// - Returns: `nil` if a container already exists for `modelId` or no GPU budget
///   is reported; otherwise a `MemoryFeasibility` describing the outcome.
public func checkMemoryFeasibility(modelId: String, sizeBytes: UInt64, localURL: URL) async -> MemoryFeasibility? {
    // Already loaded — no need to check
    guard getContainer(for: modelId) == nil else { return nil }

    let maxGPU = GPU.maxRecommendedWorkingSetBytes().map { UInt64($0) } ?? 0
    guard maxGPU > 0 else { return nil }

    let bytesPerMB: UInt64 = 1_048_576
    let bytesPerGB: UInt64 = 1_073_741_824

    // Estimate weight size: prefer the engine's estimate, fall back to the provided sizeBytes
    let estimatedBytes = Self.estimateModelWeightSize(at: localURL) ?? sizeBytes
    let modelMB = estimatedBytes / bytesPerMB

    // Same safety margin logic as the pre-load gate:
    // flat 5GB above 30GB, 30% above 20GB, otherwise 20%.
    let safetyMargin: UInt64
    if estimatedBytes > 30 * bytesPerGB {
        safetyMargin = 5 * bytesPerGB
    } else if estimatedBytes > 20 * bytesPerGB {
        safetyMargin = UInt64(Double(estimatedBytes) * 0.3)
    } else {
        safetyMargin = UInt64(Double(estimatedBytes) * 0.2)
    }
    let neededBytes = estimatedBytes + safetyMargin
    let neededMB = neededBytes / bytesPerMB

    let currentBytes = UInt64(MLX.Memory.activeMemory)
    let available = currentBytes < maxGPU ? maxGPU - currentBytes : 0
    let gpuMB = maxGPU / bytesPerMB
    let availableMB = available / bytesPerMB

    if neededBytes > maxGPU {
        // Suggest a higher wired limit, but never more than physical RAM minus
        // 2GB of headroom. The subtraction is guarded because UInt64 underflow traps.
        let physMB = ProcessInfo.processInfo.physicalMemory / bytesPerMB
        let wiredCapMB = physMB > 2048 ? physMB - 2048 : physMB
        let suggestedLimitMB = min(gpuMB + 30000, wiredCapMB)
        return MemoryFeasibility(
            canLoad: false,
            modelSizeMB: modelMB,
            availableMB: availableMB,
            gpuBudgetMB: gpuMB,
            reason: "Model peak (\(neededMB)MB) exceeds GPU budget (\(gpuMB)MB). Try: sudo sysctl iogpu.wired_limit_mb=\(suggestedLimitMB)"
        )
    }

    // Check against process soft limit if enforcer is active
    if let enforcer = memoryEnforcer {
        let status = await enforcer.status
        if status.enabled && status.softLimitBytes > 0
            && currentBytes + neededBytes > status.softLimitBytes {
            // Does not fit next to what is currently resident, so eviction is required.
            // NOTE(review): assumes a full LRU eviction can free up to the whole soft
            // limit; pinned or non-evictable allocations would make this optimistic —
            // confirm against the eviction policy.
            let softLimitMB = status.softLimitBytes / bytesPerMB
            if neededBytes <= status.softLimitBytes {
                return MemoryFeasibility(
                    canLoad: true,
                    modelSizeMB: modelMB,
                    availableMB: availableMB,
                    gpuBudgetMB: gpuMB,
                    reason: "Loadable after evicting other models (need \(neededMB)MB, up to \(softLimitMB)MB available after LRU eviction)"
                )
            }
            return MemoryFeasibility(
                canLoad: false,
                modelSizeMB: modelMB,
                availableMB: availableMB,
                gpuBudgetMB: gpuMB,
                reason: "Insufficient memory: need \(neededMB)MB, only \(softLimitMB)MB available after full LRU eviction"
            )
        }
    }

    return MemoryFeasibility(
        canLoad: true,
        modelSizeMB: modelMB,
        availableMB: availableMB,
        gpuBudgetMB: gpuMB
    )
}
1335+
12571336
public func preflightCheck(modelId: String, promptTokens: Int, maxTokens: Int) async throws {
12581337
guard let maxGPU = GPU.maxRecommendedWorkingSetBytes(), maxGPU > 0 else { return }
12591338

Sources/NovaMLXInference/InferenceService.swift

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,55 @@ public final class InferenceService: @unchecked Sendable {
263263
return container.isLoaded
264264
}
265265

266+
/// Check if a model can be loaded given current memory constraints.
///
/// Works in both direct and worker mode — uses Metal device info directly
/// instead of going through the engine.
///
/// - Parameters:
///   - modelId: Identifier of the model to evaluate.
///   - sizeBytes: Fallback size used when `MLXEngine.estimateModelWeightSize(at:)` yields nothing.
///   - localURL: Local model directory handed to `MLXEngine.estimateModelWeightSize(at:)`.
/// - Returns: `nil` if the model is already loaded or no GPU budget is reported;
///   otherwise a `MemoryFeasibility` with `canLoad == false` and an actionable
///   `reason` when the estimated peak exceeds the GPU budget.
public func checkMemoryFeasibility(modelId: String, sizeBytes: UInt64, localURL: URL) async -> MemoryFeasibility? {
    // Already loaded?
    if isModelLoaded(modelId) { return nil }

    let maxGPU = MLX.GPU.maxRecommendedWorkingSetBytes().map { UInt64($0) } ?? 0
    guard maxGPU > 0 else { return nil }

    let bytesPerMB: UInt64 = 1_048_576
    let bytesPerGB: UInt64 = 1_073_741_824

    // Estimate weight size
    let estimatedBytes = MLXEngine.estimateModelWeightSize(at: localURL) ?? sizeBytes
    let modelMB = estimatedBytes / bytesPerMB

    // Safety margin (same as pre-load gate):
    // flat 5GB above 30GB, 30% above 20GB, otherwise 20%.
    let safetyMargin: UInt64
    if estimatedBytes > 30 * bytesPerGB {
        safetyMargin = 5 * bytesPerGB
    } else if estimatedBytes > 20 * bytesPerGB {
        safetyMargin = UInt64(Double(estimatedBytes) * 0.3)
    } else {
        safetyMargin = UInt64(Double(estimatedBytes) * 0.2)
    }
    let neededBytes = estimatedBytes + safetyMargin

    let currentBytes = UInt64(MLX.Memory.activeMemory)
    let available = currentBytes < maxGPU ? maxGPU - currentBytes : 0
    let gpuMB = maxGPU / bytesPerMB
    let availableMB = available / bytesPerMB

    if neededBytes > maxGPU {
        let neededMB = neededBytes / bytesPerMB
        // Suggest a higher wired limit, capped at physical RAM minus 2GB of headroom.
        // Using `min` keeps the suggestion below installed memory; the subtraction is
        // guarded because UInt64 underflow traps.
        let physMB = ProcessInfo.processInfo.physicalMemory / bytesPerMB
        let wiredCapMB = physMB > 2048 ? physMB - 2048 : physMB
        let suggestedLimitMB = min(gpuMB + 30000, wiredCapMB)
        return MemoryFeasibility(
            canLoad: false,
            modelSizeMB: modelMB,
            availableMB: availableMB,
            gpuBudgetMB: gpuMB,
            reason: "Model peak (\(neededMB)MB) exceeds GPU budget (\(gpuMB)MB). Try: sudo sysctl iogpu.wired_limit_mb=\(suggestedLimitMB)"
        )
    }

    return MemoryFeasibility(
        canLoad: true,
        modelSizeMB: modelMB,
        availableMB: availableMB,
        gpuBudgetMB: gpuMB
    )
}
314+
266315
public func listLoadedModels() -> [String] {
267316
var models: [String]
268317
if workerMode {

0 commit comments

Comments
 (0)