feat(api): add pre-emptive memory feasibility to model list

lucasliu · lucasliu · commit 075a3449ad6b · 2026-05-06T03:58:18.000-04:00
GET /admin/models now includes a `memoryFeasibility` field for
downloaded-but-not-loaded models, showing canLoad, modelSizeMB,
availableMB, gpuBudgetMB, and an actionable reason if loading
would fail (e.g. "exceeds GPU budget, try sysctl iogpu.wired_limit_mb=...").

Works in both direct and worker mode — uses Metal device info directly
instead of requiring engine access. Applies the same safety margin as the
pre-load gate (20% of model size; 30% above 20 GB; flat 5 GB above 30 GB).

Also: make FinishGuard + estimateModelWeightSize public for cross-module use.
diff --git a/Sources/NovaMLXAPI/APIServer.swift b/Sources/NovaMLXAPI/APIServer.swift
@@ -941,15 +941,27 @@ public final class NovaMLXAPIServer: @unchecked Sendable {
             NovaMLXErrorMiddleware()
             Get("/admin/models") { request, context in
                 let records = models.allRegisteredModels()
-                let statuses = records.map { record -> AdminModelStatus in
-                    AdminModelStatus(
+                var statuses: [AdminModelStatus] = []
+                statuses.reserveCapacity(records.count)
+                for record in records {
+                    let isDownloaded = models.isDownloaded(record.id)
+                    let isLoaded = inference.isModelLoaded(record.id) || embeddings.isLoaded(record.id)
+                    // Only check feasibility for downloaded, non-loaded, non-embedding models
+                    var feasibility: MemoryFeasibility? = nil
+                    if isDownloaded && !isLoaded && record.modelType != .embedding {
+                        feasibility = await inference.checkMemoryFeasibility(
+                            modelId: record.id, sizeBytes: record.sizeBytes, localURL: record.localURL
+                        )
+                    }
+                    statuses.append(AdminModelStatus(
                         id: record.id,
                         family: record.family.rawValue,
-                        downloaded: models.isDownloaded(record.id),
-                        loaded: inference.isModelLoaded(record.id) || embeddings.isLoaded(record.id),
+                        downloaded: isDownloaded,
+                        loaded: isLoaded,
                         sizeBytes: record.sizeBytes,
-                        downloadedAt: record.downloadedAt
-                    )
+                        downloadedAt: record.downloadedAt,
+                        memoryFeasibility: feasibility
+                    ))
                 }
                 return try Self.jsonResponse(statuses)
             }
diff --git a/Sources/NovaMLXAPI/OpenAITypes.swift b/Sources/NovaMLXAPI/OpenAITypes.swift
@@ -620,20 +620,23 @@ public struct AdminModelStatus: Codable, Sendable {
     public let loaded: Bool
     public let sizeBytes: UInt64
     public let downloadedAt: String?
+    public let memoryFeasibility: MemoryFeasibility?
 
     public init(
         id: String,
         family: String,
         downloaded: Bool,
         loaded: Bool,
         sizeBytes: UInt64,
-        downloadedAt: Date?
+        downloadedAt: Date?,
+        memoryFeasibility: MemoryFeasibility? = nil
     ) {
         self.id = id
         self.family = family
         self.downloaded = downloaded
         self.loaded = loaded
         self.sizeBytes = sizeBytes
+        self.memoryFeasibility = memoryFeasibility
         if let date = downloadedAt {
             let formatter = ISO8601DateFormatter()
             formatter.formatOptions = [.withInternetDateTime, .withFractionalSeconds]
diff --git a/Sources/NovaMLXCore/Types.swift b/Sources/NovaMLXCore/Types.swift
@@ -3,7 +3,7 @@ import Logging
 
 public enum NovaMLX {}
 
-public let version = "1.0.7"
+public let version = "1.0.8"
 
 public var buildTimestamp: String {
     guard let execURL = Bundle.main.executableURL,
@@ -351,6 +351,24 @@ public struct Token: Codable, Sendable {
     }
 }
 
+/// Pre-emptive memory feasibility check for model loading.
+/// Returned by `GET /admin/models` so the GUI can show badges before load attempt.
+public struct MemoryFeasibility: Codable, Sendable {
+    public let canLoad: Bool        // verdict: false when the load is expected to fail
+    public let modelSizeMB: UInt64  // estimated weight size; producers divide bytes by 1_048_576
+    public let availableMB: UInt64  // GPU budget minus currently active memory, floored at 0 by producers
+    public let gpuBudgetMB: UInt64  // recommended max GPU working set, same unit as modelSizeMB
+    public let reason: String?      // explanation / remediation hint; nil when there is nothing to report
+
+    public init(canLoad: Bool, modelSizeMB: UInt64, availableMB: UInt64, gpuBudgetMB: UInt64, reason: String? = nil) {
+        self.canLoad = canLoad
+        self.modelSizeMB = modelSizeMB
+        self.availableMB = availableMB
+        self.gpuBudgetMB = gpuBudgetMB
+        self.reason = reason
+    }
+}
+
 public protocol TokenizerProtocol: Sendable {
     func encode(text: String) -> [Int]
     func decode(tokens: [Int]) -> String
diff --git a/Sources/NovaMLXEngine/MLXEngine.swift b/Sources/NovaMLXEngine/MLXEngine.swift
@@ -1225,7 +1225,7 @@ public final class MLXEngine: InferenceEngineProtocol, @unchecked Sendable {
 
     /// Estimate model weight size from config.json params or directory size.
     /// Used by the pre-load memory gate to decide whether we need LRU eviction.
-    static func estimateModelWeightSize(at url: URL) -> UInt64? {
+    public static func estimateModelWeightSize(at url: URL) -> UInt64? {
         let configFile = url.appendingPathComponent("config.json")
         guard let data = try? Data(contentsOf: configFile),
               let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
@@ -1254,6 +1254,85 @@ public final class MLXEngine: InferenceEngineProtocol, @unchecked Sendable {
         return UInt64(MLX.Memory.activeMemory) + neededBytes <= softLimitBytes
     }
 
+    /// Pre-emptive memory feasibility check — can this model be loaded?
+    /// `sizeBytes` is only a fallback for when `estimateModelWeightSize(at: localURL)` returns nil.
+    /// Returns nil if the model is already loaded or if info is unavailable; otherwise a verdict
+    /// whose MB figures are byte counts divided by 1_048_576.
+    public func checkMemoryFeasibility(modelId: String, sizeBytes: UInt64, localURL: URL) async -> MemoryFeasibility? {
+        guard getContainer(for: modelId) == nil else { return nil }  // already loaded — nothing to check
+        let maxGPU = GPU.maxRecommendedWorkingSetBytes().map { UInt64($0) } ?? 0
+        guard maxGPU > 0 else { return nil }
+
+        // Estimate weight size: prefer config.json, fall back to directory size or provided sizeBytes
+        let estimatedBytes = Self.estimateModelWeightSize(at: localURL) ?? sizeBytes
+        let modelMB = estimatedBytes / 1_048_576
+
+        // Same safety margin as the pre-load gate: 20%, 30% above 20 GiB, flat 5 GiB above 30 GiB
+        let safetyMargin: UInt64
+        if estimatedBytes > 30 * 1_073_741_824 {
+            safetyMargin = 5 * 1_073_741_824
+        } else if estimatedBytes > 20 * 1_073_741_824 {
+            safetyMargin = UInt64(Double(estimatedBytes) * 0.3)
+        } else {
+            safetyMargin = UInt64(Double(estimatedBytes) * 0.2)
+        }
+        let neededBytes = estimatedBytes + safetyMargin
+
+        let currentBytes = UInt64(MLX.Memory.activeMemory)
+        let available = currentBytes < maxGPU ? maxGPU - currentBytes : 0
+        let gpuMB = maxGPU / 1_048_576
+        let availableMB = available / 1_048_576
+
+        if neededBytes > maxGPU {
+            let neededMB = neededBytes / 1_048_576
+            return MemoryFeasibility(
+                canLoad: false,
+                modelSizeMB: modelMB,
+                availableMB: availableMB,
+                gpuBudgetMB: gpuMB,
+                reason: "Model peak (\(neededMB)MB) exceeds GPU budget (\(gpuMB)MB). Try: sudo sysctl iogpu.wired_limit_mb=\(min(maxGPU / 1_048_576 + 30000, ProcessInfo.processInfo.physicalMemory / 1_048_576 - 2048))"
+            )
+        }
+
+        // Check against process soft limit if enforcer is active
+        if let enforcer = memoryEnforcer {
+            let status = await enforcer.status
+            if status.enabled && status.softLimitBytes > 0 {
+                let softLimitMB = status.softLimitBytes / 1_048_576
+                if currentBytes + neededBytes > status.softLimitBytes {
+                    // Does not fit beside what is resident now, so loading needs LRU eviction.
+                    // Assuming eviction can release everything resident, the post-eviction ceiling is
+                    // the whole soft limit — not the currently free remainder (limit - current).
+                    let neededMB = neededBytes / 1_048_576
+                    if neededBytes <= status.softLimitBytes {
+                        return MemoryFeasibility(
+                            canLoad: true,
+                            modelSizeMB: modelMB,
+                            availableMB: availableMB,
+                            gpuBudgetMB: gpuMB,
+                            reason: "Loadable after evicting other models (need \(neededMB)MB, \(softLimitMB)MB available after LRU eviction)"
+                        )
+                    } else {
+                        return MemoryFeasibility(
+                            canLoad: false,
+                            modelSizeMB: modelMB,
+                            availableMB: availableMB,
+                            gpuBudgetMB: gpuMB,
+                            reason: "Insufficient memory: need \(neededMB)MB, only \(softLimitMB)MB available after full LRU eviction"
+                        )
+                    }
+                }
+            }
+        }
+
+        return MemoryFeasibility(
+            canLoad: true,
+            modelSizeMB: modelMB,
+            availableMB: availableMB,
+            gpuBudgetMB: gpuMB
+        )
+    }
+
     public func preflightCheck(modelId: String, promptTokens: Int, maxTokens: Int) async throws {
         guard let maxGPU = GPU.maxRecommendedWorkingSetBytes(), maxGPU > 0 else { return }
 
diff --git a/Sources/NovaMLXInference/InferenceService.swift b/Sources/NovaMLXInference/InferenceService.swift
@@ -263,6 +263,55 @@ public final class InferenceService: @unchecked Sendable {
         return container.isLoaded
     }
 
+    /// Check if a model can be loaded given current memory constraints.
+    /// Works in both direct and worker mode — uses Metal device info directly.
+    /// `sizeBytes` is used only when `MLXEngine.estimateModelWeightSize(at: localURL)` returns nil.
+    /// Returns nil when the model is already loaded or the GPU working-set budget is unavailable.
+    public func checkMemoryFeasibility(modelId: String, sizeBytes: UInt64, localURL: URL) async -> MemoryFeasibility? {
+        if isModelLoaded(modelId) { return nil }  // already loaded — nothing to check
+        let maxGPU = MLX.GPU.maxRecommendedWorkingSetBytes().map { UInt64($0) } ?? 0
+        guard maxGPU > 0 else { return nil }
+
+        let estimatedBytes = MLXEngine.estimateModelWeightSize(at: localURL) ?? sizeBytes
+        let modelMB = estimatedBytes / 1_048_576
+
+        // Safety margin (same as pre-load gate)
+        let safetyMargin: UInt64
+        if estimatedBytes > 30 * 1_073_741_824 {
+            safetyMargin = 5 * 1_073_741_824
+        } else if estimatedBytes > 20 * 1_073_741_824 {
+            safetyMargin = UInt64(Double(estimatedBytes) * 0.3)
+        } else {
+            safetyMargin = UInt64(Double(estimatedBytes) * 0.2)
+        }
+        let neededBytes = estimatedBytes + safetyMargin
+
+        let currentBytes = UInt64(MLX.Memory.activeMemory)
+        let available = currentBytes < maxGPU ? maxGPU - currentBytes : 0
+        let gpuMB = maxGPU / 1_048_576
+        let availableMB = available / 1_048_576
+
+        if neededBytes > maxGPU {
+            let neededMB = neededBytes / 1_048_576
+            // Hint is capped with min (not max) so it never exceeds installed RAM; the ternary avoids UInt64 underflow.
+            let physMB = ProcessInfo.processInfo.physicalMemory / 1_048_576
+            return MemoryFeasibility(
+                canLoad: false,
+                modelSizeMB: modelMB,
+                availableMB: availableMB,
+                gpuBudgetMB: gpuMB,
+                reason: "Model peak (\(neededMB)MB) exceeds GPU budget (\(gpuMB)MB). Try: sudo sysctl iogpu.wired_limit_mb=\(min(gpuMB + 30000, physMB > 2048 ? physMB - 2048 : physMB))"
+            )
+        }
+
+        return MemoryFeasibility(
+            canLoad: true,
+            modelSizeMB: modelMB,
+            availableMB: availableMB,
+            gpuBudgetMB: gpuMB
+        )
+    }
+
     public func listLoadedModels() -> [String] {
         var models: [String]
         if workerMode {