Skip to content

Commit 0dba4b9

Browse files
authored
Merge pull request #138 from smol-machines/binbin-fix-db-lock-retry
fix: db retries since all req during hold fails currently.
2 parents 7b75d43 + 832e637 commit 0dba4b9

File tree

2 files changed

+99
-1
lines changed

2 files changed

+99
-1
lines changed

src/db.rs

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,32 @@ use redb::{Database, ReadableTable, TableDefinition, TableError};
1313
use std::collections::HashMap;
1414
use std::path::{Path, PathBuf};
1515
use std::sync::Arc;
16+
use std::time::Duration;
17+
18+
/// Maximum number of attempts to open the database when another process holds
19+
/// the lock. Each concurrent `smolvm` CLI invocation (e.g., parallel
20+
/// `machine start` calls) opens the database exclusively via redb's file lock.
21+
/// Without retry, the second process fails immediately with
22+
/// "Database already open. Cannot acquire lock."
23+
///
24+
/// With 10 retries and exponential backoff (50ms initial, 1s cap), the total
25+
/// wait before giving up is ~5 seconds — enough for any normal CLI operation
26+
/// to release the lock.
27+
const DB_OPEN_MAX_RETRIES: u32 = 10;
28+
29+
/// Initial backoff delay between database open retries.
30+
/// Starts short (10ms) since typical DB operations complete in ~1-2ms.
31+
/// Doubles on each attempt: 10ms → 20ms → 40ms → 80ms → 160ms → 320ms → 640ms → 1000ms (capped).
32+
const DB_OPEN_INITIAL_BACKOFF: Duration = Duration::from_millis(10);
33+
34+
/// Maximum backoff delay between retries. Prevents excessive wait on any
35+
/// single retry when the backoff would otherwise grow beyond this.
36+
const DB_OPEN_MAX_BACKOFF: Duration = Duration::from_secs(1);
37+
38+
/// Check if a redb error indicates another process holds the database lock.
39+
fn is_lock_contention(e: &redb::DatabaseError) -> bool {
40+
matches!(e, redb::DatabaseError::DatabaseAlreadyOpen)
41+
}
1642

1743
/// Table for storing VM records (name -> JSON-serialized VmRecord).
1844
const VMS_TABLE: TableDefinition<&str, &[u8]> = TableDefinition::new("vms");
@@ -56,17 +82,53 @@ impl std::fmt::Debug for SmolvmDb {
5682

5783
impl SmolvmDb {
5884
/// Run a closure with the cached database handle, opening it on first use.
85+
///
86+
/// If another process holds the database lock, retries with exponential
87+
/// backoff rather than failing immediately. This allows concurrent CLI
88+
/// commands (e.g., parallel `machine start` calls) to succeed.
5989
fn with_db<T, F>(&self, f: F) -> Result<T>
6090
where
6191
F: FnOnce(&Database) -> Result<T>,
6292
{
6393
let mut guard = self.handle.lock();
6494
if guard.is_none() {
65-
*guard = Some(Database::create(&self.path).db_err("open database")?);
95+
*guard = Some(Self::open_with_retry(&self.path)?);
6696
}
6797
f(guard.as_ref().unwrap())
6898
}
6999

100+
/// Open the database file, retrying with exponential backoff on lock contention.
101+
///
102+
/// redb uses an exclusive file lock — only one process can have the database
103+
/// open at a time. When multiple CLI commands run concurrently (e.g., parallel
104+
/// `machine start` calls), the second process retries until the first releases
105+
/// the lock. The API server avoids this entirely by holding a single long-lived
106+
/// database connection.
107+
fn open_with_retry(path: &Path) -> Result<Database> {
108+
let mut backoff = DB_OPEN_INITIAL_BACKOFF;
109+
for attempt in 0..=DB_OPEN_MAX_RETRIES {
110+
match Database::create(path) {
111+
Ok(db) => return Ok(db),
112+
Err(e) if attempt < DB_OPEN_MAX_RETRIES && is_lock_contention(&e) => {
113+
tracing::debug!(
114+
attempt = attempt + 1,
115+
max = DB_OPEN_MAX_RETRIES,
116+
backoff_ms = backoff.as_millis(),
117+
"database locked by another process, retrying"
118+
);
119+
std::thread::sleep(backoff);
120+
backoff = std::cmp::min(backoff * 2, DB_OPEN_MAX_BACKOFF);
121+
}
122+
Err(e) => {
123+
return Err(Error::database_unavailable(format!("open database: {}", e)));
124+
}
125+
}
126+
}
127+
// All retries exhausted — the loop always returns on the last iteration
128+
// (attempt == DB_OPEN_MAX_RETRIES falls through to the Err arm).
129+
unreachable!()
130+
}
131+
70132
/// Open the database at the default location.
71133
///
72134
/// Default path: `~/Library/Application Support/smolvm/server/smolvm.redb` (macOS)

tests/test_machine.sh

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1086,6 +1086,42 @@ test_port_conflict_across_vms() {
10861086

10871087
run_test "Port: cross-VM conflict detected" test_port_conflict_across_vms || true
10881088

1089+
# Regression: concurrent machine starts used to fail with "Database already
1090+
# open. Cannot acquire lock." The fix retries with exponential backoff.
1091+
test_concurrent_machine_start() {
1092+
local vm_a="conc-start-a-$$"
1093+
local vm_b="conc-start-b-$$"
1094+
1095+
$SMOLVM machine create "$vm_a" --cpus 1 --mem 256 2>&1 >/dev/null || return 1
1096+
$SMOLVM machine create "$vm_b" --cpus 1 --mem 256 2>&1 >/dev/null || return 1
1097+
1098+
# Start both simultaneously — previously the second would fail with DB lock error
1099+
$SMOLVM machine start --name "$vm_a" 2>&1 >/dev/null &
1100+
local pid_a=$!
1101+
$SMOLVM machine start --name "$vm_b" 2>&1 >/dev/null &
1102+
local pid_b=$!
1103+
wait $pid_a; local exit_a=$?
1104+
wait $pid_b; local exit_b=$?
1105+
1106+
# Both should succeed
1107+
[[ $exit_a -eq 0 ]] || { echo "FAIL: start a failed (exit $exit_a)"; }
1108+
[[ $exit_b -eq 0 ]] || { echo "FAIL: start b failed (exit $exit_b)"; }
1109+
1110+
# Both should be running
1111+
local status_a status_b
1112+
status_a=$($SMOLVM machine status --name "$vm_a" 2>&1)
1113+
status_b=$($SMOLVM machine status --name "$vm_b" 2>&1)
1114+
1115+
$SMOLVM machine stop --name "$vm_a" 2>/dev/null || true
1116+
$SMOLVM machine stop --name "$vm_b" 2>/dev/null || true
1117+
$SMOLVM machine delete "$vm_a" -f 2>/dev/null || true
1118+
$SMOLVM machine delete "$vm_b" -f 2>/dev/null || true
1119+
1120+
[[ "$status_a" == *"running"* ]] && [[ "$status_b" == *"running"* ]]
1121+
}
1122+
1123+
run_test "Concurrent machine starts" test_concurrent_machine_start || true
1124+
10891125
echo ""
10901126
echo "--- Machine Run (Ephemeral) Tests ---"
10911127
echo ""

0 commit comments

Comments
 (0)