Skip to content

Commit f8694ce

Browse files
committed
Indexing: Add scan bytes counter + heal-contract fix
- Add `ScanProgress.bytes_scanned` (resolved post-dedup physical bytes), incremented once per stored entry after the hardlink-dedup match so it tracks the stored physical-size sums apples-to-apples. `snapshot()` now returns a `ScanProgressSnapshot` struct.
- Add `ScanSummary.total_physical_bytes` at both construction sites, and persist it via `UpdateMeta` in the completion handler.
- Fix the broken "interrupted scan → fresh rescan" contract: new `WriteMessage::DeleteMeta` + `IndexStore::delete_meta` clear the previous `scan_completed_at` at scan start (before `TruncateData`), and all completion-handler meta writes are now gated behind `!summary.was_cancelled`. Calibration keys stay intact so they keep describing the last completed scan; the reconcile/live transition is unchanged. `DeleteMeta` doesn't bump the writer generation.
- Add `ScanCalibration` + the connection-taking `IndexStore::read_scan_calibration` helper (reads `total_entries`/`total_physical_bytes`/`scan_duration_ms` as `Option`s).
- Promote `get_space_info_for_path` to `pub(crate)` for reuse.
- Tests: bytes-counter parity with hardlinks, summary == final counter, writer `UpdateMeta`/`DeleteMeta` round-trips, calibration read helper (seeded + missing keys).
1 parent b40b579 commit f8694ce

5 files changed

Lines changed: 322 additions & 38 deletions

File tree

apps/desktop/src-tauri/src/file_system/volume/backends/local_posix.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -796,7 +796,7 @@ impl VolumeReadStream for LocalPosixReadStream {
796796
/// On macOS, uses `NSURLVolumeAvailableCapacityForImportantUsageKey` which includes purgeable
797797
/// space (APFS snapshots, iCloud caches), matching what Finder reports. Falls back to `statvfs`
798798
/// if the NSURL query fails. On Linux, uses `statvfs` directly (no purgeable space concept).
799-
fn get_space_info_for_path(path: &Path) -> Result<SpaceInfo, VolumeError> {
799+
pub(crate) fn get_space_info_for_path(path: &Path) -> Result<SpaceInfo, VolumeError> {
800800
// On macOS, prefer the NSURL API that accounts for purgeable space.
801801
#[cfg(target_os = "macos")]
802802
{

apps/desktop/src-tauri/src/indexing/manager.rs

Lines changed: 56 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,22 @@ impl IndexManager {
303303
return Err("Scan already running".to_string());
304304
}
305305

306-
// Step 0: Truncate entries + dir_stats so the scan inserts into an empty DB.
306+
// Step 0a: Clear the previous scan's completion marker BEFORE truncating.
307+
// Without this, a rescan killed mid-way (power loss, `kill -9`) leaves the
308+
// PREVIOUS scan's `scan_completed_at` in meta on top of a truncated/partial
309+
// `entries` table, so the next startup takes the journal-replay path over a
310+
// gutted index instead of the `IncompletePreviousScan` fresh rescan. The
311+
// calibration keys (`total_entries`, `total_physical_bytes`, `scan_duration_ms`)
312+
// are intentionally left intact so they keep describing the last COMPLETED
313+
// scan throughout this one. The same flush below covers both sends.
314+
if let Err(e) = self
315+
.writer
316+
.send(WriteMessage::DeleteMeta("scan_completed_at".to_string()))
317+
{
318+
log::warn!("Failed to send DeleteMeta(scan_completed_at): {e}");
319+
}
320+
321+
// Step 0b: Truncate entries + dir_stats so the scan inserts into an empty DB.
307322
// Without this, INSERT OR REPLACE on a populated table with the `platform_case`
308323
// collation is ~12x slower (30 min vs 2.5 min), and old rows with stale IDs
309324
// accumulate as orphaned subtrees, bloating the DB 3-4x per scan cycle.
@@ -382,13 +397,13 @@ impl IndexManager {
382397
if scan_done_progress.load(Ordering::Relaxed) {
383398
break;
384399
}
385-
let (entries, dirs) = progress.snapshot();
400+
let snap = progress.snapshot();
386401
let _ = app_progress.emit(
387402
"index-scan-progress",
388403
IndexScanProgressEvent {
389404
volume_id: volume_id_progress.clone(),
390-
entries_scanned: entries,
391-
dirs_found: dirs,
405+
entries_scanned: snap.entries_scanned,
406+
dirs_found: snap.dirs_found,
392407
},
393408
);
394409

@@ -550,26 +565,40 @@ impl IndexManager {
550565
// can fail (e.g. "database is locked") and cause an early return.
551566
// Without this, scan_completed_at is never persisted and the next
552567
// startup triggers a full rescan of the entire volume.
553-
let now = std::time::SystemTime::now()
554-
.duration_since(std::time::UNIX_EPOCH)
555-
.map(|d| d.as_secs().to_string())
556-
.unwrap_or_default();
557-
let _ = writer.send(WriteMessage::UpdateMeta {
558-
key: "scan_completed_at".to_string(),
559-
value: now,
560-
});
561-
let _ = writer.send(WriteMessage::UpdateMeta {
562-
key: "scan_duration_ms".to_string(),
563-
value: summary.duration_ms.to_string(),
564-
});
565-
let _ = writer.send(WriteMessage::UpdateMeta {
566-
key: "total_entries".to_string(),
567-
value: summary.total_entries.to_string(),
568-
});
569-
let _ = writer.send(WriteMessage::UpdateMeta {
570-
key: "volume_path".to_string(),
571-
value: "/".to_string(),
572-
});
568+
//
569+
// Gate ALL meta writes behind `!was_cancelled`: a user-stopped scan
570+
// holds only partial totals, and writing `scan_completed_at` for it
571+
// would mark a partial index as complete — the next startup would skip
572+
// the `IncompletePreviousScan` fresh rescan. With the clear-at-start
573+
// above, a cancelled scan leaves NO completion marker, so it heals on
574+
// restart. The reconcile/live transition below is intentionally NOT
575+
// gated; only the meta writes are.
576+
if !summary.was_cancelled {
577+
let now = std::time::SystemTime::now()
578+
.duration_since(std::time::UNIX_EPOCH)
579+
.map(|d| d.as_secs().to_string())
580+
.unwrap_or_default();
581+
let _ = writer.send(WriteMessage::UpdateMeta {
582+
key: "scan_completed_at".to_string(),
583+
value: now,
584+
});
585+
let _ = writer.send(WriteMessage::UpdateMeta {
586+
key: "scan_duration_ms".to_string(),
587+
value: summary.duration_ms.to_string(),
588+
});
589+
let _ = writer.send(WriteMessage::UpdateMeta {
590+
key: "total_entries".to_string(),
591+
value: summary.total_entries.to_string(),
592+
});
593+
let _ = writer.send(WriteMessage::UpdateMeta {
594+
key: "total_physical_bytes".to_string(),
595+
value: summary.total_physical_bytes.to_string(),
596+
});
597+
let _ = writer.send(WriteMessage::UpdateMeta {
598+
key: "volume_path".to_string(),
599+
value: "/".to_string(),
600+
});
601+
}
573602

574603
// Open a read connection for path resolution during replay
575604
let replay_conn = match IndexStore::open_read_connection(&writer.db_path()) {
@@ -682,11 +711,9 @@ impl IndexManager {
682711

683712
let db_file_size = self.store.db_file_size().ok();
684713

685-
let (entries_scanned, dirs_found) = self
686-
.scan_handle
687-
.as_ref()
688-
.map(|h| h.progress.snapshot())
689-
.unwrap_or((0, 0));
714+
let snap = self.scan_handle.as_ref().map(|h| h.progress.snapshot());
715+
let entries_scanned = snap.map(|s| s.entries_scanned).unwrap_or(0);
716+
let dirs_found = snap.map(|s| s.dirs_found).unwrap_or(0);
690717

691718
Ok(IndexStatusResponse {
692719
initialized: true,

apps/desktop/src-tauri/src/indexing/scanner.rs

Lines changed: 129 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -104,22 +104,37 @@ impl Default for ScanConfig {
104104
pub struct ScanProgress {
105105
pub entries_scanned: Arc<AtomicU64>,
106106
pub dirs_found: Arc<AtomicU64>,
107+
/// Resolved post-dedup physical bytes seen so far. Each entry contributes its
108+
/// `physical_size.unwrap_or(0)` after hardlink dedup, so the live numerator
109+
/// follows the exact same rules as the stored physical-size sums (directories,
110+
/// symlinks, and second+ hardlinks contribute 0).
111+
pub bytes_scanned: Arc<AtomicU64>,
112+
}
113+
114+
/// A point-in-time read of an active scan's progress counters.
115+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
116+
pub struct ScanProgressSnapshot {
117+
pub entries_scanned: u64,
118+
pub dirs_found: u64,
119+
pub bytes_scanned: u64,
107120
}
108121

109122
impl ScanProgress {
110123
fn new() -> Self {
111124
Self {
112125
entries_scanned: Arc::new(AtomicU64::new(0)),
113126
dirs_found: Arc::new(AtomicU64::new(0)),
127+
bytes_scanned: Arc::new(AtomicU64::new(0)),
114128
}
115129
}
116130

117131
/// Read current progress snapshot.
118-
pub fn snapshot(&self) -> (u64, u64) {
119-
(
120-
self.entries_scanned.load(Ordering::Relaxed),
121-
self.dirs_found.load(Ordering::Relaxed),
122-
)
132+
pub fn snapshot(&self) -> ScanProgressSnapshot {
133+
ScanProgressSnapshot {
134+
entries_scanned: self.entries_scanned.load(Ordering::Relaxed),
135+
dirs_found: self.dirs_found.load(Ordering::Relaxed),
136+
bytes_scanned: self.bytes_scanned.load(Ordering::Relaxed),
137+
}
123138
}
124139
}
125140

@@ -141,6 +156,9 @@ impl ScanHandle {
141156
pub struct ScanSummary {
142157
pub total_entries: u64,
143158
pub total_dirs: u64,
159+
/// Resolved post-dedup physical bytes the scan summed (the final value of the
160+
/// `bytes_scanned` counter). Apples-to-apples with the stored physical-size sums.
161+
pub total_physical_bytes: u64,
144162
pub duration_ms: u64,
145163
pub was_cancelled: bool,
146164
}
@@ -267,6 +285,7 @@ fn run_scan(
267285
let mut batch: Vec<EntryRow> = Vec::with_capacity(batch_size);
268286
let mut total_entries: u64 = 0;
269287
let mut total_dirs: u64 = 0;
288+
let mut total_physical_bytes: u64 = 0;
270289
// Tracks inodes with nlink > 1 so each hardlinked file's size is counted only once.
271290
// Files with nlink == 1 (the vast majority) skip the set entirely.
272291
let mut seen_inodes: HashSet<u64> = HashSet::new();
@@ -305,6 +324,7 @@ fn run_scan(
305324
return Ok(ScanSummary {
306325
total_entries,
307326
total_dirs,
327+
total_physical_bytes,
308328
duration_ms: start.elapsed().as_millis() as u64,
309329
was_cancelled: true,
310330
});
@@ -417,6 +437,14 @@ fn run_scan(
417437
total_entries += 1;
418438
progress.entries_scanned.fetch_add(1, Ordering::Relaxed);
419439

440+
// Accumulate the resolved post-dedup physical bytes once per stored entry.
441+
// Placed here (after the hardlink-dedup match, alongside the entry counters)
442+
// so it follows the exact dedup rules of the stored sums: directories,
443+
// symlinks, and second+ hardlinks resolved to `None` contribute 0.
444+
let entry_physical = physical_size.unwrap_or(0);
445+
total_physical_bytes += entry_physical;
446+
progress.bytes_scanned.fetch_add(entry_physical, Ordering::Relaxed);
447+
420448
batch.push(scanned);
421449
if batch.len() >= batch_size {
422450
flush_batch(&mut batch, writer)?;
@@ -435,6 +463,7 @@ fn run_scan(
435463
Ok(ScanSummary {
436464
total_entries,
437465
total_dirs,
466+
total_physical_bytes,
438467
duration_ms: start.elapsed().as_millis() as u64,
439468
was_cancelled: false,
440469
})
@@ -741,9 +770,10 @@ mod tests {
741770
assert!(summary.duration_ms < 10_000, "scan should complete quickly");
742771

743772
// Verify progress matches summary
744-
let (entries, dirs) = handle.progress.snapshot();
745-
assert_eq!(entries, summary.total_entries);
746-
assert_eq!(dirs, summary.total_dirs);
773+
let snap = handle.progress.snapshot();
774+
assert_eq!(snap.entries_scanned, summary.total_entries);
775+
assert_eq!(snap.dirs_found, summary.total_dirs);
776+
assert_eq!(snap.bytes_scanned, summary.total_physical_bytes);
747777

748778
// Wait for writer to process all messages + aggregation
749779
writer.flush_blocking().unwrap();
@@ -1101,4 +1131,95 @@ mod tests {
11011131
// Should resolve to the actual entry ID, NOT ROOT_ID
11021132
assert_eq!(ctx.lookup_parent(subtree_root), Some(noname_id));
11031133
}
1134+
1135+
/// Sum every stored row's `physical_size` (NULLs count as 0), matching how the
1136+
/// aggregator treats per-entry physical bytes.
1137+
fn sum_stored_physical_bytes(db_path: &Path) -> u64 {
1138+
let conn = IndexStore::open_read_connection(db_path).unwrap();
1139+
conn.query_row("SELECT COALESCE(SUM(physical_size), 0) FROM entries", [], |row| {
1140+
row.get::<_, i64>(0)
1141+
})
1142+
.unwrap() as u64
1143+
}
1144+
1145+
/// Build a tree with BOTH plain single-link files AND a hardlink pair. The
1146+
/// single-link files are what catch a "bytes increment placed inside the
1147+
/// dedup arm" bug: that arm fires only for `nlink > 1`, so single-link files
1148+
/// would contribute nothing and near-zero the counter.
1149+
#[cfg(unix)]
1150+
fn create_tree_with_hardlinks(dir: &Path) {
1151+
// Plain single-link files (the majority).
1152+
fs::write(dir.join("plain1.bin"), vec![0u8; 4096]).unwrap();
1153+
fs::write(dir.join("plain2.bin"), vec![0u8; 12288]).unwrap();
1154+
let sub = dir.join("sub");
1155+
fs::create_dir_all(&sub).unwrap();
1156+
fs::write(sub.join("plain3.bin"), vec![0u8; 8192]).unwrap();
1157+
1158+
// A hardlink pair: two directory entries, one inode. Only the first link's
1159+
// size should be counted; the second resolves to None.
1160+
let target = dir.join("linked.bin");
1161+
fs::write(&target, vec![0u8; 16384]).unwrap();
1162+
fs::hard_link(&target, dir.join("linked-alias.bin")).unwrap();
1163+
}
1164+
1165+
#[test]
1166+
#[cfg(unix)]
1167+
fn bytes_scanned_matches_stored_physical_sum_with_hardlinks() {
1168+
let scan_root = scan_test_tempdir();
1169+
create_tree_with_hardlinks(scan_root.path());
1170+
1171+
let (writer, db_path, _db_dir) = setup_writer();
1172+
let config = ScanConfig {
1173+
root: scan_root.path().to_path_buf(),
1174+
batch_size: 100,
1175+
num_threads: 1,
1176+
};
1177+
1178+
let (handle, join_handle) = scan_volume(config, &writer).unwrap();
1179+
let summary = join_handle.join().expect("scan thread panicked").unwrap();
1180+
assert!(!summary.was_cancelled);
1181+
1182+
writer.flush_blocking().unwrap();
1183+
writer.shutdown();
1184+
1185+
let counter_total = handle.progress.snapshot().bytes_scanned;
1186+
let stored_total = sum_stored_physical_bytes(&db_path);
1187+
1188+
// The live counter follows the exact post-dedup rules of the stored rows.
1189+
assert_eq!(
1190+
counter_total, stored_total,
1191+
"bytes_scanned counter must equal the sum of stored physical sizes"
1192+
);
1193+
// Sanity: the plain single-link files alone exceed any single hardlink, so a
1194+
// counter that only ran inside the dedup arm would fall well below this.
1195+
assert!(
1196+
counter_total >= 4096 + 12288 + 8192,
1197+
"counter must include the single-link files, not just the hardlink"
1198+
);
1199+
}
1200+
1201+
#[test]
1202+
#[cfg(unix)]
1203+
fn scan_summary_total_physical_bytes_equals_final_counter() {
1204+
let scan_root = scan_test_tempdir();
1205+
create_tree_with_hardlinks(scan_root.path());
1206+
1207+
let (writer, _db_path, _db_dir) = setup_writer();
1208+
let config = ScanConfig {
1209+
root: scan_root.path().to_path_buf(),
1210+
batch_size: 100,
1211+
num_threads: 1,
1212+
};
1213+
1214+
let (handle, join_handle) = scan_volume(config, &writer).unwrap();
1215+
let summary = join_handle.join().expect("scan thread panicked").unwrap();
1216+
writer.shutdown();
1217+
1218+
assert_eq!(
1219+
summary.total_physical_bytes,
1220+
handle.progress.snapshot().bytes_scanned,
1221+
"summary.total_physical_bytes must equal the final counter value"
1222+
);
1223+
assert!(summary.total_physical_bytes > 0, "scan should sum some physical bytes");
1224+
}
11041225
}

0 commit comments

Comments
 (0)