@@ -104,22 +104,37 @@ impl Default for ScanConfig {
pub struct ScanProgress {
    /// Entries processed so far; incremented by one alongside the scan's own entry total.
    pub entries_scanned: Arc<AtomicU64>,
    /// Directories counted so far.
    pub dirs_found: Arc<AtomicU64>,
    /// Running total of resolved, post-dedup physical bytes. Every entry adds its
    /// `physical_size.unwrap_or(0)` after hardlink dedup, so this live numerator obeys
    /// the same rules as the stored physical-size sums: directories, symlinks, and
    /// second-and-later hardlinks add 0.
    pub bytes_scanned: Arc<AtomicU64>,
}

/// Plain-value copy of an active scan's progress counters, as read by one call to
/// `ScanProgress::snapshot`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ScanProgressSnapshot {
    /// Value of the `entries_scanned` counter when it was read.
    pub entries_scanned: u64,
    /// Value of the `dirs_found` counter when it was read.
    pub dirs_found: u64,
    /// Value of the `bytes_scanned` counter when it was read.
    pub bytes_scanned: u64,
}

impl ScanProgress {
    /// Creates a tracker whose three counters all start at zero.
    fn new() -> Self {
        let zeroed = || Arc::new(AtomicU64::new(0));
        Self {
            entries_scanned: zeroed(),
            dirs_found: zeroed(),
            bytes_scanned: zeroed(),
        }
    }

    /// Reads the current value of every progress counter.
    ///
    /// The counters are loaded one after another with `Ordering::Relaxed`, so a
    /// snapshot taken while the scan is still running is a best-effort reading:
    /// the three values are not guaranteed to describe the same instant.
    pub fn snapshot(&self) -> ScanProgressSnapshot {
        fn read(counter: &AtomicU64) -> u64 {
            counter.load(Ordering::Relaxed)
        }

        let entries_scanned = read(&self.entries_scanned);
        let dirs_found = read(&self.dirs_found);
        let bytes_scanned = read(&self.bytes_scanned);
        ScanProgressSnapshot {
            entries_scanned,
            dirs_found,
            bytes_scanned,
        }
    }
}
125140
@@ -141,6 +156,9 @@ impl ScanHandle {
pub struct ScanSummary {
    /// Number of entries the scan processed before it returned.
    pub total_entries: u64,
    /// Number of directories the scan counted before it returned.
    pub total_dirs: u64,
    /// Sum of resolved, post-dedup physical bytes: the final value of the
    /// `bytes_scanned` progress counter, directly comparable with the stored
    /// physical-size sums.
    pub total_physical_bytes: u64,
    /// Elapsed time measured by the scan, in milliseconds.
    pub duration_ms: u64,
    /// `true` when the scan returned early; the totals then cover only the work
    /// done up to that point.
    pub was_cancelled: bool,
}
@@ -267,6 +285,7 @@ fn run_scan(
267285 let mut batch: Vec < EntryRow > = Vec :: with_capacity ( batch_size) ;
268286 let mut total_entries: u64 = 0 ;
269287 let mut total_dirs: u64 = 0 ;
288+ let mut total_physical_bytes: u64 = 0 ;
270289 // Tracks inodes with nlink > 1 so each hardlinked file's size is counted only once.
271290 // Files with nlink == 1 (the vast majority) skip the set entirely.
272291 let mut seen_inodes: HashSet < u64 > = HashSet :: new ( ) ;
@@ -305,6 +324,7 @@ fn run_scan(
305324 return Ok ( ScanSummary {
306325 total_entries,
307326 total_dirs,
327+ total_physical_bytes,
308328 duration_ms : start. elapsed ( ) . as_millis ( ) as u64 ,
309329 was_cancelled : true ,
310330 } ) ;
@@ -417,6 +437,14 @@ fn run_scan(
417437 total_entries += 1 ;
418438 progress. entries_scanned . fetch_add ( 1 , Ordering :: Relaxed ) ;
419439
440+ // Accumulate the resolved post-dedup physical bytes once per stored entry.
441+ // Placed here (after the hardlink-dedup match, alongside the entry counters)
442+ // so it follows the exact dedup rules of the stored sums: directories,
443+ // symlinks, and second+ hardlinks resolved to `None` contribute 0.
444+ let entry_physical = physical_size. unwrap_or ( 0 ) ;
445+ total_physical_bytes += entry_physical;
446+ progress. bytes_scanned . fetch_add ( entry_physical, Ordering :: Relaxed ) ;
447+
420448 batch. push ( scanned) ;
421449 if batch. len ( ) >= batch_size {
422450 flush_batch ( & mut batch, writer) ?;
@@ -435,6 +463,7 @@ fn run_scan(
435463 Ok ( ScanSummary {
436464 total_entries,
437465 total_dirs,
466+ total_physical_bytes,
438467 duration_ms : start. elapsed ( ) . as_millis ( ) as u64 ,
439468 was_cancelled : false ,
440469 } )
@@ -741,9 +770,10 @@ mod tests {
741770 assert ! ( summary. duration_ms < 10_000 , "scan should complete quickly" ) ;
742771
743772 // Verify progress matches summary
744- let ( entries, dirs) = handle. progress . snapshot ( ) ;
745- assert_eq ! ( entries, summary. total_entries) ;
746- assert_eq ! ( dirs, summary. total_dirs) ;
773+ let snap = handle. progress . snapshot ( ) ;
774+ assert_eq ! ( snap. entries_scanned, summary. total_entries) ;
775+ assert_eq ! ( snap. dirs_found, summary. total_dirs) ;
776+ assert_eq ! ( snap. bytes_scanned, summary. total_physical_bytes) ;
747777
748778 // Wait for writer to process all messages + aggregation
749779 writer. flush_blocking ( ) . unwrap ( ) ;
@@ -1101,4 +1131,95 @@ mod tests {
11011131 // Should resolve to the actual entry ID, NOT ROOT_ID
11021132 assert_eq ! ( ctx. lookup_parent( subtree_root) , Some ( noname_id) ) ;
11031133 }
1134+
1135+ /// Sum every stored row's `physical_size` (NULLs count as 0), matching how the
1136+ /// aggregator treats per-entry physical bytes.
1137+ fn sum_stored_physical_bytes ( db_path : & Path ) -> u64 {
1138+ let conn = IndexStore :: open_read_connection ( db_path) . unwrap ( ) ;
1139+ conn. query_row ( "SELECT COALESCE(SUM(physical_size), 0) FROM entries" , [ ] , |row| {
1140+ row. get :: < _ , i64 > ( 0 )
1141+ } )
1142+ . unwrap ( ) as u64
1143+ }
1144+
/// Populate `dir` with a fixture mixing ordinary single-link files and one hardlink pair.
///
/// Layout created (all files are zero-filled):
/// - `plain1.bin` (4096 bytes), `plain2.bin` (12288 bytes), `sub/plain3.bin` (8192 bytes)
/// - `linked.bin` (16384 bytes) plus `linked-alias.bin`, a hard link to the same inode
///
/// The single-link files exist to expose a "bytes increment placed inside the dedup
/// arm" bug: that arm only runs for `nlink > 1`, so such a bug would skip them and
/// leave the counter close to zero.
#[cfg(unix)]
fn create_tree_with_hardlinks(dir: &Path) {
    let write_zeros = |path: std::path::PathBuf, len: usize| {
        fs::write(path, vec![0u8; len]).unwrap();
    };

    // Ordinary files with a single link each (the common case).
    write_zeros(dir.join("plain1.bin"), 4096);
    write_zeros(dir.join("plain2.bin"), 12288);
    let nested = dir.join("sub");
    fs::create_dir_all(&nested).unwrap();
    write_zeros(nested.join("plain3.bin"), 8192);

    // Two directory entries sharing one inode: only the first link's size should be
    // counted, the second is expected to resolve to `None`.
    let original = dir.join("linked.bin");
    write_zeros(original.clone(), 16384);
    fs::hard_link(&original, dir.join("linked-alias.bin")).unwrap();
}
1164+
/// Scans the hardlink fixture end to end and checks that the live `bytes_scanned`
/// counter equals the sum of the `physical_size` values written to the database.
#[test]
#[cfg(unix)]
fn bytes_scanned_matches_stored_physical_sum_with_hardlinks() {
    // Combined logical length of the three single-link fixture files.
    // NOTE(review): the lower-bound check below assumes a file's physical size is at
    // least its logical length — confirm this holds on compressing filesystems.
    const SINGLE_LINK_BYTES: u64 = 4096 + 12288 + 8192;

    let fixture_root = scan_test_tempdir();
    create_tree_with_hardlinks(fixture_root.path());

    let (writer, db_path, _db_dir) = setup_writer();
    let scan_config = ScanConfig {
        root: fixture_root.path().to_path_buf(),
        batch_size: 100,
        num_threads: 1,
    };

    let (handle, scan_thread) = scan_volume(scan_config, &writer).unwrap();
    let summary = scan_thread.join().expect("scan thread panicked").unwrap();
    assert!(!summary.was_cancelled);

    // Drain the writer before reading the database so every row is on disk.
    writer.flush_blocking().unwrap();
    writer.shutdown();

    let live_bytes = handle.progress.snapshot().bytes_scanned;
    let persisted_bytes = sum_stored_physical_bytes(&db_path);

    // The live counter obeys the same post-dedup rules as the stored rows.
    assert_eq!(
        live_bytes, persisted_bytes,
        "bytes_scanned counter must equal the sum of stored physical sizes"
    );
    // Sanity: the single-link files together outweigh the lone hardlinked inode, so a
    // counter updated only inside the dedup arm would land well under this bound.
    assert!(
        live_bytes >= SINGLE_LINK_BYTES,
        "counter must include the single-link files, not just the hardlink"
    );
}
1200+
/// Checks that the `total_physical_bytes` reported in the scan summary is exactly the
/// value the `bytes_scanned` progress counter holds once the scan thread has finished.
#[test]
#[cfg(unix)]
fn scan_summary_total_physical_bytes_equals_final_counter() {
    let fixture_root = scan_test_tempdir();
    create_tree_with_hardlinks(fixture_root.path());

    let (writer, _db_path, _db_dir) = setup_writer();
    let scan_config = ScanConfig {
        root: fixture_root.path().to_path_buf(),
        batch_size: 100,
        num_threads: 1,
    };

    let (handle, scan_thread) = scan_volume(scan_config, &writer).unwrap();
    let summary = scan_thread.join().expect("scan thread panicked").unwrap();
    // The database is never read here, so the writer is shut down without a flush.
    writer.shutdown();

    let final_counter = handle.progress.snapshot().bytes_scanned;
    assert_eq!(
        summary.total_physical_bytes,
        final_counter,
        "summary.total_physical_bytes must equal the final counter value"
    );
    assert!(summary.total_physical_bytes > 0, "scan should sum some physical bytes");
}
11041225}
0 commit comments