1+ use common:: TinySet ;
2+
13use super :: size_hint:: estimate_intersection;
2- use crate :: docset:: { DocSet , SeekDangerResult , TERMINATED } ;
4+ use crate :: docset:: { DocSet , SeekDangerResult , BLOCK_NUM_TINYBITSETS , TERMINATED } ;
35use crate :: query:: term_query:: TermScorer ;
46use crate :: query:: { EmptyScorer , Scorer } ;
57use crate :: { DocId , Score } ;
@@ -17,7 +19,7 @@ use crate::{DocId, Score};
1719/// `size_hint` of the intersection.
1820pub fn intersect_scorers (
1921 mut scorers : Vec < Box < dyn Scorer > > ,
20- num_docs_segment : u32 ,
22+ segment_num_docs : u32 ,
2123) -> Box < dyn Scorer > {
2224 if scorers. is_empty ( ) {
2325 return Box :: new ( EmptyScorer ) ;
@@ -42,14 +44,14 @@ pub fn intersect_scorers(
4244 left : * ( left. downcast :: < TermScorer > ( ) . map_err ( |_| ( ) ) . unwrap ( ) ) ,
4345 right : * ( right. downcast :: < TermScorer > ( ) . map_err ( |_| ( ) ) . unwrap ( ) ) ,
4446 others : scorers,
45- num_docs : num_docs_segment ,
47+ segment_num_docs ,
4648 } ) ;
4749 }
4850 Box :: new ( Intersection {
4951 left,
5052 right,
5153 others : scorers,
52- num_docs : num_docs_segment ,
54+ segment_num_docs ,
5355 } )
5456}
5557
@@ -58,7 +60,7 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
5860 left : TDocSet ,
5961 right : TDocSet ,
6062 others : Vec < TOtherDocSet > ,
61- num_docs : u32 ,
63+ segment_num_docs : u32 ,
6264}
6365
6466fn go_to_first_doc < TDocSet : DocSet > ( docsets : & mut [ TDocSet ] ) -> DocId {
@@ -78,7 +80,10 @@ fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
7880
7981impl < TDocSet : DocSet > Intersection < TDocSet , TDocSet > {
8082 /// `segment_num_docs` is the number of documents in the segment.
81- pub ( crate ) fn new ( mut docsets : Vec < TDocSet > , num_docs : u32 ) -> Intersection < TDocSet , TDocSet > {
83+ pub ( crate ) fn new (
84+ mut docsets : Vec < TDocSet > ,
85+ segment_num_docs : u32 ,
86+ ) -> Intersection < TDocSet , TDocSet > {
8287 let num_docsets = docsets. len ( ) ;
8388 assert ! ( num_docsets >= 2 ) ;
8489 docsets. sort_by_key ( |docset| docset. cost ( ) ) ;
@@ -97,7 +102,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
97102 left,
98103 right,
99104 others : docsets,
100- num_docs ,
105+ segment_num_docs ,
101106 }
102107 }
103108}
@@ -214,7 +219,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
214219 [ self . left . size_hint ( ) , self . right . size_hint ( ) ]
215220 . into_iter ( )
216221 . chain ( self . others . iter ( ) . map ( DocSet :: size_hint) ) ,
217- self . num_docs ,
222+ self . segment_num_docs ,
218223 )
219224 }
220225
@@ -224,6 +229,91 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
224229 // If there are docsets that are bad at skipping, they should also influence the cost.
225230 self . left . cost ( )
226231 }
232+
233+ fn count_including_deleted ( & mut self ) -> u32 {
234+ const DENSITY_THRESHOLD_INVERSE : u32 = 32 ;
235+ if self
236+ . left
237+ . size_hint ( )
238+ . saturating_mul ( DENSITY_THRESHOLD_INVERSE )
239+ < self . segment_num_docs
240+ {
241+ // Sparse path: if the lead iterator covers less than ~3% of docs,
242+ // the block approach wastes time on mostly-empty blocks.
243+ self . count_including_deleted_sparse ( )
244+ } else {
245+ // Dense approach. We push documents into a block bitset to then
246+ // perform count using popcount.
247+ self . count_including_deleted_dense ( )
248+ }
249+ }
250+ }
251+
/// A block of `BLOCK_NUM_TINYBITSETS` empty `TinySet`s.
/// `count_including_deleted_dense` copies it to initialise each per-docset bitmask.
252+ const EMPTY_BLOCK : [ TinySet ; BLOCK_NUM_TINYBITSETS ] = [ TinySet :: EMPTY ; BLOCK_NUM_TINYBITSETS ] ;
253+
254+ /// ANDs `other` into `mask` in-place. Returns `true` if the result is all zeros.
255+ #[ inline]
256+ fn and_blocks_and_return_is_empty (
257+ mask : & mut [ TinySet ; BLOCK_NUM_TINYBITSETS ] ,
258+ update : & [ TinySet ; BLOCK_NUM_TINYBITSETS ] ,
259+ ) -> bool {
260+ let mut all_empty = true ;
261+ for ( mask_tinyset, update_tinyset) in mask. iter_mut ( ) . zip ( update. iter ( ) ) {
262+ * mask_tinyset = mask_tinyset. intersect ( * update_tinyset) ;
263+ all_empty &= mask_tinyset. is_empty ( ) ;
264+ }
265+ all_empty
266+ }
267+
268+ impl < TDocSet : DocSet , TOtherDocSet : DocSet > Intersection < TDocSet , TOtherDocSet > {
269+ fn count_including_deleted_sparse ( & mut self ) -> u32 {
270+ let mut count = 0u32 ;
271+ let mut doc = self . doc ( ) ;
272+ while doc != TERMINATED {
273+ count += 1 ;
274+ doc = self . advance ( ) ;
275+ }
276+ count
277+ }
278+
279+ /// Dense block-wise bitmask intersection count.
280+ ///
281+ /// Fills a 1024-doc window from each iterator, ANDs the bitmasks together,
282+ /// and popcounts the result. `fill_bitset_block` handles seeking tails forward
283+ /// when they lag behind the current block.
284+ fn count_including_deleted_dense ( & mut self ) -> u32 {
285+ let mut count = 0u32 ;
286+ let mut next_base = self . left . doc ( ) ;
287+
288+ while next_base < TERMINATED {
289+ let base = next_base;
290+
291+ // Fill lead bitmask.
292+ let mut mask = EMPTY_BLOCK ;
293+ next_base = next_base. max ( self . left . fill_bitset_block ( base, & mut mask) ) ;
294+
295+ let mut tail_mask = EMPTY_BLOCK ;
296+ next_base = next_base. max ( self . right . fill_bitset_block ( base, & mut tail_mask) ) ;
297+
298+ if and_blocks_and_return_is_empty ( & mut mask, & tail_mask) {
299+ continue ;
300+ }
301+ // AND with each additional tail.
302+ for other in & mut self . others {
303+ let mut other_mask = EMPTY_BLOCK ;
304+ next_base = next_base. max ( other. fill_bitset_block ( base, & mut other_mask) ) ;
305+ if and_blocks_and_return_is_empty ( & mut mask, & other_mask) {
306+ continue ;
307+ }
308+ }
309+
310+ for tinyset in & mask {
311+ count += tinyset. len ( ) ;
312+ }
313+ }
314+
315+ count
316+ }
227317}
228318
229319impl < TScorer , TOtherScorer > Scorer for Intersection < TScorer , TOtherScorer >
@@ -421,6 +511,82 @@ mod tests {
421511 }
422512 }
423513
514+ proptest ! {
515+ #[ test]
516+ fn prop_test_count_including_deleted_matches_default(
517+ a in sorted_deduped_vec( 1200 , 400 ) ,
518+ b in sorted_deduped_vec( 1200 , 400 ) ,
519+ c in sorted_deduped_vec( 1200 , 400 ) ,
520+ num_docs in 1200u32 ..2000u32 ,
521+ ) {
522+ // Compute expected count via set intersection.
523+ let expected: u32 = a. iter( )
524+ . filter( |doc| b. contains( doc) && c. contains( doc) )
525+ . count( ) as u32 ;
526+
527+ // Test count_including_deleted (dense path).
528+ let make_intersection = || {
529+ Intersection :: new(
530+ vec![
531+ VecDocSet :: from( a. clone( ) ) ,
532+ VecDocSet :: from( b. clone( ) ) ,
533+ VecDocSet :: from( c. clone( ) ) ,
534+ ] ,
535+ num_docs,
536+ )
537+ } ;
538+
539+ let mut intersection = make_intersection( ) ;
540+ let count = intersection. count_including_deleted( ) ;
541+ prop_assert_eq!( count, expected,
542+ "count_including_deleted mismatch: a={:?}, b={:?}, c={:?}" , a, b, c) ;
543+ }
544+ }
545+
#[test]
fn test_count_including_deleted_two_way() {
    // Docs 3 and 9 are the only ones present on both sides.
    let docsets = vec![
        VecDocSet::from(vec![1, 3, 9]),
        VecDocSet::from(vec![3, 4, 9, 18]),
    ];
    let mut intersection = Intersection::new(docsets, 100);
    let count = intersection.count_including_deleted();
    assert_eq!(count, 2);
}
553+
#[test]
fn test_count_including_deleted_empty() {
    // No doc id is shared by all three docsets, so the intersection is empty.
    let docsets = vec![
        VecDocSet::from(vec![1, 3]),
        VecDocSet::from(vec![1, 4]),
        VecDocSet::from(vec![3, 9]),
    ];
    let mut intersection = Intersection::new(docsets, 100);
    let count = intersection.count_including_deleted();
    assert_eq!(count, 0);
}
562+
/// Test with enough documents to exercise the dense path (>= num_docs/32).
#[test]
fn test_count_including_deleted_dense_path() {
    // Both docsets hold many docs relative to the 2000-doc segment.
    let evens: Vec<u32> = (0..2000).step_by(2).collect();
    let multiples_of_three: Vec<u32> = (0..2000).step_by(3).collect();
    // The intersection is made of the even numbers that are also multiples of 3.
    let expected = evens.iter().filter(|doc| **doc % 3 == 0).count() as u32;

    let docsets = vec![
        VecDocSet::from(evens),
        VecDocSet::from(multiples_of_three),
    ];
    let mut intersection = Intersection::new(docsets, 2000);
    assert_eq!(intersection.count_including_deleted(), expected);
}
576+
/// Test that spans multiple blocks (>1024 docs).
#[test]
fn test_count_including_deleted_multi_block() {
    let all_docs: Vec<u32> = (0..5000).collect();
    let every_seventh: Vec<u32> = (0..5000).step_by(7).collect();
    // `every_seventh` is a subset of `all_docs`, so the intersection is `every_seventh` itself.
    let expected = every_seventh.len() as u32;

    let docsets = vec![
        VecDocSet::from(all_docs),
        VecDocSet::from(every_seventh),
    ];
    let mut intersection = Intersection::new(docsets, 5000);
    assert_eq!(intersection.count_including_deleted(), expected);
}
589+
424590 #[ test]
425591 fn test_bug_2811_intersection_candidate_should_increase ( ) {
426592 let mut schema_builder = Schema :: builder ( ) ;
0 commit comments