Optimized intersection count using a bitset when the first leg is dense

fulmicoton · web-flow · commit d2c1b8bc2c7b · 2026-04-06T12:01:52.000-04:00
diff --git a/common/src/bitset.rs b/common/src/bitset.rs
@@ -47,6 +47,9 @@ impl TinySet {
         TinySet(val)
     }
 
+    /// An empty `TinySet` constant.
+    pub const EMPTY: TinySet = TinySet(0u64);
+
     /// Returns an empty `TinySet`.
     #[inline]
     pub fn empty() -> TinySet {
diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs
@@ -1,5 +1,6 @@
 use super::Collector;
 use crate::collector::SegmentCollector;
+use crate::query::Weight;
 use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
 
 /// `CountCollector` collector only counts how many
@@ -55,6 +56,15 @@ impl Collector for Count {
     fn merge_fruits(&self, segment_counts: Vec<usize>) -> crate::Result<usize> {
         Ok(segment_counts.into_iter().sum())
     }
+
+    fn collect_segment(
+        &self,
+        weight: &dyn Weight,
+        _segment_ord: u32,
+        reader: &SegmentReader,
+    ) -> crate::Result<usize> {
+        Ok(weight.count(reader)? as usize)
+    }
 }
 
 #[derive(Default)]
diff --git a/src/docset.rs b/src/docset.rs
@@ -1,5 +1,7 @@
 use std::borrow::{Borrow, BorrowMut};
 
+use common::TinySet;
+
 use crate::fastfield::AliveBitSet;
 use crate::DocId;
 
@@ -14,6 +16,12 @@ pub const TERMINATED: DocId = i32::MAX as u32;
 /// exactly this size as long as we can fill the buffer.
 pub const COLLECT_BLOCK_BUFFER_LEN: usize = 64;
 
+/// Number of `TinySet` (64-bit) buckets in a block used by [`DocSet::fill_bitset_block`].
+pub const BLOCK_NUM_TINYBITSETS: usize = 16;
+
+/// Number of doc IDs covered by one block: `BLOCK_NUM_TINYBITSETS * 64 = 1024`.
+pub const BLOCK_WINDOW: u32 = BLOCK_NUM_TINYBITSETS as u32 * 64;
+
 /// Represents an iterable set of sorted doc ids.
 pub trait DocSet: Send {
     /// Goes to the next element.
@@ -160,6 +168,31 @@ pub trait DocSet: Send {
         self.size_hint() as u64
     }
 
+    /// Fills a bitmask representing which documents in `[min_doc, min_doc + BLOCK_WINDOW)` are
+    /// present in this docset.
+    ///
+    /// The window is divided into `BLOCK_NUM_TINYBITSETS` buckets of 64 docs each.
+    /// Returns the next doc `>= min_doc + BLOCK_WINDOW`, or `TERMINATED` if exhausted.
+    fn fill_bitset_block(
+        &mut self,
+        min_doc: DocId,
+        mask: &mut [TinySet; BLOCK_NUM_TINYBITSETS],
+    ) -> DocId {
+        self.seek(min_doc);
+        let horizon = min_doc + BLOCK_WINDOW;
+        loop {
+            let doc = self.doc();
+            if doc >= horizon {
+                return doc;
+            }
+            let delta = doc - min_doc;
+            mask[(delta / 64) as usize].insert_mut(delta % 64);
+            if self.advance() == TERMINATED {
+                return TERMINATED;
+            }
+        }
+    }
+
     /// Returns the number documents matching.
     /// Calling this method consumes the `DocSet`.
     fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
@@ -214,6 +247,18 @@ impl DocSet for &mut dyn DocSet {
         (**self).seek_danger(target)
     }
 
+    fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
+        (**self).fill_buffer(buffer)
+    }
+
+    fn fill_bitset_block(
+        &mut self,
+        min_doc: DocId,
+        mask: &mut [TinySet; BLOCK_NUM_TINYBITSETS],
+    ) -> DocId {
+        (**self).fill_bitset_block(min_doc, mask)
+    }
+
     fn doc(&self) -> u32 {
         (**self).doc()
     }
@@ -256,6 +301,15 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
         unboxed.fill_buffer(buffer)
     }
 
+    fn fill_bitset_block(
+        &mut self,
+        min_doc: DocId,
+        mask: &mut [TinySet; BLOCK_NUM_TINYBITSETS],
+    ) -> DocId {
+        let unboxed: &mut TDocSet = self.borrow_mut();
+        unboxed.fill_bitset_block(min_doc, mask)
+    }
+
     fn doc(&self) -> DocId {
         let unboxed: &TDocSet = self.borrow();
         unboxed.doc()
diff --git a/src/query/intersection.rs b/src/query/intersection.rs
@@ -1,5 +1,7 @@
+use common::TinySet;
+
 use super::size_hint::estimate_intersection;
-use crate::docset::{DocSet, SeekDangerResult, TERMINATED};
+use crate::docset::{DocSet, SeekDangerResult, BLOCK_NUM_TINYBITSETS, TERMINATED};
 use crate::query::term_query::TermScorer;
 use crate::query::{EmptyScorer, Scorer};
 use crate::{DocId, Score};
@@ -17,7 +19,7 @@ use crate::{DocId, Score};
 /// `size_hint` of the intersection.
 pub fn intersect_scorers(
     mut scorers: Vec<Box<dyn Scorer>>,
-    num_docs_segment: u32,
+    segment_num_docs: u32,
 ) -> Box<dyn Scorer> {
     if scorers.is_empty() {
         return Box::new(EmptyScorer);
@@ -42,14 +44,14 @@ pub fn intersect_scorers(
             left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
             right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
             others: scorers,
-            num_docs: num_docs_segment,
+            segment_num_docs,
         });
     }
     Box::new(Intersection {
         left,
         right,
         others: scorers,
-        num_docs: num_docs_segment,
+        segment_num_docs,
     })
 }
 
@@ -58,7 +60,7 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
     left: TDocSet,
     right: TDocSet,
     others: Vec<TOtherDocSet>,
-    num_docs: u32,
+    segment_num_docs: u32,
 }
 
 fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
@@ -78,7 +80,10 @@ fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
 
 impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
     /// num_docs is the number of documents in the segment.
-    pub(crate) fn new(mut docsets: Vec<TDocSet>, num_docs: u32) -> Intersection<TDocSet, TDocSet> {
+    pub(crate) fn new(
+        mut docsets: Vec<TDocSet>,
+        segment_num_docs: u32,
+    ) -> Intersection<TDocSet, TDocSet> {
         let num_docsets = docsets.len();
         assert!(num_docsets >= 2);
         docsets.sort_by_key(|docset| docset.cost());
@@ -97,7 +102,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
             left,
             right,
             others: docsets,
-            num_docs,
+            segment_num_docs,
         }
     }
 }
@@ -214,7 +219,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
             [self.left.size_hint(), self.right.size_hint()]
                 .into_iter()
                 .chain(self.others.iter().map(DocSet::size_hint)),
-            self.num_docs,
+            self.segment_num_docs,
         )
     }
 
@@ -224,6 +229,91 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
         // If there are docsets that are bad at skipping, they should also influence the cost.
         self.left.cost()
     }
+
+    fn count_including_deleted(&mut self) -> u32 {
+        const DENSITY_THRESHOLD_INVERSE: u32 = 32;
+        if self
+            .left
+            .size_hint()
+            .saturating_mul(DENSITY_THRESHOLD_INVERSE)
+            < self.segment_num_docs
+        {
+            // Sparse path: if the lead iterator covers less than ~3% of docs,
+            // the block approach wastes time on mostly-empty blocks.
+            self.count_including_deleted_sparse()
+        } else {
+            // Dense approach. We push documents into a block bitset to then
+            // perform count using popcount.
+            self.count_including_deleted_dense()
+        }
+    }
+}
+
+const EMPTY_BLOCK: [TinySet; BLOCK_NUM_TINYBITSETS] = [TinySet::EMPTY; BLOCK_NUM_TINYBITSETS];
+
+/// ANDs `other` into `mask` in-place. Returns `true` if the result is all zeros.
+#[inline]
+fn and_blocks_and_return_is_empty(
+    mask: &mut [TinySet; BLOCK_NUM_TINYBITSETS],
+    update: &[TinySet; BLOCK_NUM_TINYBITSETS],
+) -> bool {
+    let mut all_empty = true;
+    for (mask_tinyset, update_tinyset) in mask.iter_mut().zip(update.iter()) {
+        *mask_tinyset = mask_tinyset.intersect(*update_tinyset);
+        all_empty &= mask_tinyset.is_empty();
+    }
+    all_empty
+}
+
+impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet> {
+    fn count_including_deleted_sparse(&mut self) -> u32 {
+        let mut count = 0u32;
+        let mut doc = self.doc();
+        while doc != TERMINATED {
+            count += 1;
+            doc = self.advance();
+        }
+        count
+    }
+
+    /// Dense block-wise bitmask intersection count.
+    ///
+    /// Fills a 1024-doc window from each iterator, ANDs the bitmasks together,
+    /// and popcounts the result. `fill_bitset_block` handles seeking tails forward
+    /// when they lag behind the current block.
+    fn count_including_deleted_dense(&mut self) -> u32 {
+        let mut count = 0u32;
+        let mut next_base = self.left.doc();
+
+        while next_base < TERMINATED {
+            let base = next_base;
+
+            // Fill lead bitmask.
+            let mut mask = EMPTY_BLOCK;
+            next_base = next_base.max(self.left.fill_bitset_block(base, &mut mask));
+
+            let mut tail_mask = EMPTY_BLOCK;
+            next_base = next_base.max(self.right.fill_bitset_block(base, &mut tail_mask));
+
+            if and_blocks_and_return_is_empty(&mut mask, &tail_mask) {
+                continue;
+            }
+            // AND with each additional tail.
+            for other in &mut self.others {
+                let mut other_mask = EMPTY_BLOCK;
+                next_base = next_base.max(other.fill_bitset_block(base, &mut other_mask));
+                if and_blocks_and_return_is_empty(&mut mask, &other_mask) {
+                    continue;
+                }
+            }
+
+            for tinyset in &mask {
+                count += tinyset.len();
+            }
+        }
+
+        count
+    }
 }
 
 impl<TScorer, TOtherScorer> Scorer for Intersection<TScorer, TOtherScorer>
@@ -421,6 +511,82 @@ mod tests {
         }
     }
 
+    proptest! {
+        #[test]
+        fn prop_test_count_including_deleted_matches_default(
+            a in sorted_deduped_vec(1200, 400),
+            b in sorted_deduped_vec(1200, 400),
+            c in sorted_deduped_vec(1200, 400),
+            num_docs in 1200u32..2000u32,
+        ) {
+            // Compute expected count via set intersection.
+            let expected: u32 = a.iter()
+                .filter(|doc| b.contains(doc) && c.contains(doc))
+                .count() as u32;
+
+            // Test count_including_deleted (dense path).
+            let make_intersection = || {
+                Intersection::new(
+                    vec![
+                        VecDocSet::from(a.clone()),
+                        VecDocSet::from(b.clone()),
+                        VecDocSet::from(c.clone()),
+                    ],
+                    num_docs,
+                )
+            };
+
+            let mut intersection = make_intersection();
+            let count = intersection.count_including_deleted();
+            prop_assert_eq!(count, expected,
+                "count_including_deleted mismatch: a={:?}, b={:?}, c={:?}", a, b, c);
+        }
+    }
+
+    #[test]
+    fn test_count_including_deleted_two_way() {
+        let left = VecDocSet::from(vec![1, 3, 9]);
+        let right = VecDocSet::from(vec![3, 4, 9, 18]);
+        let mut intersection = Intersection::new(vec![left, right], 100);
+        assert_eq!(intersection.count_including_deleted(), 2);
+    }
+
+    #[test]
+    fn test_count_including_deleted_empty() {
+        let a = VecDocSet::from(vec![1, 3]);
+        let b = VecDocSet::from(vec![1, 4]);
+        let c = VecDocSet::from(vec![3, 9]);
+        let mut intersection = Intersection::new(vec![a, b, c], 100);
+        assert_eq!(intersection.count_including_deleted(), 0);
+    }
+
+    /// Test with enough documents to exercise the dense path (>= num_docs/32).
+    #[test]
+    fn test_count_including_deleted_dense_path() {
+        // Create dense docsets: many docs relative to segment size.
+        let docs_a: Vec<u32> = (0..2000).step_by(2).collect(); // even numbers 0..2000
+        let docs_b: Vec<u32> = (0..2000).step_by(3).collect(); // multiples of 3
+        let expected = docs_a.iter().filter(|d| *d % 3 == 0).count() as u32;
+
+        let a = VecDocSet::from(docs_a);
+        let b = VecDocSet::from(docs_b);
+        let mut intersection = Intersection::new(vec![a, b], 2000);
+        assert_eq!(intersection.count_including_deleted(), expected);
+    }
+
+    /// Test that spans multiple blocks (>1024 docs).
+    #[test]
+    fn test_count_including_deleted_multi_block() {
+        let docs_a: Vec<u32> = (0..5000).collect();
+        let docs_b: Vec<u32> = (0..5000).step_by(7).collect();
+        let expected = docs_b.len() as u32; // all of b is in a
+
+        let a = VecDocSet::from(docs_a);
+        let b = VecDocSet::from(docs_b);
+        let mut intersection = Intersection::new(vec![a, b], 5000);
+        assert_eq!(intersection.count_including_deleted(), expected);
+    }
+
     #[test]
     fn test_bug_2811_intersection_candidate_should_increase() {
         let mut schema_builder = Schema::builder();
diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs
@@ -117,6 +117,12 @@ impl DocSet for TermScorer {
     fn size_hint(&self) -> u32 {
         self.postings.size_hint()
     }
+
+    // TODO
+    // It is probably possible to optimize fill_bitset_block for TermScorer,
+    // working directly with the blocks, enabling vectorization.
+    // I did not manage to get a performance improvement on Mac ARM,
+    // and do not have access to x86 to investigate.
 }
 
 impl Scorer for TermScorer {

Original file line number	Diff line number	Diff line change
`@@ -47,6 +47,9 @@ impl TinySet {`
`47`	`47`	`TinySet(val)`
`48`	`48`	`}`
`49`	`49`
	`50`	+ /// An empty `TinySet` constant.
	`51`	`+ pub const EMPTY: TinySet = TinySet(0u64);`
	`52`	`+`
`50`	`53`	/// Returns an empty `TinySet`.
`51`	`54`	`#[inline]`
`52`	`55`	`pub fn empty() -> TinySet {`
Original file line number	Diff line number	Diff line change
`@@ -117,6 +117,12 @@ impl DocSet for TermScorer {`
`117`	`117`	`fn size_hint(&self) -> u32 {`
`118`	`118`	`self.postings.size_hint()`
`119`	`119`	`}`
	`120`	`+`
	`121`	`+ // TODO`
	`122`	`+ // It is probably possible to optimize fill_bitset_block for TermScorer,`
	`123`	`+ // working directly with the blocks, enabling vectorization.`
	`124`	`+ // I did not manage to get a performance improvement on Mac ARM,`
	`125`	`+ // and do not have access to x86 to investigate.`
`120`	`126`	`}`
`121`	`127`
`122`	`128`	`impl Scorer for TermScorer {`