quickwit-oss
diff --git a/‎Cargo.toml‎
Lines changed: 5 additions & 0 deletions b/‎Cargo.toml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎benches/fill_bitset.rs‎
Lines changed: 106 additions & 0 deletions b/‎benches/fill_bitset.rs‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎benches/str_search_and_get.rs‎
Lines changed: 1 addition & 2 deletions b/‎benches/str_search_and_get.rs‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎common/src/bitset.rs‎
Lines changed: 11 additions & 0 deletions b/‎common/src/bitset.rs‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎examples/custom_collector.rs‎
Lines changed: 1 addition & 1 deletion b/‎examples/custom_collector.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/date_time_field.rs‎
Lines changed: 1 addition & 1 deletion b/‎examples/date_time_field.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/faceted_search_with_tweaked_score.rs‎
Lines changed: 2 additions & 2 deletions b/‎examples/faceted_search_with_tweaked_score.rs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/iterating_docs_and_positions.rs‎
Lines changed: 1 addition & 37 deletions b/‎examples/iterating_docs_and_positions.rs‎
Lines changed: 1 addition & 37 deletions
diff --git a/‎examples/phrase_prefix_search.rs‎
Lines changed: 1 addition & 1 deletion b/‎examples/phrase_prefix_search.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/snippet.rs‎
Lines changed: 1 addition & 1 deletion b/‎examples/snippet.rs‎
Lines changed: 1 addition & 1 deletion
@@ -201,3 +201,8 @@ harness = false
 [[bench]]
 name = "regex_all_terms"
 harness = false
+
+[[bench]]
+name = "fill_bitset"
+harness = false
+
@@ -0,0 +1,106 @@
+use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM};
+use common::BitSet;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use tantivy::postings::BlockSegmentPostings;
+use tantivy::schema::*;
+use tantivy::{doc, DocSet as _, Index, InvertedIndexReader as _, TantivyDocument};
+
+#[global_allocator]
+pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
+
+/// Benchmarks three ways of filling a `BitSet` with the postings of the term `hello`.
+///
+/// All variants run against the first segment of the index produced by
+/// `build_test_index`, and each iteration allocates a fresh `BitSet` sized to that
+/// segment's `max_doc`.
+fn main() {
+    let index = build_test_index();
+    let reader = index.reader().unwrap();
+    let searcher = reader.searcher();
+    let segment_reader = &searcher.segment_readers()[0];
+    let max_doc = segment_reader.max_doc();
+
+    let text_field = index.schema().get_field("text").unwrap();
+    let inverted_index = segment_reader.inverted_index(text_field).unwrap();
+    let hello = Term::from_field_text(text_field, "hello");
+    let term_info = inverted_index.get_term_info(&hello).unwrap().unwrap();
+
+    // Shared borrows are `Copy`, so each `move` closure below captures its own copy
+    // of these two handles.
+    let postings_source = &inverted_index;
+    let hello_info = &term_info;
+
+    let mut runner = BenchRunner::new();
+    runner.set_name("fill_bitset");
+    let mut group = runner.new_group();
+
+    // Variant 1: the path used by queries (AutomatonWeight, RangeQuery, etc.).
+    // It dispatches via DynInvertedIndexReader::fill_bitset_from_terminfo.
+    group.register("fill_bitset_from_terminfo (via trait)", move |_| {
+        let mut bitset = BitSet::with_max_value(max_doc);
+        postings_source
+            .fill_bitset_from_terminfo(hello_info, &mut bitset)
+            .unwrap();
+        black_box(bitset);
+    });
+
+    // Variant 2: obtain a docset through read_docset_from_terminfo, then let it fill
+    // the bitset itself.
+    group.register("read_docset + fill_bitset", move |_| {
+        let mut docset = postings_source
+            .read_docset_from_terminfo(hello_info)
+            .unwrap();
+        let mut bitset = BitSet::with_max_value(max_doc);
+        docset.fill_bitset(&mut bitset);
+        black_box(bitset);
+    });
+
+    // Variant 3: decode the raw postings with BlockSegmentPostings and insert the doc
+    // ids block by block, bypassing SegmentPostings entirely.
+    group.register("BlockSegmentPostings direct", move |_| {
+        let raw = postings_source
+            .read_raw_postings_data(hello_info, IndexRecordOption::Basic)
+            .unwrap();
+        let mut block_postings = BlockSegmentPostings::open(
+            hello_info.doc_freq,
+            raw.postings_data,
+            raw.record_option,
+            raw.effective_option,
+        )
+        .unwrap();
+        let mut bitset = BitSet::with_max_value(max_doc);
+        loop {
+            let block = block_postings.docs();
+            // An empty block marks the end of the posting list.
+            if block.is_empty() {
+                break;
+            }
+            for &doc_id in block {
+                bitset.insert(doc_id);
+            }
+            block_postings.advance();
+        }
+        black_box(bitset);
+    });
+
+    group.run();
+}
+
+/// Builds the in-RAM index used by the benchmark.
+///
+/// The index has a single `text` field (indexed with the `TEXT` options) and holds
+/// 100,000 documents. Each document is either `"hello world"` or `"goodbye world"`,
+/// chosen with probability 0.5 by an RNG with a fixed seed, so the document sequence
+/// is identical on every run.
+///
+/// # Panics
+///
+/// Panics if the writer cannot be created, or if adding a document or committing fails.
+fn build_test_index() -> Index {
+    let mut schema_builder = Schema::builder();
+    let text_field = schema_builder.add_text_field("text", TEXT);
+    let index = Index::create_in_ram(schema_builder.build());
+
+    // Use a single indexing thread: with the default multi-threaded writer the
+    // documents are spread over one segment per thread, while `main` only benchmarks
+    // `segment_readers()[0]`. That would make the measured posting list a
+    // machine-dependent fraction of the data.
+    let mut writer = index
+        .writer_with_num_threads::<TantivyDocument>(1, 250_000_000)
+        .unwrap();
+    let mut rng = StdRng::from_seed([42u8; 32]);
+    for _ in 0..100_000 {
+        let text = if rng.random_bool(0.5) {
+            "hello world"
+        } else {
+            "goodbye world"
+        };
+        writer.add_document(doc!(text_field => text)).unwrap();
+    }
+    writer.commit().unwrap();
+    index
+}
@@ -17,7 +17,6 @@ use rand::rngs::StdRng;
 use rand::SeedableRng;
 use tantivy::collector::{Count, DocSetCollector};
 use tantivy::query::RangeQuery;
-use tantivy::schema::document::TantivyDocument;
 use tantivy::schema::{Schema, Value, FAST, STORED, STRING};
 use tantivy::{doc, Index, ReloadPolicy, Searcher, Term};
 
@@ -406,7 +405,7 @@ impl FetchAllStringsFromDocTask {
 
         for doc_address in docs {
             // Get the document from the doc store (row store access)
-            if let Ok(doc) = self.searcher.doc::<TantivyDocument>(doc_address) {
+            if let Ok(doc) = self.searcher.doc(doc_address) {
                 // Extract string values from the stored field
                 if let Some(field_value) = doc.get_first(str_stored_field) {
                     if let Some(text) = field_value.as_value().as_str() {
 
@@ -193,6 +193,8 @@ impl TinySet {
 #[derive(Clone)]
 pub struct BitSet {
     tinysets: Box<[TinySet]>,
+    // NOTE: keeping `len` up to date costs work on every insert/remove, even if `len()` is
+    // never called. Consider dropping the field if `len()` is rarely used or off the hot path.
     len: u64,
     max_value: u32,
 }
@@ -252,6 +254,7 @@ impl BitSet {
 
     /// Removes all elements from the `BitSet`.
     pub fn clear(&mut self) {
+        self.len = 0;
         for tinyset in self.tinysets.iter_mut() {
             *tinyset = TinySet::empty();
         }
@@ -271,6 +274,11 @@ impl BitSet {
         }
     }
 
+    /// Returns an estimate, in bytes, of the heap memory held by this `BitSet`.
+    ///
+    /// Only the boxed slice of `TinySet`s is counted.
+    pub fn get_memory_consumption(&self) -> usize {
+        std::mem::size_of_val(&self.tinysets[..])
+    }
+
     /// Returns the number of elements in the `BitSet`.
     #[inline]
     pub fn len(&self) -> usize {
@@ -314,6 +322,9 @@ impl BitSet {
             .map(|delta_bucket| bucket + delta_bucket as u32)
     }
 
+    /// Returns the maximum number of elements in the bitset.
+    ///
+    /// Warning: The largest element the bitset can contain is `max_value - 1`.
     #[inline]
     pub fn max_value(&self) -> u32 {
         self.max_value
 
@@ -70,7 +70,7 @@ impl Collector for StatsCollector {
     fn for_segment(
         &self,
         _segment_local_id: u32,
-        segment_reader: &SegmentReader,
+        segment_reader: &dyn SegmentReader,
     ) -> tantivy::Result<StatsSegmentCollector> {
         let fast_field_reader = segment_reader.fast_fields().u64(&self.field)?;
         Ok(StatsSegmentCollector {
 
@@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
         let count_docs = searcher.search(&*query, &TopDocs::with_limit(4).order_by_score())?;
         assert_eq!(count_docs.len(), 1);
         for (_score, doc_address) in count_docs {
-            let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
+            let retrieved_doc = searcher.doc(doc_address)?;
             assert!(retrieved_doc
                 .get_first(occurred_at)
                 .unwrap()
 
@@ -65,7 +65,7 @@ fn main() -> tantivy::Result<()> {
         );
         let top_docs_by_custom_score =
             // Call TopDocs with a custom tweak score
-            TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
+            TopDocs::with_limit(2).tweak_score(move |segment_reader: &dyn SegmentReader| {
                 let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
                 let facet_dict = ingredient_reader.facet_dict();
 
@@ -91,7 +91,7 @@ fn main() -> tantivy::Result<()> {
             .iter()
             .map(|(_, doc_id)| {
                 searcher
-                    .doc::<TantivyDocument>(*doc_id)
+                    .doc(*doc_id)
                     .unwrap()
                     .get_first(title)
                     .and_then(|v| v.as_str().map(|el| el.to_string()))
 
@@ -91,46 +91,10 @@ fn main() -> tantivy::Result<()> {
         }
     }
 
-    // A `Term` is a text token associated with a field.
-    // Let's go through all docs containing the term `title:the` and access their position
-    let term_the = Term::from_field_text(title, "the");
-
-    // Some other powerful operations (especially `.skip_to`) may be useful to consume these
+    // Some other powerful operations (especially `.seek`) may be useful to consume these
     // posting lists rapidly.
     // You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait
     // and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait
 
-    // Also, for some VERY specific high performance use case like an OLAP analysis of logs,
-    // you can get better performance by accessing directly the blocks of doc ids.
-    for segment_reader in searcher.segment_readers() {
-        // A segment contains different data structure.
-        // Inverted index stands for the combination of
-        // - the term dictionary
-        // - the inverted lists associated with each terms and their positions
-        let inverted_index = segment_reader.inverted_index(title)?;
-
-        // This segment posting object is like a cursor over the documents matching the term.
-        // The `IndexRecordOption` arguments tells tantivy we will be interested in both term
-        // frequencies and positions.
-        //
-        // If you don't need all this information, you may get better performance by decompressing
-        // less information.
-        if let Some(mut block_segment_postings) =
-            inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
-        {
-            loop {
-                let docs = block_segment_postings.docs();
-                if docs.is_empty() {
-                    break;
-                }
-                // Once again these docs MAY contains deleted documents as well.
-                let docs = block_segment_postings.docs();
-                // Prints `Docs [0, 2].`
-                println!("Docs {docs:?}");
-                block_segment_postings.advance();
-            }
-        }
-    }
-
     Ok(())
 }
@@ -67,7 +67,7 @@ fn main() -> Result<()> {
     let mut titles = top_docs
         .into_iter()
         .map(|(_score, doc_address)| {
-            let doc = searcher.doc::<TantivyDocument>(doc_address)?;
+            let doc = searcher.doc(doc_address)?;
             let title = doc
                 .get_first(title)
                 .and_then(|v| v.as_str())
 
@@ -55,7 +55,7 @@ fn main() -> tantivy::Result<()> {
     let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
 
     for (score, doc_address) in top_docs {
-        let doc = searcher.doc::<TantivyDocument>(doc_address)?;
+        let doc = searcher.doc(doc_address)?;
         let snippet = snippet_generator.snippet_from_doc(&doc);
         println!("Document score {score}:");
         println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());