Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,8 @@ harness = false
[[bench]]
name = "regex_all_terms"
harness = false

[[bench]]
name = "fill_bitset"
harness = false

112 changes: 112 additions & 0 deletions benches/fill_bitset.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use common::BitSet;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::postings::BlockSegmentPostings;
use tantivy::schema::*;
use tantivy::{
doc, DocSet, Index, InvertedIndexReader, TantivyDocument, TantivyInvertedIndexReader,
};

#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;

fn main() {
let index = build_test_index();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = &searcher.segment_readers()[0];
let text_field = index.schema().get_field("text").unwrap();
let inverted_index = segment_reader.inverted_index(text_field).unwrap();
let max_doc = segment_reader.max_doc();

let term = Term::from_field_text(text_field, "hello");
let term_info = inverted_index.get_term_info(&term).unwrap().unwrap();

let mut runner = BenchRunner::new();
runner.set_name("fill_bitset");

let mut group = runner.new_group();
{
let inverted_index = &inverted_index;
let term_info = &term_info;
// This is the path used by queries (AutomatonWeight, RangeQuery, etc.)
// It dispatches via DynInvertedIndexReader::fill_bitset_from_terminfo.
group.register("fill_bitset_from_terminfo (via trait)", move |_| {
let mut bitset = BitSet::with_max_value(max_doc);
inverted_index
.fill_bitset_from_terminfo(term_info, &mut bitset)
.unwrap();
black_box(bitset);
});
}
{
let inverted_index = &inverted_index;
let term_info = &term_info;
// This constructs a SegmentPostings via read_docset_from_terminfo and calls fill_bitset.
group.register("read_docset + fill_bitset", move |_| {
let mut postings = inverted_index.read_docset_from_terminfo(term_info).unwrap();
let mut bitset = BitSet::with_max_value(max_doc);
postings.fill_bitset(&mut bitset);
black_box(bitset);
});
}
{
let inverted_index = &inverted_index;
let term_info = &term_info;
// This uses BlockSegmentPostings directly, bypassing SegmentPostings entirely.
let concrete_reader = inverted_index
.as_any()
.downcast_ref::<TantivyInvertedIndexReader>()
.expect("expected TantivyInvertedIndexReader");
group.register("BlockSegmentPostings direct", move |_| {
let raw = concrete_reader
.read_raw_postings_data(term_info, IndexRecordOption::Basic)
.unwrap();
let mut block_postings = BlockSegmentPostings::open(
term_info.doc_freq,
raw.postings_data,
Comment on lines +64 to +68
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's code smell here: inverted index at this point is a DynInvertedIndex, yet you use the fact that you expect it to be the standard one to interpret the raw postings data.

raw.record_option,
raw.effective_option,
)
.unwrap();
let mut bitset = BitSet::with_max_value(max_doc);
loop {
let docs = block_postings.docs();
if docs.is_empty() {
break;
}
for &doc in docs {
bitset.insert(doc);
}
block_postings.advance();
}
black_box(bitset);
});
}
group.run();
}

/// Builds an in-RAM index with a single TEXT field ("text") containing
/// 100_000 documents. Each document is, with equal probability (seeded RNG,
/// so the split is deterministic across runs), either "hello world" or
/// "goodbye world" — so the benched term "hello" matches roughly half the docs.
fn build_test_index() -> Index {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("text", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());
    let text_field = schema.get_field("text").unwrap();

    let mut index_writer = index.writer::<TantivyDocument>(250_000_000).unwrap();
    // Fixed seed keeps the generated corpus identical between benchmark runs.
    let mut prng = StdRng::from_seed([42u8; 32]);
    for _ in 0..100_000 {
        let body = if prng.random_bool(0.5) {
            "hello world"
        } else {
            "goodbye world"
        };
        index_writer.add_document(doc!(text_field => body)).unwrap();
    }
    index_writer.commit().unwrap();
    index
}
3 changes: 1 addition & 2 deletions benches/str_search_and_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Count, DocSetCollector};
use tantivy::query::RangeQuery;
use tantivy::schema::document::TantivyDocument;
use tantivy::schema::{Schema, Value, FAST, STORED, STRING};
use tantivy::{doc, Index, ReloadPolicy, Searcher, Term};

Expand Down Expand Up @@ -406,7 +405,7 @@ impl FetchAllStringsFromDocTask {

for doc_address in docs {
// Get the document from the doc store (row store access)
if let Ok(doc) = self.searcher.doc::<TantivyDocument>(doc_address) {
if let Ok(doc) = self.searcher.doc(doc_address) {
// Extract string values from the stored field
if let Some(field_value) = doc.get_first(str_stored_field) {
if let Some(text) = field_value.as_value().as_str() {
Expand Down
11 changes: 11 additions & 0 deletions common/src/bitset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ impl TinySet {
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
// Tracking `len` on every insert/remove adds overhead even when `len()` is never called.
// Consider removing if `len()` usage is rare or not on a hot path.
len: u64,
max_value: u32,
}
Expand Down Expand Up @@ -252,6 +254,7 @@ impl BitSet {

/// Removes all elements from the `BitSet`.
pub fn clear(&mut self) {
self.len = 0;
for tinyset in self.tinysets.iter_mut() {
*tinyset = TinySet::empty();
}
Expand All @@ -271,6 +274,11 @@ impl BitSet {
}
}

/// Estimate the heap memory consumption of this `BitSet` in bytes.
pub fn get_memory_consumption(&self) -> usize {
self.tinysets.len() * std::mem::size_of::<TinySet>()
}

/// Returns the number of elements in the `BitSet`.
#[inline]
pub fn len(&self) -> usize {
Expand Down Expand Up @@ -314,6 +322,9 @@ impl BitSet {
.map(|delta_bucket| bucket + delta_bucket as u32)
}

/// Returns the maximum number of elements in the bitset.
///
/// Warning: The largest element the bitset can contain is `max_value - 1`.
#[inline]
pub fn max_value(&self) -> u32 {
self.max_value
Expand Down
1 change: 1 addition & 0 deletions doc/src/SUMMARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- [Index Sorting](./index_sorting.md)
- [Innerworkings](./innerworkings.md)
- [Inverted index](./inverted_index.md)
- [Storage Abstraction](./storage_abstraction.md)
- [Best practice](./inverted_index.md)

[Frequently Asked Questions](./faq.md)
Expand Down
76 changes: 76 additions & 0 deletions doc/src/storage_abstraction.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Storage Abstraction — Design Notes

## Problem

tantivy's query engine needs to work with pluggable `SegmentReader` implementations while preserving the monomorphized fast path that avoids `Box<dyn Postings>` vtable
overhead in tight scoring loops (`advance()`, `doc()`, `score()`) and similar hot paths.

## Requirements

- **Pluggable `SegmentReader`.** External crates can provide their own `SegmentReader` implementation (with their own `InvertedIndexReader`, postings types, etc.) and tantivy's query engine works with it.
- **No performance regression.** tantivy's default path (`SegmentPostings` → `TermScorer<SegmentPostings>` → block WAND) must remain monomorphized — no boxing, no vtable dispatch in scoring loops.
- **Arbitrary implementations without recompiling tantivy.** The design must not require a fixed set of implementations known at tantivy compile time. External crates depend on tantivy, not the reverse.
- **Query code is backend-agnostic.** Adding a new `SegmentReader` implementation must not require changes to `TermWeight`, `PhraseWeight`, `AutomatonWeight`, or any other query code.
- **Non-viral API.** `Searcher`, `Index`, `Weight`, and other public types are not generic over the backend. Users don't need to thread a type parameter through their code.

## Current Design

### Trait hierarchy

- **`SegmentReader`** — trait for accessing a segment's data. Returns `Arc<dyn DynInvertedIndexReader>` from `inverted_index(field)`. `TantivySegmentReader` is the default implementation.
- **`DynInvertedIndexReader`** — object-safe trait for dynamic dispatch. Returns `Box<dyn Postings>`. Used as `Arc<dyn DynInvertedIndexReader>`.
- **`InvertedIndexReader`** — typed trait with `type Postings` and `type DocSet` associated types. `TantivyInvertedIndexReader` implements this with `Postings = SegmentPostings`. There is a blanket impl of `InvertedIndexReader` for `dyn DynInvertedIndexReader` with `Postings = Box<dyn Postings>`.

### `try_downcast_and_call!` macro

The macro attempts to downcast `&dyn DynInvertedIndexReader` to `&TantivyInvertedIndexReader`. The body is compiled twice — once with the concrete reader (typed postings, monomorphized) and once with the dyn fallback (boxed postings).

```rust
try_downcast_and_call!(inverted_index.as_ref(), |reader| {
let postings = reader.read_postings_from_terminfo(&term_info, option)?;
TermScorer::new(postings, fieldnorm_reader, similarity_weight)
})
```

This replaced the earlier `TypedInvertedIndexReaderCb` trait + struct pattern, which required creating a struct for every call site to serve as a "generic closure."

## Rejected approaches

### Specialized methods on `DynInvertedIndexReader`

Adding methods like `build_term_scorer()`, `build_phrase_scorer()`, `fill_bitset_from_terminfo()` to `DynInvertedIndexReader` was rejected. This forces every implementor to reimplement scoring logic for each query type — a combinatorial explosion that couples the reader to every query shape. The reader should only know how to produce postings, not how to build scorers. It also prevents supporting arbitrary query types without changing the trait.

### Feature-gated types for external readers

Using `#[cfg(feature = "quickwit")]` branches in the macro to add additional downcast targets. Requires recompiling tantivy for each reader and doesn't scale to arbitrary `SegmentReader` / `InvertedIndexReader` implementations.

### Reader-side dispatch with a callback trait

A method like `fn with_typed_reader(&self, cb: &mut dyn TypedCb<R>) -> R` on `DynInvertedIndexReader` would let the reader dispatch the callback with its concrete type. But the generic `R` parameter makes the trait not object-safe. Working around this with type erasure (storing results in the callback via `Any`) is complex and fragile.

## Planned: `TypedSegmentReader` trait for external fast paths

The current `try_downcast_and_call!` hardcodes `TantivyInvertedIndexReader`. To give external crates the monomorphized fast path, the downcast target should be a **trait with associated types**, not a specific concrete struct.

```rust
trait TypedSegmentReader: SegmentReader {
type InvertedIndexReader: InvertedIndexReader;
// future: type FastFieldReader: ...;
// future: type StoreReader: ...;

fn typed_inverted_index(&self, field: Field) -> &Self::InvertedIndexReader;
}
```

The dispatch downcasts `dyn SegmentReader` (via `as_any()`) to a concrete type that implements `TypedSegmentReader`, then the body works generically through the associated types. The body is compiled once per registered concrete type but is written against the trait — it never names `TantivyInvertedIndexReader` or `SegmentPostings` directly.

- External crates implement `TypedSegmentReader` with their own associated types and get the monomorphized fast path.
- One dispatch point covers all typed sub-components (inverted index, fast fields, store reader, etc.).
- Query weight code is fully generic — adding a new backend doesn't touch any query code.
- This does **not** mean query-specific methods on `SegmentReader`. The trait provides typed access to sub-components, not knowledge of query shapes.

### Open question: downcast chain registration

The concrete type must still be known for the `Any` downcast. The dispatch needs a list of concrete types to try. Since tantivy cannot depend on external crates, this list can't live in tantivy itself.

A macro invoked by the final binary could generate the downcast chain with all `TypedSegmentReader` implementors. Not yet designed.
2 changes: 1 addition & 1 deletion examples/custom_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ impl Collector for StatsCollector {
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &SegmentReader,
segment_reader: &dyn SegmentReader,
) -> tantivy::Result<StatsSegmentCollector> {
let fast_field_reader = segment_reader.fast_fields().u64(&self.field)?;
Ok(StatsSegmentCollector {
Expand Down
2 changes: 1 addition & 1 deletion examples/date_time_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4).order_by_score())?;
assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
let retrieved_doc = searcher.doc(doc_address)?;
assert!(retrieved_doc
.get_first(occurred_at)
.unwrap()
Expand Down
4 changes: 2 additions & 2 deletions examples/faceted_search_with_tweaked_score.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ fn main() -> tantivy::Result<()> {
);
let top_docs_by_custom_score =
// Call TopDocs with a custom tweak score
TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
TopDocs::with_limit(2).tweak_score(move |segment_reader: &dyn SegmentReader| {
let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
let facet_dict = ingredient_reader.facet_dict();

Expand All @@ -91,7 +91,7 @@ fn main() -> tantivy::Result<()> {
.iter()
.map(|(_, doc_id)| {
searcher
.doc::<TantivyDocument>(*doc_id)
.doc(*doc_id)
.unwrap()
.get_first(title)
.and_then(|v| v.as_str().map(|el| el.to_string()))
Expand Down
38 changes: 1 addition & 37 deletions examples/iterating_docs_and_positions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,46 +91,10 @@ fn main() -> tantivy::Result<()> {
}
}

// A `Term` is a text token associated with a field.
// Let's go through all docs containing the term `title:the` and access their position
let term_the = Term::from_field_text(title, "the");

// Some other powerful operations (especially `.skip_to`) may be useful to consume these
// Some other powerful operations (especially `.seek`) may be useful to consume these
// posting lists rapidly.
// You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait
// and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait

// Also, for some VERY specific high performance use case like an OLAP analysis of logs,
// you can get better performance by accessing directly the blocks of doc ids.
for segment_reader in searcher.segment_readers() {
// A segment contains different data structure.
// Inverted index stands for the combination of
// - the term dictionary
// - the inverted lists associated with each terms and their positions
let inverted_index = segment_reader.inverted_index(title)?;

// This segment posting object is like a cursor over the documents matching the term.
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term
// frequencies and positions.
//
// If you don't need all this information, you may get better performance by decompressing
// less information.
if let Some(mut block_segment_postings) =
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
{
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
// Once again these docs MAY contains deleted documents as well.
let docs = block_segment_postings.docs();
// Prints `Docs [0, 2].`
println!("Docs {docs:?}");
block_segment_postings.advance();
}
}
}

Ok(())
}
2 changes: 1 addition & 1 deletion examples/phrase_prefix_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ fn main() -> Result<()> {
let mut titles = top_docs
.into_iter()
.map(|(_score, doc_address)| {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let doc = searcher.doc(doc_address)?;
let title = doc
.get_first(title)
.and_then(|v| v.as_str())
Expand Down
2 changes: 1 addition & 1 deletion examples/snippet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ fn main() -> tantivy::Result<()> {
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;

for (score, doc_address) in top_docs {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let doc = searcher.doc(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {score}:");
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
Expand Down
4 changes: 2 additions & 2 deletions examples/warmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ impl DynamicPriceColumn {
}
}

pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
pub fn price_for_segment(&self, segment_reader: &dyn SegmentReader) -> Option<Arc<Vec<Price>>> {
let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
self.price_cache.read().unwrap().get(&segment_key).cloned()
}
Expand Down Expand Up @@ -157,7 +157,7 @@ fn main() -> tantivy::Result<()> {
let query = query_parser.parse_query("cooking")?;

let searcher = reader.searcher();
let score_by_price = move |segment_reader: &SegmentReader| {
let score_by_price = move |segment_reader: &dyn SegmentReader| {
let price = price_dynamic_column
.price_for_segment(segment_reader)
.unwrap();
Expand Down
Loading
Loading