Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,8 @@ harness = false
[[bench]]
name = "regex_all_terms"
harness = false

[[bench]]
name = "fill_bitset"
harness = false

112 changes: 112 additions & 0 deletions benches/fill_bitset.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use common::BitSet;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::postings::BlockSegmentPostings;
use tantivy::schema::*;
use tantivy::{
doc, DocSet, Index, InvertedIndexReader, TantivyDocument, TantivyInvertedIndexReader,
};

#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;

fn main() {
let index = build_test_index();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = &searcher.segment_readers()[0];
let text_field = index.schema().get_field("text").unwrap();
let inverted_index = segment_reader.inverted_index(text_field).unwrap();
let max_doc = segment_reader.max_doc();

let term = Term::from_field_text(text_field, "hello");
let term_info = inverted_index.get_term_info(&term).unwrap().unwrap();

let mut runner = BenchRunner::new();
runner.set_name("fill_bitset");

let mut group = runner.new_group();
{
let inverted_index = &inverted_index;
let term_info = &term_info;
// This is the path used by queries (AutomatonWeight, RangeQuery, etc.)
// It dispatches via DynInvertedIndexReader::fill_bitset_from_terminfo.
group.register("fill_bitset_from_terminfo (via trait)", move |_| {
let mut bitset = BitSet::with_max_value(max_doc);
inverted_index
.fill_bitset_from_terminfo(term_info, &mut bitset)
.unwrap();
black_box(bitset);
});
}
{
let inverted_index = &inverted_index;
let term_info = &term_info;
// This constructs a SegmentPostings via read_docset_from_terminfo and calls fill_bitset.
group.register("read_docset + fill_bitset", move |_| {
let mut postings = inverted_index.read_docset_from_terminfo(term_info).unwrap();
let mut bitset = BitSet::with_max_value(max_doc);
postings.fill_bitset(&mut bitset);
black_box(bitset);
});
}
{
let inverted_index = &inverted_index;
let term_info = &term_info;
// This uses BlockSegmentPostings directly, bypassing SegmentPostings entirely.
let concrete_reader = inverted_index
.as_any()
.downcast_ref::<TantivyInvertedIndexReader>()
.expect("expected TantivyInvertedIndexReader");
group.register("BlockSegmentPostings direct", move |_| {
let raw = concrete_reader
.read_raw_postings_data(term_info, IndexRecordOption::Basic)
.unwrap();
let mut block_postings = BlockSegmentPostings::open(
term_info.doc_freq,
raw.postings_data,
Comment on lines +64 to +68
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's code smell here: inverted index at this point is a DynInvertedIndex, yet you use the fact that you expect it to be the standard one to interpret the raw postings data.

raw.record_option,
raw.effective_option,
)
.unwrap();
let mut bitset = BitSet::with_max_value(max_doc);
loop {
let docs = block_postings.docs();
if docs.is_empty() {
break;
}
for &doc in docs {
bitset.insert(doc);
}
block_postings.advance();
}
black_box(bitset);
});
}
group.run();
}

/// Builds an in-RAM index with a single TEXT field ("text") containing
/// 100_000 documents. Each document is, with equal probability (seeded RNG,
/// so the split is deterministic across runs), either "hello world" or
/// "goodbye world" — so the benched term "hello" matches roughly half the docs.
fn build_test_index() -> Index {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("text", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());
    let text_field = schema.get_field("text").unwrap();

    let mut index_writer = index.writer::<TantivyDocument>(250_000_000).unwrap();
    // Fixed seed keeps the generated corpus identical between benchmark runs.
    let mut prng = StdRng::from_seed([42u8; 32]);
    for _ in 0..100_000 {
        let body = if prng.random_bool(0.5) {
            "hello world"
        } else {
            "goodbye world"
        };
        index_writer.add_document(doc!(text_field => body)).unwrap();
    }
    index_writer.commit().unwrap();
    index
}
3 changes: 1 addition & 2 deletions benches/str_search_and_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Count, DocSetCollector};
use tantivy::query::RangeQuery;
use tantivy::schema::document::TantivyDocument;
use tantivy::schema::{Schema, Value, FAST, STORED, STRING};
use tantivy::{doc, Index, ReloadPolicy, Searcher, Term};

Expand Down Expand Up @@ -406,7 +405,7 @@ impl FetchAllStringsFromDocTask {

for doc_address in docs {
// Get the document from the doc store (row store access)
if let Ok(doc) = self.searcher.doc::<TantivyDocument>(doc_address) {
if let Ok(doc) = self.searcher.doc(doc_address) {
// Extract string values from the stored field
if let Some(field_value) = doc.get_first(str_stored_field) {
if let Some(text) = field_value.as_value().as_str() {
Expand Down
11 changes: 11 additions & 0 deletions common/src/bitset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ impl TinySet {
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
// Tracking `len` on every insert/remove adds overhead even when `len()` is never called.
// Consider removing if `len()` usage is rare or not on a hot path.
len: u64,
max_value: u32,
}
Expand Down Expand Up @@ -252,6 +254,7 @@ impl BitSet {

/// Removes all elements from the `BitSet`.
pub fn clear(&mut self) {
self.len = 0;
for tinyset in self.tinysets.iter_mut() {
*tinyset = TinySet::empty();
}
Expand All @@ -271,6 +274,11 @@ impl BitSet {
}
}

/// Estimate the heap memory consumption of this `BitSet` in bytes.
pub fn get_memory_consumption(&self) -> usize {
self.tinysets.len() * std::mem::size_of::<TinySet>()
}

/// Returns the number of elements in the `BitSet`.
#[inline]
pub fn len(&self) -> usize {
Expand Down Expand Up @@ -314,6 +322,9 @@ impl BitSet {
.map(|delta_bucket| bucket + delta_bucket as u32)
}

/// Returns the maximum number of elements in the bitset.
///
/// Warning: The largest element the bitset can contain is `max_value - 1`.
#[inline]
pub fn max_value(&self) -> u32 {
self.max_value
Expand Down
1 change: 1 addition & 0 deletions doc/src/SUMMARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- [Index Sorting](./index_sorting.md)
- [Innerworkings](./innerworkings.md)
- [Inverted index](./inverted_index.md)
- [Storage Abstraction](./storage_abstraction.md)
- [Best practice](./inverted_index.md)

[Frequently Asked Questions](./faq.md)
Expand Down
76 changes: 76 additions & 0 deletions doc/src/storage_abstraction.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Storage Abstraction — Design Notes

## Problem

tantivy's query engine needs to work with pluggable `SegmentReader` implementations while preserving the monomorphized fast path that avoids `Box<dyn Postings>` vtable
overhead in tight scoring loops (`advance()`, `doc()`, `score()`) and similar hot paths.

## Requirements

- **Pluggable `SegmentReader`.** External crates can provide their own `SegmentReader` implementation (with their own `InvertedIndexReader`, postings types, etc.) and tantivy's query engine works with it.
- **No performance regression.** tantivy's default path (`SegmentPostings` → `TermScorer<SegmentPostings>` → block WAND) must remain monomorphized — no boxing, no vtable dispatch in scoring loops.
- **Arbitrary implementations without recompiling tantivy.** The design must not require a fixed set of implementations known at tantivy compile time. External crates depend on tantivy, not the reverse.
- **Query code is backend-agnostic.** Adding a new `SegmentReader` implementation must not require changes to `TermWeight`, `PhraseWeight`, `AutomatonWeight`, or any other query code.
- **Non-viral API.** `Searcher`, `Index`, `Weight`, and other public types are not generic over the backend. Users don't need to thread a type parameter through their code.

## Current Design

### Trait hierarchy

- **`SegmentReader`** — trait for accessing a segment's data. Returns `Arc<dyn DynInvertedIndexReader>` from `inverted_index(field)`. `TantivySegmentReader` is the default implementation.
- **`DynInvertedIndexReader`** — object-safe trait for dynamic dispatch. Returns `Box<dyn Postings>`. Used as `Arc<dyn DynInvertedIndexReader>`.
- **`InvertedIndexReader`** — typed trait with `type Postings` and `type DocSet` associated types. `TantivyInvertedIndexReader` implements this with `Postings = SegmentPostings`. There is a blanket impl of `InvertedIndexReader` for `dyn DynInvertedIndexReader` with `Postings = Box<dyn Postings>`.

### `try_downcast_and_call!` macro

The macro attempts to downcast `&dyn DynInvertedIndexReader` to `&TantivyInvertedIndexReader`. The body is compiled twice — once with the concrete reader (typed postings, monomorphized) and once with the dyn fallback (boxed postings).

```rust
try_downcast_and_call!(inverted_index.as_ref(), |reader| {
let postings = reader.read_postings_from_terminfo(&term_info, option)?;
TermScorer::new(postings, fieldnorm_reader, similarity_weight)
})
```

This replaced the earlier `TypedInvertedIndexReaderCb` trait + struct pattern, which required creating a struct for every call site to serve as a "generic closure."

## Rejected approaches

### Specialized methods on `DynInvertedIndexReader`

Adding methods like `build_term_scorer()`, `build_phrase_scorer()`, `fill_bitset_from_terminfo()` to `DynInvertedIndexReader` was rejected. This forces every implementor to reimplement scoring logic for each query type — a combinatorial explosion that couples the reader to every query shape. The reader should only know how to produce postings, not how to build scorers. It also prevents supporting arbitrary query types without changing the trait.

### Feature-gated types for external readers

Using `#[cfg(feature = "quickwit")]` branches in the macro to add additional downcast targets. Requires recompiling tantivy for each reader and doesn't scale to arbitrary `SegmentReader` / `InvertedIndexReader` implementations.

### Reader-side dispatch with a callback trait

A method like `fn with_typed_reader(&self, cb: &mut dyn TypedCb<R>) -> R` on `DynInvertedIndexReader` would let the reader dispatch the callback with its concrete type. But the generic `R` parameter makes the trait not object-safe. Working around this with type erasure (storing results in the callback via `Any`) is complex and fragile.

## Planned: `TypedSegmentReader` trait for external fast paths

The current `try_downcast_and_call!` hardcodes `TantivyInvertedIndexReader`. To give external crates the monomorphized fast path, the downcast target should be a **trait with associated types**, not a specific concrete struct.

```rust
trait TypedSegmentReader: SegmentReader {
type InvertedIndexReader: InvertedIndexReader;
// future: type FastFieldReader: ...;
// future: type StoreReader: ...;

fn typed_inverted_index(&self, field: Field) -> &Self::InvertedIndexReader;
}
```

The dispatch downcasts `dyn SegmentReader` (via `as_any()`) to a concrete type that implements `TypedSegmentReader`, then the body works generically through the associated types. The body is compiled once per registered concrete type but is written against the trait — it never names `TantivyInvertedIndexReader` or `SegmentPostings` directly.

- External crates implement `TypedSegmentReader` with their own associated types and get the monomorphized fast path.
- One dispatch point covers all typed sub-components (inverted index, fast fields, store reader, etc.).
- Query weight code is fully generic — adding a new backend doesn't touch any query code.
- This does **not** mean query-specific methods on `SegmentReader`. The trait provides typed access to sub-components, not knowledge of query shapes.

### Open question: downcast chain registration

The concrete type must still be known for the `Any` downcast. The dispatch needs a list of concrete types to try. Since tantivy cannot depend on external crates, this list can't live in tantivy itself.

A macro invoked by the final binary could generate the downcast chain with all `TypedSegmentReader` implementors. Not yet designed.
2 changes: 1 addition & 1 deletion examples/custom_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ impl Collector for StatsCollector {
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &SegmentReader,
segment_reader: &dyn SegmentReader,
) -> tantivy::Result<StatsSegmentCollector> {
let fast_field_reader = segment_reader.fast_fields().u64(&self.field)?;
Ok(StatsSegmentCollector {
Expand Down
2 changes: 1 addition & 1 deletion examples/date_time_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4).order_by_score())?;
assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
let retrieved_doc = searcher.doc(doc_address)?;
assert!(retrieved_doc
.get_first(occurred_at)
.unwrap()
Expand Down
4 changes: 2 additions & 2 deletions examples/faceted_search_with_tweaked_score.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ fn main() -> tantivy::Result<()> {
);
let top_docs_by_custom_score =
// Call TopDocs with a custom tweak score
TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
TopDocs::with_limit(2).tweak_score(move |segment_reader: &dyn SegmentReader| {
let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
let facet_dict = ingredient_reader.facet_dict();

Expand All @@ -91,7 +91,7 @@ fn main() -> tantivy::Result<()> {
.iter()
.map(|(_, doc_id)| {
searcher
.doc::<TantivyDocument>(*doc_id)
.doc(*doc_id)
.unwrap()
.get_first(title)
.and_then(|v| v.as_str().map(|el| el.to_string()))
Expand Down
38 changes: 1 addition & 37 deletions examples/iterating_docs_and_positions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,46 +91,10 @@ fn main() -> tantivy::Result<()> {
}
}

// A `Term` is a text token associated with a field.
// Let's go through all docs containing the term `title:the` and access their position
let term_the = Term::from_field_text(title, "the");

// Some other powerful operations (especially `.skip_to`) may be useful to consume these
// Some other powerful operations (especially `.seek`) may be useful to consume these
// posting lists rapidly.
// You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait
// and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait

// Also, for some VERY specific high performance use case like an OLAP analysis of logs,
// you can get better performance by accessing directly the blocks of doc ids.
for segment_reader in searcher.segment_readers() {
// A segment contains different data structure.
// Inverted index stands for the combination of
// - the term dictionary
// - the inverted lists associated with each terms and their positions
let inverted_index = segment_reader.inverted_index(title)?;

// This segment posting object is like a cursor over the documents matching the term.
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term
// frequencies and positions.
//
// If you don't need all this information, you may get better performance by decompressing
// less information.
if let Some(mut block_segment_postings) =
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
{
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
// Once again these docs MAY contains deleted documents as well.
let docs = block_segment_postings.docs();
// Prints `Docs [0, 2].`
println!("Docs {docs:?}");
block_segment_postings.advance();
}
}
}

Ok(())
}
2 changes: 1 addition & 1 deletion examples/phrase_prefix_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ fn main() -> Result<()> {
let mut titles = top_docs
.into_iter()
.map(|(_score, doc_address)| {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let doc = searcher.doc(doc_address)?;
let title = doc
.get_first(title)
.and_then(|v| v.as_str())
Expand Down
2 changes: 1 addition & 1 deletion examples/snippet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ fn main() -> tantivy::Result<()> {
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;

for (score, doc_address) in top_docs {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let doc = searcher.doc(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {score}:");
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
Expand Down
4 changes: 2 additions & 2 deletions examples/warmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ impl DynamicPriceColumn {
}
}

pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
pub fn price_for_segment(&self, segment_reader: &dyn SegmentReader) -> Option<Arc<Vec<Price>>> {
let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
self.price_cache.read().unwrap().get(&segment_key).cloned()
}
Expand Down Expand Up @@ -157,7 +157,7 @@ fn main() -> tantivy::Result<()> {
let query = query_parser.parse_query("cooking")?;

let searcher = reader.searcher();
let score_by_price = move |segment_reader: &SegmentReader| {
let score_by_price = move |segment_reader: &dyn SegmentReader| {
let price = price_dynamic_column
.price_for_segment(segment_reader)
.unwrap();
Expand Down
Loading
Loading