-
-
Notifications
You must be signed in to change notification settings - Fork 898
Abstract tantivy's data storage behind traits for pluggable backends #2868
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
PSeitz
wants to merge
8
commits into
main
Choose a base branch
from
storage_abstraction
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
3cbcb5d
Abstract tantivy's data storage behind traits for pluggable backends
PSeitz 1f370b4
use downcast macro, remove block wand from trait
PSeitz 1356734
make warmup object safe
PSeitz 548eb6e
Remove Index::tokenizer_for_field, fix BlockSegmentPostings doc comment
PSeitz 32aee13
Simplify Index::fields_metadata to use TantivySegmentReader::open
PSeitz 0734836
Remove read_raw_postings_data from DynInvertedIndexReader trait
PSeitz f730825
Make InvertedIndexReader a subtrait of DynInvertedIndexReader
PSeitz c958f86
Add Send bound to StoreReader::get_async return type
PSeitz File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,112 @@ | ||
| use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM}; | ||
| use common::BitSet; | ||
| use rand::rngs::StdRng; | ||
| use rand::{Rng, SeedableRng}; | ||
| use tantivy::postings::BlockSegmentPostings; | ||
| use tantivy::schema::*; | ||
| use tantivy::{ | ||
| doc, DocSet, Index, InvertedIndexReader, TantivyDocument, TantivyInvertedIndexReader, | ||
| }; | ||
|
|
||
| #[global_allocator] | ||
| pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM; | ||
|
|
||
| fn main() { | ||
| let index = build_test_index(); | ||
| let reader = index.reader().unwrap(); | ||
| let searcher = reader.searcher(); | ||
| let segment_reader = &searcher.segment_readers()[0]; | ||
| let text_field = index.schema().get_field("text").unwrap(); | ||
| let inverted_index = segment_reader.inverted_index(text_field).unwrap(); | ||
| let max_doc = segment_reader.max_doc(); | ||
|
|
||
| let term = Term::from_field_text(text_field, "hello"); | ||
| let term_info = inverted_index.get_term_info(&term).unwrap().unwrap(); | ||
|
|
||
| let mut runner = BenchRunner::new(); | ||
| runner.set_name("fill_bitset"); | ||
|
|
||
| let mut group = runner.new_group(); | ||
| { | ||
| let inverted_index = &inverted_index; | ||
| let term_info = &term_info; | ||
| // This is the path used by queries (AutomatonWeight, RangeQuery, etc.) | ||
| // It dispatches via DynInvertedIndexReader::fill_bitset_from_terminfo. | ||
| group.register("fill_bitset_from_terminfo (via trait)", move |_| { | ||
| let mut bitset = BitSet::with_max_value(max_doc); | ||
| inverted_index | ||
| .fill_bitset_from_terminfo(term_info, &mut bitset) | ||
| .unwrap(); | ||
| black_box(bitset); | ||
| }); | ||
| } | ||
| { | ||
| let inverted_index = &inverted_index; | ||
| let term_info = &term_info; | ||
| // This constructs a SegmentPostings via read_docset_from_terminfo and calls fill_bitset. | ||
| group.register("read_docset + fill_bitset", move |_| { | ||
| let mut postings = inverted_index.read_docset_from_terminfo(term_info).unwrap(); | ||
| let mut bitset = BitSet::with_max_value(max_doc); | ||
| postings.fill_bitset(&mut bitset); | ||
| black_box(bitset); | ||
| }); | ||
| } | ||
| { | ||
| let inverted_index = &inverted_index; | ||
| let term_info = &term_info; | ||
| // This uses BlockSegmentPostings directly, bypassing SegmentPostings entirely. | ||
| let concrete_reader = inverted_index | ||
| .as_any() | ||
| .downcast_ref::<TantivyInvertedIndexReader>() | ||
| .expect("expected TantivyInvertedIndexReader"); | ||
| group.register("BlockSegmentPostings direct", move |_| { | ||
| let raw = concrete_reader | ||
| .read_raw_postings_data(term_info, IndexRecordOption::Basic) | ||
| .unwrap(); | ||
| let mut block_postings = BlockSegmentPostings::open( | ||
| term_info.doc_freq, | ||
| raw.postings_data, | ||
| raw.record_option, | ||
| raw.effective_option, | ||
| ) | ||
| .unwrap(); | ||
| let mut bitset = BitSet::with_max_value(max_doc); | ||
| loop { | ||
| let docs = block_postings.docs(); | ||
| if docs.is_empty() { | ||
| break; | ||
| } | ||
| for &doc in docs { | ||
| bitset.insert(doc); | ||
| } | ||
| block_postings.advance(); | ||
| } | ||
| black_box(bitset); | ||
| }); | ||
| } | ||
| group.run(); | ||
| } | ||
|
|
||
| fn build_test_index() -> Index { | ||
| let mut schema_builder = Schema::builder(); | ||
| schema_builder.add_text_field("text", TEXT); | ||
| let schema = schema_builder.build(); | ||
| let index = Index::create_in_ram(schema.clone()); | ||
| let text_field = schema.get_field("text").unwrap(); | ||
|
|
||
| let mut writer = index.writer::<TantivyDocument>(250_000_000).unwrap(); | ||
| let mut rng = StdRng::from_seed([42u8; 32]); | ||
| for _ in 0..100_000 { | ||
| if rng.random_bool(0.5) { | ||
| writer | ||
| .add_document(doc!(text_field => "hello world")) | ||
| .unwrap(); | ||
| } else { | ||
| writer | ||
| .add_document(doc!(text_field => "goodbye world")) | ||
| .unwrap(); | ||
| } | ||
| } | ||
| writer.commit().unwrap(); | ||
| index | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,76 @@ | ||
| # Storage Abstraction — Design Notes | ||
|
|
||
| ## Problem | ||
|
|
||
| tantivy's query engine needs to work with pluggable `SegmentReader` implementations while preserving the monomorphized fast path that avoids `Box<dyn Postings>` vtable | ||
| overhead in tight scoring loops (`advance()`, `doc()`, `score()`) or similar cases. | ||
|
|
||
| ## Requirements | ||
|
|
||
| - **Pluggable `SegmentReader`.** External crates can provide their own `SegmentReader` implementation (with their own `InvertedIndexReader`, postings types, etc.) and tantivy's query engine works with it. | ||
| - **No performance regression.** tantivy's default path (`SegmentPostings` → `TermScorer<SegmentPostings>` → block WAND) must remain monomorphized — no boxing, no vtable dispatch in scoring loops. | ||
| - **Arbitrary implementations without recompiling tantivy.** The design must not require a fixed set of implementations known at tantivy compile time. External crates depend on tantivy, not the reverse. | ||
| - **Query code is backend-agnostic.** Adding a new `SegmentReader` implementation must not require changes to `TermWeight`, `PhraseWeight`, `AutomatonWeight`, or any other query code. | ||
| - **Non-viral API.** `Searcher`, `Index`, `Weight`, and other public types are not generic over the backend. Users don't need to thread a type parameter through their code. | ||
|
|
||
| ## Current Design | ||
|
|
||
| ### Trait hierarchy | ||
|
|
||
| - **`SegmentReader`** — trait for accessing a segment's data. Returns `Arc<dyn DynInvertedIndexReader>` from `inverted_index(field)`. `TantivySegmentReader` is the default implementation. | ||
| - **`DynInvertedIndexReader`** — object-safe trait for dynamic dispatch. Returns `Box<dyn Postings>`. Used as `Arc<dyn DynInvertedIndexReader>`. | ||
| - **`InvertedIndexReader`** — typed trait with `type Postings` and `type DocSet` associated types. `TantivyInvertedIndexReader` implements this with `Postings = SegmentPostings`. There is a blanket impl of `InvertedIndexReader` for `dyn DynInvertedIndexReader` with `Postings = Box<dyn Postings>`. | ||
|
|
||
| ### `try_downcast_and_call!` macro | ||
|
|
||
| The macro attempts to downcast `&dyn DynInvertedIndexReader` to `&TantivyInvertedIndexReader`. The body is compiled twice — once with the concrete reader (typed postings, monomorphized) and once with the dyn fallback (boxed postings). | ||
|
|
||
| ```rust | ||
| try_downcast_and_call!(inverted_index.as_ref(), |reader| { | ||
| let postings = reader.read_postings_from_terminfo(&term_info, option)?; | ||
| TermScorer::new(postings, fieldnorm_reader, similarity_weight) | ||
| }) | ||
| ``` | ||
|
|
||
| This replaced the earlier `TypedInvertedIndexReaderCb` trait + struct pattern, which required creating a struct for every call site to serve as a "generic closure." | ||
|
|
||
| ## Rejected approaches | ||
|
|
||
| ### Specialized methods on `DynInvertedIndexReader` | ||
|
|
||
| Adding methods like `build_term_scorer()`, `build_phrase_scorer()`, `fill_bitset_from_terminfo()` to `DynInvertedIndexReader` was rejected. This forces every implementor to reimplement scoring logic for each query type — a combinatorial explosion that couples the reader to every query shape. The reader should only know how to produce postings, not how to build scorers. It also prevents supporting arbitrary query types without changing the trait. | ||
|
|
||
| ### Feature-gated types for external readers | ||
|
|
||
| Using `#[cfg(feature = "quickwit")]` branches in the macro to add additional downcast targets. Requires recompiling tantivy for each reader and doesn't scale to arbitrary `SegmentReader` / `InvertedIndexReader` implementations. | ||
|
|
||
| ### Reader-side dispatch with a callback trait | ||
|
|
||
| A method like `fn with_typed_reader(&self, cb: &mut dyn TypedCb<R>) -> R` on `DynInvertedIndexReader` would let the reader dispatch the callback with its concrete type. But the generic `R` parameter makes the trait not object-safe. Working around this with type erasure (storing results in the callback via `Any`) is complex and fragile. | ||
|
|
||
| ## Planned: `TypedSegmentReader` trait for external fast paths | ||
|
|
||
| The current `try_downcast_and_call!` hardcodes `TantivyInvertedIndexReader`. To give external crates the monomorphized fast path, the downcast target should be a **trait with associated types**, not a specific concrete struct. | ||
|
|
||
| ```rust | ||
| trait TypedSegmentReader: SegmentReader { | ||
| type InvertedIndexReader: InvertedIndexReader; | ||
| // future: type FastFieldReader: ...; | ||
| // future: type StoreReader: ...; | ||
|
|
||
| fn typed_inverted_index(&self, field: Field) -> &Self::InvertedIndexReader; | ||
| } | ||
| ``` | ||
|
|
||
| The dispatch downcasts `dyn SegmentReader` (via `as_any()`) to a concrete type that implements `TypedSegmentReader`, then the body works generically through the associated types. The body is compiled once per registered concrete type but is written against the trait — it never names `TantivyInvertedIndexReader` or `SegmentPostings` directly. | ||
|
|
||
| - External crates implement `TypedSegmentReader` with their own associated types and get the monomorphized fast path. | ||
| - One dispatch point covers all typed sub-components (inverted index, fast fields, store reader, etc.). | ||
| - Query weight code is fully generic — adding a new backend doesn't touch any query code. | ||
| - This does **not** mean query-specific methods on `SegmentReader`. The trait provides typed access to sub-components, not knowledge of query shapes. | ||
|
|
||
| ### Open question: downcast chain registration | ||
|
|
||
| The concrete type must still be known for the `Any` downcast. The dispatch needs a list of concrete types to try. Since tantivy cannot depend on external crates, this list can't live in tantivy itself. | ||
|
|
||
| A macro invoked by the final binary could generate the downcast chain with all `TypedSegmentReader` implementors. Not yet designed. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's code smell here: inverted index at this point is a DynInvertedIndex, yet you use the fact that you expect it to be the standard one to interpret the raw postings data.