Skip to content

Commit 64b93ce

Browse files
committed
Abstract tantivy's data storage behind traits for pluggable backends
Extract trait interfaces from tantivy's core reader types so that alternative storage backends (e.g. Quickwit) can provide their own implementations while tantivy's query engine works through dynamic dispatch.

Reader trait extraction:
- SegmentReader is now a trait; the concrete implementation is renamed to TantivySegmentReader.
- DynInvertedIndexReader trait for object-safe dynamic dispatch, plus a typed InvertedIndexReader trait with associated Postings/DocSet types for static dispatch. The concrete reader becomes TantivyInvertedIndexReader.
- StoreReader is now a trait; the concrete implementation is renamed to TantivyStoreReader. get() returns TantivyDocument directly instead of requiring a generic DocumentDeserialize bound.

Typed downcast for performance-critical paths:
- try_downcast_and_call() + TypedInvertedIndexReaderCb allow query weights (TermWeight, PhraseWeight) to attempt a downcast to the concrete TantivyInvertedIndexReader, obtaining typed postings for zero-cost scoring, and falling back to the dynamic path otherwise.
- TermScorer<TPostings> is now generic over its postings type.
- PostingsWithBlockMax trait enables block-max WAND acceleration through the trait boundary.
- block_wand() and block_wand_single_scorer() are generic over PostingsWithBlockMax, and for_each_pruning is dispatched through the SegmentReader trait so custom backends can provide their own block-max implementations.

Searcher decoupled from Index:
- New SearcherContext holds schema, executor, and tokenizers.
- Searcher can be constructed from Vec<Arc<dyn SegmentReader>> via Searcher::from_segment_readers(), without needing an Index.
- Searcher::index() is deprecated in favor of Searcher::context().

Postings and DocSet changes:
- Postings trait gains doc_freq() -> DocFreq (Exact/Approximate) and has_freq().
- RawPostingsData struct carries raw postings bytes across the trait boundary for custom reader implementations.
- BlockSegmentPostings::open() takes OwnedBytes instead of FileSlice.
- DocSet gains fill_bitset() method.

Scorer improvements:
- Scorer trait absorbs for_each, for_each_pruning, and explain (previously free functions or on Weight).
- box_scorer() helper avoids double-boxing Box<dyn Scorer>.
- BoxedTermScorer wraps a type-erased term scorer.
- BufferedUnionScorer initialization fixed to avoid an extra advance() on construction.

Other changes:
- Document::to_json() now returns serde_json::Value; the old string serialization is renamed to to_serialized_json().
- DocumentDeserialize removed from the store reader public API.
1 parent 129c40f commit 64b93ce

File tree

116 files changed

+3409
-1903
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

116 files changed

+3409
-1903
lines changed

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,3 +201,8 @@ harness = false
201201
[[bench]]
202202
name = "regex_all_terms"
203203
harness = false
204+
205+
[[bench]]
206+
name = "fill_bitset"
207+
harness = false
208+

benches/fill_bitset.rs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM};
2+
use common::BitSet;
3+
use rand::rngs::StdRng;
4+
use rand::{Rng, SeedableRng};
5+
use tantivy::postings::BlockSegmentPostings;
6+
use tantivy::schema::*;
7+
use tantivy::{doc, DocSet as _, Index, InvertedIndexReader as _, TantivyDocument};
8+
9+
#[global_allocator]
10+
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
11+
12+
fn main() {
13+
let index = build_test_index();
14+
let reader = index.reader().unwrap();
15+
let searcher = reader.searcher();
16+
let segment_reader = &searcher.segment_readers()[0];
17+
let text_field = index.schema().get_field("text").unwrap();
18+
let inverted_index = segment_reader.inverted_index(text_field).unwrap();
19+
let max_doc = segment_reader.max_doc();
20+
21+
let term = Term::from_field_text(text_field, "hello");
22+
let term_info = inverted_index.get_term_info(&term).unwrap().unwrap();
23+
24+
let mut runner = BenchRunner::new();
25+
runner.set_name("fill_bitset");
26+
27+
let mut group = runner.new_group();
28+
{
29+
let inverted_index = &inverted_index;
30+
let term_info = &term_info;
31+
// This is the path used by queries (AutomatonWeight, RangeQuery, etc.)
32+
// It dispatches via DynInvertedIndexReader::fill_bitset_from_terminfo.
33+
group.register("fill_bitset_from_terminfo (via trait)", move |_| {
34+
let mut bitset = BitSet::with_max_value(max_doc);
35+
inverted_index
36+
.fill_bitset_from_terminfo(term_info, &mut bitset)
37+
.unwrap();
38+
black_box(bitset);
39+
});
40+
}
41+
{
42+
let inverted_index = &inverted_index;
43+
let term_info = &term_info;
44+
// This constructs a SegmentPostings via read_docset_from_terminfo and calls fill_bitset.
45+
group.register("read_docset + fill_bitset", move |_| {
46+
let mut postings = inverted_index.read_docset_from_terminfo(term_info).unwrap();
47+
let mut bitset = BitSet::with_max_value(max_doc);
48+
postings.fill_bitset(&mut bitset);
49+
black_box(bitset);
50+
});
51+
}
52+
{
53+
let inverted_index = &inverted_index;
54+
let term_info = &term_info;
55+
// This uses BlockSegmentPostings directly, bypassing SegmentPostings entirely.
56+
group.register("BlockSegmentPostings direct", move |_| {
57+
let raw = inverted_index
58+
.read_raw_postings_data(term_info, IndexRecordOption::Basic)
59+
.unwrap();
60+
let mut block_postings = BlockSegmentPostings::open(
61+
term_info.doc_freq,
62+
raw.postings_data,
63+
raw.record_option,
64+
raw.effective_option,
65+
)
66+
.unwrap();
67+
let mut bitset = BitSet::with_max_value(max_doc);
68+
loop {
69+
let docs = block_postings.docs();
70+
if docs.is_empty() {
71+
break;
72+
}
73+
for &doc in docs {
74+
bitset.insert(doc);
75+
}
76+
block_postings.advance();
77+
}
78+
black_box(bitset);
79+
});
80+
}
81+
group.run();
82+
}
83+
84+
fn build_test_index() -> Index {
85+
let mut schema_builder = Schema::builder();
86+
schema_builder.add_text_field("text", TEXT);
87+
let schema = schema_builder.build();
88+
let index = Index::create_in_ram(schema.clone());
89+
let text_field = schema.get_field("text").unwrap();
90+
91+
let mut writer = index.writer::<TantivyDocument>(250_000_000).unwrap();
92+
let mut rng = StdRng::from_seed([42u8; 32]);
93+
for _ in 0..100_000 {
94+
if rng.random_bool(0.5) {
95+
writer
96+
.add_document(doc!(text_field => "hello world"))
97+
.unwrap();
98+
} else {
99+
writer
100+
.add_document(doc!(text_field => "goodbye world"))
101+
.unwrap();
102+
}
103+
}
104+
writer.commit().unwrap();
105+
index
106+
}

benches/str_search_and_get.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ use rand::rngs::StdRng;
1717
use rand::SeedableRng;
1818
use tantivy::collector::{Count, DocSetCollector};
1919
use tantivy::query::RangeQuery;
20-
use tantivy::schema::document::TantivyDocument;
2120
use tantivy::schema::{Schema, Value, FAST, STORED, STRING};
2221
use tantivy::{doc, Index, ReloadPolicy, Searcher, Term};
2322

@@ -406,7 +405,7 @@ impl FetchAllStringsFromDocTask {
406405

407406
for doc_address in docs {
408407
// Get the document from the doc store (row store access)
409-
if let Ok(doc) = self.searcher.doc::<TantivyDocument>(doc_address) {
408+
if let Ok(doc) = self.searcher.doc(doc_address) {
410409
// Extract string values from the stored field
411410
if let Some(field_value) = doc.get_first(str_stored_field) {
412411
if let Some(text) = field_value.as_value().as_str() {

common/src/bitset.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,8 @@ impl TinySet {
193193
#[derive(Clone)]
194194
pub struct BitSet {
195195
tinysets: Box<[TinySet]>,
196+
// Tracking `len` on every insert/remove adds overhead even when `len()` is never called.
197+
// Consider removing if `len()` usage is rare or not on a hot path.
196198
len: u64,
197199
max_value: u32,
198200
}
@@ -252,6 +254,7 @@ impl BitSet {
252254

253255
/// Removes all elements from the `BitSet`.
254256
pub fn clear(&mut self) {
257+
self.len = 0;
255258
for tinyset in self.tinysets.iter_mut() {
256259
*tinyset = TinySet::empty();
257260
}
@@ -271,6 +274,11 @@ impl BitSet {
271274
}
272275
}
273276

277+
/// Estimate the heap memory consumption of this `BitSet` in bytes.
278+
pub fn get_memory_consumption(&self) -> usize {
279+
self.tinysets.len() * std::mem::size_of::<TinySet>()
280+
}
281+
274282
/// Returns the number of elements in the `BitSet`.
275283
#[inline]
276284
pub fn len(&self) -> usize {
@@ -314,6 +322,9 @@ impl BitSet {
314322
.map(|delta_bucket| bucket + delta_bucket as u32)
315323
}
316324

325+
/// Returns the maximum number of elements in the bitset.
326+
///
327+
/// Warning: The largest element the bitset can contain is `max_value - 1`.
317328
#[inline]
318329
pub fn max_value(&self) -> u32 {
319330
self.max_value

examples/custom_collector.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ impl Collector for StatsCollector {
7070
fn for_segment(
7171
&self,
7272
_segment_local_id: u32,
73-
segment_reader: &SegmentReader,
73+
segment_reader: &dyn SegmentReader,
7474
) -> tantivy::Result<StatsSegmentCollector> {
7575
let fast_field_reader = segment_reader.fast_fields().u64(&self.field)?;
7676
Ok(StatsSegmentCollector {

examples/date_time_field.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
6060
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4).order_by_score())?;
6161
assert_eq!(count_docs.len(), 1);
6262
for (_score, doc_address) in count_docs {
63-
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
63+
let retrieved_doc = searcher.doc(doc_address)?;
6464
assert!(retrieved_doc
6565
.get_first(occurred_at)
6666
.unwrap()

examples/faceted_search_with_tweaked_score.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ fn main() -> tantivy::Result<()> {
6565
);
6666
let top_docs_by_custom_score =
6767
// Call TopDocs with a custom tweak score
68-
TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
68+
TopDocs::with_limit(2).tweak_score(move |segment_reader: &dyn SegmentReader| {
6969
let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
7070
let facet_dict = ingredient_reader.facet_dict();
7171

@@ -91,7 +91,7 @@ fn main() -> tantivy::Result<()> {
9191
.iter()
9292
.map(|(_, doc_id)| {
9393
searcher
94-
.doc::<TantivyDocument>(*doc_id)
94+
.doc(*doc_id)
9595
.unwrap()
9696
.get_first(title)
9797
.and_then(|v| v.as_str().map(|el| el.to_string()))

examples/iterating_docs_and_positions.rs

Lines changed: 1 addition & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -91,46 +91,10 @@ fn main() -> tantivy::Result<()> {
9191
}
9292
}
9393

94-
// A `Term` is a text token associated with a field.
95-
// Let's go through all docs containing the term `title:the` and access their position
96-
let term_the = Term::from_field_text(title, "the");
97-
98-
// Some other powerful operations (especially `.skip_to`) may be useful to consume these
94+
// Some other powerful operations (especially `.seek`) may be useful to consume these
9995
// posting lists rapidly.
10096
// You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait
10197
// and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait
10298

103-
// Also, for some VERY specific high performance use case like an OLAP analysis of logs,
104-
// you can get better performance by accessing directly the blocks of doc ids.
105-
for segment_reader in searcher.segment_readers() {
106-
// A segment contains different data structure.
107-
// Inverted index stands for the combination of
108-
// - the term dictionary
109-
// - the inverted lists associated with each terms and their positions
110-
let inverted_index = segment_reader.inverted_index(title)?;
111-
112-
// This segment posting object is like a cursor over the documents matching the term.
113-
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term
114-
// frequencies and positions.
115-
//
116-
// If you don't need all this information, you may get better performance by decompressing
117-
// less information.
118-
if let Some(mut block_segment_postings) =
119-
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
120-
{
121-
loop {
122-
let docs = block_segment_postings.docs();
123-
if docs.is_empty() {
124-
break;
125-
}
126-
// Once again these docs MAY contains deleted documents as well.
127-
let docs = block_segment_postings.docs();
128-
// Prints `Docs [0, 2].`
129-
println!("Docs {docs:?}");
130-
block_segment_postings.advance();
131-
}
132-
}
133-
}
134-
13599
Ok(())
136100
}

examples/phrase_prefix_search.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ fn main() -> Result<()> {
6767
let mut titles = top_docs
6868
.into_iter()
6969
.map(|(_score, doc_address)| {
70-
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
70+
let doc = searcher.doc(doc_address)?;
7171
let title = doc
7272
.get_first(title)
7373
.and_then(|v| v.as_str())

examples/snippet.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ fn main() -> tantivy::Result<()> {
5555
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
5656

5757
for (score, doc_address) in top_docs {
58-
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
58+
let doc = searcher.doc(doc_address)?;
5959
let snippet = snippet_generator.snippet_from_doc(&doc);
6060
println!("Document score {score}:");
6161
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());

0 commit comments

Comments
 (0)