Skip to content

Commit 57fe659

Browse files
authored
make serializer pub (#2835)
Some changes to the posting list serializer to make it usable in other contexts. Improve error messages. Signed-off-by: Pascal Seitz <pascal.seitz@gmail.com>
1 parent 5562ce6 commit 57fe659

File tree

6 files changed

+49
-24
lines changed

6 files changed

+49
-24
lines changed

common/src/writer.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,9 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
6262
pub struct AntiCallToken(());
6363

6464
/// Trait used to indicate when no more writes need to be done on a writer
65-
pub trait TerminatingWrite: Write + Send + Sync {
65+
///
66+
/// Thread-safety is enforced at the call sites that require it.
67+
pub trait TerminatingWrite: Write {
6668
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
6769
fn terminate(mut self) -> io::Result<()>
6870
where Self: Sized {

src/directory/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use std::path::PathBuf;
2121
pub use common::file_slice::{FileHandle, FileSlice};
2222
pub use common::{AntiCallToken, OwnedBytes, TerminatingWrite};
2323

24-
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
24+
pub use self::composite_file::{CompositeFile, CompositeWrite};
2525
pub use self::directory::{Directory, DirectoryClone, DirectoryLock};
2626
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
2727
pub use self::ram_directory::RamDirectory;
@@ -52,7 +52,7 @@ pub use self::mmap_directory::MmapDirectory;
5252
///
5353
/// `WritePtr` are required to implement both Write
5454
/// and Seek.
55-
pub type WritePtr = BufWriter<Box<dyn TerminatingWrite>>;
55+
pub type WritePtr = BufWriter<Box<dyn TerminatingWrite + Send + Sync>>;
5656

5757
#[cfg(test)]
5858
mod tests;

src/postings/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ mod postings;
1414
mod postings_writer;
1515
mod recorder;
1616
mod segment_postings;
17-
mod serializer;
17+
/// Serializer module for the inverted index
18+
pub mod serializer;
1819
mod skip;
1920
mod term_info;
2021

src/postings/serializer.rs

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use crate::positions::PositionSerializer;
1111
use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
1212
use crate::postings::skip::SkipSerializer;
1313
use crate::query::Bm25Weight;
14-
use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema};
14+
use crate::schema::{Field, FieldEntry, IndexRecordOption, Schema};
1515
use crate::termdict::TermDictionaryBuilder;
1616
use crate::{DocId, Score};
1717

@@ -80,9 +80,12 @@ impl InvertedIndexSerializer {
8080
let term_dictionary_write = self.terms_write.for_field(field);
8181
let postings_write = self.postings_write.for_field(field);
8282
let positions_write = self.positions_write.for_field(field);
83-
let field_type: FieldType = (*field_entry.field_type()).clone();
83+
let index_record_option = field_entry
84+
.field_type()
85+
.index_record_option()
86+
.unwrap_or(IndexRecordOption::Basic);
8487
FieldSerializer::create(
85-
&field_type,
88+
index_record_option,
8689
total_num_tokens,
8790
term_dictionary_write,
8891
postings_write,
@@ -102,29 +105,27 @@ impl InvertedIndexSerializer {
102105

103106
/// The field serializer is in charge of
104107
/// the serialization of a specific field.
105-
pub struct FieldSerializer<'a> {
106-
term_dictionary_builder: TermDictionaryBuilder<&'a mut CountingWriter<WritePtr>>,
108+
pub struct FieldSerializer<'a, W: Write = WritePtr> {
109+
term_dictionary_builder: TermDictionaryBuilder<&'a mut CountingWriter<W>>,
107110
postings_serializer: PostingsSerializer,
108-
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
111+
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<W>>>,
109112
current_term_info: TermInfo,
110113
term_open: bool,
111-
postings_write: &'a mut CountingWriter<WritePtr>,
114+
postings_write: &'a mut CountingWriter<W>,
112115
postings_start_offset: u64,
113116
}
114117

115-
impl<'a> FieldSerializer<'a> {
116-
fn create(
117-
field_type: &FieldType,
118+
impl<'a, W: Write> FieldSerializer<'a, W> {
119+
/// Creates a new `FieldSerializer` using the given `IndexRecordOption`.
120+
pub fn create(
121+
index_record_option: IndexRecordOption,
118122
total_num_tokens: u64,
119-
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
120-
postings_write: &'a mut CountingWriter<WritePtr>,
121-
positions_write: &'a mut CountingWriter<WritePtr>,
123+
term_dictionary_write: &'a mut CountingWriter<W>,
124+
postings_write: &'a mut CountingWriter<W>,
125+
positions_write: &'a mut CountingWriter<W>,
122126
fieldnorm_reader: Option<FieldNormReader>,
123-
) -> io::Result<FieldSerializer<'a>> {
127+
) -> io::Result<FieldSerializer<'a, W>> {
124128
total_num_tokens.serialize(postings_write)?;
125-
let index_record_option = field_type
126-
.index_record_option()
127-
.unwrap_or(IndexRecordOption::Basic);
128129
let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
129130
let average_fieldnorm = fieldnorm_reader
130131
.as_ref()
@@ -192,6 +193,11 @@ impl<'a> FieldSerializer<'a> {
192193
Ok(())
193194
}
194195

196+
/// Starts the postings for a new term without recording term frequencies.
197+
pub fn new_term_without_freq(&mut self, term: &[u8]) -> io::Result<()> {
198+
self.new_term(term, 0, false)
199+
}
200+
195201
/// Serialize the information that a document contains for the current term:
196202
/// its term frequency, and the position deltas.
197203
///
@@ -297,6 +303,7 @@ impl Block {
297303
}
298304
}
299305

306+
/// Serializer for postings lists.
300307
pub struct PostingsSerializer {
301308
last_doc_id_encoded: u32,
302309

@@ -316,6 +323,9 @@ pub struct PostingsSerializer {
316323
}
317324

318325
impl PostingsSerializer {
326+
/// Creates a new `PostingsSerializer`.
327+
/// * avg_fieldnorm - average field norm for the field being serialized.
328+
/// * mode - indexing options for the field being serialized.
319329
pub fn new(
320330
avg_fieldnorm: Score,
321331
mode: IndexRecordOption,
@@ -338,6 +348,8 @@ impl PostingsSerializer {
338348
}
339349
}
340350

351+
/// Starts the serialization for a new term.
352+
/// * term_doc_freq - the number of documents containing the term.
341353
pub fn new_term(&mut self, term_doc_freq: u32, record_term_freq: bool) {
342354
self.bm25_weight = None;
343355

@@ -377,6 +389,7 @@ impl PostingsSerializer {
377389
self.postings_write.extend(block_encoded);
378390
}
379391
if self.term_has_freq {
392+
// encode the term frequencies
380393
let (num_bits, block_encoded): (u8, &[u8]) = self
381394
.block_encoder
382395
.compress_block_unsorted(self.block.term_freqs(), true);
@@ -417,13 +430,17 @@ impl PostingsSerializer {
417430
self.block.clear();
418431
}
419432

433+
/// Register that the given document contains the current term.
434+
/// * doc_id - the document id.
435+
/// * term_freq - the term frequency within the document.
420436
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) {
421437
self.block.append_doc(doc_id, term_freq);
422438
if self.block.is_full() {
423439
self.write_block();
424440
}
425441
}
426442

443+
/// Finish the serialization for this term.
427444
pub fn close_term(
428445
&mut self,
429446
doc_freq: u32,

src/postings/skip.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@ use crate::{DocId, Score, TERMINATED};
1414
// (requiring a 6th bit), but the biggest doc_id we can want to encode is TERMINATED-1, which can
1515
// be represented on 31b without delta encoding.
1616
fn encode_bitwidth(bitwidth: u8, delta_1: bool) -> u8 {
17-
assert!(bitwidth < 32);
17+
assert!(
18+
bitwidth < 32,
19+
"bitwidth needs to be less than 32, but got {}",
20+
bitwidth
21+
);
1822
bitwidth | ((delta_1 as u8) << 6)
1923
}
2024

sstable/src/lib.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,8 +302,9 @@ where
302302
|| self.previous_key[keep_len] < key[keep_len];
303303
assert!(
304304
increasing_keys,
305-
"Keys should be increasing. ({:?} > {key:?})",
306-
self.previous_key
305+
"Keys should be increasing. ({:?} > {:?})",
306+
String::from_utf8_lossy(&self.previous_key),
307+
String::from_utf8_lossy(key),
307308
);
308309
self.previous_key.resize(key.len(), 0u8);
309310
self.previous_key[keep_len..].copy_from_slice(&key[keep_len..]);

0 commit comments

Comments
 (0)