Skip to content

Commit 8355823

Browse files
XiangpengHao and alamb authored
Complete StringViewArray and BinaryViewArray parquet decoder: implement delta byte array and delta length byte array encoding (#6004)
* implement all encodings * address comments * fix bug * Update parquet/src/arrow/array_reader/byte_view_array.rs Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * fix test * update comments * update test * Only copy strings once --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent b9e4497 commit 8355823

5 files changed

Lines changed: 218 additions & 72 deletions

File tree

parquet/src/arrow/array_reader/builder.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use std::sync::Arc;
1919

2020
use arrow_schema::{DataType, Fields, SchemaBuilder};
2121

22-
use crate::arrow::array_reader::byte_array::make_byte_view_array_reader;
22+
use crate::arrow::array_reader::byte_view_array::make_byte_view_array_reader;
2323
use crate::arrow::array_reader::empty_array::make_empty_array_reader;
2424
use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_reader;
2525
use crate::arrow::array_reader::{

parquet/src/arrow/array_reader/byte_array.rs

Lines changed: 22 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -74,36 +74,6 @@ pub fn make_byte_array_reader(
7474
}
7575
}
7676

77-
/// Returns an [`ArrayReader`] that decodes the provided byte array column to view types.
78-
pub fn make_byte_view_array_reader(
79-
pages: Box<dyn PageIterator>,
80-
column_desc: ColumnDescPtr,
81-
arrow_type: Option<ArrowType>,
82-
) -> Result<Box<dyn ArrayReader>> {
83-
// Check if Arrow type is specified, else create it from Parquet type
84-
let data_type = match arrow_type {
85-
Some(t) => t,
86-
None => match parquet_to_arrow_field(column_desc.as_ref())?.data_type() {
87-
ArrowType::Utf8 | ArrowType::Utf8View => ArrowType::Utf8View,
88-
_ => ArrowType::BinaryView,
89-
},
90-
};
91-
92-
match data_type {
93-
ArrowType::BinaryView | ArrowType::Utf8View => {
94-
let reader = GenericRecordReader::new(column_desc);
95-
Ok(Box::new(ByteArrayReader::<i32>::new(
96-
pages, data_type, reader,
97-
)))
98-
}
99-
100-
_ => Err(general_err!(
101-
"invalid data type for byte array reader read to view type - {}",
102-
data_type
103-
)),
104-
}
105-
}
106-
10777
/// An [`ArrayReader`] for variable length byte arrays
10878
struct ByteArrayReader<I: OffsetSizeTrait> {
10979
data_type: ArrowType,
@@ -472,6 +442,23 @@ impl ByteArrayDecoderDeltaLength {
472442
let mut lengths = vec![0; values];
473443
len_decoder.get(&mut lengths)?;
474444

445+
let mut total_bytes = 0;
446+
447+
for l in lengths.iter() {
448+
if *l < 0 {
449+
return Err(ParquetError::General(
450+
"negative delta length byte array length".to_string(),
451+
));
452+
}
453+
total_bytes += *l as usize;
454+
}
455+
456+
if total_bytes + len_decoder.get_offset() > data.len() {
457+
return Err(ParquetError::General(
458+
"Insufficient delta length byte array bytes".to_string(),
459+
));
460+
}
461+
475462
Ok(Self {
476463
lengths,
477464
data,
@@ -496,23 +483,17 @@ impl ByteArrayDecoderDeltaLength {
496483
let total_bytes: usize = src_lengths.iter().map(|x| *x as usize).sum();
497484
output.values.reserve(total_bytes);
498485

499-
if self.data_offset + total_bytes > self.data.len() {
500-
return Err(ParquetError::EOF(
501-
"Insufficient delta length byte array bytes".to_string(),
502-
));
503-
}
504-
505-
let mut start_offset = self.data_offset;
486+
let mut current_offset = self.data_offset;
506487
for length in src_lengths {
507-
let end_offset = start_offset + *length as usize;
488+
let end_offset = current_offset + *length as usize;
508489
output.try_push(
509-
&self.data.as_ref()[start_offset..end_offset],
490+
&self.data.as_ref()[current_offset..end_offset],
510491
self.validate_utf8,
511492
)?;
512-
start_offset = end_offset;
493+
current_offset = end_offset;
513494
}
514495

515-
self.data_offset = start_offset;
496+
self.data_offset = current_offset;
516497
self.length_offset += to_read;
517498

518499
if self.validate_utf8 {

parquet/src/arrow/array_reader/byte_view_array.rs

Lines changed: 191 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,23 @@
1717

1818
use crate::arrow::array_reader::{read_records, skip_records, ArrayReader};
1919
use crate::arrow::buffer::view_buffer::ViewBuffer;
20-
use crate::arrow::decoder::DictIndexDecoder;
20+
use crate::arrow::decoder::{DeltaByteArrayDecoder, DictIndexDecoder};
2121
use crate::arrow::record_reader::GenericRecordReader;
2222
use crate::arrow::schema::parquet_to_arrow_field;
2323
use crate::basic::{ConvertedType, Encoding};
2424
use crate::column::page::PageIterator;
2525
use crate::column::reader::decoder::ColumnValueDecoder;
26+
use crate::data_type::Int32Type;
27+
use crate::encodings::decoding::{Decoder, DeltaBitPackDecoder};
2628
use crate::errors::{ParquetError, Result};
2729
use crate::schema::types::ColumnDescPtr;
28-
use arrow_array::ArrayRef;
30+
use arrow_array::{builder::make_view, ArrayRef};
2931
use arrow_data::ByteView;
3032
use arrow_schema::DataType as ArrowType;
3133
use bytes::Bytes;
3234
use std::any::Any;
3335

3436
/// Returns an [`ArrayReader`] that decodes the provided byte array column to view types.
35-
#[allow(unused)]
3637
pub fn make_byte_view_array_reader(
3738
pages: Box<dyn PageIterator>,
3839
column_desc: ColumnDescPtr,
@@ -61,7 +62,6 @@ pub fn make_byte_view_array_reader(
6162
}
6263

6364
/// An [`ArrayReader`] for variable length byte arrays
64-
#[allow(unused)]
6565
struct ByteViewArrayReader {
6666
data_type: ArrowType,
6767
pages: Box<dyn PageIterator>,
@@ -213,6 +213,8 @@ impl ColumnValueDecoder for ByteViewArrayColumnValueDecoder {
213213
pub enum ByteViewArrayDecoder {
214214
Plain(ByteViewArrayDecoderPlain),
215215
Dictionary(ByteViewArrayDecoderDictionary),
216+
DeltaLength(ByteViewArrayDecoderDeltaLength),
217+
DeltaByteArray(ByteViewArrayDecoderDelta),
216218
}
217219

218220
impl ByteViewArrayDecoder {
@@ -235,9 +237,12 @@ impl ByteViewArrayDecoder {
235237
data, num_levels, num_values,
236238
))
237239
}
238-
Encoding::DELTA_LENGTH_BYTE_ARRAY | Encoding::DELTA_BYTE_ARRAY => {
239-
unimplemented!("stay tuned!")
240-
}
240+
Encoding::DELTA_LENGTH_BYTE_ARRAY => ByteViewArrayDecoder::DeltaLength(
241+
ByteViewArrayDecoderDeltaLength::new(data, validate_utf8)?,
242+
),
243+
Encoding::DELTA_BYTE_ARRAY => ByteViewArrayDecoder::DeltaByteArray(
244+
ByteViewArrayDecoderDelta::new(data, validate_utf8)?,
245+
),
241246
_ => {
242247
return Err(general_err!(
243248
"unsupported encoding for byte array: {}",
@@ -263,6 +268,8 @@ impl ByteViewArrayDecoder {
263268
.ok_or_else(|| general_err!("dictionary required for dictionary encoding"))?;
264269
d.read(out, dict, len)
265270
}
271+
ByteViewArrayDecoder::DeltaLength(d) => d.read(out, len),
272+
ByteViewArrayDecoder::DeltaByteArray(d) => d.read(out, len),
266273
}
267274
}
268275

@@ -275,6 +282,8 @@ impl ByteViewArrayDecoder {
275282
.ok_or_else(|| general_err!("dictionary required for dictionary encoding"))?;
276283
d.skip(dict, len)
277284
}
285+
ByteViewArrayDecoder::DeltaLength(d) => d.skip(len),
286+
ByteViewArrayDecoder::DeltaByteArray(d) => d.skip(len),
278287
}
279288
}
280289
}
@@ -487,6 +496,181 @@ impl ByteViewArrayDecoderDictionary {
487496
}
488497
}
489498

499+
/// Decoder from [`Encoding::DELTA_LENGTH_BYTE_ARRAY`] data to [`ViewBuffer`]
500+
pub struct ByteViewArrayDecoderDeltaLength {
501+
lengths: Vec<i32>,
502+
data: Bytes,
503+
length_offset: usize,
504+
data_offset: usize,
505+
validate_utf8: bool,
506+
}
507+
508+
impl ByteViewArrayDecoderDeltaLength {
509+
fn new(data: Bytes, validate_utf8: bool) -> Result<Self> {
510+
let mut len_decoder = DeltaBitPackDecoder::<Int32Type>::new();
511+
len_decoder.set_data(data.clone(), 0)?;
512+
let values = len_decoder.values_left();
513+
514+
let mut lengths = vec![0; values];
515+
len_decoder.get(&mut lengths)?;
516+
517+
let mut total_bytes = 0;
518+
519+
for l in lengths.iter() {
520+
if *l < 0 {
521+
return Err(ParquetError::General(
522+
"negative delta length byte array length".to_string(),
523+
));
524+
}
525+
total_bytes += *l as usize;
526+
}
527+
528+
if total_bytes + len_decoder.get_offset() > data.len() {
529+
return Err(ParquetError::General(
530+
"Insufficient delta length byte array bytes".to_string(),
531+
));
532+
}
533+
534+
Ok(Self {
535+
lengths,
536+
data,
537+
validate_utf8,
538+
length_offset: 0,
539+
data_offset: len_decoder.get_offset(),
540+
})
541+
}
542+
543+
fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result<usize> {
544+
let to_read = len.min(self.lengths.len() - self.length_offset);
545+
output.views.reserve(to_read);
546+
547+
let src_lengths = &self.lengths[self.length_offset..self.length_offset + to_read];
548+
549+
let block_id = output.append_block(self.data.clone().into());
550+
551+
let mut current_offset = self.data_offset;
552+
let initial_offset = current_offset;
553+
for length in src_lengths {
554+
// # Safety
555+
// The length is from the delta length decoder, so it is valid
556+
// The current_offset is calculated from the lengths, so it is valid
557+
// `current_offset + length` is guaranteed to be within the bounds of `data`, as checked in `new`
558+
unsafe { output.append_view_unchecked(block_id, current_offset as u32, *length as u32) }
559+
560+
current_offset += *length as usize;
561+
}
562+
563+
// Delta length encoding has continuous strings, we can validate utf8 in one go
564+
if self.validate_utf8 {
565+
check_valid_utf8(&self.data[initial_offset..current_offset])?;
566+
}
567+
568+
self.data_offset = current_offset;
569+
self.length_offset += to_read;
570+
571+
Ok(to_read)
572+
}
573+
574+
fn skip(&mut self, to_skip: usize) -> Result<usize> {
575+
let remain_values = self.lengths.len() - self.length_offset;
576+
let to_skip = remain_values.min(to_skip);
577+
578+
let src_lengths = &self.lengths[self.length_offset..self.length_offset + to_skip];
579+
let total_bytes: usize = src_lengths.iter().map(|x| *x as usize).sum();
580+
581+
self.data_offset += total_bytes;
582+
self.length_offset += to_skip;
583+
Ok(to_skip)
584+
}
585+
}
586+
587+
/// Decoder from [`Encoding::DELTA_BYTE_ARRAY`] to [`ViewBuffer`]
588+
pub struct ByteViewArrayDecoderDelta {
589+
decoder: DeltaByteArrayDecoder,
590+
validate_utf8: bool,
591+
}
592+
593+
impl ByteViewArrayDecoderDelta {
594+
fn new(data: Bytes, validate_utf8: bool) -> Result<Self> {
595+
Ok(Self {
596+
decoder: DeltaByteArrayDecoder::new(data)?,
597+
validate_utf8,
598+
})
599+
}
600+
601+
// Unlike other encodings, we need to copy the data.
602+
//
603+
// DeltaByteArray data is stored using shared prefixes/suffixes,
604+
// which results in potentially non-contiguous
605+
// strings, while Arrow encodings require contiguous strings
606+
//
607+
// <https://parquet.apache.org/docs/file-format/data-pages/encodings/#delta-strings-delta_byte_array--7>
608+
609+
fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result<usize> {
610+
output.views.reserve(len.min(self.decoder.remaining()));
611+
612+
// array buffer only holds long strings (those that cannot be inlined into the view)
613+
let mut array_buffer: Vec<u8> = Vec::with_capacity(4096);
614+
615+
let buffer_id = output.buffers.len() as u32;
616+
617+
let read = if !self.validate_utf8 {
618+
self.decoder.read(len, |bytes| {
619+
let offset = array_buffer.len();
620+
let view = make_view(bytes, buffer_id, offset as u32);
621+
if bytes.len() > 12 {
622+
// only copy the data to the buffer if the string cannot be inlined.
623+
array_buffer.extend_from_slice(bytes);
624+
}
625+
626+
// # Safety
627+
// The buffer_id is the last buffer in the output buffers
628+
// The offset is calculated from the buffer, so it is valid
629+
unsafe {
630+
output.append_raw_view_unchecked(&view);
631+
}
632+
Ok(())
633+
})?
634+
} else {
635+
// utf8 validation buffer has only short strings. These short
636+
// strings are inlined into the views but we copy them into a
637+
// contiguous buffer to accelerate validation.
638+
let mut utf8_validation_buffer = Vec::with_capacity(4096);
639+
640+
let v = self.decoder.read(len, |bytes| {
641+
let offset = array_buffer.len();
642+
let view = make_view(bytes, buffer_id, offset as u32);
643+
if bytes.len() > 12 {
644+
// only copy the data to the buffer if the string cannot be inlined.
645+
array_buffer.extend_from_slice(bytes);
646+
} else {
647+
utf8_validation_buffer.extend_from_slice(bytes);
648+
}
649+
650+
// # Safety
651+
// The buffer_id is the last buffer in the output buffers
652+
// The offset is calculated from the buffer, so it is valid
653+
// Utf-8 validation is done later
654+
unsafe {
655+
output.append_raw_view_unchecked(&view);
656+
}
657+
Ok(())
658+
})?;
659+
check_valid_utf8(&array_buffer)?;
660+
check_valid_utf8(&utf8_validation_buffer)?;
661+
v
662+
};
663+
664+
let actual_block_id = output.append_block(array_buffer.into());
665+
assert_eq!(actual_block_id, buffer_id);
666+
Ok(read)
667+
}
668+
669+
fn skip(&mut self, to_skip: usize) -> Result<usize> {
670+
self.decoder.skip(to_skip)
671+
}
672+
}
673+
490674
/// Check that `val` is a valid UTF-8 sequence
491675
pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
492676
match std::str::from_utf8(val) {
@@ -525,13 +709,6 @@ mod tests {
525709
.unwrap();
526710

527711
for (encoding, page) in pages {
528-
if encoding != Encoding::PLAIN
529-
&& encoding != Encoding::RLE_DICTIONARY
530-
&& encoding != Encoding::PLAIN_DICTIONARY
531-
{
532-
// skip unsupported encodings for now as they are not yet implemented
533-
continue;
534-
}
535712
let mut output = ViewBuffer::default();
536713
decoder.set_data(encoding, page, 4, Some(4)).unwrap();
537714

0 commit comments

Comments
 (0)