1717
1818use crate :: arrow:: array_reader:: { read_records, skip_records, ArrayReader } ;
1919use crate :: arrow:: buffer:: view_buffer:: ViewBuffer ;
20- use crate :: arrow:: decoder:: DictIndexDecoder ;
20+ use crate :: arrow:: decoder:: { DeltaByteArrayDecoder , DictIndexDecoder } ;
2121use crate :: arrow:: record_reader:: GenericRecordReader ;
2222use crate :: arrow:: schema:: parquet_to_arrow_field;
2323use crate :: basic:: { ConvertedType , Encoding } ;
2424use crate :: column:: page:: PageIterator ;
2525use crate :: column:: reader:: decoder:: ColumnValueDecoder ;
26+ use crate :: data_type:: Int32Type ;
27+ use crate :: encodings:: decoding:: { Decoder , DeltaBitPackDecoder } ;
2628use crate :: errors:: { ParquetError , Result } ;
2729use crate :: schema:: types:: ColumnDescPtr ;
28- use arrow_array:: ArrayRef ;
30+ use arrow_array:: { builder :: make_view , ArrayRef } ;
2931use arrow_data:: ByteView ;
3032use arrow_schema:: DataType as ArrowType ;
3133use bytes:: Bytes ;
3234use std:: any:: Any ;
3335
3436/// Returns an [`ArrayReader`] that decodes the provided byte array column to view types.
35- #[ allow( unused) ]
3637pub fn make_byte_view_array_reader (
3738 pages : Box < dyn PageIterator > ,
3839 column_desc : ColumnDescPtr ,
@@ -61,7 +62,6 @@ pub fn make_byte_view_array_reader(
6162}
6263
6364/// An [`ArrayReader`] for variable length byte arrays
64- #[ allow( unused) ]
6565struct ByteViewArrayReader {
6666 data_type : ArrowType ,
6767 pages : Box < dyn PageIterator > ,
@@ -213,6 +213,8 @@ impl ColumnValueDecoder for ByteViewArrayColumnValueDecoder {
213213pub enum ByteViewArrayDecoder {
214214 Plain ( ByteViewArrayDecoderPlain ) ,
215215 Dictionary ( ByteViewArrayDecoderDictionary ) ,
216+ DeltaLength ( ByteViewArrayDecoderDeltaLength ) ,
217+ DeltaByteArray ( ByteViewArrayDecoderDelta ) ,
216218}
217219
218220impl ByteViewArrayDecoder {
@@ -235,9 +237,12 @@ impl ByteViewArrayDecoder {
235237 data, num_levels, num_values,
236238 ) )
237239 }
238- Encoding :: DELTA_LENGTH_BYTE_ARRAY | Encoding :: DELTA_BYTE_ARRAY => {
239- unimplemented ! ( "stay tuned!" )
240- }
240+ Encoding :: DELTA_LENGTH_BYTE_ARRAY => ByteViewArrayDecoder :: DeltaLength (
241+ ByteViewArrayDecoderDeltaLength :: new ( data, validate_utf8) ?,
242+ ) ,
243+ Encoding :: DELTA_BYTE_ARRAY => ByteViewArrayDecoder :: DeltaByteArray (
244+ ByteViewArrayDecoderDelta :: new ( data, validate_utf8) ?,
245+ ) ,
241246 _ => {
242247 return Err ( general_err ! (
243248 "unsupported encoding for byte array: {}" ,
@@ -263,6 +268,8 @@ impl ByteViewArrayDecoder {
263268 . ok_or_else ( || general_err ! ( "dictionary required for dictionary encoding" ) ) ?;
264269 d. read ( out, dict, len)
265270 }
271+ ByteViewArrayDecoder :: DeltaLength ( d) => d. read ( out, len) ,
272+ ByteViewArrayDecoder :: DeltaByteArray ( d) => d. read ( out, len) ,
266273 }
267274 }
268275
@@ -275,6 +282,8 @@ impl ByteViewArrayDecoder {
275282 . ok_or_else ( || general_err ! ( "dictionary required for dictionary encoding" ) ) ?;
276283 d. skip ( dict, len)
277284 }
285+ ByteViewArrayDecoder :: DeltaLength ( d) => d. skip ( len) ,
286+ ByteViewArrayDecoder :: DeltaByteArray ( d) => d. skip ( len) ,
278287 }
279288 }
280289}
@@ -487,6 +496,181 @@ impl ByteViewArrayDecoderDictionary {
487496 }
488497}
489498
499+ /// Decoder from [`Encoding::DELTA_LENGTH_BYTE_ARRAY`] data to [`ViewBuffer`]
500+ pub struct ByteViewArrayDecoderDeltaLength {
501+ lengths : Vec < i32 > ,
502+ data : Bytes ,
503+ length_offset : usize ,
504+ data_offset : usize ,
505+ validate_utf8 : bool ,
506+ }
507+
508+ impl ByteViewArrayDecoderDeltaLength {
509+ fn new ( data : Bytes , validate_utf8 : bool ) -> Result < Self > {
510+ let mut len_decoder = DeltaBitPackDecoder :: < Int32Type > :: new ( ) ;
511+ len_decoder. set_data ( data. clone ( ) , 0 ) ?;
512+ let values = len_decoder. values_left ( ) ;
513+
514+ let mut lengths = vec ! [ 0 ; values] ;
515+ len_decoder. get ( & mut lengths) ?;
516+
517+ let mut total_bytes = 0 ;
518+
519+ for l in lengths. iter ( ) {
520+ if * l < 0 {
521+ return Err ( ParquetError :: General (
522+ "negative delta length byte array length" . to_string ( ) ,
523+ ) ) ;
524+ }
525+ total_bytes += * l as usize ;
526+ }
527+
528+ if total_bytes + len_decoder. get_offset ( ) > data. len ( ) {
529+ return Err ( ParquetError :: General (
530+ "Insufficient delta length byte array bytes" . to_string ( ) ,
531+ ) ) ;
532+ }
533+
534+ Ok ( Self {
535+ lengths,
536+ data,
537+ validate_utf8,
538+ length_offset : 0 ,
539+ data_offset : len_decoder. get_offset ( ) ,
540+ } )
541+ }
542+
543+ fn read ( & mut self , output : & mut ViewBuffer , len : usize ) -> Result < usize > {
544+ let to_read = len. min ( self . lengths . len ( ) - self . length_offset ) ;
545+ output. views . reserve ( to_read) ;
546+
547+ let src_lengths = & self . lengths [ self . length_offset ..self . length_offset + to_read] ;
548+
549+ let block_id = output. append_block ( self . data . clone ( ) . into ( ) ) ;
550+
551+ let mut current_offset = self . data_offset ;
552+ let initial_offset = current_offset;
553+ for length in src_lengths {
554+ // # Safety
555+ // The length is from the delta length decoder, so it is valid
556+ // The start_offset is calculated from the lengths, so it is valid
557+ // `start_offset + length` is guaranteed to be within the bounds of `data`, as checked in `new`
558+ unsafe { output. append_view_unchecked ( block_id, current_offset as u32 , * length as u32 ) }
559+
560+ current_offset += * length as usize ;
561+ }
562+
563+ // Delta length encoding has continuous strings, we can validate utf8 in one go
564+ if self . validate_utf8 {
565+ check_valid_utf8 ( & self . data [ initial_offset..current_offset] ) ?;
566+ }
567+
568+ self . data_offset = current_offset;
569+ self . length_offset += to_read;
570+
571+ Ok ( to_read)
572+ }
573+
574+ fn skip ( & mut self , to_skip : usize ) -> Result < usize > {
575+ let remain_values = self . lengths . len ( ) - self . length_offset ;
576+ let to_skip = remain_values. min ( to_skip) ;
577+
578+ let src_lengths = & self . lengths [ self . length_offset ..self . length_offset + to_skip] ;
579+ let total_bytes: usize = src_lengths. iter ( ) . map ( |x| * x as usize ) . sum ( ) ;
580+
581+ self . data_offset += total_bytes;
582+ self . length_offset += to_skip;
583+ Ok ( to_skip)
584+ }
585+ }
586+
587+ /// Decoder from [`Encoding::DELTA_BYTE_ARRAY`] to [`ViewBuffer`]
588+ pub struct ByteViewArrayDecoderDelta {
589+ decoder : DeltaByteArrayDecoder ,
590+ validate_utf8 : bool ,
591+ }
592+
593+ impl ByteViewArrayDecoderDelta {
594+ fn new ( data : Bytes , validate_utf8 : bool ) -> Result < Self > {
595+ Ok ( Self {
596+ decoder : DeltaByteArrayDecoder :: new ( data) ?,
597+ validate_utf8,
598+ } )
599+ }
600+
601+ // Unlike other encodings, we need to copy the data.
602+ //
603+ // DeltaByteArray data is stored using shared prefixes/suffixes,
604+ // which results in potentially non-contiguous
605+ // strings, while Arrow encodings require contiguous strings
606+ //
607+ // <https://parquet.apache.org/docs/file-format/data-pages/encodings/#delta-strings-delta_byte_array--7>
608+
609+ fn read ( & mut self , output : & mut ViewBuffer , len : usize ) -> Result < usize > {
610+ output. views . reserve ( len. min ( self . decoder . remaining ( ) ) ) ;
611+
612+ // array buffer only have long strings
613+ let mut array_buffer: Vec < u8 > = Vec :: with_capacity ( 4096 ) ;
614+
615+ let buffer_id = output. buffers . len ( ) as u32 ;
616+
617+ let read = if !self . validate_utf8 {
618+ self . decoder . read ( len, |bytes| {
619+ let offset = array_buffer. len ( ) ;
620+ let view = make_view ( bytes, buffer_id, offset as u32 ) ;
621+ if bytes. len ( ) > 12 {
622+ // only copy the data to buffer if the string can not be inlined.
623+ array_buffer. extend_from_slice ( bytes) ;
624+ }
625+
626+ // # Safety
627+ // The buffer_id is the last buffer in the output buffers
628+ // The offset is calculated from the buffer, so it is valid
629+ unsafe {
630+ output. append_raw_view_unchecked ( & view) ;
631+ }
632+ Ok ( ( ) )
633+ } ) ?
634+ } else {
635+ // utf8 validation buffer has only short strings. These short
636+ // strings are inlined into the views but we copy them into a
637+ // contiguous buffer to accelerate validation.®
638+ let mut utf8_validation_buffer = Vec :: with_capacity ( 4096 ) ;
639+
640+ let v = self . decoder . read ( len, |bytes| {
641+ let offset = array_buffer. len ( ) ;
642+ let view = make_view ( bytes, buffer_id, offset as u32 ) ;
643+ if bytes. len ( ) > 12 {
644+ // only copy the data to buffer if the string can not be inlined.
645+ array_buffer. extend_from_slice ( bytes) ;
646+ } else {
647+ utf8_validation_buffer. extend_from_slice ( bytes) ;
648+ }
649+
650+ // # Safety
651+ // The buffer_id is the last buffer in the output buffers
652+ // The offset is calculated from the buffer, so it is valid
653+ // Utf-8 validation is done later
654+ unsafe {
655+ output. append_raw_view_unchecked ( & view) ;
656+ }
657+ Ok ( ( ) )
658+ } ) ?;
659+ check_valid_utf8 ( & array_buffer) ?;
660+ check_valid_utf8 ( & utf8_validation_buffer) ?;
661+ v
662+ } ;
663+
664+ let actual_block_id = output. append_block ( array_buffer. into ( ) ) ;
665+ assert_eq ! ( actual_block_id, buffer_id) ;
666+ Ok ( read)
667+ }
668+
669+ fn skip ( & mut self , to_skip : usize ) -> Result < usize > {
670+ self . decoder . skip ( to_skip)
671+ }
672+ }
673+
490674/// Check that `val` is a valid UTF-8 sequence
491675pub fn check_valid_utf8 ( val : & [ u8 ] ) -> Result < ( ) > {
492676 match std:: str:: from_utf8 ( val) {
@@ -525,13 +709,6 @@ mod tests {
525709 . unwrap ( ) ;
526710
527711 for ( encoding, page) in pages {
528- if encoding != Encoding :: PLAIN
529- && encoding != Encoding :: RLE_DICTIONARY
530- && encoding != Encoding :: PLAIN_DICTIONARY
531- {
532- // skip unsupported encodings for now as they are not yet implemented
533- continue ;
534- }
535712 let mut output = ViewBuffer :: default ( ) ;
536713 decoder. set_data ( encoding, page, 4 , Some ( 4 ) ) . unwrap ( ) ;
537714
0 commit comments