@@ -2425,89 +2425,179 @@ mod tests {
24252425
24262426 fn test_invalid_utf8_string_array_inner < O : OffsetSizeTrait > ( ) { // every invalid fixture, written with every string encoding, must fail to read back
24272427 let cases = [
2428- (
2429- invalid_utf8_first_char :: < O > ( ) ,
2430- "Parquet argument error: Parquet error: encountered non UTF-8 data" ,
2431- ) ,
2432- (
2433- invalid_utf8_later_char :: < O > ( ) ,
2434- "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 6" ,
2435- ) ,
2428+ invalid_utf8_first_char :: < O > ( ) ,
2429+ invalid_utf8_first_char_long_strings :: < O > ( ) ,
2430+ invalid_utf8_later_char :: < O > ( ) ,
2431+ invalid_utf8_later_char_long_strings :: < O > ( ) ,
2432+ invalid_utf8_later_char_really_long_strings :: < O > ( ) ,
2433+ invalid_utf8_later_char_really_long_strings2 :: < O > ( ) ,
24362434 ] ;
2437- for ( array, expected_error) in cases {
2438- // data is not valid utf8 we can not construct a correct StringArray
2439- // safely, so purposely create an invalid StringArray
2440- let array = unsafe {
2441- GenericStringArray :: < O > :: new_unchecked (
2442- array. offsets ( ) . clone ( ) ,
2443- array. values ( ) . clone ( ) ,
2444- array. nulls ( ) . cloned ( ) ,
2445- )
2446- } ;
2447- let data_type = array. data_type ( ) . clone ( ) ;
2448- let data = write_to_parquet ( Arc :: new ( array) ) ;
2449- let err = read_from_parquet ( data) . unwrap_err ( ) ;
2450- assert_eq ! ( err. to_string( ) , expected_error, "data type: {data_type:?}" )
2435+ for array in & cases {
2436+ for encoding in STRING_ENCODINGS {
2437+ // The bytes are not valid UTF-8, so a StringArray cannot be built from them
2438+ // through the checked API; deliberately build an invalid one with `new_unchecked`
2439+ let array = unsafe {
2440+ GenericStringArray :: < O > :: new_unchecked (
2441+ array. offsets ( ) . clone ( ) ,
2442+ array. values ( ) . clone ( ) ,
2443+ array. nulls ( ) . cloned ( ) ,
2444+ )
2445+ } ;
2446+ let data_type = array. data_type ( ) . clone ( ) ;
2447+ let data = write_to_parquet_with_encoding ( Arc :: new ( array) , * encoding) ;
2448+ let err = read_from_parquet ( data) . unwrap_err ( ) ; // reading must fail; `unwrap_err` panics if it succeeds
2449+ let expected_err =
2450+ "Parquet argument error: Parquet error: encountered non UTF-8 data" ; // common prefix only; any trailing detail is ignored by `contains`
2451+ assert ! (
2452+ err. to_string( ) . contains( expected_err) ,
2453+ "data type: {data_type:?}, expected: {expected_err}, got: {err}"
2454+ ) ;
2455+ }
24512456 }
24522457 }
24532458
24542459 #[ test]
24552460 fn test_invalid_utf8_string_view_array ( ) { // every invalid fixture, cast to a view array and written with every string encoding, must fail to read back
24562461 let cases = [
2457- (
2458- invalid_utf8_first_char :: < i32 > ( ) ,
2459- "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 11" ,
2460- ) ,
2461- (
2462- invalid_utf8_later_char :: < i32 > ( ) ,
2463- "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 14" ,
2464- ) ,
2462+ invalid_utf8_first_char :: < i32 > ( ) ,
2463+ invalid_utf8_first_char_long_strings :: < i32 > ( ) ,
2464+ invalid_utf8_later_char :: < i32 > ( ) ,
2465+ invalid_utf8_later_char_long_strings :: < i32 > ( ) ,
2466+ invalid_utf8_later_char_really_long_strings :: < i32 > ( ) ,
2467+ invalid_utf8_later_char_really_long_strings2 :: < i32 > ( ) ,
24652468 ] ;
2466- for ( array, expected_error) in cases {
2467- let array = arrow_cast:: cast ( & array, & ArrowDataType :: BinaryView ) . unwrap ( ) ;
2468- let array = array. as_binary_view ( ) ;
2469-
2470- // data is not valid utf8 we can not construct a correct StringArray
2471- // safely, so purposely create an invalid StringArray
2472- let array = unsafe {
2473- StringViewArray :: new_unchecked (
2474- array. views ( ) . clone ( ) ,
2475- array. data_buffers ( ) . to_vec ( ) ,
2476- array. nulls ( ) . cloned ( ) ,
2477- )
2478- } ;
2479- let data_type = array. data_type ( ) . clone ( ) ;
2480- let data = write_to_parquet ( Arc :: new ( array) ) ;
2481- let err = read_from_parquet ( data) . unwrap_err ( ) ;
2482- assert_eq ! ( err. to_string( ) , expected_error, "data type: {data_type:?}" )
2469+
2470+ for encoding in STRING_ENCODINGS {
2471+ for array in & cases {
2472+ let array = arrow_cast:: cast ( & array, & ArrowDataType :: BinaryView ) . unwrap ( ) ; // binary -> binary view first, so no UTF-8 check is involved yet
2473+ let array = array. as_binary_view ( ) ;
2474+
2475+ // The bytes are not valid UTF-8, so a StringViewArray cannot be built from them
2476+ // through the checked API; deliberately build an invalid one with `new_unchecked`
2477+ let array = unsafe {
2478+ StringViewArray :: new_unchecked (
2479+ array. views ( ) . clone ( ) ,
2480+ array. data_buffers ( ) . to_vec ( ) ,
2481+ array. nulls ( ) . cloned ( ) ,
2482+ )
2483+ } ;
2484+
2485+ let data_type = array. data_type ( ) . clone ( ) ;
2486+ let data = write_to_parquet_with_encoding ( Arc :: new ( array) , * encoding) ;
2487+ let err = read_from_parquet ( data) . unwrap_err ( ) ; // reading must fail; `unwrap_err` panics if it succeeds
2488+ let expected_err =
2489+ "Parquet argument error: Parquet error: encountered non UTF-8 data" ; // common prefix only; any trailing detail is ignored by `contains`
2490+ assert ! (
2491+ err. to_string( ) . contains( expected_err) ,
2492+ "data type: {data_type:?}, expected: {expected_err}, got: {err}"
2493+ ) ;
2494+ }
24832495 }
24842496 }
24852497
2498+ /// Encodings the invalid UTF-8 tests write string data with
2499+ const STRING_ENCODINGS : & [ Option < Encoding > ] = & [
2500+ None , // no explicit encoding: the writer is given no custom properties
2501+ Some ( Encoding :: PLAIN ) ,
2502+ Some ( Encoding :: DELTA_LENGTH_BYTE_ARRAY ) ,
2503+ Some ( Encoding :: DELTA_BYTE_ARRAY ) ,
2504+ ] ;
2505+
2506+ /// Invalid UTF-8 sequence in the first character (0xa0 is a continuation byte with no lead byte)
2507+ /// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
2508+ const INVALID_UTF8_FIRST_CHAR : & [ u8 ] = & [ 0xa0 , 0xa1 , 0x20 , 0x20 ] ;
2509+
2510+ /// Invalid UTF-8 sequence in a character other than the first (three ASCII spaces precede it)
2511+ /// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
2512+ const INVALID_UTF8_LATER_CHAR : & [ u8 ] = & [ 0x20 , 0x20 , 0x20 , 0xa0 , 0xa1 , 0x20 , 0x20 ] ;
2513+
24862514 /// Returns a BinaryArray (null, valid, null, invalid) whose last value has invalid UTF8 data in the first character
24872515 fn invalid_utf8_first_char < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2488- // invalid sequence in the first character
2489- // https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
24902516 let valid: & [ u8 ] = b" " ;
2491- let invalid: & [ u8 ] = & [ 0xa0 , 0xa1 , 0x20 , 0x20 ] ;
2517+ let invalid = INVALID_UTF8_FIRST_CHAR ; // shared fixture bytes; the very first byte is invalid
24922518 GenericBinaryArray :: < O > :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( invalid) ] )
24932519 }
24942520
2521+ /// Returns a BinaryArray (null, valid, null, invalid) whose last value starts with
2522+ /// invalid UTF8 data and is larger than 12 bytes, a size which is handled specially
2523+ /// when reading `ByteViewArray`s
2524+ fn invalid_utf8_first_char_long_strings < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2525+ let valid: & [ u8 ] = b" " ;
2526+ let mut invalid = vec ! [ ] ;
2527+ invalid. extend_from_slice ( INVALID_UTF8_FIRST_CHAR ) ; // invalid bytes go first so the FIRST character is the bad one
2528+ invalid. extend_from_slice ( b"ThisStringIsCertainlyLongerThan12Bytes" ) ; // valid padding that pushes the value past 12 bytes
2529+ GenericBinaryArray :: < O > :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( & invalid) ] )
2530+ }
2531+
24952532 /// Returns a BinaryArray (null, valid, null, invalid) whose last value has invalid
24962533 /// UTF8 data in a character other than the first (this is checked in a special codepath)
24972534 fn invalid_utf8_later_char < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2498- // invalid sequence in NOT the first character
2499- // https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
25002535 let valid: & [ u8 ] = b" " ;
2501- let invalid: & [ u8 ] = & [ 0x20 , 0x20 , 0x20 , 0xa0 , 0xa1 , 0x20 , 0x20 ] ;
2536+ let invalid: & [ u8 ] = INVALID_UTF8_LATER_CHAR ; // shared fixture bytes; ASCII spaces precede the invalid bytes
25022537 GenericBinaryArray :: < O > :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( invalid) ] )
25032538 }
25042539
2505- // writes the array into a single column parquet file
2506- fn write_to_parquet ( array : ArrayRef ) -> Vec < u8 > {
2540+ /// Returns a BinaryArray (null, valid, null, invalid) whose last value has invalid UTF8
2541+ /// data in a character other than the first, in a string larger than 12 bytes which is
2542+ /// handled specially when reading `ByteViewArray`s (this is checked in a special codepath)
2543+ fn invalid_utf8_later_char_long_strings < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2544+ let valid: & [ u8 ] = b" " ;
2545+ let mut invalid = vec ! [ ] ;
2546+ invalid. extend_from_slice ( b"ThisStringIsCertainlyLongerThan12Bytes" ) ; // 38 valid bytes ahead of the invalid ones
2547+ invalid. extend_from_slice ( INVALID_UTF8_LATER_CHAR ) ;
2548+ GenericBinaryArray :: < O > :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( & invalid) ] )
2549+ }
2550+
2551+ /// Returns a BinaryArray (null, valid, null, invalid) whose last value has invalid UTF8
2552+ /// data in a character other than the first, in a string larger than 128 bytes which is
2553+ /// handled specially when reading `ByteViewArray`s (this is checked in a special codepath)
2554+ fn invalid_utf8_later_char_really_long_strings < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2555+ let valid: & [ u8 ] = b" " ;
2556+ let mut invalid = vec ! [ ] ;
2557+ for _ in 0 ..10 {
2558+ // each instance is 38 bytes, so the valid prefix is 380 bytes in total
2559+ invalid. extend_from_slice ( b"ThisStringIsCertainlyLongerThan12Bytes" ) ;
2560+ }
2561+ invalid. extend_from_slice ( INVALID_UTF8_LATER_CHAR ) ; // invalid bytes come only after the long valid prefix
2562+ GenericBinaryArray :: < O > :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( & invalid) ] )
2563+ }
2564+
2565+ /// Returns a BinaryArray with a small value containing invalid UTF8 data (in a character
2566+ /// other than the first) that is followed, later in the array, by a large VALID value
2567+ fn invalid_utf8_later_char_really_long_strings2 < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2568+ let valid: & [ u8 ] = b" " ;
2569+ let mut valid_long = vec ! [ ] ;
2570+ for _ in 0 ..10 {
2571+ // each instance is 38 bytes, so the long valid value is 380 bytes in total
2572+ valid_long. extend_from_slice ( b"ThisStringIsCertainlyLongerThan12Bytes" ) ;
2573+ }
2574+ let invalid = INVALID_UTF8_LATER_CHAR ; // the only invalid value in this array
2575+ GenericBinaryArray :: < O > :: from_iter ( vec ! [
2576+ None ,
2577+ Some ( valid) ,
2578+ Some ( invalid) ,
2579+ None ,
2580+ Some ( & valid_long) ,
2581+ Some ( valid) ,
2582+ ] )
2583+ }
2584+
2585+ /// writes the array into a single column parquet file with the specified
2586+ /// encoding.
2587+ ///
2588+ /// If no encoding is specified, use default (dictionary) encoding
2589+ fn write_to_parquet_with_encoding ( array : ArrayRef , encoding : Option < Encoding > ) -> Vec < u8 > {
25072590 let batch = RecordBatch :: try_from_iter ( vec ! [ ( "c" , array) ] ) . unwrap ( ) ;
25082591 let mut data = vec ! [ ] ;
25092592 let schema = batch. schema ( ) ;
2510- let props = None ;
2593+ let props = encoding. map ( |encoding| {
2594+ WriterProperties :: builder ( )
2595+ // must disable dictionary encoding to actually use encoding
2596+ . set_dictionary_enabled ( false )
2597+ . set_encoding ( encoding)
2598+ . build ( )
2599+ } ) ;
2600+
25112601 {
25122602 let mut writer = ArrowWriter :: try_new ( & mut data, schema, props) . unwrap ( ) ;
25132603 writer. write ( & batch) . unwrap ( ) ;
0 commit comments