@@ -1180,7 +1180,7 @@ pub fn cast_with_options(
11801180 }
11811181 Date32 => cast_date32_to_string :: < i32 > ( array) ,
11821182 Date64 => cast_date64_to_string :: < i32 > ( array) ,
1183- Binary => cast_binary_to_generic_string :: < i32 , i32 > ( array, cast_options) ,
1183+ Binary => cast_binary_to_string :: < i32 > ( array, cast_options) ,
11841184 LargeBinary => cast_binary_to_generic_string :: < i64 , i32 > ( array, cast_options) ,
11851185 _ => Err ( ArrowError :: CastError ( format ! (
11861186 "Casting from {from_type:?} to {to_type:?} not supported" ,
@@ -1215,7 +1215,7 @@ pub fn cast_with_options(
12151215 Date32 => cast_date32_to_string :: < i64 > ( array) ,
12161216 Date64 => cast_date64_to_string :: < i64 > ( array) ,
12171217 Binary => cast_binary_to_generic_string :: < i32 , i64 > ( array, cast_options) ,
1218- LargeBinary => cast_binary_to_generic_string :: < i64 , i64 > ( array, cast_options) ,
1218+ LargeBinary => cast_binary_to_string :: < i64 > ( array, cast_options) ,
12191219 _ => Err ( ArrowError :: CastError ( format ! (
12201220 "Casting from {from_type:?} to {to_type:?} not supported" ,
12211221 ) ) ) ,
@@ -3392,6 +3392,66 @@ fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
33923392 Ok ( Arc :: new ( list) as ArrayRef )
33933393}
33943394
3395+ /// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
3396+ /// offset size so re-encoding offset is unnecessary.
3397+ fn cast_binary_to_string < O > (
3398+ array : & dyn Array ,
3399+ cast_options : & CastOptions ,
3400+ ) -> Result < ArrayRef , ArrowError >
3401+ where
3402+ O : OffsetSizeTrait + ToPrimitive ,
3403+ {
3404+ let array = array
3405+ . as_any ( )
3406+ . downcast_ref :: < GenericByteArray < GenericBinaryType < O > > > ( )
3407+ . unwrap ( ) ;
3408+
3409+ if !cast_options. safe {
3410+ let offsets = array. value_offsets ( ) ;
3411+ let values = array. value_data ( ) ;
3412+
3413+ // We only need to validate that all values are valid UTF-8
3414+ let validated = std:: str:: from_utf8 ( values)
3415+ . map_err ( |_| ArrowError :: CastError ( "Invalid UTF-8 sequence" . to_string ( ) ) ) ?;
3416+ // Checks if the offsets are valid but does not re-encode
3417+ for offset in offsets. iter ( ) {
3418+ if !validated. is_char_boundary ( offset. as_usize ( ) ) {
3419+ return Err ( ArrowError :: CastError ( "Invalid UTF-8 sequence" . to_string ( ) ) ) ;
3420+ }
3421+ }
3422+
3423+ let builder = array
3424+ . into_data ( )
3425+ . into_builder ( )
3426+ . data_type ( GenericStringArray :: < O > :: DATA_TYPE ) ;
3427+ // SAFETY:
3428+ // Validated UTF-8 above
3429+ Ok ( Arc :: new ( GenericStringArray :: < O > :: from ( unsafe {
3430+ builder. build_unchecked ( )
3431+ } ) ) )
3432+ } else {
3433+ let mut null_builder = BooleanBufferBuilder :: new ( array. len ( ) ) ;
3434+ array. iter ( ) . for_each ( |maybe_value| {
3435+ null_builder. append (
3436+ maybe_value
3437+ . and_then ( |value| std:: str:: from_utf8 ( value) . ok ( ) )
3438+ . is_some ( ) ,
3439+ ) ;
3440+ } ) ;
3441+
3442+ let builder = array
3443+ . into_data ( )
3444+ . into_builder ( )
3445+ . null_bit_buffer ( Some ( null_builder. finish ( ) ) )
3446+ . data_type ( GenericStringArray :: < O > :: DATA_TYPE ) ;
3447+ // SAFETY:
3448+ // Validated UTF-8 above
3449+ Ok ( Arc :: new ( GenericStringArray :: < O > :: from ( unsafe {
3450+ builder. build_unchecked ( )
3451+ } ) ) )
3452+ }
3453+ }
3454+
33953455/// Helper function to cast from `GenericBinaryArray` to `GenericStringArray`. This function performs
33963456/// UTF-8 validation during casting. An invalid UTF-8 value either becomes null or causes an `Err` to be
33973457/// returned, depending on `CastOptions`.
@@ -3417,6 +3477,7 @@ where
34173477 . map_err ( |_| ArrowError :: CastError ( "Invalid UTF-8 sequence" . to_string ( ) ) ) ?;
34183478
34193479 let mut offset_builder = BufferBuilder :: < O > :: new ( offsets. len ( ) ) ;
3480+ // Checks if the offset is a valid char boundary and re-encode the offset
34203481 offsets
34213482 . iter ( )
34223483 . try_for_each :: < _ , Result < _ , ArrowError > > ( |offset| {
0 commit comments