Skip to content

Commit dd16811

Browse files
authored
Specified version of helper function to cast binary to string (#3624)
* Specified version of helper function to cast binary to string * Simplify it
1 parent e80d87f commit dd16811

File tree

1 file changed

+63
-2
lines changed

1 file changed

+63
-2
lines changed

arrow-cast/src/cast.rs

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1180,7 +1180,7 @@ pub fn cast_with_options(
11801180
}
11811181
Date32 => cast_date32_to_string::<i32>(array),
11821182
Date64 => cast_date64_to_string::<i32>(array),
1183-
Binary => cast_binary_to_generic_string::<i32, i32>(array, cast_options),
1183+
Binary => cast_binary_to_string::<i32>(array, cast_options),
11841184
LargeBinary => cast_binary_to_generic_string::<i64, i32>(array, cast_options),
11851185
_ => Err(ArrowError::CastError(format!(
11861186
"Casting from {from_type:?} to {to_type:?} not supported",
@@ -1215,7 +1215,7 @@ pub fn cast_with_options(
12151215
Date32 => cast_date32_to_string::<i64>(array),
12161216
Date64 => cast_date64_to_string::<i64>(array),
12171217
Binary => cast_binary_to_generic_string::<i32, i64>(array, cast_options),
1218-
LargeBinary => cast_binary_to_generic_string::<i64, i64>(array, cast_options),
1218+
LargeBinary => cast_binary_to_string::<i64>(array, cast_options),
12191219
_ => Err(ArrowError::CastError(format!(
12201220
"Casting from {from_type:?} to {to_type:?} not supported",
12211221
))),
@@ -3392,6 +3392,66 @@ fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
33923392
Ok(Arc::new(list) as ArrayRef)
33933393
}
33943394

3395+
/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
3396+
/// offset size so re-encoding offset is unnecessary.
3397+
fn cast_binary_to_string<O>(
3398+
array: &dyn Array,
3399+
cast_options: &CastOptions,
3400+
) -> Result<ArrayRef, ArrowError>
3401+
where
3402+
O: OffsetSizeTrait + ToPrimitive,
3403+
{
3404+
let array = array
3405+
.as_any()
3406+
.downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
3407+
.unwrap();
3408+
3409+
if !cast_options.safe {
3410+
let offsets = array.value_offsets();
3411+
let values = array.value_data();
3412+
3413+
// We only need to validate that all values are valid UTF-8
3414+
let validated = std::str::from_utf8(values)
3415+
.map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?;
3416+
// Checks if the offsets are valid but does not re-encode
3417+
for offset in offsets.iter() {
3418+
if !validated.is_char_boundary(offset.as_usize()) {
3419+
return Err(ArrowError::CastError("Invalid UTF-8 sequence".to_string()));
3420+
}
3421+
}
3422+
3423+
let builder = array
3424+
.into_data()
3425+
.into_builder()
3426+
.data_type(GenericStringArray::<O>::DATA_TYPE);
3427+
// SAFETY:
3428+
// Validated UTF-8 above
3429+
Ok(Arc::new(GenericStringArray::<O>::from(unsafe {
3430+
builder.build_unchecked()
3431+
})))
3432+
} else {
3433+
let mut null_builder = BooleanBufferBuilder::new(array.len());
3434+
array.iter().for_each(|maybe_value| {
3435+
null_builder.append(
3436+
maybe_value
3437+
.and_then(|value| std::str::from_utf8(value).ok())
3438+
.is_some(),
3439+
);
3440+
});
3441+
3442+
let builder = array
3443+
.into_data()
3444+
.into_builder()
3445+
.null_bit_buffer(Some(null_builder.finish()))
3446+
.data_type(GenericStringArray::<O>::DATA_TYPE);
3447+
// SAFETY:
3448+
// Validated UTF-8 above
3449+
Ok(Arc::new(GenericStringArray::<O>::from(unsafe {
3450+
builder.build_unchecked()
3451+
})))
3452+
}
3453+
}
3454+
33953455
/// Helper function to cast from `GenericBinaryArray` to `GenericStringArray`. This function performs
33963456
/// UTF8 validation during casting. An invalid UTF8 value either becomes null or yields an `Err`, depending on
33973457
/// `CastOptions`.
@@ -3417,6 +3477,7 @@ where
34173477
.map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?;
34183478

34193479
let mut offset_builder = BufferBuilder::<O>::new(offsets.len());
3480+
// Check that each offset is a valid char boundary and re-encode it with the target offset size
34203481
offsets
34213482
.iter()
34223483
.try_for_each::<_, Result<_, ArrowError>>(|offset| {

0 commit comments

Comments
 (0)