Skip to content

Commit 57f79c0

Browse files
authored
Enable casting of string to timestamp with microsecond resolution (#3752)
* Enable casting of string to timestamp with microsecond resolution
* Enable string conversion to timestamp with second and millisecond resolution
1 parent 9699e1d commit 57f79c0

File tree

1 file changed

+101
-27
lines changed

1 file changed

+101
-27
lines changed

arrow-cast/src/cast.rs

Lines changed: 101 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
166166
| Time32(TimeUnit::Millisecond)
167167
| Time64(TimeUnit::Microsecond)
168168
| Time64(TimeUnit::Nanosecond)
169+
| Timestamp(TimeUnit::Second, _)
170+
| Timestamp(TimeUnit::Millisecond, _)
171+
| Timestamp(TimeUnit::Microsecond, _)
169172
| Timestamp(TimeUnit::Nanosecond, _)
170173
) => true,
171174
(Utf8, _) => to_type.is_numeric() && to_type != &Float16,
@@ -179,6 +182,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
179182
| Time32(TimeUnit::Millisecond)
180183
| Time64(TimeUnit::Microsecond)
181184
| Time64(TimeUnit::Nanosecond)
185+
| Timestamp(TimeUnit::Second, _)
186+
| Timestamp(TimeUnit::Millisecond, _)
187+
| Timestamp(TimeUnit::Microsecond, _)
182188
| Timestamp(TimeUnit::Nanosecond, _)
183189
) => true,
184190
(LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
@@ -1141,8 +1147,17 @@ pub fn cast_with_options(
11411147
Time64(TimeUnit::Nanosecond) => {
11421148
cast_string_to_time64nanosecond::<i32>(array, cast_options)
11431149
}
1150+
Timestamp(TimeUnit::Second, _) => {
1151+
cast_string_to_timestamp::<i32, TimestampSecondType>(array, cast_options)
1152+
}
1153+
Timestamp(TimeUnit::Millisecond, _) => {
1154+
cast_string_to_timestamp::<i32, TimestampMillisecondType>(array, cast_options)
1155+
}
1156+
Timestamp(TimeUnit::Microsecond, _) => {
1157+
cast_string_to_timestamp::<i32, TimestampMicrosecondType>(array, cast_options)
1158+
}
11441159
Timestamp(TimeUnit::Nanosecond, _) => {
1145-
cast_string_to_timestamp_ns::<i32>(array, cast_options)
1160+
cast_string_to_timestamp::<i32, TimestampNanosecondType>(array, cast_options)
11461161
}
11471162
_ => Err(ArrowError::CastError(format!(
11481163
"Casting from {from_type:?} to {to_type:?} not supported",
@@ -1182,8 +1197,17 @@ pub fn cast_with_options(
11821197
Time64(TimeUnit::Nanosecond) => {
11831198
cast_string_to_time64nanosecond::<i64>(array, cast_options)
11841199
}
1200+
Timestamp(TimeUnit::Second, _) => {
1201+
cast_string_to_timestamp::<i64, TimestampSecondType>(array, cast_options)
1202+
}
1203+
Timestamp(TimeUnit::Millisecond, _) => {
1204+
cast_string_to_timestamp::<i64, TimestampMillisecondType>(array, cast_options)
1205+
}
1206+
Timestamp(TimeUnit::Microsecond, _) => {
1207+
cast_string_to_timestamp::<i64, TimestampMicrosecondType>(array, cast_options)
1208+
}
11851209
Timestamp(TimeUnit::Nanosecond, _) => {
1186-
cast_string_to_timestamp_ns::<i64>(array, cast_options)
1210+
cast_string_to_timestamp::<i64, TimestampNanosecondType>(array, cast_options)
11871211
}
11881212
_ => Err(ArrowError::CastError(format!(
11891213
"Casting from {from_type:?} to {to_type:?} not supported",
@@ -2552,8 +2576,11 @@ fn cast_string_to_time64nanosecond<Offset: OffsetSizeTrait>(
25522576
Ok(Arc::new(array) as ArrayRef)
25532577
}
25542578

2555-
/// Casts generic string arrays to TimeStampNanosecondArray
2556-
fn cast_string_to_timestamp_ns<Offset: OffsetSizeTrait>(
2579+
/// Casts generic string arrays to an ArrowTimestampType (TimestampNanosecondArray, etc.)
2580+
fn cast_string_to_timestamp<
2581+
Offset: OffsetSizeTrait,
2582+
TimestampType: ArrowTimestampType<Native = i64>,
2583+
>(
25572584
array: &dyn Array,
25582585
cast_options: &CastOptions,
25592586
) -> Result<ArrayRef, ArrowError> {
@@ -2562,26 +2589,36 @@ fn cast_string_to_timestamp_ns<Offset: OffsetSizeTrait>(
25622589
.downcast_ref::<GenericStringArray<Offset>>()
25632590
.unwrap();
25642591

2592+
let scale_factor = match TimestampType::get_time_unit() {
2593+
TimeUnit::Second => 1_000_000_000,
2594+
TimeUnit::Millisecond => 1_000_000,
2595+
TimeUnit::Microsecond => 1_000,
2596+
TimeUnit::Nanosecond => 1,
2597+
};
2598+
25652599
let array = if cast_options.safe {
2566-
let iter = string_array
2567-
.iter()
2568-
.map(|v| v.and_then(|v| string_to_timestamp_nanos(v).ok()));
2600+
let iter = string_array.iter().map(|v| {
2601+
v.and_then(|v| string_to_timestamp_nanos(v).ok().map(|t| t / scale_factor))
2602+
});
25692603
// Benefit:
25702604
// 20% performance improvement
25712605
// Soundness:
25722606
// The iterator is trustedLen because it comes from a `StringArray`.
2573-
unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter) }
2607+
unsafe { PrimitiveArray::<TimestampType>::from_trusted_len_iter(iter) }
25742608
} else {
25752609
let vec = string_array
25762610
.iter()
2577-
.map(|v| v.map(string_to_timestamp_nanos).transpose())
2611+
.map(|v| {
2612+
v.map(|v| string_to_timestamp_nanos(v).map(|t| t / scale_factor))
2613+
.transpose()
2614+
})
25782615
.collect::<Result<Vec<Option<i64>>, _>>()?;
25792616

25802617
// Benefit:
25812618
// 20% performance improvement
25822619
// Soundness:
25832620
// The iterator is trustedLen because it comes from a `Vec`.
2584-
unsafe { TimestampNanosecondArray::from_trusted_len_iter(vec.iter()) }
2621+
unsafe { PrimitiveArray::<TimestampType>::from_trusted_len_iter(vec.iter()) }
25852622
};
25862623

25872624
Ok(Arc::new(array) as ArrayRef)
@@ -4704,32 +4741,69 @@ mod tests {
47044741
#[test]
47054742
fn test_cast_string_to_timestamp() {
47064743
let a1 = Arc::new(StringArray::from(vec![
4707-
Some("2020-09-08T12:00:00+00:00"),
4744+
Some("2020-09-08T12:00:00.123456789+00:00"),
47084745
Some("Not a valid date"),
47094746
None,
47104747
])) as ArrayRef;
47114748
let a2 = Arc::new(LargeStringArray::from(vec![
4712-
Some("2020-09-08T12:00:00+00:00"),
4749+
Some("2020-09-08T12:00:00.123456789+00:00"),
47134750
Some("Not a valid date"),
47144751
None,
47154752
])) as ArrayRef;
47164753
for array in &[a1, a2] {
4717-
let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
4718-
let b = cast(array, &to_type).unwrap();
4719-
let c = b
4720-
.as_any()
4721-
.downcast_ref::<TimestampNanosecondArray>()
4722-
.unwrap();
4723-
assert_eq!(1599566400000000000, c.value(0));
4724-
assert!(c.is_null(1));
4725-
assert!(c.is_null(2));
4754+
for time_unit in &[
4755+
TimeUnit::Second,
4756+
TimeUnit::Millisecond,
4757+
TimeUnit::Microsecond,
4758+
TimeUnit::Nanosecond,
4759+
] {
4760+
let to_type = DataType::Timestamp(time_unit.clone(), None);
4761+
let b = cast(array, &to_type).unwrap();
4762+
4763+
match time_unit {
4764+
TimeUnit::Second => {
4765+
let c =
4766+
b.as_any().downcast_ref::<TimestampSecondArray>().unwrap();
4767+
assert_eq!(1599566400, c.value(0));
4768+
assert!(c.is_null(1));
4769+
assert!(c.is_null(2));
4770+
}
4771+
TimeUnit::Millisecond => {
4772+
let c = b
4773+
.as_any()
4774+
.downcast_ref::<TimestampMillisecondArray>()
4775+
.unwrap();
4776+
assert_eq!(1599566400123, c.value(0));
4777+
assert!(c.is_null(1));
4778+
assert!(c.is_null(2));
4779+
}
4780+
TimeUnit::Microsecond => {
4781+
let c = b
4782+
.as_any()
4783+
.downcast_ref::<TimestampMicrosecondArray>()
4784+
.unwrap();
4785+
assert_eq!(1599566400123456, c.value(0));
4786+
assert!(c.is_null(1));
4787+
assert!(c.is_null(2));
4788+
}
4789+
TimeUnit::Nanosecond => {
4790+
let c = b
4791+
.as_any()
4792+
.downcast_ref::<TimestampNanosecondArray>()
4793+
.unwrap();
4794+
assert_eq!(1599566400123456789, c.value(0));
4795+
assert!(c.is_null(1));
4796+
assert!(c.is_null(2));
4797+
}
4798+
}
47264799

4727-
let options = CastOptions { safe: false };
4728-
let err = cast_with_options(array, &to_type, &options).unwrap_err();
4729-
assert_eq!(
4730-
err.to_string(),
4731-
"Cast error: Error parsing 'Not a valid date' as timestamp"
4732-
);
4800+
let options = CastOptions { safe: false };
4801+
let err = cast_with_options(array, &to_type, &options).unwrap_err();
4802+
assert_eq!(
4803+
err.to_string(),
4804+
"Cast error: Error parsing 'Not a valid date' as timestamp"
4805+
);
4806+
}
47334807
}
47344808
}
47354809

0 commit comments

Comments
 (0)