Skip to content

Commit 519866c

Browse files
alambneilconway
andauthored
[branch-53] perf: Optimize to_char to allocate less, fix NULL handling (apache#20635) (apache#20885)
- Part of apache#19692 - Closes apache#20634 on branch-53 This PR: - Backports apache#20635 from @neilconway to the branch-53 line Co-authored-by: Neil Conway <neil.conway@gmail.com>
1 parent ab7507f commit 519866c

3 files changed

Lines changed: 130 additions & 164 deletions

File tree

datafusion/functions/benches/to_char.rs

Lines changed: 24 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,12 @@
1818
use std::hint::black_box;
1919
use std::sync::Arc;
2020

21-
use arrow::array::{ArrayRef, Date32Array, StringArray};
21+
use arrow::array::{ArrayRef, Date32Array, Date64Array, StringArray};
2222
use arrow::datatypes::{DataType, Field};
2323
use chrono::TimeDelta;
2424
use chrono::prelude::*;
2525
use criterion::{Criterion, criterion_group, criterion_main};
2626
use datafusion_common::ScalarValue;
27-
use datafusion_common::ScalarValue::TimestampNanosecond;
2827
use datafusion_common::config::ConfigOptions;
2928
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
3029
use datafusion_functions::datetime::to_char;
@@ -63,6 +62,26 @@ fn generate_date32_array(rng: &mut ThreadRng) -> Date32Array {
6362
Date32Array::from(data)
6463
}
6564

65+
fn generate_date64_array(rng: &mut ThreadRng) -> Date64Array {
66+
let start_date = "1970-01-01"
67+
.parse::<NaiveDate>()
68+
.expect("Date should parse");
69+
let end_date = "2050-12-31"
70+
.parse::<NaiveDate>()
71+
.expect("Date should parse");
72+
let mut data: Vec<i64> = Vec::with_capacity(1000);
73+
for _ in 0..1000 {
74+
let date = pick_date_in_range(rng, start_date, end_date);
75+
let millis = date
76+
.and_hms_opt(0, 0, 0)
77+
.unwrap()
78+
.and_utc()
79+
.timestamp_millis();
80+
data.push(millis);
81+
}
82+
Date64Array::from(data)
83+
}
84+
6685
const DATE_PATTERNS: [&str; 5] =
6786
["%Y:%m:%d", "%d-%m-%Y", "%d%m%Y", "%Y%m%d", "%Y...%m...%d"];
6887

@@ -155,7 +174,7 @@ fn criterion_benchmark(c: &mut Criterion) {
155174

156175
c.bench_function("to_char_array_datetime_patterns_1000", |b| {
157176
let mut rng = rand::rng();
158-
let data_arr = generate_date32_array(&mut rng);
177+
let data_arr = generate_date64_array(&mut rng);
159178
let batch_len = data_arr.len();
160179
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
161180
let patterns = ColumnarValue::Array(Arc::new(generate_datetime_pattern_array(
@@ -182,7 +201,7 @@ fn criterion_benchmark(c: &mut Criterion) {
182201

183202
c.bench_function("to_char_array_mixed_patterns_1000", |b| {
184203
let mut rng = rand::rng();
185-
let data_arr = generate_date32_array(&mut rng);
204+
let data_arr = generate_date64_array(&mut rng);
186205
let batch_len = data_arr.len();
187206
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
188207
let patterns = ColumnarValue::Array(Arc::new(generate_mixed_pattern_array(
@@ -235,7 +254,7 @@ fn criterion_benchmark(c: &mut Criterion) {
235254

236255
c.bench_function("to_char_scalar_datetime_pattern_1000", |b| {
237256
let mut rng = rand::rng();
238-
let data_arr = generate_date32_array(&mut rng);
257+
let data_arr = generate_date64_array(&mut rng);
239258
let batch_len = data_arr.len();
240259
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
241260
let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some(
@@ -259,38 +278,6 @@ fn criterion_benchmark(c: &mut Criterion) {
259278
)
260279
})
261280
});
262-
263-
c.bench_function("to_char_scalar_1000", |b| {
264-
let mut rng = rand::rng();
265-
let timestamp = "2026-07-08T09:10:11"
266-
.parse::<NaiveDateTime>()
267-
.unwrap()
268-
.with_nanosecond(56789)
269-
.unwrap()
270-
.and_utc()
271-
.timestamp_nanos_opt()
272-
.unwrap();
273-
let data = ColumnarValue::Scalar(TimestampNanosecond(Some(timestamp), None));
274-
let pattern =
275-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng))));
276-
277-
b.iter(|| {
278-
black_box(
279-
to_char()
280-
.invoke_with_args(ScalarFunctionArgs {
281-
args: vec![data.clone(), pattern.clone()],
282-
arg_fields: vec![
283-
Field::new("a", data.data_type(), true).into(),
284-
Field::new("b", pattern.data_type(), true).into(),
285-
],
286-
number_rows: 1,
287-
return_field: Field::new("f", DataType::Utf8, true).into(),
288-
config_options: Arc::clone(&config_options),
289-
})
290-
.expect("to_char should work on valid values"),
291-
)
292-
})
293-
});
294281
}
295282

296283
criterion_group!(benches, criterion_benchmark);

datafusion/functions/src/datetime/to_char.rs

Lines changed: 89 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@
1818
use std::any::Any;
1919
use std::sync::Arc;
2020

21+
use arrow::array::builder::StringBuilder;
2122
use arrow::array::cast::AsArray;
22-
use arrow::array::{Array, ArrayRef, StringArray, new_null_array};
23+
use arrow::array::{Array, ArrayRef};
2324
use arrow::compute::cast;
2425
use arrow::datatypes::DataType;
2526
use arrow::datatypes::DataType::{
2627
Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8,
2728
};
2829
use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
29-
use arrow::error::ArrowError;
3030
use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions};
3131
use datafusion_common::{Result, ScalarValue, exec_err, utils::take_function_args};
3232
use datafusion_expr::TypeSignature::Exact;
@@ -143,20 +143,17 @@ impl ScalarUDFImpl for ToCharFunc {
143143
let [date_time, format] = take_function_args(self.name(), &args)?;
144144

145145
match format {
146-
ColumnarValue::Scalar(ScalarValue::Utf8(None))
147-
| ColumnarValue::Scalar(ScalarValue::Null) => to_char_scalar(date_time, None),
148-
// constant format
149-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => {
150-
// invoke to_char_scalar with the known string, without converting to array
151-
to_char_scalar(date_time, Some(format))
146+
ColumnarValue::Scalar(ScalarValue::Null | ScalarValue::Utf8(None)) => {
147+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
152148
}
153-
ColumnarValue::Array(_) => to_char_array(&args),
154-
_ => {
155-
exec_err!(
156-
"Format for `to_char` must be non-null Utf8, received {}",
157-
format.data_type()
158-
)
149+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(fmt))) => {
150+
to_char_scalar(date_time, fmt)
159151
}
152+
ColumnarValue::Array(_) => to_char_array(&args),
153+
_ => exec_err!(
154+
"Format for `to_char` must be non-null Utf8, received {}",
155+
format.data_type()
156+
),
160157
}
161158
}
162159

@@ -171,11 +168,8 @@ impl ScalarUDFImpl for ToCharFunc {
171168

172169
fn build_format_options<'a>(
173170
data_type: &DataType,
174-
format: Option<&'a str>,
175-
) -> Result<FormatOptions<'a>, Result<ColumnarValue>> {
176-
let Some(format) = format else {
177-
return Ok(FormatOptions::new());
178-
};
171+
format: &'a str,
172+
) -> Result<FormatOptions<'a>> {
179173
let format_options = match data_type {
180174
Date32 => FormatOptions::new()
181175
.with_date_format(Some(format))
@@ -194,144 +188,114 @@ fn build_format_options<'a>(
194188
},
195189
),
196190
other => {
197-
return Err(exec_err!(
191+
return exec_err!(
198192
"to_char only supports date, time, timestamp and duration data types, received {other:?}"
199-
));
193+
);
200194
}
201195
};
202196
Ok(format_options)
203197
}
204198

205-
/// Special version when arg\[1] is a scalar
206-
fn to_char_scalar(
207-
expression: &ColumnarValue,
208-
format: Option<&str>,
209-
) -> Result<ColumnarValue> {
210-
// it's possible that the expression is a scalar however because
211-
// of the implementation in arrow-rs we need to convert it to an array
199+
/// Formats `expression` using a constant `format` string.
200+
fn to_char_scalar(expression: &ColumnarValue, format: &str) -> Result<ColumnarValue> {
201+
// ArrayFormatter requires an array, so scalar expressions must be
202+
// converted to a 1-element array first.
212203
let data_type = &expression.data_type();
213204
let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_));
214-
let array = expression.clone().into_array(1)?;
205+
let array = expression.to_array(1)?;
215206

216-
if format.is_none() {
217-
return if is_scalar_expression {
218-
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
219-
} else {
220-
Ok(ColumnarValue::Array(new_null_array(&Utf8, array.len())))
221-
};
222-
}
207+
let format_options = build_format_options(data_type, format)?;
208+
let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
223209

224-
let format_options = match build_format_options(data_type, format) {
225-
Ok(value) => value,
226-
Err(value) => return value,
227-
};
210+
// Pad the preallocated capacity a bit because format specifiers often
211+
// expand the string (e.g., %Y -> "2026")
212+
let fmt_len = format.len() + 10;
213+
let mut builder = StringBuilder::with_capacity(array.len(), array.len() * fmt_len);
228214

229-
let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
230-
let formatted: Result<Vec<Option<String>>, ArrowError> = (0..array.len())
231-
.map(|i| {
232-
if array.is_null(i) {
233-
Ok(None)
234-
} else {
235-
formatter.value(i).try_to_string().map(Some)
236-
}
237-
})
238-
.collect();
239-
240-
if let Ok(formatted) = formatted {
241-
if is_scalar_expression {
242-
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
243-
formatted.first().unwrap().clone(),
244-
)))
215+
for i in 0..array.len() {
216+
if array.is_null(i) {
217+
builder.append_null();
245218
} else {
246-
Ok(ColumnarValue::Array(
247-
Arc::new(StringArray::from(formatted)) as ArrayRef
248-
))
249-
}
250-
} else {
251-
// if the data type was a Date32, formatting could have failed because the format string
252-
// contained datetime specifiers, so we'll retry by casting the date array as a timestamp array
253-
if data_type == &Date32 {
254-
return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
219+
// Write directly into the builder's internal buffer, then
220+
// commit the value with append_value("").
221+
match formatter.value(i).write(&mut builder) {
222+
Ok(()) => builder.append_value(""),
223+
// Arrow's Date32 formatter only handles date specifiers
224+
// (%Y, %m, %d, ...). Format strings with time specifiers
225+
// (%H, %M, %S, ...) cause it to fail. When this happens,
226+
// we retry by casting to Date64, whose datetime formatter
227+
// handles both date and time specifiers (with zero for
228+
// the time components).
229+
Err(_) if data_type == &Date32 => {
230+
return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
231+
}
232+
Err(e) => return Err(e.into()),
233+
}
255234
}
235+
}
256236

257-
exec_err!("{}", formatted.unwrap_err())
237+
let result = builder.finish();
238+
if is_scalar_expression {
239+
let val = result.is_valid(0).then(|| result.value(0).to_string());
240+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
241+
} else {
242+
Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
258243
}
259244
}
260245

261246
fn to_char_array(args: &[ColumnarValue]) -> Result<ColumnarValue> {
262247
let arrays = ColumnarValue::values_to_arrays(args)?;
263-
let mut results: Vec<Option<String>> = vec![];
248+
let data_array = &arrays[0];
264249
let format_array = arrays[1].as_string::<i32>();
265-
let data_type = arrays[0].data_type();
250+
let data_type = data_array.data_type();
266251

267-
for idx in 0..arrays[0].len() {
268-
let format = if format_array.is_null(idx) {
269-
None
270-
} else {
271-
Some(format_array.value(idx))
272-
};
273-
if format.is_none() {
274-
results.push(None);
252+
// Arbitrary guess for the length of a typical formatted datetime string
253+
let fmt_len = 30;
254+
let mut builder =
255+
StringBuilder::with_capacity(data_array.len(), data_array.len() * fmt_len);
256+
let mut buffer = String::with_capacity(fmt_len);
257+
258+
for idx in 0..data_array.len() {
259+
if format_array.is_null(idx) || data_array.is_null(idx) {
260+
builder.append_null();
275261
continue;
276262
}
277-
let format_options = match build_format_options(data_type, format) {
278-
Ok(value) => value,
279-
Err(value) => return value,
280-
};
281-
// this isn't ideal but this can't use ValueFormatter as it isn't independent
282-
// from ArrayFormatter
283-
let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?;
284-
let result = formatter.value(idx).try_to_string();
285-
match result {
286-
Ok(value) => results.push(Some(value)),
287-
Err(e) => {
288-
// if the data type was a Date32, formatting could have failed because the format string
289-
// contained datetime specifiers, so we'll treat this specific date element as a timestamp
290-
if data_type == &Date32 {
291-
let failed_date_value = arrays[0].slice(idx, 1);
292-
293-
match retry_date_as_timestamp(&failed_date_value, &format_options) {
294-
Ok(value) => {
295-
results.push(Some(value));
296-
continue;
297-
}
298-
Err(e) => {
299-
return exec_err!("{}", e);
300-
}
301-
}
302-
}
303263

304-
return exec_err!("{}", e);
264+
let format = format_array.value(idx);
265+
let format_options = build_format_options(data_type, format)?;
266+
let formatter = ArrayFormatter::try_new(data_array.as_ref(), &format_options)?;
267+
268+
buffer.clear();
269+
270+
// We'd prefer to write directly to the StringBuilder's internal buffer,
271+
// but the write might fail, and there's no easy way to ensure a partial
272+
// write is removed from the buffer. So instead we write to a temporary
273+
// buffer and `append_value` on success.
274+
match formatter.value(idx).write(&mut buffer) {
275+
Ok(()) => builder.append_value(&buffer),
276+
// Retry with Date64 (see comment in to_char_scalar).
277+
Err(_) if data_type == &Date32 => {
278+
buffer.clear();
279+
let date64_value = cast(&data_array.slice(idx, 1), &Date64)?;
280+
let retry_fmt =
281+
ArrayFormatter::try_new(date64_value.as_ref(), &format_options)?;
282+
retry_fmt.value(0).write(&mut buffer)?;
283+
builder.append_value(&buffer);
305284
}
285+
Err(e) => return Err(e.into()),
306286
}
307287
}
308288

289+
let result = builder.finish();
309290
match args[0] {
310-
ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(StringArray::from(
311-
results,
312-
)) as ArrayRef)),
313-
ColumnarValue::Scalar(_) => match results.first().unwrap() {
314-
Some(value) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
315-
value.to_string(),
316-
)))),
317-
None => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
318-
},
291+
ColumnarValue::Scalar(_) => {
292+
let val = result.is_valid(0).then(|| result.value(0).to_string());
293+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
294+
}
295+
ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)),
319296
}
320297
}
321298

322-
fn retry_date_as_timestamp(
323-
array_ref: &ArrayRef,
324-
format_options: &FormatOptions,
325-
) -> Result<String> {
326-
let target_data_type = Date64;
327-
328-
let date_value = cast(&array_ref, &target_data_type)?;
329-
let formatter = ArrayFormatter::try_new(date_value.as_ref(), format_options)?;
330-
let result = formatter.value(0).try_to_string()?;
331-
332-
Ok(result)
333-
}
334-
335299
#[cfg(test)]
336300
mod tests {
337301
use crate::datetime::to_char::ToCharFunc;

0 commit comments

Comments
 (0)