Skip to content

Commit e34bd1a

Browse files
alambRich-T-kid
authored and committed
[main] add 52.4.0 changelog (apache#21053)
~(Draft until we have actually released 52.4.0)~ - part of apache#20855 Get the changelog and version update on main - Forward ports apache#21004 - Fix changelog issue noticed by @Omega359 in apache#20878 (comment) By CI New version / Change log Researching code flow defined test suite for GroupValuesDictionary first iteration on intern() all tests pass, first iteration done | need to run benchmarks fixed null handling & added test revised PR fix-ci optimized scalarvalues transitioned from scalarValue to raw hashes fixed regressions & added test Removed non string value types Revert "Removed non string value types" This reverts commit 3bcb8c7. introduced new data type & changed storage format so that we no longer need metadata about nulls All tests pass fix CI issues Revert "fix CI issues" This reverts commit e640a65. fix CI issues reduced PR fix lint Dictionary encoding: Optimize hash aggregation with i64 len prefixes
1 parent 0359a3c commit e34bd1a

5 files changed

Lines changed: 1031 additions & 6 deletions

File tree

datafusion/physical-plan/Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,11 @@ required-features = ["test_utils"]
106106
harness = false
107107
name = "aggregate_vectorized"
108108
required-features = ["test_utils"]
109+
110+
[[bench]]
111+
name = "single_column_aggr"
112+
harness = false
113+
114+
[profile.profiling]
115+
inherits = "release"
116+
debug = true

datafusion/physical-plan/src/aggregates/group_values/mod.rs

Lines changed: 61 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@
1818
//! [`GroupValues`] trait for storing and interning group keys
1919
2020
use arrow::array::types::{
21-
Date32Type, Date64Type, Decimal128Type, Time32MillisecondType, Time32SecondType,
22-
Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType,
23-
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
21+
Date32Type, Date64Type, Decimal128Type, Int8Type, Int16Type, Int32Type, Int64Type,
22+
Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType,
23+
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
24+
TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, UInt64Type,
2425
};
2526
use arrow::array::{ArrayRef, downcast_primitive};
2627
use arrow::datatypes::{DataType, SchemaRef, TimeUnit};
@@ -41,15 +42,15 @@ pub(crate) use single_group_by::primitive::HashValue;
4142
use crate::aggregates::{
4243
group_values::single_group_by::{
4344
boolean::GroupValuesBoolean, bytes::GroupValuesBytes,
44-
bytes_view::GroupValuesBytesView, primitive::GroupValuesPrimitive,
45+
bytes_view::GroupValuesBytesView, dictionary::GroupValuesDictionary,
46+
primitive::GroupValuesPrimitive,
4547
},
4648
order::GroupOrdering,
4749
};
4850

4951
mod metrics;
50-
mod null_builder;
51-
5252
pub(crate) use metrics::GroupByMetrics;
53+
mod null_builder;
5354

5455
/// Stores the group values during hash aggregation.
5556
///
@@ -196,6 +197,45 @@ pub fn new_group_values(
196197
DataType::Boolean => {
197198
return Ok(Box::new(GroupValuesBoolean::new()));
198199
}
200+
DataType::Dictionary(key_type, value_type) => {
201+
if supported_single_dictionary_value(value_type) {
202+
return match key_type.as_ref() {
203+
DataType::Int8 => Ok(Box::new(
204+
GroupValuesDictionary::<Int8Type>::new(value_type),
205+
)),
206+
DataType::Int16 => Ok(Box::new(
207+
GroupValuesDictionary::<Int16Type>::new(value_type),
208+
)),
209+
DataType::Int32 => Ok(Box::new(
210+
GroupValuesDictionary::<Int32Type>::new(value_type),
211+
)),
212+
DataType::Int64 => Ok(Box::new(
213+
GroupValuesDictionary::<Int64Type>::new(value_type),
214+
)),
215+
DataType::UInt8 => Ok(Box::new(
216+
GroupValuesDictionary::<UInt8Type>::new(value_type),
217+
)),
218+
DataType::UInt16 => {
219+
Ok(Box::new(GroupValuesDictionary::<UInt16Type>::new(
220+
value_type,
221+
)))
222+
}
223+
DataType::UInt32 => {
224+
Ok(Box::new(GroupValuesDictionary::<UInt32Type>::new(
225+
value_type,
226+
)))
227+
}
228+
DataType::UInt64 => {
229+
Ok(Box::new(GroupValuesDictionary::<UInt64Type>::new(
230+
value_type,
231+
)))
232+
}
233+
_ => Err(datafusion_common::DataFusionError::NotImplemented(
234+
format!("Unsupported dictionary key type: {key_type:?}"),
235+
)),
236+
};
237+
}
238+
}
199239
_ => {}
200240
}
201241
}
@@ -210,3 +250,18 @@ pub fn new_group_values(
210250
Ok(Box::new(GroupValuesRows::try_new(schema)?))
211251
}
212252
}
253+
254+
fn supported_single_dictionary_value(t: &DataType) -> bool {
255+
match t {
256+
DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => true,
257+
DataType::List(field)
258+
if matches!(
259+
field.data_type(),
260+
DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8
261+
) =>
262+
{
263+
true
264+
}
265+
_ => false,
266+
}
267+
}

0 commit comments

Comments
 (0)