-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Add additional data types are supported in hash join #2721
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0136d23
2af3744
3c75f6b
8035e46
5429e5f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,13 +22,17 @@ use ahash::RandomState; | |
|
|
||
| use arrow::{ | ||
| array::{ | ||
| ArrayData, ArrayRef, BooleanArray, Date32Array, Date64Array, LargeStringArray, | ||
| as_dictionary_array, as_string_array, ArrayData, ArrayRef, BooleanArray, | ||
| Date32Array, Date64Array, DecimalArray, DictionaryArray, LargeStringArray, | ||
| PrimitiveArray, TimestampMicrosecondArray, TimestampMillisecondArray, | ||
| TimestampSecondArray, UInt32BufferBuilder, UInt32Builder, UInt64BufferBuilder, | ||
| UInt64Builder, | ||
| }, | ||
| compute, | ||
| datatypes::{UInt32Type, UInt64Type}, | ||
| datatypes::{ | ||
| Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, | ||
| UInt8Type, | ||
| }, | ||
| }; | ||
| use smallvec::{smallvec, SmallVec}; | ||
| use std::sync::Arc; | ||
|
|
@@ -38,7 +42,7 @@ use std::{time::Instant, vec}; | |
| use futures::{ready, Stream, StreamExt, TryStreamExt}; | ||
|
|
||
| use arrow::array::{as_boolean_array, new_null_array, Array}; | ||
| use arrow::datatypes::DataType; | ||
| use arrow::datatypes::{ArrowNativeType, DataType}; | ||
| use arrow::datatypes::{Schema, SchemaRef}; | ||
| use arrow::error::Result as ArrowResult; | ||
| use arrow::record_batch::RecordBatch; | ||
|
|
@@ -947,6 +951,58 @@ macro_rules! equal_rows_elem { | |
| }}; | ||
| } | ||
|
|
||
| macro_rules! equal_rows_elem_with_string_dict { | ||
| ($key_array_type:ident, $l: ident, $r: ident, $left: ident, $right: ident, $null_equals_null: ident) => {{ | ||
| let left_array: &DictionaryArray<$key_array_type> = | ||
| as_dictionary_array::<$key_array_type>($l); | ||
| let right_array: &DictionaryArray<$key_array_type> = | ||
| as_dictionary_array::<$key_array_type>($r); | ||
|
|
||
| let (left_values, left_values_index) = { | ||
| let keys_col = left_array.keys(); | ||
| if keys_col.is_valid($left) { | ||
| let values_index = keys_col.value($left).to_usize().ok_or_else(|| { | ||
| DataFusionError::Internal(format!( | ||
| "Can not convert index to usize in dictionary of type creating group by value {:?}", | ||
| keys_col.data_type() | ||
| )) | ||
| }); | ||
|
|
||
| match values_index { | ||
| Ok(index) => (as_string_array(left_array.values()), Some(index)), | ||
| _ => (as_string_array(left_array.values()), None) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this code silently ignores failures Filed #2767 to make this a panic
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks!
yes, i'll trace the issue and try to solve this issue. |
||
| } | ||
| } else { | ||
| (as_string_array(left_array.values()), None) | ||
| } | ||
| }; | ||
| let (right_values, right_values_index) = { | ||
| let keys_col = right_array.keys(); | ||
| if keys_col.is_valid($right) { | ||
| let values_index = keys_col.value($right).to_usize().ok_or_else(|| { | ||
| DataFusionError::Internal(format!( | ||
| "Can not convert index to usize in dictionary of type creating group by value {:?}", | ||
| keys_col.data_type() | ||
| )) | ||
| }); | ||
|
|
||
| match values_index { | ||
| Ok(index) => (as_string_array(right_array.values()), Some(index)), | ||
| _ => (as_string_array(right_array.values()), None) | ||
| } | ||
| } else { | ||
| (as_string_array(right_array.values()), None) | ||
| } | ||
| }; | ||
|
|
||
| match (left_values_index, right_values_index) { | ||
| (Some(left_values_index), Some(right_values_index)) => left_values.value(left_values_index) == right_values.value(right_values_index), | ||
| (None, None) => $null_equals_null, | ||
| _ => false, | ||
| } | ||
| }}; | ||
| } | ||
|
|
||
| /// Left and right row have equal values | ||
| /// If more data types are supported here, please also add the data types in can_hash function | ||
| /// to generate hash join logical plan. | ||
|
|
@@ -1054,6 +1110,124 @@ fn equal_rows( | |
| DataType::LargeUtf8 => { | ||
| equal_rows_elem!(LargeStringArray, l, r, left, right, null_equals_null) | ||
| } | ||
| DataType::Decimal(_, lscale) => match r.data_type() { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From my knowledge in the datafusion, we have converted the left and right to the same data type in the planner.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I create two table in the spark, From the from, we can get the the t1.c1 and t2.c1 will be casted to coerced type(decimal(11,4));
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I create two tables in datafusion ❯ describe t1; select * from t1; ❯ describe t2; select * from t2; this query gets 0 rows but this query gets one row Does the planner still not converts the data types between left and right for decimal ? If so, perhaps we can add this to planner.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should make the data type is same between left and right.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, we should check the data types and make the coercion in the planner. I'll create an issue and follow up the pr later. Thanks!
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you @AssHero |
||
| DataType::Decimal(_, rscale) => { | ||
| if lscale == rscale { | ||
| equal_rows_elem!( | ||
| DecimalArray, | ||
| l, | ||
| r, | ||
| left, | ||
| right, | ||
| null_equals_null | ||
| ) | ||
| } else { | ||
|
alamb marked this conversation as resolved.
|
||
| err = Some(Err(DataFusionError::Internal( | ||
| "Inconsistent Decimal data type in hasher, the scale should be same".to_string(), | ||
| ))); | ||
| false | ||
| } | ||
| } | ||
| _ => { | ||
| err = Some(Err(DataFusionError::Internal( | ||
| "Unsupported data type in hasher".to_string(), | ||
| ))); | ||
| false | ||
| } | ||
| }, | ||
| DataType::Dictionary(key_type, value_type) | ||
| if *value_type.as_ref() == DataType::Utf8 => | ||
| { | ||
| match key_type.as_ref() { | ||
| DataType::Int8 => { | ||
| equal_rows_elem_with_string_dict!( | ||
| Int8Type, | ||
| l, | ||
| r, | ||
| left, | ||
| right, | ||
| null_equals_null | ||
| ) | ||
| } | ||
| DataType::Int16 => { | ||
| equal_rows_elem_with_string_dict!( | ||
| Int16Type, | ||
| l, | ||
| r, | ||
| left, | ||
| right, | ||
| null_equals_null | ||
| ) | ||
| } | ||
| DataType::Int32 => { | ||
| equal_rows_elem_with_string_dict!( | ||
| Int32Type, | ||
| l, | ||
| r, | ||
| left, | ||
| right, | ||
| null_equals_null | ||
| ) | ||
| } | ||
| DataType::Int64 => { | ||
| equal_rows_elem_with_string_dict!( | ||
| Int64Type, | ||
| l, | ||
| r, | ||
| left, | ||
| right, | ||
| null_equals_null | ||
| ) | ||
| } | ||
| DataType::UInt8 => { | ||
| equal_rows_elem_with_string_dict!( | ||
| UInt8Type, | ||
| l, | ||
| r, | ||
| left, | ||
| right, | ||
| null_equals_null | ||
| ) | ||
| } | ||
| DataType::UInt16 => { | ||
| equal_rows_elem_with_string_dict!( | ||
| UInt16Type, | ||
| l, | ||
| r, | ||
| left, | ||
| right, | ||
| null_equals_null | ||
| ) | ||
| } | ||
| DataType::UInt32 => { | ||
| equal_rows_elem_with_string_dict!( | ||
| UInt32Type, | ||
| l, | ||
| r, | ||
| left, | ||
| right, | ||
| null_equals_null | ||
| ) | ||
| } | ||
| DataType::UInt64 => { | ||
| equal_rows_elem_with_string_dict!( | ||
| UInt64Type, | ||
| l, | ||
| r, | ||
| left, | ||
| right, | ||
| null_equals_null | ||
| ) | ||
| } | ||
| _ => { | ||
| // should not happen | ||
| err = Some(Err(DataFusionError::Internal( | ||
| "Unsupported data type in hasher".to_string(), | ||
| ))); | ||
| false | ||
| } | ||
| } | ||
| } | ||
| other => { | ||
| // This is internal because we should have caught this before. | ||
| err = Some(Err(DataFusionError::Internal(format!( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For dictionaries, I think
$leftand$rightare actually indexes into the keys array, and then the keys array contains the corresponding index intovalues.In other words, I think you need to compare the values using something like:
https://github.com/AssHero/arrow-datafusion/blob/hashjoin/datafusion/common/src/scalar.rs#L338-L361