Skip to content

Commit f69d36f

Browse files
authored
feat: add kernel ExpressionEvaluator (#1829)
~~based on #1807~~ # Description In the effort to advance protocol support and move our internal APIs closer to the kernel library, it is advantageous to leverage the expression handling logic from kernel, specifically for filtering actions etc. This PR just adds the expression definitions and evaluation logic. Integrating it with our current codebase and basing the existing partition handling logic on this is left for follow-up PRs to keep things reviewable. related: #1894, #1776
1 parent ef84cff commit f69d36f

8 files changed

Lines changed: 787 additions & 2 deletions

File tree

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ debug = "line-tables-only"
2020
[workspace.dependencies]
2121
# arrow
2222
arrow = { version = "48.0.1" }
23+
arrow-arith = { version = "48.0.1" }
2324
arrow-array = { version = "48.0.1" }
2425
arrow-buffer = { version = "48.0.1" }
2526
arrow-cast = { version = "48.0.1" }

crates/deltalake-core/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ features = ["azure", "datafusion", "gcs", "glue", "hdfs", "json", "python", "s3"
2020
[dependencies]
2121
# arrow
2222
arrow = { workspace = true, optional = true }
23+
arrow-arith = { workspace = true, optional = true }
2324
arrow-array = { workspace = true, optional = true }
2425
arrow-buffer = { workspace = true, optional = true }
2526
arrow-cast = { workspace = true, optional = true }
@@ -136,6 +137,7 @@ criterion = "0.5"
136137
azure = ["object_store/azure"]
137138
arrow = [
138139
"dep:arrow",
140+
"arrow-arith",
139141
"arrow-array",
140142
"arrow-cast",
141143
"arrow-ord",
Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
//! Default Expression handler.
2+
//!
3+
//! Expression handling based on arrow-rs compute kernels.
4+
5+
use std::sync::Arc;
6+
7+
use arrow_arith::boolean::{and, is_null, not, or};
8+
use arrow_arith::numeric::{add, div, mul, sub};
9+
use arrow_array::{
10+
Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array,
11+
Int32Array, RecordBatch, StringArray, TimestampMicrosecondArray,
12+
};
13+
use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq};
14+
15+
use crate::kernel::error::{DeltaResult, Error};
16+
use crate::kernel::expressions::{scalars::Scalar, Expression};
17+
use crate::kernel::expressions::{BinaryOperator, UnaryOperator};
18+
19+
// TODO leverage scalars / Datum
20+
21+
impl Scalar {
22+
/// Convert scalar to arrow array.
23+
pub fn to_array(&self, num_rows: usize) -> ArrayRef {
24+
use Scalar::*;
25+
match self {
26+
Integer(val) => Arc::new(Int32Array::from(vec![*val; num_rows])),
27+
Float(val) => Arc::new(Float32Array::from(vec![*val; num_rows])),
28+
String(val) => Arc::new(StringArray::from(vec![val.clone(); num_rows])),
29+
Boolean(val) => Arc::new(BooleanArray::from(vec![*val; num_rows])),
30+
Timestamp(val) => Arc::new(TimestampMicrosecondArray::from(vec![*val; num_rows])),
31+
Date(val) => Arc::new(Date32Array::from(vec![*val; num_rows])),
32+
Binary(val) => Arc::new(BinaryArray::from(vec![val.as_slice(); num_rows])),
33+
Decimal(val, precision, scale) => Arc::new(
34+
Decimal128Array::from(vec![*val; num_rows])
35+
.with_precision_and_scale(*precision, *scale)
36+
.unwrap(),
37+
),
38+
Null(_) => todo!(),
39+
}
40+
}
41+
}
42+
43+
pub(crate) fn evaluate_expression(
44+
expression: &Expression,
45+
batch: &RecordBatch,
46+
) -> DeltaResult<ArrayRef> {
47+
match expression {
48+
Expression::Literal(scalar) => Ok(scalar.to_array(batch.num_rows())),
49+
Expression::Column(name) => batch
50+
.column_by_name(name)
51+
.ok_or(Error::MissingColumn(name.clone()))
52+
.cloned(),
53+
Expression::UnaryOperation { op, expr } => {
54+
let arr = evaluate_expression(expr.as_ref(), batch)?;
55+
match op {
56+
UnaryOperator::Not => {
57+
let arr = arr
58+
.as_any()
59+
.downcast_ref::<BooleanArray>()
60+
.ok_or(Error::Generic("expected boolean array".to_string()))?;
61+
let result = not(arr)?;
62+
Ok(Arc::new(result))
63+
}
64+
UnaryOperator::IsNull => {
65+
let result = is_null(&arr)?;
66+
Ok(Arc::new(result))
67+
}
68+
}
69+
}
70+
Expression::BinaryOperation { op, left, right } => {
71+
let left_arr = evaluate_expression(left.as_ref(), batch)?;
72+
let right_arr = evaluate_expression(right.as_ref(), batch)?;
73+
match op {
74+
BinaryOperator::Plus => {
75+
add(&left_arr, &right_arr).map_err(|err| Error::GenericError {
76+
source: Box::new(err),
77+
})
78+
}
79+
BinaryOperator::Minus => {
80+
sub(&left_arr, &right_arr).map_err(|err| Error::GenericError {
81+
source: Box::new(err),
82+
})
83+
}
84+
BinaryOperator::Multiply => {
85+
mul(&left_arr, &right_arr).map_err(|err| Error::GenericError {
86+
source: Box::new(err),
87+
})
88+
}
89+
BinaryOperator::Divide => {
90+
div(&left_arr, &right_arr).map_err(|err| Error::GenericError {
91+
source: Box::new(err),
92+
})
93+
}
94+
BinaryOperator::LessThan => {
95+
let result = lt(&left_arr, &right_arr).map_err(|err| Error::GenericError {
96+
source: Box::new(err),
97+
})?;
98+
Ok(Arc::new(result))
99+
}
100+
BinaryOperator::LessThanOrEqual => {
101+
let result =
102+
lt_eq(&left_arr, &right_arr).map_err(|err| Error::GenericError {
103+
source: Box::new(err),
104+
})?;
105+
Ok(Arc::new(result))
106+
}
107+
BinaryOperator::GreaterThan => {
108+
let result = gt(&left_arr, &right_arr).map_err(|err| Error::GenericError {
109+
source: Box::new(err),
110+
})?;
111+
Ok(Arc::new(result))
112+
}
113+
BinaryOperator::GreaterThanOrEqual => {
114+
let result =
115+
gt_eq(&left_arr, &right_arr).map_err(|err| Error::GenericError {
116+
source: Box::new(err),
117+
})?;
118+
Ok(Arc::new(result))
119+
}
120+
BinaryOperator::Equal => {
121+
let result = eq(&left_arr, &right_arr).map_err(|err| Error::GenericError {
122+
source: Box::new(err),
123+
})?;
124+
Ok(Arc::new(result))
125+
}
126+
BinaryOperator::NotEqual => {
127+
let result = neq(&left_arr, &right_arr).map_err(|err| Error::GenericError {
128+
source: Box::new(err),
129+
})?;
130+
Ok(Arc::new(result))
131+
}
132+
BinaryOperator::And => {
133+
let left_arr = evaluate_expression(left.as_ref(), batch)?;
134+
let left_arr = left_arr
135+
.as_any()
136+
.downcast_ref::<BooleanArray>()
137+
.ok_or(Error::Generic("expected boolean array".to_string()))?;
138+
let right_arr = evaluate_expression(right.as_ref(), batch)?;
139+
let right_arr = right_arr
140+
.as_any()
141+
.downcast_ref::<BooleanArray>()
142+
.ok_or(Error::Generic("expected boolean array".to_string()))?;
143+
let result = and(left_arr, right_arr).map_err(|err| Error::GenericError {
144+
source: Box::new(err),
145+
})?;
146+
Ok(Arc::new(result))
147+
}
148+
BinaryOperator::Or => {
149+
let left_arr = evaluate_expression(left.as_ref(), batch)?;
150+
let left_arr = left_arr
151+
.as_any()
152+
.downcast_ref::<BooleanArray>()
153+
.ok_or(Error::Generic("expected boolean array".to_string()))?;
154+
let right_arr = evaluate_expression(right.as_ref(), batch)?;
155+
let right_arr = right_arr
156+
.as_any()
157+
.downcast_ref::<BooleanArray>()
158+
.ok_or(Error::Generic("expected boolean array".to_string()))?;
159+
let result = or(left_arr, right_arr).map_err(|err| Error::GenericError {
160+
source: Box::new(err),
161+
})?;
162+
Ok(Arc::new(result))
163+
}
164+
}
165+
}
166+
}
167+
}
168+
169+
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_array::Int32Array;
    use arrow_schema::{DataType, Field, Schema};
    use std::ops::{Add, Div, Mul, Sub};

    /// Build a batch with one non-nullable `Int32` column per name, each holding `[1, 2, 3]`.
    fn int_batch(names: &[&str]) -> RecordBatch {
        let fields: Vec<Field> = names
            .iter()
            .map(|name| Field::new(*name, DataType::Int32, false))
            .collect();
        let columns: Vec<ArrayRef> = names.iter().map(|_| ints(vec![1, 2, 3])).collect();
        RecordBatch::try_new(Arc::new(Schema::new(fields)), columns).unwrap()
    }

    /// Wrap integer values into a type-erased arrow array.
    fn ints(values: Vec<i32>) -> ArrayRef {
        Arc::new(Int32Array::from(values))
    }

    /// Wrap boolean values into a type-erased arrow array.
    fn bools(values: Vec<bool>) -> ArrayRef {
        Arc::new(BooleanArray::from(values))
    }

    /// Evaluate `expression` on `batch` and compare the outcome with `expected`.
    fn assert_evaluates_to(expression: &Expression, batch: &RecordBatch, expected: ArrayRef) {
        let actual = evaluate_expression(expression, batch).unwrap();
        assert_eq!(actual.as_ref(), expected.as_ref());
    }

    #[test]
    fn test_binary_op_scalar() {
        let batch = int_batch(&["a"]);
        let column = Expression::Column("a".to_string());

        let cases = vec![
            (
                column.clone().add(Expression::Literal(Scalar::Integer(1))),
                vec![2, 3, 4],
            ),
            (
                column.clone().sub(Expression::Literal(Scalar::Integer(1))),
                vec![0, 1, 2],
            ),
            (
                column.clone().mul(Expression::Literal(Scalar::Integer(2))),
                vec![2, 4, 6],
            ),
            // TODO handle type casting
            (
                column.div(Expression::Literal(Scalar::Integer(1))),
                vec![1, 2, 3],
            ),
        ];

        for (expression, expected) in cases {
            assert_evaluates_to(&expression, &batch, ints(expected));
        }
    }

    #[test]
    fn test_binary_op() {
        let batch = int_batch(&["a", "b"]);
        let column_a = Expression::Column("a".to_string());
        let column_b = Expression::Column("b".to_string());

        let cases = vec![
            (column_a.clone().add(column_b.clone()), vec![2, 4, 6]),
            (column_a.clone().sub(column_b.clone()), vec![0, 0, 0]),
            (column_a.clone().mul(column_b), vec![1, 4, 9]),
        ];

        for (expression, expected) in cases {
            assert_evaluates_to(&expression, &batch, ints(expected));
        }
    }

    #[test]
    fn test_binary_cmp() {
        let batch = int_batch(&["a"]);
        let column = Expression::Column("a".to_string());
        let lit = Expression::Literal(Scalar::Integer(2));

        let cases = vec![
            (column.clone().lt(lit.clone()), vec![true, false, false]),
            (column.clone().lt_eq(lit.clone()), vec![true, true, false]),
            (column.clone().gt(lit.clone()), vec![false, false, true]),
            (column.clone().gt_eq(lit.clone()), vec![false, true, true]),
            (column.clone().eq(lit.clone()), vec![false, true, false]),
            (column.clone().ne(lit.clone()), vec![true, false, true]),
        ];

        for (expression, expected) in cases {
            assert_evaluates_to(&expression, &batch, bools(expected));
        }
    }

    #[test]
    fn test_logical() {
        let fields = vec![
            Field::new("a", DataType::Boolean, false),
            Field::new("b", DataType::Boolean, false),
        ];
        let batch = RecordBatch::try_new(
            Arc::new(Schema::new(fields)),
            vec![bools(vec![true, false]), bools(vec![false, true])],
        )
        .unwrap();
        let column_a = Expression::Column("a".to_string());
        let column_b = Expression::Column("b".to_string());

        let cases = vec![
            (column_a.clone().and(column_b.clone()), vec![false, false]),
            (
                column_a
                    .clone()
                    .and(Expression::literal(Scalar::Boolean(true))),
                vec![true, false],
            ),
            (column_a.clone().or(column_b), vec![true, true]),
            (
                column_a
                    .clone()
                    .or(Expression::literal(Scalar::Boolean(false))),
                vec![true, false],
            ),
        ];

        for (expression, expected) in cases {
            assert_evaluates_to(&expression, &batch, bools(expected));
        }
    }
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
//! Delta kernel client implementation.
2+
use std::sync::Arc;
3+
4+
use arrow_array::RecordBatch;
5+
6+
use self::expressions::evaluate_expression;
7+
use crate::kernel::error::DeltaResult;
8+
use crate::kernel::expressions::Expression;
9+
use crate::kernel::schema::SchemaRef;
10+
11+
pub mod expressions;
12+
13+
/// Interface for implementing an Expression evaluator.
14+
///
15+
/// It contains one Expression which can be evaluated on multiple ColumnarBatches.
16+
/// Connectors can implement this interface to optimize the evaluation using the
17+
/// connector specific capabilities.
18+
pub trait ExpressionEvaluator {
19+
/// Evaluate the expression on given ColumnarBatch data.
20+
///
21+
/// Contains one value for each row of the input.
22+
/// The data type of the output is same as the type output of the expression this evaluator is using.
23+
fn evaluate(&self, batch: &RecordBatch, output_schema: SchemaRef) -> DeltaResult<RecordBatch>;
24+
}
25+
26+
#[derive(Debug)]
27+
/// Expression evaluator based on arrow compute kernels.
28+
pub struct ArrowExpressionEvaluator {
29+
_input_schema: SchemaRef,
30+
expression: Box<Expression>,
31+
}
32+
33+
impl ExpressionEvaluator for ArrowExpressionEvaluator {
34+
fn evaluate(&self, batch: &RecordBatch, output_schema: SchemaRef) -> DeltaResult<RecordBatch> {
35+
Ok(RecordBatch::try_new(
36+
Arc::new(output_schema.as_ref().try_into()?),
37+
vec![evaluate_expression(&self.expression, batch)?],
38+
)?)
39+
}
40+
}

0 commit comments

Comments
 (0)