Skip to content

Commit 3ff5447

Browse files
devanshu0987Devanshucomphead
authored andcommitted
Implement preimage for floor function to enable predicate pushdown (apache#20059)
This adds a `preimage` implementation for the `floor()` function that transforms `floor(x) = N` into `x >= N AND x < N+1`. This enables statistics-based predicate pushdown for queries using `floor()`.

For example, a query like:

`SELECT * FROM t WHERE floor(price) = 100`

is rewritten to:

`SELECT * FROM t WHERE price >= 100 AND price < 101`

This allows the query engine to leverage min/max statistics from Parquet row groups, significantly reducing the amount of data scanned.

Benchmarks on the ClickBench hits dataset show:
- 80% file pruning (89 out of 111 files skipped)
- 70x fewer rows scanned (1.4M vs 100M)

```
CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION 'benchmarks/data/hits_partitioned/';
-- Test the floor preimage optimization
EXPLAIN ANALYZE SELECT COUNT(*) FROM hits WHERE floor(CAST("CounterID" AS DOUBLE)) = 62;
```

| Metric | Before (no preimage) | After (with preimage) |
| -- | -- | -- |
| Files pruned | 111 → 111 (0 pruned) | 111 → 22 (89 pruned) |
| Row groups pruned | 325 → 325 (0 pruned) | 51 → 4 (47 pruned) |
| Rows scanned | 99,997,497 | 1,410,000 |
| Output rows | 738,172 | 738,172 |
| Pruning predicate | None | CAST(CounterID_max) >= 62 AND CAST(CounterID_min) < 63 |

## Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes apache#123` indicates that this PR will close issue apache#123. -->

- Closes #.

## Rationale for this change

The epic apache#19946 introduced the preimage API. This PR uses that API to implement `preimage` for the `floor` function where it is applicable.

## What changes are included in this PR?

## Are these changes tested?

- Unit tests added
- Existing SLT tests pass for this.

## Are there any user-facing changes?

No

---------

Co-authored-by: Devanshu <devanshu@codapayments.com>
Co-authored-by: Oleks V <comphead@users.noreply.github.com>
1 parent 1442bc8 commit 3ff5447

1 file changed

Lines changed: 262 additions & 2 deletions

File tree

datafusion/functions/src/math/floor.rs

Lines changed: 262 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,15 @@ use arrow::datatypes::{
2525
};
2626
use datafusion_common::{Result, ScalarValue, exec_err};
2727
use datafusion_expr::interval_arithmetic::Interval;
28+
use datafusion_expr::preimage::PreimageResult;
29+
use datafusion_expr::simplify::SimplifyContext;
2830
use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
2931
use datafusion_expr::{
30-
Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
31-
TypeSignature, TypeSignatureClass, Volatility,
32+
Coercion, ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDFImpl,
33+
Signature, TypeSignature, TypeSignatureClass, Volatility,
3234
};
3335
use datafusion_macros::user_doc;
36+
use num_traits::{CheckedAdd, Float, One};
3437

3538
use super::decimal::{apply_decimal_op, floor_decimal_value};
3639

@@ -200,7 +203,264 @@ impl ScalarUDFImpl for FloorFunc {
200203
Interval::make_unbounded(&data_type)
201204
}
202205

206+
/// Compute the preimage for floor function.
207+
///
208+
/// For `floor(x) = N`, the preimage is `x >= N AND x < N + 1`
209+
/// because floor(x) = N for all x in [N, N+1).
210+
///
211+
/// This enables predicate pushdown optimizations, transforming:
212+
/// `floor(col) = 100` into `col >= 100 AND col < 101`
213+
fn preimage(
214+
&self,
215+
args: &[Expr],
216+
lit_expr: &Expr,
217+
_info: &SimplifyContext,
218+
) -> Result<PreimageResult> {
219+
// floor takes exactly one argument
220+
if args.len() != 1 {
221+
return Ok(PreimageResult::None);
222+
}
223+
224+
let arg = args[0].clone();
225+
226+
// Extract the literal value being compared to
227+
let Expr::Literal(lit_value, _) = lit_expr else {
228+
return Ok(PreimageResult::None);
229+
};
230+
231+
// Compute lower bound (N) and upper bound (N + 1) using helper functions
232+
let Some((lower, upper)) = (match lit_value {
233+
// Decimal types should be supported and tracked in
234+
// https://github.com/apache/datafusion/issues/20080
235+
// Floating-point types
236+
ScalarValue::Float64(Some(n)) => float_preimage_bounds(*n).map(|(lo, hi)| {
237+
(
238+
ScalarValue::Float64(Some(lo)),
239+
ScalarValue::Float64(Some(hi)),
240+
)
241+
}),
242+
ScalarValue::Float32(Some(n)) => float_preimage_bounds(*n).map(|(lo, hi)| {
243+
(
244+
ScalarValue::Float32(Some(lo)),
245+
ScalarValue::Float32(Some(hi)),
246+
)
247+
}),
248+
249+
// Integer types
250+
ScalarValue::Int8(Some(n)) => int_preimage_bounds(*n).map(|(lo, hi)| {
251+
(ScalarValue::Int8(Some(lo)), ScalarValue::Int8(Some(hi)))
252+
}),
253+
ScalarValue::Int16(Some(n)) => int_preimage_bounds(*n).map(|(lo, hi)| {
254+
(ScalarValue::Int16(Some(lo)), ScalarValue::Int16(Some(hi)))
255+
}),
256+
ScalarValue::Int32(Some(n)) => int_preimage_bounds(*n).map(|(lo, hi)| {
257+
(ScalarValue::Int32(Some(lo)), ScalarValue::Int32(Some(hi)))
258+
}),
259+
ScalarValue::Int64(Some(n)) => int_preimage_bounds(*n).map(|(lo, hi)| {
260+
(ScalarValue::Int64(Some(lo)), ScalarValue::Int64(Some(hi)))
261+
}),
262+
263+
// Unsupported types
264+
_ => None,
265+
}) else {
266+
return Ok(PreimageResult::None);
267+
};
268+
269+
Ok(PreimageResult::Range {
270+
expr: arg,
271+
interval: Box::new(Interval::try_new(lower, upper)?),
272+
})
273+
}
274+
203275
fn documentation(&self) -> Option<&Documentation> {
204276
self.doc()
205277
}
206278
}
279+
280+
// ============ Helper functions for preimage bounds ============
281+
282+
/// Compute preimage bounds for floor function on floating-point types.
283+
/// For floor(x) = n, the preimage is [n, n+1).
284+
/// Returns None if:
285+
/// - The value is non-finite (infinity, NaN)
286+
/// - The value is not an integer (floor always returns integers, so floor(x) = 1.3 has no solution)
287+
/// - Adding 1 would lose precision at extreme values
288+
fn float_preimage_bounds<F: Float>(n: F) -> Option<(F, F)> {
289+
let one = F::one();
290+
// Check for non-finite values (infinity, NaN)
291+
if !n.is_finite() {
292+
return None;
293+
}
294+
// floor always returns an integer, so if n has a fractional part, there's no solution
295+
if n.fract() != F::zero() {
296+
return None;
297+
}
298+
// Check for precision loss at extreme values
299+
if n + one <= n {
300+
return None;
301+
}
302+
Some((n, n + one))
303+
}
304+
305+
/// Compute preimage bounds for floor function on integer types.
306+
/// For floor(x) = n, the preimage is [n, n+1).
307+
/// Returns None if adding 1 would overflow.
308+
fn int_preimage_bounds<I: CheckedAdd + One + Copy>(n: I) -> Option<(I, I)> {
309+
let upper = n.checked_add(&I::one())?;
310+
Some((n, upper))
311+
}
312+
313+
#[cfg(test)]
mod tests {
    use super::*;
    use datafusion_expr::col;

    /// Calls `FloorFunc::preimage` with a default `SimplifyContext` and unwraps the result.
    fn preimage_of(args: &[Expr], rhs: &Expr) -> PreimageResult {
        FloorFunc::new()
            .preimage(args, rhs, &SimplifyContext::default())
            .unwrap()
    }

    /// Asserts that `floor(x) = input` produces a `Range` over `x` with the given bounds.
    fn assert_preimage_range(
        input: ScalarValue,
        expected_lower: ScalarValue,
        expected_upper: ScalarValue,
    ) {
        let rhs = Expr::Literal(input.clone(), None);

        let PreimageResult::Range { expr, interval } = preimage_of(&[col("x")], &rhs)
        else {
            panic!("Expected Range, got None for input {input:?}")
        };

        assert_eq!(expr, col("x"));
        assert_eq!(*interval.lower(), expected_lower);
        assert_eq!(*interval.upper(), expected_upper);
    }

    /// Asserts that `floor(x) = input` produces no preimage.
    fn assert_preimage_none(input: ScalarValue) {
        let rhs = Expr::Literal(input.clone(), None);
        let outcome = preimage_of(&[col("x")], &rhs);
        assert!(
            matches!(outcome, PreimageResult::None),
            "Expected None for input {input:?}"
        );
    }

    #[test]
    fn test_floor_preimage_valid_cases() {
        // Each entry is (literal, expected lower bound, expected upper bound).
        let cases = [
            // Float64
            (
                ScalarValue::Float64(Some(100.0)),
                ScalarValue::Float64(Some(100.0)),
                ScalarValue::Float64(Some(101.0)),
            ),
            // Float32
            (
                ScalarValue::Float32(Some(50.0)),
                ScalarValue::Float32(Some(50.0)),
                ScalarValue::Float32(Some(51.0)),
            ),
            // Int64
            (
                ScalarValue::Int64(Some(42)),
                ScalarValue::Int64(Some(42)),
                ScalarValue::Int64(Some(43)),
            ),
            // Int32
            (
                ScalarValue::Int32(Some(100)),
                ScalarValue::Int32(Some(100)),
                ScalarValue::Int32(Some(101)),
            ),
            // Negative values
            (
                ScalarValue::Float64(Some(-5.0)),
                ScalarValue::Float64(Some(-5.0)),
                ScalarValue::Float64(Some(-4.0)),
            ),
            // Zero
            (
                ScalarValue::Float64(Some(0.0)),
                ScalarValue::Float64(Some(0.0)),
                ScalarValue::Float64(Some(1.0)),
            ),
        ];

        for (input, lower, upper) in cases {
            assert_preimage_range(input, lower, upper);
        }
    }

    #[test]
    fn test_floor_preimage_non_integer_float() {
        // `floor` only ever yields whole numbers, so comparing it with a
        // fractional literal such as 1.3 has no solution and no preimage.
        let fractional = [
            ScalarValue::Float64(Some(1.3)),
            ScalarValue::Float64(Some(-2.5)),
            ScalarValue::Float32(Some(3.7)),
        ];
        for input in fractional {
            assert_preimage_none(input);
        }
    }

    #[test]
    fn test_floor_preimage_integer_overflow() {
        // At MAX, `n + 1` overflows for every integer width, so no preimage.
        let at_max = [
            ScalarValue::Int64(Some(i64::MAX)),
            ScalarValue::Int32(Some(i32::MAX)),
            ScalarValue::Int16(Some(i16::MAX)),
            ScalarValue::Int8(Some(i8::MAX)),
        ];
        for input in at_max {
            assert_preimage_none(input);
        }
    }

    #[test]
    fn test_floor_preimage_float_edge_cases() {
        let edge_cases = [
            // Float64 edge cases
            ScalarValue::Float64(Some(f64::INFINITY)),
            ScalarValue::Float64(Some(f64::NEG_INFINITY)),
            ScalarValue::Float64(Some(f64::NAN)),
            ScalarValue::Float64(Some(f64::MAX)), // precision loss
            // Float32 edge cases
            ScalarValue::Float32(Some(f32::INFINITY)),
            ScalarValue::Float32(Some(f32::NEG_INFINITY)),
            ScalarValue::Float32(Some(f32::NAN)),
            ScalarValue::Float32(Some(f32::MAX)), // precision loss
        ];
        for input in edge_cases {
            assert_preimage_none(input);
        }
    }

    #[test]
    fn test_floor_preimage_null_values() {
        let nulls = [
            ScalarValue::Float64(None),
            ScalarValue::Float32(None),
            ScalarValue::Int64(None),
        ];
        for input in nulls {
            assert_preimage_none(input);
        }
    }

    #[test]
    fn test_floor_preimage_invalid_inputs() {
        let literal = Expr::Literal(ScalarValue::Float64(Some(100.0)), None);

        // Non-literal comparison value
        assert!(
            matches!(preimage_of(&[col("x")], &col("y")), PreimageResult::None),
            "Expected None for non-literal"
        );

        // Wrong argument count (too many)
        assert!(
            matches!(
                preimage_of(&[col("x"), col("y")], &literal),
                PreimageResult::None
            ),
            "Expected None for wrong arg count"
        );

        // Wrong argument count (zero)
        assert!(
            matches!(preimage_of(&[], &literal), PreimageResult::None),
            "Expected None for zero args"
        );
    }
}

0 commit comments

Comments
 (0)