Skip to content

Commit d1a3058

Browse files
authored
perf: Apply logical regexp optimizations to Utf8View and LargeUtf8 inputs (#20581)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes #20580 ## Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> I ran into a bug that prevented some regexp optimizations from working that were introduced in #15299. After #16290, some SQL types were updated to `utf8view`. As part of that PR, some expected query plans in sqllogictest were updated to expect the unoptimized version. I need this fixed to avoid additional test failures while implementing a new regexp optimization for #20579. ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> - Add support for Utf8View and LargeUtf8 in `regex.rs`. - Properly return `Transformed::no()` on cases when the plan isn't modified (previously, it was always returning `Transformed::yes()` - Updates the tests back to expect the optimized query plans ## Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> Fixed existing tests that previously weren't working. Now they reflect the optimization being reflected properly. ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. --> No. Just applying the optimizations to more cases.
1 parent 0ca9d65 commit d1a3058

4 files changed

Lines changed: 146 additions & 73 deletions

File tree

datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs

Lines changed: 55 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1660,17 +1660,19 @@ impl TreeNodeRewriter for Simplifier<'_> {
16601660
left,
16611661
op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch),
16621662
right,
1663-
}) => Transformed::yes(simplify_regex_expr(left, op, right)?),
1663+
}) => simplify_regex_expr(left, op, right)?,
16641664

16651665
// Rules for Like
16661666
Expr::Like(like) => {
16671667
// `\` is implicit escape, see https://github.com/apache/datafusion/issues/13291
16681668
let escape_char = like.escape_char.unwrap_or('\\');
1669-
match as_string_scalar(&like.pattern) {
1670-
Some((data_type, pattern_str)) => {
1669+
1670+
match StringScalar::try_from_expr(&like.pattern) {
1671+
Some(string_scalar) => {
1672+
let pattern_str = string_scalar.as_str();
16711673
match pattern_str {
16721674
None => return Ok(Transformed::yes(lit_bool_null())),
1673-
Some(pattern_str) if pattern_str == "%" => {
1675+
Some("%") => {
16741676
// exp LIKE '%' is
16751677
// - when exp is not NULL, it's true
16761678
// - when exp is NULL, it's NULL
@@ -1702,10 +1704,9 @@ impl TreeNodeRewriter for Simplifier<'_> {
17021704
.replace_all(pattern_str, "%")
17031705
.to_string();
17041706
Transformed::yes(Expr::Like(Like {
1705-
pattern: Box::new(to_string_scalar(
1706-
&data_type,
1707-
Some(simplified_pattern),
1708-
)),
1707+
pattern: Box::new(
1708+
string_scalar.to_expr(&simplified_pattern),
1709+
),
17091710
..like
17101711
}))
17111712
}
@@ -2126,21 +2127,54 @@ fn is_literal_or_literal_cast(expr: &Expr) -> bool {
21262127
}
21272128
}
21282129

2129-
fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> {
2130-
match expr {
2131-
Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)),
2132-
Expr::Literal(ScalarValue::LargeUtf8(s), _) => Some((DataType::LargeUtf8, s)),
2133-
Expr::Literal(ScalarValue::Utf8View(s), _) => Some((DataType::Utf8View, s)),
2134-
_ => None,
2135-
}
2130+
/// Helper for working with string scalar values (Utf8, LargeUtf8, Utf8View)
2131+
pub(crate) enum StringScalar<'a> {
2132+
Utf8(&'a ScalarValue),
2133+
LargeUtf8(&'a ScalarValue),
2134+
Utf8View(&'a ScalarValue),
21362135
}
21372136

2138-
fn to_string_scalar(data_type: &DataType, value: Option<String>) -> Expr {
2139-
match data_type {
2140-
DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value), None),
2141-
DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value), None),
2142-
DataType::Utf8View => Expr::Literal(ScalarValue::Utf8View(value), None),
2143-
_ => unreachable!(),
2137+
impl<'a> StringScalar<'a> {
2138+
/// Create a `StringScalar` view from an `Expr` if it is a supported string literal.
2139+
/// Returns `None` if the expression is not a string literal.
2140+
pub(crate) fn try_from_expr(expr: &'a Expr) -> Option<Self> {
2141+
match expr {
2142+
Expr::Literal(scalar, _) => Self::try_from_scalar(scalar),
2143+
_ => None,
2144+
}
2145+
}
2146+
2147+
/// Create a `StringScalar` view from a `ScalarValue` if it is a supported string type.
2148+
/// Returns `None` if the scalar value is not a supported string type.
2149+
fn try_from_scalar(scalar: &'a ScalarValue) -> Option<Self> {
2150+
match scalar {
2151+
ScalarValue::Utf8(_) => Some(Self::Utf8(scalar)),
2152+
ScalarValue::LargeUtf8(_) => Some(Self::LargeUtf8(scalar)),
2153+
ScalarValue::Utf8View(_) => Some(Self::Utf8View(scalar)),
2154+
_ => None,
2155+
}
2156+
}
2157+
2158+
/// Returns the underlying string slice.
2159+
pub(crate) fn as_str(&self) -> Option<&'a str> {
2160+
match self {
2161+
Self::Utf8(scalar) | Self::LargeUtf8(scalar) | Self::Utf8View(scalar) => {
2162+
scalar.try_as_str().flatten()
2163+
}
2164+
}
2165+
}
2166+
2167+
/// Build a new `Expr` of the same string type with the given value.
2168+
pub(crate) fn to_expr(&self, val: &str) -> Expr {
2169+
match self {
2170+
Self::Utf8(_) => Expr::Literal(ScalarValue::Utf8(Some(val.to_owned())), None),
2171+
Self::LargeUtf8(_) => {
2172+
Expr::Literal(ScalarValue::LargeUtf8(Some(val.to_owned())), None)
2173+
}
2174+
Self::Utf8View(_) => {
2175+
Expr::Literal(ScalarValue::Utf8View(Some(val.to_owned())), None)
2176+
}
2177+
}
21442178
}
21452179
}
21462180

datafusion/optimizer/src/simplify_expressions/regex.rs

Lines changed: 86 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,13 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use datafusion_common::{DataFusionError, Result, ScalarValue};
18+
use datafusion_common::tree_node::Transformed;
19+
use datafusion_common::{DataFusionError, Result};
1920
use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit};
2021
use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};
2122

23+
use crate::simplify_expressions::expr_simplifier::StringScalar;
24+
2225
/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
2326
const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
2427

@@ -43,52 +46,70 @@ pub fn simplify_regex_expr(
4346
left: Box<Expr>,
4447
op: Operator,
4548
right: Box<Expr>,
46-
) -> Result<Expr> {
47-
let mode = OperatorMode::new(&op);
49+
) -> Result<Transformed<Expr>> {
50+
// Check if the right operand is a supported string literal
51+
let Some(string_scalar) = StringScalar::try_from_expr(right.as_ref()) else {
52+
return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
53+
left,
54+
op,
55+
right,
56+
})));
57+
};
58+
let pattern = string_scalar.as_str();
59+
let Some(pattern) = pattern else {
60+
return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
61+
left,
62+
op,
63+
right,
64+
})));
65+
};
4866

49-
if let Expr::Literal(ScalarValue::Utf8(Some(pattern)), _) = right.as_ref() {
50-
// Handle the special case for ".*" pattern
51-
if pattern == ANY_CHAR_REGEX_PATTERN {
52-
let new_expr = if mode.not {
53-
// not empty
54-
let empty_lit = Box::new(lit(""));
55-
Expr::BinaryExpr(BinaryExpr {
56-
left,
57-
op: Operator::Eq,
58-
right: empty_lit,
59-
})
60-
} else {
61-
// not null
62-
left.is_not_null()
63-
};
64-
return Ok(new_expr);
65-
}
67+
let mode = OperatorMode::new(&op);
68+
// Handle the special case for ".*" pattern
69+
if pattern == ANY_CHAR_REGEX_PATTERN {
70+
let new_expr = if mode.not {
71+
// not empty
72+
let empty_lit = Box::new(string_scalar.to_expr(""));
73+
Expr::BinaryExpr(BinaryExpr {
74+
left,
75+
op: Operator::Eq,
76+
right: empty_lit,
77+
})
78+
} else {
79+
// not null
80+
left.is_not_null()
81+
};
82+
return Ok(Transformed::yes(new_expr));
83+
}
6684

67-
match regex_syntax::Parser::new().parse(pattern) {
68-
Ok(hir) => {
69-
let kind = hir.kind();
70-
if let HirKind::Alternation(alts) = kind {
71-
if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION
72-
&& let Some(expr) = lower_alt(&mode, &left, alts)
73-
{
74-
return Ok(expr);
75-
}
76-
} else if let Some(expr) = lower_simple(&mode, &left, &hir) {
77-
return Ok(expr);
85+
match regex_syntax::Parser::new().parse(pattern) {
86+
Ok(hir) => {
87+
let kind = hir.kind();
88+
if let HirKind::Alternation(alts) = kind {
89+
if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION
90+
&& let Some(expr) = lower_alt(&mode, &left, alts, &string_scalar)
91+
{
92+
return Ok(Transformed::yes(expr));
7893
}
79-
}
80-
Err(e) => {
81-
// error out early since the execution may fail anyways
82-
return Err(DataFusionError::Context(
83-
"Invalid regex".to_owned(),
84-
Box::new(DataFusionError::External(Box::new(e))),
85-
));
94+
} else if let Some(expr) = lower_simple(&mode, &left, &hir, &string_scalar) {
95+
return Ok(Transformed::yes(expr));
8696
}
8797
}
98+
Err(e) => {
99+
// error out early since the execution may fail anyways
100+
return Err(DataFusionError::Context(
101+
"Invalid regex".to_owned(),
102+
Box::new(DataFusionError::External(Box::new(e))),
103+
));
104+
}
88105
}
89106

90107
// Leave untouched if optimization didn't work
91-
Ok(Expr::BinaryExpr(BinaryExpr { left, op, right }))
108+
Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
109+
left,
110+
op,
111+
right,
112+
})))
92113
}
93114

94115
#[derive(Debug)]
@@ -117,11 +138,11 @@ impl OperatorMode {
117138
}
118139

119140
/// Creates an [`LIKE`](Expr::Like) from the given `LIKE` pattern.
120-
fn expr(&self, expr: Box<Expr>, pattern: String) -> Expr {
141+
fn expr(&self, expr: Box<Expr>, pattern: Box<Expr>) -> Expr {
121142
let like = Like {
122143
negated: self.not,
123144
expr,
124-
pattern: Box::new(Expr::Literal(ScalarValue::from(pattern), None)),
145+
pattern,
125146
escape_char: None,
126147
case_insensitive: self.i,
127148
};
@@ -311,14 +332,24 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
311332
}
312333

313334
/// Tries to lower (transform) a simple regex pattern to a LIKE expression.
314-
fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
335+
fn lower_simple(
336+
mode: &OperatorMode,
337+
left: &Expr,
338+
hir: &Hir,
339+
string_scalar: &StringScalar,
340+
) -> Option<Expr> {
315341
match hir.kind() {
316342
HirKind::Empty => {
317-
return Some(mode.expr(Box::new(left.clone()), "%".to_owned()));
343+
return Some(
344+
mode.expr(Box::new(left.clone()), Box::new(string_scalar.to_expr("%"))),
345+
);
318346
}
319347
HirKind::Literal(l) => {
320348
let s = like_str_from_literal(l)?;
321-
return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
349+
return Some(mode.expr(
350+
Box::new(left.clone()),
351+
Box::new(string_scalar.to_expr(&format!("%{s}%"))),
352+
));
322353
}
323354
HirKind::Concat(inner) if is_anchored_literal(inner) => {
324355
return anchored_literal_to_expr(inner).map(|right| {
@@ -333,7 +364,10 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
333364
if let Some(pattern) = partial_anchored_literal_to_like(inner)
334365
.or_else(|| collect_concat_to_like_string(inner))
335366
{
336-
return Some(mode.expr(Box::new(left.clone()), pattern));
367+
return Some(mode.expr(
368+
Box::new(left.clone()),
369+
Box::new(string_scalar.to_expr(&pattern)),
370+
));
337371
}
338372
}
339373
_ => {}
@@ -344,11 +378,16 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
344378
/// Calls [`lower_simple`] for each alternative and combine the results with `or` or `and`
345379
/// based on [`OperatorMode`]. Any fail attempt to lower an alternative will makes this
346380
/// function to return `None`.
347-
fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option<Expr> {
381+
fn lower_alt(
382+
mode: &OperatorMode,
383+
left: &Expr,
384+
alts: &[Hir],
385+
string_scalar: &StringScalar,
386+
) -> Option<Expr> {
348387
let mut accu: Option<Expr> = None;
349388

350389
for part in alts {
351-
if let Some(expr) = lower_simple(mode, left, part) {
390+
if let Some(expr) = lower_simple(mode, left, part, string_scalar) {
352391
accu = match accu {
353392
Some(accu) => {
354393
if mode.not {

datafusion/sqllogictest/test_files/simplify_expr.slt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,20 @@ query TT
3434
explain select b from t where b ~ '.*'
3535
----
3636
logical_plan
37-
01)Filter: t.b ~ Utf8View(".*")
37+
01)Filter: t.b IS NOT NULL
3838
02)--TableScan: t projection=[b]
3939
physical_plan
40-
01)FilterExec: b@0 ~ .*
40+
01)FilterExec: b@0 IS NOT NULL
4141
02)--DataSourceExec: partitions=1, partition_sizes=[1]
4242

4343
query TT
4444
explain select b from t where b !~ '.*'
4545
----
4646
logical_plan
47-
01)Filter: t.b !~ Utf8View(".*")
47+
01)Filter: t.b = Utf8View("")
4848
02)--TableScan: t projection=[b]
4949
physical_plan
50-
01)FilterExec: b@0 !~ .*
50+
01)FilterExec: b@0 =
5151
02)--DataSourceExec: partitions=1, partition_sizes=[1]
5252

5353
query T

datafusion/sqllogictest/test_files/string/string_view.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1100,7 +1100,7 @@ EXPLAIN SELECT
11001100
FROM test;
11011101
----
11021102
logical_plan
1103-
01)Projection: test.column1_utf8view ~ Utf8View("an") AS c1
1103+
01)Projection: test.column1_utf8view LIKE Utf8View("%an%") AS c1
11041104
02)--TableScan: test projection=[column1_utf8view]
11051105

11061106
# `~*` operator (regex match case-insensitive)

0 commit comments

Comments
 (0)