Skip to content

Commit 3c973ef

Browse files
committed
feat: simplify regex wildcard pattern
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
1 parent a181e1d commit 3c973ef

File tree

3 files changed

+84
-3
lines changed

3 files changed

+84
-3
lines changed

datafusion/optimizer/src/simplify_expressions/regex.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
3333
/// - full anchored regex patterns (e.g. `^foo$`) to `= 'foo'`
3434
/// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'`
3535
/// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND`
36+
/// - a regex match against the wildcard pattern `.*` (e.g. `a ~ '.*'`) to the literal `true`
37+
/// - a negated regex match against `.*` (e.g. `a !~ '.*'`) to an empty-string check, `= ''`
3638
///
3739
/// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`.
3840
pub fn simplify_regex_expr(
@@ -43,6 +45,23 @@ pub fn simplify_regex_expr(
4345
let mode = OperatorMode::new(&op);
4446

4547
if let Expr::Literal(ScalarValue::Utf8(Some(pattern))) = right.as_ref() {
48+
// Handle the special case for ".*" pattern
49+
if pattern == ".*" {
50+
let new_expr = if mode.not {
51+
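// negated match: rewrite to `left = ""`, i.e. keep only empty strings
// NOTE(review): `.*` can also match the empty string; confirm this rewrite is the intended semantics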
52+
let empty_lit = Box::new(lit(""));
53+
Expr::BinaryExpr(BinaryExpr {
54+
left,
55+
op: Operator::Eq,
56+
right: empty_lit,
57+
})
58+
} else {
59+
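// positive match: `.*` matches every string, so the predicate folds to `true`
// NOTE(review): for a NULL input the original match yields NULL, not true; confirm nullable inputs are acceptable here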
60+
lit(true)
61+
};
62+
return Ok(new_expr);
63+
}
64+
4665
match regex_syntax::Parser::new().parse(pattern) {
4766
Ok(hir) => {
4867
let kind = hir.kind();

datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,4 +765,48 @@ mod tests {
765765

766766
assert_optimized_plan_eq(plan, expected)
767767
}
768+
769+
/// Checks how filters that regex-match column `a` against the literal pattern `.*` are
/// simplified, for all four regex operators (match / not-match, case-sensitive / -insensitive).
#[test]
770+
fn test_simplify_regex_special_cases() -> Result<()> {
771+
// Column `a` is declared as a non-nullable Utf8 field.
let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
772+
let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
773+
774+
// `Operator::RegexMatch` against ".*" is expected to simplify to the literal `true`
775+
let plan = LogicalPlanBuilder::from(table_scan.clone())
776+
.filter(binary_expr(col("a"), Operator::RegexMatch, lit(".*")))?
777+
.build()?;
778+
let expected = "Filter: Boolean(true)\
779+
\n TableScan: test";
780+
781+
assert_optimized_plan_eq(plan, expected)?;
782+
783+
// `Operator::RegexNotMatch` against ".*" is expected to simplify to an
// equality check against the empty string (`test.a = ""`)
784+
let plan = LogicalPlanBuilder::from(table_scan.clone())
785+
.filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))?
786+
.build()?;
787+
let expected = "Filter: test.a = Utf8(\"\")\
788+
\n TableScan: test";
789+
790+
assert_optimized_plan_eq(plan, expected)?;
791+
792+
// The case-insensitive operators are expected to simplify the same way
793+
794+
// `Operator::RegexIMatch` (case-insensitive match) against ".*" is expected
// to simplify to the literal `true`
795+
let plan = LogicalPlanBuilder::from(table_scan.clone())
796+
.filter(binary_expr(col("a"), Operator::RegexIMatch, lit(".*")))?
797+
.build()?;
798+
let expected = "Filter: Boolean(true)\
799+
\n TableScan: test";
800+
801+
assert_optimized_plan_eq(plan, expected)?;
802+
803+
// `Operator::RegexNotIMatch` (case-insensitive not-match) against ".*" is
// expected to simplify to an equality check against the empty string
804+
let plan = LogicalPlanBuilder::from(table_scan.clone())
805+
.filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))?
806+
.build()?;
807+
let expected = "Filter: test.a = Utf8(\"\")\
808+
\n TableScan: test";
809+
810+
assert_optimized_plan_eq(plan, expected)
811+
}
768812
}

datafusion/sqllogictest/test_files/simplify_expr.slt

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
statement count 0
19-
create table t(a int) as values (1);
18+
statement ok
19+
create table t(a int, b string) as values (1, 'a');
2020

2121
# test between simplification
2222
query TT
@@ -30,5 +30,23 @@ physical_plan
3030
02)--FilterExec: a@0 = 3
3131
03)----DataSourceExec: partitions=1, partition_sizes=[1]
3232

33-
statement count 0
33+
# test regex exprs
34+
query TT
35+
explain select b from t where b ~ '.*'
36+
----
37+
logical_plan TableScan: t projection=[b]
38+
physical_plan DataSourceExec: partitions=1, partition_sizes=[1]
39+
40+
query TT
41+
explain select b from t where b !~ '.*'
42+
----
43+
logical_plan
44+
01)Filter: t.b = Utf8("")
45+
02)--TableScan: t projection=[b]
46+
physical_plan
47+
01)CoalesceBatchesExec: target_batch_size=8192
48+
02)--FilterExec: b@0 =
49+
03)----DataSourceExec: partitions=1, partition_sizes=[1]
50+
51+
statement ok
3452
drop table t;

0 commit comments

Comments
 (0)