Skip to content

Commit c18e121

Browse files
GaneshPatil7517alamb
authored andcommitted
Fix incorrect regex pattern in regex_replace_posix_groups (apache#19827)
The `regex_replace_posix_groups` method was using the pattern `(\d*)` to match POSIX capture group references like `\1`. However, `*` matches zero or more digits, which caused a lone backslash `\` to incorrectly become `${}`. Changed to `(\d+)` which requires at least one digit, fixing the issue. Added unit tests to validate correct behavior. - Fixes apache#19766 --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 8daa1e9 commit c18e121

1 file changed

Lines changed: 45 additions & 3 deletions

File tree

datafusion/functions/src/regex/regexpreplace.rs

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,13 +189,19 @@ fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ArrayRef> {
189189
}
190190
}
191191

192-
/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
192+
/// replace POSIX capture groups (like \1 or \\1) with Rust Regex group (like ${1})
193193
/// used by regexp_replace
194+
/// Handles both single backslash (\1) and double backslash (\\1) which can occur
195+
/// when SQL strings with escaped backslashes are passed through
196+
///
197+
/// Note: \0 is converted to ${0}, which in Rust's regex replacement syntax
198+
/// substitutes the entire match. This is consistent with POSIX behavior where
199+
/// \0 (or &) refers to the entire matched string.
194200
fn regex_replace_posix_groups(replacement: &str) -> String {
195201
static CAPTURE_GROUPS_RE_LOCK: LazyLock<Regex> =
196-
LazyLock::new(|| Regex::new(r"(\\)(\d*)").unwrap());
202+
LazyLock::new(|| Regex::new(r"\\{1,2}(\d+)").unwrap());
197203
CAPTURE_GROUPS_RE_LOCK
198-
.replace_all(replacement, "$${$2}")
204+
.replace_all(replacement, "$${$1}")
199205
.into_owned()
200206
}
201207

@@ -659,6 +665,42 @@ mod tests {
659665

660666
use super::*;
661667

668+
#[test]
669+
fn test_regex_replace_posix_groups() {
670+
// Test that \1, \2, etc. are replaced with ${1}, ${2}, etc.
671+
assert_eq!(regex_replace_posix_groups(r"\1"), "${1}");
672+
assert_eq!(regex_replace_posix_groups(r"\12"), "${12}");
673+
assert_eq!(regex_replace_posix_groups(r"X\1Y"), "X${1}Y");
674+
assert_eq!(regex_replace_posix_groups(r"\1\2"), "${1}${2}");
675+
676+
// Test double backslash (from SQL escaped strings like '\\1')
677+
assert_eq!(regex_replace_posix_groups(r"\\1"), "${1}");
678+
assert_eq!(regex_replace_posix_groups(r"X\\1Y"), "X${1}Y");
679+
assert_eq!(regex_replace_posix_groups(r"\\1\\2"), "${1}${2}");
680+
681+
// Test 3 or 4 backslashes before digits to document expected behavior
682+
assert_eq!(regex_replace_posix_groups(r"\\\1"), r"\${1}");
683+
assert_eq!(regex_replace_posix_groups(r"\\\\1"), r"\\${1}");
684+
assert_eq!(regex_replace_posix_groups(r"\\\1\\\\2"), r"\${1}\\${2}");
685+
686+
// Test that a lone backslash is NOT replaced (requires at least one digit)
687+
assert_eq!(regex_replace_posix_groups(r"\"), r"\");
688+
assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar");
689+
690+
// Test that backslash followed by non-digit is preserved
691+
assert_eq!(regex_replace_posix_groups(r"\n"), r"\n");
692+
assert_eq!(regex_replace_posix_groups(r"\t"), r"\t");
693+
694+
// Test \0 behavior: \0 is converted to ${0}, which in Rust's regex
695+
// replacement syntax substitutes the entire match. This is consistent
696+
// with POSIX behavior where \0 (or &) refers to the entire matched string.
697+
assert_eq!(regex_replace_posix_groups(r"\0"), "${0}");
698+
assert_eq!(
699+
regex_replace_posix_groups(r"prefix\0suffix"),
700+
"prefix${0}suffix"
701+
);
702+
}
703+
662704
macro_rules! static_pattern_regexp_replace {
663705
($name:ident, $T:ty, $O:ty) => {
664706
#[test]

0 commit comments

Comments
 (0)