Merge 'Fixes glob giving wrong results in some cases ' from Vrishabh

Fixes #577
The previous implementation did not escape regex metacharacters, and in
certain cases a glob pattern means something different from the same text
interpreted as a regex.
For example, the following shows glob patterns with their regex equivalents:
- `[][]` translates to `[\]\[]`
- `[^][]` translates to `[^\]\[]`

Closes #578
This commit is contained in:
Pekka Enberg
2024-12-31 10:31:49 +02:00
3 changed files with 205 additions and 26 deletions

View File

@@ -1,3 +1,5 @@
use std::collections::HashMap;
use regex::{Regex, RegexBuilder};
use crate::{types::OwnedValue, LimboError};
@@ -61,6 +63,124 @@ fn construct_like_regex_with_escape(pattern: &str, escape: char) -> Regex {
.unwrap()
}
// Implements GLOB pattern matching. Caches the constructed regex if a cache is provided
/// Returns whether `text` matches the GLOB `pattern`.
///
/// When `regex_cache` is supplied, the compiled regex is looked up by the
/// pattern string and, on a miss, inserted after a successful compile so
/// later calls with the same pattern skip the translation step. A pattern
/// that cannot be translated is treated as "no match" and is not cached.
pub fn exec_glob(
    regex_cache: Option<&mut HashMap<String, Regex>>,
    pattern: &str,
    text: &str,
) -> bool {
    let cache = match regex_cache {
        Some(cache) => cache,
        // No cache available: compile, match once, and discard the regex.
        None => {
            return match construct_glob_regex(pattern) {
                Ok(compiled) => compiled.is_match(text),
                Err(_) => false,
            };
        }
    };

    // Cache hit: reuse the previously compiled regex.
    if let Some(cached) = cache.get(pattern) {
        return cached.is_match(text);
    }

    // Cache miss: compile, evaluate, then remember the regex for next time.
    match construct_glob_regex(pattern) {
        Ok(compiled) => {
            let matched = compiled.is_match(text);
            cache.insert(pattern.to_string(), compiled);
            matched
        }
        Err(_) => false,
    }
}
/// Appends `c` to `regex_pattern`, preceded by a backslash whenever
/// `regex_syntax::is_meta_character` reports `c` as a regex metacharacter,
/// so that the character is always matched literally.
fn push_char_to_regex_pattern(c: char, regex_pattern: &mut String) {
    let needs_escape = regex_syntax::is_meta_character(c);
    match needs_escape {
        true => {
            regex_pattern.push('\\');
            regex_pattern.push(c);
        }
        false => regex_pattern.push(c),
    }
}
/// Translates a GLOB `pattern` into an anchored [`Regex`].
///
/// Translation rules:
/// - `*` becomes `.*` and `?` becomes `.`; the regex is compiled with the
///   `s` flag so both also match `\n`, as GLOB wildcards match any character.
/// - `[...]` is passed through as a regex character class. A `]` that is the
///   first character of the class (optionally after a leading `^`) is taken
///   literally, `-` is kept as a range operator, and every other regex
///   metacharacter inside the class is escaped.
/// - Any other character is matched literally (escaped when it is a regex
///   metacharacter).
///
/// # Errors
///
/// Returns `LimboError::Constraint` when a `[` is never closed, or when the
/// translated pattern is rejected by the regex crate (for example a reversed
/// range such as `[z-a]`). The pattern comes from user SQL, so this must not
/// panic.
fn construct_glob_regex(pattern: &str) -> Result<Regex, LimboError> {
    // Worst case every character is escaped; the extra bytes cover the
    // "(?s)^" prefix and the "$" suffix.
    let mut regex_pattern = String::with_capacity(pattern.len() * 2 + 6);
    // `(?s)` makes `.` match line terminators too.
    regex_pattern.push_str("(?s)^");

    let mut chars = pattern.chars();
    let mut bracket_closed = true;

    while let Some(ch) = chars.next() {
        match ch {
            '[' => {
                bracket_closed = false;
                regex_pattern.push('[');

                // The first character of a class gets special treatment: the
                // class cannot be empty, so a leading ']' is a literal member
                // rather than the terminator.
                // - `[]]` is translated to `[\]]`
                // - `[[]` is translated to `[\[]`
                match chars.next() {
                    Some(']') => regex_pattern.push_str("\\]"),
                    Some('^') => {
                        // `^` negates the class in both syntaxes, but a ']'
                        // right after it is again a literal member; passing
                        // `[^]` through unescaped would make the regex crate
                        // report an unclosed character class.
                        // - `[^][a]` is translated to `[^\]\[a]`
                        regex_pattern.push('^');
                        match chars.next() {
                            Some(']') => regex_pattern.push_str("\\]"),
                            Some(c) => push_char_to_regex_pattern(c, &mut regex_pattern),
                            None => {}
                        }
                    }
                    Some(c) => push_char_to_regex_pattern(c, &mut regex_pattern),
                    None => {}
                }

                // Remaining class members, up to and including the closing ']'.
                for next_ch in chars.by_ref() {
                    match next_ch {
                        ']' => {
                            bracket_closed = true;
                            regex_pattern.push(']');
                            break;
                        }
                        // Keep '-' unescaped so ranges like `c-e` survive.
                        '-' => regex_pattern.push('-'),
                        c => push_char_to_regex_pattern(c, &mut regex_pattern),
                    }
                }
            }
            '?' => regex_pattern.push('.'),
            '*' => regex_pattern.push_str(".*"),
            c => push_char_to_regex_pattern(c, &mut regex_pattern),
        }
    }
    regex_pattern.push('$');

    if !bracket_closed {
        return Err(LimboError::Constraint(
            "glob pattern is not closed".to_string(),
        ));
    }

    // Some syntactically closed globs still translate to an invalid regex
    // (e.g. a reversed range); surface that as an error instead of panicking.
    Regex::new(&regex_pattern)
        .map_err(|e| LimboError::Constraint(format!("invalid glob pattern: {}", e)))
}
#[cfg(test)]
mod test {
use super::*;
@@ -82,4 +202,16 @@ mod test {
assert!(!exec_like_with_escape("abcXX", "abc", 'X'));
assert!(!exec_like_with_escape("abcXX", "abcXX", 'X'));
}
#[test]
fn test_glob_no_cache() {
    // Each entry is (glob pattern, text, expected match result); every case
    // is evaluated without a regex cache.
    let cases = [
        (r#"?*/abc/?*"#, r#"x//a/ab/abc/y"#, true),
        (r#"a[1^]"#, r#"a1"#, true),
        (r#"a[1^]*"#, r#"a^"#, true),
        // Unclosed brackets never match.
        (r#"a[a*"#, r#"a["#, false),
        (r#"a[a"#, r#"a[a"#, false),
        (r#"a[[]"#, r#"a["#, true),
        // A ']' directly after '[^' is a literal member of the negated class.
        (r#"abc[^][*?]efg"#, r#"abcdefg"#, true),
        (r#"abc[^][*?]efg"#, r#"abc]efg"#, false),
    ];

    for &(pattern, text, expected) in cases.iter() {
        assert_eq!(
            exec_glob(None, pattern, text),
            expected,
            "pattern {:?} against text {:?}",
            pattern,
            text
        );
    }
}
}

View File

@@ -42,7 +42,7 @@ use crate::vdbe::insn::Insn;
use crate::{function::JsonFunc, json::get_json, json::json_array, json::json_array_length};
use crate::{Connection, Result, Rows, TransactionState, DATABASE_VERSION};
use datetime::{exec_date, exec_time, exec_unixepoch};
use likeop::{construct_like_escape_arg, exec_like_with_escape};
use likeop::{construct_like_escape_arg, exec_glob, exec_like_with_escape};
use rand::distributions::{Distribution, Uniform};
use rand::{thread_rng, Rng};
use regex::{Regex, RegexBuilder};
@@ -2880,31 +2880,6 @@ fn exec_like(regex_cache: Option<&mut HashMap<String, Regex>>, pattern: &str, te
}
}
/// Builds an anchored regex from a GLOB pattern by textual substitution:
/// every `*` becomes `.*` and every `?` becomes `.`.
///
/// No other character is escaped or rewritten, so the rest of the pattern is
/// handed to the regex crate verbatim. Panics if the resulting string is not
/// a valid regex.
fn construct_glob_regex(pattern: &str) -> Regex {
    let translated = pattern.replace('*', ".*").replace('?', ".");
    let anchored = format!("^{}$", translated);
    Regex::new(&anchored).unwrap()
}
/// Implements GLOB pattern matching.
///
/// With a cache, the compiled regex is looked up by pattern string and stored
/// on a miss; without one, the regex is compiled for this call only.
fn exec_glob(regex_cache: Option<&mut HashMap<String, Regex>>, pattern: &str, text: &str) -> bool {
    match regex_cache {
        None => construct_glob_regex(pattern).is_match(text),
        Some(cache) => {
            if let Some(cached) = cache.get(pattern) {
                return cached.is_match(text);
            }
            let compiled = construct_glob_regex(pattern);
            let matched = compiled.is_match(text);
            cache.insert(pattern.to_string(), compiled);
            matched
        }
    }
}
fn exec_min(regs: Vec<&OwnedValue>) -> OwnedValue {
regs.iter()
.min()

View File

@@ -68,3 +68,75 @@ Robert|Roberts}
# The pattern 'fooba' contains no wildcard and is shorter than 'foobar', so
# the GLOB condition is false for every row and the expected result is empty.
do_execsql_test where-glob-impossible {
select * from products where 'foobar' glob 'fooba';
} {}
# Table-driven checks of the scalar glob(pattern, text) function.
# Each row is: test number, glob pattern, text, expected result (1 = match).
# Patterns containing brackets are wrapped in {} so Tcl does not treat them
# as command substitution. Test numbers must be unique because they become
# part of the generated test name (glob-<testnum>.1).
foreach {testnum pattern text ans} {
  1 abcdefg abcdefg 1
  2 abcdefG abcdefg 0
  3 abcdef abcdefg 0
  4 abcdefgh abcdefg 0
  5 abcdef? abcdefg 1
  6 abcdef? abcdef 0
  7 abcdef? abcdefgh 0
  8 abcdefg abcdef? 0
  9 abcdef? abcdef? 1
  10 abc/def abc/def 1
  11 abc//def abc/def 0
  12 */abc/* x/abc/y 1
  13 */abc/* /abc/ 1
  16 */abc/* x///a/ab/abc 0
  17 */abc/* x//a/ab/abc/ 1
  18 **/abc/** x//a/ab/abc/ 1
  19 *?/abc/*? x//a/ab/abc/y 1
  20 ?*/abc/?* x//a/ab/abc/y 1
  21 {abc[cde]efg} abcbefg 0
  22 {abc[cde]efg} abccefg 1
  23 {abc[cde]efg} abcdefg 1
  24 {abc[cde]efg} abceefg 1
  25 {abc[cde]efg} abcfefg 0
  26 {abc[^cde]efg} abcbefg 1
  27 {abc[^cde]efg} abccefg 0
  28 {abc[^cde]efg} abcdefg 0
  29 {abc[^cde]efg} abceefg 0
  30 {abc[^cde]efg} abcfefg 1
  31 {abc[c-e]efg} abcbefg 0
  32 {abc[c-e]efg} abccefg 1
  33 {abc[c-e]efg} abcdefg 1
  34 {abc[c-e]efg} abceefg 1
  35 {abc[c-e]efg} abcfefg 0
  36 {abc[^c-e]efg} abcbefg 1
  37 {abc[^c-e]efg} abccefg 0
  38 {abc[^c-e]efg} abcdefg 0
  39 {abc[^c-e]efg} abceefg 0
  40 {abc[^c-e]efg} abcfefg 1
  41 {abc[c-e]efg} abc-efg 0
  42 {abc[-ce]efg} abc-efg 1
  43 {abc[ce-]efg} abc-efg 1
  44 {abc[][*?]efg} {abc]efg} 1
  45 {abc[][*?]efg} {abc*efg} 1
  46 {abc[][*?]efg} {abc?efg} 1
  47 {abc[][*?]efg} {abc[efg} 1
  48 {abc[^][*?]efg} {abc]efg} 0
  49 {abc[^][*?]efg} {abc*efg} 0
  50 {abc[^][*?]efg} {abc?efg} 0
  51 {abc[^][*?]efg} {abc[efg} 0
  52 {abc[^][*?]efg} {abcdefg} 1
  53 {*[xyz]efg} {abcxefg} 1
  54 {*[xyz]efg} {abcwefg} 0
  55 {[-c]} {c} 1
  56 {[-c]} {-} 1
  57 {[-c]} {x} 0
} {
  do_execsql_test glob-$testnum.1 "SELECT glob ( '$pattern' , '$text' )" $::ans
}
# Bracket edge cases for glob(pattern, text): an opening '[' that is never
# closed yields 0 (rows 1-2), while a ']' with no preceding '[' is an
# ordinary literal character (rows 3-4).
# Each row is: test number, glob pattern, text, expected result.
foreach {testnum pattern text ans} {
1 {abc[} {abc[} 0
2 {abc[} {abc} 0
3 {a]b} {a]b} 1
4 {a]b} {a[b} 0
} {
do_execsql_test glob-unenclosed-$testnum.1 "SELECT glob ( '$pattern' , '$text' )" $::ans
}