diff --git a/core/vdbe/likeop.rs b/core/vdbe/likeop.rs
index 2d01e6c0d..ad69ba97c 100644
--- a/core/vdbe/likeop.rs
+++ b/core/vdbe/likeop.rs
@@ -1,3 +1,5 @@
+use std::collections::HashMap;
+
 use regex::{Regex, RegexBuilder};
 
 use crate::{types::OwnedValue, LimboError};
@@ -61,6 +63,150 @@ fn construct_like_regex_with_escape(pattern: &str, escape: char) -> Regex {
         .unwrap()
 }
 
+/// Implements SQL `GLOB` matching of `text` against `pattern`.
+///
+/// The pattern is translated to a regex by `construct_glob_regex`. When
+/// `regex_cache` is provided, the compiled regex is looked up by pattern and
+/// inserted on a miss, so a repeated pattern is compiled only once.
+///
+/// Returns `false` when the pattern cannot be translated (for example an
+/// unclosed `[`); such patterns are not cached.
+pub fn exec_glob(
+    regex_cache: Option<&mut HashMap<String, Regex>>,
+    pattern: &str,
+    text: &str,
+) -> bool {
+    if let Some(cache) = regex_cache {
+        match cache.get(pattern) {
+            Some(re) => re.is_match(text),
+            None => match construct_glob_regex(pattern) {
+                Ok(re) => {
+                    let res = re.is_match(text);
+                    cache.insert(pattern.to_string(), re);
+                    res
+                }
+                Err(_) => false,
+            },
+        }
+    } else {
+        construct_glob_regex(pattern)
+            .map(|re| re.is_match(text))
+            .unwrap_or(false)
+    }
+}
+
+/// Appends `c` to `regex_pattern`, backslash-escaping it when it is a regex
+/// meta character so that it is matched literally.
+fn push_char_to_regex_pattern(c: char, regex_pattern: &mut String) {
+    if regex_syntax::is_meta_character(c) {
+        regex_pattern.push('\\');
+    }
+    regex_pattern.push(c);
+}
+
+/// Translates a GLOB `pattern` into an anchored regex.
+///
+/// - `*` becomes `.*` and `?` becomes `.`; the `(?s)` flag lets both match
+///   newlines, since GLOB wildcards match any character.
+/// - `[...]` and `[^...]` become regex character classes. A `]` directly after
+///   `[` or `[^` is taken literally, and `-` is passed through to form ranges.
+/// - Every other character is matched literally.
+///
+/// # Errors
+///
+/// Returns `LimboError::Constraint` when a `[` is never closed, or when the
+/// translated pattern is rejected by the regex engine (for example the
+/// reversed range `[z-a]`).
+fn construct_glob_regex(pattern: &str) -> Result<Regex, LimboError> {
+    let mut regex_pattern = String::with_capacity(pattern.len() * 2);
+
+    // `(?s)` makes `.` match `\n` as well, so `*` and `?` can span newlines.
+    regex_pattern.push_str("(?s)^");
+
+    let mut chars = pattern.chars();
+    let mut bracket_closed = true;
+
+    while let Some(ch) = chars.next() {
+        match ch {
+            '[' => {
+                bracket_closed = false;
+                regex_pattern.push('[');
+                if let Some(next_ch) = chars.next() {
+                    match next_ch {
+                        ']' => {
+                            // The string enclosed by the brackets cannot be empty;
+                            // therefore ']' can be allowed between the brackets,
+                            // provided that it is the first character.
+                            // So this means
+                            // - `[]]` will be translated to `[\]]`
+                            // - `[]a]` will be translated to `[\]a]`
+                            regex_pattern.push_str("\\]");
+                        }
+                        '^' => {
+                            // In most cases we can pass `^` directly to regex,
+                            // but in cases like `[^][a]` the leading `[^]` would make the regex
+                            // crate report an unclosed character class. So this means
+                            // - `[^][a]` will be translated to `[^\]\[a]`
+                            regex_pattern.push('^');
+                            if let Some(next_ch_2) = chars.next() {
+                                match next_ch_2 {
+                                    ']' => {
+                                        regex_pattern.push('\\');
+                                        regex_pattern.push(']');
+                                    }
+                                    c => {
+                                        push_char_to_regex_pattern(c, &mut regex_pattern);
+                                    }
+                                }
+                            }
+                        }
+                        c => {
+                            push_char_to_regex_pattern(c, &mut regex_pattern);
+                        }
+                    }
+                };
+
+                while let Some(next_ch) = chars.next() {
+                    match next_ch {
+                        ']' => {
+                            bracket_closed = true;
+                            regex_pattern.push(']');
+                            break;
+                        }
+                        '-' => {
+                            regex_pattern.push('-');
+                        }
+                        c => {
+                            push_char_to_regex_pattern(c, &mut regex_pattern);
+                        }
+                    }
+                }
+            }
+            '?' => {
+                regex_pattern.push('.');
+            }
+            '*' => {
+                regex_pattern.push_str(".*");
+            }
+            c => {
+                push_char_to_regex_pattern(c, &mut regex_pattern);
+            }
+        }
+    }
+    regex_pattern.push('$');
+
+    if !bracket_closed {
+        return Err(LimboError::Constraint(
+            "glob pattern is not closed".to_string(),
+        ));
+    }
+
+    // The translated pattern can still be invalid (e.g. `[z-a]`); report that
+    // as an error instead of panicking on user-supplied input.
+    Regex::new(&regex_pattern)
+        .map_err(|e| LimboError::Constraint(e.to_string()))
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -82,4 +228,27 @@ mod test {
         assert!(!exec_like_with_escape("abcXX", "abc", 'X'));
         assert!(!exec_like_with_escape("abcXX", "abcXX", 'X'));
     }
+
+    #[test]
+    fn test_glob_no_cache() {
+        assert!(exec_glob(None, r#"?*/abc/?*"#, r#"x//a/ab/abc/y"#));
+        assert!(exec_glob(None, r#"a[1^]"#, r#"a1"#));
+        assert!(exec_glob(None, r#"a[1^]*"#, r#"a^"#));
+        assert!(!exec_glob(None, r#"a[a*"#, r#"a["#));
+        assert!(!exec_glob(None, r#"a[a"#, r#"a[a"#));
+        assert!(exec_glob(None, r#"a[[]"#, r#"a["#));
+        assert!(exec_glob(None, r#"abc[^][*?]efg"#, r#"abcdefg"#));
+        assert!(!exec_glob(None, r#"abc[^][*?]efg"#, r#"abc]efg"#));
+    }
+
+    #[test]
+    fn test_glob_wildcards_match_newline() {
+        assert!(exec_glob(None, "a*b", "a\nb"));
+        assert!(exec_glob(None, "a?b", "a\nb"));
+    }
+
+    #[test]
+    fn test_glob_invalid_range_does_not_panic() {
+        assert!(!exec_glob(None, "a[z-a]b", "azb"));
+    }
 }
diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs
index 609b8ad86..52b74ccc2 100644
--- a/core/vdbe/mod.rs
+++ b/core/vdbe/mod.rs
@@ -42,7 +42,7 @@ use crate::vdbe::insn::Insn;
 use crate::{function::JsonFunc, json::get_json, json::json_array, json::json_array_length};
 use crate::{Connection, Result, Rows, TransactionState, DATABASE_VERSION};
 use datetime::{exec_date, exec_time, exec_unixepoch};
-use likeop::{construct_like_escape_arg, exec_like_with_escape};
+use likeop::{construct_like_escape_arg, exec_glob, exec_like_with_escape};
 use rand::distributions::{Distribution, Uniform};
 use rand::{thread_rng, Rng};
 use regex::{Regex, RegexBuilder};
@@ -2880,31 +2880,6 @@ fn exec_like(regex_cache: Option<&mut HashMap<String, Regex>>, pattern: &str, te
     }
 }
 
-fn construct_glob_regex(pattern: &str) -> Regex {
-    let mut regex_pattern = String::from("^");
-    regex_pattern.push_str(&pattern.replace('*', ".*").replace("?", "."));
-    regex_pattern.push('$');
-    Regex::new(&regex_pattern).unwrap()
-}
-
-// Implements GLOB pattern matching. Caches the constructed regex if a cache is provided
-fn exec_glob(regex_cache: Option<&mut HashMap<String, Regex>>, pattern: &str, text: &str) -> bool {
-    if let Some(cache) = regex_cache {
-        match cache.get(pattern) {
-            Some(re) => re.is_match(text),
-            None => {
-                let re = construct_glob_regex(pattern);
-                let res = re.is_match(text);
-                cache.insert(pattern.to_string(), re);
-                res
-            }
-        }
-    } else {
-        let re = construct_glob_regex(pattern);
-        re.is_match(text)
-    }
-}
-
 fn exec_min(regs: Vec<&OwnedValue>) -> OwnedValue {
     regs.iter()
         .min()
diff --git a/testing/glob.test b/testing/glob.test
index 249ea8151..730fd20d6 100644
--- a/testing/glob.test
+++ b/testing/glob.test
@@ -68,3 +68,75 @@ Robert|Roberts}
 do_execsql_test where-glob-impossible {
     select * from products where 'foobar' glob 'fooba';
 } {}
+
+foreach {testnum pattern text ans} {
+  1 abcdefg abcdefg 1
+  2 abcdefG abcdefg 0
+  3 abcdef abcdefg 0
+  4 abcdefgh abcdefg 0
+  5 abcdef? abcdefg 1
+  6 abcdef? abcdef 0
+  7 abcdef? abcdefgh 0
+  8 abcdefg abcdef? 0
+  9 abcdef? abcdef? 1
+  10 abc/def abc/def 1
+  11 abc//def abc/def 0
+  12 */abc/* x/abc/y 1
+  13 */abc/* /abc/ 1
+  16 */abc/* x///a/ab/abc 0
+  17 */abc/* x//a/ab/abc/ 1
+  16 */abc/* x///a/ab/abc 0
+  17 */abc/* x//a/ab/abc/ 1
+  18 **/abc/** x//a/ab/abc/ 1
+  19 *?/abc/*? x//a/ab/abc/y 1
+  20 ?*/abc/?* x//a/ab/abc/y 1
+  21 {abc[cde]efg} abcbefg 0
+  22 {abc[cde]efg} abccefg 1
+  23 {abc[cde]efg} abcdefg 1
+  24 {abc[cde]efg} abceefg 1
+  25 {abc[cde]efg} abcfefg 0
+  26 {abc[^cde]efg} abcbefg 1
+  27 {abc[^cde]efg} abccefg 0
+  28 {abc[^cde]efg} abcdefg 0
+  29 {abc[^cde]efg} abceefg 0
+  30 {abc[^cde]efg} abcfefg 1
+  31 {abc[c-e]efg} abcbefg 0
+  32 {abc[c-e]efg} abccefg 1
+  33 {abc[c-e]efg} abcdefg 1
+  34 {abc[c-e]efg} abceefg 1
+  35 {abc[c-e]efg} abcfefg 0
+  36 {abc[^c-e]efg} abcbefg 1
+  37 {abc[^c-e]efg} abccefg 0
+  38 {abc[^c-e]efg} abcdefg 0
+  39 {abc[^c-e]efg} abceefg 0
+  40 {abc[^c-e]efg} abcfefg 1
+  41 {abc[c-e]efg} abc-efg 0
+  42 {abc[-ce]efg} abc-efg 1
+  43 {abc[ce-]efg} abc-efg 1
+  44 {abc[][*?]efg} {abc]efg} 1
+  45 {abc[][*?]efg} {abc*efg} 1
+  46 {abc[][*?]efg} {abc?efg} 1
+  47 {abc[][*?]efg} {abc[efg} 1
+  48 {abc[^][*?]efg} {abc]efg} 0
+  49 {abc[^][*?]efg} {abc*efg} 0
+  50 {abc[^][*?]efg} {abc?efg} 0
+  51 {abc[^][*?]efg} {abc[efg} 0
+  52 {abc[^][*?]efg} {abcdefg} 1
+  53 {*[xyz]efg} {abcxefg} 1
+  54 {*[xyz]efg} {abcwefg} 0
+  55 {[-c]} {c} 1
+  56 {[-c]} {-} 1
+  57 {[-c]} {x} 0
+} {
+  do_execsql_test glob-$testnum.1 "SELECT glob ( '$pattern' , '$text' )" $::ans
+}
+
+
+foreach {testnum pattern text ans} {
+  1 {abc[} {abc[} 0
+  2 {abc[} {abc} 0
+  3 {a]b} {a]b} 1
+  4 {a]b} {a[b} 0
+} {
+  do_execsql_test glob-unenclosed-$testnum.1 "SELECT glob ( '$pattern' , '$text' )" $::ans
+}