// Patch (unified diff) for extensions/fuzzy/src/lib.rs, followed by the header of the
// newly added file extensions/fuzzy/src/rsoundex.rs. The lib.rs hunks:
//   * declare the new `rsoundex` module and append `fuzzy_rsoundex` to the `scalars`
//     list of `register_extension!`;
//   * add the `fuzzy_rsoundex` scalar, which passes `args[0].to_text()` to
//     `rsoundex::rsoundex` and returns the text on `Some`, otherwise `Value::null()`;
//   * add the `test_rsoundex` unit test covering None, "", "phonetics", "is", "awesome".
// NOTE(review): the patch text appears whitespace-collapsed; line breaks presumably need
// to be restored before it can be applied — confirm against the original commit.
diff --git a/extensions/fuzzy/src/lib.rs b/extensions/fuzzy/src/lib.rs index 64f65804d..aeb5dad9f 100644 --- a/extensions/fuzzy/src/lib.rs +++ b/extensions/fuzzy/src/lib.rs @@ -5,10 +5,11 @@ mod caver; mod common; mod editdist; mod phonetic; +mod rsoundex; mod soundex; register_extension! { - scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist, fuzzy_soundex, fuzzy_phonetic, fuzzy_caver}, + scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist, fuzzy_soundex, fuzzy_phonetic, fuzzy_caver, fuzzy_rsoundex}, } /// Calculates and returns the Levenshtein distance of two non NULL strings. @@ -419,6 +420,16 @@ pub fn fuzzy_caver(args: &[Value]) { } } +#[scalar(name = "fuzzy_rsoundex")] +pub fn fuzzy_rsoundex(args: &[Value]) { + let arg1 = args[0].to_text(); + if let Some(txt) = rsoundex::rsoundex(arg1) { + Value::from_text(txt) + } else { + Value::null() + } +} + //tests adapted from sqlean fuzzy #[cfg(test)] mod tests { @@ -599,4 +610,22 @@ mod tests { ); } } + #[test] + fn test_rsoundex() { + let cases = vec![ + (None, None), + (Some(""), Some("".to_string())), + (Some("phonetics"), Some("P1080603".to_string())), + (Some("is"), Some("I03".to_string())), + (Some("awesome"), Some("A03080".to_string())), + ]; + + for (input, expected) in cases { + let result = rsoundex::rsoundex(input); + assert_eq!( + result, expected, + "fuzzy_rsoundex({input:?}) failed: expected {expected:?}, got {result:?}" + ); + } + } } diff --git a/extensions/fuzzy/src/rsoundex.rs b/extensions/fuzzy/src/rsoundex.rs new file mode 100644 index 000000000..2163d0cce --- /dev/null +++ b/extensions/fuzzy/src/rsoundex.rs @@ -0,0 +1,49 @@ +/// Computes and returns the refined soundex representation of a given non NULL string. 
/// Computes the Refined Soundex code of `input`.
///
/// The code is the first character of the input (ASCII-uppercased) followed by the
/// refined-soundex digit of every character — *including* the first one — with runs
/// of identical adjacent digits collapsed into a single digit.
///
/// More information about the algorithm can be found here:
/// http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html
///
/// # Returns
///
/// * `None` when `input` is `None` (SQL NULL in, NULL out).
/// * `Some("")` when `input` is the empty string.
/// * `Some(code)` otherwise, e.g. `"phonetics"` -> `"P1080603"`.
///
/// The input is processed per Unicode scalar value rather than per byte, so a
/// multi-byte UTF-8 character is never split: it is kept intact when it is the
/// leading character and is encoded as a single `'0'` like any other unmapped
/// character.
pub fn rsoundex(input: Option<&str>) -> Option<String> {
    let s = input?;

    let first = match s.chars().next() {
        Some(c) => c,
        // Empty input has no leading character to emit.
        None => return Some(String::new()),
    };

    // Upper bound: the leading character plus at most one ASCII digit per input byte.
    let mut code = String::with_capacity(s.len() + first.len_utf8());
    code.push(first.to_ascii_uppercase());

    // The leading character is encoded again on purpose ("awesome" -> "A0...").
    let mut prev = None;
    for digit in s.chars().map(refined_soundex_encode) {
        // Collapse runs of the same digit; only adjacent duplicates are dropped.
        if prev != Some(digit) {
            code.push(digit);
            prev = Some(digit);
        }
    }

    Some(code)
}

/// Maps one character to its refined-soundex digit; ASCII letters are matched
/// case-insensitively.
///
/// Every character without an explicit group (vowels, `h`, `w`, `y`, digits,
/// punctuation, non-ASCII characters) maps to `'0'`.
fn refined_soundex_encode(c: char) -> char {
    match c.to_ascii_lowercase() {
        'b' | 'p' => '1',
        'f' | 'v' => '2',
        'c' | 'k' | 's' => '3',
        'g' | 'j' => '4',
        'q' | 'x' | 'z' => '5',
        'd' | 't' => '6',
        'l' => '7',
        'm' | 'n' => '8',
        'r' => '9',
        _ => '0',
    }
}