From 815f0ffe259a418ea05111b38abbe8cd062b6549 Mon Sep 17 00:00:00 2001
From: danawan
Date: Wed, 24 Sep 2025 13:58:19 +0700
Subject: [PATCH] add phonetic and soundex

---
 extensions/fuzzy/src/common.rs   |  28 ++++++--
 extensions/fuzzy/src/lib.rs      |  60 ++++++++++++++++-
 extensions/fuzzy/src/phonetic.rs | 110 +++++++++++++++++++++++++++++++
 extensions/fuzzy/src/soundex.rs  |  65 ++++++++++++++++++
 testing/cli_tests/extensions.py  |  11 ++++
 5 files changed, 267 insertions(+), 7 deletions(-)
 create mode 100644 extensions/fuzzy/src/phonetic.rs
 create mode 100644 extensions/fuzzy/src/soundex.rs

diff --git a/extensions/fuzzy/src/common.rs b/extensions/fuzzy/src/common.rs
index 4b0c12fd1..9fc5a3a4d 100644
--- a/extensions/fuzzy/src/common.rs
+++ b/extensions/fuzzy/src/common.rs
@@ -2,13 +2,12 @@ pub const CCLASS_SILENT: u8 = 0;
 pub const CCLASS_VOWEL: u8 = 1;
 pub const CCLASS_B: u8 = 2;
 pub const CCLASS_Y: u8 = 9;
-//This will be useful in the phonetic
-//pub const CCLASS_L: u8 = 6;
-//pub const CCLASS_R: u8 = 7;
+pub const CCLASS_L: u8 = 6;
+pub const CCLASS_R: u8 = 7;
 //pub const CCLASS_M: u8 = 8;
-//pub const CCLASS_DIGIT: u8 = 10;
-//pub const CCLASS_SPACE: u8 = 11;
-//pub const CCLASS_OTHER: u8 = 12;
+pub const CCLASS_DIGIT: u8 = 10;
+pub const CCLASS_SPACE: u8 = 11;
+pub const CCLASS_OTHER: u8 = 12;
 pub const MID_CLASS: [u8; 128] = [
     12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 12, 12, 11, 11, 12, 12, //
     12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, //
@@ -30,3 +29,20 @@ pub const INIT_CLASS: [u8; 128] = [
     12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, //
     2, 3, 7, 3, 4, 1, 2, 2, 3, 9, 3, 12, 12, 12, 12, 12, //
 ];
+
+// Based on: const unsigned char className[] = ".ABCDHLRMY9 ?";
+pub const CLASS_NAME: [u8; 13] = [
+    b'.', // CCLASS_SILENT (0) -> .
+    b'A', // CCLASS_VOWEL (1) -> A
+    b'B', // CCLASS_B (2) -> B
+    b'C', // CCLASS_C (3) -> C
+    b'D', // CCLASS_D (4) -> D
+    b'H', // CCLASS_H (5) -> H
+    b'L', // CCLASS_L (6) -> L
+    b'R', // CCLASS_R (7) -> R
+    b'M', // CCLASS_M (8) -> M
+    b'Y', // CCLASS_Y (9) -> Y
+    b'9', // CCLASS_DIGIT (10) -> 9
+    b' ', // CCLASS_SPACE (11) -> space
+    b'?', // CCLASS_OTHER (12) -> ?
+];
diff --git a/extensions/fuzzy/src/lib.rs b/extensions/fuzzy/src/lib.rs
index d34a42a06..3ede9faa5 100644
--- a/extensions/fuzzy/src/lib.rs
+++ b/extensions/fuzzy/src/lib.rs
@@ -3,9 +3,11 @@ use std::cmp;
 use turso_ext::{register_extension, scalar, ResultCode, Value};
 mod common;
 mod editdist;
+mod phonetic;
+mod soundex;
 
 register_extension! {
-    scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist},
+    scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist, fuzzy_soundex, fuzzy_phonetic},
 }
 
 /// Calculates and returns the Levenshtein distance of two non NULL strings.
@@ -386,6 +388,26 @@ fn optimal_string_alignment(s1: &str, s2: &str) -> usize {
     matrix[len1][len2]
 }
 
+#[scalar(name = "fuzzy_soundex")]
+pub fn fuzzy_soundex(args: &[Value]) {
+    let arg1 = args[0].to_text();
+    if let Some(txt) = soundex::soundex(arg1) {
+        Value::from_text(txt)
+    } else {
+        Value::null()
+    }
+}
+
+#[scalar(name = "fuzzy_phonetic")]
+pub fn fuzzy_phonetic(args: &[Value]) {
+    let arg1 = args[0].to_text();
+    if let Some(txt) = phonetic::phonetic_hash_str(arg1) {
+        Value::from_text(txt)
+    } else {
+        Value::null()
+    }
+}
+
 //tests adapted from sqlean fuzzy
 #[cfg(test)]
 mod tests {
@@ -511,4 +533,40 @@ mod tests {
             assert_eq!(got, expected, "osadist({s1}, {s2}) failed");
         }
     }
+    #[test]
+    fn test_soundex() {
+        let cases = vec![
+            (None, None),
+            (Some(""), Some("".to_string())),
+            (Some("phonetics"), Some("P532".to_string())),
+            (Some("is"), Some("I200".to_string())),
+            (Some("awesome"), Some("A250".to_string())),
+        ];
+
+        for (input, expected) in cases {
+            let result = soundex::soundex(input);
+            assert_eq!(
+                result, expected,
+                "fuzzy_soundex({input:?}) failed: expected {expected:?}, got {result:?}"
+            );
+        }
+    }
+    #[test]
+    fn test_phonetic() {
+        let cases = vec![
+            (None, None),
+            (Some(""), Some("".to_string())),
+            (Some("phonetics"), Some("BAMADAC".to_string())),
+            (Some("is"), Some("AC".to_string())),
+            (Some("awesome"), Some("ABACAMA".to_string())),
+        ];
+
+        for (input, expected) in cases {
+            let result = phonetic::phonetic_hash_str(input);
+            assert_eq!(
+                result, expected,
+                "fuzzy_phonetic({input:?}) failed: expected {expected:?}, got {result:?}"
+            );
+        }
+    }
 }
diff --git a/extensions/fuzzy/src/phonetic.rs b/extensions/fuzzy/src/phonetic.rs
new file mode 100644
index 000000000..624e7ed27
--- /dev/null
+++ b/extensions/fuzzy/src/phonetic.rs
@@ -0,0 +1,110 @@
+use crate::common::*;
+
+/// Generate a "phonetic hash" from a string of ASCII characters.
+///
+/// The algorithm:
+/// Maps characters by character class as defined above
+/// Omits double-letters
+/// Omits vowels beside R and L
+/// Omits T when followed by CH
+/// Omits W when followed by R
+/// Omits D when followed by J or G
+/// Omits K in KN or G in GN at the beginning of a word
+///
+/// Returns a Vec containing the phonetic hash, or None if input is invalid.
+pub fn phonetic_hash(z_in: &[u8]) -> Option<Vec<u8>> {
+    if z_in.is_empty() {
+        return Some(Vec::new());
+    }
+
+    let mut z_out = Vec::with_capacity(z_in.len() + 1);
+    let mut c_prev = 0x77u8;
+    let mut c_prev_x = 0x77u8;
+    let mut a_class = &INIT_CLASS;
+
+    let mut input = z_in;
+    if z_in.len() > 2 {
+        match z_in[0] {
+            b'g' | b'k' => {
+                if z_in[1] == b'n' {
+                    input = &z_in[1..];
+                }
+            }
+            _ => {}
+        }
+    }
+
+    let mut i = 0;
+    while i < input.len() {
+        let mut c = input[i];
+
+        if i + 1 < input.len() {
+            if c == b'w' && input[i + 1] == b'r' {
+                i += 1;
+                continue;
+            }
+            if c == b'd' && (input[i + 1] == b'j' || input[i + 1] == b'g') {
+                i += 1;
+                continue;
+            }
+            if i + 2 < input.len() && c == b't' && input[i + 1] == b'c' && input[i + 2] == b'h' {
+                i += 1;
+                continue;
+            }
+        }
+
+        c = a_class[(c & 0x7f) as usize];
+
+        if c == CCLASS_SPACE {
+            i += 1;
+            continue;
+        }
+
+        if c == CCLASS_OTHER && c_prev != CCLASS_DIGIT {
+            i += 1;
+            continue;
+        }
+
+        a_class = &MID_CLASS;
+
+        if c == CCLASS_VOWEL && (c_prev_x == CCLASS_R || c_prev_x == CCLASS_L) {
+            i += 1;
+            continue;
+        }
+
+        if (c == CCLASS_R || c == CCLASS_L) && c_prev_x == CCLASS_VOWEL && !z_out.is_empty() {
+            z_out.pop();
+        }
+
+        c_prev = c;
+
+        if c == CCLASS_SILENT {
+            i += 1;
+            continue;
+        }
+
+        c_prev_x = c;
+        if (c as usize) < CLASS_NAME.len() {
+            c = CLASS_NAME[c as usize];
+        } else {
+            c = b'?';
+        }
+
+        if z_out.is_empty() || c != *z_out.last().unwrap() {
+            z_out.push(c);
+        }
+
+        i += 1;
+    }
+
+    Some(z_out)
+}
+
+pub fn phonetic_hash_str(input: Option<&str>) -> Option<String> {
+    match input {
+        None => None,
+        Some(s) => {
+            phonetic_hash(s.as_bytes()).map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
+        }
+    }
+}
diff --git a/extensions/fuzzy/src/soundex.rs b/extensions/fuzzy/src/soundex.rs
new file mode 100644
index 000000000..809a76df3
--- /dev/null
+++ b/extensions/fuzzy/src/soundex.rs
@@ -0,0 +1,65 @@
+/// Computes and returns the soundex representation of a given string.
+/// https://en.wikipedia.org/wiki/Soundex
+pub fn soundex(input: Option<&str>) -> Option<String> {
+    if let Some(input_str) = input {
+        if input_str.is_empty() {
+            return Some("".to_string());
+        }
+
+        let str_bytes = input_str.as_bytes();
+        let str_len = str_bytes.len();
+
+        let mut code = String::with_capacity(4);
+        code.push(str_bytes[0].to_ascii_uppercase() as char);
+
+        let mut buf: Vec<char> = Vec::with_capacity(str_len);
+        for &byte in str_bytes {
+            buf.push(soundex_encode(byte as char));
+        }
+
+        let mut d = 1; // digit counter
+        let mut i = 1; // index counter
+
+        while i < str_len && d < 4 {
+            let current = buf[i];
+            let previous = buf[i - 1];
+
+            if current != previous && current != '0' {
+                if i > 1 {
+                    let two_back = buf[i - 2];
+                    let separator = str_bytes[i - 1].to_ascii_lowercase() as char;
+                    if current == two_back && (separator == 'h' || separator == 'w') {
+                        i += 1;
+                        continue;
+                    }
+                }
+
+                code.push(current);
+                d += 1;
+            }
+            i += 1;
+        }
+
+        while d < 4 {
+            code.push('0');
+            d += 1;
+        }
+
+        Some(code)
+    } else {
+        None
+    }
+}
+
+/// Helper function
+fn soundex_encode(c: char) -> char {
+    match c.to_ascii_lowercase() {
+        'b' | 'f' | 'p' | 'v' => '1',
+        'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => '2',
+        'd' | 't' => '3',
+        'l' => '4',
+        'm' | 'n' => '5',
+        'r' => '6',
+        _ => '0',
+    }
+}
diff --git a/testing/cli_tests/extensions.py b/testing/cli_tests/extensions.py
index d88a771f5..dbe142a60 100755
--- a/testing/cli_tests/extensions.py
+++ b/testing/cli_tests/extensions.py
@@ -581,6 +581,12 @@ def validate_fuzzy_jarowin(a):
 def validate_fuzzy_osadist(a):
     return a == "3"
 
+def validate_fuzzy_soundex(a):
+    return a == "A250"
+
+def validate_fuzzy_phonetic(a):
+    return a == "ABACAMA"
+
 def test_fuzzy():
     limbo = TestTursoShell()
     ext_path = "./target/debug/liblimbo_fuzzy"
@@ -625,6 +631,11 @@ def test_fuzzy():
         validate_fuzzy_osadist,
         "fuzzy osadist function works",
     )
+    limbo.run_test_fn(
+        "SELECT fuzzy_phonetic('awesome');",
+        validate_fuzzy_phonetic,
+        "fuzzy phonetic function works",
+    )
 
 def test_vfs():
     limbo = TestTursoShell()