add phonetic and soundex

2026-02-06 00:34:23 +01:00 · 2025-09-24 13:58:19 +07:00
parent c3132d0368
commit 815f0ffe25
5 changed files with 267 additions and 7 deletions
--- a/extensions/fuzzy/src/common.rs
+++ b/extensions/fuzzy/src/common.rs
@@ -2,13 +2,12 @@ pub const CCLASS_SILENT: u8 = 0;
 pub const CCLASS_VOWEL: u8 = 1;
 pub const CCLASS_B: u8 = 2;
 pub const CCLASS_Y: u8 = 9;
-//This will be useful in the phonetic
-//pub const CCLASS_L: u8 = 6;
-//pub const CCLASS_R: u8 = 7;
+pub const CCLASS_L: u8 = 6; // 'L' class (CLASS_NAME[6]); phonetic_hash drops vowels next to it
+pub const CCLASS_R: u8 = 7; // 'R' class (CLASS_NAME[7]); phonetic_hash drops vowels next to it
 //pub const CCLASS_M: u8 = 8;
-//pub const CCLASS_DIGIT: u8 = 10;
-//pub const CCLASS_SPACE: u8 = 11;
-//pub const CCLASS_OTHER: u8 = 12;
+pub const CCLASS_DIGIT: u8 = 10; // rendered as '9' (CLASS_NAME[10])
+pub const CCLASS_SPACE: u8 = 11; // rendered as ' ' in CLASS_NAME; phonetic_hash skips bytes of this class
+pub const CCLASS_OTHER: u8 = 12; // rendered as '?'; phonetic_hash keeps it only when the previous class was CCLASS_DIGIT
 pub const MID_CLASS: [u8; 128] = [
    12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 12, 12, 11, 11, 12, 12, //
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, //
@@ -30,3 +29,20 @@ pub const INIT_CLASS: [u8; 128] = [
    12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, //
    2, 3, 7, 3, 4, 1, 2, 2, 3, 9, 3, 12, 12, 12, 12, 12, //
 ];
+
+// Output byte for each character class, indexed by the CCLASS_* value. Based on: const unsigned char className[] = ".ABCDHLRMY9 ?";
+pub const CLASS_NAME: [u8; 13] = [
+    b'.', // CCLASS_SILENT (0) -> .
+    b'A', // CCLASS_VOWEL (1) -> A
+    b'B', // CCLASS_B (2) -> B
+    b'C', // CCLASS_C (3) -> C
+    b'D', // CCLASS_D (4) -> D
+    b'H', // CCLASS_H (5) -> H
+    b'L', // CCLASS_L (6) -> L
+    b'R', // CCLASS_R (7) -> R
+    b'M', // CCLASS_M (8) -> M
+    b'Y', // CCLASS_Y (9) -> Y
+    b'9', // CCLASS_DIGIT (10) -> 9
+    b' ', // CCLASS_SPACE (11) -> space
+    b'?', // CCLASS_OTHER (12) -> ?
+];
--- a/extensions/fuzzy/src/lib.rs
+++ b/extensions/fuzzy/src/lib.rs
@@ -3,9 +3,11 @@ use std::cmp;
 use turso_ext::{register_extension, scalar, ResultCode, Value};
 mod common;
 mod editdist;
+mod phonetic;
+mod soundex;

 register_extension! {
-    scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist},
+    scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist, fuzzy_soundex, fuzzy_phonetic},
 }

 /// Calculates and returns the Levenshtein distance of two non NULL strings.
@@ -386,6 +388,26 @@ fn optimal_string_alignment(s1: &str, s2: &str) -> usize {
    matrix[len1][len2]
 }

+#[scalar(name = "fuzzy_soundex")] // registered as a scalar named fuzzy_soundex; returns the Soundex code of its first argument
+pub fn fuzzy_soundex(args: &[Value]) { // NOTE(review): no return type is declared although the body evaluates to a Value — presumably #[scalar] rewrites the signature; confirm
+    let arg1 = args[0].to_text(); // only args[0] is read; assumes at least one argument is supplied
+    if let Some(txt) = soundex::soundex(arg1) {
+        Value::from_text(txt)
+    } else {
+        Value::null() // soundex returned None, which it does when handed None
+    }
+}
+
+#[scalar(name = "fuzzy_phonetic")] // registered as a scalar named fuzzy_phonetic; returns the phonetic hash of its first argument
+pub fn fuzzy_phonetic(args: &[Value]) { // NOTE(review): no return type is declared although the body evaluates to a Value — presumably #[scalar] rewrites the signature; confirm
+    let arg1 = args[0].to_text(); // only args[0] is read; assumes at least one argument is supplied
+    if let Some(txt) = phonetic::phonetic_hash_str(arg1) {
+        Value::from_text(txt)
+    } else {
+        Value::null() // phonetic_hash_str returned None, which it does when handed None
+    }
+}
+
 //tests adapted from sqlean fuzzy
 #[cfg(test)]
 mod tests {
@@ -511,4 +533,40 @@ mod tests {
            assert_eq!(got, expected, "osadist({s1}, {s2}) failed");
        }
    }
+    #[test]
+    fn test_soundex() { // table-driven check of soundex::soundex (called directly, not through the fuzzy_soundex scalar)
+        let cases = vec![
+            (None, None), // None in, None out
+            (Some(""), Some("".to_string())), // empty input gives an empty code, with no zero padding
+            (Some("phonetics"), Some("P532".to_string())),
+            (Some("is"), Some("I200".to_string())), // short codes are padded with '0' to four characters
+            (Some("awesome"), Some("A250".to_string())),
+        ];
+
+        for (input, expected) in cases {
+            let result = soundex::soundex(input);
+            assert_eq!(
+                result, expected,
+                "fuzzy_soundex({input:?}) failed: expected {expected:?}, got {result:?}"
+            );
+        }
+    }
+    #[test]
+    fn test_phonetic() { // table-driven check of phonetic::phonetic_hash_str (called directly, not through the fuzzy_phonetic scalar)
+        let cases = vec![
+            (None, None), // None in, None out
+            (Some(""), Some("".to_string())), // empty input gives an empty hash
+            (Some("phonetics"), Some("BAMADAC".to_string())),
+            (Some("is"), Some("AC".to_string())),
+            (Some("awesome"), Some("ABACAMA".to_string())),
+        ];
+
+        for (input, expected) in cases {
+            let result = phonetic::phonetic_hash_str(input);
+            assert_eq!(
+                result, expected,
+                "fuzzy_phonetic({input:?}) failed: expected {expected:?}, got {result:?}"
+            );
+        }
+    }
 }
--- a/extensions/fuzzy/src/phonetic.rs
+++ b/extensions/fuzzy/src/phonetic.rs
@@ -0,0 +1,110 @@
+use crate::common::*;
+
+/// Generate a "phonetic hash" from a byte string; each byte is masked to 7 bits before the class lookup.
+///
+/// The algorithm (the letter-pair rules below compare against lowercase ASCII only):
+///  Maps bytes to character classes: INIT_CLASS until a byte passes the space/other filters, MID_CLASS afterwards
+///  Omits consecutive repeats of the same output letter
+///  Omits vowels beside R and L
+///  Omits T when followed by CH
+///  Omits W when followed by R
+///  Omits D when followed by J or G
+///  Omits K in KN or G in GN at the start of the input (only when the input is longer than 2 bytes)
+///
+/// Always returns Some in the current code: the hash bytes (CLASS_NAME letters), or an empty Vec for empty input.
+pub fn phonetic_hash(z_in: &[u8]) -> Option<Vec<u8>> {
+    if z_in.is_empty() {
+        return Some(Vec::new());
+    }
+
+    let mut z_out = Vec::with_capacity(z_in.len() + 1);
+    let mut c_prev = 0x77u8; // class of the last byte that reached the bookkeeping step; 0x77 equals none of the CCLASS_* values compared below
+    let mut c_prev_x = 0x77u8; // like c_prev, but left untouched by CCLASS_SILENT bytes
+    let mut a_class = &INIT_CLASS; // lookup table currently in use; swapped for MID_CLASS further down
+
+    let mut input = z_in;
+    if z_in.len() > 2 {
+        match z_in[0] {
+            b'g' | b'k' => {
+                if z_in[1] == b'n' {
+                    input = &z_in[1..]; // drop the leading g/k of "gn"/"kn"
+                }
+            }
+            _ => {}
+        }
+    }
+
+    let mut i = 0;
+    while i < input.len() {
+        let mut c = input[i];
+
+        if i + 1 < input.len() {
+            if c == b'w' && input[i + 1] == b'r' {
+                i += 1; // skip the w of "wr"
+                continue;
+            }
+            if c == b'd' && (input[i + 1] == b'j' || input[i + 1] == b'g') {
+                i += 1; // skip the d of "dj"/"dg"
+                continue;
+            }
+            if i + 2 < input.len() && c == b't' && input[i + 1] == b'c' && input[i + 2] == b'h' {
+                i += 1; // skip the t of "tch"
+                continue;
+            }
+        }
+
+        c = a_class[(c & 0x7f) as usize]; // from here on c is a class code, not the input byte; the mask keeps the index inside the 128-entry table
+
+        if c == CCLASS_SPACE {
+            i += 1;
+            continue;
+        }
+
+        if c == CCLASS_OTHER && c_prev != CCLASS_DIGIT {
+            i += 1; // "other" bytes are kept only when the previous class was a digit
+            continue;
+        }
+
+        a_class = &MID_CLASS; // a byte passed the space/other filters, so later bytes use the mid-word table
+
+        if c == CCLASS_VOWEL && (c_prev_x == CCLASS_R || c_prev_x == CCLASS_L) {
+            i += 1; // no vowel directly after R or L
+            continue;
+        }
+
+        if (c == CCLASS_R || c == CCLASS_L) && c_prev_x == CCLASS_VOWEL && !z_out.is_empty() {
+            z_out.pop(); // no vowel directly before R or L: remove the letter emitted for it
+        }
+
+        c_prev = c;
+
+        if c == CCLASS_SILENT {
+            i += 1; // silent bytes update c_prev but emit nothing and leave c_prev_x alone
+            continue;
+        }
+
+        c_prev_x = c;
+        if (c as usize) < CLASS_NAME.len() {
+            c = CLASS_NAME[c as usize]; // c now becomes the output letter for the class
+        } else {
+            c = b'?'; // class codes outside CLASS_NAME fall back to '?'
+        }
+
+        if z_out.is_empty() || c != *z_out.last().unwrap() {
+            z_out.push(c); // emit only when it differs from the previous output letter
+        }
+
+        i += 1;
+    }
+
+    Some(z_out)
+}
+
+pub fn phonetic_hash_str(input: Option<&str>) -> Option<String> { // Option<&str> -> Option<String> wrapper around phonetic_hash
+    match input {
+        None => None, // None is passed through unchanged
+        Some(s) => {
+            phonetic_hash(s.as_bytes()).map(|bytes| String::from_utf8_lossy(&bytes).into_owned()) // hash the string's bytes, then turn the hash bytes back into a String
+        }
+    }
+}
--- a/extensions/fuzzy/src/soundex.rs
+++ b/extensions/fuzzy/src/soundex.rs
@@ -0,0 +1,65 @@
+/// Computes the Soundex code of `input`: None for None, "" for "", otherwise the uppercased first byte followed by up to three digits, padded with '0' to four characters.
+/// https://en.wikipedia.org/wiki/Soundex
+pub fn soundex(input: Option<&str>) -> Option<String> {
+    if let Some(input_str) = input {
+        if input_str.is_empty() {
+            return Some("".to_string());
+        }
+
+        let str_bytes = input_str.as_bytes(); // the input is processed byte by byte, not per Unicode character
+        let str_len = str_bytes.len();
+
+        let mut code = String::with_capacity(4);
+        code.push(str_bytes[0].to_ascii_uppercase() as char); // first byte, ASCII-uppercased and widened to a char on its own
+
+        let mut buf: Vec<char> = Vec::with_capacity(str_len); // Soundex digit of every input byte, '0' where soundex_encode has no digit for it
+        for &byte in str_bytes {
+            buf.push(soundex_encode(byte as char));
+        }
+
+        let mut d = 1; // digit counter
+        let mut i = 1; // index counter
+
+        while i < str_len && d < 4 { // stop at the end of the input or once the code has four characters
+            let current = buf[i];
+            let previous = buf[i - 1];
+
+            if current != previous && current != '0' { // ignore uncoded bytes and immediate repeats of the same digit
+                if i > 1 {
+                    let two_back = buf[i - 2];
+                    let separator = str_bytes[i - 1].to_ascii_lowercase() as char;
+                    if current == two_back && (separator == 'h' || separator == 'w') { // same digit on both sides of an h/w counts as a repeat
+                        i += 1;
+                        continue;
+                    }
+                }
+
+                code.push(current);
+                d += 1;
+            }
+            i += 1;
+        }
+
+        while d < 4 { // pad with '0' up to four characters
+            code.push('0');
+            d += 1;
+        }
+
+        Some(code)
+    } else {
+        None
+    }
+}
+
+/// Maps a character (ASCII case-insensitively) to its Soundex digit '1'..'6', or '0' when it has none.
+fn soundex_encode(c: char) -> char {
+    match c.to_ascii_lowercase() {
+        'b' | 'f' | 'p' | 'v' => '1',
+        'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => '2',
+        'd' | 't' => '3',
+        'l' => '4',
+        'm' | 'n' => '5',
+        'r' => '6',
+        _ => '0', // vowels, h, w, y and every non-letter
+    }
+}
--- a/testing/cli_tests/extensions.py
+++ b/testing/cli_tests/extensions.py
@@ -581,6 +581,12 @@ def validate_fuzzy_jarowin(a):
 def validate_fuzzy_osadist(a):
    return a == "3"

+def validate_fuzzy_soundex(a):  # NOTE(review): no run_test_fn call using this validator is visible in this diff — confirm fuzzy_soundex is exercised
+    return a == "A250"  # "A250" is the Soundex code the Rust unit test expects for 'awesome'
+
+def validate_fuzzy_phonetic(a):  # validator passed to run_test_fn for SELECT fuzzy_phonetic('awesome')
+    return a == "ABACAMA"  # presumably `a` is the shell's textual output — verify against run_test_fn
+
 def test_fuzzy():
    limbo = TestTursoShell()
    ext_path = "./target/debug/liblimbo_fuzzy"
@@ -625,6 +631,11 @@ def test_fuzzy():
        validate_fuzzy_osadist,
        "fuzzy osadist function works",
    )
+    limbo.run_test_fn(
+        "SELECT fuzzy_phonetic('awesome');",
+        validate_fuzzy_phonetic,
+        "fuzzy phonetic function works",
+    )

 def test_vfs():
    limbo = TestTursoShell()