add phonetic and soundex

This commit is contained in:
danawan
2025-09-24 13:58:19 +07:00
parent c3132d0368
commit 815f0ffe25
5 changed files with 267 additions and 7 deletions

View File

@@ -2,13 +2,12 @@ pub const CCLASS_SILENT: u8 = 0;
// Character-class codes. These are the values held in the MID_CLASS / INIT_CLASS
// lookup tables, and each code is also an index into CLASS_NAME.
pub const CCLASS_VOWEL: u8 = 1; // CLASS_NAME letter 'A'
pub const CCLASS_B: u8 = 2; // CLASS_NAME letter 'B'
pub const CCLASS_Y: u8 = 9; // CLASS_NAME letter 'Y'
//This will be useful in the phonetic
//pub const CCLASS_L: u8 = 6;
//pub const CCLASS_R: u8 = 7;
pub const CCLASS_L: u8 = 6; // CLASS_NAME letter 'L'
pub const CCLASS_R: u8 = 7; // CLASS_NAME letter 'R'
//pub const CCLASS_M: u8 = 8;
//pub const CCLASS_DIGIT: u8 = 10;
//pub const CCLASS_SPACE: u8 = 11;
//pub const CCLASS_OTHER: u8 = 12;
pub const CCLASS_DIGIT: u8 = 10; // CLASS_NAME letter '9'
pub const CCLASS_SPACE: u8 = 11; // CLASS_NAME letter ' '
pub const CCLASS_OTHER: u8 = 12; // CLASS_NAME letter '?'
pub const MID_CLASS: [u8; 128] = [
12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 12, 12, 11, 11, 12, 12, //
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, //
@@ -30,3 +29,20 @@ pub const INIT_CLASS: [u8; 128] = [
12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, //
2, 3, 7, 3, 4, 1, 2, 2, 3, 9, 3, 12, 12, 12, 12, 12, //
];
/// Output letter for every character class, indexed by the class code.
///
/// Same contents as the C table `const unsigned char className[] = ".ABCDHLRMY9 ?";`:
///
/// | code | class         | letter |
/// |------|---------------|--------|
/// | 0    | CCLASS_SILENT | `.`    |
/// | 1    | CCLASS_VOWEL  | `A`    |
/// | 2    | CCLASS_B      | `B`    |
/// | 3    | CCLASS_C      | `C`    |
/// | 4    | CCLASS_D      | `D`    |
/// | 5    | CCLASS_H      | `H`    |
/// | 6    | CCLASS_L      | `L`    |
/// | 7    | CCLASS_R      | `R`    |
/// | 8    | CCLASS_M      | `M`    |
/// | 9    | CCLASS_Y      | `Y`    |
/// | 10   | CCLASS_DIGIT  | `9`    |
/// | 11   | CCLASS_SPACE  | space  |
/// | 12   | CCLASS_OTHER  | `?`    |
pub const CLASS_NAME: [u8; 13] = *b".ABCDHLRMY9 ?";

View File

@@ -3,9 +3,11 @@ use std::cmp;
use turso_ext::{register_extension, scalar, ResultCode, Value};
mod common;
mod editdist;
mod phonetic;
mod soundex;
register_extension! {
scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist},
scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist, fuzzy_soundex, fuzzy_phonetic},
}
/// Calculates and returns the Levenshtein distance of two non NULL strings.
@@ -386,6 +388,26 @@ fn optimal_string_alignment(s1: &str, s2: &str) -> usize {
matrix[len1][len2]
}
/// SQL scalar `fuzzy_soundex(X)`: the Soundex code of the first argument.
///
/// The first argument is read with `to_text()` and handed to `soundex::soundex`.
/// The result is NULL when no argument was supplied, or when `soundex::soundex`
/// returns `None` (which it does for a `None` input).
#[scalar(name = "fuzzy_soundex")]
pub fn fuzzy_soundex(args: &[Value]) {
    // `first()` rather than `args[0]`: a call without arguments must yield NULL, not panic.
    let text = args.first().and_then(|arg| arg.to_text());
    match soundex::soundex(text) {
        Some(code) => Value::from_text(code),
        None => Value::null(),
    }
}
/// SQL scalar `fuzzy_phonetic(X)`: the phonetic hash of the first argument.
///
/// The first argument is read with `to_text()` and handed to
/// `phonetic::phonetic_hash_str`. The result is NULL when no argument was supplied,
/// or when `phonetic::phonetic_hash_str` returns `None`.
#[scalar(name = "fuzzy_phonetic")]
pub fn fuzzy_phonetic(args: &[Value]) {
    // `first()` rather than `args[0]`: a call without arguments must yield NULL, not panic.
    let text = args.first().and_then(|arg| arg.to_text());
    match phonetic::phonetic_hash_str(text) {
        Some(hash) => Value::from_text(hash),
        None => Value::null(),
    }
}
//tests adapted from sqlean fuzzy
#[cfg(test)]
mod tests {
@@ -511,4 +533,40 @@ mod tests {
assert_eq!(got, expected, "osadist({s1}, {s2}) failed");
}
}
/// Table-driven check of `soundex::soundex`: NULL input, empty input and three words.
#[test]
fn test_soundex() {
    let cases: [(Option<&str>, Option<String>); 5] = [
        (None, None),
        (Some(""), Some(String::new())),
        (Some("phonetics"), Some(String::from("P532"))),
        (Some("is"), Some(String::from("I200"))),
        (Some("awesome"), Some(String::from("A250"))),
    ];

    for (input, expected) in cases {
        let result = soundex::soundex(input);
        assert_eq!(
            result, expected,
            "fuzzy_soundex({input:?}) failed: expected {expected:?}, got {result:?}"
        );
    }
}
/// Table-driven check of `phonetic::phonetic_hash_str`: NULL input, empty input and
/// three words.
#[test]
fn test_phonetic() {
    let cases: [(Option<&str>, Option<String>); 5] = [
        (None, None),
        (Some(""), Some(String::new())),
        (Some("phonetics"), Some(String::from("BAMADAC"))),
        (Some("is"), Some(String::from("AC"))),
        (Some("awesome"), Some(String::from("ABACAMA"))),
    ];

    for (input, expected) in cases {
        let result = phonetic::phonetic_hash_str(input);
        assert_eq!(
            result, expected,
            "fuzzy_phonetic({input:?}) failed: expected {expected:?}, got {result:?}"
        );
    }
}
}

View File

@@ -0,0 +1,110 @@
use crate::common::*;
/// Builds the "phonetic hash" of a byte string that is expected to hold ASCII text.
///
/// Every byte is masked to 7 bits and translated to a character class. `INIT_CLASS`
/// is used until the first byte that survives the space/other filters, `MID_CLASS`
/// for everything after it. The class is emitted as its `CLASS_NAME` letter, subject
/// to these rules:
///
/// - a leading `g` or `k` that is followed by `n` is dropped (only for inputs longer
///   than two bytes)
/// - `w` before `r`, `d` before `j` or `g`, and `t` before `ch` are dropped
/// - space-class bytes are dropped; other-class bytes are dropped unless the
///   previously classified byte was a digit
/// - a vowel class that follows an R or L class is dropped, and the last emitted
///   letter is removed when an R or L class follows a vowel class
/// - silent-class bytes emit nothing
/// - a letter equal to the last emitted letter is not repeated
///
/// The letter comparisons above are made against lowercase ASCII bytes only.
///
/// Always returns `Some`; an empty input produces an empty hash.
pub fn phonetic_hash(z_in: &[u8]) -> Option<Vec<u8>> {
    if z_in.is_empty() {
        return Some(Vec::new());
    }

    // Skip the silent first letter of a leading "gn" / "kn".
    let bytes = if z_in.len() > 2 && matches!(z_in[0], b'g' | b'k') && z_in[1] == b'n' {
        &z_in[1..]
    } else {
        z_in
    };

    let mut hash: Vec<u8> = Vec::with_capacity(z_in.len() + 1);
    // 0x77 matches no class code, i.e. "nothing seen yet".
    let mut last_class = 0x77u8; // class of the last byte that passed the filters
    let mut last_sounded = 0x77u8; // same, but ignoring silent classes
    let mut table = &INIT_CLASS;

    for (pos, &raw) in bytes.iter().enumerate() {
        // Letters that are swallowed by the letters following them.
        let lookahead = &bytes[pos + 1..];
        let swallowed = matches!(
            (raw, lookahead),
            (b'w', [b'r', ..]) | (b'd', [b'j' | b'g', ..]) | (b't', [b'c', b'h', ..])
        );
        if swallowed {
            continue;
        }

        let class = table[(raw & 0x7f) as usize];
        if class == CCLASS_SPACE {
            continue;
        }
        if class == CCLASS_OTHER && last_class != CCLASS_DIGIT {
            continue;
        }
        // From the first accepted byte onwards the mid-word table applies.
        table = &MID_CLASS;

        let after_liquid = last_sounded == CCLASS_R || last_sounded == CCLASS_L;
        if class == CCLASS_VOWEL && after_liquid {
            continue;
        }
        let is_liquid = class == CCLASS_R || class == CCLASS_L;
        if is_liquid && last_sounded == CCLASS_VOWEL {
            // Drops the letter emitted last; a no-op when nothing was emitted yet.
            hash.pop();
        }

        last_class = class;
        if class == CCLASS_SILENT {
            continue;
        }
        last_sounded = class;

        let letter = CLASS_NAME.get(class as usize).copied().unwrap_or(b'?');
        if hash.last() != Some(&letter) {
            hash.push(letter);
        }
    }

    Some(hash)
}
pub fn phonetic_hash_str(input: Option<&str>) -> Option<String> {
match input {
None => None,
Some(s) => {
phonetic_hash(s.as_bytes()).map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
}
}
}

View File

@@ -0,0 +1,65 @@
/// Computes and returns the soundex representation of a given string.
/// https://en.wikipedia.org/wiki/Soundex
///
/// * `None` input returns `None`; an empty string returns an empty string.
/// * Otherwise the result is the first character of the input (ASCII letters are
///   upper-cased, any other character is kept as it is) followed by exactly three
///   digits, padded with `'0'`.
/// * A letter is skipped when it has the same code as the letter before it, or when it
///   has the same code as the letter two positions back and the letter in between is
///   `h` or `w`.
///
/// Digits are derived byte by byte; every non-ASCII byte has code `'0'` and therefore
/// never contributes a digit.
pub fn soundex(input: Option<&str>) -> Option<String> {
    let text = input?;
    // Take the first *character*, not the first byte: casting the lead byte of a
    // multi-byte UTF-8 sequence to `char` would produce an unrelated character.
    let first = match text.chars().next() {
        Some(ch) => ch,
        None => return Some(String::new()),
    };

    let bytes = text.as_bytes();
    let codes: Vec<char> = bytes.iter().map(|&b| soundex_encode(b as char)).collect();

    let mut code = String::with_capacity(first.len_utf8() + 3);
    code.push(first.to_ascii_uppercase());

    let mut digits = 1; // characters written so far, the leading letter included
    for i in 1..bytes.len() {
        if digits == 4 {
            break;
        }
        let current = codes[i];
        // Uncoded letters and repeats of the previous code add nothing.
        if current == '0' || current == codes[i - 1] {
            continue;
        }
        // Equal codes separated only by 'h' or 'w' count as a single letter.
        let split_by_hw = matches!(bytes[i - 1], b'h' | b'H' | b'w' | b'W');
        if i > 1 && split_by_hw && current == codes[i - 2] {
            continue;
        }
        code.push(current);
        digits += 1;
    }

    while digits < 4 {
        code.push('0');
        digits += 1;
    }
    Some(code)
}

/// Maps a letter (case-insensitively) to its Soundex digit; vowels, `h`, `w`, `y` and
/// every non-letter map to `'0'`.
fn soundex_encode(c: char) -> char {
    match c.to_ascii_lowercase() {
        'b' | 'f' | 'p' | 'v' => '1',
        'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => '2',
        'd' | 't' => '3',
        'l' => '4',
        'm' | 'n' => '5',
        'r' => '6',
        _ => '0',
    }
}

View File

@@ -581,6 +581,12 @@ def validate_fuzzy_jarowin(a):
def validate_fuzzy_osadist(a):
    """Return True when the shell output `a` is exactly the string "3"."""
    expected = "3"
    return a == expected
def validate_fuzzy_soundex(a):
    """Return True when the shell output `a` is exactly the string "A250".

    NOTE(review): the visible part of test_fuzzy only wires up
    validate_fuzzy_phonetic; confirm that a fuzzy_soundex query uses this validator.
    """
    expected = "A250"
    return a == expected
def validate_fuzzy_phonetic(a):
    """Return True when the shell output `a` is exactly the string "ABACAMA"."""
    expected = "ABACAMA"
    return a == expected
def test_fuzzy():
limbo = TestTursoShell()
ext_path = "./target/debug/liblimbo_fuzzy"
@@ -625,6 +631,11 @@ def test_fuzzy():
validate_fuzzy_osadist,
"fuzzy osadist function works",
)
limbo.run_test_fn(
"SELECT fuzzy_phonetic('awesome');",
validate_fuzzy_phonetic,
"fuzzy phonetic function works",
)
def test_vfs():
limbo = TestTursoShell()