mirror of
https://github.com/aljazceru/turso.git
synced 2026-02-06 00:34:23 +01:00
add phonetic and soundex
This commit is contained in:
@@ -2,13 +2,12 @@ pub const CCLASS_SILENT: u8 = 0;
|
||||
pub const CCLASS_VOWEL: u8 = 1;
|
||||
pub const CCLASS_B: u8 = 2;
|
||||
pub const CCLASS_Y: u8 = 9;
|
||||
//This will be useful in the phonetic
|
||||
//pub const CCLASS_L: u8 = 6;
|
||||
//pub const CCLASS_R: u8 = 7;
|
||||
pub const CCLASS_L: u8 = 6;
|
||||
pub const CCLASS_R: u8 = 7;
|
||||
//pub const CCLASS_M: u8 = 8;
|
||||
//pub const CCLASS_DIGIT: u8 = 10;
|
||||
//pub const CCLASS_SPACE: u8 = 11;
|
||||
//pub const CCLASS_OTHER: u8 = 12;
|
||||
pub const CCLASS_DIGIT: u8 = 10;
|
||||
pub const CCLASS_SPACE: u8 = 11;
|
||||
pub const CCLASS_OTHER: u8 = 12;
|
||||
pub const MID_CLASS: [u8; 128] = [
|
||||
12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 12, 12, 11, 11, 12, 12, //
|
||||
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, //
|
||||
@@ -30,3 +29,20 @@ pub const INIT_CLASS: [u8; 128] = [
|
||||
12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, //
|
||||
2, 3, 7, 3, 4, 1, 2, 2, 3, 9, 3, 12, 12, 12, 12, 12, //
|
||||
];
|
||||
|
||||
// Rust counterpart of the C table: const unsigned char className[] = ".ABCDHLRMY9 ?";
//
// The byte at index `n` is the output letter for the character class whose
// CCLASS_* value is `n`:
//
//    0 SILENT '.'    1 VOWEL 'A'    2 B 'B'    3 C 'C'    4 D 'D'
//    5 H 'H'         6 L 'L'        7 R 'R'    8 M 'M'    9 Y 'Y'
//   10 DIGIT '9'    11 SPACE ' '   12 OTHER '?'
pub const CLASS_NAME: [u8; 13] = *b".ABCDHLRMY9 ?";
|
||||
|
||||
@@ -3,9 +3,11 @@ use std::cmp;
|
||||
use turso_ext::{register_extension, scalar, ResultCode, Value};
|
||||
mod common;
|
||||
mod editdist;
|
||||
mod phonetic;
|
||||
mod soundex;
|
||||
|
||||
register_extension! {
|
||||
scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist},
|
||||
scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist, fuzzy_soundex, fuzzy_phonetic},
|
||||
}
|
||||
|
||||
/// Calculates and returns the Levenshtein distance of two non NULL strings.
|
||||
@@ -386,6 +388,26 @@ fn optimal_string_alignment(s1: &str, s2: &str) -> usize {
|
||||
matrix[len1][len2]
|
||||
}
|
||||
|
||||
#[scalar(name = "fuzzy_soundex")]
|
||||
pub fn fuzzy_soundex(args: &[Value]) {
|
||||
let arg1 = args[0].to_text();
|
||||
if let Some(txt) = soundex::soundex(arg1) {
|
||||
Value::from_text(txt)
|
||||
} else {
|
||||
Value::null()
|
||||
}
|
||||
}
|
||||
|
||||
#[scalar(name = "fuzzy_phonetic")]
|
||||
pub fn fuzzy_phonetic(args: &[Value]) {
|
||||
let arg1 = args[0].to_text();
|
||||
if let Some(txt) = phonetic::phonetic_hash_str(arg1) {
|
||||
Value::from_text(txt)
|
||||
} else {
|
||||
Value::null()
|
||||
}
|
||||
}
|
||||
|
||||
//tests adapted from sqlean fuzzy
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@@ -511,4 +533,40 @@ mod tests {
|
||||
assert_eq!(got, expected, "osadist({s1}, {s2}) failed");
|
||||
}
|
||||
}
|
||||
#[test]
fn test_soundex() {
    // (input, expected code); `None` on either side means "no value".
    let cases: [(Option<&str>, Option<&str>); 5] = [
        (None, None),
        (Some(""), Some("")),
        (Some("phonetics"), Some("P532")),
        (Some("is"), Some("I200")),
        (Some("awesome"), Some("A250")),
    ];

    for (input, want) in cases {
        let expected = want.map(String::from);
        let result = soundex::soundex(input);
        assert_eq!(
            result, expected,
            "fuzzy_soundex({input:?}) failed: expected {expected:?}, got {result:?}"
        );
    }
}
|
||||
#[test]
fn test_phonetic() {
    // (input, expected hash); `None` on either side means "no value".
    let cases: [(Option<&str>, Option<&str>); 5] = [
        (None, None),
        (Some(""), Some("")),
        (Some("phonetics"), Some("BAMADAC")),
        (Some("is"), Some("AC")),
        (Some("awesome"), Some("ABACAMA")),
    ];

    for (input, want) in cases {
        let expected = want.map(String::from);
        let result = phonetic::phonetic_hash_str(input);
        assert_eq!(
            result, expected,
            "fuzzy_phonetic({input:?}) failed: expected {expected:?}, got {result:?}"
        );
    }
}
|
||||
}
|
||||
|
||||
110
extensions/fuzzy/src/phonetic.rs
Normal file
110
extensions/fuzzy/src/phonetic.rs
Normal file
@@ -0,0 +1,110 @@
|
||||
use crate::common::*;
|
||||
|
||||
/// Generate a "phonetic hash" from a string of ASCII characters.
|
||||
///
|
||||
/// The algorithm:
|
||||
/// Maps characters by character class as defined above
|
||||
/// Omits double-letters
|
||||
/// Omits vowels beside R and L
|
||||
/// Omits T when followed by CH
|
||||
/// Omits W when followed by R
|
||||
/// Omits D when followed by J or G
|
||||
/// Omits K in KN or G in GN at the beginning of a word
|
||||
///
|
||||
/// Returns a Vec<u8> containing the phonetic hash, or None if input is invalid.
|
||||
pub fn phonetic_hash(z_in: &[u8]) -> Option<Vec<u8>> {
|
||||
if z_in.is_empty() {
|
||||
return Some(Vec::new());
|
||||
}
|
||||
|
||||
let mut z_out = Vec::with_capacity(z_in.len() + 1);
|
||||
let mut c_prev = 0x77u8;
|
||||
let mut c_prev_x = 0x77u8;
|
||||
let mut a_class = &INIT_CLASS;
|
||||
|
||||
let mut input = z_in;
|
||||
if z_in.len() > 2 {
|
||||
match z_in[0] {
|
||||
b'g' | b'k' => {
|
||||
if z_in[1] == b'n' {
|
||||
input = &z_in[1..];
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let mut i = 0;
|
||||
while i < input.len() {
|
||||
let mut c = input[i];
|
||||
|
||||
if i + 1 < input.len() {
|
||||
if c == b'w' && input[i + 1] == b'r' {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
if c == b'd' && (input[i + 1] == b'j' || input[i + 1] == b'g') {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
if i + 2 < input.len() && c == b't' && input[i + 1] == b'c' && input[i + 2] == b'h' {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
c = a_class[(c & 0x7f) as usize];
|
||||
|
||||
if c == CCLASS_SPACE {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if c == CCLASS_OTHER && c_prev != CCLASS_DIGIT {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
a_class = &MID_CLASS;
|
||||
|
||||
if c == CCLASS_VOWEL && (c_prev_x == CCLASS_R || c_prev_x == CCLASS_L) {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c == CCLASS_R || c == CCLASS_L) && c_prev_x == CCLASS_VOWEL && !z_out.is_empty() {
|
||||
z_out.pop();
|
||||
}
|
||||
|
||||
c_prev = c;
|
||||
|
||||
if c == CCLASS_SILENT {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
c_prev_x = c;
|
||||
if (c as usize) < CLASS_NAME.len() {
|
||||
c = CLASS_NAME[c as usize];
|
||||
} else {
|
||||
c = b'?';
|
||||
}
|
||||
|
||||
if z_out.is_empty() || c != *z_out.last().unwrap() {
|
||||
z_out.push(c);
|
||||
}
|
||||
|
||||
i += 1;
|
||||
}
|
||||
|
||||
Some(z_out)
|
||||
}
|
||||
|
||||
pub fn phonetic_hash_str(input: Option<&str>) -> Option<String> {
|
||||
match input {
|
||||
None => None,
|
||||
Some(s) => {
|
||||
phonetic_hash(s.as_bytes()).map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
|
||||
}
|
||||
}
|
||||
}
|
||||
65
extensions/fuzzy/src/soundex.rs
Normal file
65
extensions/fuzzy/src/soundex.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
/// Computes and returns the soundex representation of a given string.
|
||||
/// https://en.wikipedia.org/wiki/Soundex
|
||||
pub fn soundex(input: Option<&str>) -> Option<String> {
|
||||
if let Some(input_str) = input {
|
||||
if input_str.is_empty() {
|
||||
return Some("".to_string());
|
||||
}
|
||||
|
||||
let str_bytes = input_str.as_bytes();
|
||||
let str_len = str_bytes.len();
|
||||
|
||||
let mut code = String::with_capacity(4);
|
||||
code.push(str_bytes[0].to_ascii_uppercase() as char);
|
||||
|
||||
let mut buf: Vec<char> = Vec::with_capacity(str_len);
|
||||
for &byte in str_bytes {
|
||||
buf.push(soundex_encode(byte as char));
|
||||
}
|
||||
|
||||
let mut d = 1; // digit counter
|
||||
let mut i = 1; // index counter
|
||||
|
||||
while i < str_len && d < 4 {
|
||||
let current = buf[i];
|
||||
let previous = buf[i - 1];
|
||||
|
||||
if current != previous && current != '0' {
|
||||
if i > 1 {
|
||||
let two_back = buf[i - 2];
|
||||
let separator = str_bytes[i - 1].to_ascii_lowercase() as char;
|
||||
if current == two_back && (separator == 'h' || separator == 'w') {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
code.push(current);
|
||||
d += 1;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
|
||||
while d < 4 {
|
||||
code.push('0');
|
||||
d += 1;
|
||||
}
|
||||
|
||||
Some(code)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps a character to its Soundex digit, ignoring ASCII case.
///
/// Characters outside the six consonant groups (vowels, `h`, `w`, `y`,
/// digits, punctuation, non-ASCII, ...) map to `'0'`.
fn soundex_encode(c: char) -> char {
    // Consonant groups and the digit each group is coded as.
    const GROUPS: [(&str, char); 6] = [
        ("bfpv", '1'),
        ("cgjkqsxz", '2'),
        ("dt", '3'),
        ("l", '4'),
        ("mn", '5'),
        ("r", '6'),
    ];

    let letter = c.to_ascii_lowercase();
    GROUPS
        .iter()
        .find(|(letters, _)| letters.contains(letter))
        .map_or('0', |&(_, digit)| digit)
}
|
||||
@@ -581,6 +581,12 @@ def validate_fuzzy_jarowin(a):
|
||||
def validate_fuzzy_osadist(a):
|
||||
return a == "3"
|
||||
|
||||
def validate_fuzzy_soundex(a):
    """Return True when `a` is exactly the string "A250"."""
    # NOTE(review): presumably `a` is the shell output of a fuzzy_soundex query — confirm.
    expected = "A250"
    return a == expected
|
||||
|
||||
def validate_fuzzy_phonetic(a):
    """Return True when `a` is exactly the string "ABACAMA"."""
    # "ABACAMA" is the value the test below expects from fuzzy_phonetic('awesome').
    expected = "ABACAMA"
    return a == expected
|
||||
|
||||
def test_fuzzy():
|
||||
limbo = TestTursoShell()
|
||||
ext_path = "./target/debug/liblimbo_fuzzy"
|
||||
@@ -625,6 +631,11 @@ def test_fuzzy():
|
||||
validate_fuzzy_osadist,
|
||||
"fuzzy osadist function works",
|
||||
)
|
||||
limbo.run_test_fn(
|
||||
"SELECT fuzzy_phonetic('awesome');",
|
||||
validate_fuzzy_phonetic,
|
||||
"fuzzy phonetic function works",
|
||||
)
|
||||
|
||||
def test_vfs():
|
||||
limbo = TestTursoShell()
|
||||
|
||||
Reference in New Issue
Block a user