mirror of
https://github.com/aljazceru/turso.git
synced 2026-02-23 17:05:36 +01:00
Merge 'Sqlean fuzzy string ' from Danawan Bimantoro
Add implementations of string distance and phonetics functions: fuzzy_damlev fuzzy_hamming fuzzy_jarowin fuzzy_leven fuzzy_osadist fuzzy_editdist fuzzy_soundex fuzzy_rsoundex fuzzy_phonetic fuzzy_caver fuzzy_translit This implementation follows sqlean-fuzzy Reviewed-by: Preston Thorpe <preston@turso.tech> Closes #3262
This commit is contained in:
8
Cargo.lock
generated
8
Cargo.lock
generated
@@ -2293,6 +2293,14 @@ dependencies = [
|
||||
"turso_ext",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "limbo_fuzzy"
|
||||
version = "0.2.0-pre.8"
|
||||
dependencies = [
|
||||
"mimalloc",
|
||||
"turso_ext",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "limbo_ipaddr"
|
||||
version = "0.2.0-pre.9"
|
||||
|
||||
@@ -19,6 +19,7 @@ members = [
|
||||
"extensions/percentile",
|
||||
"extensions/regexp",
|
||||
"extensions/tests",
|
||||
"extensions/fuzzy",
|
||||
"macros",
|
||||
"simulator",
|
||||
"sqlite3",
|
||||
@@ -62,6 +63,7 @@ limbo_regexp = { path = "extensions/regexp", version = "0.2.0-pre.9" }
|
||||
turso_sqlite3_parser = { path = "vendored/sqlite3-parser", version = "0.2.0-pre.9" }
|
||||
limbo_uuid = { path = "extensions/uuid", version = "0.2.0-pre.9" }
|
||||
turso_parser = { path = "parser", version = "0.2.0-pre.9" }
|
||||
limbo_fuzzy = { path = "extensions/fuzzy", version = "0.2.0-pre.9" }
|
||||
sql_generation = { path = "sql_generation" }
|
||||
strum = { version = "0.26", features = ["derive"] }
|
||||
strum_macros = "0.26"
|
||||
|
||||
20
extensions/fuzzy/Cargo.toml
Normal file
20
extensions/fuzzy/Cargo.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[package]
|
||||
name = "limbo_fuzzy"
|
||||
version.workspace = true
|
||||
authors.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
repository.workspace = true
|
||||
description = "Limbo fuzzy string extension"
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib", "lib"]
|
||||
|
||||
[features]
|
||||
static = ["turso_ext/static"]
|
||||
|
||||
[dependencies]
|
||||
turso_ext = { workspace = true, features = ["static"] }
|
||||
|
||||
[target.'cfg(not(target_family = "wasm"))'.dependencies]
|
||||
mimalloc = { version = "0.1", default-features = false }
|
||||
5
extensions/fuzzy/build.rs
Normal file
5
extensions/fuzzy/build.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
/// Build script: asks Cargo to link `advapi32` when the *target* is Windows.
///
/// `cfg!(target_os = "...")` inside a build script describes the platform the
/// script itself was compiled for (the host), which gives the wrong answer when
/// cross-compiling. Cargo exposes the real target to build scripts through the
/// `CARGO_CFG_TARGET_OS` environment variable, so that is what is consulted.
fn main() {
    // Unset (e.g. when run outside Cargo) is treated as "not Windows".
    let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
    if target_os == "windows" {
        println!("cargo:rustc-link-lib=advapi32");
    }
}
|
||||
191
extensions/fuzzy/src/caver.rs
Normal file
191
extensions/fuzzy/src/caver.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
// remove_non_letters keeps only the ASCII lowercase letters a-z of `src`,
// preserving their order. Everything else (uppercase letters, digits,
// punctuation, whitespace, non-ASCII characters) is dropped.
fn remove_non_letters(src: &str) -> String {
    let mut letters = String::new();
    for ch in src.chars() {
        if matches!(ch, 'a'..='z') {
            letters.push(ch);
        }
    }
    letters
}
|
||||
|
||||
// replace_start swaps a leading `old` for `new`.
// When `src` does not begin with `old`, a plain copy of `src` is returned.
fn replace_start(src: &str, old: &str, new: &str) -> String {
    match src.strip_prefix(old) {
        Some(rest) => [new, rest].concat(),
        None => src.to_owned(),
    }
}
|
||||
|
||||
// replace_end swaps a trailing `old` for `new`.
// When `src` does not end with `old`, a plain copy of `src` is returned.
fn replace_end(src: &str, old: &str, new: &str) -> String {
    match src.strip_suffix(old) {
        Some(head) => format!("{head}{new}"),
        None => src.to_owned(),
    }
}
|
||||
|
||||
// replace substitutes every non-overlapping occurrence of `old` in `src`
// with `new`, scanning left to right.
//
// An empty `old` matches nothing: `src` is returned unchanged (unlike
// `str::replace`, which would insert `new` between every character).
//
// The scan is delegated to `str::replace`, which only ever splits the string
// at character boundaries. The previous hand-rolled byte-offset slicing could
// panic when `src` contained multi-byte characters.
fn replace(src: &str, old: &str, new: &str) -> String {
    if old.is_empty() {
        return src.to_string();
    }
    src.replace(old, new)
}
|
||||
|
||||
// replace_seq collapses every run of consecutive `old` characters in `src`
// into a single copy of `new`; all other characters are copied through.
fn replace_seq(src: &str, old: char, new: &str) -> String {
    let mut out = String::with_capacity(src.len());
    let mut inside_run = false;

    for ch in src.chars() {
        let is_old = ch == old;
        if !is_old {
            out.push(ch);
        } else if !inside_run {
            // First character of a run: emit the replacement once and
            // swallow the remainder of the run.
            out.push_str(new);
        }
        inside_run = is_old;
    }

    out
}
|
||||
|
||||
// pad produces a string of exactly 10 characters: the first 10 characters
// of `src`, followed by as many trailing '1's as are needed to fill up.
fn pad(src: &str) -> String {
    const TARGET_LEN: usize = 10;

    let mut padded: String = src.chars().take(TARGET_LEN).collect();
    // `take` guarantees at most TARGET_LEN characters, so this cannot underflow.
    let missing = TARGET_LEN - padded.chars().count();
    padded.extend(std::iter::repeat('1').take(missing));
    padded
}
|
||||
|
||||
// caverphone implements the Caverphone phonetic hashing algorithm
|
||||
// https://en.wikipedia.org/wiki/Caverphone
|
||||
fn caverphone(src: &str) -> String {
|
||||
if src.is_empty() {
|
||||
return String::new();
|
||||
}
|
||||
|
||||
let mut res = remove_non_letters(src);
|
||||
res = replace_end(&res, "e", "");
|
||||
res = replace_start(&res, "cough", "cou2f");
|
||||
res = replace_start(&res, "rough", "rou2f");
|
||||
res = replace_start(&res, "tough", "tou2f");
|
||||
res = replace_start(&res, "enough", "enou2f");
|
||||
res = replace_start(&res, "trough", "trou2f");
|
||||
|
||||
res = replace_start(&res, "gn", "2n");
|
||||
res = replace_end(&res, "mb", "m2");
|
||||
|
||||
res = replace(&res, "cq", "2q");
|
||||
res = replace(&res, "ci", "si");
|
||||
res = replace(&res, "ce", "se");
|
||||
res = replace(&res, "cy", "sy");
|
||||
res = replace(&res, "tch", "2ch");
|
||||
res = replace(&res, "c", "k");
|
||||
res = replace(&res, "q", "k");
|
||||
res = replace(&res, "x", "k");
|
||||
res = replace(&res, "v", "f");
|
||||
res = replace(&res, "dg", "2g");
|
||||
res = replace(&res, "tio", "sio");
|
||||
res = replace(&res, "tia", "sia");
|
||||
res = replace(&res, "d", "t");
|
||||
res = replace(&res, "ph", "fh");
|
||||
res = replace(&res, "b", "p");
|
||||
res = replace(&res, "sh", "s2");
|
||||
res = replace(&res, "z", "s");
|
||||
|
||||
res = replace_start(&res, "a", "A");
|
||||
res = replace_start(&res, "e", "A");
|
||||
res = replace_start(&res, "i", "A");
|
||||
res = replace_start(&res, "o", "A");
|
||||
res = replace_start(&res, "u", "A");
|
||||
|
||||
res = replace(&res, "a", "3");
|
||||
res = replace(&res, "e", "3");
|
||||
res = replace(&res, "i", "3");
|
||||
res = replace(&res, "o", "3");
|
||||
res = replace(&res, "u", "3");
|
||||
|
||||
res = replace(&res, "j", "y");
|
||||
res = replace_start(&res, "y3", "Y3");
|
||||
res = replace_start(&res, "y", "A");
|
||||
res = replace(&res, "y", "3");
|
||||
|
||||
res = replace(&res, "3gh3", "3kh3");
|
||||
res = replace(&res, "gh", "22");
|
||||
res = replace(&res, "g", "k");
|
||||
|
||||
res = replace_seq(&res, 's', "S");
|
||||
res = replace_seq(&res, 't', "T");
|
||||
res = replace_seq(&res, 'p', "P");
|
||||
res = replace_seq(&res, 'k', "K");
|
||||
res = replace_seq(&res, 'f', "F");
|
||||
res = replace_seq(&res, 'm', "M");
|
||||
res = replace_seq(&res, 'n', "N");
|
||||
|
||||
res = replace(&res, "w3", "W3");
|
||||
res = replace(&res, "wh3", "Wh3");
|
||||
res = replace_end(&res, "w", "3");
|
||||
res = replace(&res, "w", "2");
|
||||
|
||||
res = replace_start(&res, "h", "A");
|
||||
res = replace(&res, "h", "2");
|
||||
|
||||
res = replace(&res, "r3", "R3");
|
||||
res = replace_end(&res, "r", "3");
|
||||
res = replace(&res, "r", "2");
|
||||
|
||||
res = replace(&res, "l3", "L3");
|
||||
res = replace_end(&res, "l", "3");
|
||||
res = replace(&res, "l", "2");
|
||||
|
||||
res = replace(&res, "2", "");
|
||||
res = replace_end(&res, "3", "A");
|
||||
res = replace(&res, "3", "");
|
||||
|
||||
res = pad(&res);
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
pub fn caver_str(input: Option<&str>) -> Option<String> {
|
||||
input.map(caverphone)
|
||||
}
|
||||
54
extensions/fuzzy/src/common.rs
Normal file
54
extensions/fuzzy/src/common.rs
Normal file
@@ -0,0 +1,54 @@
|
||||
// Character classes used by the phonetic / edit-distance code.
// Only the classes referenced by name get a constant; classes 3 (C), 4 (D)
// and 8 (M) appear in the tables below as raw numbers.
pub const CCLASS_SILENT: u8 = 0;
pub const CCLASS_VOWEL: u8 = 1;
pub const CCLASS_B: u8 = 2;
pub const CCLASS_L: u8 = 6;
pub const CCLASS_R: u8 = 7;
//pub const CCLASS_M: u8 = 8;
pub const CCLASS_Y: u8 = 9;
pub const CCLASS_DIGIT: u8 = 10;
pub const CCLASS_SPACE: u8 = 11;
pub const CCLASS_OTHER: u8 = 12;

// Class of each 7-bit character code when it is NOT the first character of
// a word. One row per 16 codes; the trailing comment gives the row's range.
//
// NOTE(review): code 0x1f is classed as SPACE and code 0x26 ('&') as SILENT,
// while 0x20 (' ') and 0x27 ('\'') are OTHER. That looks like an off-by-one
// relative to the upstream spellfix tables - confirm before relying on it.
pub const MID_CLASS: [u8; 128] = [
    12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 12, 12, 11, 11, 12, 12, // 0x00-0x0f
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, // 0x10-0x1f
    12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 0x20-0x2f
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, // 0x30-0x3f
    12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, // 0x40-0x4f
    2, 3, 7, 3, 4, 1, 2, 2, 3, 1, 3, 12, 12, 12, 12, 12, // 0x50-0x5f
    12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, // 0x60-0x6f
    2, 3, 7, 3, 4, 1, 2, 2, 3, 1, 3, 12, 12, 12, 12, 12, // 0x70-0x7f
];

// Class of each 7-bit character code when it IS the first character of a
// word. Differs from MID_CLASS in two places: 'Y'/'y' is class 9 (Y) instead
// of a vowel, and code 0x26 is OTHER instead of SILENT.
pub const INIT_CLASS: [u8; 128] = [
    12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 12, 12, 11, 11, 12, 12, // 0x00-0x0f
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, // 0x10-0x1f
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // 0x20-0x2f
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, // 0x30-0x3f
    12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, // 0x40-0x4f
    2, 3, 7, 3, 4, 1, 2, 2, 3, 9, 3, 12, 12, 12, 12, 12, // 0x50-0x5f
    12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, // 0x60-0x6f
    2, 3, 7, 3, 4, 1, 2, 2, 3, 9, 3, 12, 12, 12, 12, 12, // 0x70-0x7f
];

// One display byte per character class, indexed by class number 0..=12.
// Based on: const unsigned char className[] = ".ABCDHLRMY9 ?";
//   0 '.' silent   1 'A' vowel   2 'B'   3 'C'   4 'D'   5 'H'   6 'L'
//   7 'R'   8 'M'   9 'Y'   10 '9' digit   11 ' ' space   12 '?' other
pub const CLASS_NAME: [u8; 13] = *b".ABCDHLRMY9 ?";

// Bit flags identifying a script; each script owns a distinct bit so several
// can be OR-ed together.
pub const SCRIPT_LATIN: u32 = 0x0001;
pub const SCRIPT_CYRILLIC: u32 = 0x0002;
pub const SCRIPT_GREEK: u32 = 0x0004;
pub const SCRIPT_HEBREW: u32 = 0x0008;
pub const SCRIPT_ARABIC: u32 = 0x0010;
||||
276
extensions/fuzzy/src/editdist.rs
Normal file
276
extensions/fuzzy/src/editdist.rs
Normal file
@@ -0,0 +1,276 @@
|
||||
// Adapted from SQLite spellfix.c extension and sqlean fuzzy/editdist.c
|
||||
use crate::common::*;
|
||||
|
||||
/// Reasons [`edit_distance`] can refuse to compute a cost.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EditDistanceError {
    /// At least one of the two input strings contains a non-ASCII character.
    NonAsciiInput,
}

impl std::fmt::Display for EditDistanceError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            EditDistanceError::NonAsciiInput => f.write_str("input contains non-ASCII characters"),
        }
    }
}

impl std::error::Error for EditDistanceError {}

/// Outcome of an edit-distance computation: the cost on success.
pub type EditDistanceResult = Result<i32, EditDistanceError>;
|
||||
|
||||
fn character_class(c_prev: u8, c: u8) -> u8 {
|
||||
if c_prev == 0 {
|
||||
INIT_CLASS[(c & 0x7f) as usize]
|
||||
} else {
|
||||
MID_CLASS[(c & 0x7f) as usize]
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the cost of inserting or deleting character c immediately
|
||||
/// following character c_prev. If c_prev == 0, that means c is the first
|
||||
/// character of the word.
|
||||
fn insert_or_delete_cost(c_prev: u8, c: u8, c_next: u8) -> i32 {
|
||||
let class_c = character_class(c_prev, c);
|
||||
|
||||
if class_c == CCLASS_SILENT {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if c_prev == c {
|
||||
return 10;
|
||||
}
|
||||
|
||||
if class_c == CCLASS_VOWEL && (c_prev == b'r' || c_next == b'r') {
|
||||
return 20; // Insert a vowel before or after 'r'
|
||||
}
|
||||
|
||||
let class_c_prev = character_class(c_prev, c_prev);
|
||||
if class_c == class_c_prev {
|
||||
if class_c == CCLASS_VOWEL {
|
||||
15
|
||||
} else {
|
||||
50
|
||||
}
|
||||
} else {
|
||||
// Any other character insertion or deletion
|
||||
100
|
||||
}
|
||||
}
|
||||
|
||||
const FINAL_INS_COST_DIV: i32 = 4;
|
||||
|
||||
/// Return the cost of substituting c_to in place of c_from assuming
|
||||
/// the previous character is c_prev. If c_prev == 0 then c_to is the first
|
||||
/// character of the word.
|
||||
fn substitute_cost(c_prev: u8, c_from: u8, c_to: u8) -> i32 {
|
||||
if c_from == c_to {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if c_from == (c_to ^ 0x20) && c_to.is_ascii_alphabetic() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let class_from = character_class(c_prev, c_from);
|
||||
let class_to = character_class(c_prev, c_to);
|
||||
|
||||
if class_from == class_to {
|
||||
40
|
||||
} else if (CCLASS_B..=CCLASS_Y).contains(&class_from)
|
||||
&& (CCLASS_B..=CCLASS_Y).contains(&class_to)
|
||||
{
|
||||
75
|
||||
} else {
|
||||
100
|
||||
}
|
||||
}
|
||||
|
||||
/// Given two strings z_a and z_b which are pure ASCII, return the cost
|
||||
/// of transforming z_a into z_b. If z_a ends with '*' assume that it is
|
||||
/// a prefix of z_b and give only minimal penalty for extra characters
|
||||
/// on the end of z_b.
|
||||
///
|
||||
/// Returns cost where smaller numbers mean a closer match
|
||||
///
|
||||
/// Returns Err for Non-ASCII characters on input
|
||||
pub fn edit_distance(z_a: &str, z_b: &str) -> EditDistanceResult {
|
||||
if z_a.is_empty() && z_b.is_empty() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let za_bytes = z_a.as_bytes();
|
||||
let zb_bytes = z_b.as_bytes();
|
||||
|
||||
if !z_a.is_ascii() || !z_b.is_ascii() {
|
||||
return Err(EditDistanceError::NonAsciiInput);
|
||||
}
|
||||
|
||||
if z_a.is_empty() {
|
||||
let mut res = 0;
|
||||
let mut c_b_prev = 0u8;
|
||||
let zb_bytes = z_b.as_bytes();
|
||||
|
||||
for (i, &c_b) in zb_bytes.iter().enumerate() {
|
||||
let c_b_next = if i + 1 < zb_bytes.len() {
|
||||
zb_bytes[i + 1]
|
||||
} else {
|
||||
0
|
||||
};
|
||||
res += insert_or_delete_cost(c_b_prev, c_b, c_b_next) / FINAL_INS_COST_DIV;
|
||||
c_b_prev = c_b;
|
||||
}
|
||||
return Ok(res);
|
||||
}
|
||||
|
||||
if z_b.is_empty() {
|
||||
let mut res = 0;
|
||||
let mut c_a_prev = 0u8;
|
||||
let za_bytes = z_a.as_bytes();
|
||||
|
||||
for (i, &c_a) in za_bytes.iter().enumerate() {
|
||||
let c_a_next = if i + 1 < za_bytes.len() {
|
||||
za_bytes[i + 1]
|
||||
} else {
|
||||
0
|
||||
};
|
||||
res += insert_or_delete_cost(c_a_prev, c_a, c_a_next);
|
||||
c_a_prev = c_a;
|
||||
}
|
||||
return Ok(res);
|
||||
}
|
||||
|
||||
let mut za_start = 0;
|
||||
let mut zb_start = 0;
|
||||
|
||||
// Skip any common prefix
|
||||
while za_start < za_bytes.len()
|
||||
&& zb_start < zb_bytes.len()
|
||||
&& za_bytes[za_start] == zb_bytes[zb_start]
|
||||
{
|
||||
za_start += 1;
|
||||
zb_start += 1;
|
||||
}
|
||||
|
||||
// If both strings are exhausted after common prefix
|
||||
if za_start >= za_bytes.len() && zb_start >= zb_bytes.len() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let za_remaining = &za_bytes[za_start..];
|
||||
let zb_remaining = &zb_bytes[zb_start..];
|
||||
let n_a = za_remaining.len();
|
||||
let n_b = zb_remaining.len();
|
||||
|
||||
// Special processing if either remaining string is empty after prefix matching
|
||||
if n_a == 0 {
|
||||
let mut res = 0;
|
||||
let mut c_b_prev = if za_start > 0 {
|
||||
za_bytes[za_start - 1]
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
for (i, &c_b) in zb_remaining.iter().enumerate() {
|
||||
let c_b_next = if i + 1 < n_b { zb_remaining[i + 1] } else { 0 };
|
||||
res += insert_or_delete_cost(c_b_prev, c_b, c_b_next) / FINAL_INS_COST_DIV;
|
||||
c_b_prev = c_b;
|
||||
}
|
||||
return Ok(res);
|
||||
}
|
||||
|
||||
if n_b == 0 {
|
||||
let mut res = 0;
|
||||
let mut c_a_prev = if za_start > 0 {
|
||||
za_bytes[za_start - 1]
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
for (i, &c_a) in za_remaining.iter().enumerate() {
|
||||
let c_a_next = if i + 1 < n_a { za_remaining[i + 1] } else { 0 };
|
||||
res += insert_or_delete_cost(c_a_prev, c_a, c_a_next);
|
||||
c_a_prev = c_a;
|
||||
}
|
||||
return Ok(res);
|
||||
}
|
||||
|
||||
// Check if a is a prefix pattern
|
||||
if za_remaining.len() == 1 && za_remaining[0] == b'*' {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let mut m = vec![0i32; n_b + 1];
|
||||
let mut cx = vec![0u8; n_b + 1];
|
||||
|
||||
let dc = if za_start > 0 {
|
||||
za_bytes[za_start - 1]
|
||||
} else {
|
||||
0
|
||||
};
|
||||
m[0] = 0;
|
||||
cx[0] = dc;
|
||||
|
||||
let mut c_b_prev = dc;
|
||||
for x_b in 1..=n_b {
|
||||
let c_b = zb_remaining[x_b - 1];
|
||||
let c_b_next = if x_b < n_b { zb_remaining[x_b] } else { 0 };
|
||||
cx[x_b] = c_b;
|
||||
m[x_b] = m[x_b - 1] + insert_or_delete_cost(c_b_prev, c_b, c_b_next);
|
||||
c_b_prev = c_b;
|
||||
}
|
||||
|
||||
let mut c_a_prev = dc;
|
||||
for x_a in 1..=n_a {
|
||||
let last_a = x_a == n_a;
|
||||
let c_a = za_remaining[x_a - 1];
|
||||
let c_a_next = if x_a < n_a { za_remaining[x_a] } else { 0 };
|
||||
|
||||
if c_a == b'*' && last_a {
|
||||
break;
|
||||
}
|
||||
|
||||
let mut d = m[0];
|
||||
m[0] = d + insert_or_delete_cost(c_a_prev, c_a, c_a_next);
|
||||
|
||||
for x_b in 1..=n_b {
|
||||
let c_b = zb_remaining[x_b - 1];
|
||||
let c_b_next = if x_b < n_b { zb_remaining[x_b] } else { 0 };
|
||||
|
||||
// Cost to insert c_b
|
||||
let mut ins_cost = insert_or_delete_cost(cx[x_b - 1], c_b, c_b_next);
|
||||
if last_a {
|
||||
ins_cost /= FINAL_INS_COST_DIV;
|
||||
}
|
||||
|
||||
// Cost to delete c_a
|
||||
let del_cost = insert_or_delete_cost(cx[x_b], c_a, c_b_next);
|
||||
|
||||
// Cost to substitute c_a -> c_b
|
||||
let sub_cost = substitute_cost(cx[x_b - 1], c_a, c_b);
|
||||
|
||||
// Find best cost
|
||||
let mut total_cost = ins_cost + m[x_b - 1];
|
||||
let mut ncx = c_b;
|
||||
|
||||
if del_cost + m[x_b] < total_cost {
|
||||
total_cost = del_cost + m[x_b];
|
||||
ncx = c_a;
|
||||
}
|
||||
|
||||
if sub_cost + d < total_cost {
|
||||
total_cost = sub_cost + d;
|
||||
}
|
||||
|
||||
d = m[x_b];
|
||||
m[x_b] = total_cost;
|
||||
cx[x_b] = ncx;
|
||||
}
|
||||
c_a_prev = c_a;
|
||||
}
|
||||
|
||||
let res = if za_remaining.last() == Some(&b'*') {
|
||||
let mut min_cost = m[1];
|
||||
|
||||
for &val in m.iter().skip(1).take(n_b) {
|
||||
if val < min_cost {
|
||||
min_cost = val;
|
||||
}
|
||||
}
|
||||
|
||||
min_cost
|
||||
} else {
|
||||
m[n_b]
|
||||
};
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
666
extensions/fuzzy/src/lib.rs
Normal file
666
extensions/fuzzy/src/lib.rs
Normal file
@@ -0,0 +1,666 @@
|
||||
// Adapted from sqlean fuzzy
|
||||
use std::cmp;
|
||||
use turso_ext::{register_extension, scalar, ResultCode, Value};
|
||||
mod caver;
|
||||
mod common;
|
||||
mod editdist;
|
||||
mod phonetic;
|
||||
mod rsoundex;
|
||||
mod soundex;
|
||||
mod translit;
|
||||
|
||||
register_extension! {
|
||||
scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist, fuzzy_soundex, fuzzy_phonetic, fuzzy_caver, fuzzy_rsoundex, fuzzy_translit, fuzzy_script}
|
||||
}
|
||||
|
||||
/// Calculates and returns the Levenshtein distance of two non NULL strings.
|
||||
#[scalar(name = "fuzzy_leven")]
|
||||
fn levenshtein(args: &[Value]) -> Value {
|
||||
let Some(arg1) = args[0].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let Some(arg2) = args[1].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let dist = leven(arg1, arg2);
|
||||
return Value::from_integer(dist);
|
||||
}
|
||||
|
||||
/// Levenshtein distance between `s1` and `s2`, computed over their UTF-8
/// bytes (so a multi-byte character counts once per differing byte).
///
/// Uses a single rolling row of the dynamic-programming table after
/// stripping the common prefix.
fn leven(s1: &str, s2: &str) -> i64 {
    let (full1, full2) = (s1.as_bytes(), s2.as_bytes());

    if full1.is_empty() {
        return full2.len() as i64;
    }
    if full2.is_empty() {
        return full1.len() as i64;
    }

    // A shared prefix never contributes to the distance.
    let shared = full1
        .iter()
        .zip(full2)
        .take_while(|(x, y)| x == y)
        .count();
    let (a, b) = (&full1[shared..], &full2[shared..]);

    // costs[i] = distance between a[..i] and the prefix of b handled so far.
    let mut costs: Vec<usize> = (0..=a.len()).collect();

    for (j, &cb) in b.iter().enumerate() {
        // Value of the cell diagonally up-left of the one being computed.
        let mut diag = j;
        costs[0] = j + 1;

        for (i, &ca) in a.iter().enumerate() {
            let above = costs[i + 1];
            let substitution = diag + usize::from(ca != cb);
            costs[i + 1] = (above + 1).min(costs[i] + 1).min(substitution);
            diag = above;
        }
    }

    costs[a.len()] as i64
}
|
||||
|
||||
/// Calculates and returns the Damerau-Levenshtein distance of two non NULL
|
||||
#[scalar(name = "fuzzy_damlev")]
|
||||
fn damerau_levenshtein(args: &[Value]) -> Value {
|
||||
let Some(arg1) = args[0].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let Some(arg2) = args[1].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let dist = damlev(arg1, arg2);
|
||||
return Value::from_integer(dist);
|
||||
}
|
||||
|
||||
/// Damerau-Levenshtein distance (insertions, deletions, substitutions and
/// transpositions of adjacent symbols) between `s1` and `s2`, computed over
/// their UTF-8 bytes.
///
/// The table carries an extra leading row and column holding a sentinel
/// "infinity" (`len1 + len2`), so cell `[r + 2][c + 2]` belongs to byte `r`
/// of `s1` and byte `c` of `s2`.
// Index loops are kept because the border initialisation writes by position.
#[allow(clippy::needless_range_loop)]
fn damlev(s1: &str, s2: &str) -> i64 {
    // One slot per possible byte value. The table is indexed by `u8`, so it
    // needs 256 entries (a 255-entry table cannot hold index 255).
    const ALPHA_SIZE: usize = 256;

    let (full1, full2) = (s1.as_bytes(), s2.as_bytes());

    if full1.is_empty() {
        return full2.len() as i64;
    }
    if full2.is_empty() {
        return full1.len() as i64;
    }

    // A shared prefix never contributes to the distance.
    let shared = full1
        .iter()
        .zip(full2)
        .take_while(|(x, y)| x == y)
        .count();
    let (str1, str2) = (&full1[shared..], &full2[shared..]);
    let (len1, len2) = (str1.len(), str2.len());

    let infi = len1 + len2;
    let rows = len1 + 2;
    let cols = len2 + 2;

    // last_row[b]: 1-based row of `str1` where byte `b` was last seen (0 = never).
    let mut last_row = [0usize; ALPHA_SIZE];
    let mut matrix = vec![vec![0usize; cols]; rows];

    matrix[0][0] = infi;
    for i in 1..rows {
        matrix[i][0] = infi;
        matrix[i][1] = i - 1;
    }
    for j in 1..cols {
        matrix[0][j] = infi;
        matrix[1][j] = j - 1;
    }

    for (row, &c1) in str1.iter().enumerate() {
        // 1-based column of the last match found in this row (0 = none yet).
        let mut last_match_col = 0;

        for (col, &c2) in str2.iter().enumerate() {
            let i = last_row[usize::from(c2)];
            let k = last_match_col;
            let cost = usize::from(c1 != c2);
            if cost == 0 {
                last_match_col = col + 1;
            }

            let substitution = matrix[row + 1][col + 1] + cost;
            let insertion = matrix[row + 2][col + 1] + 1;
            let deletion = matrix[row + 1][col + 2] + 1;
            // Transposition: jump back to where both bytes were last seen and
            // pay for the symbols skipped in between (i <= row and k <= col).
            let transposition = matrix[i][k] + (row - i) + (col - k) + 1;

            matrix[row + 2][col + 2] = substitution
                .min(insertion)
                .min(deletion)
                .min(transposition);
        }
        last_row[usize::from(c1)] = row + 1;
    }

    matrix[rows - 1][cols - 1] as i64
}
|
||||
//
|
||||
// fuzzy_editdist(A,B)
|
||||
//
|
||||
// Return the cost of transforming string A into string B. Both strings
|
||||
// must be pure ASCII text. If A ends with '*' then it is assumed to be
|
||||
// a prefix of B and extra characters on the end of B have minimal additional
|
||||
// cost.
|
||||
//
|
||||
#[scalar(name = "fuzzy_editdist")]
|
||||
fn edit_distance(args: &[Value]) {
|
||||
let Some(arg1) = args[0].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let Some(arg2) = args[1].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
if let Ok(res) = editdist::edit_distance(arg1, arg2) {
|
||||
return Value::from_integer(res as i64);
|
||||
} else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
}
|
||||
}
|
||||
|
||||
// returns the hamming distance between two strings
|
||||
#[scalar(name = "fuzzy_hamming")]
|
||||
fn hamming(args: &[Value]) {
|
||||
let Some(arg1) = args[0].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let Some(arg2) = args[1].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let dist = hamming_dist(arg1, arg2);
|
||||
return Value::from_integer(dist);
|
||||
}
|
||||
|
||||
/// Hamming distance between `s1` and `s2`: the number of byte positions at
/// which the two strings differ.
///
/// Returns `-1` when the strings do not have the same length in bytes.
fn hamming_dist(s1: &str, s2: &str) -> i64 {
    let (left, right) = (s1.as_bytes(), s2.as_bytes());
    if left.len() != right.len() {
        return -1;
    }

    let mut differing = 0usize;
    for (a, b) in left.iter().zip(right) {
        if a != b {
            differing += 1;
        }
    }
    differing as i64
}
|
||||
#[scalar(name = "fuzzy_jarowin")]
|
||||
fn jaronwin(args: &[Value]) {
|
||||
let Some(arg1) = args[0].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let Some(arg2) = args[1].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let res = jaro_winkler(arg1, arg2);
|
||||
return Value::from_float(res);
|
||||
}
|
||||
|
||||
/// Calculates and returns the Jaro-Winkler distance of two non NULL strings.
|
||||
fn jaro_winkler(s1: &str, s2: &str) -> f64 {
|
||||
let dist = jaro(s1, s2);
|
||||
|
||||
let mut prefix_len = 0;
|
||||
for (c1, c2) in s1.chars().zip(s2.chars()) {
|
||||
if c1 == c2 {
|
||||
prefix_len += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
if prefix_len == 3 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
dist + (prefix_len as f64) * 0.1 * (1.0 - dist)
|
||||
}
|
||||
|
||||
/// Calculates the Jaro score of two strings, compared character by character.
///
/// Identical strings score `1.0`; if either string is empty (and they are not
/// both empty) or no characters match, the score is `0.0`.
fn jaro(s1: &str, s2: &str) -> f64 {
    if s1 == s2 {
        return 1.0;
    }

    let a: Vec<char> = s1.chars().collect();
    let b: Vec<char> = s2.chars().collect();
    let (len1, len2) = (a.len(), b.len());

    if len1 == 0 || len2 == 0 {
        return 0.0;
    }

    // Two equal characters only count as a match when their positions are
    // at most `window` apart.
    let window = (len1.max(len2) / 2).saturating_sub(1);

    let mut a_matched = vec![false; len1];
    let mut b_matched = vec![false; len2];
    let mut match_count = 0usize;

    for (i, &ca) in a.iter().enumerate() {
        let lo = i.saturating_sub(window);
        let hi = (i + window + 1).min(len2);

        // First still-unclaimed occurrence of `ca` inside the window.
        if let Some(j) = (lo..hi).find(|&j| !b_matched[j] && b[j] == ca) {
            a_matched[i] = true;
            b_matched[j] = true;
            match_count += 1;
        }
    }

    if match_count == 0 {
        return 0.0;
    }

    // Walk the matched characters of both strings in order; every position
    // where they disagree is half of a transposition. Both sequences have
    // exactly `match_count` entries, so the zip pairs them all.
    let a_seq = a.iter().zip(&a_matched).filter(|pair| *pair.1).map(|pair| pair.0);
    let b_seq = b.iter().zip(&b_matched).filter(|pair| *pair.1).map(|pair| pair.0);
    let out_of_order = a_seq.zip(b_seq).filter(|(x, y)| x != y).count();

    let t = out_of_order as f64 / 2.0;
    let match_count = match_count as f64;

    (match_count / len1 as f64 + match_count / len2 as f64 + (match_count - t) / match_count) / 3.0
}
|
||||
|
||||
/// Computes and returns the Optimal String Alignment distance for two non NULL
|
||||
#[scalar(name = "fuzzy_osadist")]
|
||||
fn osadist(args: &[Value]) {
|
||||
let Some(arg1) = args[0].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let Some(arg2) = args[1].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
|
||||
let dist = optimal_string_alignment(arg1, arg2);
|
||||
return Value::from_integer(dist as i64);
|
||||
}
|
||||
|
||||
/// Optimal-string-alignment style distance between `s1` and `s2`, compared
/// character by character after stripping their common prefix.
///
/// The common prefix is skipped by slicing rather than by repeatedly removing
/// the first element of each vector, which was quadratic in the prefix length.
fn optimal_string_alignment(s1: &str, s2: &str) -> usize {
    let all1: Vec<char> = s1.chars().collect();
    let all2: Vec<char> = s2.chars().collect();

    let shared = all1
        .iter()
        .zip(&all2)
        .take_while(|(a, b)| a == b)
        .count();
    let s1_chars = &all1[shared..];
    let s2_chars = &all2[shared..];

    let len1 = s1_chars.len();
    let len2 = s2_chars.len();

    if len1 == 0 {
        return len2;
    }
    if len2 == 0 {
        return len1;
    }

    let mut matrix = vec![vec![0usize; len2 + 1]; len1 + 1];

    // First column / first row: distance from the empty string.
    for (i, row) in matrix.iter_mut().enumerate() {
        row[0] = i;
    }
    for (j, cell) in matrix[0].iter_mut().enumerate() {
        *cell = j;
    }

    for i in 1..=len1 {
        for j in 1..=len2 {
            let cost = usize::from(s1_chars[i - 1] != s2_chars[j - 1]);

            let deletion = matrix[i - 1][j] + 1;
            let insertion = matrix[i][j - 1] + 1;
            let substitution = matrix[i - 1][j - 1] + cost;

            matrix[i][j] = deletion.min(insertion).min(substitution);

            // NOTE(review): the transposition test looks one character
            // *ahead* (`i % len1`, `j % len2`, wrapping to index 0 at the
            // end) rather than at `i - 1` / `j - 1` as the textbook
            // recurrence does. Kept as-is to preserve existing results -
            // confirm against the reference implementation.
            if i > 1
                && j > 1
                && s1_chars[i % len1] == s2_chars[j - 2]
                && s1_chars[i - 2] == s2_chars[j % len2]
            {
                matrix[i][j] = matrix[i][j].min(matrix[i - 2][j - 2] + cost);
            }
        }
    }

    matrix[len1][len2]
}
|
||||
|
||||
#[scalar(name = "fuzzy_soundex")]
|
||||
fn fuzzy_soundex(args: &[Value]) {
|
||||
let arg1 = args[0].to_text();
|
||||
if let Some(txt) = soundex::soundex(arg1) {
|
||||
Value::from_text(txt)
|
||||
} else {
|
||||
Value::null()
|
||||
}
|
||||
}
|
||||
|
||||
#[scalar(name = "fuzzy_phonetic")]
|
||||
fn fuzzy_phonetic(args: &[Value]) {
|
||||
let arg1 = args[0].to_text();
|
||||
if let Some(txt) = phonetic::phonetic_hash_str(arg1) {
|
||||
Value::from_text(txt)
|
||||
} else {
|
||||
Value::null()
|
||||
}
|
||||
}
|
||||
|
||||
#[scalar(name = "fuzzy_caver")]
|
||||
fn fuzzy_caver(args: &[Value]) {
|
||||
let arg1 = args[0].to_text();
|
||||
if let Some(txt) = caver::caver_str(arg1) {
|
||||
Value::from_text(txt)
|
||||
} else {
|
||||
Value::null()
|
||||
}
|
||||
}
|
||||
|
||||
#[scalar(name = "fuzzy_rsoundex")]
|
||||
pub fn fuzzy_rsoundex(args: &[Value]) {
|
||||
let arg1 = args[0].to_text();
|
||||
if let Some(txt) = rsoundex::rsoundex(arg1) {
|
||||
Value::from_text(txt)
|
||||
} else {
|
||||
Value::null()
|
||||
}
|
||||
}
|
||||
|
||||
//Convert a string that contains non-ASCII Roman characters into
|
||||
//pure ASCII.
|
||||
#[scalar(name = "fuzzy_translit")]
|
||||
fn fuzzy_translit(args: &[Value]) {
|
||||
let Some(arg) = args[0].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
let dist = translit::transliterate_str(arg);
|
||||
return Value::from_text(dist);
|
||||
}
|
||||
|
||||
// Try to determine the dominant script used by the word X and return
|
||||
// its ISO 15924 numeric code.
|
||||
//
|
||||
// The current implementation only understands the following scripts:
|
||||
//
|
||||
// 125 (Hebrew)
|
||||
// 160 (Arabic)
|
||||
// 200 (Greek)
|
||||
// 215 (Latin)
|
||||
// 220 (Cyrillic)
|
||||
//
|
||||
// This routine will return 998 if the input X contains characters from
|
||||
// two or more of the above scripts or 999 if X contains no characters
|
||||
// from any of the above scripts.
|
||||
#[scalar(name = "fuzzy_script")]
|
||||
pub fn fuzzy_script(args: &[Value]) {
|
||||
let Some(arg) = args[0].to_text() else {
|
||||
return Value::error(ResultCode::InvalidArgs);
|
||||
};
|
||||
let dist = translit::script_code(arg.as_bytes());
|
||||
return Value::from_integer(dist as i64);
|
||||
}
|
||||
|
||||
// Unit tests; the inputs and expected values are adapted from sqlean fuzzy.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_damlev() {
        let table = [
            ("abc", "abc", 0),
            ("abc", "", 3),
            ("", "abc", 3),
            ("abc", "ab", 1),
            ("abc", "abcd", 1),
            ("abc", "acb", 1),
            ("abc", "ca", 2),
        ];

        for (left, right, want) in table {
            assert_eq!(damlev(left, right), want, "damlev({left}, {right}) failed");
        }
    }

    #[test]
    fn test_hamming() {
        // -1 is the value expected for inputs of different length.
        let table = [
            ("abc", "abc", 0),
            ("abc", "", -1),
            ("", "abc", -1),
            ("hello", "hellp", 1),
            ("hello", "heloh", 2),
        ];

        for (left, right, want) in table {
            assert_eq!(
                hamming_dist(left, right),
                want,
                "hamming({left}, {right}) failed"
            );
        }
    }

    #[test]
    fn test_jaro_win() {
        let table: [(&str, &str, f64); 5] = [
            ("abc", "abc", 1.0),
            ("abc", "", 0.0),
            ("", "abc", 0.0),
            ("my string", "my tsring", 0.974),
            ("my string", "my ntrisg", 0.896),
        ];

        for (left, right, want) in table {
            let raw = jaro_winkler(left, right);

            // The two approximate reference values are only given to three
            // decimals, so the result is rounded before comparing; the exact
            // cases are compared as-is.
            let approximate = (want - 0.974).abs() < 1e-6 || (want - 0.896).abs() < 1e-6;
            let shown = if approximate {
                (raw * 1000.0).round() / 1000.0
            } else {
                raw
            };

            assert!(
                (shown - want).abs() < 1e-6,
                "jaro_winkler({left}, {right}) failed: got {shown}, expected {want}"
            );
        }
    }

    #[test]
    fn test_leven() {
        let table = [
            ("abc", "abc", 0),
            ("abc", "", 3),
            ("", "abc", 3),
            ("abc", "ab", 1),
            ("abc", "abcd", 1),
            ("abc", "acb", 2),
            ("abc", "ca", 3),
        ];

        for (left, right, want) in table {
            assert_eq!(leven(left, right), want, "leven({left}, {right}) failed");
        }
    }

    #[test]
    fn test_edit_distance() {
        let table = [
            ("abc", "abc", 0),
            ("abc", "", 300),
            ("", "abc", 75),
            ("abc", "ab", 100),
            ("abc", "abcd", 25),
            ("abc", "acb", 110),
            ("abc", "ca", 225),
            // more cases
            ("awesome", "aewsme", 215),
            ("kitten", "sitting", 105),
            ("flaw", "lawn", 110),
            ("rust", "trust", 100),
            ("gumbo", "gambol", 65),
        ];

        for (left, right, want) in table {
            let got = editdist::edit_distance(left, right).unwrap();
            assert_eq!(got, want, "edit_distance({left}, {right}) failed");
        }
    }

    #[test]
    fn test_osadist() {
        let table = [
            ("abc", "abc", 0),
            ("abc", "", 3),
            ("", "abc", 3),
            ("abc", "ab", 1),
            ("abc", "abcd", 1),
            ("abc", "acb", 2),
            ("abc", "ca", 3),
        ];

        for (left, right, want) in table {
            assert_eq!(
                optimal_string_alignment(left, right),
                want,
                "osadist({left}, {right}) failed"
            );
        }
    }

    #[test]
    fn test_soundex() {
        let table = [
            (None, None),
            (Some(""), Some("")),
            (Some("phonetics"), Some("P532")),
            (Some("is"), Some("I200")),
            (Some("awesome"), Some("A250")),
        ];

        for (input, want) in table {
            let got = soundex::soundex(input);
            assert_eq!(
                got,
                want.map(str::to_owned),
                "fuzzy_soundex({input:?}) failed: expected {want:?}, got {got:?}"
            );
        }
    }

    #[test]
    fn test_phonetic() {
        let table = [
            (None, None),
            (Some(""), Some("")),
            (Some("phonetics"), Some("BAMADAC")),
            (Some("is"), Some("AC")),
            (Some("awesome"), Some("ABACAMA")),
        ];

        for (input, want) in table {
            let got = phonetic::phonetic_hash_str(input);
            assert_eq!(
                got,
                want.map(str::to_owned),
                "fuzzy_phonetic({input:?}) failed: expected {want:?}, got {got:?}"
            );
        }
    }

    #[test]
    fn test_caver() {
        let table = [
            (None, None),
            (Some(""), Some("")),
            (Some("phonetics"), Some("FNTKS11111")),
            (Some("is"), Some("AS11111111")),
            (Some("awesome"), Some("AWSM111111")),
        ];

        for (input, want) in table {
            let got = caver::caver_str(input);
            assert_eq!(
                got,
                want.map(str::to_owned),
                "fuzzy_caver({input:?}) failed: expected {want:?}, got {got:?}"
            );
        }
    }

    #[test]
    fn test_rsoundex() {
        let table = [
            (None, None),
            (Some(""), Some("")),
            (Some("phonetics"), Some("P1080603")),
            (Some("is"), Some("I03")),
            (Some("awesome"), Some("A03080")),
        ];

        for (input, want) in table {
            let got = rsoundex::rsoundex(input);
            assert_eq!(
                got,
                want.map(str::to_owned),
                "fuzzy_rsoundex({input:?}) failed: expected {want:?}, got {got:?}"
            );
        }
    }
}
|
||||
110
extensions/fuzzy/src/phonetic.rs
Normal file
110
extensions/fuzzy/src/phonetic.rs
Normal file
@@ -0,0 +1,110 @@
|
||||
use crate::common::*;
|
||||
|
||||
/// Generate a "phonetic hash" from a string of ASCII characters.
|
||||
///
|
||||
/// The algorithm:
|
||||
/// Maps characters by character class as defined above
|
||||
/// Omits double-letters
|
||||
/// Omits vowels beside R and L
|
||||
/// Omits T when followed by CH
|
||||
/// Omits W when followed by R
|
||||
/// Omits D when followed by J or G
|
||||
/// Omits K in KN or G in GN at the beginning of a word
|
||||
///
|
||||
/// Returns a Vec<u8> containing the phonetic hash, or None if input is invalid.
|
||||
pub fn phonetic_hash(z_in: &[u8]) -> Option<Vec<u8>> {
|
||||
if z_in.is_empty() {
|
||||
return Some(Vec::new());
|
||||
}
|
||||
|
||||
let mut z_out = Vec::with_capacity(z_in.len() + 1);
|
||||
let mut c_prev = 0x77u8;
|
||||
let mut c_prev_x = 0x77u8;
|
||||
let mut a_class = &INIT_CLASS;
|
||||
|
||||
let mut input = z_in;
|
||||
if z_in.len() > 2 {
|
||||
match z_in[0] {
|
||||
b'g' | b'k' => {
|
||||
if z_in[1] == b'n' {
|
||||
input = &z_in[1..];
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let mut i = 0;
|
||||
while i < input.len() {
|
||||
let mut c = input[i];
|
||||
|
||||
if i + 1 < input.len() {
|
||||
if c == b'w' && input[i + 1] == b'r' {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
if c == b'd' && (input[i + 1] == b'j' || input[i + 1] == b'g') {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
if i + 2 < input.len() && c == b't' && input[i + 1] == b'c' && input[i + 2] == b'h' {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
c = a_class[(c & 0x7f) as usize];
|
||||
|
||||
if c == CCLASS_SPACE {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if c == CCLASS_OTHER && c_prev != CCLASS_DIGIT {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
a_class = &MID_CLASS;
|
||||
|
||||
if c == CCLASS_VOWEL && (c_prev_x == CCLASS_R || c_prev_x == CCLASS_L) {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c == CCLASS_R || c == CCLASS_L) && c_prev_x == CCLASS_VOWEL && !z_out.is_empty() {
|
||||
z_out.pop();
|
||||
}
|
||||
|
||||
c_prev = c;
|
||||
|
||||
if c == CCLASS_SILENT {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
c_prev_x = c;
|
||||
if (c as usize) < CLASS_NAME.len() {
|
||||
c = CLASS_NAME[c as usize];
|
||||
} else {
|
||||
c = b'?';
|
||||
}
|
||||
|
||||
if z_out.is_empty() || c != *z_out.last().unwrap() {
|
||||
z_out.push(c);
|
||||
}
|
||||
|
||||
i += 1;
|
||||
}
|
||||
|
||||
Some(z_out)
|
||||
}
|
||||
|
||||
pub fn phonetic_hash_str(input: Option<&str>) -> Option<String> {
|
||||
match input {
|
||||
None => None,
|
||||
Some(s) => {
|
||||
phonetic_hash(s.as_bytes()).map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
|
||||
}
|
||||
}
|
||||
}
|
||||
49
extensions/fuzzy/src/rsoundex.rs
Normal file
49
extensions/fuzzy/src/rsoundex.rs
Normal file
@@ -0,0 +1,49 @@
|
||||
/// Computes and returns the soundex representation of a given non NULL string.
|
||||
/// More information about the algorithm can be found here:
|
||||
/// http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html
|
||||
pub fn rsoundex(input: Option<&str>) -> Option<String> {
|
||||
if let Some(s) = input {
|
||||
if s.is_empty() {
|
||||
return Some("".to_string());
|
||||
}
|
||||
|
||||
let str_bytes = s.as_bytes();
|
||||
let str_len = str_bytes.len();
|
||||
|
||||
let mut code = String::with_capacity(str_len + 1);
|
||||
code.push(str_bytes[0].to_ascii_uppercase() as char);
|
||||
|
||||
let mut buf: Vec<char> = Vec::with_capacity(str_len);
|
||||
for &b in str_bytes {
|
||||
buf.push(refined_soundex_encode(b as char));
|
||||
}
|
||||
|
||||
let mut prev: Option<char> = None;
|
||||
for c in buf {
|
||||
if Some(c) != prev {
|
||||
code.push(c);
|
||||
prev = Some(c);
|
||||
}
|
||||
}
|
||||
|
||||
Some(code)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps a character to its refined Soundex digit, ignoring ASCII case.
///
/// Anything that is not one of the listed consonants (vowels, digits,
/// punctuation, non-ASCII characters, ...) maps to `'0'`.
fn refined_soundex_encode(c: char) -> char {
    // Letter groups paired with the digit they share.
    const GROUPS: [(&str, char); 9] = [
        ("bp", '1'),
        ("fv", '2'),
        ("cks", '3'),
        ("gj", '4'),
        ("qxz", '5'),
        ("dt", '6'),
        ("l", '7'),
        ("mn", '8'),
        ("r", '9'),
    ];

    let lowered = c.to_ascii_lowercase();
    GROUPS
        .iter()
        .find(|(letters, _)| letters.contains(lowered))
        .map_or('0', |&(_, digit)| digit)
}
|
||||
65
extensions/fuzzy/src/soundex.rs
Normal file
65
extensions/fuzzy/src/soundex.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
/// Computes and returns the soundex representation of a given string.
|
||||
/// https://en.wikipedia.org/wiki/Soundex
|
||||
pub fn soundex(input: Option<&str>) -> Option<String> {
|
||||
if let Some(input_str) = input {
|
||||
if input_str.is_empty() {
|
||||
return Some("".to_string());
|
||||
}
|
||||
|
||||
let str_bytes = input_str.as_bytes();
|
||||
let str_len = str_bytes.len();
|
||||
|
||||
let mut code = String::with_capacity(4);
|
||||
code.push(str_bytes[0].to_ascii_uppercase() as char);
|
||||
|
||||
let mut buf: Vec<char> = Vec::with_capacity(str_len);
|
||||
for &byte in str_bytes {
|
||||
buf.push(soundex_encode(byte as char));
|
||||
}
|
||||
|
||||
let mut d = 1; // digit counter
|
||||
let mut i = 1; // index counter
|
||||
|
||||
while i < str_len && d < 4 {
|
||||
let current = buf[i];
|
||||
let previous = buf[i - 1];
|
||||
|
||||
if current != previous && current != '0' {
|
||||
if i > 1 {
|
||||
let two_back = buf[i - 2];
|
||||
let separator = str_bytes[i - 1].to_ascii_lowercase() as char;
|
||||
if current == two_back && (separator == 'h' || separator == 'w') {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
code.push(current);
|
||||
d += 1;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
|
||||
while d < 4 {
|
||||
code.push('0');
|
||||
d += 1;
|
||||
}
|
||||
|
||||
Some(code)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps a character to its Soundex digit, ignoring ASCII case.
///
/// Anything that is not one of the listed consonants (vowels, `h`, `w`, `y`,
/// digits, punctuation, non-ASCII characters, ...) maps to `'0'`.
fn soundex_encode(c: char) -> char {
    // Letter groups paired with the digit they share.
    const GROUPS: [(&str, char); 6] = [
        ("bfpv", '1'),
        ("cgjkqsxz", '2'),
        ("dt", '3'),
        ("l", '4'),
        ("mn", '5'),
        ("r", '6'),
    ];

    let lowered = c.to_ascii_lowercase();
    GROUPS
        .iter()
        .find(|(letters, _)| letters.contains(lowered))
        .map_or('0', |&(_, digit)| digit)
}
|
||||
577
extensions/fuzzy/src/translit.rs
Normal file
577
extensions/fuzzy/src/translit.rs
Normal file
@@ -0,0 +1,577 @@
|
||||
use crate::common::*;
|
||||
|
||||
/// 64-entry lookup table for UTF-8 lead bytes.
///
/// Layout: 32 entries counting 0x00..=0x1f, 16 entries counting 0x00..=0x0f,
/// 8 entries counting 0x00..=0x07, 4 entries counting 0x00..=0x03, then
/// 0x00, 0x01 and two zeros.
///
/// NOTE(review): this looks like the payload-bit table for lead bytes
/// 0xC0..=0xFF (index = lead byte - 0xC0) — confirm against the decoder that
/// reads it.
static TRANSLIT_UTF8_LOOKUP: [u8; 64] = [
    // 0..32
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    // 32..48
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    // 48..56
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    // 56..64
    0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
];
|
||||
|
||||
/// One row of the transliteration table: a source code point and the ASCII
/// bytes that replace it.
///
/// In the table rows, unused trailing replacement slots are `0x00`.
#[derive(Copy, Clone, Debug)]
struct Transliteration {
    /// Code point being replaced.
    c_from: u16,
    /// First replacement byte.
    c_to0: u8,
    /// Second replacement byte.
    c_to1: u8,
    /// Third replacement byte.
    c_to2: u8,
    /// Fourth replacement byte.
    c_to3: u8,
}

impl Transliteration {
    /// Builds a table row; `const` so it can be used in `static` initialisers.
    const fn new(c_from: u16, c_to0: u8, c_to1: u8, c_to2: u8, c_to3: u8) -> Self {
        Self { c_from, c_to0, c_to1, c_to2, c_to3 }
    }
}
|
||||
|
||||
static TRANSLIT: [Transliteration; 389] = [
|
||||
Transliteration::new(0x00A0, b' ', 0x00, 0x00, 0x00), /* to */
|
||||
Transliteration::new(0x00B5, b'u', 0x00, 0x00, 0x00), /* µ to u */
|
||||
Transliteration::new(0x00C0, b'A', 0x00, 0x00, 0x00), /* À to A */
|
||||
Transliteration::new(0x00C1, b'A', 0x00, 0x00, 0x00), /* Á to A */
|
||||
Transliteration::new(0x00C2, b'A', 0x00, 0x00, 0x00), /* Â to A */
|
||||
Transliteration::new(0x00C3, b'A', 0x00, 0x00, 0x00), /* Ã to A */
|
||||
Transliteration::new(0x00C4, b'A', b'e', 0x00, 0x00), /* Ä to Ae */
|
||||
Transliteration::new(0x00C5, b'A', b'a', 0x00, 0x00), /* Å to Aa */
|
||||
Transliteration::new(0x00C6, b'A', b'E', 0x00, 0x00), /* Æ to AE */
|
||||
Transliteration::new(0x00C7, b'C', 0x00, 0x00, 0x00), /* Ç to C */
|
||||
Transliteration::new(0x00C8, b'E', 0x00, 0x00, 0x00), /* È to E */
|
||||
Transliteration::new(0x00C9, b'E', 0x00, 0x00, 0x00), /* É to E */
|
||||
Transliteration::new(0x00CA, b'E', 0x00, 0x00, 0x00), /* Ê to E */
|
||||
Transliteration::new(0x00CB, b'E', 0x00, 0x00, 0x00), /* Ë to E */
|
||||
Transliteration::new(0x00CC, b'I', 0x00, 0x00, 0x00), /* Ì to I */
|
||||
Transliteration::new(0x00CD, b'I', 0x00, 0x00, 0x00), /* Í to I */
|
||||
Transliteration::new(0x00CE, b'I', 0x00, 0x00, 0x00), /* Î to I */
|
||||
Transliteration::new(0x00CF, b'I', 0x00, 0x00, 0x00), /* Ï to I */
|
||||
Transliteration::new(0x00D0, b'D', 0x00, 0x00, 0x00), /* Ð to D */
|
||||
Transliteration::new(0x00D1, b'N', 0x00, 0x00, 0x00), /* Ñ to N */
|
||||
Transliteration::new(0x00D2, b'O', 0x00, 0x00, 0x00), /* Ò to O */
|
||||
Transliteration::new(0x00D3, b'O', 0x00, 0x00, 0x00), /* Ó to O */
|
||||
Transliteration::new(0x00D4, b'O', 0x00, 0x00, 0x00), /* Ô to O */
|
||||
Transliteration::new(0x00D5, b'O', 0x00, 0x00, 0x00), /* Õ to O */
|
||||
Transliteration::new(0x00D6, b'O', b'e', 0x00, 0x00), /* Ö to Oe */
|
||||
Transliteration::new(0x00D7, b'x', 0x00, 0x00, 0x00), /* × to x */
|
||||
Transliteration::new(0x00D8, b'O', 0x00, 0x00, 0x00), /* Ø to O */
|
||||
Transliteration::new(0x00D9, b'U', 0x00, 0x00, 0x00), /* Ù to U */
|
||||
Transliteration::new(0x00DA, b'U', 0x00, 0x00, 0x00), /* Ú to U */
|
||||
Transliteration::new(0x00DB, b'U', 0x00, 0x00, 0x00), /* Û to U */
|
||||
Transliteration::new(0x00DC, b'U', b'e', 0x00, 0x00), /* Ü to Ue */
|
||||
Transliteration::new(0x00DD, b'Y', 0x00, 0x00, 0x00), /* Ý to Y */
|
||||
Transliteration::new(0x00DE, b'T', b'h', 0x00, 0x00), /* Þ to Th */
|
||||
Transliteration::new(0x00DF, b's', b's', 0x00, 0x00), /* ß to ss */
|
||||
Transliteration::new(0x00E0, b'a', 0x00, 0x00, 0x00), /* à to a */
|
||||
Transliteration::new(0x00E1, b'a', 0x00, 0x00, 0x00), /* á to a */
|
||||
Transliteration::new(0x00E2, b'a', 0x00, 0x00, 0x00), /* â to a */
|
||||
Transliteration::new(0x00E3, b'a', 0x00, 0x00, 0x00), /* ã to a */
|
||||
Transliteration::new(0x00E4, b'a', b'e', 0x00, 0x00), /* ä to ae */
|
||||
Transliteration::new(0x00E5, b'a', b'a', 0x00, 0x00), /* å to aa */
|
||||
Transliteration::new(0x00E6, b'a', b'e', 0x00, 0x00), /* æ to ae */
|
||||
Transliteration::new(0x00E7, b'c', 0x00, 0x00, 0x00), /* ç to c */
|
||||
Transliteration::new(0x00E8, b'e', 0x00, 0x00, 0x00), /* è to e */
|
||||
Transliteration::new(0x00E9, b'e', 0x00, 0x00, 0x00), /* é to e */
|
||||
Transliteration::new(0x00EA, b'e', 0x00, 0x00, 0x00), /* ê to e */
|
||||
Transliteration::new(0x00EB, b'e', 0x00, 0x00, 0x00), /* ë to e */
|
||||
Transliteration::new(0x00EC, b'i', 0x00, 0x00, 0x00), /* ì to i */
|
||||
Transliteration::new(0x00ED, b'i', 0x00, 0x00, 0x00), /* í to i */
|
||||
Transliteration::new(0x00EE, b'i', 0x00, 0x00, 0x00), /* î to i */
|
||||
Transliteration::new(0x00EF, b'i', 0x00, 0x00, 0x00), /* ï to i */
|
||||
Transliteration::new(0x00F0, b'd', 0x00, 0x00, 0x00), /* ð to d */
|
||||
Transliteration::new(0x00F1, b'n', 0x00, 0x00, 0x00), /* ñ to n */
|
||||
Transliteration::new(0x00F2, b'o', 0x00, 0x00, 0x00), /* ò to o */
|
||||
Transliteration::new(0x00F3, b'o', 0x00, 0x00, 0x00), /* ó to o */
|
||||
Transliteration::new(0x00F4, b'o', 0x00, 0x00, 0x00), /* ô to o */
|
||||
Transliteration::new(0x00F5, b'o', 0x00, 0x00, 0x00), /* õ to o */
|
||||
Transliteration::new(0x00F6, b'o', b'e', 0x00, 0x00), /* ö to oe */
|
||||
Transliteration::new(0x00F7, b':', 0x00, 0x00, 0x00), /* ÷ to : */
|
||||
Transliteration::new(0x00F8, b'o', 0x00, 0x00, 0x00), /* ø to o */
|
||||
Transliteration::new(0x00F9, b'u', 0x00, 0x00, 0x00), /* ù to u */
|
||||
Transliteration::new(0x00FA, b'u', 0x00, 0x00, 0x00), /* ú to u */
|
||||
Transliteration::new(0x00FB, b'u', 0x00, 0x00, 0x00), /* û to u */
|
||||
Transliteration::new(0x00FC, b'u', b'e', 0x00, 0x00), /* ü to ue */
|
||||
Transliteration::new(0x00FD, b'y', 0x00, 0x00, 0x00), /* ý to y */
|
||||
Transliteration::new(0x00FE, b't', b'h', 0x00, 0x00), /* þ to th */
|
||||
Transliteration::new(0x00FF, b'y', 0x00, 0x00, 0x00), /* ÿ to y */
|
||||
Transliteration::new(0x0100, b'A', 0x00, 0x00, 0x00), /* Ā to A */
|
||||
Transliteration::new(0x0101, b'a', 0x00, 0x00, 0x00), /* ā to a */
|
||||
Transliteration::new(0x0102, b'A', 0x00, 0x00, 0x00), /* Ă to A */
|
||||
Transliteration::new(0x0103, b'a', 0x00, 0x00, 0x00), /* ă to a */
|
||||
Transliteration::new(0x0104, b'A', 0x00, 0x00, 0x00), /* Ą to A */
|
||||
Transliteration::new(0x0105, b'a', 0x00, 0x00, 0x00), /* ą to a */
|
||||
Transliteration::new(0x0106, b'C', 0x00, 0x00, 0x00), /* Ć to C */
|
||||
Transliteration::new(0x0107, b'c', 0x00, 0x00, 0x00), /* ć to c */
|
||||
Transliteration::new(0x0108, b'C', b'h', 0x00, 0x00), /* Ĉ to Ch */
|
||||
Transliteration::new(0x0109, b'c', b'h', 0x00, 0x00), /* ĉ to ch */
|
||||
Transliteration::new(0x010A, b'C', 0x00, 0x00, 0x00), /* Ċ to C */
|
||||
Transliteration::new(0x010B, b'c', 0x00, 0x00, 0x00), /* ċ to c */
|
||||
Transliteration::new(0x010C, b'C', 0x00, 0x00, 0x00), /* Č to C */
|
||||
Transliteration::new(0x010D, b'c', 0x00, 0x00, 0x00), /* č to c */
|
||||
Transliteration::new(0x010E, b'D', 0x00, 0x00, 0x00), /* Ď to D */
|
||||
Transliteration::new(0x010F, b'd', 0x00, 0x00, 0x00), /* ď to d */
|
||||
Transliteration::new(0x0110, b'D', 0x00, 0x00, 0x00), /* Đ to D */
|
||||
Transliteration::new(0x0111, b'd', 0x00, 0x00, 0x00), /* đ to d */
|
||||
Transliteration::new(0x0112, b'E', 0x00, 0x00, 0x00), /* Ē to E */
|
||||
Transliteration::new(0x0113, b'e', 0x00, 0x00, 0x00), /* ē to e */
|
||||
Transliteration::new(0x0114, b'E', 0x00, 0x00, 0x00), /* Ĕ to E */
|
||||
Transliteration::new(0x0115, b'e', 0x00, 0x00, 0x00), /* ĕ to e */
|
||||
Transliteration::new(0x0116, b'E', 0x00, 0x00, 0x00), /* Ė to E */
|
||||
Transliteration::new(0x0117, b'e', 0x00, 0x00, 0x00), /* ė to e */
|
||||
Transliteration::new(0x0118, b'E', 0x00, 0x00, 0x00), /* Ę to E */
|
||||
Transliteration::new(0x0119, b'e', 0x00, 0x00, 0x00), /* ę to e */
|
||||
Transliteration::new(0x011A, b'E', 0x00, 0x00, 0x00), /* Ě to E */
|
||||
Transliteration::new(0x011B, b'e', 0x00, 0x00, 0x00), /* ě to e */
|
||||
Transliteration::new(0x011C, b'G', b'h', 0x00, 0x00), /* Ĝ to Gh */
|
||||
Transliteration::new(0x011D, b'g', b'h', 0x00, 0x00), /* ĝ to gh */
|
||||
Transliteration::new(0x011E, b'G', 0x00, 0x00, 0x00), /* Ğ to G */
|
||||
Transliteration::new(0x011F, b'g', 0x00, 0x00, 0x00), /* ğ to g */
|
||||
Transliteration::new(0x0120, b'G', 0x00, 0x00, 0x00), /* Ġ to G */
|
||||
Transliteration::new(0x0121, b'g', 0x00, 0x00, 0x00), /* ġ to g */
|
||||
Transliteration::new(0x0122, b'G', 0x00, 0x00, 0x00), /* Ģ to G */
|
||||
Transliteration::new(0x0123, b'g', 0x00, 0x00, 0x00), /* ģ to g */
|
||||
Transliteration::new(0x0124, b'H', b'h', 0x00, 0x00), /* Ĥ to Hh */
|
||||
Transliteration::new(0x0125, b'h', b'h', 0x00, 0x00), /* ĥ to hh */
|
||||
Transliteration::new(0x0126, b'H', 0x00, 0x00, 0x00), /* Ħ to H */
|
||||
Transliteration::new(0x0127, b'h', 0x00, 0x00, 0x00), /* ħ to h */
|
||||
Transliteration::new(0x0128, b'I', 0x00, 0x00, 0x00), /* Ĩ to I */
|
||||
Transliteration::new(0x0129, b'i', 0x00, 0x00, 0x00), /* ĩ to i */
|
||||
Transliteration::new(0x012A, b'I', 0x00, 0x00, 0x00), /* Ī to I */
|
||||
Transliteration::new(0x012B, b'i', 0x00, 0x00, 0x00), /* ī to i */
|
||||
Transliteration::new(0x012C, b'I', 0x00, 0x00, 0x00), /* Ĭ to I */
|
||||
Transliteration::new(0x012D, b'i', 0x00, 0x00, 0x00), /* ĭ to i */
|
||||
Transliteration::new(0x012E, b'I', 0x00, 0x00, 0x00), /* Į to I */
|
||||
Transliteration::new(0x012F, b'i', 0x00, 0x00, 0x00), /* į to i */
|
||||
Transliteration::new(0x0130, b'I', 0x00, 0x00, 0x00), /* İ to I */
|
||||
Transliteration::new(0x0131, b'i', 0x00, 0x00, 0x00), /* ı to i */
|
||||
Transliteration::new(0x0132, b'I', b'J', 0x00, 0x00), /* IJ to IJ */
|
||||
Transliteration::new(0x0133, b'i', b'j', 0x00, 0x00), /* ij to ij */
|
||||
Transliteration::new(0x0134, b'J', b'h', 0x00, 0x00), /* Ĵ to Jh */
|
||||
Transliteration::new(0x0135, b'j', b'h', 0x00, 0x00), /* ĵ to jh */
|
||||
Transliteration::new(0x0136, b'K', 0x00, 0x00, 0x00), /* Ķ to K */
|
||||
Transliteration::new(0x0137, b'k', 0x00, 0x00, 0x00), /* ķ to k */
|
||||
Transliteration::new(0x0138, b'k', 0x00, 0x00, 0x00), /* ĸ to k */
|
||||
Transliteration::new(0x0139, b'L', 0x00, 0x00, 0x00), /* Ĺ to L */
|
||||
Transliteration::new(0x013A, b'l', 0x00, 0x00, 0x00), /* ĺ to l */
|
||||
Transliteration::new(0x013B, b'L', 0x00, 0x00, 0x00), /* Ļ to L */
|
||||
Transliteration::new(0x013C, b'l', 0x00, 0x00, 0x00), /* ļ to l */
|
||||
Transliteration::new(0x013D, b'L', 0x00, 0x00, 0x00), /* Ľ to L */
|
||||
Transliteration::new(0x013E, b'l', 0x00, 0x00, 0x00), /* ľ to l */
|
||||
Transliteration::new(0x013F, b'L', b'.', 0x00, 0x00), /* Ŀ to L. */
|
||||
Transliteration::new(0x0140, b'l', b'.', 0x00, 0x00), /* ŀ to l. */
|
||||
Transliteration::new(0x0141, b'L', 0x00, 0x00, 0x00), /* Ł to L */
|
||||
Transliteration::new(0x0142, b'l', 0x00, 0x00, 0x00), /* ł to l */
|
||||
Transliteration::new(0x0143, b'N', 0x00, 0x00, 0x00), /* Ń to N */
|
||||
Transliteration::new(0x0144, b'n', 0x00, 0x00, 0x00), /* ń to n */
|
||||
Transliteration::new(0x0145, b'N', 0x00, 0x00, 0x00), /* Ņ to N */
|
||||
Transliteration::new(0x0146, b'n', 0x00, 0x00, 0x00), /* ņ to n */
|
||||
Transliteration::new(0x0147, b'N', 0x00, 0x00, 0x00), /* Ň to N */
|
||||
Transliteration::new(0x0148, b'n', 0x00, 0x00, 0x00), /* ň to n */
|
||||
Transliteration::new(0x0149, b'\'', b'n', 0x00, 0x00), /* ʼn to 'n */
|
||||
Transliteration::new(0x014A, b'N', b'G', 0x00, 0x00), /* Ŋ to NG */
|
||||
Transliteration::new(0x014B, b'n', b'g', 0x00, 0x00), /* ŋ to ng */
|
||||
Transliteration::new(0x014C, b'O', 0x00, 0x00, 0x00), /* Ō to O */
|
||||
Transliteration::new(0x014D, b'o', 0x00, 0x00, 0x00), /* ō to o */
|
||||
Transliteration::new(0x014E, b'O', 0x00, 0x00, 0x00), /* Ŏ to O */
|
||||
Transliteration::new(0x014F, b'o', 0x00, 0x00, 0x00), /* ŏ to o */
|
||||
Transliteration::new(0x0150, b'O', 0x00, 0x00, 0x00), /* Ő to O */
|
||||
Transliteration::new(0x0151, b'o', 0x00, 0x00, 0x00), /* ő to o */
|
||||
Transliteration::new(0x0152, b'O', b'E', 0x00, 0x00), /* Œ to OE */
|
||||
Transliteration::new(0x0153, b'o', b'e', 0x00, 0x00), /* œ to oe */
|
||||
Transliteration::new(0x0154, b'R', 0x00, 0x00, 0x00), /* Ŕ to R */
|
||||
Transliteration::new(0x0155, b'r', 0x00, 0x00, 0x00), /* ŕ to r */
|
||||
Transliteration::new(0x0156, b'R', 0x00, 0x00, 0x00), /* Ŗ to R */
|
||||
Transliteration::new(0x0157, b'r', 0x00, 0x00, 0x00), /* ŗ to r */
|
||||
Transliteration::new(0x0158, b'R', 0x00, 0x00, 0x00), /* Ř to R */
|
||||
Transliteration::new(0x0159, b'r', 0x00, 0x00, 0x00), /* ř to r */
|
||||
Transliteration::new(0x015A, b'S', 0x00, 0x00, 0x00), /* Ś to S */
|
||||
Transliteration::new(0x015B, b's', 0x00, 0x00, 0x00), /* ś to s */
|
||||
Transliteration::new(0x015C, b'S', b'h', 0x00, 0x00), /* Ŝ to Sh */
|
||||
Transliteration::new(0x015D, b's', b'h', 0x00, 0x00), /* ŝ to sh */
|
||||
Transliteration::new(0x015E, b'S', 0x00, 0x00, 0x00), /* Ş to S */
|
||||
Transliteration::new(0x015F, b's', 0x00, 0x00, 0x00), /* ş to s */
|
||||
Transliteration::new(0x0160, b'S', 0x00, 0x00, 0x00), /* Š to S */
|
||||
Transliteration::new(0x0161, b's', 0x00, 0x00, 0x00), /* š to s */
|
||||
Transliteration::new(0x0162, b'T', 0x00, 0x00, 0x00), /* Ţ to T */
|
||||
Transliteration::new(0x0163, b't', 0x00, 0x00, 0x00), /* ţ to t */
|
||||
Transliteration::new(0x0164, b'T', 0x00, 0x00, 0x00), /* Ť to T */
|
||||
Transliteration::new(0x0165, b't', 0x00, 0x00, 0x00), /* ť to t */
|
||||
Transliteration::new(0x0166, b'T', 0x00, 0x00, 0x00), /* Ŧ to T */
|
||||
Transliteration::new(0x0167, b't', 0x00, 0x00, 0x00), /* ŧ to t */
|
||||
Transliteration::new(0x0168, b'U', 0x00, 0x00, 0x00), /* Ũ to U */
|
||||
Transliteration::new(0x0169, b'u', 0x00, 0x00, 0x00), /* ũ to u */
|
||||
Transliteration::new(0x016A, b'U', 0x00, 0x00, 0x00), /* Ū to U */
|
||||
Transliteration::new(0x016B, b'u', 0x00, 0x00, 0x00), /* ū to u */
|
||||
Transliteration::new(0x016C, b'U', 0x00, 0x00, 0x00), /* Ŭ to U */
|
||||
Transliteration::new(0x016D, b'u', 0x00, 0x00, 0x00), /* ŭ to u */
|
||||
Transliteration::new(0x016E, b'U', 0x00, 0x00, 0x00), /* Ů to U */
|
||||
Transliteration::new(0x016F, b'u', 0x00, 0x00, 0x00), /* ů to u */
|
||||
Transliteration::new(0x0170, b'U', 0x00, 0x00, 0x00), /* Ű to U */
|
||||
Transliteration::new(0x0171, b'u', 0x00, 0x00, 0x00), /* ű to u */
|
||||
Transliteration::new(0x0172, b'U', 0x00, 0x00, 0x00), /* Ų to U */
|
||||
Transliteration::new(0x0173, b'u', 0x00, 0x00, 0x00), /* ų to u */
|
||||
Transliteration::new(0x0174, b'W', 0x00, 0x00, 0x00), /* Ŵ to W */
|
||||
Transliteration::new(0x0175, b'w', 0x00, 0x00, 0x00), /* ŵ to w */
|
||||
Transliteration::new(0x0176, b'Y', 0x00, 0x00, 0x00), /* Ŷ to Y */
|
||||
Transliteration::new(0x0177, b'y', 0x00, 0x00, 0x00), /* ŷ to y */
|
||||
Transliteration::new(0x0178, b'Y', 0x00, 0x00, 0x00), /* Ÿ to Y */
|
||||
Transliteration::new(0x0179, b'Z', 0x00, 0x00, 0x00), /* Ź to Z */
|
||||
Transliteration::new(0x017A, b'z', 0x00, 0x00, 0x00), /* ź to z */
|
||||
Transliteration::new(0x017B, b'Z', 0x00, 0x00, 0x00), /* Ż to Z */
|
||||
Transliteration::new(0x017C, b'z', 0x00, 0x00, 0x00), /* ż to z */
|
||||
Transliteration::new(0x017D, b'Z', 0x00, 0x00, 0x00), /* Ž to Z */
|
||||
Transliteration::new(0x017E, b'z', 0x00, 0x00, 0x00), /* ž to z */
|
||||
Transliteration::new(0x017F, b's', 0x00, 0x00, 0x00), /* ſ to s */
|
||||
Transliteration::new(0x0192, b'f', 0x00, 0x00, 0x00), /* ƒ to f */
|
||||
Transliteration::new(0x0218, b'S', 0x00, 0x00, 0x00), /* Ș to S */
|
||||
Transliteration::new(0x0219, b's', 0x00, 0x00, 0x00), /* ș to s */
|
||||
Transliteration::new(0x021A, b'T', 0x00, 0x00, 0x00), /* Ț to T */
|
||||
Transliteration::new(0x021B, b't', 0x00, 0x00, 0x00), /* ț to t */
|
||||
Transliteration::new(0x0386, b'A', 0x00, 0x00, 0x00), /* Ά to A */
|
||||
Transliteration::new(0x0388, b'E', 0x00, 0x00, 0x00), /* Έ to E */
|
||||
Transliteration::new(0x0389, b'I', 0x00, 0x00, 0x00), /* Ή to I */
|
||||
Transliteration::new(0x038A, b'I', 0x00, 0x00, 0x00), /* Ί to I */
|
||||
Transliteration::new(0x038C, b'O', 0x00, 0x00, 0x00), /* Ό to O */
|
||||
Transliteration::new(0x038E, b'Y', 0x00, 0x00, 0x00), /* Ύ to Y */
|
||||
Transliteration::new(0x038F, b'O', 0x00, 0x00, 0x00), /* Ώ to O */
|
||||
Transliteration::new(0x0390, b'i', 0x00, 0x00, 0x00), /* ΐ to i */
|
||||
Transliteration::new(0x0391, b'A', 0x00, 0x00, 0x00), /* Α to A */
|
||||
Transliteration::new(0x0392, b'B', 0x00, 0x00, 0x00), /* Β to B */
|
||||
Transliteration::new(0x0393, b'G', 0x00, 0x00, 0x00), /* Γ to G */
|
||||
Transliteration::new(0x0394, b'D', 0x00, 0x00, 0x00), /* Δ to D */
|
||||
Transliteration::new(0x0395, b'E', 0x00, 0x00, 0x00), /* Ε to E */
|
||||
Transliteration::new(0x0396, b'Z', 0x00, 0x00, 0x00), /* Ζ to Z */
|
||||
Transliteration::new(0x0397, b'I', 0x00, 0x00, 0x00), /* Η to I */
|
||||
Transliteration::new(0x0398, b'T', b'h', 0x00, 0x00), /* Θ to Th */
|
||||
Transliteration::new(0x0399, b'I', 0x00, 0x00, 0x00), /* Ι to I */
|
||||
Transliteration::new(0x039A, b'K', 0x00, 0x00, 0x00), /* Κ to K */
|
||||
Transliteration::new(0x039B, b'L', 0x00, 0x00, 0x00), /* Λ to L */
|
||||
Transliteration::new(0x039C, b'M', 0x00, 0x00, 0x00), /* Μ to M */
|
||||
Transliteration::new(0x039D, b'N', 0x00, 0x00, 0x00), /* Ν to N */
|
||||
Transliteration::new(0x039E, b'X', 0x00, 0x00, 0x00), /* Ξ to X */
|
||||
Transliteration::new(0x039F, b'O', 0x00, 0x00, 0x00), /* Ο to O */
|
||||
Transliteration::new(0x03A0, b'P', 0x00, 0x00, 0x00), /* Π to P */
|
||||
Transliteration::new(0x03A1, b'R', 0x00, 0x00, 0x00), /* Ρ to R */
|
||||
Transliteration::new(0x03A3, b'S', 0x00, 0x00, 0x00), /* Σ to S */
|
||||
Transliteration::new(0x03A4, b'T', 0x00, 0x00, 0x00), /* Τ to T */
|
||||
Transliteration::new(0x03A5, b'Y', 0x00, 0x00, 0x00), /* Υ to Y */
|
||||
Transliteration::new(0x03A6, b'F', 0x00, 0x00, 0x00), /* Φ to F */
|
||||
Transliteration::new(0x03A7, b'C', b'h', 0x00, 0x00), /* Χ to Ch */
|
||||
Transliteration::new(0x03A8, b'P', b's', 0x00, 0x00), /* Ψ to Ps */
|
||||
Transliteration::new(0x03A9, b'O', 0x00, 0x00, 0x00), /* Ω to O */
|
||||
Transliteration::new(0x03AA, b'I', 0x00, 0x00, 0x00), /* Ϊ to I */
|
||||
Transliteration::new(0x03AB, b'Y', 0x00, 0x00, 0x00), /* Ϋ to Y */
|
||||
Transliteration::new(0x03AC, b'a', 0x00, 0x00, 0x00), /* ά to a */
|
||||
Transliteration::new(0x03AD, b'e', 0x00, 0x00, 0x00), /* έ to e */
|
||||
Transliteration::new(0x03AE, b'i', 0x00, 0x00, 0x00), /* ή to i */
|
||||
Transliteration::new(0x03AF, b'i', 0x00, 0x00, 0x00), /* ί to i */
|
||||
Transliteration::new(0x03B1, b'a', 0x00, 0x00, 0x00), /* α to a */
|
||||
Transliteration::new(0x03B2, b'b', 0x00, 0x00, 0x00), /* β to b */
|
||||
Transliteration::new(0x03B3, b'g', 0x00, 0x00, 0x00), /* γ to g */
|
||||
Transliteration::new(0x03B4, b'd', 0x00, 0x00, 0x00), /* δ to d */
|
||||
Transliteration::new(0x03B5, b'e', 0x00, 0x00, 0x00), /* ε to e */
|
||||
Transliteration::new(0x03B6, b'z', 0x00, 0x00, 0x00), /* ζ to z */
|
||||
Transliteration::new(0x03B7, b'i', 0x00, 0x00, 0x00), /* η to i */
|
||||
Transliteration::new(0x03B8, b't', b'h', 0x00, 0x00), /* θ to th */
|
||||
Transliteration::new(0x03B9, b'i', 0x00, 0x00, 0x00), /* ι to i */
|
||||
Transliteration::new(0x03BA, b'k', 0x00, 0x00, 0x00), /* κ to k */
|
||||
Transliteration::new(0x03BB, b'l', 0x00, 0x00, 0x00), /* λ to l */
|
||||
Transliteration::new(0x03BC, b'm', 0x00, 0x00, 0x00), /* μ to m */
|
||||
Transliteration::new(0x03BD, b'n', 0x00, 0x00, 0x00), /* ν to n */
|
||||
Transliteration::new(0x03BE, b'x', 0x00, 0x00, 0x00), /* ξ to x */
|
||||
Transliteration::new(0x03BF, b'o', 0x00, 0x00, 0x00), /* ο to o */
|
||||
Transliteration::new(0x03C0, b'p', 0x00, 0x00, 0x00), /* π to p */
|
||||
Transliteration::new(0x03C1, b'r', 0x00, 0x00, 0x00), /* ρ to r */
|
||||
Transliteration::new(0x03C3, b's', 0x00, 0x00, 0x00), /* σ to s */
|
||||
Transliteration::new(0x03C4, b't', 0x00, 0x00, 0x00), /* τ to t */
|
||||
Transliteration::new(0x03C5, b'y', 0x00, 0x00, 0x00), /* υ to y */
|
||||
Transliteration::new(0x03C6, b'f', 0x00, 0x00, 0x00), /* φ to f */
|
||||
Transliteration::new(0x03C7, b'c', b'h', 0x00, 0x00), /* χ to ch */
|
||||
Transliteration::new(0x03C8, b'p', b's', 0x00, 0x00), /* ψ to ps */
|
||||
Transliteration::new(0x03C9, b'o', 0x00, 0x00, 0x00), /* ω to o */
|
||||
Transliteration::new(0x03CA, b'i', 0x00, 0x00, 0x00), /* ϊ to i */
|
||||
Transliteration::new(0x03CB, b'y', 0x00, 0x00, 0x00), /* ϋ to y */
|
||||
Transliteration::new(0x03CC, b'o', 0x00, 0x00, 0x00), /* ό to o */
|
||||
Transliteration::new(0x03CD, b'y', 0x00, 0x00, 0x00), /* ύ to y */
|
||||
Transliteration::new(0x03CE, b'i', 0x00, 0x00, 0x00), /* ώ to i */
|
||||
Transliteration::new(0x0400, b'E', 0x00, 0x00, 0x00), /* Ѐ to E */
|
||||
Transliteration::new(0x0401, b'E', 0x00, 0x00, 0x00), /* Ё to E */
|
||||
Transliteration::new(0x0402, b'D', 0x00, 0x00, 0x00), /* Ђ to D */
|
||||
Transliteration::new(0x0403, b'G', 0x00, 0x00, 0x00), /* Ѓ to G */
|
||||
Transliteration::new(0x0404, b'E', 0x00, 0x00, 0x00), /* Є to E */
|
||||
Transliteration::new(0x0405, b'Z', 0x00, 0x00, 0x00), /* Ѕ to Z */
|
||||
Transliteration::new(0x0406, b'I', 0x00, 0x00, 0x00), /* І to I */
|
||||
Transliteration::new(0x0407, b'I', 0x00, 0x00, 0x00), /* Ї to I */
|
||||
Transliteration::new(0x0408, b'J', 0x00, 0x00, 0x00), /* Ј to J */
|
||||
Transliteration::new(0x0409, b'I', 0x00, 0x00, 0x00), /* Љ to I */
|
||||
Transliteration::new(0x040A, b'N', 0x00, 0x00, 0x00), /* Њ to N */
|
||||
Transliteration::new(0x040B, b'D', 0x00, 0x00, 0x00), /* Ћ to D */
|
||||
Transliteration::new(0x040C, b'K', 0x00, 0x00, 0x00), /* Ќ to K */
|
||||
Transliteration::new(0x040D, b'I', 0x00, 0x00, 0x00), /* Ѝ to I */
|
||||
Transliteration::new(0x040E, b'U', 0x00, 0x00, 0x00), /* Ў to U */
|
||||
Transliteration::new(0x040F, b'D', 0x00, 0x00, 0x00), /* Џ to D */
|
||||
Transliteration::new(0x0410, b'A', 0x00, 0x00, 0x00), /* А to A */
|
||||
Transliteration::new(0x0411, b'B', 0x00, 0x00, 0x00), /* Б to B */
|
||||
Transliteration::new(0x0412, b'V', 0x00, 0x00, 0x00), /* В to V */
|
||||
Transliteration::new(0x0413, b'G', 0x00, 0x00, 0x00), /* Г to G */
|
||||
Transliteration::new(0x0414, b'D', 0x00, 0x00, 0x00), /* Д to D */
|
||||
Transliteration::new(0x0415, b'E', 0x00, 0x00, 0x00), /* Е to E */
|
||||
Transliteration::new(0x0416, b'Z', b'h', 0x00, 0x00), /* Ж to Zh */
|
||||
Transliteration::new(0x0417, b'Z', 0x00, 0x00, 0x00), /* З to Z */
|
||||
Transliteration::new(0x0418, b'I', 0x00, 0x00, 0x00), /* И to I */
|
||||
Transliteration::new(0x0419, b'I', 0x00, 0x00, 0x00), /* Й to I */
|
||||
Transliteration::new(0x041A, b'K', 0x00, 0x00, 0x00), /* К to K */
|
||||
Transliteration::new(0x041B, b'L', 0x00, 0x00, 0x00), /* Л to L */
|
||||
Transliteration::new(0x041C, b'M', 0x00, 0x00, 0x00), /* М to M */
|
||||
Transliteration::new(0x041D, b'N', 0x00, 0x00, 0x00), /* Н to N */
|
||||
Transliteration::new(0x041E, b'O', 0x00, 0x00, 0x00), /* О to O */
|
||||
Transliteration::new(0x041F, b'P', 0x00, 0x00, 0x00), /* П to P */
|
||||
Transliteration::new(0x0420, b'R', 0x00, 0x00, 0x00), /* Р to R */
|
||||
Transliteration::new(0x0421, b'S', 0x00, 0x00, 0x00), /* С to S */
|
||||
Transliteration::new(0x0422, b'T', 0x00, 0x00, 0x00), /* Т to T */
|
||||
Transliteration::new(0x0423, b'U', 0x00, 0x00, 0x00), /* У to U */
|
||||
Transliteration::new(0x0424, b'F', 0x00, 0x00, 0x00), /* Ф to F */
|
||||
Transliteration::new(0x0425, b'K', b'h', 0x00, 0x00), /* Х to Kh */
|
||||
Transliteration::new(0x0426, b'T', b'c', 0x00, 0x00), /* Ц to Tc */
|
||||
Transliteration::new(0x0427, b'C', b'h', 0x00, 0x00), /* Ч to Ch */
|
||||
Transliteration::new(0x0428, b'S', b'h', 0x00, 0x00), /* Ш to Sh */
|
||||
Transliteration::new(0x0429, b'S', b'h', b'c', b'h'), /* Щ to Shch */
|
||||
Transliteration::new(0x042A, b'a', 0x00, 0x00, 0x00), /* to A */
|
||||
Transliteration::new(0x042B, b'Y', 0x00, 0x00, 0x00), /* Ы to Y */
|
||||
Transliteration::new(0x042C, b'Y', 0x00, 0x00, 0x00), /* to Y */
|
||||
Transliteration::new(0x042D, b'E', 0x00, 0x00, 0x00), /* Э to E */
|
||||
Transliteration::new(0x042E, b'I', b'u', 0x00, 0x00), /* Ю to Iu */
|
||||
Transliteration::new(0x042F, b'I', b'a', 0x00, 0x00), /* Я to Ia */
|
||||
Transliteration::new(0x0430, b'a', 0x00, 0x00, 0x00), /* а to a */
|
||||
Transliteration::new(0x0431, b'b', 0x00, 0x00, 0x00), /* б to b */
|
||||
Transliteration::new(0x0432, b'v', 0x00, 0x00, 0x00), /* в to v */
|
||||
Transliteration::new(0x0433, b'g', 0x00, 0x00, 0x00), /* г to g */
|
||||
Transliteration::new(0x0434, b'd', 0x00, 0x00, 0x00), /* д to d */
|
||||
Transliteration::new(0x0435, b'e', 0x00, 0x00, 0x00), /* е to e */
|
||||
Transliteration::new(0x0436, b'z', b'h', 0x00, 0x00), /* ж to zh */
|
||||
Transliteration::new(0x0437, b'z', 0x00, 0x00, 0x00), /* з to z */
|
||||
Transliteration::new(0x0438, b'i', 0x00, 0x00, 0x00), /* и to i */
|
||||
Transliteration::new(0x0439, b'i', 0x00, 0x00, 0x00), /* й to i */
|
||||
Transliteration::new(0x043A, b'k', 0x00, 0x00, 0x00), /* к to k */
|
||||
Transliteration::new(0x043B, b'l', 0x00, 0x00, 0x00), /* л to l */
|
||||
Transliteration::new(0x043C, b'm', 0x00, 0x00, 0x00), /* м to m */
|
||||
Transliteration::new(0x043D, b'n', 0x00, 0x00, 0x00), /* н to n */
|
||||
Transliteration::new(0x043E, b'o', 0x00, 0x00, 0x00), /* о to o */
|
||||
Transliteration::new(0x043F, b'p', 0x00, 0x00, 0x00), /* п to p */
|
||||
Transliteration::new(0x0440, b'r', 0x00, 0x00, 0x00), /* р to r */
|
||||
Transliteration::new(0x0441, b's', 0x00, 0x00, 0x00), /* с to s */
|
||||
Transliteration::new(0x0442, b't', 0x00, 0x00, 0x00), /* т to t */
|
||||
Transliteration::new(0x0443, b'u', 0x00, 0x00, 0x00), /* у to u */
|
||||
Transliteration::new(0x0444, b'f', 0x00, 0x00, 0x00), /* ф to f */
|
||||
Transliteration::new(0x0445, b'k', b'h', 0x00, 0x00), /* х to kh */
|
||||
Transliteration::new(0x0446, b't', b'c', 0x00, 0x00), /* ц to tc */
|
||||
Transliteration::new(0x0447, b'c', b'h', 0x00, 0x00), /* ч to ch */
|
||||
Transliteration::new(0x0448, b's', b'h', 0x00, 0x00), /* ш to sh */
|
||||
Transliteration::new(0x0449, b's', b'h', b'c', b'h'), /* щ to shch */
|
||||
Transliteration::new(0x044A, b'a', 0x00, 0x00, 0x00), /* to a */
|
||||
Transliteration::new(0x044B, b'y', 0x00, 0x00, 0x00), /* ы to y */
|
||||
Transliteration::new(0x044C, b'y', 0x00, 0x00, 0x00), /* to y */
|
||||
Transliteration::new(0x044D, b'e', 0x00, 0x00, 0x00), /* э to e */
|
||||
Transliteration::new(0x044E, b'i', b'u', 0x00, 0x00), /* ю to iu */
|
||||
Transliteration::new(0x044F, b'i', b'a', 0x00, 0x00), /* я to ia */
|
||||
Transliteration::new(0x0450, b'e', 0x00, 0x00, 0x00), /* ѐ to e */
|
||||
Transliteration::new(0x0451, b'e', 0x00, 0x00, 0x00), /* ё to e */
|
||||
Transliteration::new(0x0452, b'd', 0x00, 0x00, 0x00), /* ђ to d */
|
||||
Transliteration::new(0x0453, b'g', 0x00, 0x00, 0x00), /* ѓ to g */
|
||||
Transliteration::new(0x0454, b'e', 0x00, 0x00, 0x00), /* є to e */
|
||||
Transliteration::new(0x0455, b'z', 0x00, 0x00, 0x00), /* ѕ to z */
|
||||
Transliteration::new(0x0456, b'i', 0x00, 0x00, 0x00), /* і to i */
|
||||
Transliteration::new(0x0457, b'i', 0x00, 0x00, 0x00), /* ї to i */
|
||||
Transliteration::new(0x0458, b'j', 0x00, 0x00, 0x00), /* ј to j */
|
||||
Transliteration::new(0x0459, b'i', 0x00, 0x00, 0x00), /* љ to i */
|
||||
Transliteration::new(0x045A, b'n', 0x00, 0x00, 0x00), /* њ to n */
|
||||
Transliteration::new(0x045B, b'd', 0x00, 0x00, 0x00), /* ћ to d */
|
||||
Transliteration::new(0x045C, b'k', 0x00, 0x00, 0x00), /* ќ to k */
|
||||
Transliteration::new(0x045D, b'i', 0x00, 0x00, 0x00), /* ѝ to i */
|
||||
Transliteration::new(0x045E, b'u', 0x00, 0x00, 0x00), /* ў to u */
|
||||
Transliteration::new(0x045F, b'd', 0x00, 0x00, 0x00), /* џ to d */
|
||||
Transliteration::new(0x1E02, b'B', 0x00, 0x00, 0x00), /* Ḃ to B */
|
||||
Transliteration::new(0x1E03, b'b', 0x00, 0x00, 0x00), /* ḃ to b */
|
||||
Transliteration::new(0x1E0A, b'D', 0x00, 0x00, 0x00), /* Ḋ to D */
|
||||
Transliteration::new(0x1E0B, b'd', 0x00, 0x00, 0x00), /* ḋ to d */
|
||||
Transliteration::new(0x1E1E, b'F', 0x00, 0x00, 0x00), /* Ḟ to F */
|
||||
Transliteration::new(0x1E1F, b'f', 0x00, 0x00, 0x00), /* ḟ to f */
|
||||
Transliteration::new(0x1E40, b'M', 0x00, 0x00, 0x00), /* Ṁ to M */
|
||||
Transliteration::new(0x1E41, b'm', 0x00, 0x00, 0x00), /* ṁ to m */
|
||||
Transliteration::new(0x1E56, b'P', 0x00, 0x00, 0x00), /* Ṗ to P */
|
||||
Transliteration::new(0x1E57, b'p', 0x00, 0x00, 0x00), /* ṗ to p */
|
||||
Transliteration::new(0x1E60, b'S', 0x00, 0x00, 0x00), /* Ṡ to S */
|
||||
Transliteration::new(0x1E61, b's', 0x00, 0x00, 0x00), /* ṡ to s */
|
||||
Transliteration::new(0x1E6A, b'T', 0x00, 0x00, 0x00), /* Ṫ to T */
|
||||
Transliteration::new(0x1E6B, b't', 0x00, 0x00, 0x00), /* ṫ to t */
|
||||
Transliteration::new(0x1E80, b'W', 0x00, 0x00, 0x00), /* Ẁ to W */
|
||||
Transliteration::new(0x1E81, b'w', 0x00, 0x00, 0x00), /* ẁ to w */
|
||||
Transliteration::new(0x1E82, b'W', 0x00, 0x00, 0x00), /* Ẃ to W */
|
||||
Transliteration::new(0x1E83, b'w', 0x00, 0x00, 0x00), /* ẃ to w */
|
||||
Transliteration::new(0x1E84, b'W', 0x00, 0x00, 0x00), /* Ẅ to W */
|
||||
Transliteration::new(0x1E85, b'w', 0x00, 0x00, 0x00), /* ẅ to w */
|
||||
Transliteration::new(0x1EF2, b'Y', 0x00, 0x00, 0x00), /* Ỳ to Y */
|
||||
Transliteration::new(0x1EF3, b'y', 0x00, 0x00, 0x00), /* ỳ to y */
|
||||
Transliteration::new(0xFB00, b'f', b'f', 0x00, 0x00), /* ff to ff */
|
||||
Transliteration::new(0xFB01, b'f', b'i', 0x00, 0x00), /* fi to fi */
|
||||
Transliteration::new(0xFB02, b'f', b'l', 0x00, 0x00), /* fl to fl */
|
||||
Transliteration::new(0xFB05, b's', b't', 0x00, 0x00), /* ſt to st */
|
||||
Transliteration::new(0xFB06, b's', b't', 0x00, 0x00), /* st to st */
|
||||
];
|
||||
|
||||
/// Return the value of the first UTF-8 character in the string
|
||||
fn utf8_read(z: &[u8]) -> (u32, usize) {
|
||||
if z.is_empty() {
|
||||
return (0, 0);
|
||||
}
|
||||
|
||||
let first_byte = z[0];
|
||||
if first_byte < 0x80 {
|
||||
(first_byte as u32, 1)
|
||||
} else {
|
||||
let lookup_index = (first_byte - 0xc0) as usize;
|
||||
if lookup_index >= TRANSLIT_UTF8_LOOKUP.len() {
|
||||
return (first_byte as u32, 1);
|
||||
}
|
||||
|
||||
let mut c = TRANSLIT_UTF8_LOOKUP[lookup_index] as u32;
|
||||
let mut i = 1;
|
||||
|
||||
while i < z.len() && (z[i] & 0xc0) == 0x80 {
|
||||
c = (c << 6) + ((z[i] & 0x3f) as u32);
|
||||
i += 1;
|
||||
}
|
||||
|
||||
(c, i)
|
||||
}
|
||||
}
|
||||
|
||||
/// Find transliteration entry for a given Unicode character using binary search
|
||||
fn find_translit(c: u32) -> Option<&'static Transliteration> {
|
||||
let c = c as u16; // Cast to u16 since our table uses u16
|
||||
TRANSLIT
|
||||
.binary_search_by_key(&c, |t| t.c_from)
|
||||
.ok()
|
||||
.map(|idx| &TRANSLIT[idx])
|
||||
}
|
||||
|
||||
/// Convert the input string from UTF-8 into pure ASCII by converting
|
||||
/// all non-ASCII characters to some combination of characters in the ASCII subset.
|
||||
pub fn transliterate(input: &[u8]) -> Vec<u8> {
|
||||
let mut output = Vec::with_capacity(input.len() * 4);
|
||||
let mut pos = 0;
|
||||
|
||||
while pos < input.len() {
|
||||
let (c, size) = utf8_read(&input[pos..]);
|
||||
pos += size;
|
||||
|
||||
if c <= 127 {
|
||||
output.push(c as u8);
|
||||
} else if let Some(translit) = find_translit(c) {
|
||||
output.push(translit.c_to0);
|
||||
if translit.c_to1 != 0 {
|
||||
output.push(translit.c_to1);
|
||||
if translit.c_to2 != 0 {
|
||||
output.push(translit.c_to2);
|
||||
if translit.c_to3 != 0 {
|
||||
output.push(translit.c_to3);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
output.push(b'?');
|
||||
}
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
pub fn transliterate_str(input: &str) -> String {
|
||||
let result = transliterate(input.as_bytes());
|
||||
String::from_utf8(result).unwrap_or_else(|_| "?".to_string())
|
||||
}
|
||||
|
||||
pub fn script_code(input: &[u8]) -> i32 {
|
||||
let mut pos = 0;
|
||||
let mut script_mask = 0;
|
||||
let mut seen_digit = false;
|
||||
|
||||
while pos < input.len() {
|
||||
let (c, size) = utf8_read(&input[pos..]);
|
||||
pos += size;
|
||||
|
||||
if c < 0x02af {
|
||||
if c >= 0x80 {
|
||||
script_mask |= SCRIPT_LATIN;
|
||||
} else if (c as u8).is_ascii_digit() {
|
||||
seen_digit = true;
|
||||
} else {
|
||||
script_mask |= SCRIPT_LATIN;
|
||||
}
|
||||
} else if (0x0400..=0x04ff).contains(&c) {
|
||||
script_mask |= SCRIPT_CYRILLIC;
|
||||
} else if (0x0386..=0x03ce).contains(&c) {
|
||||
script_mask |= SCRIPT_GREEK;
|
||||
} else if (0x0590..=0x05ff).contains(&c) {
|
||||
script_mask |= SCRIPT_HEBREW;
|
||||
} else if (0x0600..=0x06ff).contains(&c) {
|
||||
script_mask |= SCRIPT_ARABIC;
|
||||
}
|
||||
}
|
||||
|
||||
if script_mask == 0 && seen_digit {
|
||||
script_mask = SCRIPT_LATIN;
|
||||
}
|
||||
|
||||
match script_mask {
|
||||
0 => 999,
|
||||
SCRIPT_LATIN => 215,
|
||||
SCRIPT_CYRILLIC => 220,
|
||||
SCRIPT_GREEK => 200,
|
||||
SCRIPT_HEBREW => 125,
|
||||
SCRIPT_ARABIC => 160,
|
||||
_ => 998,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Decoding a single-byte and a two-byte character out of "Café".
    #[test]
    fn test_utf8_read() {
        let bytes = "Café".as_bytes();

        // 'C' is a single ASCII byte at the start of the input.
        assert_eq!(utf8_read(bytes), (u32::from(b'C'), 1));

        // 'é' starts at byte offset 3 and is encoded in two bytes.
        assert_eq!(utf8_read(&bytes[3..]), (0x00E9, 2));
    }

    /// Accented Latin letters collapse to their base letter.
    #[test]
    fn test_transliterate_basic() {
        for &(input, expected) in &[("Café", "Cafe"), ("Naïve", "Naive")] {
            assert_eq!(transliterate_str(input), expected);
        }
    }

    /// German umlauts and sharp s expand to two ASCII letters.
    #[test]
    fn test_transliterate_german() {
        for &(input, expected) in &[("Müller", "Mueller"), ("Größe", "Groesse")] {
            assert_eq!(transliterate_str(input), expected);
        }
    }

    /// One case per single-script result plus a mixed-script input.
    #[test]
    fn test_script_code() {
        let cases = [
            ("Hello", 215),
            ("123", 215),
            ("привет", 220),
            ("γειά", 200),
            ("helloпривет", 998),
        ];
        for &(input, expected) in &cases {
            assert_eq!(script_code(input.as_bytes()), expected);
        }
    }
}
|
||||
@@ -560,6 +560,122 @@ def test_ipaddr():
|
||||
)
|
||||
limbo.quit()
|
||||
|
||||
def validate_fuzzy_leven(a):
    """Return True when the shell output ``a`` is exactly "3".

    Used by test_fuzzy for ``fuzzy_leven('awesome', 'aewsme')``.
    """
    expected = "3"
    return a == expected
|
||||
|
||||
def validate_fuzzy_damlev1(a):
    """Return True when the shell output ``a`` is exactly "2".

    Used by test_fuzzy for ``fuzzy_damlev('awesome', 'aewsme')``.
    """
    expected = "2"
    return a == expected
|
||||
|
||||
def validate_fuzzy_damlev2(a):
    """Return True when the shell output ``a`` is exactly "1".

    Used by test_fuzzy for ``fuzzy_damlev('Something', 'Smoething')``.
    """
    expected = "1"
    return a == expected
|
||||
|
||||
def validate_fuzzy_editdist1(a):
    """Return True when the shell output ``a`` is exactly "225".

    Used by test_fuzzy for ``fuzzy_editdist('abc', 'ca')``.
    """
    expected = "225"
    return a == expected
|
||||
|
||||
def validate_fuzzy_editdist2(a):
    """Return True when the shell output ``a`` is exactly "110".

    Used by test_fuzzy for ``fuzzy_editdist('abc', 'acb')``.
    """
    expected = "110"
    return a == expected
|
||||
|
||||
def validate_fuzzy_jarowin(a):
    """Return True when the shell output ``a`` is exactly "0.907142857142857".

    Used by test_fuzzy for ``fuzzy_jarowin('awesome', 'aewsme')``. The
    comparison is textual, so the printed digits must match exactly.
    """
    expected = "0.907142857142857"
    return a == expected
|
||||
|
||||
def validate_fuzzy_osadist(a):
    """Return True when the shell output ``a`` is exactly "3".

    Used by test_fuzzy for ``fuzzy_osadist('awesome', 'aewsme')``.
    """
    expected = "3"
    return a == expected
|
||||
|
||||
def validate_fuzzy_soundex(a):
    """Return True when the shell output ``a`` is exactly "A250"."""
    expected = "A250"
    return a == expected
|
||||
|
||||
def validate_fuzzy_phonetic(a):
    """Return True when the shell output ``a`` is exactly "ABACAMA".

    Used by test_fuzzy for ``fuzzy_phonetic('awesome')``.
    """
    expected = "ABACAMA"
    return a == expected
|
||||
|
||||
def validate_fuzzy_caver(a):
    """Return True when the shell output ``a`` is exactly "AWSM111111".

    Used by test_fuzzy for ``fuzzy_caver('awesome')``.
    """
    expected = "AWSM111111"
    return a == expected
|
||||
|
||||
def validate_fuzzy_rsoundex(a):
    """Return True when the shell output ``a`` is exactly "A03080".

    Used by test_fuzzy for ``fuzzy_rsoundex('awesome')``.
    """
    expected = "A03080"
    return a == expected
|
||||
|
||||
def validate_fuzzy_translit1(a):
    """Return True when the shell output ``a`` is exactly "oh my ?".

    Used by test_fuzzy for ``fuzzy_translit`` on text ending in an emoji,
    which is expected to come back as a question mark.
    """
    expected = "oh my ?"
    return a == expected
|
||||
|
||||
def validate_fuzzy_translit2(a):
    """Return True when the shell output ``a`` is exactly "privet".

    Used by test_fuzzy for ``fuzzy_translit`` on a Cyrillic word.
    """
    expected = "privet"
    return a == expected
|
||||
|
||||
def validate_fuzzy_script(a):
    """Return True when the shell output ``a`` is exactly "160".

    Used by test_fuzzy for ``fuzzy_script`` on an Arabic-script string.
    """
    expected = "160"
    return a == expected
|
||||
|
||||
def test_fuzzy():
    """Exercise the fuzzy string extension through the shell.

    First checks that a fuzzy function is rejected before the extension is
    loaded, then loads the extension and runs one query per function, each
    checked by its validate_fuzzy_* helper. The shell is closed at the end,
    matching the other extension tests in this file.
    """
    limbo = TestTursoShell()
    ext_path = "./target/debug/liblimbo_fuzzy"

    # Before `.load`, the function must be unknown to the shell.
    limbo.run_test_fn(
        "SELECT fuzzy_leven('awesome', 'aewsme');",
        lambda res: "error: no such function: " in res,
        "fuzzy levenshtein function returns null when ext not loaded",
    )
    limbo.execute_dot(f".load {ext_path}")

    # (query, validator, description), executed in this order.
    checks = [
        (
            "SELECT fuzzy_leven('awesome', 'aewsme');",
            validate_fuzzy_leven,
            "fuzzy levenshtein function works",
        ),
        (
            "SELECT fuzzy_damlev('awesome', 'aewsme');",
            validate_fuzzy_damlev1,
            "fuzzy damerau levenshtein1 function works",
        ),
        (
            "SELECT fuzzy_damlev('Something', 'Smoething');",
            validate_fuzzy_damlev2,
            "fuzzy damerau levenshtein2 function works",
        ),
        (
            "SELECT fuzzy_editdist('abc', 'ca');",
            validate_fuzzy_editdist1,
            "fuzzy editdist1 function works",
        ),
        (
            "SELECT fuzzy_editdist('abc', 'acb');",
            validate_fuzzy_editdist2,
            "fuzzy editdist2 function works",
        ),
        (
            "SELECT fuzzy_jarowin('awesome', 'aewsme');",
            validate_fuzzy_jarowin,
            "fuzzy jarowin function works",
        ),
        (
            "SELECT fuzzy_osadist('awesome', 'aewsme');",
            validate_fuzzy_osadist,
            "fuzzy osadist function works",
        ),
        # validate_fuzzy_soundex was defined but never wired into a check.
        (
            "SELECT fuzzy_soundex('awesome');",
            validate_fuzzy_soundex,
            "fuzzy soundex function works",
        ),
        (
            "SELECT fuzzy_phonetic('awesome');",
            validate_fuzzy_phonetic,
            "fuzzy phonetic function works",
        ),
        (
            "SELECT fuzzy_caver('awesome');",
            validate_fuzzy_caver,
            "fuzzy caver function works",
        ),
        (
            "SELECT fuzzy_rsoundex('awesome');",
            validate_fuzzy_rsoundex,
            "fuzzy rsoundex function works",
        ),
        (
            "SELECT fuzzy_translit('oh my 😅');",
            validate_fuzzy_translit1,
            "fuzzy translit1 function works",
        ),
        (
            "SELECT fuzzy_translit('привет');",
            validate_fuzzy_translit2,
            "fuzzy translit2 function works",
        ),
        (
            "SELECT fuzzy_script('داناوانب');",
            validate_fuzzy_script,
            "fuzzy script function works",
        ),
    ]
    for query, validator, description in checks:
        limbo.run_test_fn(query, validator, description)

    # Close the shell so it is not left running after the test.
    limbo.quit()
|
||||
|
||||
def test_vfs():
|
||||
limbo = TestTursoShell()
|
||||
@@ -822,6 +938,7 @@ def main():
|
||||
test_kv()
|
||||
test_csv()
|
||||
test_tablestats()
|
||||
test_fuzzy()
|
||||
except Exception as e:
|
||||
console.error(f"Test FAILED: {e}")
|
||||
cleanup()
|
||||
|
||||
Reference in New Issue
Block a user