Merge 'Sqlean fuzzy string ' from Danawan Bimantoro

Add implementations of string distance and phonetics functions:
fuzzy_damlev
fuzzy_hamming
fuzzy_jarowin
fuzzy_leven
fuzzy_osadist
fuzzy_editdist
fuzzy_soundex
fuzzy_rsoundex
fuzzy_phonetic
fuzzy_caver
fuzzy_translit
This implementation follows sqlean-fuzzy

Reviewed-by: Preston Thorpe <preston@turso.tech>

Closes #3262
This commit is contained in:
Pekka Enberg
2025-09-27 16:51:49 +03:00
committed by GitHub
13 changed files with 2140 additions and 0 deletions

8
Cargo.lock generated
View File

@@ -2293,6 +2293,14 @@ dependencies = [
"turso_ext",
]
[[package]]
name = "limbo_fuzzy"
version = "0.2.0-pre.8"
dependencies = [
"mimalloc",
"turso_ext",
]
[[package]]
name = "limbo_ipaddr"
version = "0.2.0-pre.9"

View File

@@ -19,6 +19,7 @@ members = [
"extensions/percentile",
"extensions/regexp",
"extensions/tests",
"extensions/fuzzy",
"macros",
"simulator",
"sqlite3",
@@ -62,6 +63,7 @@ limbo_regexp = { path = "extensions/regexp", version = "0.2.0-pre.9" }
turso_sqlite3_parser = { path = "vendored/sqlite3-parser", version = "0.2.0-pre.9" }
limbo_uuid = { path = "extensions/uuid", version = "0.2.0-pre.9" }
turso_parser = { path = "parser", version = "0.2.0-pre.9" }
limbo_fuzzy = { path = "extensions/fuzzy", version = "0.2.0-pre.9" }
sql_generation = { path = "sql_generation" }
strum = { version = "0.26", features = ["derive"] }
strum_macros = "0.26"

View File

@@ -0,0 +1,20 @@
[package]
name = "limbo_fuzzy"
version.workspace = true
authors.workspace = true
edition.workspace = true
license.workspace = true
repository.workspace = true
description = "Limbo fuzzy string extension"
[lib]
crate-type = ["cdylib", "lib"]
[features]
static = ["turso_ext/static"]
[dependencies]
turso_ext = { workspace = true, features = ["static"] }
[target.'cfg(not(target_family = "wasm"))'.dependencies]
mimalloc = { version = "0.1", default-features = false }

View File

@@ -0,0 +1,5 @@
/// Build script: links `advapi32` when the crate is built for Windows.
///
/// A build script is compiled for the machine running the build, so
/// `cfg!(target_os = ...)` describes that machine rather than the
/// compilation target. Cargo exports the real target OS through
/// `CARGO_CFG_TARGET_OS`; the `cfg!` check is kept only as a fallback for
/// the case where the variable is not set.
fn main() {
    let targets_windows = std::env::var("CARGO_CFG_TARGET_OS")
        .map(|os| os == "windows")
        .unwrap_or(cfg!(target_os = "windows"));
    if targets_windows {
        println!("cargo:rustc-link-lib=advapi32");
    }
}

View File

@@ -0,0 +1,191 @@
/// Keeps only the lowercase ASCII letters `a`-`z` of `src`, preserving order.
///
/// Every other character (uppercase letters, digits, punctuation,
/// non-ASCII text) is dropped.
fn remove_non_letters(src: &str) -> String {
    let mut letters = String::with_capacity(src.len());
    for ch in src.chars() {
        if matches!(ch, 'a'..='z') {
            letters.push(ch);
        }
    }
    letters
}
/// Replaces `old` with `new` when `src` begins with `old`.
///
/// Returns an unchanged copy of `src` when it does not start with `old`.
fn replace_start(src: &str, old: &str, new: &str) -> String {
    match src.strip_prefix(old) {
        Some(rest) => [new, rest].concat(),
        None => src.to_owned(),
    }
}
/// Replaces `old` with `new` when `src` ends with `old`.
///
/// Returns an unchanged copy of `src` when it does not end with `old`.
fn replace_end(src: &str, old: &str, new: &str) -> String {
    match src.strip_suffix(old) {
        Some(head) => [head, new].concat(),
        None => src.to_owned(),
    }
}
/// Replaces every non-overlapping occurrence of `old` in `src` with `new`,
/// scanning left to right.
///
/// An empty `old` leaves `src` untouched (the standard library would
/// otherwise insert `new` between every character).
///
/// The search is delegated to [`str::replace`], which only ever matches on
/// character boundaries, so text containing multi-byte characters is handled
/// without slicing through the middle of a character.
fn replace(src: &str, old: &str, new: &str) -> String {
    if old.is_empty() {
        return src.to_string();
    }
    src.replace(old, new)
}
/// Collapses every run of consecutive `old` characters in `src` into a
/// single copy of `new`.
fn replace_seq(src: &str, old: char, new: &str) -> String {
    let mut out = String::with_capacity(src.len());
    // True while we are inside a run of `old` that has already been emitted.
    let mut inside_run = false;
    for ch in src.chars() {
        if ch != old {
            inside_run = false;
            out.push(ch);
        } else if !inside_run {
            inside_run = true;
            out.push_str(new);
        }
    }
    out
}
/// Truncates `src` to at most 10 characters and right-pads it with `'1'`
/// so the result is always exactly 10 characters long.
fn pad(src: &str) -> String {
    const WIDTH: usize = 10;
    let mut out: String = src.chars().take(WIDTH).collect();
    let missing = WIDTH - out.chars().count();
    out.push_str(&"1".repeat(missing));
    out
}
// caverphone implements the Caverphone phonetic hashing algorithm
// https://en.wikipedia.org/wiki/Caverphone
fn caverphone(src: &str) -> String {
if src.is_empty() {
return String::new();
}
let mut res = remove_non_letters(src);
res = replace_end(&res, "e", "");
res = replace_start(&res, "cough", "cou2f");
res = replace_start(&res, "rough", "rou2f");
res = replace_start(&res, "tough", "tou2f");
res = replace_start(&res, "enough", "enou2f");
res = replace_start(&res, "trough", "trou2f");
res = replace_start(&res, "gn", "2n");
res = replace_end(&res, "mb", "m2");
res = replace(&res, "cq", "2q");
res = replace(&res, "ci", "si");
res = replace(&res, "ce", "se");
res = replace(&res, "cy", "sy");
res = replace(&res, "tch", "2ch");
res = replace(&res, "c", "k");
res = replace(&res, "q", "k");
res = replace(&res, "x", "k");
res = replace(&res, "v", "f");
res = replace(&res, "dg", "2g");
res = replace(&res, "tio", "sio");
res = replace(&res, "tia", "sia");
res = replace(&res, "d", "t");
res = replace(&res, "ph", "fh");
res = replace(&res, "b", "p");
res = replace(&res, "sh", "s2");
res = replace(&res, "z", "s");
res = replace_start(&res, "a", "A");
res = replace_start(&res, "e", "A");
res = replace_start(&res, "i", "A");
res = replace_start(&res, "o", "A");
res = replace_start(&res, "u", "A");
res = replace(&res, "a", "3");
res = replace(&res, "e", "3");
res = replace(&res, "i", "3");
res = replace(&res, "o", "3");
res = replace(&res, "u", "3");
res = replace(&res, "j", "y");
res = replace_start(&res, "y3", "Y3");
res = replace_start(&res, "y", "A");
res = replace(&res, "y", "3");
res = replace(&res, "3gh3", "3kh3");
res = replace(&res, "gh", "22");
res = replace(&res, "g", "k");
res = replace_seq(&res, 's', "S");
res = replace_seq(&res, 't', "T");
res = replace_seq(&res, 'p', "P");
res = replace_seq(&res, 'k', "K");
res = replace_seq(&res, 'f', "F");
res = replace_seq(&res, 'm', "M");
res = replace_seq(&res, 'n', "N");
res = replace(&res, "w3", "W3");
res = replace(&res, "wh3", "Wh3");
res = replace_end(&res, "w", "3");
res = replace(&res, "w", "2");
res = replace_start(&res, "h", "A");
res = replace(&res, "h", "2");
res = replace(&res, "r3", "R3");
res = replace_end(&res, "r", "3");
res = replace(&res, "r", "2");
res = replace(&res, "l3", "L3");
res = replace_end(&res, "l", "3");
res = replace(&res, "l", "2");
res = replace(&res, "2", "");
res = replace_end(&res, "3", "A");
res = replace(&res, "3", "");
res = pad(&res);
res
}
pub fn caver_str(input: Option<&str>) -> Option<String> {
input.map(caverphone)
}

View File

@@ -0,0 +1,54 @@
// Character classes. These values are the entries stored in `MID_CLASS` and
// `INIT_CLASS` and the indexes into `CLASS_NAME`. Classes 3, 4, 5 and 8
// (C, D, H, M in `CLASS_NAME`) appear in the tables but have no named
// constant here.
pub const CCLASS_SILENT: u8 = 0;
pub const CCLASS_VOWEL: u8 = 1;
pub const CCLASS_B: u8 = 2;
// Note: CCLASS_Y (9) is listed out of numeric order; together with CCLASS_B
// it bounds the `CCLASS_B..=CCLASS_Y` range check used in editdist.
pub const CCLASS_Y: u8 = 9;
pub const CCLASS_L: u8 = 6;
pub const CCLASS_R: u8 = 7;
//pub const CCLASS_M: u8 = 8;
pub const CCLASS_DIGIT: u8 = 10;
pub const CCLASS_SPACE: u8 = 11;
pub const CCLASS_OTHER: u8 = 12;
/// Character class of each 7-bit ASCII code when the character is NOT the
/// first one of a word. Indexed by `byte & 0x7f`; values are the `CCLASS_*`
/// codes (0 = silent, 1 = vowel, 2..=9 = consonant groups, 10 = digit,
/// 11 = space, 12 = other).
///
/// The space class belongs to ' ' (0x20) and the silent class to '\''
/// (0x27); TAB, FF and CR are also classified as space.
pub const MID_CLASS: [u8; 128] = [
    // 0x00-0x0f: control characters (TAB, FF, CR are space)
    12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 12, 12, 11, 11, 12, 12, //
    // 0x10-0x1f: control characters
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, //
    // 0x20-0x2f: ' ' is space, '\'' is silent
    11, 12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 12, 12, 12, //
    // 0x30-0x3f: digits, then punctuation
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, //
    // 0x40-0x4f: '@', 'A'..='O'
    12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, //
    // 0x50-0x5f: 'P'..='Z', then punctuation
    2, 3, 7, 3, 4, 1, 2, 2, 3, 1, 3, 12, 12, 12, 12, 12, //
    // 0x60-0x6f: '`', 'a'..='o'
    12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, //
    // 0x70-0x7f: 'p'..='z', then punctuation and DEL
    2, 3, 7, 3, 4, 1, 2, 2, 3, 1, 3, 12, 12, 12, 12, 12, //
];
/// Character class of each 7-bit ASCII code when the character IS the first
/// one of a word. Indexed by `byte & 0x7f`; values are the `CCLASS_*` codes.
///
/// Compared with the mid-word table, 'Y'/'y' gets its own class (9) and the
/// apostrophe is an ordinary "other" character. The space class belongs to
/// ' ' (0x20); TAB, FF and CR are also classified as space.
pub const INIT_CLASS: [u8; 128] = [
    // 0x00-0x0f: control characters (TAB, FF, CR are space)
    12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 12, 12, 11, 11, 12, 12, //
    // 0x10-0x1f: control characters
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, //
    // 0x20-0x2f: ' ' is space, everything else is "other"
    11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, //
    // 0x30-0x3f: digits, then punctuation
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, //
    // 0x40-0x4f: '@', 'A'..='O'
    12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, //
    // 0x50-0x5f: 'P'..='Z', then punctuation
    2, 3, 7, 3, 4, 1, 2, 2, 3, 9, 3, 12, 12, 12, 12, 12, //
    // 0x60-0x6f: '`', 'a'..='o'
    12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 8, 8, 1, //
    // 0x70-0x7f: 'p'..='z', then punctuation and DEL
    2, 3, 7, 3, 4, 1, 2, 2, 3, 9, 3, 12, 12, 12, 12, 12, //
];
// Output letter for each character class: entry `i` is the byte emitted for
// class code `i` (the values stored in MID_CLASS / INIT_CLASS).
// Based on: const unsigned char className[] = ".ABCDHLRMY9 ?";
pub const CLASS_NAME: [u8; 13] = [
b'.', // CCLASS_SILENT (0) -> .
b'A', // CCLASS_VOWEL (1) -> A
b'B', // CCLASS_B (2) -> B
b'C', // CCLASS_C (3) -> C
b'D', // CCLASS_D (4) -> D
b'H', // CCLASS_H (5) -> H
b'L', // CCLASS_L (6) -> L
b'R', // CCLASS_R (7) -> R
b'M', // CCLASS_M (8) -> M
b'Y', // CCLASS_Y (9) -> Y
b'9', // CCLASS_DIGIT (10) -> 9
b' ', // CCLASS_SPACE (11) -> space
b'?', // CCLASS_OTHER (12) -> ?
];
// Script identifiers; each value is a distinct single bit.
// NOTE(review): presumably OR-ed together into a bitmask by the script
// detection code in translit — confirm against the caller.
pub const SCRIPT_LATIN: u32 = 0x0001;
pub const SCRIPT_CYRILLIC: u32 = 0x0002;
pub const SCRIPT_GREEK: u32 = 0x0004;
pub const SCRIPT_HEBREW: u32 = 0x0008;
pub const SCRIPT_ARABIC: u32 = 0x0010;

View File

@@ -0,0 +1,276 @@
// Adapted from SQLite spellfix.c extension and sqlean fuzzy/editdist.c
use crate::common::*;
/// Reasons why `edit_distance` can refuse its input.
#[derive(Debug, PartialEq)]
pub enum EditDistanceError {
/// At least one of the two input strings contains a non-ASCII character.
NonAsciiInput,
}
/// Result of `edit_distance`: the transformation cost, or the reason the
/// inputs were rejected.
pub type EditDistanceResult = Result<i32, EditDistanceError>;
/// Looks up the character class of `c`.
///
/// `c_prev == 0` means `c` is the first character of the word, in which
/// case the initial-position table is used; otherwise the mid-word table.
/// Only the low 7 bits of `c` take part in the lookup.
fn character_class(c_prev: u8, c: u8) -> u8 {
    let table = if c_prev == 0 {
        &INIT_CLASS
    } else {
        &MID_CLASS
    };
    table[usize::from(c & 0x7f)]
}
/// Cost of inserting or deleting character `c` directly after `c_prev`,
/// with `c_next` being the character that follows. `c_prev == 0` means `c`
/// is the first character of the word.
///
/// Costs, checked in this order:
/// * 1   - `c` is a silent character
/// * 10  - `c` repeats the previous character
/// * 20  - `c` is a vowel next to an `r`
/// * 15  - `c` is a vowel following another vowel
/// * 50  - `c` has the same (non-vowel) class as the previous character
/// * 100 - anything else
fn insert_or_delete_cost(c_prev: u8, c: u8, c_next: u8) -> i32 {
    let class = character_class(c_prev, c);
    if class == CCLASS_SILENT {
        return 1;
    }
    if c == c_prev {
        return 10;
    }
    let is_vowel = class == CCLASS_VOWEL;
    if is_vowel && (c_prev == b'r' || c_next == b'r') {
        return 20;
    }
    let same_class_as_prev = class == character_class(c_prev, c_prev);
    match (same_class_as_prev, is_vowel) {
        (true, true) => 15,
        (true, false) => 50,
        (false, _) => 100,
    }
}
/// Divisor applied to insertion costs once the first string is exhausted or
/// on its last character, so that extra characters at the end of the second
/// string are cheap.
const FINAL_INS_COST_DIV: i32 = 4;
/// Cost of replacing `c_from` with `c_to` when the preceding character is
/// `c_prev` (`c_prev == 0` means the character is the first of the word).
///
/// * 0   - identical characters, or ASCII letters differing only in case
/// * 40  - both characters belong to the same class
/// * 75  - both are consonants (classes `CCLASS_B..=CCLASS_Y`) of different classes
/// * 100 - anything else
fn substitute_cost(c_prev: u8, c_from: u8, c_to: u8) -> i32 {
    // Flipping bit 0x20 toggles the case of an ASCII letter.
    let differs_only_in_case = c_to.is_ascii_alphabetic() && c_from == (c_to ^ 0x20);
    if c_from == c_to || differs_only_in_case {
        return 0;
    }

    let from = character_class(c_prev, c_from);
    let to = character_class(c_prev, c_to);
    let is_consonant = |class: u8| (CCLASS_B..=CCLASS_Y).contains(&class);

    if from == to {
        return 40;
    }
    if is_consonant(from) && is_consonant(to) {
        return 75;
    }
    100
}
/// Computes the cost of transforming `z_a` into `z_b`; smaller numbers mean
/// a closer match.
///
/// Both strings must be pure ASCII. If `z_a` ends with `'*'` it is treated
/// as a prefix pattern: extra characters at the end of `z_b` carry only a
/// minimal penalty and the best cost over all prefixes of `z_b` is returned.
///
/// # Errors
///
/// Returns [`EditDistanceError::NonAsciiInput`] when either string contains
/// a non-ASCII character (two empty strings always yield `Ok(0)`).
pub fn edit_distance(z_a: &str, z_b: &str) -> EditDistanceResult {
    if z_a.is_empty() && z_b.is_empty() {
        return Ok(0);
    }
    if !(z_a.is_ascii() && z_b.is_ascii()) {
        return Err(EditDistanceError::NonAsciiInput);
    }
    let (a_all, b_all) = (z_a.as_bytes(), z_b.as_bytes());

    // Skip the common prefix; only the differing tails are compared.
    let shared = a_all
        .iter()
        .zip(b_all)
        .take_while(|(x, y)| x == y)
        .count();
    let (a, b) = (&a_all[shared..], &b_all[shared..]);
    let (n_a, n_b) = (a.len(), b.len());
    if n_a == 0 && n_b == 0 {
        return Ok(0);
    }

    // Last character of the shared prefix, or 0 at the start of the word.
    let dc = if shared > 0 { a_all[shared - 1] } else { 0 };

    // One side exhausted: the answer is the cost of inserting (cheap) or
    // deleting (full price) the remaining tail. This also covers inputs
    // that were empty to begin with (`shared == 0`, `dc == 0`).
    if n_a == 0 {
        return Ok(trailing_cost(dc, b, FINAL_INS_COST_DIV));
    }
    if n_b == 0 {
        return Ok(trailing_cost(dc, a, 1));
    }

    // What is left of the pattern is just the wildcard.
    if n_a == 1 && a[0] == b'*' {
        return Ok(0);
    }

    // Single-row Wagner matrix: `m[x]` is the best cost of reaching the
    // first `x` characters of `b`; `cx[x]` is the character that path
    // ended on.
    let mut m = vec![0i32; n_b + 1];
    let mut cx = vec![0u8; n_b + 1];
    cx[0] = dc;
    let mut c_b_prev = dc;
    for (idx, &c_b) in b.iter().enumerate() {
        let c_b_next = b.get(idx + 1).copied().unwrap_or(0);
        cx[idx + 1] = c_b;
        m[idx + 1] = m[idx] + insert_or_delete_cost(c_b_prev, c_b, c_b_next);
        c_b_prev = c_b;
    }

    let mut c_a_prev = dc;
    for (idx_a, &c_a) in a.iter().enumerate() {
        let last_a = idx_a + 1 == n_a;
        if last_a && c_a == b'*' {
            // A trailing wildcard is not matched against `b`.
            break;
        }
        let c_a_next = a.get(idx_a + 1).copied().unwrap_or(0);

        // `diag` carries the north-west cell of the full matrix.
        let mut diag = m[0];
        m[0] = diag + insert_or_delete_cost(c_a_prev, c_a, c_a_next);

        for (idx_b, &c_b) in b.iter().enumerate() {
            let x_b = idx_b + 1;
            let c_b_next = b.get(x_b).copied().unwrap_or(0);

            // Insert c_b (cheaper once the last character of `a` is reached).
            let mut ins_cost = insert_or_delete_cost(cx[x_b - 1], c_b, c_b_next);
            if last_a {
                ins_cost /= FINAL_INS_COST_DIV;
            }
            // Delete c_a.
            let del_cost = insert_or_delete_cost(cx[x_b], c_a, c_b_next);
            // Substitute c_a -> c_b.
            let sub_cost = substitute_cost(cx[x_b - 1], c_a, c_b);

            let mut best = ins_cost + m[x_b - 1];
            let mut best_char = c_b;
            if del_cost + m[x_b] < best {
                best = del_cost + m[x_b];
                best_char = c_a;
            }
            if sub_cost + diag < best {
                best = sub_cost + diag;
            }

            diag = m[x_b];
            m[x_b] = best;
            cx[x_b] = best_char;
        }
        c_a_prev = c_a;
    }

    let res = if a.last() == Some(&b'*') {
        // Prefix pattern: the best match against any prefix of `b` wins.
        m[1..].iter().fold(m[1], |best, &cost| best.min(cost))
    } else {
        m[n_b]
    };
    Ok(res)
}

/// Sums the insert/delete cost of every character in `tail`, starting after
/// `c_prev`, dividing each individual cost by `divisor`.
fn trailing_cost(c_prev: u8, tail: &[u8], divisor: i32) -> i32 {
    let mut prev = c_prev;
    let mut total = 0;
    for (idx, &c) in tail.iter().enumerate() {
        let next = tail.get(idx + 1).copied().unwrap_or(0);
        total += insert_or_delete_cost(prev, c, next) / divisor;
        prev = c;
    }
    total
}

666
extensions/fuzzy/src/lib.rs Normal file
View File

@@ -0,0 +1,666 @@
// Adapted from sqlean fuzzy
use std::cmp;
use turso_ext::{register_extension, scalar, ResultCode, Value};
mod caver;
mod common;
mod editdist;
mod phonetic;
mod rsoundex;
mod soundex;
mod translit;
register_extension! {
scalars: {levenshtein, damerau_levenshtein, edit_distance, hamming, jaronwin, osadist, fuzzy_soundex, fuzzy_phonetic, fuzzy_caver, fuzzy_rsoundex, fuzzy_translit, fuzzy_script}
}
/// SQL function `fuzzy_leven(A, B)`: the Levenshtein distance between two
/// strings.
///
/// Yields an `InvalidArgs` error when either argument is missing or
/// `to_text` produces no text for it. Arguments are fetched with checked
/// access so a call with too few arguments reports an error instead of
/// panicking on an out-of-bounds index.
#[scalar(name = "fuzzy_leven")]
fn levenshtein(args: &[Value]) -> Value {
    let (Some(lhs), Some(rhs)) = (
        args.first().and_then(|v| v.to_text()),
        args.get(1).and_then(|v| v.to_text()),
    ) else {
        return Value::error(ResultCode::InvalidArgs);
    };
    Value::from_integer(leven(lhs, rhs))
}
/// Levenshtein distance between `s1` and `s2`, computed over their bytes
/// with a single rolling row of the dynamic-programming matrix.
fn leven(s1: &str, s2: &str) -> i64 {
    let (a, b) = (s1.as_bytes(), s2.as_bytes());
    if a.is_empty() {
        return b.len() as i64;
    }
    if b.is_empty() {
        return a.len() as i64;
    }

    // A shared prefix never contributes to the distance.
    let shared = a.iter().zip(b).take_while(|(x, y)| x == y).count();
    let (a, b) = (&a[shared..], &b[shared..]);

    // costs[c] = distance between the first c bytes of `a` and the bytes of
    // `b` processed so far.
    let mut costs: Vec<usize> = (0..=a.len()).collect();
    for (r, &b_byte) in b.iter().enumerate() {
        let mut diagonal = r;
        costs[0] = r + 1;
        for (c, &a_byte) in a.iter().enumerate() {
            let above = costs[c + 1];
            let substitution = diagonal + usize::from(a_byte != b_byte);
            costs[c + 1] = (above + 1).min(costs[c] + 1).min(substitution);
            diagonal = above;
        }
    }
    costs[a.len()] as i64
}
/// SQL function `fuzzy_damlev(A, B)`: the Damerau-Levenshtein distance
/// between two strings.
///
/// Yields an `InvalidArgs` error when either argument is missing or
/// `to_text` produces no text for it. Arguments are fetched with checked
/// access so a call with too few arguments reports an error instead of
/// panicking on an out-of-bounds index.
#[scalar(name = "fuzzy_damlev")]
fn damerau_levenshtein(args: &[Value]) -> Value {
    let (Some(lhs), Some(rhs)) = (
        args.first().and_then(|v| v.to_text()),
        args.get(1).and_then(|v| v.to_text()),
    ) else {
        return Value::error(ResultCode::InvalidArgs);
    };
    Value::from_integer(damlev(lhs, rhs))
}
/// Damerau-Levenshtein distance between `s1` and `s2`, computed over their
/// bytes. Unlike plain Levenshtein, swapping two adjacent bytes counts as a
/// single edit.
fn damlev(s1: &str, s2: &str) -> i64 {
    // One slot per possible byte value (0..=255), so indexing by any `u8`
    // is always in bounds.
    const ALPHA_SIZE: usize = u8::MAX as usize + 1;

    let (a, b) = (s1.as_bytes(), s2.as_bytes());
    if a.is_empty() {
        return b.len() as i64;
    }
    if b.is_empty() {
        return a.len() as i64;
    }

    // A shared prefix never contributes to the distance.
    let shared = a.iter().zip(b).take_while(|(x, y)| x == y).count();
    let (a, b) = (&a[shared..], &b[shared..]);
    let (len1, len2) = (a.len(), b.len());

    // Larger than any real distance; guards the extra border row/column.
    let infi = len1 + len2;
    // For each byte value: 1-based row of `a` where it was last seen.
    let mut last_row = [0usize; ALPHA_SIZE];

    // The matrix carries one extra border row and column (index 0) filled
    // with `infi`; cell [i + 1][j + 1] holds the distance between the first
    // i bytes of `a` and the first j bytes of `b`.
    let mut matrix = vec![vec![0usize; len2 + 2]; len1 + 2];
    for (i, row) in matrix.iter_mut().enumerate() {
        row[0] = infi;
        if i > 0 {
            row[1] = i - 1;
        }
    }
    matrix[0].fill(infi);
    for (j, cell) in matrix[1].iter_mut().enumerate().skip(1) {
        *cell = j - 1;
    }

    for (row, &c1) in a.iter().enumerate() {
        // 1-based column of the last match found in this row.
        let mut last_match_col = 0;
        for (col, &c2) in b.iter().enumerate() {
            let prev_row = last_row[usize::from(c2)];
            let prev_col = last_match_col;
            let cost = usize::from(c1 != c2);
            if cost == 0 {
                last_match_col = col + 1;
            }

            let substitution = matrix[row + 1][col + 1] + cost;
            let insertion = matrix[row + 2][col + 1] + 1;
            let deletion = matrix[row + 1][col + 2] + 1;
            // `prev_row <= row` and `prev_col <= col`, so no underflow here.
            let transposition =
                matrix[prev_row][prev_col] + (row - prev_row) + (col - prev_col) + 1;

            matrix[row + 2][col + 2] = substitution
                .min(insertion)
                .min(deletion)
                .min(transposition);
        }
        last_row[usize::from(c1)] = row + 1;
    }

    matrix[len1 + 1][len2 + 1] as i64
}
//
// fuzzy_editdist(A,B)
//
// Return the cost of transforming string A into string B. Both strings
// must be pure ASCII text. If A ends with '*' then it is assumed to be
// a prefix of B and extra characters on the end of B have minimal additional
// cost.
//
#[scalar(name = "fuzzy_editdist")]
fn edit_distance(args: &[Value]) {
let Some(arg1) = args[0].to_text() else {
return Value::error(ResultCode::InvalidArgs);
};
let Some(arg2) = args[1].to_text() else {
return Value::error(ResultCode::InvalidArgs);
};
if let Ok(res) = editdist::edit_distance(arg1, arg2) {
return Value::from_integer(res as i64);
} else {
return Value::error(ResultCode::InvalidArgs);
}
}
/// SQL function `fuzzy_hamming(A, B)`: the Hamming distance between two
/// strings, or -1 when their byte lengths differ.
///
/// Yields an `InvalidArgs` error when either argument is missing or
/// `to_text` produces no text for it. Arguments are fetched with checked
/// access so a call with too few arguments reports an error instead of
/// panicking on an out-of-bounds index.
#[scalar(name = "fuzzy_hamming")]
fn hamming(args: &[Value]) -> Value {
    let (Some(lhs), Some(rhs)) = (
        args.first().and_then(|v| v.to_text()),
        args.get(1).and_then(|v| v.to_text()),
    ) else {
        return Value::error(ResultCode::InvalidArgs);
    };
    Value::from_integer(hamming_dist(lhs, rhs))
}
/// Number of byte positions at which `s1` and `s2` differ.
///
/// Returns -1 when the two strings do not have the same byte length.
fn hamming_dist(s1: &str, s2: &str) -> i64 {
    let (left, right) = (s1.as_bytes(), s2.as_bytes());
    if left.len() != right.len() {
        return -1;
    }
    let mut mismatches = 0i64;
    for (x, y) in left.iter().zip(right) {
        if x != y {
            mismatches += 1;
        }
    }
    mismatches
}
/// SQL function `fuzzy_jarowin(A, B)`: the Jaro-Winkler similarity of two
/// strings as a float (1.0 for identical strings).
///
/// Yields an `InvalidArgs` error when either argument is missing or
/// `to_text` produces no text for it. Arguments are fetched with checked
/// access so a call with too few arguments reports an error instead of
/// panicking on an out-of-bounds index.
#[scalar(name = "fuzzy_jarowin")]
fn jaronwin(args: &[Value]) -> Value {
    let (Some(lhs), Some(rhs)) = (
        args.first().and_then(|v| v.to_text()),
        args.get(1).and_then(|v| v.to_text()),
    ) else {
        return Value::error(ResultCode::InvalidArgs);
    };
    Value::from_float(jaro_winkler(lhs, rhs))
}
/// Jaro-Winkler similarity of two strings: the Jaro similarity boosted by
/// the length of their common prefix (at most 3 characters, scaling
/// factor 0.1).
fn jaro_winkler(s1: &str, s2: &str) -> f64 {
    let dist = jaro(s1, s2);
    let prefix_len = s1
        .chars()
        .zip(s2.chars())
        .take(3)
        .take_while(|(a, b)| a == b)
        .count();
    dist + (prefix_len as f64) * 0.1 * (1.0 - dist)
}
/// Jaro similarity of two strings, compared character by character.
///
/// Returns 1.0 for identical strings and 0.0 when either string is empty
/// or no characters match within the search window.
fn jaro(s1: &str, s2: &str) -> f64 {
    if s1 == s2 {
        return 1.0;
    }

    let left: Vec<char> = s1.chars().collect();
    let right: Vec<char> = s2.chars().collect();
    let (n1, n2) = (left.len(), right.len());
    if n1 == 0 || n2 == 0 {
        return 0.0;
    }

    // Characters only match when they are at most `window` positions apart.
    let window = (cmp::max(n1, n2) / 2).saturating_sub(1);
    let mut used_left = vec![false; n1];
    let mut used_right = vec![false; n2];
    let mut matches = 0usize;

    for (i, &ch) in left.iter().enumerate() {
        let lo = i.saturating_sub(window);
        let hi = cmp::min(i + window + 1, n2);
        // First still-unused equal character inside the window, if any.
        if let Some(j) = (lo..hi).find(|&j| !used_right[j] && right[j] == ch) {
            used_left[i] = true;
            used_right[j] = true;
            matches += 1;
        }
    }
    if matches == 0 {
        return 0.0;
    }

    // Walk both match sequences in order and count positions that disagree;
    // every transposition is seen twice.
    let mut out_of_order = 0usize;
    let mut cursor = 0usize;
    for (i, &ch) in left.iter().enumerate() {
        if !used_left[i] {
            continue;
        }
        while cursor < n2 && !used_right[cursor] {
            cursor += 1;
        }
        if cursor < n2 && ch != right[cursor] {
            out_of_order += 1;
        }
        cursor += 1;
    }

    let transpositions = out_of_order as f64 / 2.0;
    let matched = matches as f64;
    (matched / n1 as f64 + matched / n2 as f64 + (matched - transpositions) / matched) / 3.0
}
/// SQL function `fuzzy_osadist(A, B)`: the Optimal String Alignment
/// distance between two strings.
///
/// Yields an `InvalidArgs` error when either argument is missing or
/// `to_text` produces no text for it. Arguments are fetched with checked
/// access so a call with too few arguments reports an error instead of
/// panicking on an out-of-bounds index.
#[scalar(name = "fuzzy_osadist")]
fn osadist(args: &[Value]) -> Value {
    let (Some(lhs), Some(rhs)) = (
        args.first().and_then(|v| v.to_text()),
        args.get(1).and_then(|v| v.to_text()),
    ) else {
        return Value::error(ResultCode::InvalidArgs);
    };
    Value::from_integer(optimal_string_alignment(lhs, rhs) as i64)
}
/// Optimal String Alignment distance between `s1` and `s2`, compared
/// character by character.
///
/// The common prefix is skipped by slicing past it (a single pass) rather
/// than by repeatedly removing the first element of each vector.
fn optimal_string_alignment(s1: &str, s2: &str) -> usize {
    let a: Vec<char> = s1.chars().collect();
    let b: Vec<char> = s2.chars().collect();

    // A shared prefix never contributes to the distance.
    let shared = a.iter().zip(&b).take_while(|(x, y)| x == y).count();
    let (a, b) = (&a[shared..], &b[shared..]);
    let (len1, len2) = (a.len(), b.len());

    if len1 == 0 {
        return len2;
    }
    if len2 == 0 {
        return len1;
    }

    // matrix[i][j] = distance between the first i chars of `a` and the
    // first j chars of `b`.
    let mut matrix = vec![vec![0usize; len2 + 1]; len1 + 1];
    for (i, row) in matrix.iter_mut().enumerate() {
        row[0] = i;
    }
    for (j, cell) in matrix[0].iter_mut().enumerate() {
        *cell = j;
    }

    for i in 1..=len1 {
        for j in 1..=len2 {
            let cost = usize::from(a[i - 1] != b[j - 1]);
            let deletion = matrix[i - 1][j] + 1;
            let insertion = matrix[i][j - 1] + 1;
            let substitution = matrix[i - 1][j - 1] + cost;
            matrix[i][j] = deletion.min(insertion).min(substitution);

            // NOTE(review): this transposition test looks one position ahead
            // (`a[i]`, `b[j]`, wrapping at the end) instead of the textbook
            // `a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1]`. The module's
            // unit test expects osadist("abc", "acb") == 2, so the check is
            // kept unchanged — confirm whether this is intentional.
            if i > 1 && j > 1 && a[i % len1] == b[j - 2] && a[i - 2] == b[j % len2] {
                matrix[i][j] = matrix[i][j].min(matrix[i - 2][j - 2] + cost);
            }
        }
    }
    matrix[len1][len2]
}
/// SQL function `fuzzy_soundex(X)`: the Soundex code of `X`.
///
/// Yields NULL when `soundex::soundex` returns `None`, which includes a
/// missing argument and an argument for which `to_text` produces no text.
/// The argument is fetched with checked access so a call without arguments
/// does not panic on an out-of-bounds index.
#[scalar(name = "fuzzy_soundex")]
fn fuzzy_soundex(args: &[Value]) -> Value {
    let text = args.first().and_then(|v| v.to_text());
    match soundex::soundex(text) {
        Some(code) => Value::from_text(code),
        None => Value::null(),
    }
}
/// SQL function `fuzzy_phonetic(X)`: the phonetic hash of `X`.
///
/// Yields NULL when `phonetic::phonetic_hash_str` returns `None`, which
/// includes a missing argument and an argument for which `to_text` produces
/// no text. The argument is fetched with checked access so a call without
/// arguments does not panic on an out-of-bounds index.
#[scalar(name = "fuzzy_phonetic")]
fn fuzzy_phonetic(args: &[Value]) -> Value {
    let text = args.first().and_then(|v| v.to_text());
    match phonetic::phonetic_hash_str(text) {
        Some(hash) => Value::from_text(hash),
        None => Value::null(),
    }
}
/// SQL function `fuzzy_caver(X)`: the Caverphone hash of `X`.
///
/// Yields NULL when `caver::caver_str` returns `None`, which includes a
/// missing argument and an argument for which `to_text` produces no text.
/// The argument is fetched with checked access so a call without arguments
/// does not panic on an out-of-bounds index.
#[scalar(name = "fuzzy_caver")]
fn fuzzy_caver(args: &[Value]) -> Value {
    let text = args.first().and_then(|v| v.to_text());
    match caver::caver_str(text) {
        Some(hash) => Value::from_text(hash),
        None => Value::null(),
    }
}
/// SQL function `fuzzy_rsoundex(X)`: the Refined Soundex code of `X`.
///
/// Yields NULL when `rsoundex::rsoundex` returns `None`, which includes a
/// missing argument and an argument for which `to_text` produces no text.
/// The argument is fetched with checked access so a call without arguments
/// does not panic on an out-of-bounds index.
#[scalar(name = "fuzzy_rsoundex")]
pub fn fuzzy_rsoundex(args: &[Value]) -> Value {
    let text = args.first().and_then(|v| v.to_text());
    match rsoundex::rsoundex(text) {
        Some(code) => Value::from_text(code),
        None => Value::null(),
    }
}
/// SQL function `fuzzy_translit(X)`: convert a string that contains
/// non-ASCII Roman characters into pure ASCII.
///
/// Yields an `InvalidArgs` error when the argument is missing or `to_text`
/// produces no text for it. The argument is fetched with checked access so
/// a call without arguments does not panic on an out-of-bounds index.
#[scalar(name = "fuzzy_translit")]
fn fuzzy_translit(args: &[Value]) -> Value {
    let Some(text) = args.first().and_then(|v| v.to_text()) else {
        return Value::error(ResultCode::InvalidArgs);
    };
    Value::from_text(translit::transliterate_str(text))
}
// Try to determine the dominant script used by the word X and return
// its ISO 15924 numeric code.
//
// The current implementation only understands the following scripts:
//
// 125 (Hebrew)
// 160 (Arabic)
// 200 (Greek)
// 215 (Latin)
// 220 (Cyrillic)
//
// This routine will return 998 if the input X contains characters from
// two or more of the above scripts or 999 if X contains no characters
// from any of the above scripts.
//
// Yields an InvalidArgs error when the argument is missing or `to_text`
// produces no text for it. The argument is fetched with checked access so
// a call without arguments does not panic on an out-of-bounds index.
#[scalar(name = "fuzzy_script")]
pub fn fuzzy_script(args: &[Value]) -> Value {
    let Some(text) = args.first().and_then(|v| v.to_text()) else {
        return Value::error(ResultCode::InvalidArgs);
    };
    let code = translit::script_code(text.as_bytes());
    Value::from_integer(code as i64)
}
//tests adapted from sqlean fuzzy
#[cfg(test)]
mod tests {
    use super::*;

    /// Damerau-Levenshtein: an adjacent swap ("abc" -> "acb") is one edit.
    #[test]
    fn test_damlev() {
        const CASES: [(&str, &str, i64); 7] = [
            ("abc", "abc", 0),
            ("abc", "", 3),
            ("", "abc", 3),
            ("abc", "ab", 1),
            ("abc", "abcd", 1),
            ("abc", "acb", 1),
            ("abc", "ca", 2),
        ];
        for (s1, s2, expected) in CASES {
            let got = damlev(s1, s2);
            assert_eq!(got, expected, "damlev({s1}, {s2}) failed");
        }
    }

    /// Hamming: strings of different length yield -1.
    #[test]
    fn test_hamming() {
        const CASES: [(&str, &str, i64); 5] = [
            ("abc", "abc", 0),
            ("abc", "", -1),
            ("", "abc", -1),
            ("hello", "hellp", 1),
            ("hello", "heloh", 2),
        ];
        for (s1, s2, expected) in CASES {
            let got = hamming_dist(s1, s2);
            assert_eq!(got, expected, "hamming({s1}, {s2}) failed");
        }
    }

    /// Jaro-Winkler: the two inexact expectations are compared after
    /// rounding the result to three decimals.
    #[test]
    fn test_jaro_win() {
        const CASES: [(&str, &str, f64); 5] = [
            ("abc", "abc", 1.0),
            ("abc", "", 0.0),
            ("", "abc", 0.0),
            ("my string", "my tsring", 0.974),
            ("my string", "my ntrisg", 0.896),
        ];
        for (s1, s2, expected) in CASES {
            let got = jaro_winkler(s1, s2);
            let is_rounded_case =
                (expected - 0.974).abs() < 1e-6 || (expected - 0.896).abs() < 1e-6;
            if is_rounded_case {
                let got_rounded = (got * 1000.0).round() / 1000.0;
                assert!(
                    (got_rounded - expected).abs() < 1e-6,
                    "jaro_winkler({s1}, {s2}) failed: got {got_rounded}, expected {expected}"
                );
            } else {
                assert!(
                    (got - expected).abs() < 1e-6,
                    "jaro_winkler({s1}, {s2}) failed: got {got}, expected {expected}"
                );
            }
        }
    }

    /// Levenshtein: an adjacent swap costs two edits.
    #[test]
    fn test_leven() {
        const CASES: [(&str, &str, i64); 7] = [
            ("abc", "abc", 0),
            ("abc", "", 3),
            ("", "abc", 3),
            ("abc", "ab", 1),
            ("abc", "abcd", 1),
            ("abc", "acb", 2),
            ("abc", "ca", 3),
        ];
        for (s1, s2, expected) in CASES {
            let got = leven(s1, s2);
            assert_eq!(got, expected, "leven({s1}, {s2}) failed");
        }
    }

    /// Weighted edit distance from the `editdist` module.
    #[test]
    fn test_edit_distance() {
        const CASES: [(&str, &str, i32); 12] = [
            ("abc", "abc", 0),
            ("abc", "", 300),
            ("", "abc", 75),
            ("abc", "ab", 100),
            ("abc", "abcd", 25),
            ("abc", "acb", 110),
            ("abc", "ca", 225),
            //more cases
            ("awesome", "aewsme", 215),
            ("kitten", "sitting", 105),
            ("flaw", "lawn", 110),
            ("rust", "trust", 100),
            ("gumbo", "gambol", 65),
        ];
        for (s1, s2, expected) in CASES {
            let res = editdist::edit_distance(s1, s2).unwrap();
            assert_eq!(res, expected, "edit_distance({s1}, {s2}) failed");
        }
    }

    /// Optimal String Alignment distance.
    #[test]
    fn test_osadist() {
        const CASES: [(&str, &str, usize); 7] = [
            ("abc", "abc", 0),
            ("abc", "", 3),
            ("", "abc", 3),
            ("abc", "ab", 1),
            ("abc", "abcd", 1),
            ("abc", "acb", 2),
            ("abc", "ca", 3),
        ];
        for (s1, s2, expected) in CASES {
            let got = optimal_string_alignment(s1, s2);
            assert_eq!(got, expected, "osadist({s1}, {s2}) failed");
        }
    }

    /// Soundex codes; `None` input stays `None`.
    #[test]
    fn test_soundex() {
        const CASES: [(Option<&str>, Option<&str>); 5] = [
            (None, None),
            (Some(""), Some("")),
            (Some("phonetics"), Some("P532")),
            (Some("is"), Some("I200")),
            (Some("awesome"), Some("A250")),
        ];
        for (input, expected) in CASES {
            let result = soundex::soundex(input);
            assert_eq!(
                result.as_deref(),
                expected,
                "fuzzy_soundex({input:?}) failed: expected {expected:?}, got {result:?}"
            );
        }
    }

    /// Phonetic hashes; `None` input stays `None`.
    #[test]
    fn test_phonetic() {
        const CASES: [(Option<&str>, Option<&str>); 5] = [
            (None, None),
            (Some(""), Some("")),
            (Some("phonetics"), Some("BAMADAC")),
            (Some("is"), Some("AC")),
            (Some("awesome"), Some("ABACAMA")),
        ];
        for (input, expected) in CASES {
            let result = phonetic::phonetic_hash_str(input);
            assert_eq!(
                result.as_deref(),
                expected,
                "fuzzy_phonetic({input:?}) failed: expected {expected:?}, got {result:?}"
            );
        }
    }

    /// Caverphone hashes; `None` input stays `None`.
    #[test]
    fn test_caver() {
        const CASES: [(Option<&str>, Option<&str>); 5] = [
            (None, None),
            (Some(""), Some("")),
            (Some("phonetics"), Some("FNTKS11111")),
            (Some("is"), Some("AS11111111")),
            (Some("awesome"), Some("AWSM111111")),
        ];
        for (input, expected) in CASES {
            let result = caver::caver_str(input);
            assert_eq!(
                result.as_deref(),
                expected,
                "fuzzy_caver({input:?}) failed: expected {expected:?}, got {result:?}"
            );
        }
    }

    /// Refined Soundex codes; `None` input stays `None`.
    #[test]
    fn test_rsoundex() {
        const CASES: [(Option<&str>, Option<&str>); 5] = [
            (None, None),
            (Some(""), Some("")),
            (Some("phonetics"), Some("P1080603")),
            (Some("is"), Some("I03")),
            (Some("awesome"), Some("A03080")),
        ];
        for (input, expected) in CASES {
            let result = rsoundex::rsoundex(input);
            assert_eq!(
                result.as_deref(),
                expected,
                "fuzzy_rsoundex({input:?}) failed: expected {expected:?}, got {result:?}"
            );
        }
    }
}

View File

@@ -0,0 +1,110 @@
use crate::common::*;
/// Generate a "phonetic hash" from a string of ASCII characters.
///
/// The algorithm:
///   * maps each character to the letter of its character class
///     (`INIT_CLASS` for the first classified character, `MID_CLASS` after)
///   * omits double letters
///   * omits vowels beside R and L
///   * omits T when followed by CH
///   * omits W when followed by R
///   * omits D when followed by J or G
///   * omits K in KN or G in GN at the beginning of a word
///
/// Always returns `Some`; an empty input produces an empty hash.
pub fn phonetic_hash(z_in: &[u8]) -> Option<Vec<u8>> {
    if z_in.is_empty() {
        return Some(Vec::new());
    }

    // Drop the silent leading letter of "gn..." / "kn...".
    let drops_first = z_in.len() > 2 && matches!(z_in[0], b'g' | b'k') && z_in[1] == b'n';
    let word = if drops_first { &z_in[1..] } else { z_in };

    let mut hash: Vec<u8> = Vec::with_capacity(z_in.len() + 1);
    // Class of the previous classified character (silent ones included).
    let mut prev_class = 0x77u8;
    // Class of the previous non-silent character.
    let mut prev_sounded_class = 0x77u8;
    let mut table = &INIT_CLASS;

    for (pos, &raw) in word.iter().enumerate() {
        if let Some(&next) = word.get(pos + 1) {
            if raw == b'w' && next == b'r' {
                continue;
            }
            if raw == b'd' && (next == b'j' || next == b'g') {
                continue;
            }
            if raw == b't' && next == b'c' && word.get(pos + 2) == Some(&b'h') {
                continue;
            }
        }

        let class = table[(raw & 0x7f) as usize];
        if class == CCLASS_SPACE {
            continue;
        }
        // "Other" characters only survive directly after a digit.
        if class == CCLASS_OTHER && prev_class != CCLASS_DIGIT {
            continue;
        }
        table = &MID_CLASS;

        let after_liquid = prev_sounded_class == CCLASS_R || prev_sounded_class == CCLASS_L;
        if class == CCLASS_VOWEL && after_liquid {
            // No vowels after L or R.
            continue;
        }
        let is_liquid = class == CCLASS_R || class == CCLASS_L;
        if is_liquid && prev_sounded_class == CCLASS_VOWEL && !hash.is_empty() {
            // No vowels before L or R: take back the one just emitted.
            hash.pop();
        }

        prev_class = class;
        if class == CCLASS_SILENT {
            continue;
        }
        prev_sounded_class = class;

        let letter = CLASS_NAME.get(class as usize).copied().unwrap_or(b'?');
        if hash.last() != Some(&letter) {
            hash.push(letter);
        }
    }
    Some(hash)
}
pub fn phonetic_hash_str(input: Option<&str>) -> Option<String> {
match input {
None => None,
Some(s) => {
phonetic_hash(s.as_bytes()).map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
}
}
}

View File

@@ -0,0 +1,49 @@
/// Computes the Refined Soundex code of a string.
/// More information about the algorithm can be found here:
/// http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html
///
/// The code is the first character (ASCII-uppercased) followed by the
/// digit of every character, the first one included, with consecutive
/// identical digits collapsed into one.
///
/// `None` stays `None` and an empty string yields an empty code. The input
/// is processed character by character, so a non-ASCII first letter is
/// copied to the output intact rather than being rebuilt from its first
/// UTF-8 byte.
pub fn rsoundex(input: Option<&str>) -> Option<String> {
    let text = input?;
    let Some(first) = text.chars().next() else {
        return Some(String::new());
    };

    let mut code = String::with_capacity(text.len() + 1);
    code.push(first.to_ascii_uppercase());

    let mut prev: Option<char> = None;
    for digit in text.chars().map(refined_soundex_encode) {
        if prev != Some(digit) {
            code.push(digit);
            prev = Some(digit);
        }
    }
    Some(code)
}

/// Maps a character to its Refined Soundex digit, ignoring ASCII case.
/// Vowels and every character without a digit of its own map to `'0'`.
fn refined_soundex_encode(c: char) -> char {
    match c.to_ascii_lowercase() {
        'b' | 'p' => '1',
        'f' | 'v' => '2',
        'c' | 'k' | 's' => '3',
        'g' | 'j' => '4',
        'q' | 'x' | 'z' => '5',
        'd' | 't' => '6',
        'l' => '7',
        'm' | 'n' => '8',
        'r' => '9',
        _ => '0',
    }
}

View File

@@ -0,0 +1,65 @@
/// Computes the classic four-character Soundex code of `input`.
/// https://en.wikipedia.org/wiki/Soundex
///
/// * `None` yields `None`; an empty string yields an empty string.
/// * The code is the first byte (ASCII-uppercased) followed by up to three
///   digits and is right-padded with `'0'` to four positions.
/// * A digit is skipped when it repeats the previous byte's digit, and also
///   when it repeats the digit two bytes back and the byte in between is
///   `h` or `w`.
/// * The input is processed byte by byte; non-ASCII bytes encode as `'0'`.
pub fn soundex(input: Option<&str>) -> Option<String> {
    let word = input?;
    if word.is_empty() {
        return Some(String::new());
    }
    let raw = word.as_bytes();
    let digits: Vec<char> = raw.iter().map(|&b| soundex_encode(b as char)).collect();

    let mut code = String::with_capacity(4);
    code.push(raw[0].to_ascii_uppercase() as char);

    // Number of code positions filled so far (the leading letter counts).
    let mut filled = 1;
    for idx in 1..raw.len() {
        if filled >= 4 {
            break;
        }
        let digit = digits[idx];
        if digit == '0' || digit == digits[idx - 1] {
            continue;
        }
        // Same-coded consonants separated only by 'h' or 'w' count once.
        let joined_by_hw = idx > 1
            && digit == digits[idx - 2]
            && matches!(raw[idx - 1].to_ascii_lowercase(), b'h' | b'w');
        if joined_by_hw {
            continue;
        }
        code.push(digit);
        filled += 1;
    }
    for _ in filled..4 {
        code.push('0');
    }
    Some(code)
}
/// Maps a character to its classic Soundex digit (ASCII case-insensitive).
///
/// Characters outside the six consonant groups (vowels, `h`, `w`, `y`,
/// digits, punctuation, non-ASCII characters) map to `'0'`.
fn soundex_encode(c: char) -> char {
    const GROUPS: [(&str, char); 6] = [
        ("bfpv", '1'),
        ("cgjkqsxz", '2'),
        ("dt", '3'),
        ("l", '4'),
        ("mn", '5'),
        ("r", '6'),
    ];
    let lowered = c.to_ascii_lowercase();
    GROUPS
        .iter()
        .find(|(letters, _)| letters.contains(lowered))
        .map_or('0', |&(_, digit)| digit)
}

View File

@@ -0,0 +1,577 @@
use crate::common::*;
/// Initial code-point bits contributed by a UTF-8 lead byte, indexed by
/// `lead_byte - 0xC0`.
///
/// `utf8_read` seeds the code point with the value found here and then shifts
/// in six more bits for every continuation byte that follows.
static TRANSLIT_UTF8_LOOKUP: [u8; 64] = [
// lead bytes 0xC0..=0xCF and 0xD0..=0xDF: low five bits of the byte
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
// lead bytes 0xE0..=0xEF: low four bits of the byte
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
// lead bytes 0xF0..=0xFF: progressively fewer low bits (three, two, one, none)
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
];
/// One row of the transliteration table: a source code point and the ASCII
/// bytes that replace it.
#[derive(Debug, Clone, Copy)]
struct Transliteration {
    /// Code point being replaced.
    c_from: u16,
    /// First replacement byte.
    c_to0: u8,
    /// Second replacement byte.
    c_to1: u8,
    /// Third replacement byte.
    c_to2: u8,
    /// Fourth replacement byte.
    c_to3: u8,
}

impl Transliteration {
    /// Builds a table row; `const` so that it can be used in `static` initialisers.
    const fn new(c_from: u16, c_to0: u8, c_to1: u8, c_to2: u8, c_to3: u8) -> Self {
        Transliteration {
            c_from,
            c_to0,
            c_to1,
            c_to2,
            c_to3,
        }
    }
}
/// Transliteration table: each row maps one non-ASCII code point to the
/// ASCII bytes that replace it.
///
/// Rows are listed in ascending `c_from` order. `find_translit` runs a binary
/// search keyed on `c_from`, so any row added here must keep the table sorted.
/// `transliterate` always emits `c_to0` and then emits `c_to1`..`c_to3` until
/// it meets the first `0x00`.
static TRANSLIT: [Transliteration; 389] = [
Transliteration::new(0x00A0, b' ', 0x00, 0x00, 0x00), /* NBSP to space */
Transliteration::new(0x00B5, b'u', 0x00, 0x00, 0x00), /* µ to u */
Transliteration::new(0x00C0, b'A', 0x00, 0x00, 0x00), /* À to A */
Transliteration::new(0x00C1, b'A', 0x00, 0x00, 0x00), /* Á to A */
Transliteration::new(0x00C2, b'A', 0x00, 0x00, 0x00), /* Â to A */
Transliteration::new(0x00C3, b'A', 0x00, 0x00, 0x00), /* Ã to A */
Transliteration::new(0x00C4, b'A', b'e', 0x00, 0x00), /* Ä to Ae */
Transliteration::new(0x00C5, b'A', b'a', 0x00, 0x00), /* Å to Aa */
Transliteration::new(0x00C6, b'A', b'E', 0x00, 0x00), /* Æ to AE */
Transliteration::new(0x00C7, b'C', 0x00, 0x00, 0x00), /* Ç to C */
Transliteration::new(0x00C8, b'E', 0x00, 0x00, 0x00), /* È to E */
Transliteration::new(0x00C9, b'E', 0x00, 0x00, 0x00), /* É to E */
Transliteration::new(0x00CA, b'E', 0x00, 0x00, 0x00), /* Ê to E */
Transliteration::new(0x00CB, b'E', 0x00, 0x00, 0x00), /* Ë to E */
Transliteration::new(0x00CC, b'I', 0x00, 0x00, 0x00), /* Ì to I */
Transliteration::new(0x00CD, b'I', 0x00, 0x00, 0x00), /* Í to I */
Transliteration::new(0x00CE, b'I', 0x00, 0x00, 0x00), /* Î to I */
Transliteration::new(0x00CF, b'I', 0x00, 0x00, 0x00), /* Ï to I */
Transliteration::new(0x00D0, b'D', 0x00, 0x00, 0x00), /* Ð to D */
Transliteration::new(0x00D1, b'N', 0x00, 0x00, 0x00), /* Ñ to N */
Transliteration::new(0x00D2, b'O', 0x00, 0x00, 0x00), /* Ò to O */
Transliteration::new(0x00D3, b'O', 0x00, 0x00, 0x00), /* Ó to O */
Transliteration::new(0x00D4, b'O', 0x00, 0x00, 0x00), /* Ô to O */
Transliteration::new(0x00D5, b'O', 0x00, 0x00, 0x00), /* Õ to O */
Transliteration::new(0x00D6, b'O', b'e', 0x00, 0x00), /* Ö to Oe */
Transliteration::new(0x00D7, b'x', 0x00, 0x00, 0x00), /* × to x */
Transliteration::new(0x00D8, b'O', 0x00, 0x00, 0x00), /* Ø to O */
Transliteration::new(0x00D9, b'U', 0x00, 0x00, 0x00), /* Ù to U */
Transliteration::new(0x00DA, b'U', 0x00, 0x00, 0x00), /* Ú to U */
Transliteration::new(0x00DB, b'U', 0x00, 0x00, 0x00), /* Û to U */
Transliteration::new(0x00DC, b'U', b'e', 0x00, 0x00), /* Ü to Ue */
Transliteration::new(0x00DD, b'Y', 0x00, 0x00, 0x00), /* Ý to Y */
Transliteration::new(0x00DE, b'T', b'h', 0x00, 0x00), /* Þ to Th */
Transliteration::new(0x00DF, b's', b's', 0x00, 0x00), /* ß to ss */
Transliteration::new(0x00E0, b'a', 0x00, 0x00, 0x00), /* à to a */
Transliteration::new(0x00E1, b'a', 0x00, 0x00, 0x00), /* á to a */
Transliteration::new(0x00E2, b'a', 0x00, 0x00, 0x00), /* â to a */
Transliteration::new(0x00E3, b'a', 0x00, 0x00, 0x00), /* ã to a */
Transliteration::new(0x00E4, b'a', b'e', 0x00, 0x00), /* ä to ae */
Transliteration::new(0x00E5, b'a', b'a', 0x00, 0x00), /* å to aa */
Transliteration::new(0x00E6, b'a', b'e', 0x00, 0x00), /* æ to ae */
Transliteration::new(0x00E7, b'c', 0x00, 0x00, 0x00), /* ç to c */
Transliteration::new(0x00E8, b'e', 0x00, 0x00, 0x00), /* è to e */
Transliteration::new(0x00E9, b'e', 0x00, 0x00, 0x00), /* é to e */
Transliteration::new(0x00EA, b'e', 0x00, 0x00, 0x00), /* ê to e */
Transliteration::new(0x00EB, b'e', 0x00, 0x00, 0x00), /* ë to e */
Transliteration::new(0x00EC, b'i', 0x00, 0x00, 0x00), /* ì to i */
Transliteration::new(0x00ED, b'i', 0x00, 0x00, 0x00), /* í to i */
Transliteration::new(0x00EE, b'i', 0x00, 0x00, 0x00), /* î to i */
Transliteration::new(0x00EF, b'i', 0x00, 0x00, 0x00), /* ï to i */
Transliteration::new(0x00F0, b'd', 0x00, 0x00, 0x00), /* ð to d */
Transliteration::new(0x00F1, b'n', 0x00, 0x00, 0x00), /* ñ to n */
Transliteration::new(0x00F2, b'o', 0x00, 0x00, 0x00), /* ò to o */
Transliteration::new(0x00F3, b'o', 0x00, 0x00, 0x00), /* ó to o */
Transliteration::new(0x00F4, b'o', 0x00, 0x00, 0x00), /* ô to o */
Transliteration::new(0x00F5, b'o', 0x00, 0x00, 0x00), /* õ to o */
Transliteration::new(0x00F6, b'o', b'e', 0x00, 0x00), /* ö to oe */
Transliteration::new(0x00F7, b':', 0x00, 0x00, 0x00), /* ÷ to : */
Transliteration::new(0x00F8, b'o', 0x00, 0x00, 0x00), /* ø to o */
Transliteration::new(0x00F9, b'u', 0x00, 0x00, 0x00), /* ù to u */
Transliteration::new(0x00FA, b'u', 0x00, 0x00, 0x00), /* ú to u */
Transliteration::new(0x00FB, b'u', 0x00, 0x00, 0x00), /* û to u */
Transliteration::new(0x00FC, b'u', b'e', 0x00, 0x00), /* ü to ue */
Transliteration::new(0x00FD, b'y', 0x00, 0x00, 0x00), /* ý to y */
Transliteration::new(0x00FE, b't', b'h', 0x00, 0x00), /* þ to th */
Transliteration::new(0x00FF, b'y', 0x00, 0x00, 0x00), /* ÿ to y */
Transliteration::new(0x0100, b'A', 0x00, 0x00, 0x00), /* Ā to A */
Transliteration::new(0x0101, b'a', 0x00, 0x00, 0x00), /* ā to a */
Transliteration::new(0x0102, b'A', 0x00, 0x00, 0x00), /* Ă to A */
Transliteration::new(0x0103, b'a', 0x00, 0x00, 0x00), /* ă to a */
Transliteration::new(0x0104, b'A', 0x00, 0x00, 0x00), /* Ą to A */
Transliteration::new(0x0105, b'a', 0x00, 0x00, 0x00), /* ą to a */
Transliteration::new(0x0106, b'C', 0x00, 0x00, 0x00), /* Ć to C */
Transliteration::new(0x0107, b'c', 0x00, 0x00, 0x00), /* ć to c */
Transliteration::new(0x0108, b'C', b'h', 0x00, 0x00), /* Ĉ to Ch */
Transliteration::new(0x0109, b'c', b'h', 0x00, 0x00), /* ĉ to ch */
Transliteration::new(0x010A, b'C', 0x00, 0x00, 0x00), /* Ċ to C */
Transliteration::new(0x010B, b'c', 0x00, 0x00, 0x00), /* ċ to c */
Transliteration::new(0x010C, b'C', 0x00, 0x00, 0x00), /* Č to C */
Transliteration::new(0x010D, b'c', 0x00, 0x00, 0x00), /* č to c */
Transliteration::new(0x010E, b'D', 0x00, 0x00, 0x00), /* Ď to D */
Transliteration::new(0x010F, b'd', 0x00, 0x00, 0x00), /* ď to d */
Transliteration::new(0x0110, b'D', 0x00, 0x00, 0x00), /* Đ to D */
Transliteration::new(0x0111, b'd', 0x00, 0x00, 0x00), /* đ to d */
Transliteration::new(0x0112, b'E', 0x00, 0x00, 0x00), /* Ē to E */
Transliteration::new(0x0113, b'e', 0x00, 0x00, 0x00), /* ē to e */
Transliteration::new(0x0114, b'E', 0x00, 0x00, 0x00), /* Ĕ to E */
Transliteration::new(0x0115, b'e', 0x00, 0x00, 0x00), /* ĕ to e */
Transliteration::new(0x0116, b'E', 0x00, 0x00, 0x00), /* Ė to E */
Transliteration::new(0x0117, b'e', 0x00, 0x00, 0x00), /* ė to e */
Transliteration::new(0x0118, b'E', 0x00, 0x00, 0x00), /* Ę to E */
Transliteration::new(0x0119, b'e', 0x00, 0x00, 0x00), /* ę to e */
Transliteration::new(0x011A, b'E', 0x00, 0x00, 0x00), /* Ě to E */
Transliteration::new(0x011B, b'e', 0x00, 0x00, 0x00), /* ě to e */
Transliteration::new(0x011C, b'G', b'h', 0x00, 0x00), /* Ĝ to Gh */
Transliteration::new(0x011D, b'g', b'h', 0x00, 0x00), /* ĝ to gh */
Transliteration::new(0x011E, b'G', 0x00, 0x00, 0x00), /* Ğ to G */
Transliteration::new(0x011F, b'g', 0x00, 0x00, 0x00), /* ğ to g */
Transliteration::new(0x0120, b'G', 0x00, 0x00, 0x00), /* Ġ to G */
Transliteration::new(0x0121, b'g', 0x00, 0x00, 0x00), /* ġ to g */
Transliteration::new(0x0122, b'G', 0x00, 0x00, 0x00), /* Ģ to G */
Transliteration::new(0x0123, b'g', 0x00, 0x00, 0x00), /* ģ to g */
Transliteration::new(0x0124, b'H', b'h', 0x00, 0x00), /* Ĥ to Hh */
Transliteration::new(0x0125, b'h', b'h', 0x00, 0x00), /* ĥ to hh */
Transliteration::new(0x0126, b'H', 0x00, 0x00, 0x00), /* Ħ to H */
Transliteration::new(0x0127, b'h', 0x00, 0x00, 0x00), /* ħ to h */
Transliteration::new(0x0128, b'I', 0x00, 0x00, 0x00), /* Ĩ to I */
Transliteration::new(0x0129, b'i', 0x00, 0x00, 0x00), /* ĩ to i */
Transliteration::new(0x012A, b'I', 0x00, 0x00, 0x00), /* Ī to I */
Transliteration::new(0x012B, b'i', 0x00, 0x00, 0x00), /* ī to i */
Transliteration::new(0x012C, b'I', 0x00, 0x00, 0x00), /* Ĭ to I */
Transliteration::new(0x012D, b'i', 0x00, 0x00, 0x00), /* ĭ to i */
Transliteration::new(0x012E, b'I', 0x00, 0x00, 0x00), /* Į to I */
Transliteration::new(0x012F, b'i', 0x00, 0x00, 0x00), /* į to i */
Transliteration::new(0x0130, b'I', 0x00, 0x00, 0x00), /* İ to I */
Transliteration::new(0x0131, b'i', 0x00, 0x00, 0x00), /* ı to i */
Transliteration::new(0x0132, b'I', b'J', 0x00, 0x00), /* Ĳ to IJ */
Transliteration::new(0x0133, b'i', b'j', 0x00, 0x00), /* ĳ to ij */
Transliteration::new(0x0134, b'J', b'h', 0x00, 0x00), /* Ĵ to Jh */
Transliteration::new(0x0135, b'j', b'h', 0x00, 0x00), /* ĵ to jh */
Transliteration::new(0x0136, b'K', 0x00, 0x00, 0x00), /* Ķ to K */
Transliteration::new(0x0137, b'k', 0x00, 0x00, 0x00), /* ķ to k */
Transliteration::new(0x0138, b'k', 0x00, 0x00, 0x00), /* ĸ to k */
Transliteration::new(0x0139, b'L', 0x00, 0x00, 0x00), /* Ĺ to L */
Transliteration::new(0x013A, b'l', 0x00, 0x00, 0x00), /* ĺ to l */
Transliteration::new(0x013B, b'L', 0x00, 0x00, 0x00), /* Ļ to L */
Transliteration::new(0x013C, b'l', 0x00, 0x00, 0x00), /* ļ to l */
Transliteration::new(0x013D, b'L', 0x00, 0x00, 0x00), /* Ľ to L */
Transliteration::new(0x013E, b'l', 0x00, 0x00, 0x00), /* ľ to l */
Transliteration::new(0x013F, b'L', b'.', 0x00, 0x00), /* Ŀ to L. */
Transliteration::new(0x0140, b'l', b'.', 0x00, 0x00), /* ŀ to l. */
Transliteration::new(0x0141, b'L', 0x00, 0x00, 0x00), /* Ł to L */
Transliteration::new(0x0142, b'l', 0x00, 0x00, 0x00), /* ł to l */
Transliteration::new(0x0143, b'N', 0x00, 0x00, 0x00), /* Ń to N */
Transliteration::new(0x0144, b'n', 0x00, 0x00, 0x00), /* ń to n */
Transliteration::new(0x0145, b'N', 0x00, 0x00, 0x00), /* Ņ to N */
Transliteration::new(0x0146, b'n', 0x00, 0x00, 0x00), /* ņ to n */
Transliteration::new(0x0147, b'N', 0x00, 0x00, 0x00), /* Ň to N */
Transliteration::new(0x0148, b'n', 0x00, 0x00, 0x00), /* ň to n */
Transliteration::new(0x0149, b'\'', b'n', 0x00, 0x00), /* ŉ to 'n */
Transliteration::new(0x014A, b'N', b'G', 0x00, 0x00), /* Ŋ to NG */
Transliteration::new(0x014B, b'n', b'g', 0x00, 0x00), /* ŋ to ng */
Transliteration::new(0x014C, b'O', 0x00, 0x00, 0x00), /* Ō to O */
Transliteration::new(0x014D, b'o', 0x00, 0x00, 0x00), /* ō to o */
Transliteration::new(0x014E, b'O', 0x00, 0x00, 0x00), /* Ŏ to O */
Transliteration::new(0x014F, b'o', 0x00, 0x00, 0x00), /* ŏ to o */
Transliteration::new(0x0150, b'O', 0x00, 0x00, 0x00), /* Ő to O */
Transliteration::new(0x0151, b'o', 0x00, 0x00, 0x00), /* ő to o */
Transliteration::new(0x0152, b'O', b'E', 0x00, 0x00), /* Œ to OE */
Transliteration::new(0x0153, b'o', b'e', 0x00, 0x00), /* œ to oe */
Transliteration::new(0x0154, b'R', 0x00, 0x00, 0x00), /* Ŕ to R */
Transliteration::new(0x0155, b'r', 0x00, 0x00, 0x00), /* ŕ to r */
Transliteration::new(0x0156, b'R', 0x00, 0x00, 0x00), /* Ŗ to R */
Transliteration::new(0x0157, b'r', 0x00, 0x00, 0x00), /* ŗ to r */
Transliteration::new(0x0158, b'R', 0x00, 0x00, 0x00), /* Ř to R */
Transliteration::new(0x0159, b'r', 0x00, 0x00, 0x00), /* ř to r */
Transliteration::new(0x015A, b'S', 0x00, 0x00, 0x00), /* Ś to S */
Transliteration::new(0x015B, b's', 0x00, 0x00, 0x00), /* ś to s */
Transliteration::new(0x015C, b'S', b'h', 0x00, 0x00), /* Ŝ to Sh */
Transliteration::new(0x015D, b's', b'h', 0x00, 0x00), /* ŝ to sh */
Transliteration::new(0x015E, b'S', 0x00, 0x00, 0x00), /* Ş to S */
Transliteration::new(0x015F, b's', 0x00, 0x00, 0x00), /* ş to s */
Transliteration::new(0x0160, b'S', 0x00, 0x00, 0x00), /* Š to S */
Transliteration::new(0x0161, b's', 0x00, 0x00, 0x00), /* š to s */
Transliteration::new(0x0162, b'T', 0x00, 0x00, 0x00), /* Ţ to T */
Transliteration::new(0x0163, b't', 0x00, 0x00, 0x00), /* ţ to t */
Transliteration::new(0x0164, b'T', 0x00, 0x00, 0x00), /* Ť to T */
Transliteration::new(0x0165, b't', 0x00, 0x00, 0x00), /* ť to t */
Transliteration::new(0x0166, b'T', 0x00, 0x00, 0x00), /* Ŧ to T */
Transliteration::new(0x0167, b't', 0x00, 0x00, 0x00), /* ŧ to t */
Transliteration::new(0x0168, b'U', 0x00, 0x00, 0x00), /* Ũ to U */
Transliteration::new(0x0169, b'u', 0x00, 0x00, 0x00), /* ũ to u */
Transliteration::new(0x016A, b'U', 0x00, 0x00, 0x00), /* Ū to U */
Transliteration::new(0x016B, b'u', 0x00, 0x00, 0x00), /* ū to u */
Transliteration::new(0x016C, b'U', 0x00, 0x00, 0x00), /* Ŭ to U */
Transliteration::new(0x016D, b'u', 0x00, 0x00, 0x00), /* ŭ to u */
Transliteration::new(0x016E, b'U', 0x00, 0x00, 0x00), /* Ů to U */
Transliteration::new(0x016F, b'u', 0x00, 0x00, 0x00), /* ů to u */
Transliteration::new(0x0170, b'U', 0x00, 0x00, 0x00), /* Ű to U */
Transliteration::new(0x0171, b'u', 0x00, 0x00, 0x00), /* ű to u */
Transliteration::new(0x0172, b'U', 0x00, 0x00, 0x00), /* Ų to U */
Transliteration::new(0x0173, b'u', 0x00, 0x00, 0x00), /* ų to u */
Transliteration::new(0x0174, b'W', 0x00, 0x00, 0x00), /* Ŵ to W */
Transliteration::new(0x0175, b'w', 0x00, 0x00, 0x00), /* ŵ to w */
Transliteration::new(0x0176, b'Y', 0x00, 0x00, 0x00), /* Ŷ to Y */
Transliteration::new(0x0177, b'y', 0x00, 0x00, 0x00), /* ŷ to y */
Transliteration::new(0x0178, b'Y', 0x00, 0x00, 0x00), /* Ÿ to Y */
Transliteration::new(0x0179, b'Z', 0x00, 0x00, 0x00), /* Ź to Z */
Transliteration::new(0x017A, b'z', 0x00, 0x00, 0x00), /* ź to z */
Transliteration::new(0x017B, b'Z', 0x00, 0x00, 0x00), /* Ż to Z */
Transliteration::new(0x017C, b'z', 0x00, 0x00, 0x00), /* ż to z */
Transliteration::new(0x017D, b'Z', 0x00, 0x00, 0x00), /* Ž to Z */
Transliteration::new(0x017E, b'z', 0x00, 0x00, 0x00), /* ž to z */
Transliteration::new(0x017F, b's', 0x00, 0x00, 0x00), /* ſ to s */
Transliteration::new(0x0192, b'f', 0x00, 0x00, 0x00), /* ƒ to f */
Transliteration::new(0x0218, b'S', 0x00, 0x00, 0x00), /* Ș to S */
Transliteration::new(0x0219, b's', 0x00, 0x00, 0x00), /* ș to s */
Transliteration::new(0x021A, b'T', 0x00, 0x00, 0x00), /* Ț to T */
Transliteration::new(0x021B, b't', 0x00, 0x00, 0x00), /* ț to t */
Transliteration::new(0x0386, b'A', 0x00, 0x00, 0x00), /* Ά to A */
Transliteration::new(0x0388, b'E', 0x00, 0x00, 0x00), /* Έ to E */
Transliteration::new(0x0389, b'I', 0x00, 0x00, 0x00), /* Ή to I */
Transliteration::new(0x038A, b'I', 0x00, 0x00, 0x00), /* Ί to I */
Transliteration::new(0x038C, b'O', 0x00, 0x00, 0x00), /* Ό to O */
Transliteration::new(0x038E, b'Y', 0x00, 0x00, 0x00), /* Ύ to Y */
Transliteration::new(0x038F, b'O', 0x00, 0x00, 0x00), /* Ώ to O */
Transliteration::new(0x0390, b'i', 0x00, 0x00, 0x00), /* ΐ to i */
Transliteration::new(0x0391, b'A', 0x00, 0x00, 0x00), /* Α to A */
Transliteration::new(0x0392, b'B', 0x00, 0x00, 0x00), /* Β to B */
Transliteration::new(0x0393, b'G', 0x00, 0x00, 0x00), /* Γ to G */
Transliteration::new(0x0394, b'D', 0x00, 0x00, 0x00), /* Δ to D */
Transliteration::new(0x0395, b'E', 0x00, 0x00, 0x00), /* Ε to E */
Transliteration::new(0x0396, b'Z', 0x00, 0x00, 0x00), /* Ζ to Z */
Transliteration::new(0x0397, b'I', 0x00, 0x00, 0x00), /* Η to I */
Transliteration::new(0x0398, b'T', b'h', 0x00, 0x00), /* Θ to Th */
Transliteration::new(0x0399, b'I', 0x00, 0x00, 0x00), /* Ι to I */
Transliteration::new(0x039A, b'K', 0x00, 0x00, 0x00), /* Κ to K */
Transliteration::new(0x039B, b'L', 0x00, 0x00, 0x00), /* Λ to L */
Transliteration::new(0x039C, b'M', 0x00, 0x00, 0x00), /* Μ to M */
Transliteration::new(0x039D, b'N', 0x00, 0x00, 0x00), /* Ν to N */
Transliteration::new(0x039E, b'X', 0x00, 0x00, 0x00), /* Ξ to X */
Transliteration::new(0x039F, b'O', 0x00, 0x00, 0x00), /* Ο to O */
Transliteration::new(0x03A0, b'P', 0x00, 0x00, 0x00), /* Π to P */
Transliteration::new(0x03A1, b'R', 0x00, 0x00, 0x00), /* Ρ to R */
Transliteration::new(0x03A3, b'S', 0x00, 0x00, 0x00), /* Σ to S */
Transliteration::new(0x03A4, b'T', 0x00, 0x00, 0x00), /* Τ to T */
Transliteration::new(0x03A5, b'Y', 0x00, 0x00, 0x00), /* Υ to Y */
Transliteration::new(0x03A6, b'F', 0x00, 0x00, 0x00), /* Φ to F */
Transliteration::new(0x03A7, b'C', b'h', 0x00, 0x00), /* Χ to Ch */
Transliteration::new(0x03A8, b'P', b's', 0x00, 0x00), /* Ψ to Ps */
Transliteration::new(0x03A9, b'O', 0x00, 0x00, 0x00), /* Ω to O */
Transliteration::new(0x03AA, b'I', 0x00, 0x00, 0x00), /* Ϊ to I */
Transliteration::new(0x03AB, b'Y', 0x00, 0x00, 0x00), /* Ϋ to Y */
Transliteration::new(0x03AC, b'a', 0x00, 0x00, 0x00), /* ά to a */
Transliteration::new(0x03AD, b'e', 0x00, 0x00, 0x00), /* έ to e */
Transliteration::new(0x03AE, b'i', 0x00, 0x00, 0x00), /* ή to i */
Transliteration::new(0x03AF, b'i', 0x00, 0x00, 0x00), /* ί to i */
Transliteration::new(0x03B1, b'a', 0x00, 0x00, 0x00), /* α to a */
Transliteration::new(0x03B2, b'b', 0x00, 0x00, 0x00), /* β to b */
Transliteration::new(0x03B3, b'g', 0x00, 0x00, 0x00), /* γ to g */
Transliteration::new(0x03B4, b'd', 0x00, 0x00, 0x00), /* δ to d */
Transliteration::new(0x03B5, b'e', 0x00, 0x00, 0x00), /* ε to e */
Transliteration::new(0x03B6, b'z', 0x00, 0x00, 0x00), /* ζ to z */
Transliteration::new(0x03B7, b'i', 0x00, 0x00, 0x00), /* η to i */
Transliteration::new(0x03B8, b't', b'h', 0x00, 0x00), /* θ to th */
Transliteration::new(0x03B9, b'i', 0x00, 0x00, 0x00), /* ι to i */
Transliteration::new(0x03BA, b'k', 0x00, 0x00, 0x00), /* κ to k */
Transliteration::new(0x03BB, b'l', 0x00, 0x00, 0x00), /* λ to l */
Transliteration::new(0x03BC, b'm', 0x00, 0x00, 0x00), /* μ to m */
Transliteration::new(0x03BD, b'n', 0x00, 0x00, 0x00), /* ν to n */
Transliteration::new(0x03BE, b'x', 0x00, 0x00, 0x00), /* ξ to x */
Transliteration::new(0x03BF, b'o', 0x00, 0x00, 0x00), /* ο to o */
Transliteration::new(0x03C0, b'p', 0x00, 0x00, 0x00), /* π to p */
Transliteration::new(0x03C1, b'r', 0x00, 0x00, 0x00), /* ρ to r */
Transliteration::new(0x03C3, b's', 0x00, 0x00, 0x00), /* σ to s */
Transliteration::new(0x03C4, b't', 0x00, 0x00, 0x00), /* τ to t */
Transliteration::new(0x03C5, b'y', 0x00, 0x00, 0x00), /* υ to y */
Transliteration::new(0x03C6, b'f', 0x00, 0x00, 0x00), /* φ to f */
Transliteration::new(0x03C7, b'c', b'h', 0x00, 0x00), /* χ to ch */
Transliteration::new(0x03C8, b'p', b's', 0x00, 0x00), /* ψ to ps */
Transliteration::new(0x03C9, b'o', 0x00, 0x00, 0x00), /* ω to o */
Transliteration::new(0x03CA, b'i', 0x00, 0x00, 0x00), /* ϊ to i */
Transliteration::new(0x03CB, b'y', 0x00, 0x00, 0x00), /* ϋ to y */
Transliteration::new(0x03CC, b'o', 0x00, 0x00, 0x00), /* ό to o */
Transliteration::new(0x03CD, b'y', 0x00, 0x00, 0x00), /* ύ to y */
Transliteration::new(0x03CE, b'i', 0x00, 0x00, 0x00), /* ώ to i */
Transliteration::new(0x0400, b'E', 0x00, 0x00, 0x00), /* Ѐ to E */
Transliteration::new(0x0401, b'E', 0x00, 0x00, 0x00), /* Ё to E */
Transliteration::new(0x0402, b'D', 0x00, 0x00, 0x00), /* Ђ to D */
Transliteration::new(0x0403, b'G', 0x00, 0x00, 0x00), /* Ѓ to G */
Transliteration::new(0x0404, b'E', 0x00, 0x00, 0x00), /* Є to E */
Transliteration::new(0x0405, b'Z', 0x00, 0x00, 0x00), /* Ѕ to Z */
Transliteration::new(0x0406, b'I', 0x00, 0x00, 0x00), /* І to I */
Transliteration::new(0x0407, b'I', 0x00, 0x00, 0x00), /* Ї to I */
Transliteration::new(0x0408, b'J', 0x00, 0x00, 0x00), /* Ј to J */
Transliteration::new(0x0409, b'I', 0x00, 0x00, 0x00), /* Љ to I */
Transliteration::new(0x040A, b'N', 0x00, 0x00, 0x00), /* Њ to N */
Transliteration::new(0x040B, b'D', 0x00, 0x00, 0x00), /* Ћ to D */
Transliteration::new(0x040C, b'K', 0x00, 0x00, 0x00), /* Ќ to K */
Transliteration::new(0x040D, b'I', 0x00, 0x00, 0x00), /* Ѝ to I */
Transliteration::new(0x040E, b'U', 0x00, 0x00, 0x00), /* Ў to U */
Transliteration::new(0x040F, b'D', 0x00, 0x00, 0x00), /* Џ to D */
Transliteration::new(0x0410, b'A', 0x00, 0x00, 0x00), /* А to A */
Transliteration::new(0x0411, b'B', 0x00, 0x00, 0x00), /* Б to B */
Transliteration::new(0x0412, b'V', 0x00, 0x00, 0x00), /* В to V */
Transliteration::new(0x0413, b'G', 0x00, 0x00, 0x00), /* Г to G */
Transliteration::new(0x0414, b'D', 0x00, 0x00, 0x00), /* Д to D */
Transliteration::new(0x0415, b'E', 0x00, 0x00, 0x00), /* Е to E */
Transliteration::new(0x0416, b'Z', b'h', 0x00, 0x00), /* Ж to Zh */
Transliteration::new(0x0417, b'Z', 0x00, 0x00, 0x00), /* З to Z */
Transliteration::new(0x0418, b'I', 0x00, 0x00, 0x00), /* И to I */
Transliteration::new(0x0419, b'I', 0x00, 0x00, 0x00), /* Й to I */
Transliteration::new(0x041A, b'K', 0x00, 0x00, 0x00), /* К to K */
Transliteration::new(0x041B, b'L', 0x00, 0x00, 0x00), /* Л to L */
Transliteration::new(0x041C, b'M', 0x00, 0x00, 0x00), /* М to M */
Transliteration::new(0x041D, b'N', 0x00, 0x00, 0x00), /* Н to N */
Transliteration::new(0x041E, b'O', 0x00, 0x00, 0x00), /* О to O */
Transliteration::new(0x041F, b'P', 0x00, 0x00, 0x00), /* П to P */
Transliteration::new(0x0420, b'R', 0x00, 0x00, 0x00), /* Р to R */
Transliteration::new(0x0421, b'S', 0x00, 0x00, 0x00), /* С to S */
Transliteration::new(0x0422, b'T', 0x00, 0x00, 0x00), /* Т to T */
Transliteration::new(0x0423, b'U', 0x00, 0x00, 0x00), /* У to U */
Transliteration::new(0x0424, b'F', 0x00, 0x00, 0x00), /* Ф to F */
Transliteration::new(0x0425, b'K', b'h', 0x00, 0x00), /* Х to Kh */
Transliteration::new(0x0426, b'T', b'c', 0x00, 0x00), /* Ц to Tc */
Transliteration::new(0x0427, b'C', b'h', 0x00, 0x00), /* Ч to Ch */
Transliteration::new(0x0428, b'S', b'h', 0x00, 0x00), /* Ш to Sh */
Transliteration::new(0x0429, b'S', b'h', b'c', b'h'), /* Щ to Shch */
Transliteration::new(0x042A, b'a', 0x00, 0x00, 0x00), /* Ъ to a */
Transliteration::new(0x042B, b'Y', 0x00, 0x00, 0x00), /* Ы to Y */
Transliteration::new(0x042C, b'Y', 0x00, 0x00, 0x00), /* Ь to Y */
Transliteration::new(0x042D, b'E', 0x00, 0x00, 0x00), /* Э to E */
Transliteration::new(0x042E, b'I', b'u', 0x00, 0x00), /* Ю to Iu */
Transliteration::new(0x042F, b'I', b'a', 0x00, 0x00), /* Я to Ia */
Transliteration::new(0x0430, b'a', 0x00, 0x00, 0x00), /* а to a */
Transliteration::new(0x0431, b'b', 0x00, 0x00, 0x00), /* б to b */
Transliteration::new(0x0432, b'v', 0x00, 0x00, 0x00), /* в to v */
Transliteration::new(0x0433, b'g', 0x00, 0x00, 0x00), /* г to g */
Transliteration::new(0x0434, b'd', 0x00, 0x00, 0x00), /* д to d */
Transliteration::new(0x0435, b'e', 0x00, 0x00, 0x00), /* е to e */
Transliteration::new(0x0436, b'z', b'h', 0x00, 0x00), /* ж to zh */
Transliteration::new(0x0437, b'z', 0x00, 0x00, 0x00), /* з to z */
Transliteration::new(0x0438, b'i', 0x00, 0x00, 0x00), /* и to i */
Transliteration::new(0x0439, b'i', 0x00, 0x00, 0x00), /* й to i */
Transliteration::new(0x043A, b'k', 0x00, 0x00, 0x00), /* к to k */
Transliteration::new(0x043B, b'l', 0x00, 0x00, 0x00), /* л to l */
Transliteration::new(0x043C, b'm', 0x00, 0x00, 0x00), /* м to m */
Transliteration::new(0x043D, b'n', 0x00, 0x00, 0x00), /* н to n */
Transliteration::new(0x043E, b'o', 0x00, 0x00, 0x00), /* о to o */
Transliteration::new(0x043F, b'p', 0x00, 0x00, 0x00), /* п to p */
Transliteration::new(0x0440, b'r', 0x00, 0x00, 0x00), /* р to r */
Transliteration::new(0x0441, b's', 0x00, 0x00, 0x00), /* с to s */
Transliteration::new(0x0442, b't', 0x00, 0x00, 0x00), /* т to t */
Transliteration::new(0x0443, b'u', 0x00, 0x00, 0x00), /* у to u */
Transliteration::new(0x0444, b'f', 0x00, 0x00, 0x00), /* ф to f */
Transliteration::new(0x0445, b'k', b'h', 0x00, 0x00), /* х to kh */
Transliteration::new(0x0446, b't', b'c', 0x00, 0x00), /* ц to tc */
Transliteration::new(0x0447, b'c', b'h', 0x00, 0x00), /* ч to ch */
Transliteration::new(0x0448, b's', b'h', 0x00, 0x00), /* ш to sh */
Transliteration::new(0x0449, b's', b'h', b'c', b'h'), /* щ to shch */
Transliteration::new(0x044A, b'a', 0x00, 0x00, 0x00), /* ъ to a */
Transliteration::new(0x044B, b'y', 0x00, 0x00, 0x00), /* ы to y */
Transliteration::new(0x044C, b'y', 0x00, 0x00, 0x00), /* ь to y */
Transliteration::new(0x044D, b'e', 0x00, 0x00, 0x00), /* э to e */
Transliteration::new(0x044E, b'i', b'u', 0x00, 0x00), /* ю to iu */
Transliteration::new(0x044F, b'i', b'a', 0x00, 0x00), /* я to ia */
Transliteration::new(0x0450, b'e', 0x00, 0x00, 0x00), /* ѐ to e */
Transliteration::new(0x0451, b'e', 0x00, 0x00, 0x00), /* ё to e */
Transliteration::new(0x0452, b'd', 0x00, 0x00, 0x00), /* ђ to d */
Transliteration::new(0x0453, b'g', 0x00, 0x00, 0x00), /* ѓ to g */
Transliteration::new(0x0454, b'e', 0x00, 0x00, 0x00), /* є to e */
Transliteration::new(0x0455, b'z', 0x00, 0x00, 0x00), /* ѕ to z */
Transliteration::new(0x0456, b'i', 0x00, 0x00, 0x00), /* і to i */
Transliteration::new(0x0457, b'i', 0x00, 0x00, 0x00), /* ї to i */
Transliteration::new(0x0458, b'j', 0x00, 0x00, 0x00), /* ј to j */
Transliteration::new(0x0459, b'i', 0x00, 0x00, 0x00), /* љ to i */
Transliteration::new(0x045A, b'n', 0x00, 0x00, 0x00), /* њ to n */
Transliteration::new(0x045B, b'd', 0x00, 0x00, 0x00), /* ћ to d */
Transliteration::new(0x045C, b'k', 0x00, 0x00, 0x00), /* ќ to k */
Transliteration::new(0x045D, b'i', 0x00, 0x00, 0x00), /* ѝ to i */
Transliteration::new(0x045E, b'u', 0x00, 0x00, 0x00), /* ў to u */
Transliteration::new(0x045F, b'd', 0x00, 0x00, 0x00), /* џ to d */
Transliteration::new(0x1E02, b'B', 0x00, 0x00, 0x00), /* Ḃ to B */
Transliteration::new(0x1E03, b'b', 0x00, 0x00, 0x00), /* ḃ to b */
Transliteration::new(0x1E0A, b'D', 0x00, 0x00, 0x00), /* Ḋ to D */
Transliteration::new(0x1E0B, b'd', 0x00, 0x00, 0x00), /* ḋ to d */
Transliteration::new(0x1E1E, b'F', 0x00, 0x00, 0x00), /* Ḟ to F */
Transliteration::new(0x1E1F, b'f', 0x00, 0x00, 0x00), /* ḟ to f */
Transliteration::new(0x1E40, b'M', 0x00, 0x00, 0x00), /* Ṁ to M */
Transliteration::new(0x1E41, b'm', 0x00, 0x00, 0x00), /* ṁ to m */
Transliteration::new(0x1E56, b'P', 0x00, 0x00, 0x00), /* Ṗ to P */
Transliteration::new(0x1E57, b'p', 0x00, 0x00, 0x00), /* ṗ to p */
Transliteration::new(0x1E60, b'S', 0x00, 0x00, 0x00), /* Ṡ to S */
Transliteration::new(0x1E61, b's', 0x00, 0x00, 0x00), /* ṡ to s */
Transliteration::new(0x1E6A, b'T', 0x00, 0x00, 0x00), /* Ṫ to T */
Transliteration::new(0x1E6B, b't', 0x00, 0x00, 0x00), /* ṫ to t */
Transliteration::new(0x1E80, b'W', 0x00, 0x00, 0x00), /* Ẁ to W */
Transliteration::new(0x1E81, b'w', 0x00, 0x00, 0x00), /* ẁ to w */
Transliteration::new(0x1E82, b'W', 0x00, 0x00, 0x00), /* Ẃ to W */
Transliteration::new(0x1E83, b'w', 0x00, 0x00, 0x00), /* ẃ to w */
Transliteration::new(0x1E84, b'W', 0x00, 0x00, 0x00), /* Ẅ to W */
Transliteration::new(0x1E85, b'w', 0x00, 0x00, 0x00), /* ẅ to w */
Transliteration::new(0x1EF2, b'Y', 0x00, 0x00, 0x00), /* Ỳ to Y */
Transliteration::new(0x1EF3, b'y', 0x00, 0x00, 0x00), /* ỳ to y */
Transliteration::new(0xFB00, b'f', b'f', 0x00, 0x00), /* ﬀ to ff */
Transliteration::new(0xFB01, b'f', b'i', 0x00, 0x00), /* ﬁ to fi */
Transliteration::new(0xFB02, b'f', b'l', 0x00, 0x00), /* ﬂ to fl */
Transliteration::new(0xFB05, b's', b't', 0x00, 0x00), /* ﬅ to st */
Transliteration::new(0xFB06, b's', b't', 0x00, 0x00), /* ﬆ to st */
];
/// Return the value of the first UTF-8 character in the string
fn utf8_read(z: &[u8]) -> (u32, usize) {
if z.is_empty() {
return (0, 0);
}
let first_byte = z[0];
if first_byte < 0x80 {
(first_byte as u32, 1)
} else {
let lookup_index = (first_byte - 0xc0) as usize;
if lookup_index >= TRANSLIT_UTF8_LOOKUP.len() {
return (first_byte as u32, 1);
}
let mut c = TRANSLIT_UTF8_LOOKUP[lookup_index] as u32;
let mut i = 1;
while i < z.len() && (z[i] & 0xc0) == 0x80 {
c = (c << 6) + ((z[i] & 0x3f) as u32);
i += 1;
}
(c, i)
}
}
/// Looks up the transliteration entry for code point `c` via binary search
/// over `TRANSLIT` (which is ordered by `c_from`).
///
/// Returns `None` when the table has no entry for `c`. The comparison is done
/// in `u32`, so code points above U+FFFF never match: narrowing `c` to `u16`
/// first would make e.g. U+100E9 alias the entry for U+00E9.
fn find_translit(c: u32) -> Option<&'static Transliteration> {
    TRANSLIT
        .binary_search_by_key(&c, |entry| u32::from(entry.c_from))
        .ok()
        .map(|idx| &TRANSLIT[idx])
}
/// Converts UTF-8 `input` to pure ASCII.
///
/// ASCII code points are copied through. Any other code point is replaced by
/// the one to four bytes of its `find_translit` entry, or by `?` when there
/// is no entry.
pub fn transliterate(input: &[u8]) -> Vec<u8> {
    // A single code point expands to at most four output bytes.
    let mut ascii = Vec::with_capacity(input.len() * 4);
    let mut rest = input;
    while !rest.is_empty() {
        let (code_point, consumed) = utf8_read(rest);
        rest = &rest[consumed..];

        if code_point <= 127 {
            ascii.push(code_point as u8);
            continue;
        }
        match find_translit(code_point) {
            Some(entry) => {
                // The first byte is always emitted; the optional ones stop at
                // the first 0x00 slot.
                ascii.push(entry.c_to0);
                ascii.extend(
                    [entry.c_to1, entry.c_to2, entry.c_to3]
                        .iter()
                        .copied()
                        .take_while(|&byte| byte != 0),
                );
            }
            None => ascii.push(b'?'),
        }
    }
    ascii
}
/// String wrapper around `transliterate`.
///
/// Falls back to the single-character string `"?"` if the transliterated
/// bytes are not valid UTF-8.
pub fn transliterate_str(input: &str) -> String {
    match String::from_utf8(transliterate(input.as_bytes())) {
        Ok(ascii) => ascii,
        Err(_) => String::from("?"),
    }
}
/// Classifies the script used by UTF-8 `input` and returns a numeric code:
///
/// * `215` Latin, `220` Cyrillic, `200` Greek, `125` Hebrew, `160` Arabic
/// * `998` when more than one of those scripts occurs
/// * `999` when none of them occurs (including empty input)
///
/// ASCII digits on their own count as Latin but do not make otherwise
/// non-Latin text "mixed".
///
/// NOTE(review): every other code point below U+02AF is counted as Latin,
/// including ASCII spaces and punctuation, so e.g. Cyrillic words separated
/// by a space report `998`. Confirm that this is intended.
pub fn script_code(input: &[u8]) -> i32 {
    let mut scripts = 0;
    let mut digit_seen = false;
    let mut rest = input;
    while !rest.is_empty() {
        let (c, consumed) = utf8_read(rest);
        rest = &rest[consumed..];
        match c {
            // ASCII '0'..='9'
            0x30..=0x39 => digit_seen = true,
            0x0000..=0x02ae => scripts |= SCRIPT_LATIN,
            0x0400..=0x04ff => scripts |= SCRIPT_CYRILLIC,
            0x0386..=0x03ce => scripts |= SCRIPT_GREEK,
            0x0590..=0x05ff => scripts |= SCRIPT_HEBREW,
            0x0600..=0x06ff => scripts |= SCRIPT_ARABIC,
            _ => {}
        }
    }
    if digit_seen && scripts == 0 {
        scripts = SCRIPT_LATIN;
    }
    match scripts {
        0 => 999,
        SCRIPT_LATIN => 215,
        SCRIPT_CYRILLIC => 220,
        SCRIPT_GREEK => 200,
        SCRIPT_HEBREW => 125,
        SCRIPT_ARABIC => 160,
        _ => 998,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII decodes to itself; a two-byte sequence reports its code point and length.
    #[test]
    fn test_utf8_read() {
        let bytes = "Café".as_bytes();
        assert_eq!(utf8_read(bytes), (u32::from(b'C'), 1));
        // 'é' (U+00E9) starts at byte offset 3 and occupies two bytes.
        assert_eq!(utf8_read(&bytes[3..]), (0x00E9, 2));
    }

    /// Accented Latin letters collapse to their base letter.
    #[test]
    fn test_transliterate_basic() {
        for &(input, expected) in &[("Café", "Cafe"), ("Naïve", "Naive")] {
            assert_eq!(transliterate_str(input), expected);
        }
    }

    /// German umlauts and sharp s expand to two letters.
    #[test]
    fn test_transliterate_german() {
        for &(input, expected) in &[("Müller", "Mueller"), ("Größe", "Groesse")] {
            assert_eq!(transliterate_str(input), expected);
        }
    }

    /// Single-script inputs map to their code; mixed scripts map to 998.
    #[test]
    fn test_script_code() {
        let cases = [
            ("Hello", 215),
            ("123", 215),
            ("привет", 220),
            ("γειά", 200),
            ("helloпривет", 998),
        ];
        for &(text, code) in &cases {
            assert_eq!(script_code(text.as_bytes()), code);
        }
    }
}

View File

@@ -560,6 +560,122 @@ def test_ipaddr():
)
limbo.quit()
def validate_fuzzy_leven(a):
return a == "3"
def validate_fuzzy_damlev1(a):
return a == "2"
def validate_fuzzy_damlev2(a):
return a == "1"
def validate_fuzzy_editdist1(a):
return a == "225"
def validate_fuzzy_editdist2(a):
return a == "110"
def validate_fuzzy_jarowin(a):
return a == "0.907142857142857"
def validate_fuzzy_osadist(a):
return a == "3"
def validate_fuzzy_soundex(a):
return a == "A250"
def validate_fuzzy_phonetic(a):
return a == "ABACAMA"
def validate_fuzzy_caver(a):
return a == "AWSM111111"
def validate_fuzzy_rsoundex(a):
return a == "A03080"
def validate_fuzzy_translit1(a):
    """Return True when `a` is exactly "oh my ?".

    test_fuzzy uses this for fuzzy_translit('oh my 😅'); the expected text
    has "?" in place of the emoji.
    """
    expected = "oh my ?"
    return a == expected
def validate_fuzzy_translit2(a):
    """Return True when `a` is exactly "privet".

    test_fuzzy uses this for fuzzy_translit('привет').
    """
    expected = "privet"
    return a == expected
def validate_fuzzy_script(a):
    """Return True when `a` is exactly "160".

    test_fuzzy uses this for fuzzy_script on an Arabic-script string.
    """
    expected = "160"
    return a == expected
def test_fuzzy():
    """Exercise the fuzzy-string extension through the CLI shell.

    First confirms that a fuzzy function is rejected before the extension is
    loaded, then loads the extension and checks one known query/result pair
    for each function using the validate_fuzzy_* helpers. The shell is closed
    at the end, as the other extension tests in this file do.
    """
    limbo = TestTursoShell()
    ext_path = "./target/debug/liblimbo_fuzzy"

    # Before `.load`, the function must not be resolvable.
    limbo.run_test_fn(
        "SELECT fuzzy_leven('awesome', 'aewsme');",
        lambda res: "error: no such function: " in res,
        "fuzzy levenshtein function returns null when ext not loaded",
    )

    limbo.execute_dot(f".load {ext_path}")

    # Edit-distance style functions.
    limbo.run_test_fn(
        "SELECT fuzzy_leven('awesome', 'aewsme');",
        validate_fuzzy_leven,
        "fuzzy levenshtein function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_damlev('awesome', 'aewsme');",
        validate_fuzzy_damlev1,
        "fuzzy damerau levenshtein1 function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_damlev('Something', 'Smoething');",
        validate_fuzzy_damlev2,
        "fuzzy damerau levenshtein2 function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_editdist('abc', 'ca');",
        validate_fuzzy_editdist1,
        "fuzzy editdist1 function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_editdist('abc', 'acb');",
        validate_fuzzy_editdist2,
        "fuzzy editdist2 function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_jarowin('awesome', 'aewsme');",
        validate_fuzzy_jarowin,
        "fuzzy jarowin function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_osadist('awesome', 'aewsme');",
        validate_fuzzy_osadist,
        "fuzzy osadist function works",
    )

    # Phonetic encoders.
    # validate_fuzzy_soundex was previously defined but never exercised.
    limbo.run_test_fn(
        "SELECT fuzzy_soundex('awesome');",
        validate_fuzzy_soundex,
        "fuzzy soundex function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_phonetic('awesome');",
        validate_fuzzy_phonetic,
        "fuzzy phonetic function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_caver('awesome');",
        validate_fuzzy_caver,
        "fuzzy caver function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_rsoundex('awesome');",
        validate_fuzzy_rsoundex,
        "fuzzy rsoundex function works",
    )

    # Transliteration and script detection.
    limbo.run_test_fn(
        "SELECT fuzzy_translit('oh my 😅');",
        validate_fuzzy_translit1,
        "fuzzy translit1 function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_translit('привет');",
        validate_fuzzy_translit2,
        "fuzzy translit2 function works",
    )
    limbo.run_test_fn(
        "SELECT fuzzy_script('داناوانب');",
        validate_fuzzy_script,
        "fuzzy script function works",
    )

    # Shut the shell down so it is not left running for the next test.
    limbo.quit()
def test_vfs():
limbo = TestTursoShell()
@@ -822,6 +938,7 @@ def main():
test_kv()
test_csv()
test_tablestats()
test_fuzzy()
except Exception as e:
console.error(f"Test FAILED: {e}")
cleanup()