diff --git a/Cargo.lock b/Cargo.lock index ccd556376..6a5e3d900 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -523,10 +523,11 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.17" +version = "1.2.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fcb57c740ae1daf453ae85f16e37396f672b039e00d9d866e07ddb24e328e3a" +checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" dependencies = [ + "find-msvc-tools", "jobserver", "libc", "shlex", @@ -1504,6 +1505,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "find-msvc-tools" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" + [[package]] name = "findshlibs" version = "0.10.2" @@ -4143,6 +4150,15 @@ dependencies = [ "similar", ] +[[package]] +name = "simsimd" +version = "6.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e3f209c5a8155b8458b1a0d3a6fc9fa09d201e6086fdaae18e9e283b9274f8f" +dependencies = [ + "cc", +] + [[package]] name = "slab" version = "0.4.9" @@ -4925,6 +4941,7 @@ dependencies = [ "rustix 1.0.7", "ryu", "serde", + "simsimd", "sorted-vec", "strum", "strum_macros", diff --git a/bindings/javascript/packages/wasm/promise.test.ts b/bindings/javascript/packages/wasm/promise.test.ts index d80dee8b7..7cdd8bc1b 100644 --- a/bindings/javascript/packages/wasm/promise.test.ts +++ b/bindings/javascript/packages/wasm/promise.test.ts @@ -1,6 +1,19 @@ import { expect, test } from 'vitest' import { connect, Database } from './promise-default.js' +test('vector-test', async () => { + const db = await connect(":memory:"); + const v1 = new Array(1024).fill(0).map((_, i) => i); + const v2 = new Array(1024).fill(0).map((_, i) => 1024 - i); + const result = await db.prepare(`SELECT + vector_distance_cos(vector32('${JSON.stringify(v1)}'), vector32('${JSON.stringify(v2)}')) as cosf32, + vector_distance_cos(vector64('${JSON.stringify(v1)}'), vector64('${JSON.stringify(v2)}')) as cosf64, + vector_distance_l2(vector32('${JSON.stringify(v1)}'), vector32('${JSON.stringify(v2)}')) as l2f32, + vector_distance_l2(vector64('${JSON.stringify(v1)}'), vector64('${JSON.stringify(v2)}')) as l2f64 + `).all(); + console.info(result); +}) + test('explain', async () => { const db = await connect(":memory:"); const stmt = db.prepare("EXPLAIN SELECT 1"); diff --git a/core/Cargo.toml b/core/Cargo.toml index d5651a2e4..cae081cb9 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -84,6 +84,7 @@ aegis = "0.9.0" twox-hash = "2.1.1" intrusive-collections = "0.9.7" roaring = "0.11.2" +simsimd = "6.5.3" [build-dependencies] chrono = { workspace = true, default-features = false } diff --git a/core/vector/operations/distance_cos.rs b/core/vector/operations/distance_cos.rs index aaa2c86f6..437d069c6 100644 --- a/core/vector/operations/distance_cos.rs +++ b/core/vector/operations/distance_cos.rs @@ -2,6 +2,7 @@ use crate::{ vector::vector_types::{Vector, VectorSparse, VectorType}, LimboError, Result, }; +use simsimd::SpatialSimilarity; pub fn vector_distance_cos(v1: &Vector, v2: &Vector) -> Result { if v1.dims != v2.dims { @@ -15,11 +16,23 @@ pub fn vector_distance_cos(v1: &Vector, v2: &Vector) -> Result { )); } match v1.vector_type { - VectorType::Float32Dense => Ok(vector_f32_distance_cos( + #[cfg(not(target_family = "wasm"))] + VectorType::Float32Dense => Ok(vector_f32_distance_cos_simsimd( v1.as_f32_slice(), v2.as_f32_slice(), )), - VectorType::Float64Dense => Ok(vector_f64_distance_cos( + #[cfg(target_family = "wasm")] + VectorType::Float32Dense => Ok(vector_f32_distance_cos_rust( + v1.as_f32_slice(), + v2.as_f32_slice(), + )), + #[cfg(not(target_family = "wasm"))] + VectorType::Float64Dense => Ok(vector_f64_distance_cos_simsimd( + v1.as_f64_slice(), + v2.as_f64_slice(), + )), + #[cfg(target_family = "wasm")] + VectorType::Float64Dense => Ok(vector_f64_distance_cos_rust( v1.as_f64_slice(), v2.as_f64_slice(), )), @@ -30,44 +43,44 @@ pub fn vector_distance_cos(v1: &Vector, v2: &Vector) -> Result { } } -fn vector_f32_distance_cos(v1: &[f32], v2: &[f32]) -> f64 { - let (mut dot, mut norm1, mut norm2) = (0.0, 0.0, 0.0); - - let dims = v1.len(); - for i in 0..dims { - let e1 = v1[i]; - let e2 = v2[i]; - dot += e1 * e2; - norm1 += e1 * e1; - norm2 += e2 * e2; - } - - // Check for zero norms to avoid division by zero - if norm1 == 0.0 || norm2 == 0.0 { - return f64::NAN; - } - - 1.0 - (dot / (norm1 * norm2).sqrt()) as f64 +#[allow(dead_code)] +fn vector_f32_distance_cos_simsimd(v1: &[f32], v2: &[f32]) -> f64 { + f32::cosine(v1, v2).unwrap_or(f64::NAN) } -fn vector_f64_distance_cos(v1: &[f64], v2: &[f64]) -> f64 { +// SimSIMD do not support WASM for now, so we have alternative implementation: https://github.com/ashvardanian/SimSIMD/issues/189 +#[allow(dead_code)] +fn vector_f32_distance_cos_rust(v1: &[f32], v2: &[f32]) -> f64 { let (mut dot, mut norm1, mut norm2) = (0.0, 0.0, 0.0); - - let dims = v1.len(); - for i in 0..dims { - let e1 = v1[i]; - let e2 = v2[i]; - dot += e1 * e2; - norm1 += e1 * e1; - norm2 += e2 * e2; + for (a, b) in v1.iter().zip(v2.iter()) { + dot += a * b; + norm1 += a * a; + norm2 += b * b; } - - // Check for zero norms if norm1 == 0.0 || norm2 == 0.0 { - return f64::NAN; + return 0.0; } + (1.0 - dot / (norm1 * norm2).sqrt()) as f64 +} - 1.0 - (dot / (norm1 * norm2).sqrt()) +#[allow(dead_code)] +fn vector_f64_distance_cos_simsimd(v1: &[f64], v2: &[f64]) -> f64 { + f64::cosine(v1, v2).unwrap_or(f64::NAN) +} + +// SimSIMD do not support WASM for now, so we have alternative implementation: https://github.com/ashvardanian/SimSIMD/issues/189 +#[allow(dead_code)] +fn vector_f64_distance_cos_rust(v1: &[f64], v2: &[f64]) -> f64 { + let (mut dot, mut norm1, mut norm2) = (0.0, 0.0, 0.0); + for (a, b) in v1.iter().zip(v2.iter()) { + dot += a * b; + norm1 += a * a; + norm2 += b * b; + } + if norm1 == 0.0 || norm2 == 0.0 { + return 0.0; + } + 1.0 - dot / (norm1 * norm2).sqrt() } fn vector_f32_sparse_distance_cos(v1: VectorSparse, v2: VectorSparse) -> f64 { @@ -120,20 +133,26 @@ mod tests { #[test] fn test_vector_distance_cos_f32() { - assert!(vector_f32_distance_cos(&[], &[]).is_nan()); - assert!(vector_f32_distance_cos(&[1.0, 2.0], &[0.0, 0.0]).is_nan()); - assert_eq!(vector_f32_distance_cos(&[1.0, 2.0], &[1.0, 2.0]), 0.0); - assert_eq!(vector_f32_distance_cos(&[1.0, 2.0], &[-1.0, -2.0]), 2.0); - assert_eq!(vector_f32_distance_cos(&[1.0, 2.0], &[-2.0, 1.0]), 1.0); + assert_eq!(vector_f32_distance_cos_simsimd(&[], &[]), 0.0); + assert_eq!( + vector_f32_distance_cos_simsimd(&[1.0, 2.0], &[0.0, 0.0]), + 1.0 + ); + assert!(vector_f32_distance_cos_simsimd(&[1.0, 2.0], &[1.0, 2.0]).abs() < 1e-9); + assert!((vector_f32_distance_cos_simsimd(&[1.0, 2.0], &[-1.0, -2.0]) - 2.0).abs() < 1e-9); + assert!((vector_f32_distance_cos_simsimd(&[1.0, 2.0], &[-2.0, 1.0]) - 1.0).abs() < 1e-9); } #[test] fn test_vector_distance_cos_f64() { - assert!(vector_f64_distance_cos(&[], &[]).is_nan()); - assert!(vector_f64_distance_cos(&[1.0, 2.0], &[0.0, 0.0]).is_nan()); - assert_eq!(vector_f64_distance_cos(&[1.0, 2.0], &[1.0, 2.0]), 0.0); - assert_eq!(vector_f64_distance_cos(&[1.0, 2.0], &[-1.0, -2.0]), 2.0); - assert_eq!(vector_f64_distance_cos(&[1.0, 2.0], &[-2.0, 1.0]), 1.0); + assert_eq!(vector_f64_distance_cos_simsimd(&[], &[]), 0.0); + assert_eq!( + vector_f64_distance_cos_simsimd(&[1.0, 2.0], &[0.0, 0.0]), + 1.0 + ); + assert!(vector_f64_distance_cos_simsimd(&[1.0, 2.0], &[1.0, 2.0]).abs() < 1e-9); + assert!((vector_f64_distance_cos_simsimd(&[1.0, 2.0], &[-1.0, -2.0]) - 2.0).abs() < 1e-9); + assert!((vector_f64_distance_cos_simsimd(&[1.0, 2.0], &[-2.0, 1.0]) - 1.0).abs() < 1e-9); } #[test] @@ -148,7 +167,7 @@ mod tests { idx: &[1, 2], values: &[1.0, 3.0] }, - ) - vector_f32_distance_cos(&[1.0, 2.0, 0.0], &[0.0, 1.0, 3.0])) + ) - vector_f32_distance_cos_simsimd(&[1.0, 2.0, 0.0], &[0.0, 1.0, 3.0])) .abs() < 1e-7 ); @@ -169,4 +188,30 @@ mod tests { (d1.is_nan() && d2.is_nan()) || (d1 - d2).abs() < 1e-6 } + + #[quickcheck] + fn prop_vector_distance_cos_rust_vs_simsimd_f32( + v1: ArbitraryVector<100>, + v2: ArbitraryVector<100>, + ) -> bool { + let v1 = vector_convert(v1.into(), VectorType::Float32Dense).unwrap(); + let v2 = vector_convert(v2.into(), VectorType::Float32Dense).unwrap(); + let d1 = vector_f32_distance_cos_rust(v1.as_f32_slice(), v2.as_f32_slice()); + let d2 = vector_f32_distance_cos_simsimd(v1.as_f32_slice(), v2.as_f32_slice()); + println!("d1 vs d2: {} vs {}", d1, d2); + (d1.is_nan() && d2.is_nan()) || (d1 - d2).abs() < 1e-4 + } + + #[quickcheck] + fn prop_vector_distance_cos_rust_vs_simsimd_f64( + v1: ArbitraryVector<100>, + v2: ArbitraryVector<100>, + ) -> bool { + let v1 = vector_convert(v1.into(), VectorType::Float64Dense).unwrap(); + let v2 = vector_convert(v2.into(), VectorType::Float64Dense).unwrap(); + let d1 = vector_f64_distance_cos_rust(v1.as_f64_slice(), v2.as_f64_slice()); + let d2 = vector_f64_distance_cos_simsimd(v1.as_f64_slice(), v2.as_f64_slice()); + println!("d1 vs d2: {} vs {}", d1, d2); + (d1.is_nan() && d2.is_nan()) || (d1 - d2).abs() < 1e-6 + } } diff --git a/core/vector/operations/distance_l2.rs b/core/vector/operations/distance_l2.rs index 68d01857a..84f21db14 100644 --- a/core/vector/operations/distance_l2.rs +++ b/core/vector/operations/distance_l2.rs @@ -2,6 +2,7 @@ use crate::{ vector::vector_types::{Vector, VectorSparse, VectorType}, LimboError, Result, }; +use simsimd::SpatialSimilarity; pub fn vector_distance_l2(v1: &Vector, v2: &Vector) -> Result { if v1.dims != v2.dims { @@ -15,12 +16,26 @@ pub fn vector_distance_l2(v1: &Vector, v2: &Vector) -> Result { )); } match v1.vector_type { - VectorType::Float32Dense => { - Ok(vector_f32_distance_l2(v1.as_f32_slice(), v2.as_f32_slice())) - } - VectorType::Float64Dense => { - Ok(vector_f64_distance_l2(v1.as_f64_slice(), v2.as_f64_slice())) - } + #[cfg(not(target_family = "wasm"))] + VectorType::Float32Dense => Ok(vector_f32_distance_l2_simsimd( + v1.as_f32_slice(), + v2.as_f32_slice(), + )), + #[cfg(target_family = "wasm")] + VectorType::Float32Dense => Ok(vector_f32_distance_l2_rust( + v1.as_f32_slice(), + v2.as_f32_slice(), + )), + #[cfg(not(target_family = "wasm"))] + VectorType::Float64Dense => Ok(vector_f64_distance_l2_simsimd( + v1.as_f64_slice(), + v2.as_f64_slice(), + )), + #[cfg(target_family = "wasm")] + VectorType::Float64Dense => Ok(vector_f64_distance_l2_rust( + v1.as_f64_slice(), + v2.as_f64_slice(), + )), VectorType::Float32Sparse => Ok(vector_f32_sparse_distance_l2( v1.as_f32_sparse(), v2.as_f32_sparse(), @@ -28,7 +43,14 @@ pub fn vector_distance_l2(v1: &Vector, v2: &Vector) -> Result { } } -fn vector_f32_distance_l2(v1: &[f32], v2: &[f32]) -> f64 { +#[allow(dead_code)] +fn vector_f32_distance_l2_simsimd(v1: &[f32], v2: &[f32]) -> f64 { + f32::euclidean(v1, v2).unwrap_or(f64::NAN) +} + +// SimSIMD do not support WASM for now, so we have alternative implementation: https://github.com/ashvardanian/SimSIMD/issues/189 +#[allow(dead_code)] +fn vector_f32_distance_l2_rust(v1: &[f32], v2: &[f32]) -> f64 { let sum = v1 .iter() .zip(v2.iter()) @@ -37,7 +59,14 @@ fn vector_f32_distance_l2(v1: &[f32], v2: &[f32]) -> f64 { sum.sqrt() } -fn vector_f64_distance_l2(v1: &[f64], v2: &[f64]) -> f64 { +#[allow(dead_code)] +fn vector_f64_distance_l2_simsimd(v1: &[f64], v2: &[f64]) -> f64 { + f64::euclidean(v1, v2).unwrap_or(f64::NAN) +} + +// SimSIMD do not support WASM for now, so we have alternative implementation: https://github.com/ashvardanian/SimSIMD/issues/189 +#[allow(dead_code)] +fn vector_f64_distance_l2_rust(v1: &[f64], v2: &[f64]) -> f64 { let sum = v1 .iter() .zip(v2.iter()) @@ -102,7 +131,7 @@ mod tests { ]; let results = vectors .iter() - .map(|v| vector_f32_distance_l2(&query, v)) + .map(|v| vector_f32_distance_l2_rust(&query, v)) .collect::>(); assert_eq!(results, expected); } @@ -111,41 +140,41 @@ mod tests { fn test_vector_distance_l2_odd_len() { let v = (0..5).map(|x| x as f32).collect::>(); let query = (2..7).map(|x| x as f32).collect::>(); - assert_eq!(vector_f32_distance_l2(&v, &query), 20.0_f64.sqrt()); + assert_eq!(vector_f32_distance_l2_rust(&v, &query), 20.0_f64.sqrt()); } #[test] fn test_vector_distance_l2_f32() { - assert_eq!(vector_f32_distance_l2(&[], &[]), 0.0); + assert_eq!(vector_f32_distance_l2_rust(&[], &[]), 0.0); assert_eq!( - vector_f32_distance_l2(&[1.0, 2.0], &[0.0, 0.0]), + vector_f32_distance_l2_rust(&[1.0, 2.0], &[0.0, 0.0]), (1f64 + 2f64 * 2f64).sqrt() ); - assert_eq!(vector_f32_distance_l2(&[1.0, 2.0], &[1.0, 2.0]), 0.0); + assert_eq!(vector_f32_distance_l2_rust(&[1.0, 2.0], &[1.0, 2.0]), 0.0); assert_eq!( - vector_f32_distance_l2(&[1.0, 2.0], &[-1.0, -2.0]), + vector_f32_distance_l2_rust(&[1.0, 2.0], &[-1.0, -2.0]), (2f64 * 2f64 + 4f64 * 4f64).sqrt() ); assert_eq!( - vector_f32_distance_l2(&[1.0, 2.0], &[-2.0, 1.0]), + vector_f32_distance_l2_rust(&[1.0, 2.0], &[-2.0, 1.0]), (3f64 * 3f64 + 1f64 * 1f64).sqrt() ); } #[test] fn test_vector_distance_l2_f64() { - assert_eq!(vector_f64_distance_l2(&[], &[]), 0.0); + assert_eq!(vector_f64_distance_l2_rust(&[], &[]), 0.0); assert_eq!( - vector_f64_distance_l2(&[1.0, 2.0], &[0.0, 0.0]), + vector_f64_distance_l2_rust(&[1.0, 2.0], &[0.0, 0.0]), (1f64 + 2f64 * 2f64).sqrt() ); - assert_eq!(vector_f64_distance_l2(&[1.0, 2.0], &[1.0, 2.0]), 0.0); + assert_eq!(vector_f64_distance_l2_rust(&[1.0, 2.0], &[1.0, 2.0]), 0.0); assert_eq!( - vector_f64_distance_l2(&[1.0, 2.0], &[-1.0, -2.0]), + vector_f64_distance_l2_rust(&[1.0, 2.0], &[-1.0, -2.0]), (2f64 * 2f64 + 4f64 * 4f64).sqrt() ); assert_eq!( - vector_f64_distance_l2(&[1.0, 2.0], &[-2.0, 1.0]), + vector_f64_distance_l2_rust(&[1.0, 2.0], &[-2.0, 1.0]), (3f64 * 3f64 + 1f64 * 1f64).sqrt() ); } @@ -162,7 +191,7 @@ mod tests { idx: &[1, 2], values: &[1.0, 3.0] }, - ) - vector_f32_distance_l2(&[1.0, 2.0, 0.0], &[0.0, 1.0, 3.0])) + ) - vector_f32_distance_l2_rust(&[1.0, 2.0, 0.0], &[0.0, 1.0, 3.0])) .abs() < 1e-7 ); @@ -183,4 +212,28 @@ mod tests { (d1.is_nan() && d2.is_nan()) || (d1 - d2).abs() < 1e-6 } + + #[quickcheck] + fn prop_vector_distance_l2_rust_vs_simsimd_f32( + v1: ArbitraryVector<100>, + v2: ArbitraryVector<100>, + ) -> bool { + let v1 = vector_convert(v1.into(), VectorType::Float32Dense).unwrap(); + let v2 = vector_convert(v2.into(), VectorType::Float32Dense).unwrap(); + let d1 = vector_f32_distance_l2_rust(v1.as_f32_slice(), v2.as_f32_slice()); + let d2 = vector_f32_distance_l2_simsimd(v1.as_f32_slice(), v2.as_f32_slice()); + (d1.is_nan() && d2.is_nan()) || (d1 - d2).abs() < 1e-4 + } + + #[quickcheck] + fn prop_vector_distance_l2_rust_vs_simsimd_f64( + v1: ArbitraryVector<100>, + v2: ArbitraryVector<100>, + ) -> bool { + let v1 = vector_convert(v1.into(), VectorType::Float64Dense).unwrap(); + let v2 = vector_convert(v2.into(), VectorType::Float64Dense).unwrap(); + let d1 = vector_f64_distance_l2_rust(v1.as_f64_slice(), v2.as_f64_slice()); + let d2 = vector_f64_distance_l2_simsimd(v1.as_f64_slice(), v2.as_f64_slice()); + (d1.is_nan() && d2.is_nan()) || (d1 - d2).abs() < 1e-6 + } }