rewrite grammar generator and add fuzz test for arithmetic expressions

This commit is contained in:
Nikita Sivukhin
2025-02-02 18:39:24 +04:00
parent f716919b10
commit 91fcb67b06
4 changed files with 552 additions and 32 deletions

127
Cargo.lock generated
View File

@@ -24,10 +24,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
dependencies = [
"cfg-if",
"getrandom",
"getrandom 0.2.15",
"once_cell",
"version_check",
"zerocopy",
"zerocopy 0.7.35",
]
[[package]]
@@ -60,7 +60,7 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18a1e15a87b13ae79e04e07b3714fc41d5f6993dff11662fdbe0b207c6ad0fe0"
dependencies = [
"rand",
"rand 0.8.5",
]
[[package]]
@@ -472,6 +472,8 @@ dependencies = [
"env_logger 0.10.2",
"limbo_core",
"log",
"rand 0.9.0",
"rand_chacha 0.9.0",
"rexpect",
"rusqlite",
"rustyline",
@@ -977,10 +979,22 @@ dependencies = [
"cfg-if",
"js-sys",
"libc",
"wasi",
"wasi 0.11.0+wasi-snapshot-preview1",
"wasm-bindgen",
]
[[package]]
name = "getrandom"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8"
dependencies = [
"cfg-if",
"libc",
"wasi 0.13.3+wasi-0.2.2",
"windows-targets 0.52.6",
]
[[package]]
name = "gimli"
version = "0.31.1"
@@ -1400,7 +1414,7 @@ dependencies = [
"itoa",
"nom",
"ordered-float",
"rand",
"rand 0.8.5",
"ryu",
"serde_json",
]
@@ -1557,7 +1571,7 @@ dependencies = [
"chrono",
"criterion",
"fallible-iterator 0.3.0",
"getrandom",
"getrandom 0.2.15",
"hex",
"indexmap",
"io-uring",
@@ -1579,7 +1593,7 @@ dependencies = [
"pest_derive",
"polling",
"pprof",
"rand",
"rand 0.8.5",
"regex",
"regex-syntax",
"rstest",
@@ -1647,8 +1661,8 @@ dependencies = [
"limbo_core",
"log",
"notify",
"rand",
"rand_chacha",
"rand 0.8.5",
"rand_chacha 0.3.1",
"serde",
"serde_json",
"tempfile",
@@ -1680,7 +1694,7 @@ dependencies = [
"limbo_ext",
"quickcheck",
"quickcheck_macros",
"rand",
"rand 0.8.5",
]
[[package]]
@@ -1798,7 +1812,7 @@ checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
dependencies = [
"libc",
"log",
"wasi",
"wasi 0.11.0+wasi-snapshot-preview1",
"windows-sys 0.52.0",
]
@@ -2067,7 +2081,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared",
"rand",
"rand 0.8.5",
]
[[package]]
@@ -2176,7 +2190,7 @@ version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
dependencies = [
"zerocopy",
"zerocopy 0.7.35",
]
[[package]]
@@ -2307,7 +2321,7 @@ checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
dependencies = [
"env_logger 0.8.4",
"log",
"rand",
"rand 0.8.5",
]
[[package]]
@@ -2347,8 +2361,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
"rand_chacha 0.3.1",
"rand_core 0.6.4",
]
[[package]]
name = "rand"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.0",
"zerocopy 0.8.14",
]
[[package]]
@@ -2358,7 +2383,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core 0.9.0",
]
[[package]]
@@ -2367,7 +2402,17 @@ version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
"getrandom 0.2.15",
]
[[package]]
name = "rand_core"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b08f3c9802962f7e1b25113931d94f43ed9725bebc59db9d0c3e9a23b67e15ff"
dependencies = [
"getrandom 0.3.1",
"zerocopy 0.8.14",
]
[[package]]
@@ -2405,7 +2450,7 @@ version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
dependencies = [
"getrandom",
"getrandom 0.2.15",
"libredox",
"thiserror 1.0.69",
]
@@ -2847,7 +2892,7 @@ checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
dependencies = [
"cfg-if",
"fastrand",
"getrandom",
"getrandom 0.2.15",
"once_cell",
"rustix",
"windows-sys 0.59.0",
@@ -3079,7 +3124,7 @@ version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "744018581f9a3454a9e15beb8a33b017183f1e7c0cd170232a2d1453b23a51c4"
dependencies = [
"getrandom",
"getrandom 0.2.15",
]
[[package]]
@@ -3119,6 +3164,15 @@ version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasi"
version = "0.13.3+wasi-0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2"
dependencies = [
"wit-bindgen-rt",
]
[[package]]
name = "wasm-bindgen"
version = "0.2.100"
@@ -3454,6 +3508,15 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "wit-bindgen-rt"
version = "0.33.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "write16"
version = "1.0.0"
@@ -3497,7 +3560,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
"byteorder",
"zerocopy-derive",
"zerocopy-derive 0.7.35",
]
[[package]]
name = "zerocopy"
version = "0.8.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a367f292d93d4eab890745e75a778da40909cab4d6ff8173693812f79c4a2468"
dependencies = [
"zerocopy-derive 0.8.14",
]
[[package]]
@@ -3511,6 +3583,17 @@ dependencies = [
"syn 2.0.96",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3931cb58c62c13adec22e38686b559c86a30565e16ad6e8510a337cedc611e1"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.96",
]
[[package]]
name = "zerofrom"
version = "0.1.5"

View File

@@ -29,7 +29,9 @@ rusqlite = { version = "0.29", features = ["bundled"] }
tempfile = "3.0.7"
log = "0.4.22"
assert_cmd = "^2"
rand_chacha = "0.9.0"
rand = "0.9.0"
# rexpect does not support windows.
[target.'cfg(not(windows))'.dependencies]
rexpect = "0.6.0"
rexpect = "0.6.0"

View File

@@ -0,0 +1,385 @@
/// Grammar generator is a helper to build a probabilistic grammar and generate random strings from it.
/// A grammar consists of terminals (characters) and symbols (non-terminals with some expansion rule).
///
/// Currently supported expansion rules are:
/// 1. Symbol -> [Str]: generate terminals which form fixed length string with constant prefix and random suffix
/// 2. Symbol -> [Int]: generate terminals which form integer from specified range
/// 3. Symbol -> (Inner)?: generate expansion for Inner symbol with some probability
/// 4. Symbol -> (Inner){n..m}: generate k expansions for Inner symbol where k \in [n..m) with uniform distribution
/// (note, that every repetition will be expanded independently)
/// 5. Symbol -> Inner1 Inner2 .. Inner[n]: concatenate expansions from inner symbols and insert separator string between them
/// 6. Symbol -> Choice1 | Choice2 | .. | Choice[n]: pick one choice at random according to the weights and generate expansion for it
///
/// (this is more or less [context-free grammar](https://en.wikipedia.org/wiki/Context-free_grammar) with very minor differences)
///
/// The idea behind this code is to provide a way to "build" grammar generator with all these rules and their dependencies and after that
/// we can randomly sample strings from this generator easily.
use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
use rand::Rng;
use rand_chacha::ChaCha8Rng;
/// Expansion rule attached to a grammar symbol.
///
/// Variants that refer to other symbols do so through [`SymbolHandle`]s, so
/// rules may reference symbols that are defined later (or themselves).
#[derive(Clone, Debug)]
pub enum SymbolType {
    /// Terminal string: `fixed_prefix` followed by `random_length` randomly
    /// generated characters.
    Str {
        fixed_prefix: String,
        random_length: usize,
    },
    /// Terminal integer sampled from `range` and rendered in decimal.
    Int {
        range: Range<i32>,
    },
    /// Expands `value` with probability `prob`, otherwise expands to nothing.
    Optional {
        value: SymbolHandle,
        prob: f64,
    },
    /// Expands `value` a number of times sampled from `range`, inserting
    /// `separator` between consecutive repetitions.
    Repeat {
        value: SymbolHandle,
        range: Range<usize>,
        separator: String,
    },
    /// Expands every symbol of `values` in order, inserting `separator`
    /// between consecutive expansions.
    Concat {
        values: Vec<SymbolHandle>,
        separator: String,
    },
    /// Expands exactly one of `values`; the `f64` is the relative weight of
    /// the alternative.
    Choice {
        values: Vec<(SymbolHandle, f64)>,
    },
}
/// Builds a terminal symbol that always expands to exactly `s`
/// (a `Str` rule with no random suffix).
pub fn const_str(s: &str) -> SymbolType {
    let fixed_prefix = String::from(s);
    SymbolType::Str {
        fixed_prefix,
        random_length: 0,
    }
}
/// Builds a terminal symbol that expands to `fixed_prefix` followed by
/// `random_length` random characters.
pub fn rand_str(fixed_prefix: &str, random_length: usize) -> SymbolType {
    let fixed_prefix = fixed_prefix.to_owned();
    SymbolType::Str {
        fixed_prefix,
        random_length,
    }
}
/// Builds a terminal symbol that expands to an integer sampled from `range`
/// (half-open: `range.end` itself is never produced).
pub fn rand_int(range: Range<i32>) -> SymbolType {
    SymbolType::Int { range }
}
/// Opaque identifier of a symbol inside a [`GrammarGenerator`].
///
/// Handles are handed out by `GrammarGenerator::create_handle` and are used as
/// keys of the generator's symbol table.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub struct SymbolHandle(i32);
/// Fluent builder that defines the expansion rule for one pre-allocated handle.
///
/// The definition only becomes part of the grammar once `build` is called.
pub struct SymbolDefinitionBuilder {
    // Grammar the finished symbol is registered into.
    generator: GrammarGenerator,
    // Handle the finished symbol is registered under.
    handle: SymbolHandle,
    // Rule assembled so far; `None` until one of the builder methods sets it.
    symbol: Option<SymbolType>,
}
/// One element of the generation frontier used by `GrammarGenerator::generate`.
#[derive(Debug)]
enum GrammarFrontierNode {
    /// Symbol that still has to be expanded.
    Handle(SymbolHandle),
    /// Terminal text that is already final.
    String(String),
}
/// Shared, reference-counted grammar state.
///
/// Cloning yields another reference to the same grammar (the state lives in
/// an `Rc<RefCell<..>>`), which is how builders keep access to the generator
/// that created them.
#[derive(Clone)]
pub struct GrammarGenerator(Rc<RefCell<GrammarGeneratorInner>>);
/// Mutable state behind a [`GrammarGenerator`].
struct GrammarGeneratorInner {
    // Id that the next created handle will receive; incremented on every
    // `create_handle` call.
    last_symbol_id: i32,
    // Expansion rule of every registered symbol, keyed by its handle.
    symbols: HashMap<SymbolHandle, SymbolType>,
}
impl GrammarGenerator {
    /// Creates an empty grammar with no registered symbols.
    pub fn new() -> Self {
        GrammarGenerator(Rc::new(RefCell::new(GrammarGeneratorInner {
            last_symbol_id: 0,
            symbols: HashMap::new(),
        })))
    }

    /// Allocates a fresh handle and returns it together with the builder that
    /// must be used to define it.
    ///
    /// The handle can be referenced from other rules immediately, which is what
    /// makes recursive grammars possible, but it has to be defined (via
    /// `SymbolDefinitionBuilder::build`) before `generate` is called.
    pub fn create_handle(&self) -> (SymbolHandle, SymbolDefinitionBuilder) {
        let handle = {
            let mut inner = self.0.borrow_mut();
            let handle = SymbolHandle(inner.last_symbol_id);
            inner.last_symbol_id += 1;
            handle
        };
        let builder = SymbolDefinitionBuilder {
            generator: self.clone(),
            handle,
            symbol: None,
        };
        (handle, builder)
    }

    /// Allocates a fresh handle and returns only its builder; the handle is
    /// obtained later from `SymbolDefinitionBuilder::build`.
    pub fn create(&self) -> SymbolDefinitionBuilder {
        let (_, builder) = self.create_handle();
        builder
    }

    /// Stores the expansion rule `value` for `handle`.
    ///
    /// # Panics
    ///
    /// Panics if `handle` already has a registered rule.
    pub fn register(&self, handle: SymbolHandle, value: SymbolType) {
        let previous = self.0.borrow_mut().symbols.insert(handle, value);
        assert!(previous.is_none(), "handle can be registered only once");
    }

    // Runs a DFS over the symbol graph starting at `root` and records, for every
    // reachable symbol, whether an expansion path of infinite length exists from it.
    //
    // State of a symbol in `is_recursive`:
    //   * absent      - not visited yet
    //   * Some(None)  - currently on the DFS stack (expansion in progress)
    //   * Some(Some)  - fully explored, value is the final answer
    //
    // Only an edge to a symbol that is still on the DFS stack is a cycle. A finished
    // symbol that is merely shared between several parents keeps its computed answer
    // instead of being treated as recursive on the second visit.
    fn is_recursive_from_root(
        &self,
        root: SymbolHandle,
        is_recursive: &mut HashMap<SymbolHandle, Option<bool>>,
    ) -> bool {
        match is_recursive.get(&root) {
            // Back edge: `root` is its own ancestor; every symbol on the stack between
            // the two occurrences gets `true` while the recursion unwinds.
            Some(None) => return true,
            Some(Some(known)) => return *known,
            None => {}
        }
        is_recursive.insert(root, None);
        let symbols = &self.0.borrow().symbols;
        let recursive = match symbols.get(&root).expect("symbol must be registered") {
            SymbolType::Str { .. } | SymbolType::Int { .. } => false,
            SymbolType::Optional { value, .. } | SymbolType::Repeat { value, .. } => {
                self.is_recursive_from_root(*value, is_recursive)
            }
            SymbolType::Concat { values, .. } => {
                // No short-circuit: every child must be visited so that it gets an entry.
                let mut recursive = false;
                for value in values.iter() {
                    recursive |= self.is_recursive_from_root(*value, is_recursive);
                }
                recursive
            }
            SymbolType::Choice { values, .. } => {
                let mut recursive = false;
                for (value, _) in values.iter() {
                    recursive |= self.is_recursive_from_root(*value, is_recursive);
                }
                recursive
            }
        };
        is_recursive.insert(root, Some(recursive));
        recursive
    }

    /// Generates one random string from the grammar, starting at `root`.
    ///
    /// The sample is expanded breadth-first rather than depth-first so that
    /// generation can be cut short fairly across the whole string. Recursive
    /// rules of a probabilistic grammar may have an infinite (or very large)
    /// expected length, so once the frontier holds at least `length_limit_hint`
    /// nodes the expansion switches to its shortest options: `Optional` is
    /// skipped, `Repeat` uses the lower bound of its range and `Choice` picks
    /// only among non-recursive alternatives (when at least one exists).
    ///
    /// # Panics
    ///
    /// Panics if a reachable symbol was never registered, if a `Choice` has no
    /// alternative with positive weight, or if the random number generator is
    /// asked to sample from an empty range.
    pub fn generate(
        &self,
        rng: &mut ChaCha8Rng,
        root: SymbolHandle,
        length_limit_hint: usize,
    ) -> String {
        let mut is_recursive = HashMap::new();
        self.is_recursive_from_root(root, &mut is_recursive);
        let symbols = &self.0.borrow().symbols;
        let mut frontier = vec![GrammarFrontierNode::Handle(root)];
        let terminals = loop {
            let mut next = Vec::with_capacity(frontier.len());
            let mut expanded = false;
            let limit_exceeded = frontier.len() >= length_limit_hint;
            for node in frontier {
                let GrammarFrontierNode::Handle(handle) = node else {
                    // Already a terminal: carry it over unchanged.
                    next.push(node);
                    continue;
                };
                expanded = true;
                match symbols.get(&handle).expect("symbol must be registered") {
                    SymbolType::Str {
                        fixed_prefix,
                        random_length,
                    } => {
                        let mut s = fixed_prefix.clone();
                        // NOTE: the range is half-open, so 'Z' itself is never produced.
                        for _ in 0..*random_length {
                            s.push(rng.random_range('A'..'Z'));
                        }
                        next.push(GrammarFrontierNode::String(s));
                    }
                    SymbolType::Int { range } => {
                        next.push(GrammarFrontierNode::String(
                            rng.random_range(range.clone()).to_string(),
                        ));
                    }
                    SymbolType::Optional { value, prob } => {
                        // The RNG is deliberately not consulted once the limit is exceeded.
                        if !limit_exceeded && rng.random_bool(*prob) {
                            next.push(GrammarFrontierNode::Handle(*value));
                        }
                    }
                    SymbolType::Repeat {
                        value,
                        range,
                        separator,
                    } => {
                        let repetitions = if !limit_exceeded {
                            rng.random_range(range.clone())
                        } else {
                            range.start
                        };
                        for i in 0..repetitions {
                            if i > 0 {
                                next.push(GrammarFrontierNode::String(separator.clone()));
                            }
                            next.push(GrammarFrontierNode::Handle(*value));
                        }
                    }
                    SymbolType::Concat { values, separator } => {
                        for (i, value) in values.iter().enumerate() {
                            if i > 0 {
                                next.push(GrammarFrontierNode::String(separator.clone()));
                            }
                            next.push(GrammarFrontierNode::Handle(*value));
                        }
                    }
                    SymbolType::Choice { values } => {
                        // Under length pressure prefer alternatives that cannot recurse;
                        // fall back to the full list when every alternative is recursive.
                        let non_recursive: Vec<(SymbolHandle, f64)> = if limit_exceeded {
                            values
                                .iter()
                                .filter(|option| {
                                    is_recursive.get(&option.0) != Some(&Some(true))
                                })
                                .copied()
                                .collect()
                        } else {
                            Vec::new()
                        };
                        let candidates = if non_recursive.is_empty() {
                            values.as_slice()
                        } else {
                            non_recursive.as_slice()
                        };
                        let sum: f64 = candidates.iter().map(|option| option.1).sum();
                        assert!(
                            sum > 0.0,
                            "choice must have at least one option with positive weight"
                        );
                        // Walk the cumulative weights until the sample is used up; the last
                        // candidate is a catch-all against floating point drift.
                        let mut sample = rng.random_range(0.0..sum);
                        let last = candidates.len() - 1;
                        for (i, (handle, weight)) in candidates.iter().enumerate() {
                            sample -= weight;
                            if sample > 0.0 && i < last {
                                continue;
                            }
                            next.push(GrammarFrontierNode::Handle(*handle));
                            break;
                        }
                    }
                }
            }
            if !expanded {
                break next;
            }
            frontier = next;
        };
        let mut result = String::new();
        for node in terminals {
            match node {
                GrammarFrontierNode::String(text) => result.push_str(&text),
                GrammarFrontierNode::Handle(_) => {
                    panic!("frontier in the end must contain only string nodes")
                }
            }
        }
        result
    }
}

impl Default for GrammarGenerator {
    /// Equivalent to [`GrammarGenerator::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl SymbolDefinitionBuilder {
    // Registers `symbol` under a freshly allocated handle of the same generator
    // and returns that handle.
    fn register_anonymous(&self, symbol: SymbolType) -> SymbolHandle {
        self.generator.create().use_symbol(symbol).build()
    }

    /// Sets the rule of this builder to `symbol`.
    ///
    /// # Panics
    ///
    /// Panics if a rule has already been set.
    pub fn use_symbol(mut self, symbol: SymbolType) -> Self {
        assert!(self.symbol.is_none(), "symbol must be unset");
        self.symbol = Some(symbol);
        self
    }

    /// Starts an empty `Concat` rule whose parts will be joined by `separator`.
    ///
    /// # Panics
    ///
    /// Panics if a rule has already been set.
    pub fn concat(self, separator: &str) -> Self {
        self.use_symbol(SymbolType::Concat {
            values: Vec::new(),
            separator: separator.to_owned(),
        })
    }

    /// Appends `handle` to the current `Concat` rule.
    ///
    /// # Panics
    ///
    /// Panics if the current rule is not a `Concat`.
    pub fn push(mut self, handle: SymbolHandle) -> Self {
        match self.symbol.as_mut() {
            Some(SymbolType::Concat { values, .. }) => values.push(handle),
            _ => panic!("symbol must be set to Concat type"),
        }
        self
    }

    /// Registers `symbol` as a new anonymous symbol and appends it to the
    /// current `Concat` rule.
    pub fn push_symbol(self, symbol: SymbolType) -> Self {
        let part = self.register_anonymous(symbol);
        self.push(part)
    }

    /// Appends the constant string `s` to the current `Concat` rule.
    pub fn push_str(self, s: &str) -> Self {
        self.push_symbol(const_str(s))
    }

    /// Starts an empty `Choice` rule.
    ///
    /// # Panics
    ///
    /// Panics if a rule has already been set.
    pub fn choice(self) -> Self {
        self.use_symbol(SymbolType::Choice { values: Vec::new() })
    }

    /// Adds `handle` with relative weight `weight` to the current `Choice` rule.
    ///
    /// # Panics
    ///
    /// Panics if the current rule is not a `Choice`.
    pub fn option_w(mut self, handle: SymbolHandle, weight: f64) -> Self {
        match self.symbol.as_mut() {
            Some(SymbolType::Choice { values }) => values.push((handle, weight)),
            _ => panic!("symbol must be set to Choice type"),
        }
        self
    }

    /// Adds `handle` with weight `1.0` to the current `Choice` rule.
    pub fn option(self, handle: SymbolHandle) -> Self {
        self.option_w(handle, 1.0)
    }

    /// Registers `symbol` as a new anonymous symbol and adds it with relative
    /// weight `weight` to the current `Choice` rule.
    pub fn option_symbol_w(self, symbol: SymbolType, weight: f64) -> Self {
        let alternative = self.register_anonymous(symbol);
        self.option_w(alternative, weight)
    }

    /// Same as `option_symbol_w` with weight `1.0`.
    pub fn option_symbol(self, symbol: SymbolType) -> Self {
        self.option_symbol_w(symbol, 1.0)
    }

    /// Adds the constant string `s` with weight `1.0` to the current `Choice` rule.
    pub fn option_str(self, s: &str) -> Self {
        self.option_symbol(const_str(s))
    }

    /// Adds every element of `symbols`, in order, as an alternative with weight `1.0`.
    pub fn options_symbol<const N: usize>(self, symbols: [SymbolType; N]) -> Self {
        IntoIterator::into_iter(symbols)
            .fold(self, |builder, symbol| builder.option_symbol(symbol))
    }

    /// Adds every string of `strs`, in order, as a constant alternative with weight `1.0`.
    pub fn options_str<const N: usize>(self, strs: [&str; N]) -> Self {
        IntoIterator::into_iter(strs).fold(self, |builder, s| builder.option_str(s))
    }

    /// Wraps the rule built so far into a `Repeat` rule: the current rule is
    /// registered as an anonymous symbol and this builder now describes its
    /// repetition with a count taken from `range`, joined by `separator`.
    ///
    /// # Panics
    ///
    /// Panics if no rule has been set yet.
    pub fn repeat(mut self, range: Range<usize>, separator: &str) -> Self {
        let inner = self.symbol.take().expect("symbol must be set");
        let value = self.register_anonymous(inner);
        self.symbol = Some(SymbolType::Repeat {
            value,
            range,
            separator: separator.to_owned(),
        });
        self
    }

    /// Wraps the rule built so far into an `Optional` rule: the current rule is
    /// registered as an anonymous symbol and this builder now describes its
    /// expansion with probability `prob`.
    ///
    /// # Panics
    ///
    /// Panics if no rule has been set yet.
    pub fn optional(mut self, prob: f64) -> Self {
        let inner = self.symbol.take().expect("symbol must be set");
        let value = self.register_anonymous(inner);
        self.symbol = Some(SymbolType::Optional { value, prob });
        self
    }

    /// Registers the finished rule in the generator and returns its handle.
    ///
    /// # Panics
    ///
    /// Panics if no rule has been set, or if the handle was already registered.
    pub fn build(self) -> SymbolHandle {
        let Self {
            generator,
            handle,
            symbol,
        } = self;
        generator.register(handle, symbol.expect("symbol must be set"));
        handle
    }
}

View File

@@ -1,10 +1,16 @@
pub mod grammar_generator;
#[cfg(test)]
mod tests {
use std::{rc::Rc, sync::Arc};
use limbo_core::Database;
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;
use rusqlite::params;
use crate::grammar_generator::{rand_int, GrammarGenerator};
fn sqlite_exec_row(conn: &rusqlite::Connection, query: &str) -> Vec<rusqlite::types::Value> {
let mut stmt = conn.prepare(&query).unwrap();
let mut rows = stmt.query(params![]).unwrap();
@@ -49,19 +55,63 @@ mod tests {
}
#[test]
pub fn kek() {
pub fn arithmetic_expression_fuzz() {
let g = GrammarGenerator::new();
let (expr, expr_builder) = g.create_handle();
let (bin_op, bin_op_builder) = g.create_handle();
let (unary_op, unary_op_builder) = g.create_handle();
let (paren, paren_builder) = g.create_handle();
paren_builder
.concat("")
.push_str("(")
.push(expr)
.push_str(")")
.build();
unary_op_builder
.concat(" ")
.push(g.create().choice().options_str(["~", "+", "-"]).build())
.push(expr)
.build();
bin_op_builder
.concat(" ")
.push(expr)
.push(
g.create()
.choice()
.options_str(["+", "-", "*", "/", "%", "&", "|", "<<", ">>"])
.build(),
)
.push(expr)
.build();
expr_builder
.choice()
.option_w(unary_op, 1.0)
.option_w(bin_op, 1.0)
.option_w(paren, 1.0)
.option_symbol_w(rand_int(-10..10), 1.0)
.build();
let sql = g.create().concat(" ").push_str("SELECT").push(expr).build();
let io = Arc::new(limbo_core::PlatformIO::new().unwrap());
let limbo_db = Database::open_file(io, ":memory:").unwrap();
let limbo_conn = limbo_db.connect();
let sqlite_conn = rusqlite::Connection::open_in_memory().unwrap();
println!(
"column: {:?}",
sqlite_exec_row(&sqlite_conn, "SELECT 1 = 1.0")
);
println!(
"column: {:?}",
limbo_exec_row(&limbo_conn, "SELECT 1 = 1.0")
);
let mut rng = ChaCha8Rng::seed_from_u64(0);
for _ in 0..16 * 1024 {
let query = g.generate(&mut rng, sql, 50);
let limbo = limbo_exec_row(&limbo_conn, &query);
let sqlite = sqlite_exec_row(&sqlite_conn, &query);
assert_eq!(
limbo, sqlite,
"query: {}, limbo: {:?}, sqlite: {:?}",
query, limbo, sqlite
);
}
}
}