parser: finish lexer and draft AST

TcMits
2025-08-01 15:07:20 +07:00
parent 81c86d42b4
commit 1e926d0093
9 changed files with 3435 additions and 1 deletion

View File

@@ -111,3 +111,7 @@ harness = false
[[bench]]
name = "tpc_h_benchmark"
harness = false
[[bench]]
name = "parser_benchmark"
harness = false

View File

@@ -0,0 +1,33 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use pprof::criterion::{Output, PProfProfiler};
use turso_core::parser::lexer::Lexer;
fn bench_lexer(criterion: &mut Criterion) {
let queries = [
"SELECT 1",
"SELECT * FROM users LIMIT 1",
"SELECT first_name, count(1) FROM users GROUP BY first_name HAVING count(1) > 1 ORDER BY count(1) LIMIT 1",
];
for query in queries.iter() {
let mut group = criterion.benchmark_group(format!("Lexer `{query}`"));
let qb = query.as_bytes();
group.bench_function(BenchmarkId::new("limbo_lexer_query", ""), |b| {
b.iter(|| {
for token in Lexer::new(black_box(qb)) {
token.unwrap();
}
});
});
group.finish();
}
}
criterion_group! {
name = benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = bench_lexer
}
criterion_main!(benches);
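
The bench target registered in Cargo.toml above can be run with `cargo bench --bench parser_benchmark`. Outside of criterion, the same iterator API can be driven directly; the following is a minimal hypothetical sketch, assuming only what the benchmark already uses (Lexer::new over a byte slice, yielding Result-wrapped tokens):

// Hypothetical sketch, not part of this commit: exercise the lexer directly,
// relying only on the API shown in the benchmark above.
use turso_core::parser::lexer::Lexer;

fn main() {
    // Lexer::new takes the SQL text as bytes and yields Result-wrapped tokens.
    let token_count = Lexer::new(b"SELECT * FROM users LIMIT 1")
        .map(|token| token.expect("lexer error"))
        .count();
    println!("lexed {token_count} tokens");
}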

View File

@@ -1,6 +1,134 @@
use std::fs;
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{BufWriter, Write};
use std::path::PathBuf;
/// Generates a trie-like keyword lookup function built from nested `match` expressions.
/// Example input: `[["ABORT", "TK_ABORT"], ["ACTION", "TK_ACTION"], ["ADD", "TK_ADD"]]` yields the trie:
/// A
/// ├─ B
/// │ ├─ O
/// │ │ ├─ R
/// │ │ │ ├─ T -> TK_ABORT
/// ├─ C
/// │ ├─ T
/// │ │ ├─ I
/// │ │ │ ├─ O
/// │ │ │ │ ├─ N -> TK_ACTION
/// ├─ D
/// │ ├─ D -> TK_ADD
fn build_keyword_map(
writer: &mut impl Write,
func_name: &str,
keywords: &[[&'static str; 2]],
) -> Result<(), std::io::Error> {
assert!(!keywords.is_empty());
let mut min_len = keywords[0][0].len();
let mut max_len = keywords[0][0].len();
struct PathEntry {
result: Option<&'static str>,
sub_entries: HashMap<u8, Box<PathEntry>>,
}
let mut paths = Box::new(PathEntry {
result: None,
sub_entries: HashMap::new(),
});
for keyword in keywords {
let keyword_b = keyword[0].as_bytes();
if keyword_b.len() < min_len {
min_len = keyword_b.len();
}
if keyword_b.len() > max_len {
max_len = keyword_b.len();
}
let mut current = &mut paths;
for &b in keyword_b {
let upper_b = b.to_ascii_uppercase();
current = current.sub_entries.entry(upper_b).or_insert_with(|| {
Box::new(PathEntry {
result: None,
sub_entries: HashMap::new(),
})
});
}
assert!(current.result.is_none());
current.result = Some(keyword[1]);
}
fn write_entry(writer: &mut impl Write, entry: &PathEntry) -> Result<(), std::io::Error> {
if let Some(result) = entry.result {
writeln!(writer, "if idx == buf.len() {{")?;
writeln!(writer, "return Some(TokenType::{result});")?;
writeln!(writer, "}}")?;
}
if entry.sub_entries.is_empty() {
writeln!(writer, "None")?;
return Ok(());
}
writeln!(writer, "if idx >= buf.len() {{")?;
writeln!(writer, "return None;")?;
writeln!(writer, "}}")?;
writeln!(writer, "match buf[idx] {{")?;
for (&b, sub_entry) in &entry.sub_entries {
if b.is_ascii_alphabetic() {
writeln!(writer, "{} | {} => {{", b, b.to_ascii_lowercase())?;
} else {
writeln!(writer, "{b} => {{")?;
}
writeln!(writer, "idx += 1;")?;
write_entry(writer, sub_entry)?;
writeln!(writer, "}}")?;
}
writeln!(writer, "_ => None")?;
writeln!(writer, "}}")?;
Ok(())
}
writeln!(
writer,
"pub(crate) const MAX_KEYWORD_LEN: usize = {max_len};"
)?;
writeln!(
writer,
"pub(crate) const MIN_KEYWORD_LEN: usize = {min_len};"
)?;
writeln!(writer, "/// Check if `word` is a keyword")?;
writeln!(
writer,
"pub fn {func_name}(buf: &[u8]) -> Option<TokenType> {{"
)?;
writeln!(
writer,
"if buf.len() < MIN_KEYWORD_LEN || buf.len() > MAX_KEYWORD_LEN {{"
)?;
writeln!(writer, "return None;")?;
writeln!(writer, "}}")?;
writeln!(writer, "let mut idx = 0;")?;
write_entry(writer, &paths)?;
writeln!(writer, "}}")?;
Ok(())
}
fn main() {
let out_dir = PathBuf::from(std::env::var("OUT_DIR").unwrap());
let built_file = out_dir.join("built.rs");
@@ -18,4 +146,161 @@ fn main() {
),
)
.expect("Failed to append to built file");
let keywords = out_dir.join("keywords.rs");
let mut keywords = BufWriter::new(File::create(keywords).unwrap());
build_keyword_map(
&mut keywords,
"keyword_token",
&[
["ABORT", "TK_ABORT"],
["ACTION", "TK_ACTION"],
["ADD", "TK_ADD"],
["AFTER", "TK_AFTER"],
["ALL", "TK_ALL"],
["ALTER", "TK_ALTER"],
["ALWAYS", "TK_ALWAYS"],
["ANALYZE", "TK_ANALYZE"],
["AND", "TK_AND"],
["AS", "TK_AS"],
["ASC", "TK_ASC"],
["ATTACH", "TK_ATTACH"],
["AUTOINCREMENT", "TK_AUTOINCR"],
["BEFORE", "TK_BEFORE"],
["BEGIN", "TK_BEGIN"],
["BETWEEN", "TK_BETWEEN"],
["BY", "TK_BY"],
["CASCADE", "TK_CASCADE"],
["CASE", "TK_CASE"],
["CAST", "TK_CAST"],
["CHECK", "TK_CHECK"],
["COLLATE", "TK_COLLATE"],
["COLUMN", "TK_COLUMNKW"],
["COMMIT", "TK_COMMIT"],
["CONFLICT", "TK_CONFLICT"],
["CONSTRAINT", "TK_CONSTRAINT"],
["CREATE", "TK_CREATE"],
["CROSS", "TK_JOIN_KW"],
["CURRENT", "TK_CURRENT"],
["CURRENT_DATE", "TK_CTIME_KW"],
["CURRENT_TIME", "TK_CTIME_KW"],
["CURRENT_TIMESTAMP", "TK_CTIME_KW"],
["DATABASE", "TK_DATABASE"],
["DEFAULT", "TK_DEFAULT"],
["DEFERRABLE", "TK_DEFERRABLE"],
["DEFERRED", "TK_DEFERRED"],
["DELETE", "TK_DELETE"],
["DESC", "TK_DESC"],
["DETACH", "TK_DETACH"],
["DISTINCT", "TK_DISTINCT"],
["DO", "TK_DO"],
["DROP", "TK_DROP"],
["EACH", "TK_EACH"],
["ELSE", "TK_ELSE"],
["END", "TK_END"],
["ESCAPE", "TK_ESCAPE"],
["EXCEPT", "TK_EXCEPT"],
["EXCLUDE", "TK_EXCLUDE"],
["EXCLUSIVE", "TK_EXCLUSIVE"],
["EXISTS", "TK_EXISTS"],
["EXPLAIN", "TK_EXPLAIN"],
["FAIL", "TK_FAIL"],
["FILTER", "TK_FILTER"],
["FIRST", "TK_FIRST"],
["FOLLOWING", "TK_FOLLOWING"],
["FOR", "TK_FOR"],
["FOREIGN", "TK_FOREIGN"],
["FROM", "TK_FROM"],
["FULL", "TK_JOIN_KW"],
["GENERATED", "TK_GENERATED"],
["GLOB", "TK_LIKE_KW"],
["GROUP", "TK_GROUP"],
["GROUPS", "TK_GROUPS"],
["HAVING", "TK_HAVING"],
["IF", "TK_IF"],
["IGNORE", "TK_IGNORE"],
["IMMEDIATE", "TK_IMMEDIATE"],
["IN", "TK_IN"],
["INDEX", "TK_INDEX"],
["INDEXED", "TK_INDEXED"],
["INITIALLY", "TK_INITIALLY"],
["INNER", "TK_JOIN_KW"],
["INSERT", "TK_INSERT"],
["INSTEAD", "TK_INSTEAD"],
["INTERSECT", "TK_INTERSECT"],
["INTO", "TK_INTO"],
["IS", "TK_IS"],
["ISNULL", "TK_ISNULL"],
["JOIN", "TK_JOIN"],
["KEY", "TK_KEY"],
["LAST", "TK_LAST"],
["LEFT", "TK_JOIN_KW"],
["LIKE", "TK_LIKE_KW"],
["LIMIT", "TK_LIMIT"],
["MATCH", "TK_MATCH"],
["MATERIALIZED", "TK_MATERIALIZED"],
["NATURAL", "TK_JOIN_KW"],
["NO", "TK_NO"],
["NOT", "TK_NOT"],
["NOTHING", "TK_NOTHING"],
["NOTNULL", "TK_NOTNULL"],
["NULL", "TK_NULL"],
["NULLS", "TK_NULLS"],
["OF", "TK_OF"],
["OFFSET", "TK_OFFSET"],
["ON", "TK_ON"],
["OR", "TK_OR"],
["ORDER", "TK_ORDER"],
["OTHERS", "TK_OTHERS"],
["OUTER", "TK_JOIN_KW"],
["OVER", "TK_OVER"],
["PARTITION", "TK_PARTITION"],
["PLAN", "TK_PLAN"],
["PRAGMA", "TK_PRAGMA"],
["PRECEDING", "TK_PRECEDING"],
["PRIMARY", "TK_PRIMARY"],
["QUERY", "TK_QUERY"],
["RAISE", "TK_RAISE"],
["RANGE", "TK_RANGE"],
["RECURSIVE", "TK_RECURSIVE"],
["REFERENCES", "TK_REFERENCES"],
["REGEXP", "TK_LIKE_KW"],
["REINDEX", "TK_REINDEX"],
["RELEASE", "TK_RELEASE"],
["RENAME", "TK_RENAME"],
["REPLACE", "TK_REPLACE"],
["RETURNING", "TK_RETURNING"],
["RESTRICT", "TK_RESTRICT"],
["RIGHT", "TK_JOIN_KW"],
["ROLLBACK", "TK_ROLLBACK"],
["ROW", "TK_ROW"],
["ROWS", "TK_ROWS"],
["SAVEPOINT", "TK_SAVEPOINT"],
["SELECT", "TK_SELECT"],
["SET", "TK_SET"],
["TABLE", "TK_TABLE"],
["TEMP", "TK_TEMP"],
["TEMPORARY", "TK_TEMP"],
["THEN", "TK_THEN"],
["TIES", "TK_TIES"],
["TO", "TK_TO"],
["TRANSACTION", "TK_TRANSACTION"],
["TRIGGER", "TK_TRIGGER"],
["UNBOUNDED", "TK_UNBOUNDED"],
["UNION", "TK_UNION"],
["UNIQUE", "TK_UNIQUE"],
["UPDATE", "TK_UPDATE"],
["USING", "TK_USING"],
["VACUUM", "TK_VACUUM"],
["VALUES", "TK_VALUES"],
["VIEW", "TK_VIEW"],
["VIRTUAL", "TK_VIRTUAL"],
["WHEN", "TK_WHEN"],
["WHERE", "TK_WHERE"],
["WINDOW", "TK_WINDOW"],
["WITH", "TK_WITH"],
["WITHOUT", "TK_WITHOUT"],
],
)
.unwrap();
}
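
For reference, below is a hand-trimmed sketch of roughly what build_keyword_map emits, reduced to just the two keywords AS and ASC; the real generated keywords.rs covers the full table above and assumes TokenType is in scope at the include site.

// Hand-trimmed sketch of the generated output for ["AS", "TK_AS"] and
// ["ASC", "TK_ASC"] only; the actual keywords.rs is produced by
// build_keyword_map and expects TokenType to be in scope where it is included.
pub(crate) const MAX_KEYWORD_LEN: usize = 3;
pub(crate) const MIN_KEYWORD_LEN: usize = 2;
/// Check if `buf` is a keyword
pub fn keyword_token(buf: &[u8]) -> Option<TokenType> {
    if buf.len() < MIN_KEYWORD_LEN || buf.len() > MAX_KEYWORD_LEN {
        return None;
    }
    let mut idx = 0;
    if idx >= buf.len() {
        return None;
    }
    match buf[idx] {
        // b'A' | b'a'
        65 | 97 => {
            idx += 1;
            if idx >= buf.len() {
                return None;
            }
            match buf[idx] {
                // b'S' | b's'
                83 | 115 => {
                    idx += 1;
                    // "AS" is itself a keyword, so check for end of input first.
                    if idx == buf.len() {
                        return Some(TokenType::TK_AS);
                    }
                    if idx >= buf.len() {
                        return None;
                    }
                    match buf[idx] {
                        // b'C' | b'c'
                        67 | 99 => {
                            idx += 1;
                            if idx == buf.len() {
                                return Some(TokenType::TK_ASC);
                            }
                            None
                        }
                        _ => None,
                    }
                }
                _ => None,
            }
        }
        _ => None,
    }
}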

View File

@@ -30,6 +30,7 @@ mod uuid;
mod vdbe;
mod vector;
mod vtab;
pub mod parser;
#[cfg(feature = "fuzz")]
pub mod numeric;

1455
core/parser/ast.rs Normal file

File diff suppressed because it is too large

94
core/parser/error.rs Normal file
View File

@@ -0,0 +1,94 @@
use std::error;
use std::fmt;
use std::io;
/// SQL lexer and parser errors
#[non_exhaustive]
#[derive(Debug, miette::Diagnostic)]
#[diagnostic()]
pub enum Error {
/// I/O Error
Io(io::Error),
/// Lexer error
UnrecognizedToken(usize, #[label("here")] Option<miette::SourceSpan>),
/// Missing closing quote, double-quote, or backtick
UnterminatedLiteral(usize, #[label("here")] Option<miette::SourceSpan>),
/// Missing `]`
UnterminatedBracket(usize, #[label("here")] Option<miette::SourceSpan>),
/// Missing `*/`
UnterminatedBlockComment(usize, #[label("here")] Option<miette::SourceSpan>),
/// Invalid parameter name
BadVariableName(usize, #[label("here")] Option<miette::SourceSpan>),
/// Invalid number format
#[diagnostic(help("Invalid digit at `{0}`"))]
BadNumber(
usize,
#[label("here")] Option<miette::SourceSpan>,
String, // Holds the offending number as a string
),
#[diagnostic(help("Invalid digit at `{0}`"))]
BadFractionalPart(
usize,
#[label("here")] Option<miette::SourceSpan>,
String, // Holds the offending number as a string
),
#[diagnostic(help("Invalid digit at `{0}`"))]
BadExponentPart(
usize,
#[label("here")] Option<miette::SourceSpan>,
String, // Holds the offending number as a string
),
/// Missing or invalid `=` after `!`
ExpectedEqualsSign(usize, #[label("here")] Option<miette::SourceSpan>),
/// BLOB literals are string literals containing hexadecimal data and preceded by a single "x" or "X" character.
MalformedBlobLiteral(usize, #[label("here")] Option<miette::SourceSpan>),
/// Hexadecimal integer literals follow the C-language notation of "0x" or "0X" followed by hexadecimal digits.
MalformedHexInteger(
usize,
#[label("here")] Option<miette::SourceSpan>,
#[help] Option<&'static str>,
),
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
Self::Io(ref err) => err.fmt(f),
Self::UnrecognizedToken(pos, _) => {
write!(f, "unrecognized token at {:?}", pos)
}
Self::UnterminatedLiteral(pos, _) => {
write!(f, "non-terminated literal at {:?}", pos)
}
Self::UnterminatedBracket(pos, _) => {
write!(f, "non-terminated bracket at {:?}", pos)
}
Self::UnterminatedBlockComment(pos, _) => {
write!(f, "non-terminated block comment at {:?}", pos)
}
Self::BadVariableName(pos, _) => write!(f, "bad variable name at {:?}", pos),
Self::BadNumber(pos, _, _) => write!(f, "bad number at {:?}", pos),
Self::BadFractionalPart(pos, _, _) => {
write!(f, "bad fractional part at {:?}", pos)
}
Self::BadExponentPart(pos, _, _) => {
write!(f, "bad exponent part at {:?}", pos)
}
Self::ExpectedEqualsSign(pos, _) => write!(f, "expected = sign at {:?}", pos),
Self::MalformedBlobLiteral(pos, _) => {
write!(f, "malformed blob literal at {:?}", pos)
}
Self::MalformedHexInteger(pos, _, _) => {
write!(f, "malformed hex integer at {:?}", pos)
}
}
}
}
impl error::Error for Error {}
impl From<io::Error> for Error {
fn from(err: io::Error) -> Self {
Self::Io(err)
}
}
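
Since Error derives miette::Diagnostic and carries #[label] spans, callers can render annotated diagnostics by pairing an error with the original SQL text. A minimal hypothetical sketch follows; the helper below is not part of this commit, and only miette::Report::new and with_source_code are existing miette API.

// Hypothetical sketch: render a lexer/parser error with its labeled span.
// `sql` and `err` stand in for values produced by the caller.
fn report_error(sql: &str, err: turso_core::parser::error::Error) {
    let report = miette::Report::new(err).with_source_code(sql.to_string());
    // The Debug formatting of a miette Report prints the annotated diagnostic.
    eprintln!("{report:?}");
}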

1379
core/parser/lexer.rs Normal file

File diff suppressed because it is too large

4
core/parser/mod.rs Normal file
View File

@@ -0,0 +1,4 @@
pub mod error;
pub mod token;
pub mod ast;
pub mod lexer;

179
core/parser/token.rs Normal file
View File

@@ -0,0 +1,179 @@
/// Token classes
// Generated by lemon (parse.h).
// Renamed manually.
// To be kept in sync.
#[non_exhaustive]
#[allow(non_camel_case_types, missing_docs)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd)]
#[repr(u16)]
pub enum TokenType {
TK_EOF = 0,
TK_SEMI = 1,
TK_EXPLAIN = 2,
TK_QUERY = 3,
TK_PLAN = 4,
TK_BEGIN = 5,
TK_TRANSACTION = 6,
TK_DEFERRED = 7,
TK_IMMEDIATE = 8,
TK_EXCLUSIVE = 9,
TK_COMMIT = 10,
TK_END = 11,
TK_ROLLBACK = 12,
TK_SAVEPOINT = 13,
TK_RELEASE = 14,
TK_TO = 15,
TK_TABLE = 16,
TK_CREATE = 17,
TK_IF = 18,
TK_NOT = 19,
TK_EXISTS = 20,
TK_TEMP = 21,
TK_LP = 22,
TK_RP = 23,
TK_AS = 24,
TK_COMMA = 25,
TK_WITHOUT = 26,
TK_ABORT = 27,
TK_ACTION = 28,
TK_AFTER = 29,
TK_ANALYZE = 30,
TK_ASC = 31,
TK_ATTACH = 32,
TK_BEFORE = 33,
TK_BY = 34,
TK_CASCADE = 35,
TK_CAST = 36,
TK_CONFLICT = 37,
TK_DATABASE = 38,
TK_DESC = 39,
TK_DETACH = 40,
TK_EACH = 41,
TK_FAIL = 42,
TK_OR = 43,
TK_AND = 44,
TK_IS = 45,
TK_ISNOT = 46,
TK_MATCH = 47,
TK_LIKE_KW = 48,
TK_BETWEEN = 49,
TK_IN = 50,
TK_ISNULL = 51,
TK_NOTNULL = 52,
TK_NE = 53,
TK_EQ = 54,
TK_GT = 55,
TK_LE = 56,
TK_LT = 57,
TK_GE = 58,
TK_ESCAPE = 59,
TK_ID = 60,
TK_COLUMNKW = 61,
TK_DO = 62,
TK_FOR = 63,
TK_IGNORE = 64,
TK_INITIALLY = 65,
TK_INSTEAD = 66,
TK_NO = 67,
TK_KEY = 68,
TK_OF = 69,
TK_OFFSET = 70,
TK_PRAGMA = 71,
TK_RAISE = 72,
TK_RECURSIVE = 73,
TK_REPLACE = 74,
TK_RESTRICT = 75,
TK_ROW = 76,
TK_ROWS = 77,
TK_TRIGGER = 78,
TK_VACUUM = 79,
TK_VIEW = 80,
TK_VIRTUAL = 81,
TK_WITH = 82,
TK_NULLS = 83,
TK_FIRST = 84,
TK_LAST = 85,
TK_CURRENT = 86,
TK_FOLLOWING = 87,
TK_PARTITION = 88,
TK_PRECEDING = 89,
TK_RANGE = 90,
TK_UNBOUNDED = 91,
TK_EXCLUDE = 92,
TK_GROUPS = 93,
TK_OTHERS = 94,
TK_TIES = 95,
TK_GENERATED = 96,
TK_ALWAYS = 97,
TK_MATERIALIZED = 98,
TK_REINDEX = 99,
TK_RENAME = 100,
TK_CTIME_KW = 101,
TK_ANY = 102,
TK_BITAND = 103,
TK_BITOR = 104,
TK_LSHIFT = 105,
TK_RSHIFT = 106,
TK_PLUS = 107,
TK_MINUS = 108,
TK_STAR = 109,
TK_SLASH = 110,
TK_REM = 111,
TK_CONCAT = 112,
TK_PTR = 113,
TK_COLLATE = 114,
TK_BITNOT = 115,
TK_ON = 116,
TK_INDEXED = 117,
TK_STRING = 118,
TK_JOIN_KW = 119,
TK_CONSTRAINT = 120,
TK_DEFAULT = 121,
TK_NULL = 122,
TK_PRIMARY = 123,
TK_UNIQUE = 124,
TK_CHECK = 125,
TK_REFERENCES = 126,
TK_AUTOINCR = 127,
TK_INSERT = 128,
TK_DELETE = 129,
TK_UPDATE = 130,
TK_SET = 131,
TK_DEFERRABLE = 132,
TK_FOREIGN = 133,
TK_DROP = 134,
TK_UNION = 135,
TK_ALL = 136,
TK_EXCEPT = 137,
TK_INTERSECT = 138,
TK_SELECT = 139,
TK_VALUES = 140,
TK_DISTINCT = 141,
TK_DOT = 142,
TK_FROM = 143,
TK_JOIN = 144,
TK_USING = 145,
TK_ORDER = 146,
TK_GROUP = 147,
TK_HAVING = 148,
TK_LIMIT = 149,
TK_WHERE = 150,
TK_RETURNING = 151,
TK_INTO = 152,
TK_NOTHING = 153,
TK_BLOB = 154,
TK_FLOAT = 155,
TK_INTEGER = 156,
TK_VARIABLE = 157,
TK_CASE = 158,
TK_WHEN = 159,
TK_THEN = 160,
TK_ELSE = 161,
TK_INDEX = 162,
TK_ALTER = 163,
TK_ADD = 164,
TK_WINDOW = 165,
TK_OVER = 166,
TK_FILTER = 167,
TK_ILLEGAL = 185,
}
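
keyword_token from the generated keywords.rs maps raw keyword bytes onto these variants case-insensitively. A hypothetical sanity-check sketch, assuming the generated file is included next to TokenType (the include! site is not shown in this commit):

// Hypothetical test sketch; assumes the generated keyword_token is in scope
// alongside TokenType.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn keyword_lookup_is_case_insensitive() {
        assert_eq!(keyword_token(b"select"), Some(TokenType::TK_SELECT));
        assert_eq!(keyword_token(b"SELECT"), Some(TokenType::TK_SELECT));
        assert_eq!(keyword_token(b"not_a_keyword"), None);
    }
}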