Merge 'parser: replace KEYWORDS with matching' from Lâm Hoàng Phúc

before:
```sh
sqlparser-rs parsing benchmark/sqlparser::select
                        time:   [693.20 ns 693.96 ns 694.73 ns]
                        change: [+7.4382% +7.6384% +7.8250%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) low severe
  1 (1.00%) low mild
  1 (1.00%) high mild
sqlparser-rs parsing benchmark/sqlparser::with_select
                        time:   [2.5734 µs 2.5763 µs 2.5796 µs]
                        change: [+16.583% +16.809% +17.024%] (p = 0.00 < 0.05)
                        Performance has regressed.
sqlparser-rs parsing benchmark/keyword_token
                        time:   [3.1919 µs 3.1983 µs 3.2047 µs]
                        change: [+944.74% +948.97% +952.91%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) low mild
```
after:
```sh
sqlparser-rs parsing benchmark/sqlparser::select
                        time:   [637.09 ns 638.50 ns 640.15 ns]
                        change: [-1.8412% -1.5494% -1.2424%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  1 (1.00%) low severe
  3 (3.00%) low mild
  3 (3.00%) high mild
  1 (1.00%) high severe
sqlparser-rs parsing benchmark/sqlparser::with_select
                        time:   [2.1896 µs 2.1919 µs 2.1942 µs]
                        change: [-0.6894% -0.3923% -0.1517%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 4 outliers among 100 measurements (4.00%)
  4 (4.00%) low severe
sqlparser-rs parsing benchmark/keyword_token
                        time:   [298.99 ns 299.82 ns 300.72 ns]
                        change: [-1.4726% -1.0148% -0.5702%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 7 outliers among 100 measurements (7.00%)
  1 (1.00%) low mild
  6 (6.00%) high mild
```
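
The headline change is `keyword_token`, whose benchmark pushes all 136 keywords through the matcher on every iteration: roughly 3.2 µs before versus 0.3 µs after, i.e. from about 24 ns to about 2 ns per keyword, a ~10x speedup; `sqlparser::select` (694 ns → 638 ns) and `sqlparser::with_select` (2.58 µs → 2.19 µs) improve as well. The old `keyword_token` had to validate the candidate word as UTF-8 and hash it through a case-insensitive `phf` map; the new one walks the raw bytes through a nested `match` generated at build time (a hand-expanded sketch of the generated shape appears after the generator function in the build-script diff below).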

Reviewed-by: Jussi Saurio <jussi.saurio@gmail.com>

Closes #1939
Pekka Enberg committed 2025-07-08 10:21:03 +03:00
6 changed files with 603 additions and 230 deletions

@@ -25,12 +25,10 @@ default = ["YYNOERRORRECOVERY", "NDEBUG"]
serde = ["dep:serde", "indexmap/serde", "bitflags/serde"]
[dependencies]
phf = { version = "0.11", features = ["uncased"] }
log = "0.4.22"
memchr = "2.0"
fallible-iterator = "0.3"
bitflags = "2.0"
uncased = "0.9.10"
indexmap = "2.0"
miette = "7.4.0"
strum = { workspace = true }
@@ -42,9 +40,6 @@ env_logger = { version = "0.11", default-features = false }
[build-dependencies]
cc = "1.0"
phf_shared = { version = "0.11", features = ["uncased"] }
phf_codegen = "0.11"
uncased = "0.9.10"
[lints.rust]
dead_code = "allow"

@@ -1,3 +1,4 @@
use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::{BufWriter, Result, Write};
@@ -5,7 +6,135 @@ use std::path::Path;
use std::process::Command;
use cc::Build;
use uncased::UncasedStr;
/// generates a trie-like function with nested match expressions for parsing SQL keywords
/// example: input: [["ABORT", "TK_ABORT"], ["ACTION", "TK_ACTION"], ["ADD", "TK_ADD"],]
/// A
/// ├─ B
/// │ ├─ O
/// │ │ ├─ R
/// │ │ │ ├─ T -> TK_ABORT
/// ├─ C
/// │ ├─ T
/// │ │ ├─ I
/// │ │ │ ├─ O
/// │ │ │ │ ├─ N -> TK_ACTION
/// ├─ D
/// │ ├─ D -> TK_ADD
fn build_keyword_map(
    writer: &mut impl Write,
    func_name: &str,
    keywords: &[[&'static str; 2]],
) -> Result<()> {
    assert!(!keywords.is_empty());
    let mut min_len = keywords[0][0].len();
    let mut max_len = keywords[0][0].len();

    struct PathEntry {
        result: Option<&'static str>,
        sub_entries: HashMap<u8, Box<PathEntry>>,
    }

    let mut paths = Box::new(PathEntry {
        result: None,
        sub_entries: HashMap::new(),
    });
    for keyword in keywords {
        let keyword_b = keyword[0].as_bytes();
        if keyword_b.len() < min_len {
            min_len = keyword_b.len();
        }
        if keyword_b.len() > max_len {
            max_len = keyword_b.len();
        }
        let mut current = &mut paths;
        for &b in keyword_b {
            let upper_b = b.to_ascii_uppercase();
            match current.sub_entries.get(&upper_b) {
                Some(_) => {
                    current = current.sub_entries.get_mut(&upper_b).unwrap();
                }
                None => {
                    let new_entry = Box::new(PathEntry {
                        result: None,
                        sub_entries: HashMap::new(),
                    });
                    current.sub_entries.insert(upper_b, new_entry);
                    current = current.sub_entries.get_mut(&upper_b).unwrap();
                }
            }
        }
        assert!(current.result.is_none());
        current.result = Some(keyword[1]);
    }

    fn write_entry(writer: &mut impl Write, entry: &PathEntry) -> Result<()> {
        if let Some(result) = entry.result {
            writeln!(writer, "if idx == buf.len() {{")?;
            writeln!(writer, "return Some(TokenType::{});", result)?;
            writeln!(writer, "}}")?;
        }
        if entry.sub_entries.is_empty() {
            writeln!(writer, "None")?;
            return Ok(());
        }
        writeln!(writer, "if idx >= buf.len() {{")?;
        writeln!(writer, "return None;")?;
        writeln!(writer, "}}")?;
        writeln!(writer, "match buf[idx] {{")?;
        for (&b, sub_entry) in &entry.sub_entries {
            if b.is_ascii_alphabetic() {
                writeln!(writer, "{} | {} => {{", b, b.to_ascii_lowercase())?;
            } else {
                writeln!(writer, "{} => {{", b)?;
            }
            writeln!(writer, "idx += 1;")?;
            write_entry(writer, sub_entry)?;
            writeln!(writer, "}}")?;
        }
        writeln!(writer, "_ => None")?;
        writeln!(writer, "}}")?;
        Ok(())
    }

    writeln!(
        writer,
        "pub(crate) const MAX_KEYWORD_LEN: usize = {};",
        max_len
    )?;
    writeln!(
        writer,
        "pub(crate) const MIN_KEYWORD_LEN: usize = {};",
        min_len
    )?;
    writeln!(writer, "/// Check if `word` is a keyword")?;
    writeln!(
        writer,
        "pub fn {}(buf: &[u8]) -> Option<TokenType> {{",
        func_name
    )?;
    writeln!(
        writer,
        "if buf.len() < MIN_KEYWORD_LEN || buf.len() > MAX_KEYWORD_LEN {{"
    )?;
    writeln!(writer, "return None;")?;
    writeln!(writer, "}}")?;
    writeln!(writer, "let mut idx = 0;")?;
    write_entry(writer, &paths)?;
    writeln!(writer, "}}")?;
    Ok(())
}
fn main() -> Result<()> {
let out_dir = env::var("OUT_DIR").unwrap();
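
To make the doc comment's trie concrete, here is a hand-expanded sketch of what `build_keyword_map` emits for a toy two-keyword table, `[["ADD", "TK_ADD"], ["AND", "TK_AND"]]`. The sketch is rustfmt-formatted, adds character comments, drops the `pub(crate)` qualifiers, and supplies a stand-in `TokenType` so it runs as a standalone program; the script itself emits the same structure unformatted, with one `65 | 97`-style arm per distinct byte, in whatever order the `HashMap` yields:

```rust
// Stand-in for the crate's real TokenType (normally defined in token.rs).
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq)]
enum TokenType {
    TK_ADD,
    TK_AND,
}

const MAX_KEYWORD_LEN: usize = 3;
const MIN_KEYWORD_LEN: usize = 3;

/// Check if `word` is a keyword
fn keyword_token(buf: &[u8]) -> Option<TokenType> {
    if buf.len() < MIN_KEYWORD_LEN || buf.len() > MAX_KEYWORD_LEN {
        return None;
    }
    let mut idx = 0;
    // The generator emits a bounds check at every level, even where the
    // length pre-check above already guarantees it can never fire.
    if idx >= buf.len() {
        return None;
    }
    match buf[idx] {
        // 'A' | 'a'
        65 | 97 => {
            idx += 1;
            if idx >= buf.len() {
                return None;
            }
            match buf[idx] {
                // 'D' | 'd': prefix "AD"
                68 | 100 => {
                    idx += 1;
                    if idx >= buf.len() {
                        return None;
                    }
                    match buf[idx] {
                        // 'D' | 'd': "ADD"
                        68 | 100 => {
                            idx += 1;
                            if idx == buf.len() {
                                return Some(TokenType::TK_ADD);
                            }
                            None
                        }
                        _ => None,
                    }
                }
                // 'N' | 'n': prefix "AN"
                78 | 110 => {
                    idx += 1;
                    if idx >= buf.len() {
                        return None;
                    }
                    match buf[idx] {
                        // 'D' | 'd': "AND"
                        68 | 100 => {
                            idx += 1;
                            if idx == buf.len() {
                                return Some(TokenType::TK_AND);
                            }
                            None
                        }
                        _ => None,
                    }
                }
                _ => None,
            }
        }
        _ => None,
    }
}

fn main() {
    assert_eq!(keyword_token(b"add"), Some(TokenType::TK_ADD));
    assert_eq!(keyword_token(b"AnD"), Some(TokenType::TK_AND));
    assert!(keyword_token(b"ads").is_none()); // falls off the trie
    assert!(keyword_token(b"an").is_none()); // rejected by the length check
}
```
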
@@ -43,164 +172,158 @@ fn main() -> Result<()> {
let keywords = out_path.join("keywords.rs");
let mut keywords = BufWriter::new(File::create(keywords)?);
write!(
build_keyword_map(
&mut keywords,
"static KEYWORDS: ::phf::Map<&'static UncasedStr, TokenType> = \n{};",
phf_codegen::Map::new()
.entry(UncasedStr::new("ABORT"), "TokenType::TK_ABORT")
.entry(UncasedStr::new("ACTION"), "TokenType::TK_ACTION")
.entry(UncasedStr::new("ADD"), "TokenType::TK_ADD")
.entry(UncasedStr::new("AFTER"), "TokenType::TK_AFTER")
.entry(UncasedStr::new("ALL"), "TokenType::TK_ALL")
.entry(UncasedStr::new("ALTER"), "TokenType::TK_ALTER")
.entry(UncasedStr::new("ALWAYS"), "TokenType::TK_ALWAYS")
.entry(UncasedStr::new("ANALYZE"), "TokenType::TK_ANALYZE")
.entry(UncasedStr::new("AND"), "TokenType::TK_AND")
.entry(UncasedStr::new("AS"), "TokenType::TK_AS")
.entry(UncasedStr::new("ASC"), "TokenType::TK_ASC")
.entry(UncasedStr::new("ATTACH"), "TokenType::TK_ATTACH")
.entry(UncasedStr::new("AUTOINCREMENT"), "TokenType::TK_AUTOINCR")
.entry(UncasedStr::new("BEFORE"), "TokenType::TK_BEFORE")
.entry(UncasedStr::new("BEGIN"), "TokenType::TK_BEGIN")
.entry(UncasedStr::new("BETWEEN"), "TokenType::TK_BETWEEN")
.entry(UncasedStr::new("BY"), "TokenType::TK_BY")
.entry(UncasedStr::new("CASCADE"), "TokenType::TK_CASCADE")
.entry(UncasedStr::new("CASE"), "TokenType::TK_CASE")
.entry(UncasedStr::new("CAST"), "TokenType::TK_CAST")
.entry(UncasedStr::new("CHECK"), "TokenType::TK_CHECK")
.entry(UncasedStr::new("COLLATE"), "TokenType::TK_COLLATE")
.entry(UncasedStr::new("COLUMN"), "TokenType::TK_COLUMNKW")
.entry(UncasedStr::new("COMMIT"), "TokenType::TK_COMMIT")
.entry(UncasedStr::new("CONFLICT"), "TokenType::TK_CONFLICT")
.entry(UncasedStr::new("CONSTRAINT"), "TokenType::TK_CONSTRAINT")
.entry(UncasedStr::new("CREATE"), "TokenType::TK_CREATE")
.entry(UncasedStr::new("CROSS"), "TokenType::TK_JOIN_KW")
.entry(UncasedStr::new("CURRENT"), "TokenType::TK_CURRENT")
.entry(UncasedStr::new("CURRENT_DATE"), "TokenType::TK_CTIME_KW")
.entry(UncasedStr::new("CURRENT_TIME"), "TokenType::TK_CTIME_KW")
.entry(
UncasedStr::new("CURRENT_TIMESTAMP"),
"TokenType::TK_CTIME_KW"
)
.entry(UncasedStr::new("DATABASE"), "TokenType::TK_DATABASE")
.entry(UncasedStr::new("DEFAULT"), "TokenType::TK_DEFAULT")
.entry(UncasedStr::new("DEFERRABLE"), "TokenType::TK_DEFERRABLE")
.entry(UncasedStr::new("DEFERRED"), "TokenType::TK_DEFERRED")
.entry(UncasedStr::new("DELETE"), "TokenType::TK_DELETE")
.entry(UncasedStr::new("DESC"), "TokenType::TK_DESC")
.entry(UncasedStr::new("DETACH"), "TokenType::TK_DETACH")
.entry(UncasedStr::new("DISTINCT"), "TokenType::TK_DISTINCT")
.entry(UncasedStr::new("DO"), "TokenType::TK_DO")
.entry(UncasedStr::new("DROP"), "TokenType::TK_DROP")
.entry(UncasedStr::new("EACH"), "TokenType::TK_EACH")
.entry(UncasedStr::new("ELSE"), "TokenType::TK_ELSE")
.entry(UncasedStr::new("END"), "TokenType::TK_END")
.entry(UncasedStr::new("ESCAPE"), "TokenType::TK_ESCAPE")
.entry(UncasedStr::new("EXCEPT"), "TokenType::TK_EXCEPT")
.entry(UncasedStr::new("EXCLUDE"), "TokenType::TK_EXCLUDE")
.entry(UncasedStr::new("EXCLUSIVE"), "TokenType::TK_EXCLUSIVE")
.entry(UncasedStr::new("EXISTS"), "TokenType::TK_EXISTS")
.entry(UncasedStr::new("EXPLAIN"), "TokenType::TK_EXPLAIN")
.entry(UncasedStr::new("FAIL"), "TokenType::TK_FAIL")
.entry(UncasedStr::new("FILTER"), "TokenType::TK_FILTER")
.entry(UncasedStr::new("FIRST"), "TokenType::TK_FIRST")
.entry(UncasedStr::new("FOLLOWING"), "TokenType::TK_FOLLOWING")
.entry(UncasedStr::new("FOR"), "TokenType::TK_FOR")
.entry(UncasedStr::new("FOREIGN"), "TokenType::TK_FOREIGN")
.entry(UncasedStr::new("FROM"), "TokenType::TK_FROM")
.entry(UncasedStr::new("FULL"), "TokenType::TK_JOIN_KW")
.entry(UncasedStr::new("GENERATED"), "TokenType::TK_GENERATED")
.entry(UncasedStr::new("GLOB"), "TokenType::TK_LIKE_KW")
.entry(UncasedStr::new("GROUP"), "TokenType::TK_GROUP")
.entry(UncasedStr::new("GROUPS"), "TokenType::TK_GROUPS")
.entry(UncasedStr::new("HAVING"), "TokenType::TK_HAVING")
.entry(UncasedStr::new("IF"), "TokenType::TK_IF")
.entry(UncasedStr::new("IGNORE"), "TokenType::TK_IGNORE")
.entry(UncasedStr::new("IMMEDIATE"), "TokenType::TK_IMMEDIATE")
.entry(UncasedStr::new("IN"), "TokenType::TK_IN")
.entry(UncasedStr::new("INDEX"), "TokenType::TK_INDEX")
.entry(UncasedStr::new("INDEXED"), "TokenType::TK_INDEXED")
.entry(UncasedStr::new("INITIALLY"), "TokenType::TK_INITIALLY")
.entry(UncasedStr::new("INNER"), "TokenType::TK_JOIN_KW")
.entry(UncasedStr::new("INSERT"), "TokenType::TK_INSERT")
.entry(UncasedStr::new("INSTEAD"), "TokenType::TK_INSTEAD")
.entry(UncasedStr::new("INTERSECT"), "TokenType::TK_INTERSECT")
.entry(UncasedStr::new("INTO"), "TokenType::TK_INTO")
.entry(UncasedStr::new("IS"), "TokenType::TK_IS")
.entry(UncasedStr::new("ISNULL"), "TokenType::TK_ISNULL")
.entry(UncasedStr::new("JOIN"), "TokenType::TK_JOIN")
.entry(UncasedStr::new("KEY"), "TokenType::TK_KEY")
.entry(UncasedStr::new("LAST"), "TokenType::TK_LAST")
.entry(UncasedStr::new("LEFT"), "TokenType::TK_JOIN_KW")
.entry(UncasedStr::new("LIKE"), "TokenType::TK_LIKE_KW")
.entry(UncasedStr::new("LIMIT"), "TokenType::TK_LIMIT")
.entry(UncasedStr::new("MATCH"), "TokenType::TK_MATCH")
.entry(
UncasedStr::new("MATERIALIZED"),
"TokenType::TK_MATERIALIZED"
)
.entry(UncasedStr::new("NATURAL"), "TokenType::TK_JOIN_KW")
.entry(UncasedStr::new("NO"), "TokenType::TK_NO")
.entry(UncasedStr::new("NOT"), "TokenType::TK_NOT")
.entry(UncasedStr::new("NOTHING"), "TokenType::TK_NOTHING")
.entry(UncasedStr::new("NOTNULL"), "TokenType::TK_NOTNULL")
.entry(UncasedStr::new("NULL"), "TokenType::TK_NULL")
.entry(UncasedStr::new("NULLS"), "TokenType::TK_NULLS")
.entry(UncasedStr::new("OF"), "TokenType::TK_OF")
.entry(UncasedStr::new("OFFSET"), "TokenType::TK_OFFSET")
.entry(UncasedStr::new("ON"), "TokenType::TK_ON")
.entry(UncasedStr::new("OR"), "TokenType::TK_OR")
.entry(UncasedStr::new("ORDER"), "TokenType::TK_ORDER")
.entry(UncasedStr::new("OTHERS"), "TokenType::TK_OTHERS")
.entry(UncasedStr::new("OUTER"), "TokenType::TK_JOIN_KW")
.entry(UncasedStr::new("OVER"), "TokenType::TK_OVER")
.entry(UncasedStr::new("PARTITION"), "TokenType::TK_PARTITION")
.entry(UncasedStr::new("PLAN"), "TokenType::TK_PLAN")
.entry(UncasedStr::new("PRAGMA"), "TokenType::TK_PRAGMA")
.entry(UncasedStr::new("PRECEDING"), "TokenType::TK_PRECEDING")
.entry(UncasedStr::new("PRIMARY"), "TokenType::TK_PRIMARY")
.entry(UncasedStr::new("QUERY"), "TokenType::TK_QUERY")
.entry(UncasedStr::new("RAISE"), "TokenType::TK_RAISE")
.entry(UncasedStr::new("RANGE"), "TokenType::TK_RANGE")
.entry(UncasedStr::new("RECURSIVE"), "TokenType::TK_RECURSIVE")
.entry(UncasedStr::new("REFERENCES"), "TokenType::TK_REFERENCES")
.entry(UncasedStr::new("REGEXP"), "TokenType::TK_LIKE_KW")
.entry(UncasedStr::new("REINDEX"), "TokenType::TK_REINDEX")
.entry(UncasedStr::new("RELEASE"), "TokenType::TK_RELEASE")
.entry(UncasedStr::new("RENAME"), "TokenType::TK_RENAME")
.entry(UncasedStr::new("REPLACE"), "TokenType::TK_REPLACE")
.entry(UncasedStr::new("RETURNING"), "TokenType::TK_RETURNING")
.entry(UncasedStr::new("RESTRICT"), "TokenType::TK_RESTRICT")
.entry(UncasedStr::new("RIGHT"), "TokenType::TK_JOIN_KW")
.entry(UncasedStr::new("ROLLBACK"), "TokenType::TK_ROLLBACK")
.entry(UncasedStr::new("ROW"), "TokenType::TK_ROW")
.entry(UncasedStr::new("ROWS"), "TokenType::TK_ROWS")
.entry(UncasedStr::new("SAVEPOINT"), "TokenType::TK_SAVEPOINT")
.entry(UncasedStr::new("SELECT"), "TokenType::TK_SELECT")
.entry(UncasedStr::new("SET"), "TokenType::TK_SET")
.entry(UncasedStr::new("TABLE"), "TokenType::TK_TABLE")
.entry(UncasedStr::new("TEMP"), "TokenType::TK_TEMP")
.entry(UncasedStr::new("TEMPORARY"), "TokenType::TK_TEMP")
.entry(UncasedStr::new("THEN"), "TokenType::TK_THEN")
.entry(UncasedStr::new("TIES"), "TokenType::TK_TIES")
.entry(UncasedStr::new("TO"), "TokenType::TK_TO")
.entry(UncasedStr::new("TRANSACTION"), "TokenType::TK_TRANSACTION")
.entry(UncasedStr::new("TRIGGER"), "TokenType::TK_TRIGGER")
.entry(UncasedStr::new("UNBOUNDED"), "TokenType::TK_UNBOUNDED")
.entry(UncasedStr::new("UNION"), "TokenType::TK_UNION")
.entry(UncasedStr::new("UNIQUE"), "TokenType::TK_UNIQUE")
.entry(UncasedStr::new("UPDATE"), "TokenType::TK_UPDATE")
.entry(UncasedStr::new("USING"), "TokenType::TK_USING")
.entry(UncasedStr::new("VACUUM"), "TokenType::TK_VACUUM")
.entry(UncasedStr::new("VALUES"), "TokenType::TK_VALUES")
.entry(UncasedStr::new("VIEW"), "TokenType::TK_VIEW")
.entry(UncasedStr::new("VIRTUAL"), "TokenType::TK_VIRTUAL")
.entry(UncasedStr::new("WHEN"), "TokenType::TK_WHEN")
.entry(UncasedStr::new("WHERE"), "TokenType::TK_WHERE")
.entry(UncasedStr::new("WINDOW"), "TokenType::TK_WINDOW")
.entry(UncasedStr::new("WITH"), "TokenType::TK_WITH")
.entry(UncasedStr::new("WITHOUT"), "TokenType::TK_WITHOUT")
.build()
"keyword_token",
&[
["ABORT", "TK_ABORT"],
["ACTION", "TK_ACTION"],
["ADD", "TK_ADD"],
["AFTER", "TK_AFTER"],
["ALL", "TK_ALL"],
["ALTER", "TK_ALTER"],
["ALWAYS", "TK_ALWAYS"],
["ANALYZE", "TK_ANALYZE"],
["AND", "TK_AND"],
["AS", "TK_AS"],
["ASC", "TK_ASC"],
["ATTACH", "TK_ATTACH"],
["AUTOINCREMENT", "TK_AUTOINCR"],
["BEFORE", "TK_BEFORE"],
["BEGIN", "TK_BEGIN"],
["BETWEEN", "TK_BETWEEN"],
["BY", "TK_BY"],
["CASCADE", "TK_CASCADE"],
["CASE", "TK_CASE"],
["CAST", "TK_CAST"],
["CHECK", "TK_CHECK"],
["COLLATE", "TK_COLLATE"],
["COLUMN", "TK_COLUMNKW"],
["COMMIT", "TK_COMMIT"],
["CONFLICT", "TK_CONFLICT"],
["CONSTRAINT", "TK_CONSTRAINT"],
["CREATE", "TK_CREATE"],
["CROSS", "TK_JOIN_KW"],
["CURRENT", "TK_CURRENT"],
["CURRENT_DATE", "TK_CTIME_KW"],
["CURRENT_TIME", "TK_CTIME_KW"],
["CURRENT_TIMESTAMP", "TK_CTIME_KW"],
["DATABASE", "TK_DATABASE"],
["DEFAULT", "TK_DEFAULT"],
["DEFERRABLE", "TK_DEFERRABLE"],
["DEFERRED", "TK_DEFERRED"],
["DELETE", "TK_DELETE"],
["DESC", "TK_DESC"],
["DETACH", "TK_DETACH"],
["DISTINCT", "TK_DISTINCT"],
["DO", "TK_DO"],
["DROP", "TK_DROP"],
["EACH", "TK_EACH"],
["ELSE", "TK_ELSE"],
["END", "TK_END"],
["ESCAPE", "TK_ESCAPE"],
["EXCEPT", "TK_EXCEPT"],
["EXCLUDE", "TK_EXCLUDE"],
["EXCLUSIVE", "TK_EXCLUSIVE"],
["EXISTS", "TK_EXISTS"],
["EXPLAIN", "TK_EXPLAIN"],
["FAIL", "TK_FAIL"],
["FILTER", "TK_FILTER"],
["FIRST", "TK_FIRST"],
["FOLLOWING", "TK_FOLLOWING"],
["FOR", "TK_FOR"],
["FOREIGN", "TK_FOREIGN"],
["FROM", "TK_FROM"],
["FULL", "TK_JOIN_KW"],
["GENERATED", "TK_GENERATED"],
["GLOB", "TK_LIKE_KW"],
["GROUP", "TK_GROUP"],
["GROUPS", "TK_GROUPS"],
["HAVING", "TK_HAVING"],
["IF", "TK_IF"],
["IGNORE", "TK_IGNORE"],
["IMMEDIATE", "TK_IMMEDIATE"],
["IN", "TK_IN"],
["INDEX", "TK_INDEX"],
["INDEXED", "TK_INDEXED"],
["INITIALLY", "TK_INITIALLY"],
["INNER", "TK_JOIN_KW"],
["INSERT", "TK_INSERT"],
["INSTEAD", "TK_INSTEAD"],
["INTERSECT", "TK_INTERSECT"],
["INTO", "TK_INTO"],
["IS", "TK_IS"],
["ISNULL", "TK_ISNULL"],
["JOIN", "TK_JOIN"],
["KEY", "TK_KEY"],
["LAST", "TK_LAST"],
["LEFT", "TK_JOIN_KW"],
["LIKE", "TK_LIKE_KW"],
["LIMIT", "TK_LIMIT"],
["MATCH", "TK_MATCH"],
["MATERIALIZED", "TK_MATERIALIZED"],
["NATURAL", "TK_JOIN_KW"],
["NO", "TK_NO"],
["NOT", "TK_NOT"],
["NOTHING", "TK_NOTHING"],
["NOTNULL", "TK_NOTNULL"],
["NULL", "TK_NULL"],
["NULLS", "TK_NULLS"],
["OF", "TK_OF"],
["OFFSET", "TK_OFFSET"],
["ON", "TK_ON"],
["OR", "TK_OR"],
["ORDER", "TK_ORDER"],
["OTHERS", "TK_OTHERS"],
["OUTER", "TK_JOIN_KW"],
["OVER", "TK_OVER"],
["PARTITION", "TK_PARTITION"],
["PLAN", "TK_PLAN"],
["PRAGMA", "TK_PRAGMA"],
["PRECEDING", "TK_PRECEDING"],
["PRIMARY", "TK_PRIMARY"],
["QUERY", "TK_QUERY"],
["RAISE", "TK_RAISE"],
["RANGE", "TK_RANGE"],
["RECURSIVE", "TK_RECURSIVE"],
["REFERENCES", "TK_REFERENCES"],
["REGEXP", "TK_LIKE_KW"],
["REINDEX", "TK_REINDEX"],
["RELEASE", "TK_RELEASE"],
["RENAME", "TK_RENAME"],
["REPLACE", "TK_REPLACE"],
["RETURNING", "TK_RETURNING"],
["RESTRICT", "TK_RESTRICT"],
["RIGHT", "TK_JOIN_KW"],
["ROLLBACK", "TK_ROLLBACK"],
["ROW", "TK_ROW"],
["ROWS", "TK_ROWS"],
["SAVEPOINT", "TK_SAVEPOINT"],
["SELECT", "TK_SELECT"],
["SET", "TK_SET"],
["TABLE", "TK_TABLE"],
["TEMP", "TK_TEMP"],
["TEMPORARY", "TK_TEMP"],
["THEN", "TK_THEN"],
["TIES", "TK_TIES"],
["TO", "TK_TO"],
["TRANSACTION", "TK_TRANSACTION"],
["TRIGGER", "TK_TRIGGER"],
["UNBOUNDED", "TK_UNBOUNDED"],
["UNION", "TK_UNION"],
["UNIQUE", "TK_UNIQUE"],
["UPDATE", "TK_UPDATE"],
["USING", "TK_USING"],
["VACUUM", "TK_VACUUM"],
["VALUES", "TK_VALUES"],
["VIEW", "TK_VIEW"],
["VIRTUAL", "TK_VIRTUAL"],
["WHEN", "TK_WHEN"],
["WHERE", "TK_WHERE"],
["WINDOW", "TK_WINDOW"],
["WITH", "TK_WITH"],
["WITHOUT", "TK_WITHOUT"],
],
)?;
println!("cargo:rerun-if-changed=third_party/lemon/lemon.c");
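
The generated file lands in `$OUT_DIR`, and the dialect module below splices it into the crate unchanged, so `keyword_token` and the `MIN_KEYWORD_LEN`/`MAX_KEYWORD_LEN` constants compile as ordinary crate items:

```rust
include!(concat!(env!("OUT_DIR"), "/keywords.rs"));
```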

@@ -12,7 +12,7 @@
use criterion::{criterion_group, criterion_main, Criterion};
use fallible_iterator::FallibleIterator;
use turso_sqlite3_parser::lexer::sql::Parser;
use turso_sqlite3_parser::{dialect::keyword_token, lexer::sql::Parser};
fn basic_queries(c: &mut Criterion) {
let mut group = c.benchmark_group("sqlparser-rs parsing benchmark");
@@ -42,6 +42,152 @@ fn basic_queries(c: &mut Criterion) {
assert!(parser.next().unwrap().unwrap().readonly())
});
});
static VALUES: [&[u8]; 136] = [
b"ABORT",
b"ACTION",
b"ADD",
b"AFTER",
b"ALL",
b"ALTER",
b"ANALYZE",
b"AND",
b"AS",
b"ASC",
b"ATTACH",
b"AUTOINCREMENT",
b"BEFORE",
b"BEGIN",
b"BETWEEN",
b"BY",
b"CASCADE",
b"CASE",
b"CAST",
b"CHECK",
b"COLLATE",
b"COLUMN",
b"COMMIT",
b"CONFLICT",
b"CONSTRAINT",
b"CREATE",
b"CROSS",
b"CURRENT",
b"CURRENT_DATE",
b"CURRENT_TIME",
b"CURRENT_TIMESTAMP",
b"DATABASE",
b"DEFAULT",
b"DEFERRABLE",
b"DEFERRED",
b"DELETE",
b"DESC",
b"DETACH",
b"DISTINCT",
b"DO",
b"DROP",
b"EACH",
b"ELSE",
b"END",
b"ESCAPE",
b"EXCEPT",
b"EXCLUSIVE",
b"EXISTS",
b"EXPLAIN",
b"FAIL",
b"FILTER",
b"FOLLOWING",
b"FOR",
b"FOREIGN",
b"FROM",
b"FULL",
b"GLOB",
b"GROUP",
b"HAVING",
b"IF",
b"IGNORE",
b"IMMEDIATE",
b"IN",
b"INDEX",
b"INDEXED",
b"INITIALLY",
b"INNER",
b"INSERT",
b"INSTEAD",
b"INTERSECT",
b"INTO",
b"IS",
b"ISNULL",
b"JOIN",
b"KEY",
b"LEFT",
b"LIKE",
b"LIMIT",
b"MATCH",
b"NATURAL",
b"NO",
b"NOT",
b"NOTHING",
b"NOTNULL",
b"NULL",
b"OF",
b"OFFSET",
b"ON",
b"OR",
b"ORDER",
b"OUTER",
b"OVER",
b"PARTITION",
b"PLAN",
b"PRAGMA",
b"PRECEDING",
b"PRIMARY",
b"QUERY",
b"RAISE",
b"RANGE",
b"RECURSIVE",
b"REFERENCES",
b"REGEXP",
b"REINDEX",
b"RELEASE",
b"RENAME",
b"REPLACE",
b"RESTRICT",
b"RIGHT",
b"ROLLBACK",
b"ROW",
b"ROWS",
b"SAVEPOINT",
b"SELECT",
b"SET",
b"TABLE",
b"TEMP",
b"TEMPORARY",
b"THEN",
b"TO",
b"TRANSACTION",
b"TRIGGER",
b"UNBOUNDED",
b"UNION",
b"UNIQUE",
b"UPDATE",
b"USING",
b"VACUUM",
b"VALUES",
b"VIEW",
b"VIRTUAL",
b"WHEN",
b"WHERE",
b"WINDOW",
b"WITH",
b"WITHOUT",
];
group.bench_with_input("keyword_token", &VALUES, |b, &s| {
    b.iter(|| {
        for value in &s {
            assert!(keyword_token(value).is_some())
        }
    });
});
}
criterion_group!(benches, basic_queries);

@@ -2,7 +2,6 @@
use std::fmt::Formatter;
use std::str;
use uncased::UncasedStr;
mod token;
pub use token::TokenType;
@@ -42,13 +41,6 @@ pub(crate) fn from_bytes(bytes: &[u8]) -> String {
}
include!(concat!(env!("OUT_DIR"), "/keywords.rs"));
pub(crate) const MAX_KEYWORD_LEN: usize = 17;
/// Check if `word` is a keyword
pub fn keyword_token(word: &[u8]) -> Option<TokenType> {
    let s = std::str::from_utf8(word).ok()?;
    KEYWORDS.get(UncasedStr::new(s)).cloned()
}
pub(crate) fn is_identifier(name: &str) -> bool {
if name.is_empty() {
@@ -242,3 +234,176 @@ impl TokenType {
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
#[test]
fn test_keyword_token() {
let values = HashMap::from([
("ABORT", TokenType::TK_ABORT),
("ACTION", TokenType::TK_ACTION),
("ADD", TokenType::TK_ADD),
("AFTER", TokenType::TK_AFTER),
("ALL", TokenType::TK_ALL),
("ALTER", TokenType::TK_ALTER),
("ALWAYS", TokenType::TK_ALWAYS),
("ANALYZE", TokenType::TK_ANALYZE),
("AND", TokenType::TK_AND),
("AS", TokenType::TK_AS),
("ASC", TokenType::TK_ASC),
("ATTACH", TokenType::TK_ATTACH),
("AUTOINCREMENT", TokenType::TK_AUTOINCR),
("BEFORE", TokenType::TK_BEFORE),
("BEGIN", TokenType::TK_BEGIN),
("BETWEEN", TokenType::TK_BETWEEN),
("BY", TokenType::TK_BY),
("CASCADE", TokenType::TK_CASCADE),
("CASE", TokenType::TK_CASE),
("CAST", TokenType::TK_CAST),
("CHECK", TokenType::TK_CHECK),
("COLLATE", TokenType::TK_COLLATE),
("COLUMN", TokenType::TK_COLUMNKW),
("COMMIT", TokenType::TK_COMMIT),
("CONFLICT", TokenType::TK_CONFLICT),
("CONSTRAINT", TokenType::TK_CONSTRAINT),
("CREATE", TokenType::TK_CREATE),
("CROSS", TokenType::TK_JOIN_KW),
("CURRENT", TokenType::TK_CURRENT),
("CURRENT_DATE", TokenType::TK_CTIME_KW),
("CURRENT_TIME", TokenType::TK_CTIME_KW),
("CURRENT_TIMESTAMP", TokenType::TK_CTIME_KW),
("DATABASE", TokenType::TK_DATABASE),
("DEFAULT", TokenType::TK_DEFAULT),
("DEFERRABLE", TokenType::TK_DEFERRABLE),
("DEFERRED", TokenType::TK_DEFERRED),
("DELETE", TokenType::TK_DELETE),
("DESC", TokenType::TK_DESC),
("DETACH", TokenType::TK_DETACH),
("DISTINCT", TokenType::TK_DISTINCT),
("DO", TokenType::TK_DO),
("DROP", TokenType::TK_DROP),
("EACH", TokenType::TK_EACH),
("ELSE", TokenType::TK_ELSE),
("END", TokenType::TK_END),
("ESCAPE", TokenType::TK_ESCAPE),
("EXCEPT", TokenType::TK_EXCEPT),
("EXCLUDE", TokenType::TK_EXCLUDE),
("EXCLUSIVE", TokenType::TK_EXCLUSIVE),
("EXISTS", TokenType::TK_EXISTS),
("EXPLAIN", TokenType::TK_EXPLAIN),
("FAIL", TokenType::TK_FAIL),
("FILTER", TokenType::TK_FILTER),
("FIRST", TokenType::TK_FIRST),
("FOLLOWING", TokenType::TK_FOLLOWING),
("FOR", TokenType::TK_FOR),
("FOREIGN", TokenType::TK_FOREIGN),
("FROM", TokenType::TK_FROM),
("FULL", TokenType::TK_JOIN_KW),
("GENERATED", TokenType::TK_GENERATED),
("GLOB", TokenType::TK_LIKE_KW),
("GROUP", TokenType::TK_GROUP),
("GROUPS", TokenType::TK_GROUPS),
("HAVING", TokenType::TK_HAVING),
("IF", TokenType::TK_IF),
("IGNORE", TokenType::TK_IGNORE),
("IMMEDIATE", TokenType::TK_IMMEDIATE),
("IN", TokenType::TK_IN),
("INDEX", TokenType::TK_INDEX),
("INDEXED", TokenType::TK_INDEXED),
("INITIALLY", TokenType::TK_INITIALLY),
("INNER", TokenType::TK_JOIN_KW),
("INSERT", TokenType::TK_INSERT),
("INSTEAD", TokenType::TK_INSTEAD),
("INTERSECT", TokenType::TK_INTERSECT),
("INTO", TokenType::TK_INTO),
("IS", TokenType::TK_IS),
("ISNULL", TokenType::TK_ISNULL),
("JOIN", TokenType::TK_JOIN),
("KEY", TokenType::TK_KEY),
("LAST", TokenType::TK_LAST),
("LEFT", TokenType::TK_JOIN_KW),
("LIKE", TokenType::TK_LIKE_KW),
("LIMIT", TokenType::TK_LIMIT),
("MATCH", TokenType::TK_MATCH),
("MATERIALIZED", TokenType::TK_MATERIALIZED),
("NATURAL", TokenType::TK_JOIN_KW),
("NO", TokenType::TK_NO),
("NOT", TokenType::TK_NOT),
("NOTHING", TokenType::TK_NOTHING),
("NOTNULL", TokenType::TK_NOTNULL),
("NULL", TokenType::TK_NULL),
("NULLS", TokenType::TK_NULLS),
("OF", TokenType::TK_OF),
("OFFSET", TokenType::TK_OFFSET),
("ON", TokenType::TK_ON),
("OR", TokenType::TK_OR),
("ORDER", TokenType::TK_ORDER),
("OTHERS", TokenType::TK_OTHERS),
("OUTER", TokenType::TK_JOIN_KW),
("OVER", TokenType::TK_OVER),
("PARTITION", TokenType::TK_PARTITION),
("PLAN", TokenType::TK_PLAN),
("PRAGMA", TokenType::TK_PRAGMA),
("PRECEDING", TokenType::TK_PRECEDING),
("PRIMARY", TokenType::TK_PRIMARY),
("QUERY", TokenType::TK_QUERY),
("RAISE", TokenType::TK_RAISE),
("RANGE", TokenType::TK_RANGE),
("RECURSIVE", TokenType::TK_RECURSIVE),
("REFERENCES", TokenType::TK_REFERENCES),
("REGEXP", TokenType::TK_LIKE_KW),
("REINDEX", TokenType::TK_REINDEX),
("RELEASE", TokenType::TK_RELEASE),
("RENAME", TokenType::TK_RENAME),
("REPLACE", TokenType::TK_REPLACE),
("RETURNING", TokenType::TK_RETURNING),
("RESTRICT", TokenType::TK_RESTRICT),
("RIGHT", TokenType::TK_JOIN_KW),
("ROLLBACK", TokenType::TK_ROLLBACK),
("ROW", TokenType::TK_ROW),
("ROWS", TokenType::TK_ROWS),
("SAVEPOINT", TokenType::TK_SAVEPOINT),
("SELECT", TokenType::TK_SELECT),
("SET", TokenType::TK_SET),
("TABLE", TokenType::TK_TABLE),
("TEMP", TokenType::TK_TEMP),
("TEMPORARY", TokenType::TK_TEMP),
("THEN", TokenType::TK_THEN),
("TIES", TokenType::TK_TIES),
("TO", TokenType::TK_TO),
("TRANSACTION", TokenType::TK_TRANSACTION),
("TRIGGER", TokenType::TK_TRIGGER),
("UNBOUNDED", TokenType::TK_UNBOUNDED),
("UNION", TokenType::TK_UNION),
("UNIQUE", TokenType::TK_UNIQUE),
("UPDATE", TokenType::TK_UPDATE),
("USING", TokenType::TK_USING),
("VACUUM", TokenType::TK_VACUUM),
("VALUES", TokenType::TK_VALUES),
("VIEW", TokenType::TK_VIEW),
("VIRTUAL", TokenType::TK_VIRTUAL),
("WHEN", TokenType::TK_WHEN),
("WHERE", TokenType::TK_WHERE),
("WINDOW", TokenType::TK_WINDOW),
("WITH", TokenType::TK_WITH),
("WITHOUT", TokenType::TK_WITHOUT),
]);
for (key, value) in &values {
assert!(keyword_token(key.as_bytes()).unwrap() == *value);
assert!(
keyword_token(key.as_bytes().to_ascii_lowercase().as_slice()).unwrap() == *value
);
}
assert!(keyword_token(b"").is_none());
assert!(keyword_token(b"wrong").is_none());
assert!(keyword_token(b"super wrong").is_none());
assert!(keyword_token(b"super_wrong").is_none());
assert!(keyword_token(b"aae26e78-3ba7-4627-8f8f-02623302495a").is_none());
assert!(keyword_token("Crème Brulée".as_bytes()).is_none());
assert!(keyword_token("fróm".as_bytes()).is_none());
}
}

@@ -4,9 +4,7 @@ use memchr::memchr;
pub use crate::dialect::TokenType;
use crate::dialect::TokenType::*;
use crate::dialect::{
    is_identifier_continue, is_identifier_start, keyword_token, sentinel, MAX_KEYWORD_LEN,
};
use crate::dialect::{is_identifier_continue, is_identifier_start, keyword_token, sentinel};
use crate::parser::ast::Cmd;
use crate::parser::parse::{yyParser, YYCODETYPE};
use crate::parser::Context;
@@ -719,12 +717,7 @@ impl Tokenizer {
_ => data.len(),
};
let word = &data[..i];
let tt = if word.len() >= 2 && word.len() <= MAX_KEYWORD_LEN && word.is_ascii() {
    keyword_token(word).unwrap_or(TK_ID)
} else {
    TK_ID
};
(Some((word, tt)), i)
(Some((word, keyword_token(word).unwrap_or(TK_ID))), i)
}
}
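
The tokenizer-side simplification at the end is safe on two counts, both visible in the generated matcher: the up-front length check subsumes the old `word.len()` guard (`MIN_KEYWORD_LEN` is 2, for `AS`, `BY`, and friends; `MAX_KEYWORD_LEN` is 17, for `CURRENT_TIMESTAMP`), and a non-ASCII byte matches no arm and falls through to `None`, so the `is_ascii()` pre-filter bought nothing. Condensed from the new tests above:

```rust
#[test]
fn non_keywords_fall_through() {
    assert!(keyword_token(b"x").is_none()); // shorter than MIN_KEYWORD_LEN
    assert!(keyword_token("fróm".as_bytes()).is_none()); // 0xC3 matches no arm
}
```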