mirror of
https://github.com/aljazceru/turso.git
synced 2025-12-25 12:04:21 +01:00
Merge 'parser: replace KEYWORDS with matching' from Lâm Hoàng Phúc
before:
```sh
sqlparser-rs parsing benchmark/sqlparser::select
time: [693.20 ns 693.96 ns 694.73 ns]
change: [+7.4382% +7.6384% +7.8250%] (p = 0.00 < 0.05)
Performance has regressed.
Found 3 outliers among 100 measurements (3.00%)
1 (1.00%) low severe
1 (1.00%) low mild
1 (1.00%) high mild
sqlparser-rs parsing benchmark/sqlparser::with_select
time: [2.5734 µs 2.5763 µs 2.5796 µs]
change: [+16.583% +16.809% +17.024%] (p = 0.00 < 0.05)
Performance has regressed.
sqlparser-rs parsing benchmark/keyword_token
time: [3.1919 µs 3.1983 µs 3.2047 µs]
change: [+944.74% +948.97% +952.91%] (p = 0.00 < 0.05)
Performance has regressed.
Found 2 outliers among 100 measurements (2.00%)
2 (2.00%) low mild
```
after:
```sh
sqlparser-rs parsing benchmark/sqlparser::select
time: [637.09 ns 638.50 ns 640.15 ns]
change: [-1.8412% -1.5494% -1.2424%] (p = 0.00 < 0.05)
Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
1 (1.00%) low severe
3 (3.00%) low mild
3 (3.00%) high mild
1 (1.00%) high severe
sqlparser-rs parsing benchmark/sqlparser::with_select
time: [2.1896 µs 2.1919 µs 2.1942 µs]
change: [-0.6894% -0.3923% -0.1517%] (p = 0.00 < 0.05)
Change within noise threshold.
Found 4 outliers among 100 measurements (4.00%)
4 (4.00%) low severe
sqlparser-rs parsing benchmark/keyword_token
time: [298.99 ns 299.82 ns 300.72 ns]
change: [-1.4726% -1.0148% -0.5702%] (p = 0.00 < 0.05)
Change within noise threshold.
Found 7 outliers among 100 measurements (7.00%)
1 (1.00%) low mild
6 (6.00%) high mild
```
Reviewed-by: Jussi Saurio <jussi.saurio@gmail.com>
Closes #1939
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
|
||||
use std::fmt::Formatter;
|
||||
use std::str;
|
||||
use uncased::UncasedStr;
|
||||
|
||||
mod token;
|
||||
pub use token::TokenType;
|
||||
@@ -42,13 +41,6 @@ pub(crate) fn from_bytes(bytes: &[u8]) -> String {
|
||||
}
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/keywords.rs"));
|
||||
pub(crate) const MAX_KEYWORD_LEN: usize = 17;
|
||||
|
||||
/// Check if `word` is a keyword
|
||||
pub fn keyword_token(word: &[u8]) -> Option<TokenType> {
|
||||
let s = std::str::from_utf8(word).ok()?;
|
||||
KEYWORDS.get(UncasedStr::new(s)).cloned()
|
||||
}
|
||||
|
||||
pub(crate) fn is_identifier(name: &str) -> bool {
|
||||
if name.is_empty() {
|
||||
@@ -242,3 +234,176 @@ impl TokenType {
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Every keyword `keyword_token` must recognize, paired with the
    /// token type it is required to map to.
    const KEYWORD_CASES: &[(&str, TokenType)] = &[
        ("ABORT", TokenType::TK_ABORT),
        ("ACTION", TokenType::TK_ACTION),
        ("ADD", TokenType::TK_ADD),
        ("AFTER", TokenType::TK_AFTER),
        ("ALL", TokenType::TK_ALL),
        ("ALTER", TokenType::TK_ALTER),
        ("ALWAYS", TokenType::TK_ALWAYS),
        ("ANALYZE", TokenType::TK_ANALYZE),
        ("AND", TokenType::TK_AND),
        ("AS", TokenType::TK_AS),
        ("ASC", TokenType::TK_ASC),
        ("ATTACH", TokenType::TK_ATTACH),
        ("AUTOINCREMENT", TokenType::TK_AUTOINCR),
        ("BEFORE", TokenType::TK_BEFORE),
        ("BEGIN", TokenType::TK_BEGIN),
        ("BETWEEN", TokenType::TK_BETWEEN),
        ("BY", TokenType::TK_BY),
        ("CASCADE", TokenType::TK_CASCADE),
        ("CASE", TokenType::TK_CASE),
        ("CAST", TokenType::TK_CAST),
        ("CHECK", TokenType::TK_CHECK),
        ("COLLATE", TokenType::TK_COLLATE),
        ("COLUMN", TokenType::TK_COLUMNKW),
        ("COMMIT", TokenType::TK_COMMIT),
        ("CONFLICT", TokenType::TK_CONFLICT),
        ("CONSTRAINT", TokenType::TK_CONSTRAINT),
        ("CREATE", TokenType::TK_CREATE),
        ("CROSS", TokenType::TK_JOIN_KW),
        ("CURRENT", TokenType::TK_CURRENT),
        ("CURRENT_DATE", TokenType::TK_CTIME_KW),
        ("CURRENT_TIME", TokenType::TK_CTIME_KW),
        ("CURRENT_TIMESTAMP", TokenType::TK_CTIME_KW),
        ("DATABASE", TokenType::TK_DATABASE),
        ("DEFAULT", TokenType::TK_DEFAULT),
        ("DEFERRABLE", TokenType::TK_DEFERRABLE),
        ("DEFERRED", TokenType::TK_DEFERRED),
        ("DELETE", TokenType::TK_DELETE),
        ("DESC", TokenType::TK_DESC),
        ("DETACH", TokenType::TK_DETACH),
        ("DISTINCT", TokenType::TK_DISTINCT),
        ("DO", TokenType::TK_DO),
        ("DROP", TokenType::TK_DROP),
        ("EACH", TokenType::TK_EACH),
        ("ELSE", TokenType::TK_ELSE),
        ("END", TokenType::TK_END),
        ("ESCAPE", TokenType::TK_ESCAPE),
        ("EXCEPT", TokenType::TK_EXCEPT),
        ("EXCLUDE", TokenType::TK_EXCLUDE),
        ("EXCLUSIVE", TokenType::TK_EXCLUSIVE),
        ("EXISTS", TokenType::TK_EXISTS),
        ("EXPLAIN", TokenType::TK_EXPLAIN),
        ("FAIL", TokenType::TK_FAIL),
        ("FILTER", TokenType::TK_FILTER),
        ("FIRST", TokenType::TK_FIRST),
        ("FOLLOWING", TokenType::TK_FOLLOWING),
        ("FOR", TokenType::TK_FOR),
        ("FOREIGN", TokenType::TK_FOREIGN),
        ("FROM", TokenType::TK_FROM),
        ("FULL", TokenType::TK_JOIN_KW),
        ("GENERATED", TokenType::TK_GENERATED),
        ("GLOB", TokenType::TK_LIKE_KW),
        ("GROUP", TokenType::TK_GROUP),
        ("GROUPS", TokenType::TK_GROUPS),
        ("HAVING", TokenType::TK_HAVING),
        ("IF", TokenType::TK_IF),
        ("IGNORE", TokenType::TK_IGNORE),
        ("IMMEDIATE", TokenType::TK_IMMEDIATE),
        ("IN", TokenType::TK_IN),
        ("INDEX", TokenType::TK_INDEX),
        ("INDEXED", TokenType::TK_INDEXED),
        ("INITIALLY", TokenType::TK_INITIALLY),
        ("INNER", TokenType::TK_JOIN_KW),
        ("INSERT", TokenType::TK_INSERT),
        ("INSTEAD", TokenType::TK_INSTEAD),
        ("INTERSECT", TokenType::TK_INTERSECT),
        ("INTO", TokenType::TK_INTO),
        ("IS", TokenType::TK_IS),
        ("ISNULL", TokenType::TK_ISNULL),
        ("JOIN", TokenType::TK_JOIN),
        ("KEY", TokenType::TK_KEY),
        ("LAST", TokenType::TK_LAST),
        ("LEFT", TokenType::TK_JOIN_KW),
        ("LIKE", TokenType::TK_LIKE_KW),
        ("LIMIT", TokenType::TK_LIMIT),
        ("MATCH", TokenType::TK_MATCH),
        ("MATERIALIZED", TokenType::TK_MATERIALIZED),
        ("NATURAL", TokenType::TK_JOIN_KW),
        ("NO", TokenType::TK_NO),
        ("NOT", TokenType::TK_NOT),
        ("NOTHING", TokenType::TK_NOTHING),
        ("NOTNULL", TokenType::TK_NOTNULL),
        ("NULL", TokenType::TK_NULL),
        ("NULLS", TokenType::TK_NULLS),
        ("OF", TokenType::TK_OF),
        ("OFFSET", TokenType::TK_OFFSET),
        ("ON", TokenType::TK_ON),
        ("OR", TokenType::TK_OR),
        ("ORDER", TokenType::TK_ORDER),
        ("OTHERS", TokenType::TK_OTHERS),
        ("OUTER", TokenType::TK_JOIN_KW),
        ("OVER", TokenType::TK_OVER),
        ("PARTITION", TokenType::TK_PARTITION),
        ("PLAN", TokenType::TK_PLAN),
        ("PRAGMA", TokenType::TK_PRAGMA),
        ("PRECEDING", TokenType::TK_PRECEDING),
        ("PRIMARY", TokenType::TK_PRIMARY),
        ("QUERY", TokenType::TK_QUERY),
        ("RAISE", TokenType::TK_RAISE),
        ("RANGE", TokenType::TK_RANGE),
        ("RECURSIVE", TokenType::TK_RECURSIVE),
        ("REFERENCES", TokenType::TK_REFERENCES),
        ("REGEXP", TokenType::TK_LIKE_KW),
        ("REINDEX", TokenType::TK_REINDEX),
        ("RELEASE", TokenType::TK_RELEASE),
        ("RENAME", TokenType::TK_RENAME),
        ("REPLACE", TokenType::TK_REPLACE),
        ("RETURNING", TokenType::TK_RETURNING),
        ("RESTRICT", TokenType::TK_RESTRICT),
        ("RIGHT", TokenType::TK_JOIN_KW),
        ("ROLLBACK", TokenType::TK_ROLLBACK),
        ("ROW", TokenType::TK_ROW),
        ("ROWS", TokenType::TK_ROWS),
        ("SAVEPOINT", TokenType::TK_SAVEPOINT),
        ("SELECT", TokenType::TK_SELECT),
        ("SET", TokenType::TK_SET),
        ("TABLE", TokenType::TK_TABLE),
        ("TEMP", TokenType::TK_TEMP),
        ("TEMPORARY", TokenType::TK_TEMP),
        ("THEN", TokenType::TK_THEN),
        ("TIES", TokenType::TK_TIES),
        ("TO", TokenType::TK_TO),
        ("TRANSACTION", TokenType::TK_TRANSACTION),
        ("TRIGGER", TokenType::TK_TRIGGER),
        ("UNBOUNDED", TokenType::TK_UNBOUNDED),
        ("UNION", TokenType::TK_UNION),
        ("UNIQUE", TokenType::TK_UNIQUE),
        ("UPDATE", TokenType::TK_UPDATE),
        ("USING", TokenType::TK_USING),
        ("VACUUM", TokenType::TK_VACUUM),
        ("VALUES", TokenType::TK_VALUES),
        ("VIEW", TokenType::TK_VIEW),
        ("VIRTUAL", TokenType::TK_VIRTUAL),
        ("WHEN", TokenType::TK_WHEN),
        ("WHERE", TokenType::TK_WHERE),
        ("WINDOW", TokenType::TK_WINDOW),
        ("WITH", TokenType::TK_WITH),
        ("WITHOUT", TokenType::TK_WITHOUT),
    ];

    #[test]
    fn test_keyword_token() {
        // Every keyword must be recognized both in its canonical
        // upper-case spelling and in all-lower-case (lookup is
        // case-insensitive).
        for (key, expected) in KEYWORD_CASES {
            assert!(keyword_token(key.as_bytes()).unwrap() == *expected);
            assert!(
                keyword_token(key.to_ascii_lowercase().as_bytes()).unwrap() == *expected
            );
        }

        // Non-keywords must never match: empty input, plain identifiers,
        // inputs with spaces/underscores/dashes, and non-ASCII text that
        // only resembles a keyword.
        assert!(keyword_token(b"").is_none());
        assert!(keyword_token(b"wrong").is_none());
        assert!(keyword_token(b"super wrong").is_none());
        assert!(keyword_token(b"super_wrong").is_none());
        assert!(keyword_token(b"aae26e78-3ba7-4627-8f8f-02623302495a").is_none());
        assert!(keyword_token("Crème Brulée".as_bytes()).is_none());
        assert!(keyword_token("fróm".as_bytes()).is_none());
    }
}
|
||||
|
||||
@@ -4,9 +4,7 @@ use memchr::memchr;
|
||||
|
||||
pub use crate::dialect::TokenType;
|
||||
use crate::dialect::TokenType::*;
|
||||
use crate::dialect::{
|
||||
is_identifier_continue, is_identifier_start, keyword_token, sentinel, MAX_KEYWORD_LEN,
|
||||
};
|
||||
use crate::dialect::{is_identifier_continue, is_identifier_start, keyword_token, sentinel};
|
||||
use crate::parser::ast::Cmd;
|
||||
use crate::parser::parse::{yyParser, YYCODETYPE};
|
||||
use crate::parser::Context;
|
||||
@@ -719,12 +717,7 @@ impl Tokenizer {
|
||||
_ => data.len(),
|
||||
};
|
||||
let word = &data[..i];
|
||||
let tt = if word.len() >= 2 && word.len() <= MAX_KEYWORD_LEN && word.is_ascii() {
|
||||
keyword_token(word).unwrap_or(TK_ID)
|
||||
} else {
|
||||
TK_ID
|
||||
};
|
||||
(Some((word, tt)), i)
|
||||
(Some((word, keyword_token(word).unwrap_or(TK_ID))), i)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user