turso/parser/src/lexer.rs
Pekka Enberg 433b60555f Add BEGIN CONCURRENT support for MVCC mode
Currently, when MVCC is enabled, every transaction mode supports
concurrent reads and writes, which makes it hard to adopt for existing
applications that use `BEGIN DEFERRED` or `BEGIN IMMEDIATE`.

Therefore, add support for `BEGIN CONCURRENT` transactions when MVCC is
enabled. The transaction mode allows multiple concurrent read/write
transactions that don't block each other, with conflicts resolved at
commit time. Furthermore, implement the correct semantics for `BEGIN
DEFERRED` and `BEGIN IMMEDIATE` by taking advantage of the pager-level
write lock when a transaction upgrades to a write. This means that
concurrent MVCC transactions are now serialized against the legacy ones
when needed.

The implementation includes:

- Parser support for CONCURRENT keyword in BEGIN statements

- New Concurrent variant in TransactionMode to distinguish from regular
  read/write transactions

- MVCC store tracking of exclusive transactions to support IMMEDIATE and
  EXCLUSIVE modes alongside CONCURRENT

- Proper transaction state management for all transaction types in MVCC

This enables better concurrency for applications that can handle
optimistic concurrency control, while still supporting traditional
SQLite transaction semantics via IMMEDIATE and EXCLUSIVE modes.
2025-09-11 16:05:52 +03:00


use crate::{error::Error, token::TokenType, Result};
use turso_macros::match_ignore_ascii_case;
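/// Maps a keyword to its token type, matching ASCII case-insensitively,
/// and falls back to `TK_ID` for anything that is not a reserved word.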
fn keyword_or_id_token(input: &[u8]) -> TokenType {
match_ignore_ascii_case!(match input {
b"ABORT" => TokenType::TK_ABORT,
b"ACTION" => TokenType::TK_ACTION,
b"ADD" => TokenType::TK_ADD,
b"AFTER" => TokenType::TK_AFTER,
b"ALL" => TokenType::TK_ALL,
b"ALTER" => TokenType::TK_ALTER,
b"ALWAYS" => TokenType::TK_ALWAYS,
b"ANALYZE" => TokenType::TK_ANALYZE,
b"AND" => TokenType::TK_AND,
b"AS" => TokenType::TK_AS,
b"ASC" => TokenType::TK_ASC,
b"ATTACH" => TokenType::TK_ATTACH,
b"AUTOINCREMENT" => TokenType::TK_AUTOINCR,
b"BEFORE" => TokenType::TK_BEFORE,
b"BEGIN" => TokenType::TK_BEGIN,
b"BETWEEN" => TokenType::TK_BETWEEN,
b"BY" => TokenType::TK_BY,
b"CASCADE" => TokenType::TK_CASCADE,
b"CASE" => TokenType::TK_CASE,
b"CAST" => TokenType::TK_CAST,
b"CHECK" => TokenType::TK_CHECK,
b"COLLATE" => TokenType::TK_COLLATE,
b"COLUMN" => TokenType::TK_COLUMNKW,
b"COMMIT" => TokenType::TK_COMMIT,
b"CONCURRENT" => TokenType::TK_CONCURRENT,
b"CONFLICT" => TokenType::TK_CONFLICT,
b"CONSTRAINT" => TokenType::TK_CONSTRAINT,
b"CREATE" => TokenType::TK_CREATE,
b"CROSS" => TokenType::TK_JOIN_KW,
b"CURRENT" => TokenType::TK_CURRENT,
b"CURRENT_DATE" => TokenType::TK_CTIME_KW,
b"CURRENT_TIME" => TokenType::TK_CTIME_KW,
b"CURRENT_TIMESTAMP" => TokenType::TK_CTIME_KW,
b"DATABASE" => TokenType::TK_DATABASE,
b"DEFAULT" => TokenType::TK_DEFAULT,
b"DEFERRABLE" => TokenType::TK_DEFERRABLE,
b"DEFERRED" => TokenType::TK_DEFERRED,
b"DELETE" => TokenType::TK_DELETE,
b"DESC" => TokenType::TK_DESC,
b"DETACH" => TokenType::TK_DETACH,
b"DISTINCT" => TokenType::TK_DISTINCT,
b"DO" => TokenType::TK_DO,
b"DROP" => TokenType::TK_DROP,
b"EACH" => TokenType::TK_EACH,
b"ELSE" => TokenType::TK_ELSE,
b"END" => TokenType::TK_END,
b"ESCAPE" => TokenType::TK_ESCAPE,
b"EXCEPT" => TokenType::TK_EXCEPT,
b"EXCLUDE" => TokenType::TK_EXCLUDE,
b"EXCLUSIVE" => TokenType::TK_EXCLUSIVE,
b"EXISTS" => TokenType::TK_EXISTS,
b"EXPLAIN" => TokenType::TK_EXPLAIN,
b"FAIL" => TokenType::TK_FAIL,
b"FILTER" => TokenType::TK_FILTER,
b"FIRST" => TokenType::TK_FIRST,
b"FOLLOWING" => TokenType::TK_FOLLOWING,
b"FOR" => TokenType::TK_FOR,
b"FOREIGN" => TokenType::TK_FOREIGN,
b"FROM" => TokenType::TK_FROM,
b"FULL" => TokenType::TK_JOIN_KW,
b"GENERATED" => TokenType::TK_GENERATED,
b"GLOB" => TokenType::TK_LIKE_KW,
b"GROUP" => TokenType::TK_GROUP,
b"GROUPS" => TokenType::TK_GROUPS,
b"HAVING" => TokenType::TK_HAVING,
b"IF" => TokenType::TK_IF,
b"IGNORE" => TokenType::TK_IGNORE,
b"IMMEDIATE" => TokenType::TK_IMMEDIATE,
b"IN" => TokenType::TK_IN,
b"INDEX" => TokenType::TK_INDEX,
b"INDEXED" => TokenType::TK_INDEXED,
b"INITIALLY" => TokenType::TK_INITIALLY,
b"INNER" => TokenType::TK_JOIN_KW,
b"INSERT" => TokenType::TK_INSERT,
b"INSTEAD" => TokenType::TK_INSTEAD,
b"INTERSECT" => TokenType::TK_INTERSECT,
b"INTO" => TokenType::TK_INTO,
b"IS" => TokenType::TK_IS,
b"ISNULL" => TokenType::TK_ISNULL,
b"JOIN" => TokenType::TK_JOIN,
b"KEY" => TokenType::TK_KEY,
b"LAST" => TokenType::TK_LAST,
b"LEFT" => TokenType::TK_JOIN_KW,
b"LIKE" => TokenType::TK_LIKE_KW,
b"LIMIT" => TokenType::TK_LIMIT,
b"MATCH" => TokenType::TK_MATCH,
b"MATERIALIZED" => TokenType::TK_MATERIALIZED,
b"NATURAL" => TokenType::TK_JOIN_KW,
b"NO" => TokenType::TK_NO,
b"NOT" => TokenType::TK_NOT,
b"NOTHING" => TokenType::TK_NOTHING,
b"NOTNULL" => TokenType::TK_NOTNULL,
b"NULL" => TokenType::TK_NULL,
b"NULLS" => TokenType::TK_NULLS,
b"OF" => TokenType::TK_OF,
b"OFFSET" => TokenType::TK_OFFSET,
b"ON" => TokenType::TK_ON,
b"OR" => TokenType::TK_OR,
b"ORDER" => TokenType::TK_ORDER,
b"OTHERS" => TokenType::TK_OTHERS,
b"OUTER" => TokenType::TK_JOIN_KW,
b"OVER" => TokenType::TK_OVER,
b"PARTITION" => TokenType::TK_PARTITION,
b"PLAN" => TokenType::TK_PLAN,
b"PRAGMA" => TokenType::TK_PRAGMA,
b"PRECEDING" => TokenType::TK_PRECEDING,
b"PRIMARY" => TokenType::TK_PRIMARY,
b"QUERY" => TokenType::TK_QUERY,
b"RAISE" => TokenType::TK_RAISE,
b"RANGE" => TokenType::TK_RANGE,
b"RECURSIVE" => TokenType::TK_RECURSIVE,
b"REFERENCES" => TokenType::TK_REFERENCES,
b"REGEXP" => TokenType::TK_LIKE_KW,
b"REINDEX" => TokenType::TK_REINDEX,
b"RELEASE" => TokenType::TK_RELEASE,
b"RENAME" => TokenType::TK_RENAME,
b"REPLACE" => TokenType::TK_REPLACE,
b"RETURNING" => TokenType::TK_RETURNING,
b"RESTRICT" => TokenType::TK_RESTRICT,
b"RIGHT" => TokenType::TK_JOIN_KW,
b"ROLLBACK" => TokenType::TK_ROLLBACK,
b"ROW" => TokenType::TK_ROW,
b"ROWS" => TokenType::TK_ROWS,
b"SAVEPOINT" => TokenType::TK_SAVEPOINT,
b"SELECT" => TokenType::TK_SELECT,
b"SET" => TokenType::TK_SET,
b"TABLE" => TokenType::TK_TABLE,
b"TEMP" => TokenType::TK_TEMP,
b"TEMPORARY" => TokenType::TK_TEMP,
b"THEN" => TokenType::TK_THEN,
b"TIES" => TokenType::TK_TIES,
b"TO" => TokenType::TK_TO,
b"TRANSACTION" => TokenType::TK_TRANSACTION,
b"TRIGGER" => TokenType::TK_TRIGGER,
b"UNBOUNDED" => TokenType::TK_UNBOUNDED,
b"UNION" => TokenType::TK_UNION,
b"UNIQUE" => TokenType::TK_UNIQUE,
b"UPDATE" => TokenType::TK_UPDATE,
b"USING" => TokenType::TK_USING,
b"VACUUM" => TokenType::TK_VACUUM,
b"VALUES" => TokenType::TK_VALUES,
b"VIEW" => TokenType::TK_VIEW,
b"VIRTUAL" => TokenType::TK_VIRTUAL,
b"WHEN" => TokenType::TK_WHEN,
b"WHERE" => TokenType::TK_WHERE,
b"WINDOW" => TokenType::TK_WINDOW,
b"WITH" => TokenType::TK_WITH,
b"WITHOUT" => TokenType::TK_WITHOUT,
_ => TokenType::TK_ID,
})
}
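/// True for bytes that may start an identifier: ASCII letters, `_`, or any
/// byte above 0x7F (so multi-byte UTF-8 characters are accepted).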
#[inline(always)]
pub fn is_identifier_start(b: u8) -> bool {
b.is_ascii_uppercase() || b == b'_' || b.is_ascii_lowercase() || b > b'\x7F'
}
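/// True for bytes that may continue an identifier: the start set plus ASCII
/// digits and `$`.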
#[inline(always)]
pub fn is_identifier_continue(b: u8) -> bool {
b == b'$'
|| b.is_ascii_digit()
|| b.is_ascii_uppercase()
|| b == b'_'
|| b.is_ascii_lowercase()
|| b > b'\x7F'
}
#[derive(Clone, PartialEq, Eq, Debug)] // do not derive Copy for Token, just use .clone() when needed
pub struct Token<'a> {
pub value: &'a [u8],
pub token_type: Option<TokenType>, // None means the token is whitespace or a comment
}
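/// A zero-copy SQL lexer over a byte slice. It implements `Iterator`,
/// yielding `Result<Token>` items; whitespace and comments are yielded as
/// tokens with `token_type: None` rather than being skipped.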
pub struct Lexer<'a> {
pub(crate) offset: usize,
pub(crate) input: &'a [u8],
}
impl<'a> Iterator for Lexer<'a> {
type Item = Result<Token<'a>>;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
match self.peek() {
None => None, // End of file
Some(b) if b.is_ascii_whitespace() => Some(Ok(self.eat_white_space())),
// matching logic
Some(b) => match b {
b'-' => Some(Ok(self.eat_minus_or_comment_or_ptr())),
b'(' => Some(Ok(self.eat_one_token(TokenType::TK_LP))),
b')' => Some(Ok(self.eat_one_token(TokenType::TK_RP))),
b';' => Some(Ok(self.eat_one_token(TokenType::TK_SEMI))),
b'+' => Some(Ok(self.eat_one_token(TokenType::TK_PLUS))),
b'*' => Some(Ok(self.eat_one_token(TokenType::TK_STAR))),
b'/' => Some(self.mark(|l| l.eat_slash_or_comment())),
b'%' => Some(Ok(self.eat_one_token(TokenType::TK_REM))),
b'=' => Some(Ok(self.eat_eq())),
b'<' => Some(Ok(self.eat_le_or_ne_or_lshift_or_lt())),
b'>' => Some(Ok(self.eat_ge_or_gt_or_rshift())),
b'!' => Some(self.mark(|l| l.eat_ne())),
b'|' => Some(Ok(self.eat_concat_or_bitor())),
b',' => Some(Ok(self.eat_one_token(TokenType::TK_COMMA))),
b'&' => Some(Ok(self.eat_one_token(TokenType::TK_BITAND))),
b'~' => Some(Ok(self.eat_one_token(TokenType::TK_BITNOT))),
b'\'' | b'"' | b'`' => Some(self.mark(|l| l.eat_lit_or_id())),
b'.' => Some(self.mark(|l| l.eat_dot_or_frac(false))),
b'0'..=b'9' => Some(self.mark(|l| l.eat_number())),
b'[' => Some(self.mark(|l| l.eat_bracket())),
b'?' | b'$' | b'@' | b'#' | b':' => Some(self.mark(|l| l.eat_var())),
b if is_identifier_start(b) => Some(self.mark(|l| l.eat_blob_or_id())),
_ => Some(self.eat_unrecognized()),
},
}
}
}
impl<'a> Lexer<'a> {
#[inline(always)]
pub fn new(input: &'a [u8]) -> Self {
Lexer { input, offset: 0 }
}
#[inline(always)]
pub fn remaining(&self) -> &'a [u8] {
&self.input[self.offset..]
}
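/// Runs `exc` and, if it returns an error, rewinds `offset` to where it was
/// before the call so the rejected input is not left half-consumed.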
#[inline]
pub fn mark<F, R>(&mut self, exc: F) -> Result<R>
where
F: FnOnce(&mut Self) -> Result<R>,
{
let start_offset = self.offset;
let result = exc(self);
if result.is_err() {
self.offset = start_offset; // Reset to the start offset if an error occurs
}
result
}
/// Returns the byte at the current offset without consuming it.
#[inline(always)]
pub fn peek(&self) -> Option<u8> {
if self.offset < self.input.len() {
Some(self.input[self.offset])
} else {
None // End of file
}
}
/// Returns the byte at the current offset and advances past it.
#[inline(always)]
pub fn eat(&mut self) -> Option<u8> {
let result = self.peek();
if result.is_some() {
self.offset += 1;
}
result
}
#[inline(always)]
fn eat_and_assert<F>(&mut self, f: F)
where
F: Fn(u8) -> bool,
{
let _value = self.eat();
debug_assert!(f(_value.unwrap()))
}
#[inline]
fn eat_while<F>(&mut self, f: F)
where
F: Fn(Option<u8>) -> bool,
{
loop {
if !f(self.peek()) {
return;
}
self.eat();
}
}
fn eat_while_number_digit(&mut self) -> Result<()> {
loop {
let start = self.offset;
self.eat_while(|b| b.is_some() && b.unwrap().is_ascii_digit());
match self.peek() {
Some(b'_') => {
if start == self.offset {
// no digit before the underscore (check before consuming it, matching
// eat_while_number_hexdigit below)
return Err(Error::BadNumber((start, self.offset - start).into()));
}
self.eat_and_assert(|b| b == b'_');
match self.peek() {
Some(b) if b.is_ascii_digit() => continue, // Continue if next is a digit
_ => {
// after the underscore, there is no digit
return Err(Error::BadNumber((start, self.offset - start).into()));
}
}
}
_ => return Ok(()),
}
}
}
fn eat_while_number_hexdigit(&mut self) -> Result<()> {
loop {
let start = self.offset;
self.eat_while(|b| b.is_some() && b.unwrap().is_ascii_hexdigit());
match self.peek() {
Some(b'_') => {
if start == self.offset {
// before the underscore, there was no digit
return Err(Error::BadNumber((start, self.offset - start).into()));
}
self.eat_and_assert(|b| b == b'_');
match self.peek() {
Some(b) if b.is_ascii_hexdigit() => continue, // Continue if next is a hex digit
_ => {
// after the underscore, there is no digit
return Err(Error::BadNumber((start, self.offset - start).into()));
}
}
}
_ => return Ok(()),
}
}
}
#[inline]
fn eat_one_token(&mut self, typ: TokenType) -> Token<'a> {
debug_assert!(!self.remaining().is_empty());
let tok = Token {
value: &self.remaining()[..1],
token_type: Some(typ),
};
self.offset += 1;
tok
}
#[inline]
fn eat_white_space(&mut self) -> Token<'a> {
let start = self.offset;
self.eat_and_assert(|b| b.is_ascii_whitespace());
self.eat_while(|b| b.is_some() && b.unwrap().is_ascii_whitespace());
Token {
value: &self.input[start..self.offset],
token_type: None, // This is whitespace
}
}
fn eat_minus_or_comment_or_ptr(&mut self) -> Token<'a> {
let start = self.offset;
self.eat_and_assert(|b| b == b'-');
match self.peek() {
Some(b'-') => {
self.eat_and_assert(|b| b == b'-');
self.eat_while(|b| b.is_some() && b.unwrap() != b'\n');
if self.peek() == Some(b'\n') {
self.eat_and_assert(|b| b == b'\n');
}
Token {
value: &self.input[start..self.offset],
token_type: None, // This is a comment
}
}
Some(b'>') => {
self.eat_and_assert(|b| b == b'>');
if self.peek() == Some(b'>') {
self.eat_and_assert(|b| b == b'>');
}
Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_PTR),
}
}
_ => Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_MINUS),
},
}
}
fn eat_slash_or_comment(&mut self) -> Result<Token<'a>> {
let start = self.offset;
self.eat_and_assert(|b| b == b'/');
match self.peek() {
Some(b'*') => {
self.eat_and_assert(|b| b == b'*');
loop {
self.eat_while(|b| b.is_some() && b.unwrap() != b'*');
match self.peek() {
Some(b'*') => {
self.eat_and_assert(|b| b == b'*');
match self.peek() {
Some(b'/') => {
self.eat_and_assert(|b| b == b'/');
break; // End of block comment
}
None => {
return Err(Error::UnterminatedBlockComment(
(start, self.offset - start).into(),
))
}
_ => {}
}
}
None => {
return Err(Error::UnterminatedBlockComment(
(start, self.offset - start).into(),
))
}
_ => unreachable!(), // We should not reach here
}
}
Ok(Token {
value: &self.input[start..self.offset],
token_type: None, // This is a comment
})
}
_ => Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_SLASH),
}),
}
}
fn eat_eq(&mut self) -> Token<'a> {
let start = self.offset;
self.eat_and_assert(|b| b == b'=');
if self.peek() == Some(b'=') {
self.eat_and_assert(|b| b == b'=');
}
Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_EQ),
}
}
fn eat_le_or_ne_or_lshift_or_lt(&mut self) -> Token<'a> {
let start = self.offset;
self.eat_and_assert(|b| b == b'<');
match self.peek() {
Some(b'=') => {
self.eat_and_assert(|b| b == b'=');
Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_LE),
}
}
Some(b'<') => {
self.eat_and_assert(|b| b == b'<');
Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_LSHIFT),
}
}
Some(b'>') => {
self.eat_and_assert(|b| b == b'>');
Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_NE),
}
}
_ => Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_LT),
},
}
}
fn eat_ge_or_gt_or_rshift(&mut self) -> Token<'a> {
let start = self.offset;
self.eat_and_assert(|b| b == b'>');
match self.peek() {
Some(b'=') => {
self.eat_and_assert(|b| b == b'=');
Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_GE),
}
}
Some(b'>') => {
self.eat_and_assert(|b| b == b'>');
Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_RSHIFT),
}
}
_ => Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_GT),
},
}
}
fn eat_ne(&mut self) -> Result<Token<'a>> {
let start = self.offset;
self.eat_and_assert(|b| b == b'!');
match self.peek() {
Some(b'=') => {
self.eat_and_assert(|b| b == b'=');
}
_ => {
return Err(Error::ExpectedEqualsSign(
(start, self.offset - start).into(),
))
}
}
Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_NE),
})
}
fn eat_concat_or_bitor(&mut self) -> Token<'a> {
let start = self.offset;
self.eat_and_assert(|b| b == b'|');
if self.peek() == Some(b'|') {
self.eat_and_assert(|b| b == b'|');
return Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_CONCAT),
};
}
Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_BITOR),
}
}
fn eat_lit_or_id(&mut self) -> Result<Token<'a>> {
let start = self.offset;
let quote = self.eat().unwrap();
debug_assert!(quote == b'\'' || quote == b'"' || quote == b'`');
let tt = if quote == b'\'' {
TokenType::TK_STRING
} else {
TokenType::TK_ID
};
loop {
self.eat_while(|b| b.is_some() && b.unwrap() != quote);
match self.peek() {
Some(b) if b == quote => {
self.eat_and_assert(|b| b == quote);
match self.peek() {
Some(b) if b == quote => {
self.eat_and_assert(|b| b == quote);
continue;
}
_ => break,
}
}
None => {
return Err(Error::UnterminatedLiteral(
(start, self.offset - start).into(),
))
}
_ => unreachable!(),
};
}
Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(tt),
})
}
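/// Lexes either a lone `.` or the fractional part of a number. When
/// `has_digit_prefix` is true (the call from `eat_number`), an exponent
/// directly after the dot, as in `1.e5`, is accepted as a float (issue 2933).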
fn eat_dot_or_frac(&mut self, has_digit_prefix: bool) -> Result<Token<'a>> {
let start = self.offset;
self.eat_and_assert(|b| b == b'.');
match self.peek() {
Some(b)
if b.is_ascii_digit() || (has_digit_prefix && b.eq_ignore_ascii_case(&b'e')) =>
{
self.eat_while_number_digit()?;
match self.peek() {
Some(b'e') | Some(b'E') => {
_ = self.eat_expo()?;
Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_FLOAT),
})
}
Some(b) if is_identifier_start(b) => Err(Error::BadFractionalPart(
(start, self.offset - start).into(),
)),
_ => Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_FLOAT),
}),
}
}
_ => Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_DOT),
}),
}
}
fn eat_expo(&mut self) -> Result<Token<'a>> {
let start = self.offset;
self.eat_and_assert(|b| b == b'e' || b == b'E');
match self.peek() {
Some(b'+') | Some(b'-') => {
self.eat_and_assert(|b| b == b'+' || b == b'-');
}
_ => {}
}
let start_num = self.offset;
self.eat_while_number_digit()?;
if start_num == self.offset {
return Err(Error::BadExponentPart((start, self.offset - start).into()));
}
if self.peek().is_some() && is_identifier_start(self.peek().unwrap()) {
return Err(Error::BadExponentPart((start, self.offset - start).into()));
}
Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_FLOAT), // an exponent always makes the token a float
})
}
fn eat_number(&mut self) -> Result<Token<'a>> {
let start = self.offset;
let first_digit = self.eat().unwrap();
debug_assert!(first_digit.is_ascii_digit());
// hex int
if first_digit == b'0' {
match self.peek() {
Some(b'x') | Some(b'X') => {
self.eat_and_assert(|b| b == b'x' || b == b'X');
let start_hex = self.offset;
self.eat_while_number_hexdigit()?;
if start_hex == self.offset {
return Err(Error::MalformedHexInteger(
(start, self.offset - start).into(),
));
}
if self.peek().is_some() && is_identifier_start(self.peek().unwrap()) {
return Err(Error::BadNumber((start, self.offset - start).into()));
}
return Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_INTEGER),
});
}
_ => {}
}
}
self.eat_while_number_digit()?;
match self.peek() {
Some(b'.') => {
self.eat_dot_or_frac(true)?;
Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_FLOAT),
})
}
Some(b'e') | Some(b'E') => {
self.eat_expo()?;
Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_FLOAT),
})
}
Some(b) if is_identifier_start(b) => {
Err(Error::BadNumber((start, self.offset - start).into()))
}
_ => Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_INTEGER),
}),
}
}
fn eat_bracket(&mut self) -> Result<Token<'a>> {
let start = self.offset;
self.eat_and_assert(|b| b == b'[');
self.eat_while(|b| b.is_some() && b.unwrap() != b']');
match self.peek() {
Some(b']') => {
self.eat_and_assert(|b| b == b']');
Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_ID),
})
}
None => Err(Error::UnterminatedBracket(
(start, self.offset - start).into(),
)),
_ => unreachable!(), // We should not reach here
}
}
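/// Lexes a bind parameter. `?NNN` keeps only the digits in the token value,
/// while `$`, `@`, `#`, and `:` parameters require a non-empty name and keep
/// the prefix in the value.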
fn eat_var(&mut self) -> Result<Token<'a>> {
let start = self.offset;
let tok = self.eat().unwrap();
debug_assert!(tok == b'?' || tok == b'$' || tok == b'@' || tok == b'#' || tok == b':');
match tok {
b'?' => {
self.eat_while(|b| b.is_some() && b.unwrap().is_ascii_digit());
Ok(Token {
value: &self.input[start + 1..self.offset], // do not include the '?' in the value
token_type: Some(TokenType::TK_VARIABLE),
})
}
_ => {
let start_id = self.offset;
self.eat_while(|b| b.is_some() && is_identifier_continue(b.unwrap()));
// empty variable name
if start_id == self.offset {
return Err(Error::BadVariableName((start, self.offset - start).into()));
}
Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_VARIABLE),
})
}
}
}
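/// Lexes either a blob literal (`x'...'`/`X'...'` with an even number of hex
/// digits, returned without the prefix and quotes) or an identifier/keyword.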
#[inline]
fn eat_blob_or_id(&mut self) -> Result<Token<'a>> {
let start = self.offset;
let start_char = self.eat().unwrap();
debug_assert!(is_identifier_start(start_char));
match start_char {
b'x' | b'X' if self.peek() == Some(b'\'') => {
self.eat_and_assert(|b| b == b'\'');
let start_hex = self.offset;
self.eat_while(|b| b.is_some() && b.unwrap().is_ascii_hexdigit());
match self.peek() {
Some(b'\'') => {
let end_hex = self.offset;
debug_assert!(end_hex >= start_hex);
self.eat_and_assert(|b| b == b'\'');
if (end_hex - start_hex) % 2 != 0 {
return Err(Error::UnrecognizedToken(
(start, self.offset - start).into(),
));
}
Ok(Token {
value: &self.input[start + 2..self.offset - 1], // exclude the x'/X' prefix and the closing quote
token_type: Some(TokenType::TK_BLOB),
})
}
_ => Err(Error::UnterminatedLiteral(
(start, self.offset - start).into(),
)),
}
}
_ => {
self.eat_while(|b| b.is_some() && is_identifier_continue(b.unwrap()));
let result = &self.input[start..self.offset];
Ok(Token {
value: result,
token_type: Some(keyword_or_id_token(result)),
})
}
}
}
fn eat_unrecognized(&mut self) -> Result<Token<'a>> {
let start = self.offset;
self.eat_while(|b| b.is_some() && !b.unwrap().is_ascii_whitespace());
Err(Error::UnrecognizedToken(
(start, self.offset - start).into(),
))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
#[test]
fn test_lexer_one_tok() {
let test_cases = vec![
(
b" ".as_slice(),
Token {
value: b" ".as_slice(),
token_type: None,
},
),
(
b"-- This is a comment\n".as_slice(),
Token {
value: b"-- This is a comment\n".as_slice(),
token_type: None, // This is a comment
},
),
(
b"-".as_slice(),
Token {
value: b"-".as_slice(),
token_type: Some(TokenType::TK_MINUS),
},
),
(
b"->".as_slice(),
Token {
value: b"->".as_slice(),
token_type: Some(TokenType::TK_PTR),
},
),
(
b"->>".as_slice(),
Token {
value: b"->>".as_slice(),
token_type: Some(TokenType::TK_PTR),
},
),
(
b"(".as_slice(),
Token {
value: b"(".as_slice(),
token_type: Some(TokenType::TK_LP),
},
),
(
b")".as_slice(),
Token {
value: b")".as_slice(),
token_type: Some(TokenType::TK_RP),
},
),
(
b";".as_slice(),
Token {
value: b";".as_slice(),
token_type: Some(TokenType::TK_SEMI),
},
),
(
b"+".as_slice(),
Token {
value: b"+".as_slice(),
token_type: Some(TokenType::TK_PLUS),
},
),
(
b"*".as_slice(),
Token {
value: b"*".as_slice(),
token_type: Some(TokenType::TK_STAR),
},
),
(
b"/".as_slice(),
Token {
value: b"/".as_slice(),
token_type: Some(TokenType::TK_SLASH),
},
),
(
b"/* This is a block comment */".as_slice(),
Token {
value: b"/* This is a block comment */".as_slice(),
token_type: None, // This is a comment
},
),
(
b"/* This is a\n\n block comment */".as_slice(),
Token {
value: b"/* This is a\n\n block comment */".as_slice(),
token_type: None, // This is a comment
},
),
(
b"/* This is a** block* comment */".as_slice(),
Token {
value: b"/* This is a** block* comment */".as_slice(),
token_type: None, // This is a comment
},
),
(
b"=".as_slice(),
Token {
value: b"=".as_slice(),
token_type: Some(TokenType::TK_EQ),
},
),
(
b"==".as_slice(),
Token {
value: b"==".as_slice(),
token_type: Some(TokenType::TK_EQ),
},
),
(
b"<".as_slice(),
Token {
value: b"<".as_slice(),
token_type: Some(TokenType::TK_LT),
},
),
(
b"<>".as_slice(),
Token {
value: b"<>".as_slice(),
token_type: Some(TokenType::TK_NE),
},
),
(
b"<=".as_slice(),
Token {
value: b"<=".as_slice(),
token_type: Some(TokenType::TK_LE),
},
),
(
b"<<".as_slice(),
Token {
value: b"<<".as_slice(),
token_type: Some(TokenType::TK_LSHIFT),
},
),
(
b">".as_slice(),
Token {
value: b">".as_slice(),
token_type: Some(TokenType::TK_GT),
},
),
(
b">=".as_slice(),
Token {
value: b">=".as_slice(),
token_type: Some(TokenType::TK_GE),
},
),
(
b">>".as_slice(),
Token {
value: b">>".as_slice(),
token_type: Some(TokenType::TK_RSHIFT),
},
),
(
b"!=".as_slice(),
Token {
value: b"!=".as_slice(),
token_type: Some(TokenType::TK_NE),
},
),
(
b"|".as_slice(),
Token {
value: b"|".as_slice(),
token_type: Some(TokenType::TK_BITOR),
},
),
(
b"||".as_slice(),
Token {
value: b"||".as_slice(),
token_type: Some(TokenType::TK_CONCAT),
},
),
(
b",".as_slice(),
Token {
value: b",".as_slice(),
token_type: Some(TokenType::TK_COMMA),
},
),
(
b"&".as_slice(),
Token {
value: b"&".as_slice(),
token_type: Some(TokenType::TK_BITAND),
},
),
(
b"~".as_slice(),
Token {
value: b"~".as_slice(),
token_type: Some(TokenType::TK_BITNOT),
},
),
(
b"'string'".as_slice(),
Token {
value: b"'string'".as_slice(),
token_type: Some(TokenType::TK_STRING),
},
),
(
b"`identifier`".as_slice(),
Token {
value: b"`identifier`".as_slice(),
token_type: Some(TokenType::TK_ID),
},
),
(
b"\"quoted string\"".as_slice(),
Token {
value: b"\"quoted string\"".as_slice(),
token_type: Some(TokenType::TK_ID),
},
),
(
b"\"\"\"triple \"\"quoted string\"\"\"".as_slice(),
Token {
value: b"\"\"\"triple \"\"quoted string\"\"\"".as_slice(),
token_type: Some(TokenType::TK_ID),
},
),
(
b"```triple ``quoted string```".as_slice(),
Token {
value: b"```triple ``quoted string```".as_slice(),
token_type: Some(TokenType::TK_ID),
},
),
(
b"'''triple ''quoted string'''".as_slice(),
Token {
value: b"'''triple ''quoted string'''".as_slice(),
token_type: Some(TokenType::TK_STRING),
},
),
(
b".".as_slice(),
Token {
value: b".".as_slice(),
token_type: Some(TokenType::TK_DOT),
},
),
(
b".123".as_slice(),
Token {
value: b".123".as_slice(),
token_type: Some(TokenType::TK_FLOAT),
},
),
(
b".456".as_slice(),
Token {
value: b".456".as_slice(),
token_type: Some(TokenType::TK_FLOAT),
},
),
(
b".456e789".as_slice(),
Token {
value: b".456e789".as_slice(),
token_type: Some(TokenType::TK_FLOAT),
},
),
(
b".456E-789".as_slice(),
Token {
value: b".456E-789".as_slice(),
token_type: Some(TokenType::TK_FLOAT),
},
),
(
b"123".as_slice(),
Token {
value: b"123".as_slice(),
token_type: Some(TokenType::TK_INTEGER),
},
),
(
b"9_223_372_036_854_775_807".as_slice(),
Token {
value: b"9_223_372_036_854_775_807".as_slice(),
token_type: Some(TokenType::TK_INTEGER),
},
),
(
b"123.456".as_slice(),
Token {
value: b"123.456".as_slice(),
token_type: Some(TokenType::TK_FLOAT),
},
),
(
b"123e456".as_slice(),
Token {
value: b"123e456".as_slice(),
token_type: Some(TokenType::TK_FLOAT),
},
),
(
b"123E-456".as_slice(),
Token {
value: b"123E-456".as_slice(),
token_type: Some(TokenType::TK_FLOAT),
},
),
(
b"0x1A3F".as_slice(),
Token {
value: b"0x1A3F".as_slice(),
token_type: Some(TokenType::TK_INTEGER),
},
),
(
b"0x1A3F_5678".as_slice(),
Token {
value: b"0x1A3F_5678".as_slice(),
token_type: Some(TokenType::TK_INTEGER),
},
),
(
b"0x1A3F_5678e9".as_slice(),
Token {
value: b"0x1A3F_5678e9".as_slice(),
token_type: Some(TokenType::TK_INTEGER),
},
),
(
b"[identifier]".as_slice(),
Token {
value: b"[identifier]".as_slice(),
token_type: Some(TokenType::TK_ID),
},
),
(
b"?123".as_slice(),
Token {
value: b"123".as_slice(), // '?' is not included in the value
token_type: Some(TokenType::TK_VARIABLE),
},
),
(
b"$var_name".as_slice(),
Token {
value: b"$var_name".as_slice(),
token_type: Some(TokenType::TK_VARIABLE),
},
),
(
b"@param".as_slice(),
Token {
value: b"@param".as_slice(),
token_type: Some(TokenType::TK_VARIABLE),
},
),
(
b"#comment".as_slice(),
Token {
value: b"#comment".as_slice(),
token_type: Some(TokenType::TK_VARIABLE),
},
),
(
b":named_param".as_slice(),
Token {
value: b":named_param".as_slice(),
token_type: Some(TokenType::TK_VARIABLE),
},
),
(
b"x'1234567890abcdef'".as_slice(),
Token {
value: b"1234567890abcdef".as_slice(), // 'x' is not included in the value
token_type: Some(TokenType::TK_BLOB),
},
),
(
b"X'1234567890abcdef'".as_slice(),
Token {
value: b"1234567890abcdef".as_slice(), // 'X' is not included in the value
token_type: Some(TokenType::TK_BLOB),
},
),
(
b"x''".as_slice(),
Token {
value: b"".as_slice(), // 'x' is not included in the value
token_type: Some(TokenType::TK_BLOB),
},
),
(
b"X''".as_slice(),
Token {
value: b"".as_slice(), // 'X' is not included in the value
token_type: Some(TokenType::TK_BLOB),
},
),
(
b"wHeRe".as_slice(),
Token {
value: b"wHeRe".as_slice(), // 'X' is not included in the value
token_type: Some(TokenType::TK_WHERE),
},
),
(
b"wHeRe123".as_slice(),
Token {
value: b"wHeRe123".as_slice(), // 'X' is not included in the value
token_type: Some(TokenType::TK_ID),
},
),
(
b"wHeRe_123".as_slice(),
Token {
value: b"wHeRe_123".as_slice(), // 'X' is not included in the value
token_type: Some(TokenType::TK_ID),
},
),
// issue 2933
(
b"1.e5".as_slice(),
Token {
value: b"1.e5".as_slice(),
token_type: Some(TokenType::TK_FLOAT),
},
),
];
for (input, expected) in test_cases {
let mut lexer = Lexer::new(input);
let token = lexer.next().unwrap().unwrap();
let expect_value = unsafe { String::from_utf8_unchecked(expected.value.to_vec()) };
let got_value = unsafe { String::from_utf8_unchecked(token.value.to_vec()) };
println!("Input: {input:?}, Expected: {expect_value:?}, Got: {got_value:?}");
assert_eq!(got_value, expect_value);
assert_eq!(token.token_type, expected.token_type);
}
}
#[test]
fn test_keyword_token() {
let values = HashMap::from([
("ABORT", TokenType::TK_ABORT),
("ACTION", TokenType::TK_ACTION),
("ADD", TokenType::TK_ADD),
("AFTER", TokenType::TK_AFTER),
("ALL", TokenType::TK_ALL),
("ALTER", TokenType::TK_ALTER),
("ALWAYS", TokenType::TK_ALWAYS),
("ANALYZE", TokenType::TK_ANALYZE),
("AND", TokenType::TK_AND),
("AS", TokenType::TK_AS),
("ASC", TokenType::TK_ASC),
("ATTACH", TokenType::TK_ATTACH),
("AUTOINCREMENT", TokenType::TK_AUTOINCR),
("BEFORE", TokenType::TK_BEFORE),
("BEGIN", TokenType::TK_BEGIN),
("BETWEEN", TokenType::TK_BETWEEN),
("BY", TokenType::TK_BY),
("CASCADE", TokenType::TK_CASCADE),
("CASE", TokenType::TK_CASE),
("CAST", TokenType::TK_CAST),
("CHECK", TokenType::TK_CHECK),
("COLLATE", TokenType::TK_COLLATE),
("COLUMN", TokenType::TK_COLUMNKW),
("COMMIT", TokenType::TK_COMMIT),
("CONCURRENT", TokenType::TK_CONCURRENT),
("CONFLICT", TokenType::TK_CONFLICT),
("CONSTRAINT", TokenType::TK_CONSTRAINT),
("CREATE", TokenType::TK_CREATE),
("CROSS", TokenType::TK_JOIN_KW),
("CURRENT", TokenType::TK_CURRENT),
("CURRENT_DATE", TokenType::TK_CTIME_KW),
("CURRENT_TIME", TokenType::TK_CTIME_KW),
("CURRENT_TIMESTAMP", TokenType::TK_CTIME_KW),
("DATABASE", TokenType::TK_DATABASE),
("DEFAULT", TokenType::TK_DEFAULT),
("DEFERRABLE", TokenType::TK_DEFERRABLE),
("DEFERRED", TokenType::TK_DEFERRED),
("DELETE", TokenType::TK_DELETE),
("DESC", TokenType::TK_DESC),
("DETACH", TokenType::TK_DETACH),
("DISTINCT", TokenType::TK_DISTINCT),
("DO", TokenType::TK_DO),
("DROP", TokenType::TK_DROP),
("EACH", TokenType::TK_EACH),
("ELSE", TokenType::TK_ELSE),
("END", TokenType::TK_END),
("ESCAPE", TokenType::TK_ESCAPE),
("EXCEPT", TokenType::TK_EXCEPT),
("EXCLUDE", TokenType::TK_EXCLUDE),
("EXCLUSIVE", TokenType::TK_EXCLUSIVE),
("EXISTS", TokenType::TK_EXISTS),
("EXPLAIN", TokenType::TK_EXPLAIN),
("FAIL", TokenType::TK_FAIL),
("FILTER", TokenType::TK_FILTER),
("FIRST", TokenType::TK_FIRST),
("FOLLOWING", TokenType::TK_FOLLOWING),
("FOR", TokenType::TK_FOR),
("FOREIGN", TokenType::TK_FOREIGN),
("FROM", TokenType::TK_FROM),
("FULL", TokenType::TK_JOIN_KW),
("GENERATED", TokenType::TK_GENERATED),
("GLOB", TokenType::TK_LIKE_KW),
("GROUP", TokenType::TK_GROUP),
("GROUPS", TokenType::TK_GROUPS),
("HAVING", TokenType::TK_HAVING),
("IF", TokenType::TK_IF),
("IGNORE", TokenType::TK_IGNORE),
("IMMEDIATE", TokenType::TK_IMMEDIATE),
("IN", TokenType::TK_IN),
("INDEX", TokenType::TK_INDEX),
("INDEXED", TokenType::TK_INDEXED),
("INITIALLY", TokenType::TK_INITIALLY),
("INNER", TokenType::TK_JOIN_KW),
("INSERT", TokenType::TK_INSERT),
("INSTEAD", TokenType::TK_INSTEAD),
("INTERSECT", TokenType::TK_INTERSECT),
("INTO", TokenType::TK_INTO),
("IS", TokenType::TK_IS),
("ISNULL", TokenType::TK_ISNULL),
("JOIN", TokenType::TK_JOIN),
("KEY", TokenType::TK_KEY),
("LAST", TokenType::TK_LAST),
("LEFT", TokenType::TK_JOIN_KW),
("LIKE", TokenType::TK_LIKE_KW),
("LIMIT", TokenType::TK_LIMIT),
("MATCH", TokenType::TK_MATCH),
("MATERIALIZED", TokenType::TK_MATERIALIZED),
("NATURAL", TokenType::TK_JOIN_KW),
("NO", TokenType::TK_NO),
("NOT", TokenType::TK_NOT),
("NOTHING", TokenType::TK_NOTHING),
("NOTNULL", TokenType::TK_NOTNULL),
("NULL", TokenType::TK_NULL),
("NULLS", TokenType::TK_NULLS),
("OF", TokenType::TK_OF),
("OFFSET", TokenType::TK_OFFSET),
("ON", TokenType::TK_ON),
("OR", TokenType::TK_OR),
("ORDER", TokenType::TK_ORDER),
("OTHERS", TokenType::TK_OTHERS),
("OUTER", TokenType::TK_JOIN_KW),
("OVER", TokenType::TK_OVER),
("PARTITION", TokenType::TK_PARTITION),
("PLAN", TokenType::TK_PLAN),
("PRAGMA", TokenType::TK_PRAGMA),
("PRECEDING", TokenType::TK_PRECEDING),
("PRIMARY", TokenType::TK_PRIMARY),
("QUERY", TokenType::TK_QUERY),
("RAISE", TokenType::TK_RAISE),
("RANGE", TokenType::TK_RANGE),
("RECURSIVE", TokenType::TK_RECURSIVE),
("REFERENCES", TokenType::TK_REFERENCES),
("REGEXP", TokenType::TK_LIKE_KW),
("REINDEX", TokenType::TK_REINDEX),
("RELEASE", TokenType::TK_RELEASE),
("RENAME", TokenType::TK_RENAME),
("REPLACE", TokenType::TK_REPLACE),
("RETURNING", TokenType::TK_RETURNING),
("RESTRICT", TokenType::TK_RESTRICT),
("RIGHT", TokenType::TK_JOIN_KW),
("ROLLBACK", TokenType::TK_ROLLBACK),
("ROW", TokenType::TK_ROW),
("ROWS", TokenType::TK_ROWS),
("SAVEPOINT", TokenType::TK_SAVEPOINT),
("SELECT", TokenType::TK_SELECT),
("SET", TokenType::TK_SET),
("TABLE", TokenType::TK_TABLE),
("TEMP", TokenType::TK_TEMP),
("TEMPORARY", TokenType::TK_TEMP),
("THEN", TokenType::TK_THEN),
("TIES", TokenType::TK_TIES),
("TO", TokenType::TK_TO),
("TRANSACTION", TokenType::TK_TRANSACTION),
("TRIGGER", TokenType::TK_TRIGGER),
("UNBOUNDED", TokenType::TK_UNBOUNDED),
("UNION", TokenType::TK_UNION),
("UNIQUE", TokenType::TK_UNIQUE),
("UPDATE", TokenType::TK_UPDATE),
("USING", TokenType::TK_USING),
("VACUUM", TokenType::TK_VACUUM),
("VALUES", TokenType::TK_VALUES),
("VIEW", TokenType::TK_VIEW),
("VIRTUAL", TokenType::TK_VIRTUAL),
("WHEN", TokenType::TK_WHEN),
("WHERE", TokenType::TK_WHERE),
("WINDOW", TokenType::TK_WINDOW),
("WITH", TokenType::TK_WITH),
("WITHOUT", TokenType::TK_WITHOUT),
]);
for (key, value) in &values {
assert!(keyword_or_id_token(key.as_bytes()) == *value);
assert!(keyword_or_id_token(key.as_bytes().to_ascii_lowercase().as_slice()) == *value);
}
assert_eq!(keyword_or_id_token(b""), TokenType::TK_ID);
assert_eq!(keyword_or_id_token(b"wrong"), TokenType::TK_ID);
assert_eq!(keyword_or_id_token(b"super wrong"), TokenType::TK_ID);
assert_eq!(keyword_or_id_token(b"super_wrong"), TokenType::TK_ID);
assert_eq!(
keyword_or_id_token(b"aae26e78-3ba7-4627-8f8f-02623302495a"),
TokenType::TK_ID
);
assert_eq!(
keyword_or_id_token("Crème Brulée".as_bytes()),
TokenType::TK_ID
);
assert_eq!(keyword_or_id_token("fróm".as_bytes()), TokenType::TK_ID);
}
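// Illustrative addition (not in the original test suite): a minimal sketch of
// the error paths, and of `mark()` rewinding the offset on failure so the
// rejected input is still visible through `remaining()`.
#[test]
fn test_lexer_error_tokens() {
    // '!' must be followed by '=' ...
    let mut lexer = Lexer::new(b"!");
    assert!(matches!(
        lexer.next().unwrap(),
        Err(Error::ExpectedEqualsSign(_))
    ));
    // ... and `mark()` restored the offset, so nothing was consumed.
    assert_eq!(lexer.remaining(), b"!".as_slice());
    // A hex literal needs at least one hex digit after the 0x prefix.
    let mut lexer = Lexer::new(b"0x");
    assert!(matches!(
        lexer.next().unwrap(),
        Err(Error::MalformedHexInteger(_))
    ));
    // String literals must be terminated.
    let mut lexer = Lexer::new(b"'unterminated");
    assert!(matches!(
        lexer.next().unwrap(),
        Err(Error::UnterminatedLiteral(_))
    ));
}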
#[test]
fn test_lexer_multi_tok() {
let test_cases = vec![
(
b" SELECT 1".as_slice(),
vec![
Token {
value: b" ".as_slice(),
token_type: None,
},
Token {
value: b"SELECT".as_slice(),
token_type: Some(TokenType::TK_SELECT),
},
Token {
value: b" ".as_slice(),
token_type: None,
},
Token {
value: b"1".as_slice(),
token_type: Some(TokenType::TK_INTEGER),
},
],
),
(
b"INSERT INTO users VALUES (1,2,3)".as_slice(),
vec![
Token {
value: b"INSERT".as_slice(),
token_type: Some(TokenType::TK_INSERT),
},
Token {
value: b" ".as_slice(),
token_type: None,
},
Token {
value: b"INTO".as_slice(),
token_type: Some(TokenType::TK_INTO),
},
Token {
value: b" ".as_slice(),
token_type: None,
},
Token {
value: b"users".as_slice(),
token_type: Some(TokenType::TK_ID),
},
Token {
value: b" ".as_slice(),
token_type: None,
},
Token {
value: b"VALUES".as_slice(),
token_type: Some(TokenType::TK_VALUES),
},
Token {
value: b" ".as_slice(),
token_type: None,
},
Token {
value: b"(".as_slice(),
token_type: Some(TokenType::TK_LP),
},
Token {
value: b"1".as_slice(),
token_type: Some(TokenType::TK_INTEGER),
},
Token {
value: b",".as_slice(),
token_type: Some(TokenType::TK_COMMA),
},
Token {
value: b"2".as_slice(),
token_type: Some(TokenType::TK_INTEGER),
},
Token {
value: b",".as_slice(),
token_type: Some(TokenType::TK_COMMA),
},
Token {
value: b"3".as_slice(),
token_type: Some(TokenType::TK_INTEGER),
},
Token {
value: b")".as_slice(),
token_type: Some(TokenType::TK_RP),
},
],
),
// issue 2933
(
b"u.email".as_slice(),
vec![
Token {
value: b"u".as_slice(),
token_type: Some(TokenType::TK_ID),
},
Token {
value: b".".as_slice(),
token_type: Some(TokenType::TK_DOT),
},
Token {
value: b"email".as_slice(),
token_type: Some(TokenType::TK_ID),
},
],
),
];
for (input, expected_tokens) in test_cases {
let lexer = Lexer::new(input);
let mut tokens = Vec::new();
for token in lexer {
tokens.push(token.unwrap());
}
assert_eq!(tokens.len(), expected_tokens.len());
for (i, token) in tokens.iter().enumerate() {
let expect_value =
unsafe { String::from_utf8_unchecked(expected_tokens[i].value.to_vec()) };
let got_value = unsafe { String::from_utf8_unchecked(token.value.to_vec()) };
assert_eq!(got_value, expect_value);
assert_eq!(token.token_type, expected_tokens[i].token_type);
}
}
}
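// Illustrative addition (not in the original test suite): a minimal sketch
// showing that the CONCURRENT keyword introduced for `BEGIN CONCURRENT`
// support is tokenized like the other transaction keywords.
#[test]
fn test_lexer_begin_concurrent() {
    let lexer = Lexer::new(b"BEGIN CONCURRENT;");
    // Drop whitespace/comment tokens, which carry `token_type: None`.
    let token_types: Vec<TokenType> = lexer
        .map(|t| t.unwrap())
        .filter_map(|t| t.token_type)
        .collect();
    assert_eq!(
        token_types,
        vec![
            TokenType::TK_BEGIN,
            TokenType::TK_CONCURRENT,
            TokenType::TK_SEMI
        ]
    );
}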
}