Merge 'Nicer parse errors using miette' from Samyak Sarnayak

I noticed that the parse errors were a bit hard to read - only the
nearest token and the line/col offsets were printed.
I made a first attempt at improving the errors using
[miette](https://github.com/zkat/miette).
- Added derive for `miette::Diagnostic` to both the parser's error type
and LimboError.
- Added miette dependency to both sqlite3_parser and core. The `fancy`
feature is only enabled for the CLI. So the overhead on the libraries
(core, parser) should be minimal.
Some further improvements that can be made in the future:
- Add spans to AST nodes so that errors can better point to the correct
token. See upstream issue: https://github.com/gwenn/lemon-rs/issues/33
- Construct more errors with offset information. I noticed that most
parser errors are constructed with `None` as the offset.
- The messages are a bit redundant (example "syntax error at (1, 6)").
This can be improved.
Comparisons.
Before:
```
❯ cargo run --package limbo --bin limbo database.db --output-mode pretty
...
limbo> selet * from a;
[2025-01-05T11:22:55Z ERROR sqlite3Parser] near "Token([115, 101, 108, 101, 116])": syntax error
Parse error: near "selet": syntax error at (1, 6)
```
<img width="969" alt="image" src="https://github.com/user-attachments/assets/82651a77-f5ac-4eee-b208-88c6ea7fc9b7" />
After:
```
❯ cargo run --package limbo --bin limbo database.db --output-mode pretty
...
limbo> selet * from a;
[2025-01-05T12:25:52Z ERROR sqlite3Parser] near "Token([115, 101, 108, 101, 116])": syntax error

  × near "selet": syntax error at (1, 6)
   ╭────
 1 │ selet * from a
   ·     ▲
   ·     ╰── syntax error
   ╰────

```
<img width="980" alt="image" src="https://github.com/user-attachments/assets/747a90e5-5085-41f9-b0fe-25864179ca35" />

Closes #618
This commit is contained in:
Pekka Enberg
2025-01-05 21:09:52 +02:00
10 changed files with 224 additions and 59 deletions

View File

@@ -79,6 +79,12 @@ impl<S: Splitter> Scanner<S> {
pub fn column(&self) -> usize {
self.column
}
/// Current byte offset in the source string.
///
/// Returns the scanner's `offset` field as-is.
pub fn offset(&self) -> usize {
self.offset
}
/// Associated splitter
pub fn splitter(&self) -> &S {
&self.splitter

View File

@@ -7,57 +7,91 @@ use crate::parser::ParserError;
/// SQL lexer and parser errors
#[non_exhaustive]
#[derive(Debug)]
#[derive(Debug, miette::Diagnostic)]
#[diagnostic()]
pub enum Error {
/// I/O Error
Io(io::Error),
/// Lexer error
UnrecognizedToken(Option<(u64, usize)>),
UnrecognizedToken(
Option<(u64, usize)>,
#[label("here")] Option<miette::SourceSpan>,
),
/// Missing quote or double-quote or backtick
UnterminatedLiteral(Option<(u64, usize)>),
UnterminatedLiteral(
Option<(u64, usize)>,
#[label("here")] Option<miette::SourceSpan>,
),
/// Missing `]`
UnterminatedBracket(Option<(u64, usize)>),
UnterminatedBracket(
Option<(u64, usize)>,
#[label("here")] Option<miette::SourceSpan>,
),
/// Missing `*/`
UnterminatedBlockComment(Option<(u64, usize)>),
UnterminatedBlockComment(
Option<(u64, usize)>,
#[label("here")] Option<miette::SourceSpan>,
),
/// Invalid parameter name
BadVariableName(Option<(u64, usize)>),
BadVariableName(
Option<(u64, usize)>,
#[label("here")] Option<miette::SourceSpan>,
),
/// Invalid number format
BadNumber(Option<(u64, usize)>),
BadNumber(
Option<(u64, usize)>,
#[label("here")] Option<miette::SourceSpan>,
),
/// Invalid or missing sign after `!`
ExpectedEqualsSign(Option<(u64, usize)>),
ExpectedEqualsSign(
Option<(u64, usize)>,
#[label("here")] Option<miette::SourceSpan>,
),
/// BLOB literals are string literals containing hexadecimal data and preceded by a single "x" or "X" character.
MalformedBlobLiteral(Option<(u64, usize)>),
MalformedBlobLiteral(
Option<(u64, usize)>,
#[label("here")] Option<miette::SourceSpan>,
),
/// Hexadecimal integer literals follow the C-language notation of "0x" or "0X" followed by hexadecimal digits.
MalformedHexInteger(Option<(u64, usize)>),
MalformedHexInteger(
Option<(u64, usize)>,
#[label("here")] Option<miette::SourceSpan>,
),
/// Grammar error
ParserError(ParserError, Option<(u64, usize)>),
ParserError(
ParserError,
Option<(u64, usize)>,
#[label("syntax error")] Option<miette::SourceSpan>,
),
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
Self::Io(ref err) => err.fmt(f),
Self::UnrecognizedToken(pos) => write!(f, "unrecognized token at {:?}", pos.unwrap()),
Self::UnterminatedLiteral(pos) => {
Self::UnrecognizedToken(pos, _) => {
write!(f, "unrecognized token at {:?}", pos.unwrap())
}
Self::UnterminatedLiteral(pos, _) => {
write!(f, "non-terminated literal at {:?}", pos.unwrap())
}
Self::UnterminatedBracket(pos) => {
Self::UnterminatedBracket(pos, _) => {
write!(f, "non-terminated bracket at {:?}", pos.unwrap())
}
Self::UnterminatedBlockComment(pos) => {
Self::UnterminatedBlockComment(pos, _) => {
write!(f, "non-terminated block comment at {:?}", pos.unwrap())
}
Self::BadVariableName(pos) => write!(f, "bad variable name at {:?}", pos.unwrap()),
Self::BadNumber(pos) => write!(f, "bad number at {:?}", pos.unwrap()),
Self::ExpectedEqualsSign(pos) => write!(f, "expected = sign at {:?}", pos.unwrap()),
Self::MalformedBlobLiteral(pos) => {
Self::BadVariableName(pos, _) => write!(f, "bad variable name at {:?}", pos.unwrap()),
Self::BadNumber(pos, _) => write!(f, "bad number at {:?}", pos.unwrap()),
Self::ExpectedEqualsSign(pos, _) => write!(f, "expected = sign at {:?}", pos.unwrap()),
Self::MalformedBlobLiteral(pos, _) => {
write!(f, "malformed blob literal at {:?}", pos.unwrap())
}
Self::MalformedHexInteger(pos) => {
Self::MalformedHexInteger(pos, _) => {
write!(f, "malformed hex integer at {:?}", pos.unwrap())
}
Self::ParserError(ref msg, Some(pos)) => write!(f, "{msg} at {pos:?}"),
Self::ParserError(ref msg, _) => write!(f, "{msg}"),
Self::ParserError(ref msg, Some(pos), _) => write!(f, "{msg} at {pos:?}"),
Self::ParserError(ref msg, _, _) => write!(f, "{msg}"),
}
}
}
@@ -72,7 +106,7 @@ impl From<io::Error> for Error {
impl From<ParserError> for Error {
fn from(err: ParserError) -> Self {
Self::ParserError(err, None)
Self::ParserError(err, None, None)
}
}
@@ -80,16 +114,16 @@ impl ScanError for Error {
fn position(&mut self, line: u64, column: usize) {
match *self {
Self::Io(_) => {}
Self::UnrecognizedToken(ref mut pos) => *pos = Some((line, column)),
Self::UnterminatedLiteral(ref mut pos) => *pos = Some((line, column)),
Self::UnterminatedBracket(ref mut pos) => *pos = Some((line, column)),
Self::UnterminatedBlockComment(ref mut pos) => *pos = Some((line, column)),
Self::BadVariableName(ref mut pos) => *pos = Some((line, column)),
Self::BadNumber(ref mut pos) => *pos = Some((line, column)),
Self::ExpectedEqualsSign(ref mut pos) => *pos = Some((line, column)),
Self::MalformedBlobLiteral(ref mut pos) => *pos = Some((line, column)),
Self::MalformedHexInteger(ref mut pos) => *pos = Some((line, column)),
Self::ParserError(_, ref mut pos) => *pos = Some((line, column)),
Self::UnrecognizedToken(ref mut pos, _) => *pos = Some((line, column)),
Self::UnterminatedLiteral(ref mut pos, _) => *pos = Some((line, column)),
Self::UnterminatedBracket(ref mut pos, _) => *pos = Some((line, column)),
Self::UnterminatedBlockComment(ref mut pos, _) => *pos = Some((line, column)),
Self::BadVariableName(ref mut pos, _) => *pos = Some((line, column)),
Self::BadNumber(ref mut pos, _) => *pos = Some((line, column)),
Self::ExpectedEqualsSign(ref mut pos, _) => *pos = Some((line, column)),
Self::MalformedBlobLiteral(ref mut pos, _) => *pos = Some((line, column)),
Self::MalformedHexInteger(ref mut pos, _) => *pos = Some((line, column)),
Self::ParserError(_, ref mut pos, _) => *pos = Some((line, column)),
}
}
}

View File

@@ -57,6 +57,11 @@ impl<'input> Parser<'input> {
pub fn column(&self) -> usize {
self.scanner.column()
}
/// Current byte offset in input.
///
/// Delegates to the underlying scanner's `offset()`.
pub fn offset(&self) -> usize {
self.scanner.offset()
}
}
/*
@@ -230,13 +235,21 @@ impl FallibleIterator for Parser<'_> {
}
self.parser.sqlite3ParserFinalize();
if let Some(e) = self.parser.ctx.error() {
let err = Error::ParserError(e, Some((self.scanner.line(), self.scanner.column())));
let err = Error::ParserError(
e,
Some((self.scanner.line(), self.scanner.column())),
Some((self.offset() - 1).into()),
);
return Err(err);
}
let cmd = self.parser.ctx.cmd();
if let Some(ref cmd) = cmd {
if let Err(e) = cmd.check() {
let err = Error::ParserError(e, Some((self.scanner.line(), self.scanner.column())));
let err = Error::ParserError(
e,
Some((self.scanner.line(), self.scanner.column())),
Some((self.offset() - 1).into()),
);
return Err(err);
}
}
@@ -332,7 +345,7 @@ impl Splitter for Tokenizer {
if let Some(i) = end {
Ok((None, i + 1))
} else {
Err(Error::UnterminatedBlockComment(None))
Err(Error::UnterminatedBlockComment(None, None))
}
} else {
Ok((Some((&data[..1], TK_SLASH)), 1))
@@ -381,10 +394,10 @@ impl Splitter for Tokenizer {
if *b == b'=' {
Ok((Some((&data[..2], TK_NE)), 2))
} else {
Err(Error::ExpectedEqualsSign(None))
Err(Error::ExpectedEqualsSign(None, None))
}
} else {
Err(Error::ExpectedEqualsSign(None))
Err(Error::ExpectedEqualsSign(None, None))
}
}
b'|' => {
@@ -419,7 +432,7 @@ impl Splitter for Tokenizer {
// Keep original quotes / '[' ... ]'
Ok((Some((&data[0..=i], TK_ID)), i + 1))
} else {
Err(Error::UnterminatedBracket(None))
Err(Error::UnterminatedBracket(None, None))
}
}
b'?' => {
@@ -437,14 +450,14 @@ impl Splitter for Tokenizer {
.skip(1)
.position(|&b| !is_identifier_continue(b))
{
Some(0) => Err(Error::BadVariableName(None)),
Some(0) => Err(Error::BadVariableName(None, None)),
Some(i) => {
// '$' is included as part of the name
Ok((Some((&data[..=i], TK_VARIABLE)), i + 1))
}
None => {
if data.len() == 1 {
return Err(Error::BadVariableName(None));
return Err(Error::BadVariableName(None, None));
}
Ok((Some((data, TK_VARIABLE)), data.len()))
}
@@ -461,7 +474,7 @@ impl Splitter for Tokenizer {
Ok(self.identifierish(data))
}
}
_ => Err(Error::UnrecognizedToken(None)),
_ => Err(Error::UnrecognizedToken(None, None)),
}
}
}
@@ -493,7 +506,7 @@ fn literal(data: &[u8], quote: u8) -> Result<(Option<Token<'_>>, usize), Error>
// keep original quotes in the token
Ok((Some((&data[0..i], tt)), i))
} else {
Err(Error::UnterminatedLiteral(None))
Err(Error::UnterminatedLiteral(None, None))
}
}
@@ -507,11 +520,11 @@ fn blob_literal(data: &[u8]) -> Result<(Option<Token<'_>>, usize), Error> {
.find(|&(_, &b)| !b.is_ascii_hexdigit())
{
if *b != b'\'' || i % 2 != 0 {
return Err(Error::MalformedBlobLiteral(None));
return Err(Error::MalformedBlobLiteral(None, None));
}
Ok((Some((&data[2..i], TK_BLOB)), i + 1))
} else {
Err(Error::MalformedBlobLiteral(None))
Err(Error::MalformedBlobLiteral(None, None))
}
}
@@ -532,7 +545,7 @@ fn number(data: &[u8]) -> Result<(Option<Token<'_>>, usize), Error> {
} else if b == b'e' || b == b'E' {
return exponential_part(data, i);
} else if is_identifier_start(b) {
return Err(Error::BadNumber(None));
return Err(Error::BadNumber(None, None));
}
Ok((Some((&data[..i], TK_INTEGER)), i))
} else {
@@ -546,13 +559,13 @@ fn hex_integer(data: &[u8]) -> Result<(Option<Token<'_>>, usize), Error> {
if let Some((i, b)) = find_end_of_number(data, 2, u8::is_ascii_hexdigit)? {
// Must not be empty (Ox is invalid)
if i == 2 || is_identifier_start(b) {
return Err(Error::MalformedHexInteger(None));
return Err(Error::MalformedHexInteger(None, None));
}
Ok((Some((&data[..i], TK_INTEGER)), i))
} else {
// Must not be empty (Ox is invalid)
if data.len() == 2 {
return Err(Error::MalformedHexInteger(None));
return Err(Error::MalformedHexInteger(None, None));
}
Ok((Some((data, TK_INTEGER)), data.len()))
}
@@ -564,7 +577,7 @@ fn fractional_part(data: &[u8], i: usize) -> Result<(Option<Token<'_>>, usize),
if b == b'e' || b == b'E' {
return exponential_part(data, i);
} else if is_identifier_start(b) {
return Err(Error::BadNumber(None));
return Err(Error::BadNumber(None, None));
}
Ok((Some((&data[..i], TK_FLOAT)), i))
} else {
@@ -579,17 +592,17 @@ fn exponential_part(data: &[u8], i: usize) -> Result<(Option<Token<'_>>, usize),
let i = if *b == b'+' || *b == b'-' { i + 1 } else { i };
if let Some((j, b)) = find_end_of_number(data, i + 1, u8::is_ascii_digit)? {
if j == i + 1 || is_identifier_start(b) {
return Err(Error::BadNumber(None));
return Err(Error::BadNumber(None, None));
}
Ok((Some((&data[..j], TK_FLOAT)), j))
} else {
if data.len() == i + 1 {
return Err(Error::BadNumber(None));
return Err(Error::BadNumber(None, None));
}
Ok((Some((data, TK_FLOAT)), data.len()))
}
} else {
Err(Error::BadNumber(None))
Err(Error::BadNumber(None, None))
}
}
@@ -606,7 +619,7 @@ fn find_end_of_number(
{
continue;
}
return Err(Error::BadNumber(None));
return Err(Error::BadNumber(None, None));
} else {
return Ok(Some((j, b)));
}
@@ -660,7 +673,7 @@ mod tests {
let mut s = Scanner::new(tokenizer);
expect_token(&mut s, input, b"SELECT", TokenType::TK_SELECT)?;
let err = s.scan(input).unwrap_err();
assert!(matches!(err, Error::BadNumber(_)));
assert!(matches!(err, Error::BadNumber(_, _)));
Ok(())
}

View File

@@ -361,7 +361,7 @@ fn expect_parser_err_msg(input: &[u8], error_msg: &str) {
}
fn expect_parser_err(input: &[u8], err: ParserError) {
let r = parse(input);
if let Error::ParserError(e, _) = r.unwrap_err() {
if let Error::ParserError(e, _, _) = r.unwrap_err() {
assert_eq!(e, err);
} else {
panic!("unexpected error type")