Merge 'Return better syntax error messages' from Diego Reis

Current error messages are too "low level", e.g. returning raw token names in
messages. This PR improves this a bit.
Before:
```text
 turso> with t as (select * from pragma_schema_version); select c.schema_version from t as c;

  × unexpected token at SourceSpan { offset: SourceOffset(47), length: 1 }
   ╭────
 1 │ with t as (select * from pragma_schema_version); select c.schema_version from t as c;
   ·                                                ┬
   ·                                                ╰── here
   ╰────
  help: expected [TK_SELECT, TK_VALUES, TK_UPDATE, TK_DELETE, TK_INSERT, TK_REPLACE] but found TK_SEMI
```
Now:
```text
 turso> with t as (select * from pragma_schema_version); select c.schema_version from t as c;

  × unexpected token ';' at offset 47
   ╭────
 1 │ with t as (select * from pragma_schema_version);select c.schema_version from t as c;
   ·                                                ┬
   ·                                                ╰── here
   ╰────
  help: expected SELECT, VALUES, UPDATE, DELETE, INSERT, or REPLACE but found ';'
```
@TcMits WDYT?

Closes #3190
This commit is contained in:
Jussi Saurio
2025-10-22 10:57:54 +03:00
committed by GitHub
5 changed files with 267 additions and 56 deletions

View File

@@ -6,45 +6,101 @@ use crate::token::TokenType;
#[diagnostic()]
pub enum Error {
/// Lexer error
#[error("unrecognized token at {0:?}")]
UnrecognizedToken(#[label("here")] miette::SourceSpan),
#[error("unrecognized token '{token_text}' at offset {offset}")]
UnrecognizedToken {
#[label("here")]
span: miette::SourceSpan,
token_text: String,
offset: usize,
},
/// Missing quote or double-quote or backtick
#[error("non-terminated literal at {0:?}")]
UnterminatedLiteral(#[label("here")] miette::SourceSpan),
#[error("non-terminated literal '{token_text}' at offset {offset}")]
UnterminatedLiteral {
#[label("here")]
span: miette::SourceSpan,
token_text: String,
offset: usize,
},
/// Missing `]`
#[error("non-terminated bracket at {0:?}")]
UnterminatedBracket(#[label("here")] miette::SourceSpan),
#[error("non-terminated bracket '{token_text}' at offset {offset}")]
UnterminatedBracket {
#[label("here")]
span: miette::SourceSpan,
token_text: String,
offset: usize,
},
/// Missing `*/`
#[error("non-terminated block comment '{token_text}' at offset {offset}")]
UnterminatedBlockComment {
#[label("here")]
span: miette::SourceSpan,
token_text: String,
offset: usize,
},
/// Invalid parameter name
#[error("bad variable name at {0:?}")]
BadVariableName(#[label("here")] miette::SourceSpan),
#[error("bad variable name '{token_text}' at offset {offset}")]
BadVariableName {
#[label("here")]
span: miette::SourceSpan,
token_text: String,
offset: usize,
},
/// Invalid number format
#[error("bad number at {0:?}")]
BadNumber(#[label("here")] miette::SourceSpan),
#[error("bad number '{token_text}' at offset {offset}")]
BadNumber {
#[label("here")]
span: miette::SourceSpan,
token_text: String,
offset: usize,
},
// Bad fractional part of a number
#[error("bad fractional part at {0:?}")]
BadFractionalPart(#[label("here")] miette::SourceSpan),
#[error("bad fractional part '{token_text}' at offset {offset}")]
BadFractionalPart {
#[label("here")]
span: miette::SourceSpan,
token_text: String,
offset: usize,
},
// Bad exponent part of a number
#[error("bad exponent part at {0:?}")]
BadExponentPart(#[label("here")] miette::SourceSpan),
#[error("bad exponent part '{token_text}' at offset {offset}")]
BadExponentPart {
#[label("here")]
span: miette::SourceSpan,
token_text: String,
offset: usize,
},
/// Invalid or missing sign after `!`
#[error("expected = sign at {0:?}")]
ExpectedEqualsSign(#[label("here")] miette::SourceSpan),
#[error("expected = sign '{token_text}' at offset {offset}")]
ExpectedEqualsSign {
#[label("here")]
span: miette::SourceSpan,
token_text: String,
offset: usize,
},
/// Hexadecimal integer literals follow the C-language notation of "0x" or "0X" followed by hexadecimal digits.
#[error("malformed hex integer at {0:?}")]
MalformedHexInteger(#[label("here")] miette::SourceSpan),
#[error("malformed hex integer '{token_text}' at offset {offset}")]
MalformedHexInteger {
#[label("here")]
span: miette::SourceSpan,
token_text: String,
offset: usize,
},
// parse errors
// Unexpected end of file
#[error("unexpected end of file")]
ParseUnexpectedEOF,
// Unexpected token
#[error("unexpected token at {parsed_offset:?}")]
#[diagnostic(help("expected {expected:?} but found {got:?}"))]
#[error("unexpected token '{token_text}' at offset {offset}")]
#[diagnostic(help("expected {expected_display} but found '{token_text}'"))]
ParseUnexpectedToken {
#[label("here")]
parsed_offset: miette::SourceSpan,
got: TokenType,
expected: &'static [TokenType],
token_text: String,
offset: usize,
expected_display: String,
},
// Custom error message
#[error("{0}")]

View File

@@ -297,14 +297,27 @@ impl<'a> Lexer<'a> {
if start == self.offset {
// before the underscore, there was no digit
return Err(Error::BadNumber((start, self.offset - start).into()));
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
return Err(Error::BadNumber {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
match self.peek() {
Some(b) if b.is_ascii_digit() => continue, // Continue if next is a digit
_ => {
// after the underscore, there is no digit
return Err(Error::BadNumber((start, self.offset - start).into()));
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset])
.to_string();
return Err(Error::BadNumber {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
}
}
@@ -321,7 +334,13 @@ impl<'a> Lexer<'a> {
Some(b'_') => {
if start == self.offset {
// before the underscore, there was no digit
return Err(Error::BadNumber((start, self.offset - start).into()));
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
return Err(Error::BadNumber {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
self.eat_and_assert(|b| b == b'_');
@@ -329,7 +348,14 @@ impl<'a> Lexer<'a> {
Some(b) if b.is_ascii_hexdigit() => continue, // Continue if next is a digit
_ => {
// after the underscore, there is no digit
return Err(Error::BadNumber((start, self.offset - start).into()));
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset])
.to_string();
return Err(Error::BadNumber {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
}
}
@@ -514,9 +540,13 @@ impl<'a> Lexer<'a> {
self.eat_and_assert(|b| b == b'=');
}
_ => {
return Err(Error::ExpectedEqualsSign(
(start, self.offset - start).into(),
))
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
return Err(Error::ExpectedEqualsSign {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
}
@@ -567,9 +597,13 @@ impl<'a> Lexer<'a> {
}
}
None => {
return Err(Error::UnterminatedLiteral(
(start, self.offset - start).into(),
))
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
return Err(Error::UnterminatedLiteral {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
_ => unreachable!(),
};
@@ -598,9 +632,15 @@ impl<'a> Lexer<'a> {
token_type: Some(TokenType::TK_FLOAT),
})
}
Some(b) if is_identifier_start(b) => Err(Error::BadFractionalPart(
(start, self.offset - start).into(),
)),
Some(b) if is_identifier_start(b) => {
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
Err(Error::BadFractionalPart {
span: (start, self.offset - start).into(),
token_text,
offset: start,
})
}
_ => Ok(Token {
value: &self.input[start..self.offset],
token_type: Some(TokenType::TK_FLOAT),
@@ -627,11 +667,21 @@ impl<'a> Lexer<'a> {
let start_num = self.offset;
self.eat_while_number_digit()?;
if start_num == self.offset {
return Err(Error::BadExponentPart((start, self.offset - start).into()));
let token_text = String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
return Err(Error::BadExponentPart {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
if self.peek().is_some() && is_identifier_start(self.peek().unwrap()) {
return Err(Error::BadExponentPart((start, self.offset - start).into()));
let token_text = String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
return Err(Error::BadExponentPart {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
Ok(Token {
@@ -654,13 +704,23 @@ impl<'a> Lexer<'a> {
self.eat_while_number_hexdigit()?;
if start_hex == self.offset {
return Err(Error::MalformedHexInteger(
(start, self.offset - start).into(),
));
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
return Err(Error::MalformedHexInteger {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
if self.peek().is_some() && is_identifier_start(self.peek().unwrap()) {
return Err(Error::BadNumber((start, self.offset - start).into()));
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
return Err(Error::BadNumber {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
return Ok(Token {
@@ -689,7 +749,13 @@ impl<'a> Lexer<'a> {
})
}
Some(b) if is_identifier_start(b) => {
Err(Error::BadNumber((start, self.offset - start).into()))
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
Err(Error::BadNumber {
span: (start, self.offset - start).into(),
token_text,
offset: start,
})
}
_ => Ok(Token {
value: &self.input[start..self.offset],
@@ -710,9 +776,15 @@ impl<'a> Lexer<'a> {
token_type: Some(TokenType::TK_ID),
})
}
None => Err(Error::UnterminatedBracket(
(start, self.offset - start).into(),
)),
None => {
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
Err(Error::UnterminatedBracket {
span: (start, self.offset - start).into(),
token_text,
offset: start,
})
}
_ => unreachable!(), // We should not reach here
}
}
@@ -737,7 +809,13 @@ impl<'a> Lexer<'a> {
// empty variable name
if start_id == self.offset {
return Err(Error::BadVariableName((start, self.offset - start).into()));
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
return Err(Error::BadVariableName {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
Ok(Token {
@@ -767,9 +845,14 @@ impl<'a> Lexer<'a> {
self.eat_and_assert(|b| b == b'\'');
if (end_hex - start_hex) % 2 != 0 {
return Err(Error::UnrecognizedToken(
(start, self.offset - start).into(),
));
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset])
.to_string();
return Err(Error::UnrecognizedToken {
span: (start, self.offset - start).into(),
token_text,
offset: start,
});
}
Ok(Token {
@@ -777,9 +860,15 @@ impl<'a> Lexer<'a> {
token_type: Some(TokenType::TK_BLOB),
})
}
_ => Err(Error::UnterminatedLiteral(
(start, self.offset - start).into(),
)),
_ => {
let token_text =
String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
Err(Error::UnterminatedLiteral {
span: (start, self.offset - start).into(),
token_text,
offset: start,
})
}
}
}
_ => {
@@ -796,9 +885,12 @@ impl<'a> Lexer<'a> {
fn eat_unrecognized(&mut self) -> Result<Token<'a>> {
let start = self.offset;
self.eat_while(|b| b.is_some() && !b.unwrap().is_ascii_whitespace());
Err(Error::UnrecognizedToken(
(start, self.offset - start).into(),
))
let token_text = String::from_utf8_lossy(&self.input[start..self.offset]).to_string();
Err(Error::UnrecognizedToken {
span: (start, self.offset - start).into(),
token_text,
offset: start,
})
}
}

View File

@@ -28,12 +28,17 @@ macro_rules! peek_expect {
match (TK_ID, tt.fallback_id_if_ok()) {
$(($x, TK_ID) => token,)*
_ => {
let token_text = String::from_utf8_lossy(token.value).to_string();
let offset = $parser.offset();
return Err(Error::ParseUnexpectedToken {
parsed_offset: ($parser.offset(), token_len).into(),
expected: &[
$($x,)*
],
got: tt,
token_text: token_text.clone(),
offset,
expected_display: crate::token::TokenType::format_expected_tokens(&[$($x,)*]),
})
}
}
@@ -242,10 +247,17 @@ impl<'a> Parser<'a> {
Some(token) => {
if !found_semi {
let tt = token.token_type.unwrap();
let token_text = String::from_utf8_lossy(token.value).to_string();
let offset = self.offset();
return Err(Error::ParseUnexpectedToken {
parsed_offset: (self.offset(), 1).into(),
parsed_offset: (offset, 1).into(),
expected: &[TK_SEMI],
got: tt,
token_text: token_text.clone(),
offset,
expected_display: crate::token::TokenType::format_expected_tokens(&[
TK_SEMI,
]),
});
}
@@ -1495,10 +1507,18 @@ impl<'a> Parser<'a> {
Some(self.parse_nm()?)
} else if tok.token_type == Some(TK_LP) {
if can_be_lit_str {
let token = self.peek_no_eof()?;
let token_text = String::from_utf8_lossy(token.value).to_string();
let offset = self.offset();
return Err(Error::ParseUnexpectedToken {
parsed_offset: (self.offset() - name.len(), name.len()).into(),
got: TK_STRING,
expected: &[TK_ID, TK_INDEXED, TK_JOIN_KW],
token_text: token_text.clone(),
offset,
expected_display: crate::token::TokenType::format_expected_tokens(
&[TK_ID, TK_INDEXED, TK_JOIN_KW],
),
});
} // can not be literal string in function name

View File

@@ -548,4 +548,47 @@ impl TokenType {
_ => self,
}
}
/// Get a user-friendly display name for this token, for use in error messages.
///
/// Falls back on a hand-written label for token kinds that have no canonical
/// source text (identifiers, literals, EOF, ...); anything else unknown is
/// reported as "unknown token".
pub fn user_friendly_name(&self) -> &'static str {
    // Tokens with a fixed spelling (keywords, operators) display as that spelling.
    if let Some(spelling) = self.as_str() {
        return spelling;
    }
    // Otherwise describe the token class in plain words.
    match self {
        TokenType::TK_ID => "identifier",
        TokenType::TK_STRING => "string",
        TokenType::TK_INTEGER => "integer",
        TokenType::TK_FLOAT => "float",
        TokenType::TK_BLOB => "blob",
        TokenType::TK_VARIABLE => "variable",
        TokenType::TK_ILLEGAL => "illegal token",
        TokenType::TK_EOF => "end of file",
        TokenType::TK_LIKE_KW => "LIKE",
        TokenType::TK_JOIN_KW => "JOIN",
        TokenType::TK_CTIME_KW => "datetime function",
        TokenType::TK_ISNOT => "IS NOT",
        TokenType::TK_ISNULL => "ISNULL",
        TokenType::TK_NOTNULL => "NOTNULL",
        TokenType::TK_PTR => "->",
        _ => "unknown token",
    }
}
/// Format a list of expected tokens as a human-readable English phrase
/// for error messages: "nothing", "A", "A or B", or "A, B, or C".
pub fn format_expected_tokens(tokens: &[TokenType]) -> String {
    match tokens {
        // No alternatives at all.
        [] => "nothing".to_string(),
        // Exactly one alternative: just its name.
        [single] => single.user_friendly_name().to_string(),
        // Two alternatives: "A or B".
        [first, second] => format!(
            "{} or {}",
            first.user_friendly_name(),
            second.user_friendly_name()
        ),
        // Three or more: Oxford-comma list "A, B, or C".
        [rest @ .., last] => {
            let head: Vec<&str> = rest.iter().map(|t| t.user_friendly_name()).collect();
            format!("{}, or {}", head.join(", "), last.user_friendly_name())
        }
    }
}
}