diff --git a/parser/src/error.rs b/parser/src/error.rs index 63d4aef78..26353ffb6 100644 --- a/parser/src/error.rs +++ b/parser/src/error.rs @@ -6,45 +6,101 @@ use crate::token::TokenType; #[diagnostic()] pub enum Error { /// Lexer error - #[error("unrecognized token at {0:?}")] - UnrecognizedToken(#[label("here")] miette::SourceSpan), + #[error("unrecognized token '{token_text}' at offset {offset}")] + UnrecognizedToken { + #[label("here")] + span: miette::SourceSpan, + token_text: String, + offset: usize, + }, /// Missing quote or double-quote or backtick - #[error("non-terminated literal at {0:?}")] - UnterminatedLiteral(#[label("here")] miette::SourceSpan), + #[error("non-terminated literal '{token_text}' at offset {offset}")] + UnterminatedLiteral { + #[label("here")] + span: miette::SourceSpan, + token_text: String, + offset: usize, + }, /// Missing `]` - #[error("non-terminated bracket at {0:?}")] - UnterminatedBracket(#[label("here")] miette::SourceSpan), + #[error("non-terminated bracket '{token_text}' at offset {offset}")] + UnterminatedBracket { + #[label("here")] + span: miette::SourceSpan, + token_text: String, + offset: usize, + }, + /// Missing `*/` + #[error("non-terminated block comment '{token_text}' at offset {offset}")] + UnterminatedBlockComment { + #[label("here")] + span: miette::SourceSpan, + token_text: String, + offset: usize, + }, /// Invalid parameter name - #[error("bad variable name at {0:?}")] - BadVariableName(#[label("here")] miette::SourceSpan), + #[error("bad variable name '{token_text}' at offset {offset}")] + BadVariableName { + #[label("here")] + span: miette::SourceSpan, + token_text: String, + offset: usize, + }, /// Invalid number format - #[error("bad number at {0:?}")] - BadNumber(#[label("here")] miette::SourceSpan), + #[error("bad number '{token_text}' at offset {offset}")] + BadNumber { + #[label("here")] + span: miette::SourceSpan, + token_text: String, + offset: usize, + }, // Bad fractional part of a number - #[error("bad fractional part at {0:?}")] - BadFractionalPart(#[label("here")] miette::SourceSpan), + #[error("bad fractional part '{token_text}' at offset {offset}")] + BadFractionalPart { + #[label("here")] + span: miette::SourceSpan, + token_text: String, + offset: usize, + }, // Bad exponent part of a number - #[error("bad exponent part at {0:?}")] - BadExponentPart(#[label("here")] miette::SourceSpan), + #[error("bad exponent part '{token_text}' at offset {offset}")] + BadExponentPart { + #[label("here")] + span: miette::SourceSpan, + token_text: String, + offset: usize, + }, /// Invalid or missing sign after `!` - #[error("expected = sign at {0:?}")] - ExpectedEqualsSign(#[label("here")] miette::SourceSpan), + #[error("expected = sign '{token_text}' at offset {offset}")] + ExpectedEqualsSign { + #[label("here")] + span: miette::SourceSpan, + token_text: String, + offset: usize, + }, /// Hexadecimal integer literals follow the C-language notation of "0x" or "0X" followed by hexadecimal digits. - #[error("malformed hex integer at {0:?}")] - MalformedHexInteger(#[label("here")] miette::SourceSpan), + #[error("malformed hex integer '{token_text}' at offset {offset}")] + MalformedHexInteger { + #[label("here")] + span: miette::SourceSpan, + token_text: String, + offset: usize, + }, // parse errors // Unexpected end of file #[error("unexpected end of file")] ParseUnexpectedEOF, // Unexpected token - #[error("unexpected token at {parsed_offset:?}")] - #[diagnostic(help("expected {expected:?} but found {got:?}"))] + #[error("unexpected token '{token_text}' at offset {offset}")] + #[diagnostic(help("expected {expected_display} but found '{token_text}'"))] ParseUnexpectedToken { #[label("here")] parsed_offset: miette::SourceSpan, got: TokenType, expected: &'static [TokenType], + token_text: String, + offset: usize, + expected_display: String, }, // Custom error message #[error("{0}")] diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index 0876e4103..33429917a 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -297,14 +297,27 @@ impl<'a> Lexer<'a> { if start == self.offset { // before the underscore, there was no digit - return Err(Error::BadNumber((start, self.offset - start).into())); + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + return Err(Error::BadNumber { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } match self.peek() { Some(b) if b.is_ascii_digit() => continue, // Continue if next is a digit _ => { // after the underscore, there is no digit - return Err(Error::BadNumber((start, self.offset - start).into())); + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]) + .to_string(); + return Err(Error::BadNumber { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } } } @@ -321,7 +334,13 @@ impl<'a> Lexer<'a> { Some(b'_') => { if start == self.offset { // before the underscore, there was no digit - return Err(Error::BadNumber((start, self.offset - start).into())); + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + return Err(Error::BadNumber { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } self.eat_and_assert(|b| b == b'_'); @@ -329,7 +348,14 @@ impl<'a> Lexer<'a> { Some(b) if b.is_ascii_hexdigit() => continue, // Continue if next is a digit _ => { // after the underscore, there is no digit - return Err(Error::BadNumber((start, self.offset - start).into())); + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]) + .to_string(); + return Err(Error::BadNumber { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } } } @@ -514,9 +540,13 @@ impl<'a> Lexer<'a> { self.eat_and_assert(|b| b == b'='); } _ => { - return Err(Error::ExpectedEqualsSign( - (start, self.offset - start).into(), - )) + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + return Err(Error::ExpectedEqualsSign { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } } @@ -567,9 +597,13 @@ impl<'a> Lexer<'a> { } } None => { - return Err(Error::UnterminatedLiteral( - (start, self.offset - start).into(), - )) + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + return Err(Error::UnterminatedLiteral { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } _ => unreachable!(), }; @@ -598,9 +632,15 @@ impl<'a> Lexer<'a> { token_type: Some(TokenType::TK_FLOAT), }) } - Some(b) if is_identifier_start(b) => Err(Error::BadFractionalPart( - (start, self.offset - start).into(), - )), + Some(b) if is_identifier_start(b) => { + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + Err(Error::BadFractionalPart { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }) + } _ => Ok(Token { value: &self.input[start..self.offset], token_type: Some(TokenType::TK_FLOAT), @@ -627,11 +667,21 @@ impl<'a> Lexer<'a> { let start_num = self.offset; self.eat_while_number_digit()?; if start_num == self.offset { - return Err(Error::BadExponentPart((start, self.offset - start).into())); + let token_text = String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + return Err(Error::BadExponentPart { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } if self.peek().is_some() && is_identifier_start(self.peek().unwrap()) { - return Err(Error::BadExponentPart((start, self.offset - start).into())); + let token_text = String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + return Err(Error::BadExponentPart { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } Ok(Token { @@ -654,13 +704,23 @@ impl<'a> Lexer<'a> { self.eat_while_number_hexdigit()?; if start_hex == self.offset { - return Err(Error::MalformedHexInteger( - (start, self.offset - start).into(), - )); + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + return Err(Error::MalformedHexInteger { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } if self.peek().is_some() && is_identifier_start(self.peek().unwrap()) { - return Err(Error::BadNumber((start, self.offset - start).into())); + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + return Err(Error::BadNumber { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } return Ok(Token { @@ -689,7 +749,13 @@ impl<'a> Lexer<'a> { }) } Some(b) if is_identifier_start(b) => { - Err(Error::BadNumber((start, self.offset - start).into())) + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + Err(Error::BadNumber { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }) } _ => Ok(Token { value: &self.input[start..self.offset], @@ -710,9 +776,15 @@ impl<'a> Lexer<'a> { token_type: Some(TokenType::TK_ID), }) } - None => Err(Error::UnterminatedBracket( - (start, self.offset - start).into(), - )), + None => { + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + Err(Error::UnterminatedBracket { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }) + } _ => unreachable!(), // We should not reach here } } @@ -737,7 +809,13 @@ impl<'a> Lexer<'a> { // empty variable name if start_id == self.offset { - return Err(Error::BadVariableName((start, self.offset - start).into())); + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + return Err(Error::BadVariableName { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } Ok(Token { @@ -767,9 +845,14 @@ impl<'a> Lexer<'a> { self.eat_and_assert(|b| b == b'\''); if (end_hex - start_hex) % 2 != 0 { - return Err(Error::UnrecognizedToken( - (start, self.offset - start).into(), - )); + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]) + .to_string(); + return Err(Error::UnrecognizedToken { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }); } Ok(Token { @@ -777,9 +860,15 @@ impl<'a> Lexer<'a> { token_type: Some(TokenType::TK_BLOB), }) } - _ => Err(Error::UnterminatedLiteral( - (start, self.offset - start).into(), - )), + _ => { + let token_text = + String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + Err(Error::UnterminatedLiteral { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }) + } } } _ => { @@ -796,9 +885,12 @@ impl<'a> Lexer<'a> { fn eat_unrecognized(&mut self) -> Result> { let start = self.offset; self.eat_while(|b| b.is_some() && !b.unwrap().is_ascii_whitespace()); - Err(Error::UnrecognizedToken( - (start, self.offset - start).into(), - )) + let token_text = String::from_utf8_lossy(&self.input[start..self.offset]).to_string(); + Err(Error::UnrecognizedToken { + span: (start, self.offset - start).into(), + token_text, + offset: start, + }) } } diff --git a/parser/src/parser.rs b/parser/src/parser.rs index 1563f3e8b..02a57fadc 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -28,12 +28,17 @@ macro_rules! peek_expect { match (TK_ID, tt.fallback_id_if_ok()) { $(($x, TK_ID) => token,)* _ => { + let token_text = String::from_utf8_lossy(token.value).to_string(); + let offset = $parser.offset(); return Err(Error::ParseUnexpectedToken { parsed_offset: ($parser.offset(), token_len).into(), expected: &[ $($x,)* ], got: tt, + token_text: token_text.clone(), + offset, + expected_display: crate::token::TokenType::format_expected_tokens(&[$($x,)*]), }) } } @@ -242,10 +247,17 @@ impl<'a> Parser<'a> { Some(token) => { if !found_semi { let tt = token.token_type.unwrap(); + let token_text = String::from_utf8_lossy(token.value).to_string(); + let offset = self.offset(); return Err(Error::ParseUnexpectedToken { - parsed_offset: (self.offset(), 1).into(), + parsed_offset: (offset, 1).into(), expected: &[TK_SEMI], got: tt, + token_text: token_text.clone(), + offset, + expected_display: crate::token::TokenType::format_expected_tokens(&[ + TK_SEMI, + ]), }); } @@ -1495,10 +1507,18 @@ impl<'a> Parser<'a> { Some(self.parse_nm()?) } else if tok.token_type == Some(TK_LP) { if can_be_lit_str { + let token = self.peek_no_eof()?; + let token_text = String::from_utf8_lossy(token.value).to_string(); + let offset = self.offset(); return Err(Error::ParseUnexpectedToken { parsed_offset: (self.offset() - name.len(), name.len()).into(), got: TK_STRING, expected: &[TK_ID, TK_INDEXED, TK_JOIN_KW], + token_text: token_text.clone(), + offset, + expected_display: crate::token::TokenType::format_expected_tokens( + &[TK_ID, TK_INDEXED, TK_JOIN_KW], + ), }); } // can not be literal string in function name diff --git a/parser/src/token.rs b/parser/src/token.rs index ed8f416c5..0f0719741 100644 --- a/parser/src/token.rs +++ b/parser/src/token.rs @@ -548,4 +548,47 @@ impl TokenType { _ => self, } } + + /// Get user-friendly display name for error messages + pub fn user_friendly_name(&self) -> &'static str { + match self.as_str() { + Some(s) => s, + None => match self { + TokenType::TK_ID => "identifier", + TokenType::TK_STRING => "string", + TokenType::TK_INTEGER => "integer", + TokenType::TK_FLOAT => "float", + TokenType::TK_BLOB => "blob", + TokenType::TK_VARIABLE => "variable", + TokenType::TK_ILLEGAL => "illegal token", + TokenType::TK_EOF => "end of file", + TokenType::TK_LIKE_KW => "LIKE", + TokenType::TK_JOIN_KW => "JOIN", + TokenType::TK_CTIME_KW => "datetime function", + TokenType::TK_ISNOT => "IS NOT", + TokenType::TK_ISNULL => "ISNULL", + TokenType::TK_NOTNULL => "NOTNULL", + TokenType::TK_PTR => "->", + _ => "unknown token", + }, + } + } + + /// Format multiple tokens for error messages + pub fn format_expected_tokens(tokens: &[TokenType]) -> String { + if tokens.is_empty() { + return "nothing".to_string(); + } + if tokens.len() == 1 { + return tokens[0].user_friendly_name().to_string(); + } + + let names: Vec<&str> = tokens.iter().map(|t| t.user_friendly_name()).collect(); + if names.len() == 2 { + format!("{} or {}", names[0], names[1]) + } else { + let (last, rest) = names.split_last().unwrap(); + format!("{}, or {}", rest.join(", "), last) + } + } } diff --git a/testing/cli_tests/extensions.py b/testing/cli_tests/extensions.py index 5f90014a9..ad6e99687 100755 --- a/testing/cli_tests/extensions.py +++ b/testing/cli_tests/extensions.py @@ -792,12 +792,12 @@ def test_csv(): ) turso.run_test_fn( "create virtual table t1 using csv(data='1'\\'2');", - lambda res: "unrecognized token at" in res, + lambda res: "unrecognized token " in res, "Create CSV table with malformed escape sequence", ) turso.run_test_fn( "create virtual table t1 using csv(data=\"12');", - lambda res: "non-terminated literal at" in res, + lambda res: "non-terminated literal " in res, "Create CSV table with unterminated quoted string", )