diff --git a/core/json/json_path.pest b/core/json/json_path.pest deleted file mode 100644 index 590a3df23..000000000 --- a/core/json/json_path.pest +++ /dev/null @@ -1,8 +0,0 @@ -negative_index_indicator = ${ "#-" } -array_offset = ${ ASCII_DIGIT+ } -array_locator = ${ "[" ~ negative_index_indicator? ~ array_offset ~ "]" } -relaxed_array_locator = ${ negative_index_indicator? ~ array_offset } - -root = ${ "$" } -json_path_key = ${ identifier | string | ASCII_DIGIT+ } -path = ${ SOI ~ root ~ (array_locator | "." ~ json_path_key)* ~ EOI } diff --git a/core/json/json_path.rs b/core/json/json_path.rs index ed8981018..33631e469 100644 --- a/core/json/json_path.rs +++ b/core/json/json_path.rs @@ -1,80 +1,344 @@ -use pest::Parser as P; -use pest_derive::Parser; +use crate::bail_parse_error; +use std::borrow::Cow; -#[derive(Parser)] -#[grammar = "json/json.pest"] -#[grammar = "json/json_path.pest"] -struct Parser; +#[derive(Clone, Debug, PartialEq)] +enum PPState { + Start, + AfterRoot, + InKey, + InArrayIndex, + ExpectDotOrBracket, +} + +#[derive(Clone, Debug, PartialEq)] +enum ArrayIndexState { + Start, + AfterHash, + CollectingNumbers, + IsMax, +} /// Describes a JSON path, which is a sequence of keys and/or array locators. #[derive(Clone, Debug)] -pub struct JsonPath { - pub elements: Vec, +pub struct JsonPath<'a> { + pub elements: Vec>, } +type RawString = bool; + /// PathElement describes a single element of a JSON path. #[derive(Clone, Debug, PartialEq)] -pub enum PathElement { +pub enum PathElement<'a> { /// Root element: '$' Root(), /// JSON key - Key(String), + Key(Cow<'a, str>, RawString), /// Array locator, eg. [2], [#-5] - ArrayLocator(i32), + ArrayLocator(Option), +} + +type IsMaxNumber = bool; + +fn collect_num(current: i128, adding: i128, negative: bool) -> (i128, IsMaxNumber) { + let ten = 10i128; + + let result = if negative { + current.saturating_mul(ten).saturating_sub(adding) + } else { + current.saturating_mul(ten).saturating_add(adding) + }; + + let is_max = result == i128::MAX || result == i128::MIN; + (result, is_max) +} + +fn estimate_path_capacity(input: &str) -> usize { + // After $ we need either . or [ for each component + // So divide remaining length by 2 (minimum chars per component) + // Add 1 for the root component + 1 + (input.len() - 1) / 2 } /// Parses path into a Vec of Strings, where each string is a key or an array locator. -pub fn json_path(path: &str) -> crate::Result { - let parsed = Parser::parse(Rule::path, path); +pub fn json_path(path: &str) -> crate::Result> { + if path.is_empty() { + bail_parse_error!("Bad json path: {}", path) + } + let mut parser_state = PPState::Start; + let mut index_state = ArrayIndexState::Start; + let mut key_start = 0; + let mut index_buffer: i128 = 0; + let mut path_components = Vec::with_capacity(estimate_path_capacity(path)); + let mut path_iter = path.char_indices(); - if let Ok(mut parsed) = parsed { - let mut result = vec![]; - let parsed = parsed.next().unwrap(); - for pair in parsed.into_inner() { - match pair.as_rule() { - Rule::EOI => (), - Rule::root => result.push(PathElement::Root()), - Rule::json_path_key => result.push(PathElement::Key(pair.as_str().to_string())), - Rule::array_locator => { - let mut array_locator = pair.into_inner(); - let index_or_negative_indicator = array_locator.next().unwrap(); - - match index_or_negative_indicator.as_rule() { - Rule::negative_index_indicator => { - let negative_offset = array_locator.next().unwrap(); - // TODO: sqlite is able to parse arbitrarily big numbers, but they - // always get overflown and cast to i32. Handle this. - let parsed = negative_offset - .as_str() - .parse::() - .unwrap_or(i128::MAX); - - result.push(PathElement::ArrayLocator(-parsed as i32)); - } - Rule::array_offset => { - let array_offset = index_or_negative_indicator.as_str(); - // TODO: sqlite is able to parse arbitrarily big numbers, but they - // always get overflown and cast to i32. Handle this. - let parsed = array_offset.parse::().unwrap_or(i128::MAX); - - result.push(PathElement::ArrayLocator(parsed as i32)); - } - _ => unreachable!( - "Unexpected rule: {:?}", - index_or_negative_indicator.as_rule() - ), - } - } - _ => { - unreachable!("Unexpected rule: {:?}", pair.as_rule()); - } + while let Some(ch) = path_iter.next() { + match parser_state { + PPState::Start => { + handle_start(ch, &mut parser_state, &mut path_components, path)?; + } + PPState::AfterRoot => { + handle_after_root( + ch, + &mut parser_state, + &mut index_state, + &mut key_start, + &mut index_buffer, + path, + )?; + } + PPState::InKey => { + handle_in_key( + ch, + &mut parser_state, + &mut index_state, + &mut key_start, + &mut index_buffer, + &mut path_components, + &mut path_iter, + path, + )?; + } + PPState::InArrayIndex => { + handle_array_index( + ch, + &mut parser_state, + &mut index_state, + &mut index_buffer, + &mut path_components, + &mut path_iter, + path, + )?; + } + PPState::ExpectDotOrBracket => { + handle_expect_dot_or_bracket( + ch, + &mut parser_state, + &mut index_state, + &mut key_start, + &mut index_buffer, + path, + )?; } } - - Ok(JsonPath { elements: result }) - } else { - crate::bail_constraint_error!("JSON path error near: {:?}", path.to_string()); } + + finalize_path(parser_state, key_start, path, &mut path_components)?; + Ok(JsonPath { + elements: path_components, + }) +} + +fn handle_start( + ch: (usize, char), + parser_state: &mut PPState, + path_components: &mut Vec, + path: &str, +) -> crate::Result<()> { + match ch { + (_, '$') => { + path_components.push(PathElement::Root()); + *parser_state = PPState::AfterRoot; + Ok(()) + } + (_, _) => bail_parse_error!("Bad json path: {}", path), + } +} + +fn handle_after_root( + ch: (usize, char), + parser_state: &mut PPState, + index_state: &mut ArrayIndexState, + key_start: &mut usize, + index_buffer: &mut i128, + path: &str, +) -> crate::Result<()> { + match ch { + (idx, '.') => { + *parser_state = PPState::InKey; + *key_start = idx + ch.1.len_utf8(); + Ok(()) + } + (_, '[') => { + *index_state = ArrayIndexState::Start; + *parser_state = PPState::InArrayIndex; + *index_buffer = 0; + Ok(()) + } + (_, _) => bail_parse_error!("Bad json path: {}", path), + } +} + +fn handle_in_key<'a>( + ch: (usize, char), + parser_state: &mut PPState, + index_state: &mut ArrayIndexState, + key_start: &mut usize, + index_buffer: &mut i128, + path_components: &mut Vec>, + path_iter: &mut std::str::CharIndices, + path: &'a str, +) -> crate::Result<()> { + match ch { + (idx, '.' | '[') => { + let key_end = idx; + if key_end > *key_start { + let key = &path[*key_start..key_end]; + if ch.1 == '[' { + *index_state = ArrayIndexState::Start; + *parser_state = PPState::InArrayIndex; + *index_buffer = 0; + } else { + *key_start = idx + ch.1.len_utf8(); + } + path_components.push(PathElement::Key(Cow::Borrowed(key), false)); + } else { + bail_parse_error!("Bad json path: {}", path) + } + } + (_, '"') => { + handle_quoted_key(parser_state, key_start, path_components, path_iter, path)?; + } + (_, _) => (), + } + Ok(()) +} + +fn handle_quoted_key<'a>( + parser_state: &mut PPState, + key_start: &mut usize, + path_components: &mut Vec>, + path_iter: &mut std::str::CharIndices, + path: &'a str, +) -> crate::Result<()> { + while let Some((idx, ch)) = path_iter.next() { + match ch { + '\\' => { + path_iter.next(); + } + '"' => { + if *key_start < idx { + let key = &path[*key_start + 1..idx]; + path_components.push(PathElement::Key(Cow::Borrowed(key), true)); + *parser_state = PPState::ExpectDotOrBracket; + return Ok(()); + } + } + _ => continue, + } + } + Ok(()) +} + +fn handle_array_index( + ch: (usize, char), + parser_state: &mut PPState, + index_state: &mut ArrayIndexState, + index_buffer: &mut i128, + path_components: &mut Vec>, + path_iter: &mut std::str::CharIndices, + path: &str, +) -> crate::Result<()> { + match (&index_state, ch.1) { + (ArrayIndexState::Start, '#') => { + *index_state = ArrayIndexState::AfterHash; + } + (ArrayIndexState::Start, '0'..='9') => { + *index_buffer = ch.1.to_digit(10).unwrap() as i128; + *index_state = ArrayIndexState::CollectingNumbers; + } + (ArrayIndexState::AfterHash, '-') => { + handle_negative_index(index_state, index_buffer, path_iter, path)?; + } + (ArrayIndexState::AfterHash, ']') => { + *parser_state = PPState::ExpectDotOrBracket; + path_components.push(PathElement::ArrayLocator(None)); + } + (ArrayIndexState::CollectingNumbers, '0'..='9') => { + let (new_num, is_max) = collect_num( + *index_buffer, + ch.1.to_digit(10).unwrap() as i128, + *index_buffer < 0, + ); + if is_max { + *index_state = ArrayIndexState::IsMax; + } + *index_buffer = new_num; + } + (ArrayIndexState::IsMax, '0'..='9') => (), + (ArrayIndexState::CollectingNumbers | ArrayIndexState::IsMax, ']') => { + *parser_state = PPState::ExpectDotOrBracket; + path_components.push(PathElement::ArrayLocator(Some(*index_buffer as i32))); + } + (_, _) => bail_parse_error!("Bad json path: {}", path), + } + Ok(()) +} + +fn handle_negative_index( + index_state: &mut ArrayIndexState, + index_buffer: &mut i128, + path_iter: &mut std::str::CharIndices, + path: &str, +) -> crate::Result<()> { + if let Some((_, next_c)) = path_iter.next() { + if next_c.is_ascii_digit() { + *index_buffer = -(next_c.to_digit(10).unwrap() as i128); + *index_state = ArrayIndexState::CollectingNumbers; + Ok(()) + } else { + bail_parse_error!("Bad json path: {}", path) + } + } else { + bail_parse_error!("Bad json path: {}", path) + } +} + +fn handle_expect_dot_or_bracket( + ch: (usize, char), + parser_state: &mut PPState, + index_state: &mut ArrayIndexState, + key_start: &mut usize, + index_buffer: &mut i128, + path: &str, +) -> crate::Result<()> { + match ch { + (idx, '.') => { + *key_start = idx + ch.1.len_utf8(); + *parser_state = PPState::InKey; + Ok(()) + } + (_, '[') => { + *index_state = ArrayIndexState::Start; + *parser_state = PPState::InArrayIndex; + *index_buffer = 0; + Ok(()) + } + (_, _) => bail_parse_error!("Bad json path: {}", path), + } +} + +fn finalize_path<'a>( + parser_state: PPState, + key_start: usize, + path: &'a str, + path_components: &mut Vec>, +) -> crate::Result<()> { + match parser_state { + PPState::InArrayIndex => bail_parse_error!("Bad json path: {}", path), + PPState::InKey => { + if key_start < path.len() { + let key = &path[key_start..]; + if key.starts_with('"') & !key.ends_with('"') { + bail_parse_error!("Bad json path: {}", path) + } + path_components.push(PathElement::Key(Cow::Borrowed(key), false)); + } else { + bail_parse_error!("Bad json path: {}", path) + } + } + _ => (), + } + Ok(()) } #[cfg(test)] @@ -93,7 +357,10 @@ mod tests { let path = json_path("$.x").unwrap(); assert_eq!(path.elements.len(), 2); assert_eq!(path.elements[0], PathElement::Root()); - assert_eq!(path.elements[1], PathElement::Key("x".to_string())); + assert_eq!( + path.elements[1], + PathElement::Key(Cow::Borrowed("x"), false) + ); } #[test] @@ -101,7 +368,7 @@ mod tests { let path = json_path("$[0]").unwrap(); assert_eq!(path.elements.len(), 2); assert_eq!(path.elements[0], PathElement::Root()); - assert_eq!(path.elements[1], PathElement::ArrayLocator(0)); + assert_eq!(path.elements[1], PathElement::ArrayLocator(Some(0))); } #[test] @@ -109,7 +376,7 @@ mod tests { let path = json_path("$[#-2]").unwrap(); assert_eq!(path.elements.len(), 2); assert_eq!(path.elements[0], PathElement::Root()); - assert_eq!(path.elements[1], PathElement::ArrayLocator(-2)); + assert_eq!(path.elements[1], PathElement::ArrayLocator(Some(-2))); } #[test] @@ -122,7 +389,7 @@ mod tests { let path = json_path(value); match path { - Err(crate::error::LimboError::Constraint(_)) => { + Err(crate::error::LimboError::ParseError(_)) => { // happy path } _ => panic!("Expected error for: {:?}, got: {:?}", value, path), @@ -135,9 +402,94 @@ mod tests { let path = json_path("$.store.book[0].title").unwrap(); assert_eq!(path.elements.len(), 5); assert_eq!(path.elements[0], PathElement::Root()); - assert_eq!(path.elements[1], PathElement::Key("store".to_string())); - assert_eq!(path.elements[2], PathElement::Key("book".to_string())); - assert_eq!(path.elements[3], PathElement::ArrayLocator(0)); - assert_eq!(path.elements[4], PathElement::Key("title".to_string())); + assert_eq!( + path.elements[1], + PathElement::Key(Cow::Borrowed("store"), false) + ); + assert_eq!( + path.elements[2], + PathElement::Key(Cow::Borrowed("book"), false) + ); + assert_eq!(path.elements[3], PathElement::ArrayLocator(Some(0))); + assert_eq!( + path.elements[4], + PathElement::Key(Cow::Borrowed("title"), false) + ); + } + + #[test] + fn test_large_index_wrapping() { + let path = json_path("$[4294967296]").unwrap(); + assert_eq!(path.elements[1], PathElement::ArrayLocator(Some(0))); + + let path = json_path("$[4294967297]").unwrap(); + assert_eq!(path.elements[1], PathElement::ArrayLocator(Some(1))); + } + + #[test] + fn test_deeply_nested_path() { + let path = json_path("$[0][1][2].key[3].other").unwrap(); + assert_eq!(path.elements.len(), 7); + assert_eq!(path.elements[0], PathElement::Root()); + assert_eq!(path.elements[1], PathElement::ArrayLocator(Some(0))); + assert_eq!(path.elements[2], PathElement::ArrayLocator(Some(1))); + assert_eq!(path.elements[3], PathElement::ArrayLocator(Some(2))); + assert_eq!( + path.elements[4], + PathElement::Key(Cow::Borrowed("key"), false) + ); + assert_eq!(path.elements[5], PathElement::ArrayLocator(Some(3))); + } + + #[test] + fn test_edge_cases() { + // Empty key + assert!(json_path("$.").is_err()); + + // Multiple dots + assert!(json_path("$..key").is_err()); + + // Unclosed brackets + assert!(json_path("$[0").is_err()); + assert!(json_path("$[").is_err()); + + // Invalid negative index format + assert!(json_path("$[-1]").is_err()); // should be $[#-1] + } + + #[test] + fn test_path_capacity() { + // Test that our capacity estimation is reasonable + let short_path = "$[0]"; + assert!(estimate_path_capacity(short_path) >= 2); + + let long_path = "$.a.b.c.d.e.f.g[0][1][2]"; + assert!(estimate_path_capacity(long_path) >= 11); + } + + #[test] + fn test_quoted_keys() { + let path = json_path(r#"$."key""#).unwrap(); + assert_eq!( + path.elements[1], + PathElement::Key(Cow::Borrowed("key"), true) + ); + + let path = json_path(r#"$."key.with.dots""#).unwrap(); + assert_eq!( + path.elements[1], + PathElement::Key(Cow::Borrowed("key.with.dots"), true) + ); + + let path = json_path(r#"$."key[0]""#).unwrap(); + assert_eq!( + path.elements[1], + PathElement::Key(Cow::Borrowed("key[0]"), true) + ); + } + + #[test] + fn test_empty_quoted_key() { + assert!(json_path(r#"$."""#).is_ok()); } } diff --git a/core/json/mod.rs b/core/json/mod.rs index c1a195b49..f7a2e0205 100644 --- a/core/json/mod.rs +++ b/core/json/mod.rs @@ -15,6 +15,7 @@ use indexmap::IndexMap; use jsonb::Error as JsonbError; use ser::to_string_pretty; use serde::{Deserialize, Serialize}; +use std::borrow::Cow; #[derive(Serialize, Deserialize, Debug, PartialEq, Clone)] #[serde(untagged)] @@ -224,7 +225,7 @@ pub fn json_arrow_shift_extract( } let json = get_json_value(value)?; - let extracted = json_extract_single(&json, path, false)?.unwrap_or_else(|| &Val::Null); + let extracted = json_extract_single(&json, path, false)?.unwrap_or(&Val::Null); convert_json_to_db_type(extracted, true) } @@ -241,7 +242,7 @@ pub fn json_extract(value: &OwnedValue, paths: &[OwnedValue]) -> crate::Result crate::Result { - let extracted = - json_extract_single(&json, path, true)?.unwrap_or_else(|| &Val::Null); + let extracted = json_extract_single(&json, path, true)?.unwrap_or(&Val::Null); if paths.len() == 1 && extracted == &Val::Null { return Ok(OwnedValue::Null); @@ -392,33 +392,29 @@ fn json_extract_single<'a>( PathElement::Root() => { current_element = json; } - PathElement::Key(key) => { - let key = key.as_str(); + PathElement::Key(key, _) => match current_element { + Val::Object(map) => { + if let Some((_, value)) = map.iter().find(|(k, _)| k == key) { + current_element = value; + } else { + return Ok(None); + } + } + _ => return Ok(None), + }, + PathElement::ArrayLocator(idx) => match current_element { + Val::Array(array) => { + if let Some(mut idx) = *idx { + if idx < 0 { + idx += array.len() as i32; + } - match current_element { - Val::Object(map) => { - if let Some((_, value)) = map.iter().find(|(k, _)| k == key) { - current_element = value; + if idx < array.len() as i32 { + current_element = &array[idx as usize]; } else { return Ok(None); } } - _ => return Ok(None), - } - } - PathElement::ArrayLocator(idx) => match current_element { - Val::Array(array) => { - let mut idx = *idx; - - if idx < 0 { - idx += array.len() as i32; - } - - if idx < array.len() as i32 { - current_element = &array[idx as usize]; - } else { - return Ok(None); - } } _ => return Ok(None), }, @@ -444,17 +440,23 @@ fn json_path_from_owned_value(path: &OwnedValue, strict: bool) -> crate::Result< JsonPath { elements: vec![ PathElement::Root(), - PathElement::Key(t.as_str().to_string()), + PathElement::Key(Cow::Borrowed(t.as_str()), false), ], } } } OwnedValue::Null => return Ok(None), OwnedValue::Integer(i) => JsonPath { - elements: vec![PathElement::Root(), PathElement::ArrayLocator(*i as i32)], + elements: vec![ + PathElement::Root(), + PathElement::ArrayLocator(Some(*i as i32)), + ], }, OwnedValue::Float(f) => JsonPath { - elements: vec![PathElement::Root(), PathElement::Key(f.to_string())], + elements: vec![ + PathElement::Root(), + PathElement::Key(Cow::Owned(f.to_string()), false), + ], }, _ => crate::bail_constraint_error!("JSON path error near: {:?}", path.to_string()), } @@ -484,8 +486,9 @@ fn find_target<'a>(json: &'a mut Val, path: &JsonPath) -> Option> { PathElement::ArrayLocator(index) => match current { Val::Array(arr) => { if let Some(index) = match index { - i if *i < 0 => arr.len().checked_sub(i.unsigned_abs() as usize), - i => ((*i as usize) < arr.len()).then_some(*i as usize), + Some(i) if *i < 0 => arr.len().checked_sub(i.unsigned_abs() as usize), + Some(i) => ((*i as usize) < arr.len()).then_some(*i as usize), + None => Some(arr.len()), } { if is_last { return Some(Target::Array(arr, index)); @@ -500,7 +503,7 @@ fn find_target<'a>(json: &'a mut Val, path: &JsonPath) -> Option> { return None; } }, - PathElement::Key(key) => match current { + PathElement::Key(key, _) => match current { Val::Object(obj) => { if let Some(pos) = &obj .iter() @@ -537,8 +540,9 @@ fn find_or_create_target<'a>(json: &'a mut Val, path: &JsonPath) -> Option match current { Val::Array(arr) => { if let Some(index) = match index { - i if *i < 0 => arr.len().checked_sub(i.unsigned_abs() as usize), - i => Some(*i as usize), + Some(i) if *i < 0 => arr.len().checked_sub(i.unsigned_abs() as usize), + Some(i) => Some(*i as usize), + None => Some(arr.len()), } { if is_last { if index == arr.len() { @@ -576,7 +580,7 @@ fn find_or_create_target<'a>(json: &'a mut Val, path: &JsonPath) -> Option match current { + PathElement::Key(key, _) => match current { Val::Object(obj) => { if let Some(pos) = &obj .iter() @@ -593,7 +597,7 @@ fn find_or_create_target<'a>(json: &'a mut Val, path: &JsonPath) -> Option = mutate_json_by_path(&mut val, path, |_| { @@ -1363,7 +1367,7 @@ mod tests { let result = result.unwrap(); match &result.elements[..] { - [PathElement::Root(), PathElement::Key(field)] if *field == "field" => {} + [PathElement::Root(), PathElement::Key(field, false)] if *field == "field" => {} _ => panic!("Expected root and field"), } } @@ -1386,7 +1390,7 @@ mod tests { let result = result.unwrap(); match &result.elements[..] { - [PathElement::Root(), PathElement::ArrayLocator(index)] if *index == 3 => {} + [PathElement::Root(), PathElement::ArrayLocator(index)] if *index == Some(3) => {} _ => panic!("Expected root and array locator"), } } @@ -1432,7 +1436,7 @@ mod tests { let result = result.unwrap(); match &result.elements[..] { - [PathElement::Root(), PathElement::Key(field)] if *field == "1.23" => {} + [PathElement::Root(), PathElement::Key(field, false)] if *field == "1.23" => {} _ => panic!("Expected root and field"), } }