From 103c9bcb66397006fc70a70acec2484ac065ab47 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Sat, 8 Mar 2025 15:07:11 +0200 Subject: [PATCH 01/10] inital impl of json parsing --- core/json/jsonb.rs | 657 +++++++++++++++++++++++++++++++++++++++++++++ core/json/mod.rs | 37 +-- 2 files changed, 670 insertions(+), 24 deletions(-) create mode 100644 core/json/jsonb.rs diff --git a/core/json/jsonb.rs b/core/json/jsonb.rs new file mode 100644 index 000000000..fd62905a1 --- /dev/null +++ b/core/json/jsonb.rs @@ -0,0 +1,657 @@ +use crate::{bail_parse_error, LimboError, Result}; +use std::{ + iter::Peekable, + str::{from_utf8, Chars}, +}; + +const PAYLOAD_SIZE8: u8 = 12; +const PAYLOAD_SIZE16: u8 = 13; +const PAYLOAD_SIZE32: u8 = 14; +const MAX_JSON_DEPTH: usize = 1000; +const INFINITY_CHAR_COUNT: u8 = 5; + +#[derive(Debug, Clone)] +pub struct Jsonb { + data: Vec, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum ElementType { + NULL = 0, + TRUE = 1, + FALSE = 2, + INT = 3, + INT5 = 4, + FLOAT = 5, + FLOAT5 = 6, + TEXT = 7, + TEXTJ = 8, + TEXT5 = 9, + TEXTRAW = 10, + ARRAY = 11, + OBJECT = 12, + RESERVED1 = 13, + RESERVED2 = 14, + RESERVED3 = 15, +} + +impl TryFrom for ElementType { + type Error = LimboError; + + fn try_from(value: u8) -> std::result::Result { + match value { + 0 => Ok(Self::NULL), + 1 => Ok(Self::TRUE), + 2 => Ok(Self::FALSE), + 3 => Ok(Self::INT), + 4 => Ok(Self::INT5), + 5 => Ok(Self::FLOAT), + 6 => Ok(Self::FLOAT5), + 7 => Ok(Self::TEXT), + 8 => Ok(Self::TEXTJ), + 9 => Ok(Self::TEXT5), + 10 => Ok(Self::TEXTRAW), + 11 => Ok(Self::ARRAY), + 12 => Ok(Self::OBJECT), + 13 => Ok(Self::RESERVED1), + 14 => Ok(Self::RESERVED2), + 15 => Ok(Self::RESERVED3), + _ => bail_parse_error!("Failed to recognize jsonvalue type"), + } + } +} + +type PayloadSize = usize; + +#[derive(Debug, Clone)] +pub struct JsonbHeader(ElementType, PayloadSize); + +impl JsonbHeader { + fn new(element_type: ElementType, payload_size: PayloadSize) -> Self { + Self(element_type, payload_size) + } + + fn from_slice(cursor: usize, slice: &[u8]) -> Result<(Self, usize)> { + match slice.get(cursor) { + Some(header_byte) => { + // Extract first 4 bits (values 0-15) + let element_type = header_byte & 15; + // Get the last 4 bits for header_size + let header_size = header_byte >> 4; + let mut offset = 0; + let total_size = match header_size { + size if size <= 11 => { + offset = 1; + size as usize + } + + 12 => match slice.get(cursor + 1) { + Some(value) => { + offset = 2; + *value as usize + } + None => bail_parse_error!("Failed to read 1-byte size"), + }, + + 13 => match Self::get_size_bytes(slice, cursor + 1, 2) { + Ok(bytes) => { + offset = 3; + u16::from_be_bytes([bytes[0], bytes[1]]) as usize + } + Err(e) => return Err(e), + }, + + 14 => match Self::get_size_bytes(slice, cursor + 1, 4) { + Ok(bytes) => { + offset = 5; + u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize + } + Err(e) => return Err(e), + }, + + _ => unreachable!(), + }; + + Ok((Self(element_type.try_into()?, total_size), offset)) + } + None => bail_parse_error!("Failed to read header byte"), + } + } + + fn into_bytes(&self) -> [u8; 5] { + let mut bytes = [0; 5]; + let element_type = self.0; + let payload_size = self.1; + if payload_size <= 11 { + bytes[0] = (element_type as u8) | ((payload_size as u8) << 4); + } else if payload_size <= 0xFF { + bytes[0] = (element_type as u8) | (PAYLOAD_SIZE8 << 4); + bytes[1] = payload_size as u8; + } else if payload_size <= 0xFFFF { + bytes[0] = (element_type as u8) | (PAYLOAD_SIZE16 << 4); + + let size_bytes = (payload_size as u16).to_be_bytes(); + bytes[1] = size_bytes[0]; + bytes[2] = size_bytes[1]; + } else if payload_size <= 0xFFFFFFFF { + bytes[0] = (element_type as u8) | (PAYLOAD_SIZE32 << 4); + + let size_bytes = (payload_size as u32).to_be_bytes(); + + bytes[1] = size_bytes[0]; + bytes[2] = size_bytes[1]; + bytes[3] = size_bytes[2]; + bytes[4] = size_bytes[3]; + } else { + panic!("Payload size too large for encoding"); + } + + bytes + } + + fn get_size_bytes(slice: &[u8], start: usize, count: usize) -> Result<&[u8]> { + match slice.get(start..start + count) { + Some(bytes) => Ok(bytes), + None => bail_parse_error!("Failed to read header size"), + } + } +} + +impl Jsonb { + pub fn new(capacity: usize) -> Self { + Self { + data: Vec::with_capacity(capacity), + } + } + + pub fn len(&self) -> usize { + self.data.len() + } + + fn read_header(&self, cursor: usize) -> Result<(JsonbHeader, usize)> { + let (header, offset) = JsonbHeader::from_slice(cursor, &self.data)?; + + Ok((header, offset)) + } + + pub fn debug_read(&self) { + let mut cursor = 0usize; + while cursor < self.len() { + let (header, offset) = self.read_header(cursor).unwrap(); + cursor = cursor + offset; + println!("{:?}: HEADER", header); + if header.0 == ElementType::OBJECT || header.0 == ElementType::ARRAY { + cursor = cursor; + } else { + let value = from_utf8(&self.data[cursor..cursor + header.1]).unwrap(); + println!("{:?}: VALUE", value); + cursor = cursor + header.1 + } + } + } + + pub fn to_string(&self) -> String { + from_utf8(&self.data).unwrap().to_owned() + } + + fn deserialize_value( + &mut self, + input: &mut Peekable>, + depth: usize, + ) -> Result { + if depth > MAX_JSON_DEPTH { + bail_parse_error!("Too deep") + }; + let current_depth = depth + 1; + skip_whitespace(input); + match input.peek() { + Some('{') => { + input.next(); // consume '{' + self.deserialize_obj(input, current_depth) + } + Some('[') => { + input.next(); // consume '[' + self.deserialize_array(input, current_depth) + } + Some('t') => self.deserialize_true(input), + Some('f') => self.deserialize_false(input), + Some('n') => self.deserialize_null(input), + Some('"') => self.deserialize_string(input), + Some(c) + if c.is_ascii_digit() + || *c == '-' + || *c == '+' + || *c == '.' + || c.to_ascii_lowercase() == 'i' => + { + self.deserialize_number(input) + } + Some(ch) => bail_parse_error!("Unexpected character: {}", ch), + None => bail_parse_error!("Unexpected end of input"), + } + } + + pub fn deserialize_obj( + &mut self, + input: &mut Peekable>, + depth: usize, + ) -> Result { + if depth > MAX_JSON_DEPTH { + bail_parse_error!("Too deep!") + } + let header_pos = self.len(); + self.write_element_header(header_pos, ElementType::OBJECT, 0)?; + let obj_start = self.len(); + let mut first = true; + let current_depth = depth + 1; + loop { + skip_whitespace(input); + + match input.peek() { + Some('}') => { + input.next(); // consume '}' + if first { + return Ok(1); // empty header + } else { + let obj_size = self.len() - obj_start; + self.write_element_header(header_pos, ElementType::OBJECT, obj_size)?; + return Ok(obj_size + 2); + } + } + Some(',') if !first => { + input.next(); // consume ',' + skip_whitespace(input); + } + Some(_) => { + // Parse key (must be string) + if input.peek() != Some(&'"') { + bail_parse_error!("Object key must be a string"); + } + self.deserialize_string(input)?; + + skip_whitespace(input); + + // Expect and consume ':' + if input.next() != Some(':') { + bail_parse_error!("Expected ':' after object key"); + } + + skip_whitespace(input); + + // Parse value - can be any JSON value including another object + self.deserialize_value(input, current_depth)?; + + first = false; + } + None => { + bail_parse_error!("Unexpected end of input!") + } + } + } + } + + pub fn deserialize_array( + &mut self, + input: &mut Peekable>, + depth: usize, + ) -> Result { + if depth > MAX_JSON_DEPTH { + bail_parse_error!("Too deep"); + } + let header_pos = self.len(); + self.write_element_header(header_pos, ElementType::ARRAY, 0)?; + let arr_start = self.len(); + let mut first = true; + let current_depth = depth + 1; + loop { + skip_whitespace(input); + + match input.peek() { + Some(']') => { + input.next(); + if first { + return Ok(1); + } else { + let arr_len = self.len() - arr_start; + let header_size = + self.write_element_header(header_pos, ElementType::ARRAY, arr_len)?; + return Ok(arr_len + header_size); + } + } + Some(',') if !first => { + input.next(); // consume ',' + skip_whitespace(input); + } + Some(_) => { + skip_whitespace(input); + self.deserialize_value(input, current_depth)?; + + first = false; + } + None => { + bail_parse_error!("Unexpected end of input!") + } + } + } + } + + pub fn deserialize_string(&mut self, input: &mut Peekable>) -> Result { + let string_start = self.len(); + let quote = input.next().unwrap(); // " + + if input.peek().is_none() { + bail_parse_error!("Unexpected end of input"); + }; + // Determine if this will be TEXT, TEXTJ, or TEXT5 + let mut element_type = ElementType::TEXT; + let mut content = String::new(); + + while let Some(c) = input.next() { + if c == quote { + break; + } else if c == '\\' { + // Handle escapes + if let Some(esc) = input.next() { + match esc { + 'b' => { + content.push('\u{0008}'); + element_type = ElementType::TEXTJ; + } + 'f' => { + content.push('\u{000C}'); + element_type = ElementType::TEXTJ; + } + 'n' => { + content.push('\n'); + element_type = ElementType::TEXTJ; + } + 'r' => { + content.push('\r'); + element_type = ElementType::TEXTJ; + } + 't' => { + content.push('\t'); + element_type = ElementType::TEXTJ; + } + '\\' | '"' | '/' => { + content.push(esc); + element_type = ElementType::TEXTJ; + } + 'u' => { + // Unicode escape + element_type = ElementType::TEXTJ; + let mut code = 0u32; + for _ in 0..4 { + if let Some(h) = input.next() { + let h = h.to_digit(16); + match h { + Some(digit) => { + code = code * 16 + digit; + } + None => bail_parse_error!("Failed to parse u16"), + } + } else { + bail_parse_error!("Incomplete Unicode escape"); + } + } + match char::from_u32(code) { + Some(ch) => content.push(ch), + None => bail_parse_error!("Invalid unicode escape!"), + }; + } + // JSON5 extensions + '\n' => { + element_type = ElementType::TEXT5; + content.push('\n'); + } + '\'' | '0' | 'v' | 'x' => { + element_type = ElementType::TEXT5; + // Appropriate handling for each case + } + _ => bail_parse_error!("Invalid escape sequence: \\{}", esc), + } + } else { + bail_parse_error!("Unexpected end of input in escape sequence"); + } + } else if c <= '\u{001F}' { + // Control characters need escaping in standard JSON + element_type = ElementType::TEXT5; + content.push(c); + } else { + content.push(c); + } + } + + // Write header and payload + self.write_element_header(self.len(), element_type, content.len())?; + for byte in content.bytes() { + self.data.push(byte); + } + + Ok(self.len() - string_start) + } + + pub fn deserialize_number(&mut self, input: &mut Peekable>) -> Result { + let num_start = self.len(); + let mut num_str = String::new(); + let mut is_float = false; + let mut is_json5 = false; + + // Handle sign + if input.peek() == Some(&'-') || input.peek() == Some(&'+') { + if input.peek() == Some(&'+') { + is_json5 = true; // JSON5 extension + } + num_str.push(input.next().unwrap()); + } + + // Handle json5 float number + if input.peek() == Some(&'.') { + is_json5 = true; + }; + + // Check for hex (JSON5) + if input.peek() == Some(&'0') { + num_str.push(input.next().unwrap()); + if input.peek() == Some(&'x') || input.peek() == Some(&'X') { + num_str.push(input.next().unwrap()); + while let Some(&ch) = input.peek() { + if ch.is_digit(16) { + num_str.push(input.next().unwrap()); + } else { + break; + } + } + + // Write INT5 header and payload + self.write_element_header(self.len(), ElementType::INT5, num_str.len())?; + for byte in num_str.bytes() { + self.data.push(byte); + } + return Ok(self.len() - num_start); + } + } + + // Check for Infinity + if input.peek().map(|x| x.to_ascii_lowercase()) == Some('i') { + for expected in &['i', 'n', 'f', 'i', 'n', 'i', 't', 'y'] { + if input.next().map(|x| x.to_ascii_lowercase()) != Some(*expected) { + bail_parse_error!("Failed to parse number"); + } + } + self.write_element_header( + self.len(), + ElementType::INT5, + num_str.len() + INFINITY_CHAR_COUNT as usize, + )?; + for byte in num_str + .bytes() + .chain([b'9', b'e', b'9', b'9', b'9'].into_iter()) + { + self.data.push(byte) + } + + return Ok(self.len() - num_start); + }; + + // Regular number parsing + while let Some(&ch) = input.peek() { + match ch { + '0'..='9' => { + num_str.push(input.next().unwrap()); + } + '.' => { + is_float = true; + num_str.push(input.next().unwrap()); + } + 'e' | 'E' => { + is_float = true; + num_str.push(input.next().unwrap()); + if input.peek() == Some(&'+') || input.peek() == Some(&'-') { + num_str.push(input.next().unwrap()); + } + } + _ => break, + } + } + + // Write appropriate header and payload + let element_type = if is_float { + if is_json5 { + ElementType::FLOAT5 + } else { + ElementType::FLOAT + } + } else { + if is_json5 { + ElementType::INT5 + } else { + ElementType::INT + } + }; + + self.write_element_header(self.len(), element_type, num_str.len())?; + for byte in num_str.bytes() { + self.data.push(byte); + } + + Ok(self.len() - num_start) + } + + pub fn deserialize_null(&mut self, input: &mut Peekable>) -> Result { + let start = self.len(); + // Expect "null" + for expected in &['n', 'u', 'l', 'l'] { + if input.next() != Some(*expected) { + bail_parse_error!("Expected 'null'"); + } + } + self.data.push(ElementType::NULL as u8); + Ok(self.len() - start) + } + + pub fn deserialize_true(&mut self, input: &mut Peekable>) -> Result { + let start = self.len(); + // Expect "true" + for expected in &['t', 'r', 'u', 'e'] { + if input.next() != Some(*expected) { + bail_parse_error!("Expected 'true'"); + } + } + self.data.push(ElementType::TRUE as u8); + Ok(self.len() - start) + } + + fn deserialize_false(&mut self, input: &mut Peekable>) -> Result { + let start = self.len(); + // Expect "false" + for expected in &['f', 'a', 'l', 's', 'e'] { + if input.next() != Some(*expected) { + bail_parse_error!("Expected 'false'"); + } + } + self.data.push(ElementType::FALSE as u8); + Ok(self.len() - start) + } + + fn write_element_header( + &mut self, + cursor: usize, + element_type: ElementType, + payload_size: usize, + ) -> Result { + let header = JsonbHeader::new(element_type, payload_size).into_bytes(); + if cursor == self.len() { + for byte in header { + if byte != 0 { + self.data.push(byte); + } + } + } else { + self.data[cursor] = header[0]; + self.data.splice( + cursor + 1..cursor + 1, + header[1..].iter().filter(|&&x| x != 0).cloned(), + ); + } + Ok(header.iter().filter(|&&x| x != 0).count()) + } + + pub fn from_str(input: &str) -> Result { + let mut result = Self::new(input.len()); + let mut input_iter = input.chars().peekable(); + + result.deserialize_value(&mut input_iter, 0)?; + + Ok(result) + } +} + +impl std::str::FromStr for Jsonb { + type Err = LimboError; + + fn from_str(s: &str) -> std::result::Result { + Self::from_str(s) + } +} + +pub fn skip_whitespace(input: &mut Peekable>) { + while let Some(&ch) = input.peek() { + match ch { + ' ' | '\t' | '\n' | '\r' => { + input.next(); + } + '/' => { + // Handle JSON5 comments + input.next(); + if let Some(next_ch) = input.peek() { + if *next_ch == '/' { + // Line comment - skip until newline + input.next(); + while let Some(c) = input.next() { + if c == '\n' { + break; + } + } + } else if *next_ch == '*' { + // Block comment - skip until "*/" + input.next(); + let mut prev = '\0'; + while let Some(c) = input.next() { + if prev == '*' && c == '/' { + break; + } + prev = c; + } + } else { + // Not a comment, put the '/' back + break; + } + } else { + break; + } + } + _ => break, + } + } +} diff --git a/core/json/mod.rs b/core/json/mod.rs index f7a2e0205..26d41b4ee 100644 --- a/core/json/mod.rs +++ b/core/json/mod.rs @@ -2,17 +2,18 @@ mod de; mod error; mod json_operations; mod json_path; +mod jsonb; mod ser; pub use crate::json::de::from_str; -use crate::json::de::ordered_object; use crate::json::error::Error as JsonError; pub use crate::json::json_operations::{json_patch, json_remove}; use crate::json::json_path::{json_path, JsonPath, PathElement}; pub use crate::json::ser::to_string; use crate::types::{OwnedValue, Text, TextSubtype}; +use crate::{bail_parse_error, json::de::ordered_object}; use indexmap::IndexMap; -use jsonb::Error as JsonbError; +use jsonb::Jsonb; use ser::to_string_pretty; use serde::{Deserialize, Serialize}; use std::borrow::Cow; @@ -39,7 +40,8 @@ pub fn get_json(json_value: &OwnedValue, indent: Option<&str>) -> crate::Result< if t.subtype == TextSubtype::Json { return Ok(json_value.to_owned()); } - + let jsonbin = Jsonb::from_str(json_value.to_text().unwrap())?; + jsonbin.debug_read(); let json_val = get_json_value(json_value)?; let json = match indent { Some(indent) => to_string_pretty(&json_val, indent)?, @@ -51,11 +53,7 @@ pub fn get_json(json_value: &OwnedValue, indent: Option<&str>) -> crate::Result< OwnedValue::Blob(b) => { // TODO: use get_json_value after we implement a single Struct // to represent both JSON and JSONB - if let Ok(json) = jsonb::from_slice(b) { - Ok(OwnedValue::Text(Text::json(&json.to_string()))) - } else { - crate::bail_parse_error!("malformed JSON"); - } + bail_parse_error!("Unsupported") } OwnedValue::Null => Ok(OwnedValue::Null), _ => { @@ -79,11 +77,7 @@ fn get_json_value(json_value: &OwnedValue) -> crate::Result { } }, OwnedValue::Blob(b) => { - if let Ok(_json) = jsonb::from_slice(b) { - todo!("jsonb to json conversion"); - } else { - crate::bail_parse_error!("malformed JSON"); - } + crate::bail_parse_error!("malformed JSON"); } OwnedValue::Null => Ok(Val::Null), OwnedValue::Float(f) => Ok(Val::Float(*f)), @@ -625,13 +619,9 @@ pub fn json_error_position(json: &OwnedValue) -> crate::Result { } } }, - OwnedValue::Blob(b) => match jsonb::from_slice(b) { - Ok(_) => Ok(OwnedValue::Integer(0)), - Err(JsonbError::Syntax(_, pos)) => Ok(OwnedValue::Integer(pos as i64)), - _ => Err(crate::error::LimboError::InternalError( - "failed to determine json error position".into(), - )), - }, + OwnedValue::Blob(b) => { + bail_parse_error!("Unsupported") + } OwnedValue::Null => Ok(OwnedValue::Null), _ => Ok(OwnedValue::Integer(0)), } @@ -667,10 +657,9 @@ pub fn is_json_valid(json_value: &OwnedValue) -> crate::Result { Ok(_) => Ok(OwnedValue::Integer(1)), Err(_) => Ok(OwnedValue::Integer(0)), }, - OwnedValue::Blob(b) => match jsonb::from_slice(b) { - Ok(_) => Ok(OwnedValue::Integer(1)), - Err(_) => Ok(OwnedValue::Integer(0)), - }, + OwnedValue::Blob(b) => { + bail_parse_error!("Unsuported!") + } OwnedValue::Null => Ok(OwnedValue::Null), _ => Ok(OwnedValue::Integer(1)), } From 1efc35c728d2c6b862cdd28e49dd1d612b124b62 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Mon, 10 Mar 2025 23:39:05 +0200 Subject: [PATCH 02/10] use bytes instead of parsed utf8 --- core/json/jsonb.rs | 440 +++++++++++++++++++++++++++++++-------------- 1 file changed, 309 insertions(+), 131 deletions(-) diff --git a/core/json/jsonb.rs b/core/json/jsonb.rs index fd62905a1..aa7137e09 100644 --- a/core/json/jsonb.rs +++ b/core/json/jsonb.rs @@ -1,6 +1,7 @@ use crate::{bail_parse_error, LimboError, Result}; use std::{ iter::Peekable, + slice::Iter, str::{from_utf8, Chars}, }; @@ -191,38 +192,152 @@ impl Jsonb { } pub fn to_string(&self) -> String { - from_utf8(&self.data).unwrap().to_owned() + let mut result = String::with_capacity(self.data.len() * 2); + self.write_to_string(&mut result); + + result } - fn deserialize_value( - &mut self, - input: &mut Peekable>, - depth: usize, + fn write_to_string(&self, string: &mut String) -> Result<()> { + let cursor = 0; + let _ = self.serialize_value(string, cursor); + Ok(()) + } + + fn serialize_value(&self, string: &mut String, cursor: usize) -> Result { + let (header, skip_header) = self.read_header(cursor)?; + let cursor = cursor + skip_header; + + let current_cursor = match header { + JsonbHeader(ElementType::OBJECT, len) => self.serialize_object(string, cursor, len)?, + JsonbHeader(ElementType::ARRAY, len) => self.serialize_array(string, cursor, len)?, + JsonbHeader(ElementType::TEXT, len) + | JsonbHeader(ElementType::TEXTRAW, len) + | JsonbHeader(ElementType::TEXTJ, len) + | JsonbHeader(ElementType::TEXT5, len) => { + self.serialize_string(string, cursor, len, &header.0)? + } + JsonbHeader(ElementType::INT, len) + | JsonbHeader(ElementType::INT5, len) + | JsonbHeader(ElementType::FLOAT, len) + | JsonbHeader(ElementType::FLOAT5, len) => { + self.serialize_number(string, cursor, len, &header.0)? + } + + JsonbHeader(ElementType::TRUE, _) | JsonbHeader(ElementType::FALSE, _) => { + self.serialize_boolean(string, cursor)? + } + JsonbHeader(ElementType::NULL, _) => self.serialize_null(string, cursor)?, + JsonbHeader(_, _) => { + unreachable!(); + } + }; + Ok(current_cursor) + } + + fn serialize_object(&self, string: &mut String, cursor: usize, len: usize) -> Result { + let end_cursor = cursor + len; + let mut current_cursor = cursor; + string.push('{'); + while current_cursor < end_cursor { + let (key_header, key_header_offset) = self.read_header(current_cursor)?; + current_cursor += key_header_offset; + let JsonbHeader(element_type, len) = key_header; + string.push('"'); + match element_type { + ElementType::TEXT + | ElementType::TEXTRAW + | ElementType::TEXTJ + | ElementType::TEXT5 => { + current_cursor = + self.serialize_string(string, current_cursor, len, &element_type)?; + } + _ => bail_parse_error!("Malformed json!"), + } + string.push('"'); + string.push(':'); + current_cursor = self.serialize_value(string, current_cursor)?; + if current_cursor < end_cursor { + string.push(','); + } + } + string.push('}'); + Ok(current_cursor) + } + + fn serialize_array(&self, string: &mut String, cursor: usize, len: usize) -> Result { + let end_cursor = cursor + len; + let mut current_cursor = cursor; + + string.push('['); + + while end_cursor > current_cursor { + current_cursor = self.serialize_value(string, cursor)?; + if end_cursor > current_cursor { + string.push(','); + } + } + + string.push(']'); + Ok(current_cursor) + } + + fn serialize_string( + &self, + string: &mut String, + cursor: usize, + len: usize, + kind: &ElementType, ) -> Result { + todo!() + } + + fn serialize_number( + &self, + string: &mut String, + cursor: usize, + len: usize, + kind: &ElementType, + ) -> Result { + todo!() + } + + fn serialize_boolean(&self, string: &mut String, cursor: usize) -> Result { + todo!() + } + + fn serialize_null(&self, string: &mut String, cursor: usize) -> Result { + todo!() + } + + fn deserialize_value<'a, I>(&mut self, input: &mut Peekable, depth: usize) -> Result + where + I: Iterator, + { if depth > MAX_JSON_DEPTH { bail_parse_error!("Too deep") }; let current_depth = depth + 1; skip_whitespace(input); match input.peek() { - Some('{') => { + Some(b'{') => { input.next(); // consume '{' self.deserialize_obj(input, current_depth) } - Some('[') => { + Some(b'[') => { input.next(); // consume '[' self.deserialize_array(input, current_depth) } - Some('t') => self.deserialize_true(input), - Some('f') => self.deserialize_false(input), - Some('n') => self.deserialize_null(input), - Some('"') => self.deserialize_string(input), - Some(c) + Some(b't') => self.deserialize_true(input), + Some(b'f') => self.deserialize_false(input), + Some(b'n') => self.deserialize_null(input), + Some(b'"') => self.deserialize_string(input), + Some(&&c) if c.is_ascii_digit() - || *c == '-' - || *c == '+' - || *c == '.' - || c.to_ascii_lowercase() == 'i' => + || c == b'-' + || c == b'+' + || c == b'.' + || c.to_ascii_lowercase() == b'i' => { self.deserialize_number(input) } @@ -231,11 +346,10 @@ impl Jsonb { } } - pub fn deserialize_obj( - &mut self, - input: &mut Peekable>, - depth: usize, - ) -> Result { + pub fn deserialize_obj<'a, I>(&mut self, input: &mut Peekable, depth: usize) -> Result + where + I: Iterator, + { if depth > MAX_JSON_DEPTH { bail_parse_error!("Too deep!") } @@ -248,7 +362,7 @@ impl Jsonb { skip_whitespace(input); match input.peek() { - Some('}') => { + Some(&&b'}') => { input.next(); // consume '}' if first { return Ok(1); // empty header @@ -258,13 +372,13 @@ impl Jsonb { return Ok(obj_size + 2); } } - Some(',') if !first => { + Some(&&b',') if !first => { input.next(); // consume ',' skip_whitespace(input); } Some(_) => { // Parse key (must be string) - if input.peek() != Some(&'"') { + if input.peek() != Some(&&b'"') { bail_parse_error!("Object key must be a string"); } self.deserialize_string(input)?; @@ -272,7 +386,7 @@ impl Jsonb { skip_whitespace(input); // Expect and consume ':' - if input.next() != Some(':') { + if input.next() != Some(&&b':') { bail_parse_error!("Expected ':' after object key"); } @@ -290,11 +404,14 @@ impl Jsonb { } } - pub fn deserialize_array( + pub fn deserialize_array<'a, I>( &mut self, - input: &mut Peekable>, + input: &mut Peekable, depth: usize, - ) -> Result { + ) -> Result + where + I: Iterator, + { if depth > MAX_JSON_DEPTH { bail_parse_error!("Too deep"); } @@ -307,7 +424,7 @@ impl Jsonb { skip_whitespace(input); match input.peek() { - Some(']') => { + Some(&&b']') => { input.next(); if first { return Ok(1); @@ -318,7 +435,7 @@ impl Jsonb { return Ok(arr_len + header_size); } } - Some(',') if !first => { + Some(&&b',') if !first => { input.next(); // consume ',' skip_whitespace(input); } @@ -335,159 +452,192 @@ impl Jsonb { } } - pub fn deserialize_string(&mut self, input: &mut Peekable>) -> Result { + fn deserialize_string<'a, I>(&mut self, input: &mut Peekable) -> Result + where + I: Iterator, + { let string_start = self.len(); let quote = input.next().unwrap(); // " + let mut len = 0; + self.write_element_header(string_start, ElementType::TEXT, 0)?; + let payload_start = self.len(); if input.peek().is_none() { bail_parse_error!("Unexpected end of input"); }; // Determine if this will be TEXT, TEXTJ, or TEXT5 let mut element_type = ElementType::TEXT; - let mut content = String::new(); while let Some(c) = input.next() { if c == quote { break; - } else if c == '\\' { + } else if c == &b'\\' { // Handle escapes - if let Some(esc) = input.next() { + if let Some(&esc) = input.next() { match esc { - 'b' => { - content.push('\u{0008}'); + b'b' => { + self.data.push('\u{0008}' as u8); + len += 1; element_type = ElementType::TEXTJ; } - 'f' => { - content.push('\u{000C}'); + b'f' => { + self.data.push('\u{000C}' as u8); + len += 1; element_type = ElementType::TEXTJ; } - 'n' => { - content.push('\n'); + b'n' => { + self.data.push('\n' as u8); + len += 1; element_type = ElementType::TEXTJ; } - 'r' => { - content.push('\r'); + b'r' => { + self.data.push('\r' as u8); + len += 1; element_type = ElementType::TEXTJ; } - 't' => { - content.push('\t'); + b't' => { + self.data.push('\t' as u8); + len += 1; element_type = ElementType::TEXTJ; } - '\\' | '"' | '/' => { - content.push(esc); + b'\\' | b'"' | b'/' => { + self.data.push(esc); + len += 1; element_type = ElementType::TEXTJ; } - 'u' => { + b'u' => { // Unicode escape element_type = ElementType::TEXTJ; - let mut code = 0u32; + self.data.push(b'\\'); + self.data.push(b'u'); + len += 2; for _ in 0..4 { - if let Some(h) = input.next() { - let h = h.to_digit(16); - match h { - Some(digit) => { - code = code * 16 + digit; - } - None => bail_parse_error!("Failed to parse u16"), + if let Some(&h) = input.next() { + if is_hex_digit(h) { + self.data.push(h); + len += 1; + } else { + bail_parse_error!("Incomplete Unicode escape"); } } else { bail_parse_error!("Incomplete Unicode escape"); } } - match char::from_u32(code) { - Some(ch) => content.push(ch), - None => bail_parse_error!("Invalid unicode escape!"), - }; } // JSON5 extensions - '\n' => { + b'\n' => { element_type = ElementType::TEXT5; - content.push('\n'); + self.data.push(b'\n'); + len += 1; } - '\'' | '0' | 'v' | 'x' => { + b'\'' => { element_type = ElementType::TEXT5; - // Appropriate handling for each case + self.data.push(b'\\'); + self.data.push(b'\''); + len += 2; + } + b'0' => { + element_type = ElementType::TEXT5; + self.data.push(b'\\'); + self.data.push(b'0'); + len += 2; + } + b'v' => { + element_type = ElementType::TEXT5; + self.data.push(b'\\'); + self.data.push(b'v'); + len += 2; + } + b'x' => { + element_type = ElementType::TEXT5; + self.data.push(b'\\'); + self.data.push(b'x'); + len += 2; + } + _ => { + bail_parse_error!("Invalid escape sequence") } - _ => bail_parse_error!("Invalid escape sequence: \\{}", esc), } } else { bail_parse_error!("Unexpected end of input in escape sequence"); } - } else if c <= '\u{001F}' { + } else if c <= &('\u{001F}' as u8) { // Control characters need escaping in standard JSON element_type = ElementType::TEXT5; - content.push(c); + self.data.push(*c); + len += 1; } else { - content.push(c); + self.data.push(*c); + len += 1; } } // Write header and payload - self.write_element_header(self.len(), element_type, content.len())?; - for byte in content.bytes() { - self.data.push(byte); - } + self.write_element_header(string_start, element_type, len)?; - Ok(self.len() - string_start) + Ok(self.len() - payload_start) } - pub fn deserialize_number(&mut self, input: &mut Peekable>) -> Result { + pub fn deserialize_number<'a, I>(&mut self, input: &mut Peekable) -> Result + where + I: Iterator, + { let num_start = self.len(); - let mut num_str = String::new(); + let mut len = 0; let mut is_float = false; let mut is_json5 = false; + self.write_element_header(num_start, ElementType::INT, 0)?; // Handle sign - if input.peek() == Some(&'-') || input.peek() == Some(&'+') { - if input.peek() == Some(&'+') { + if input.peek() == Some(&&b'-') || input.peek() == Some(&&b'+') { + if input.peek() == Some(&&b'+') { is_json5 = true; // JSON5 extension } - num_str.push(input.next().unwrap()); + self.data.push(*input.next().unwrap()); + len += 1; } // Handle json5 float number - if input.peek() == Some(&'.') { + if input.peek() == Some(&&b'.') { is_json5 = true; }; // Check for hex (JSON5) - if input.peek() == Some(&'0') { - num_str.push(input.next().unwrap()); - if input.peek() == Some(&'x') || input.peek() == Some(&'X') { - num_str.push(input.next().unwrap()); - while let Some(&ch) = input.peek() { - if ch.is_digit(16) { - num_str.push(input.next().unwrap()); + if input.peek() == Some(&&b'0') { + self.data.push(*input.next().unwrap()); + len += 1; + if input.peek() == Some(&&b'x') || input.peek() == Some(&&b'X') { + self.data.push(*input.next().unwrap()); + len += 1; + while let Some(&&byte) = input.peek() { + if is_hex_digit(byte) { + self.data.push(*input.next().unwrap()); + len += 1; } else { break; } } // Write INT5 header and payload - self.write_element_header(self.len(), ElementType::INT5, num_str.len())?; - for byte in num_str.bytes() { - self.data.push(byte); - } + self.write_element_header(num_start, ElementType::INT5, len)?; + return Ok(self.len() - num_start); } } // Check for Infinity - if input.peek().map(|x| x.to_ascii_lowercase()) == Some('i') { - for expected in &['i', 'n', 'f', 'i', 'n', 'i', 't', 'y'] { + if input.peek().map(|x| x.to_ascii_lowercase()) == Some(b'i') { + for expected in &[b'i', b'n', b'f', b'i', b'n', b'i', b't', b'y'] { if input.next().map(|x| x.to_ascii_lowercase()) != Some(*expected) { bail_parse_error!("Failed to parse number"); } } self.write_element_header( - self.len(), + num_start, ElementType::INT5, - num_str.len() + INFINITY_CHAR_COUNT as usize, + len + INFINITY_CHAR_COUNT as usize, )?; - for byte in num_str - .bytes() - .chain([b'9', b'e', b'9', b'9', b'9'].into_iter()) - { + for byte in [b'9', b'e', b'9', b'9', b'9'].into_iter() { self.data.push(byte) } @@ -495,20 +645,24 @@ impl Jsonb { }; // Regular number parsing - while let Some(&ch) = input.peek() { + while let Some(&&ch) = input.peek() { match ch { - '0'..='9' => { - num_str.push(input.next().unwrap()); + b'0'..=b'9' => { + self.data.push(*input.next().unwrap()); + len += 1; } - '.' => { + b'.' => { is_float = true; - num_str.push(input.next().unwrap()); + self.data.push(*input.next().unwrap()); + len += 1; } - 'e' | 'E' => { + b'e' | b'E' => { is_float = true; - num_str.push(input.next().unwrap()); - if input.peek() == Some(&'+') || input.peek() == Some(&'-') { - num_str.push(input.next().unwrap()); + self.data.push(*input.next().unwrap()); + len += 1; + if input.peek() == Some(&&b'+') || input.peek() == Some(&&b'-') { + self.data.push(*input.next().unwrap()); + len += 1; } } _ => break, @@ -530,19 +684,19 @@ impl Jsonb { } }; - self.write_element_header(self.len(), element_type, num_str.len())?; - for byte in num_str.bytes() { - self.data.push(byte); - } + self.write_element_header(num_start, element_type, len)?; Ok(self.len() - num_start) } - pub fn deserialize_null(&mut self, input: &mut Peekable>) -> Result { + pub fn deserialize_null<'a, I>(&mut self, input: &mut Peekable) -> Result + where + I: Iterator, + { let start = self.len(); // Expect "null" - for expected in &['n', 'u', 'l', 'l'] { - if input.next() != Some(*expected) { + for expected in &[b'n', b'u', b'l', b'l'] { + if input.next() != Some(expected) { bail_parse_error!("Expected 'null'"); } } @@ -550,11 +704,14 @@ impl Jsonb { Ok(self.len() - start) } - pub fn deserialize_true(&mut self, input: &mut Peekable>) -> Result { + pub fn deserialize_true<'a, I>(&mut self, input: &mut Peekable) -> Result + where + I: Iterator, + { let start = self.len(); // Expect "true" - for expected in &['t', 'r', 'u', 'e'] { - if input.next() != Some(*expected) { + for expected in &[b't', b'r', b'u', b'e'] { + if input.next() != Some(expected) { bail_parse_error!("Expected 'true'"); } } @@ -562,11 +719,14 @@ impl Jsonb { Ok(self.len() - start) } - fn deserialize_false(&mut self, input: &mut Peekable>) -> Result { + fn deserialize_false<'a, I>(&mut self, input: &mut Peekable) -> Result + where + I: Iterator, + { let start = self.len(); // Expect "false" - for expected in &['f', 'a', 'l', 's', 'e'] { - if input.next() != Some(*expected) { + for expected in &[b'f', b'a', b'l', b's', b'e'] { + if input.next() != Some(expected) { bail_parse_error!("Expected 'false'"); } } @@ -599,12 +759,20 @@ impl Jsonb { pub fn from_str(input: &str) -> Result { let mut result = Self::new(input.len()); - let mut input_iter = input.chars().peekable(); + let mut input_iter = input.as_bytes().iter().peekable(); result.deserialize_value(&mut input_iter, 0)?; Ok(result) } + + pub fn from_bytes(input: &[u8]) -> Result { + let mut result = Self::new(input.len()); + let mut input_iter = input.iter().peekable(); + result.deserialize_value(&mut input_iter, 0)?; + + Ok(result) + } } impl std::str::FromStr for Jsonb { @@ -615,30 +783,33 @@ impl std::str::FromStr for Jsonb { } } -pub fn skip_whitespace(input: &mut Peekable>) { +pub fn skip_whitespace<'a, I>(input: &mut Peekable) +where + I: Iterator, +{ while let Some(&ch) = input.peek() { match ch { - ' ' | '\t' | '\n' | '\r' => { + b' ' | b'\t' | b'\n' | b'\r' => { input.next(); } - '/' => { + b'/' => { // Handle JSON5 comments input.next(); - if let Some(next_ch) = input.peek() { - if *next_ch == '/' { + if let Some(&&next_ch) = input.peek() { + if next_ch == b'/' { // Line comment - skip until newline input.next(); - while let Some(c) = input.next() { - if c == '\n' { + while let Some(&c) = input.next() { + if c == b'\n' { break; } } - } else if *next_ch == '*' { + } else if next_ch == b'*' { // Block comment - skip until "*/" input.next(); - let mut prev = '\0'; - while let Some(c) = input.next() { - if prev == '*' && c == '/' { + let mut prev = b'\0'; + while let Some(&c) = input.next() { + if prev == b'*' && c == b'/' { break; } prev = c; @@ -655,3 +826,10 @@ pub fn skip_whitespace(input: &mut Peekable>) { } } } + +fn is_hex_digit(b: u8) -> bool { + match b { + b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' => true, + _ => false, + } +} From 47554fda857ebdc4a34a53171ac65dcb40ce1c3c Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Tue, 11 Mar 2025 22:31:57 +0200 Subject: [PATCH 03/10] add serialization functions --- core/json/jsonb.rs | 515 +++++++++++++++++++++++++++++++++++++++------ core/json/mod.rs | 3 +- 2 files changed, 457 insertions(+), 61 deletions(-) diff --git a/core/json/jsonb.rs b/core/json/jsonb.rs index aa7137e09..009203a3f 100644 --- a/core/json/jsonb.rs +++ b/core/json/jsonb.rs @@ -1,9 +1,5 @@ use crate::{bail_parse_error, LimboError, Result}; -use std::{ - iter::Peekable, - slice::Iter, - str::{from_utf8, Chars}, -}; +use std::{fmt::Write, iter::Peekable, str::from_utf8}; const PAYLOAD_SIZE8: u8 = 12; const PAYLOAD_SIZE16: u8 = 13; @@ -79,7 +75,7 @@ impl JsonbHeader { let element_type = header_byte & 15; // Get the last 4 bits for header_size let header_size = header_byte >> 4; - let mut offset = 0; + let offset: usize; let total_size = match header_size { size if size <= 11 => { offset = 1; @@ -159,7 +155,12 @@ impl JsonbHeader { } impl Jsonb { - pub fn new(capacity: usize) -> Self { + pub fn new(capacity: usize, data: Option<&[u8]>) -> Self { + if let Some(data) = data { + return Self { + data: data.to_vec(), + }; + } Self { data: Vec::with_capacity(capacity), } @@ -175,6 +176,15 @@ impl Jsonb { Ok((header, offset)) } + pub fn is_valid(&self) -> Result<()> { + match self.read_header(0) { + Ok(_) => Ok(()), + Err(_) => bail_parse_error!("Malformed json"), + } + } + + #[allow(dead_code)] + // Needed for debug. I am open to remove it pub fn debug_read(&self) { let mut cursor = 0usize; while cursor < self.len() { @@ -191,11 +201,11 @@ impl Jsonb { } } - pub fn to_string(&self) -> String { + pub fn to_string(&self) -> Result { let mut result = String::with_capacity(self.data.len() * 2); - self.write_to_string(&mut result); + self.write_to_string(&mut result)?; - result + Ok(result) } fn write_to_string(&self, string: &mut String) -> Result<()> { @@ -224,10 +234,9 @@ impl Jsonb { self.serialize_number(string, cursor, len, &header.0)? } - JsonbHeader(ElementType::TRUE, _) | JsonbHeader(ElementType::FALSE, _) => { - self.serialize_boolean(string, cursor)? - } - JsonbHeader(ElementType::NULL, _) => self.serialize_null(string, cursor)?, + JsonbHeader(ElementType::TRUE, _) => self.serialize_boolean(string, cursor, true), + JsonbHeader(ElementType::FALSE, _) => self.serialize_boolean(string, cursor, false), + JsonbHeader(ElementType::NULL, _) => self.serialize_null(string, cursor), JsonbHeader(_, _) => { unreachable!(); } @@ -243,7 +252,7 @@ impl Jsonb { let (key_header, key_header_offset) = self.read_header(current_cursor)?; current_cursor += key_header_offset; let JsonbHeader(element_type, len) = key_header; - string.push('"'); + match element_type { ElementType::TEXT | ElementType::TEXTRAW @@ -254,7 +263,7 @@ impl Jsonb { } _ => bail_parse_error!("Malformed json!"), } - string.push('"'); + string.push(':'); current_cursor = self.serialize_value(string, current_cursor)?; if current_cursor < end_cursor { @@ -271,9 +280,9 @@ impl Jsonb { string.push('['); - while end_cursor > current_cursor { - current_cursor = self.serialize_value(string, cursor)?; - if end_cursor > current_cursor { + while current_cursor < end_cursor { + current_cursor = self.serialize_value(string, current_cursor)?; + if current_cursor < end_cursor { string.push(','); } } @@ -289,7 +298,169 @@ impl Jsonb { len: usize, kind: &ElementType, ) -> Result { - todo!() + let word_slice = &self.data[cursor..cursor + len]; + string.push('"'); + match kind { + // Can be serialized as is. Do not need escaping + &ElementType::TEXT => { + let word = from_utf8(word_slice).map_err(|_| { + LimboError::ParseError("Failed to serialize string!".to_string()) + })?; + string.push_str(word); + } + + // Contain standard json escapes + &ElementType::TEXTJ => { + let word = from_utf8(word_slice).map_err(|_| { + LimboError::ParseError("Failed to serialize string!".to_string()) + })?; + string.push_str(word); + } + + // We have to escape some JSON5 escape sequences + &ElementType::TEXT5 => { + let mut i = 0; + while i < word_slice.len() { + let ch = word_slice[i]; + + // Handle normal characters that don't need escaping + if self.is_json_ok(ch) || ch == b'\'' { + string.push(ch as char); + i += 1; + continue; + } + + // Handle special cases + match ch { + // Double quotes need escaping + b'"' => { + string.push_str("\\\""); + i += 1; + } + + // Control characters (0x00-0x1F) + ch if ch <= 0x1F => { + match ch { + // \b + 0x08 => string.push_str("\\b"), + b'\t' => string.push_str("\\t"), + b'\n' => string.push_str("\\n"), + // \f + 0x0C => string.push_str("\\f"), + b'\r' => string.push_str("\\r"), + _ => { + // Format as \u00XX + let hex = format!("\\u{:04x}", ch); + string.push_str(&hex); + } + } + i += 1; + } + + // Handle escape sequences + b'\\' if i + 1 < word_slice.len() => { + let next_ch = word_slice[i + 1]; + match next_ch { + // Single quote + b'\'' => { + string.push('\''); + i += 2; + } + + // Vertical tab + b'v' => { + string.push_str("\\u0009"); + i += 2; + } + + // Hex escapes like \x27 + b'x' if i + 3 < word_slice.len() => { + string.push_str("\\u00"); + string.push(word_slice[i + 2] as char); + string.push(word_slice[i + 3] as char); + i += 4; + } + + // Null character + b'0' => { + string.push_str("\\u0000"); + i += 2; + } + + // CR line continuation + b'\r' => { + if i + 2 < word_slice.len() && word_slice[i + 2] == b'\n' { + i += 3; // Skip CRLF + } else { + i += 2; // Skip CR + } + } + + // LF line continuation + b'\n' => { + i += 2; + } + + // Unicode line separators (U+2028 and U+2029) + 0xe2 if i + 3 < word_slice.len() + && word_slice[i + 2] == 0x80 + && (word_slice[i + 3] == 0xa8 || word_slice[i + 3] == 0xa9) => + { + i += 4; + } + + // All other escapes pass through + _ => { + string.push('\\'); + string.push(next_ch as char); + i += 2; + } + } + } + + // Default case - just push the character + _ => { + string.push(ch as char); + i += 1; + } + } + } + } + + &ElementType::TEXTRAW => { + // Handle TEXTRAW if needed + let word = from_utf8(word_slice).map_err(|_| { + LimboError::ParseError("Failed to serialize string!".to_string()) + })?; + + // For TEXTRAW, we need to escape special characters for JSON + for ch in word.chars() { + match ch { + '"' => string.push_str("\\\""), + '\\' => string.push_str("\\\\"), + '\x08' => string.push_str("\\b"), + '\x0C' => string.push_str("\\f"), + '\n' => string.push_str("\\n"), + '\r' => string.push_str("\\r"), + '\t' => string.push_str("\\t"), + c if c <= '\u{001F}' => { + string.push_str(&format!("\\u{:04x}", c as u32)); + } + _ => string.push(ch), + } + } + } + + _ => { + unreachable!() + } + } + string.push('"'); + Ok(cursor + len) + } + + fn is_json_ok(&self, ch: u8) -> bool { + ch >= 0x20 && ch <= 0x7E && ch != b'"' && ch != b'\\' } fn serialize_number( @@ -299,15 +470,110 @@ impl Jsonb { len: usize, kind: &ElementType, ) -> Result { - todo!() + let current_cursor = cursor + len; + let num_slice = from_utf8(&self.data[cursor..current_cursor]) + .map_err(|_| LimboError::ParseError("Failed to parse integer".to_string()))?; + + match kind { + ElementType::INT | ElementType::FLOAT => { + string.push_str(num_slice); + } + ElementType::INT5 => { + self.serialize_int5(string, num_slice)?; + } + ElementType::FLOAT5 => { + self.serialize_float5(string, num_slice)?; + } + _ => unreachable!(), + } + Ok(current_cursor) } - fn serialize_boolean(&self, string: &mut String, cursor: usize) -> Result { - todo!() + fn serialize_int5(&self, string: &mut String, hex_str: &str) -> Result<()> { + // Check if number is hex + if hex_str.len() > 2 + && (hex_str[..2].eq_ignore_ascii_case("0x") + || (hex_str.starts_with("-") || hex_str.starts_with("+")) + && hex_str[1..3].eq_ignore_ascii_case("0x")) + { + let (sign, hex_part) = if hex_str.starts_with("-0x") || hex_str.starts_with("-0X") { + ("-", &hex_str[3..]) + } else if hex_str.starts_with("+0x") || hex_str.starts_with("+0X") { + ("", &hex_str[3..]) + } else { + ("", &hex_str[2..]) + }; + + // Add sign + string.push_str(sign); + + let mut value = 0u64; + + for ch in hex_part.chars() { + if !ch.is_ascii_hexdigit() { + bail_parse_error!("Failed to parse hex digit: {}", hex_part); + } + + if (value >> 60) != 0 { + string.push_str("9.0e999"); + return Ok(()); + } + + value = value * 16 + ch.to_digit(16).unwrap_or(0) as u64; + } + write!(string, "{}", value) + .map_err(|_| LimboError::ParseError("Error writing string to json!".to_string()))?; + } else { + string.push_str(hex_str); + } + + Ok(()) } - fn serialize_null(&self, string: &mut String, cursor: usize) -> Result { - todo!() + fn serialize_float5(&self, string: &mut String, float_str: &str) -> Result<()> { + if float_str.len() < 2 { + bail_parse_error!("Integer is less then 2 chars: {}", float_str); + } + match float_str { + val if val.starts_with("-.") => { + string.push_str("-0."); + string.push_str(&val[2..]); + } + val if val.starts_with("+.") => { + string.push_str("0."); + string.push_str(&val[2..]); + } + val if val.starts_with(".") => { + string.push_str("0."); + string.push_str(&val[1..]); + } + val if val + .chars() + .next() + .map_or(false, |c| c.is_ascii_alphanumeric() || c == '+' || c == '-') => + { + string.push_str(val); + string.push('0'); + } + _ => bail_parse_error!("Unable to serialize float5: {}", float_str), + } + + Ok(()) + } + + fn serialize_boolean(&self, string: &mut String, cursor: usize, val: bool) -> usize { + if val { + string.push_str("true"); + } else { + string.push_str("false"); + } + + cursor + } + + fn serialize_null(&self, string: &mut String, cursor: usize) -> usize { + string.push_str("null"); + cursor } fn deserialize_value<'a, I>(&mut self, input: &mut Peekable, depth: usize) -> Result @@ -330,8 +596,9 @@ impl Jsonb { } Some(b't') => self.deserialize_true(input), Some(b'f') => self.deserialize_false(input), - Some(b'n') => self.deserialize_null(input), + Some(b'n') => self.deserialize_null_or_nan(input), Some(b'"') => self.deserialize_string(input), + Some(b'\'') => self.deserialize_string(input), Some(&&c) if c.is_ascii_digit() || c == b'-' @@ -378,9 +645,6 @@ impl Jsonb { } Some(_) => { // Parse key (must be string) - if input.peek() != Some(&&b'"') { - bail_parse_error!("Object key must be a string"); - } self.deserialize_string(input)?; skip_whitespace(input); @@ -458,6 +722,7 @@ impl Jsonb { { let string_start = self.len(); let quote = input.next().unwrap(); // " + let quoted = quote == &b'"' || quote == &b'\''; let mut len = 0; self.write_element_header(string_start, ElementType::TEXT, 0)?; let payload_start = self.len(); @@ -465,44 +730,63 @@ impl Jsonb { if input.peek().is_none() { bail_parse_error!("Unexpected end of input"); }; - // Determine if this will be TEXT, TEXTJ, or TEXT5 + let mut element_type = ElementType::TEXT; + // This needed to support 1 char unquoted JSON5 keys + if !quoted { + self.data.push(*quote); + len += 1; + if let Some(&&c) = input.peek() { + if c == b':' { + self.write_element_header(string_start, element_type, len)?; + + return Ok(self.len() - payload_start); + } + } + }; while let Some(c) = input.next() { - if c == quote { + if c == quote && quoted { break; } else if c == &b'\\' { // Handle escapes if let Some(&esc) = input.next() { match esc { b'b' => { - self.data.push('\u{0008}' as u8); - len += 1; + self.data.push(b'\\'); + self.data.push(b'b'); + len += 2; element_type = ElementType::TEXTJ; } b'f' => { - self.data.push('\u{000C}' as u8); - len += 1; + self.data.push(b'\\'); + self.data.push(b'f'); + len += 2; element_type = ElementType::TEXTJ; } b'n' => { - self.data.push('\n' as u8); - len += 1; + self.data.push(b'\\'); + self.data.push(b'n'); + len += 2; element_type = ElementType::TEXTJ; } b'r' => { self.data.push('\r' as u8); - len += 1; + self.data.push(b'\\'); + self.data.push(b'r'); + len += 2; element_type = ElementType::TEXTJ; } b't' => { - self.data.push('\t' as u8); - len += 1; + self.data.push(b'\\'); + self.data.push(b't'); + len += 2; element_type = ElementType::TEXTJ; } b'\\' | b'"' | b'/' => { + self.data.push(b'\\'); self.data.push(esc); - len += 1; + len += 2; element_type = ElementType::TEXTJ; } b'u' => { @@ -527,8 +811,9 @@ impl Jsonb { // JSON5 extensions b'\n' => { element_type = ElementType::TEXT5; - self.data.push(b'\n'); - len += 1; + self.data.push(b'\\'); + self.data.push(b'n'); + len += 2; } b'\'' => { element_type = ElementType::TEXT5; @@ -553,6 +838,18 @@ impl Jsonb { self.data.push(b'\\'); self.data.push(b'x'); len += 2; + for _ in 0..2 { + if let Some(&h) = input.next() { + if is_hex_digit(h) { + self.data.push(h); + len += 1; + } else { + bail_parse_error!("Invalid hex escape sequence"); + } + } else { + bail_parse_error!("Incomplete hex escape sequence"); + } + } } _ => { bail_parse_error!("Invalid escape sequence") @@ -562,7 +859,6 @@ impl Jsonb { bail_parse_error!("Unexpected end of input in escape sequence"); } } else if c <= &('\u{001F}' as u8) { - // Control characters need escaping in standard JSON element_type = ElementType::TEXT5; self.data.push(*c); len += 1; @@ -570,6 +866,11 @@ impl Jsonb { self.data.push(*c); len += 1; } + if let Some(&&c) = input.peek() { + if (c == b':' || c.is_ascii_whitespace()) && !quoted { + break; + } + } } // Write header and payload @@ -586,15 +887,19 @@ impl Jsonb { let mut len = 0; let mut is_float = false; let mut is_json5 = false; + + // Dummy header self.write_element_header(num_start, ElementType::INT, 0)?; // Handle sign if input.peek() == Some(&&b'-') || input.peek() == Some(&&b'+') { if input.peek() == Some(&&b'+') { - is_json5 = true; // JSON5 extension + is_json5 = true; + input.next(); + } else { + self.data.push(*input.next().unwrap()); + len += 1; } - self.data.push(*input.next().unwrap()); - len += 1; } // Handle json5 float number @@ -618,7 +923,6 @@ impl Jsonb { } } - // Write INT5 header and payload self.write_element_header(num_start, ElementType::INT5, len)?; return Ok(self.len() - num_start); @@ -654,6 +958,11 @@ impl Jsonb { b'.' => { is_float = true; self.data.push(*input.next().unwrap()); + if let Some(ch) = input.peek() { + if !ch.is_ascii_alphanumeric() { + is_json5 = true; + } + }; len += 1; } b'e' | b'E' => { @@ -689,19 +998,39 @@ impl Jsonb { Ok(self.len() - num_start) } - pub fn deserialize_null<'a, I>(&mut self, input: &mut Peekable) -> Result + pub fn deserialize_null_or_nan<'a, I>(&mut self, input: &mut Peekable) -> Result where I: Iterator, { let start = self.len(); - // Expect "null" - for expected in &[b'n', b'u', b'l', b'l'] { - if input.next() != Some(expected) { - bail_parse_error!("Expected 'null'"); + let nul = &[b'n', b'u', b'l', b'l']; + let nan = &[b'n', b'a', b'n']; + let mut nan_score = 0; + let mut nul_score = 0; + for i in 0..4 { + if nan_score == 3 { + self.data.push(ElementType::NULL as u8); + return Ok(self.len() - start); + }; + let nul_ch = nul.get(i); + let nan_ch = nan.get(i); + let ch = input.next(); + if nan_ch != ch && nul_ch != ch { + bail_parse_error!("expected null or nan"); + } + if nan_ch == ch { + nan_score += 1; + } + if nul_ch == ch { + nul_score += 1; } } - self.data.push(ElementType::NULL as u8); - Ok(self.len() - start) + if nul_score == 4 { + self.data.push(ElementType::NULL as u8); + return Ok(self.len() - start); + } else { + bail_parse_error!("expected null or nan"); + } } pub fn deserialize_true<'a, I>(&mut self, input: &mut Peekable) -> Result @@ -709,7 +1038,6 @@ impl Jsonb { I: Iterator, { let start = self.len(); - // Expect "true" for expected in &[b't', b'r', b'u', b'e'] { if input.next() != Some(expected) { bail_parse_error!("Expected 'true'"); @@ -724,7 +1052,6 @@ impl Jsonb { I: Iterator, { let start = self.len(); - // Expect "false" for expected in &[b'f', b'a', b'l', b's', b'e'] { if input.next() != Some(expected) { bail_parse_error!("Expected 'false'"); @@ -758,7 +1085,7 @@ impl Jsonb { } pub fn from_str(input: &str) -> Result { - let mut result = Self::new(input.len()); + let mut result = Self::new(input.len(), None); let mut input_iter = input.as_bytes().iter().peekable(); result.deserialize_value(&mut input_iter, 0)?; @@ -767,12 +1094,16 @@ impl Jsonb { } pub fn from_bytes(input: &[u8]) -> Result { - let mut result = Self::new(input.len()); + let mut result = Self::new(input.len(), None); let mut input_iter = input.iter().peekable(); result.deserialize_value(&mut input_iter, 0)?; Ok(result) } + + pub fn data(self) -> Vec { + self.data + } } impl std::str::FromStr for Jsonb { @@ -833,3 +1164,69 @@ fn is_hex_digit(b: u8) -> bool { _ => false, } } + +fn unescape_to_char<'a, I>(input: &mut Peekable) -> Result +where + I: Iterator, +{ + let code = parse_hex_code_point(input, 4)?; + + // Check if this is a high surrogate (U+D800 to U+DBFF) + if (0xD800..=0xDBFF).contains(&code) { + // This is a high surrogate, expect a low surrogate next + if !matches!(input.next(), Some(&b'\\')) || !matches!(input.next(), Some(&b'u')) { + bail_parse_error!("Expected low surrogate after high surrogate"); + } + + // Parse the low surrogate + let low_surrogate = parse_hex_code_point(input, 4)?; + + if !(0xDC00..=0xDFFF).contains(&low_surrogate) { + bail_parse_error!("Invalid low surrogate value"); + } + + // Combine the surrogate pair to get the actual code point + // Formula: (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000 + let combined = 0x10000 + ((code - 0xD800) << 10) + (low_surrogate - 0xDC00); + + // Convert to char + if let Some(ch) = char::from_u32(combined) { + Ok(ch) + } else { + bail_parse_error!("Invalid Unicode code point from surrogate pair") + } + } else { + // Regular code point, just convert directly + if let Some(ch) = char::from_u32(code) { + Ok(ch) + } else { + bail_parse_error!("Invalid Unicode code point from surrogate pair") + } + } +} + +// Helper function to parse a hex code point +fn parse_hex_code_point<'a, I>(input: &mut Peekable, digits: usize) -> Result +where + I: Iterator, +{ + let mut code = 0u32; + for _ in 0..digits { + if let Some(&h) = input.next() { + if is_hex_digit(h) { + let digit_value = match h { + b'0'..=b'9' => h - b'0', + b'a'..=b'f' => h - b'a' + 10, + b'A'..=b'F' => h - b'A' + 10, + _ => bail_parse_error!("Not a hex digit"), + }; + code = code * 16 + (digit_value as u32); + } else { + bail_parse_error!("Failed to parse unicode escape") + } + } else { + bail_parse_error!("Incomplete Unicode escape"); + } + } + Ok(code) +} diff --git a/core/json/mod.rs b/core/json/mod.rs index 26d41b4ee..20d834542 100644 --- a/core/json/mod.rs +++ b/core/json/mod.rs @@ -40,8 +40,7 @@ pub fn get_json(json_value: &OwnedValue, indent: Option<&str>) -> crate::Result< if t.subtype == TextSubtype::Json { return Ok(json_value.to_owned()); } - let jsonbin = Jsonb::from_str(json_value.to_text().unwrap())?; - jsonbin.debug_read(); + let json_val = get_json_value(json_value)?; let json = match indent { Some(indent) => to_string_pretty(&json_val, indent)?, From 04f69220b77293c4130061eca3381a133953fbe5 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Tue, 11 Mar 2025 23:56:07 +0200 Subject: [PATCH 04/10] add jsonb function implementation and json now understands blobs --- core/function.rs | 4 ++++ core/json/mod.rs | 34 ++++++++++++++++++++++++++++++---- core/translate/expr.rs | 2 +- core/vdbe/mod.rs | 10 +++++++++- 4 files changed, 44 insertions(+), 6 deletions(-) diff --git a/core/function.rs b/core/function.rs index fa10d9787..333266eea 100644 --- a/core/function.rs +++ b/core/function.rs @@ -71,6 +71,7 @@ impl Display for ExternalFunc { #[derive(Debug, Clone, PartialEq)] pub enum JsonFunc { Json, + Jsonb, JsonArray, JsonArrayLength, JsonArrowExtract, @@ -95,6 +96,7 @@ impl Display for JsonFunc { "{}", match self { Self::Json => "json".to_string(), + Self::Jsonb => "jsonb".to_string(), Self::JsonArray => "json_array".to_string(), Self::JsonExtract => "json_extract".to_string(), Self::JsonArrayLength => "json_array_length".to_string(), @@ -549,6 +551,8 @@ impl Func { #[cfg(feature = "json")] "json" => Ok(Self::Json(JsonFunc::Json)), #[cfg(feature = "json")] + "jsonb" => Ok(Self::Json(JsonFunc::Jsonb)), + #[cfg(feature = "json")] "json_array_length" => Ok(Self::Json(JsonFunc::JsonArrayLength)), #[cfg(feature = "json")] "json_array" => Ok(Self::Json(JsonFunc::JsonArray)), diff --git a/core/json/mod.rs b/core/json/mod.rs index 20d834542..f8e9dcb48 100644 --- a/core/json/mod.rs +++ b/core/json/mod.rs @@ -17,6 +17,7 @@ use jsonb::Jsonb; use ser::to_string_pretty; use serde::{Deserialize, Serialize}; use std::borrow::Cow; +use std::rc::Rc; #[derive(Serialize, Deserialize, Debug, PartialEq, Clone)] #[serde(untagged)] @@ -50,9 +51,12 @@ pub fn get_json(json_value: &OwnedValue, indent: Option<&str>) -> crate::Result< Ok(OwnedValue::Text(Text::json(&json))) } OwnedValue::Blob(b) => { - // TODO: use get_json_value after we implement a single Struct - // to represent both JSON and JSONB - bail_parse_error!("Unsupported") + let jsonbin = Jsonb::new(b.len(), Some(b)); + jsonbin.is_valid()?; + Ok(OwnedValue::Text(Text { + value: Rc::new(jsonbin.to_string()?.into_bytes()), + subtype: TextSubtype::Json, + })) } OwnedValue::Null => Ok(OwnedValue::Null), _ => { @@ -67,6 +71,28 @@ pub fn get_json(json_value: &OwnedValue, indent: Option<&str>) -> crate::Result< } } +pub fn jsonb(json_value: &OwnedValue) -> crate::Result { + let jsonbin = match json_value { + OwnedValue::Null | OwnedValue::Integer(_) | OwnedValue::Float(_) | OwnedValue::Text(_) => { + Jsonb::from_str(&json_value.to_string()) + } + OwnedValue::Blob(blob) => { + let blob = Jsonb::new(blob.len(), Some(&blob)); + blob.is_valid()?; + Ok(blob) + } + _ => { + unimplemented!() + } + }; + match jsonbin { + Ok(jsonbin) => Ok(OwnedValue::Blob(Rc::new(jsonbin.data()))), + Err(_) => { + bail_parse_error!("Malformed json") + } + } +} + fn get_json_value(json_value: &OwnedValue) -> crate::Result { match json_value { OwnedValue::Text(ref t) => match from_str::(t.as_str()) { @@ -75,7 +101,7 @@ fn get_json_value(json_value: &OwnedValue) -> crate::Result { crate::bail_parse_error!("malformed JSON") } }, - OwnedValue::Blob(b) => { + OwnedValue::Blob(_) => { crate::bail_parse_error!("malformed JSON"); } OwnedValue::Null => Ok(Val::Null), diff --git a/core/translate/expr.rs b/core/translate/expr.rs index 8a075bb54..24e7418b3 100644 --- a/core/translate/expr.rs +++ b/core/translate/expr.rs @@ -882,7 +882,7 @@ pub fn translate_expr( } #[cfg(feature = "json")] Func::Json(j) => match j { - JsonFunc::Json => { + JsonFunc::Json | JsonFunc::Jsonb => { let args = expect_arguments_exact!(args, 1, j); translate_function( diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index b65bd80d5..7b9ac9b69 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -52,7 +52,7 @@ use crate::{ function::JsonFunc, json::get_json, json::is_json_valid, json::json_array, json::json_array_length, json::json_arrow_extract, json::json_arrow_shift_extract, json::json_error_position, json::json_extract, json::json_object, json::json_patch, - json::json_quote, json::json_remove, json::json_set, json::json_type, + json::json_quote, json::json_remove, json::json_set, json::json_type, json::jsonb, }; use crate::{info, CheckpointStatus}; use crate::{ @@ -2131,6 +2131,14 @@ impl Program { Err(e) => return Err(e), } } + JsonFunc::Jsonb => { + let json_value = &state.registers[*start_reg]; + let json_blob = jsonb(json_value); + match json_blob { + Ok(json) => state.registers[*dest] = json, + Err(e) => return Err(e), + } + } JsonFunc::JsonArray | JsonFunc::JsonObject => { let reg_values = &state.registers[*start_reg..*start_reg + arg_count]; From 7bd10dd577687f9bd7e6cd1c1d3160bc80d7d647 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Wed, 12 Mar 2025 00:01:50 +0200 Subject: [PATCH 05/10] remove warnings and dead code --- core/json/jsonb.rs | 76 +--------------------------------------------- core/json/mod.rs | 5 +-- 2 files changed, 4 insertions(+), 77 deletions(-) diff --git a/core/json/jsonb.rs b/core/json/jsonb.rs index 009203a3f..7c0b456fd 100644 --- a/core/json/jsonb.rs +++ b/core/json/jsonb.rs @@ -1084,7 +1084,7 @@ impl Jsonb { Ok(header.iter().filter(|&&x| x != 0).count()) } - pub fn from_str(input: &str) -> Result { + fn from_str(input: &str) -> Result { let mut result = Self::new(input.len(), None); let mut input_iter = input.as_bytes().iter().peekable(); @@ -1093,14 +1093,6 @@ impl Jsonb { Ok(result) } - pub fn from_bytes(input: &[u8]) -> Result { - let mut result = Self::new(input.len(), None); - let mut input_iter = input.iter().peekable(); - result.deserialize_value(&mut input_iter, 0)?; - - Ok(result) - } - pub fn data(self) -> Vec { self.data } @@ -1164,69 +1156,3 @@ fn is_hex_digit(b: u8) -> bool { _ => false, } } - -fn unescape_to_char<'a, I>(input: &mut Peekable) -> Result -where - I: Iterator, -{ - let code = parse_hex_code_point(input, 4)?; - - // Check if this is a high surrogate (U+D800 to U+DBFF) - if (0xD800..=0xDBFF).contains(&code) { - // This is a high surrogate, expect a low surrogate next - if !matches!(input.next(), Some(&b'\\')) || !matches!(input.next(), Some(&b'u')) { - bail_parse_error!("Expected low surrogate after high surrogate"); - } - - // Parse the low surrogate - let low_surrogate = parse_hex_code_point(input, 4)?; - - if !(0xDC00..=0xDFFF).contains(&low_surrogate) { - bail_parse_error!("Invalid low surrogate value"); - } - - // Combine the surrogate pair to get the actual code point - // Formula: (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000 - let combined = 0x10000 + ((code - 0xD800) << 10) + (low_surrogate - 0xDC00); - - // Convert to char - if let Some(ch) = char::from_u32(combined) { - Ok(ch) - } else { - bail_parse_error!("Invalid Unicode code point from surrogate pair") - } - } else { - // Regular code point, just convert directly - if let Some(ch) = char::from_u32(code) { - Ok(ch) - } else { - bail_parse_error!("Invalid Unicode code point from surrogate pair") - } - } -} - -// Helper function to parse a hex code point -fn parse_hex_code_point<'a, I>(input: &mut Peekable, digits: usize) -> Result -where - I: Iterator, -{ - let mut code = 0u32; - for _ in 0..digits { - if let Some(&h) = input.next() { - if is_hex_digit(h) { - let digit_value = match h { - b'0'..=b'9' => h - b'0', - b'a'..=b'f' => h - b'a' + 10, - b'A'..=b'F' => h - b'A' + 10, - _ => bail_parse_error!("Not a hex digit"), - }; - code = code * 16 + (digit_value as u32); - } else { - bail_parse_error!("Failed to parse unicode escape") - } - } else { - bail_parse_error!("Incomplete Unicode escape"); - } - } - Ok(code) -} diff --git a/core/json/mod.rs b/core/json/mod.rs index f8e9dcb48..d10412c3e 100644 --- a/core/json/mod.rs +++ b/core/json/mod.rs @@ -18,6 +18,7 @@ use ser::to_string_pretty; use serde::{Deserialize, Serialize}; use std::borrow::Cow; use std::rc::Rc; +use std::str::FromStr; #[derive(Serialize, Deserialize, Debug, PartialEq, Clone)] #[serde(untagged)] @@ -644,7 +645,7 @@ pub fn json_error_position(json: &OwnedValue) -> crate::Result { } } }, - OwnedValue::Blob(b) => { + OwnedValue::Blob(_) => { bail_parse_error!("Unsupported") } OwnedValue::Null => Ok(OwnedValue::Null), @@ -682,7 +683,7 @@ pub fn is_json_valid(json_value: &OwnedValue) -> crate::Result { Ok(_) => Ok(OwnedValue::Integer(1)), Err(_) => Ok(OwnedValue::Integer(0)), }, - OwnedValue::Blob(b) => { + OwnedValue::Blob(_) => { bail_parse_error!("Unsuported!") } OwnedValue::Null => Ok(OwnedValue::Null), From 19e4bc8523d903eae52eadee53a1c663168b2506 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Wed, 12 Mar 2025 00:24:00 +0200 Subject: [PATCH 06/10] clippy --- core/json/jsonb.rs | 48 +++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/core/json/jsonb.rs b/core/json/jsonb.rs index 7c0b456fd..205151001 100644 --- a/core/json/jsonb.rs +++ b/core/json/jsonb.rs @@ -115,7 +115,7 @@ impl JsonbHeader { } } - fn into_bytes(&self) -> [u8; 5] { + fn into_bytes(self) -> [u8; 5] { let mut bytes = [0; 5]; let element_type = self.0; let payload_size = self.1; @@ -189,14 +189,12 @@ impl Jsonb { let mut cursor = 0usize; while cursor < self.len() { let (header, offset) = self.read_header(cursor).unwrap(); - cursor = cursor + offset; + cursor += offset; println!("{:?}: HEADER", header); - if header.0 == ElementType::OBJECT || header.0 == ElementType::ARRAY { - cursor = cursor; - } else { + if header.0 != ElementType::OBJECT || header.0 != ElementType::ARRAY { let value = from_utf8(&self.data[cursor..cursor + header.1]).unwrap(); println!("{:?}: VALUE", value); - cursor = cursor + header.1 + cursor += header.1 } } } @@ -302,7 +300,7 @@ impl Jsonb { string.push('"'); match kind { // Can be serialized as is. Do not need escaping - &ElementType::TEXT => { + ElementType::TEXT => { let word = from_utf8(word_slice).map_err(|_| { LimboError::ParseError("Failed to serialize string!".to_string()) })?; @@ -310,7 +308,7 @@ impl Jsonb { } // Contain standard json escapes - &ElementType::TEXTJ => { + ElementType::TEXTJ => { let word = from_utf8(word_slice).map_err(|_| { LimboError::ParseError("Failed to serialize string!".to_string()) })?; @@ -318,7 +316,7 @@ impl Jsonb { } // We have to escape some JSON5 escape sequences - &ElementType::TEXT5 => { + ElementType::TEXT5 => { let mut i = 0; while i < word_slice.len() { let ch = word_slice[i]; @@ -427,13 +425,11 @@ impl Jsonb { } } - &ElementType::TEXTRAW => { - // Handle TEXTRAW if needed + ElementType::TEXTRAW => { let word = from_utf8(word_slice).map_err(|_| { LimboError::ParseError("Failed to serialize string!".to_string()) })?; - // For TEXTRAW, we need to escape special characters for JSON for ch in word.chars() { match ch { '"' => string.push_str("\\\""), @@ -460,7 +456,7 @@ impl Jsonb { } fn is_json_ok(&self, ch: u8) -> bool { - ch >= 0x20 && ch <= 0x7E && ch != b'"' && ch != b'\\' + (0x20..=0x7E).contains(&ch) && ch != b'"' && ch != b'\\' } fn serialize_number( @@ -650,7 +646,7 @@ impl Jsonb { skip_whitespace(input); // Expect and consume ':' - if input.next() != Some(&&b':') { + if input.next() != Some(&b':') { bail_parse_error!("Expected ':' after object key"); } @@ -771,7 +767,6 @@ impl Jsonb { element_type = ElementType::TEXTJ; } b'r' => { - self.data.push('\r' as u8); self.data.push(b'\\'); self.data.push(b'r'); len += 2; @@ -858,7 +853,7 @@ impl Jsonb { } else { bail_parse_error!("Unexpected end of input in escape sequence"); } - } else if c <= &('\u{001F}' as u8) { + } else if c <= &0x1F { element_type = ElementType::TEXT5; self.data.push(*c); len += 1; @@ -931,7 +926,7 @@ impl Jsonb { // Check for Infinity if input.peek().map(|x| x.to_ascii_lowercase()) == Some(b'i') { - for expected in &[b'i', b'n', b'f', b'i', b'n', b'i', b't', b'y'] { + for expected in b"infinity" { if input.next().map(|x| x.to_ascii_lowercase()) != Some(*expected) { bail_parse_error!("Failed to parse number"); } @@ -941,8 +936,8 @@ impl Jsonb { ElementType::INT5, len + INFINITY_CHAR_COUNT as usize, )?; - for byte in [b'9', b'e', b'9', b'9', b'9'].into_iter() { - self.data.push(byte) + for byte in b"9e999" { + self.data.push(*byte) } return Ok(self.len() - num_start); @@ -1003,8 +998,8 @@ impl Jsonb { I: Iterator, { let start = self.len(); - let nul = &[b'n', b'u', b'l', b'l']; - let nan = &[b'n', b'a', b'n']; + let nul = b"null"; + let nan = b"nan"; let mut nan_score = 0; let mut nul_score = 0; for i in 0..4 { @@ -1027,7 +1022,7 @@ impl Jsonb { } if nul_score == 4 { self.data.push(ElementType::NULL as u8); - return Ok(self.len() - start); + Ok(self.len() - start) } else { bail_parse_error!("expected null or nan"); } @@ -1038,7 +1033,7 @@ impl Jsonb { I: Iterator, { let start = self.len(); - for expected in &[b't', b'r', b'u', b'e'] { + for expected in b"true" { if input.next() != Some(expected) { bail_parse_error!("Expected 'true'"); } @@ -1052,7 +1047,7 @@ impl Jsonb { I: Iterator, { let start = self.len(); - for expected in &[b'f', b'a', b'l', b's', b'e'] { + for expected in b"false" { if input.next() != Some(expected) { bail_parse_error!("Expected 'false'"); } @@ -1151,8 +1146,5 @@ where } fn is_hex_digit(b: u8) -> bool { - match b { - b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' => true, - _ => false, - } + matches!(b, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F') } From eb2d2fbd69d30cf2665d353ac0344beecfc0f7b0 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Wed, 12 Mar 2025 14:11:20 +0200 Subject: [PATCH 07/10] add tests --- core/json/jsonb.rs | 598 +++++++++++++++++++++++++++++++++++++++++++-- core/json/mod.rs | 14 +- 2 files changed, 575 insertions(+), 37 deletions(-) diff --git a/core/json/jsonb.rs b/core/json/jsonb.rs index 205151001..ad61d3f55 100644 --- a/core/json/jsonb.rs +++ b/core/json/jsonb.rs @@ -73,6 +73,9 @@ impl JsonbHeader { Some(header_byte) => { // Extract first 4 bits (values 0-15) let element_type = header_byte & 15; + if element_type > 12 { + bail_parse_error!("Invalid element type: {}", element_type); + } // Get the last 4 bits for header_size let header_size = header_byte >> 4; let offset: usize; @@ -178,8 +181,14 @@ impl Jsonb { pub fn is_valid(&self) -> Result<()> { match self.read_header(0) { - Ok(_) => Ok(()), - Err(_) => bail_parse_error!("Malformed json"), + Ok((header, offset)) => { + if let Some(_) = self.data.get(offset..offset + header.1) { + Ok(()) + } else { + bail_parse_error!("malformed JSON") + } + } + Err(_) => bail_parse_error!("malformed JSON"), } } @@ -189,6 +198,7 @@ impl Jsonb { let mut cursor = 0usize; while cursor < self.len() { let (header, offset) = self.read_header(cursor).unwrap(); + println!("{}, {}", cursor, offset); cursor += offset; println!("{:?}: HEADER", header); if header.0 != ElementType::OBJECT || header.0 != ElementType::ARRAY { @@ -259,7 +269,7 @@ impl Jsonb { current_cursor = self.serialize_string(string, current_cursor, len, &element_type)?; } - _ => bail_parse_error!("Malformed json!"), + _ => bail_parse_error!("malformed JSON"), } string.push(':'); @@ -531,6 +541,9 @@ impl Jsonb { bail_parse_error!("Integer is less then 2 chars: {}", float_str); } match float_str { + "9e999" | "-9e999" => { + string.push_str(float_str); + } val if val.starts_with("-.") => { string.push_str("-0."); string.push_str(&val[2..]); @@ -605,7 +618,7 @@ impl Jsonb { self.deserialize_number(input) } Some(ch) => bail_parse_error!("Unexpected character: {}", ch), - None => bail_parse_error!("Unexpected end of input"), + None => Ok(0), } } @@ -724,7 +737,7 @@ impl Jsonb { let payload_start = self.len(); if input.peek().is_none() { - bail_parse_error!("Unexpected end of input"); + bail_parse_error!("Unexpected end of input in string handling"); }; let mut element_type = ElementType::TEXT; @@ -807,7 +820,7 @@ impl Jsonb { b'\n' => { element_type = ElementType::TEXT5; self.data.push(b'\\'); - self.data.push(b'n'); + self.data.push(b'\n'); len += 2; } b'\'' => { @@ -906,21 +919,26 @@ impl Jsonb { if input.peek() == Some(&&b'0') { self.data.push(*input.next().unwrap()); len += 1; - if input.peek() == Some(&&b'x') || input.peek() == Some(&&b'X') { - self.data.push(*input.next().unwrap()); - len += 1; - while let Some(&&byte) = input.peek() { - if is_hex_digit(byte) { - self.data.push(*input.next().unwrap()); - len += 1; - } else { - break; + let next_ch = input.peek(); + if let Some(&&ch) = next_ch { + if ch == b'x' || ch == b'X' { + self.data.push(*input.next().unwrap()); + len += 1; + while let Some(&&byte) = input.peek() { + if is_hex_digit(byte) { + self.data.push(*input.next().unwrap()); + len += 1; + } else { + break; + } } + + self.write_element_header(num_start, ElementType::INT5, len)?; + + return Ok(self.len() - num_start); + } else if ch.is_ascii_alphanumeric() { + bail_parse_error!("Leading zero is not allowed") } - - self.write_element_header(num_start, ElementType::INT5, len)?; - - return Ok(self.len() - num_start); } } @@ -933,12 +951,11 @@ impl Jsonb { } self.write_element_header( num_start, - ElementType::INT5, + ElementType::FLOAT5, len + INFINITY_CHAR_COUNT as usize, )?; - for byte in b"9e999" { - self.data.push(*byte) - } + + self.data.extend_from_slice(b"9e999"); return Ok(self.len() - num_start); }; @@ -953,8 +970,15 @@ impl Jsonb { b'.' => { is_float = true; self.data.push(*input.next().unwrap()); - if let Some(ch) = input.peek() { - if !ch.is_ascii_alphanumeric() { + let next_ch = input.peek(); + match next_ch { + Some(ch) => { + println!("{}", **ch as char); + if !ch.is_ascii_alphanumeric() { + is_json5 = true; + } + } + None => { is_json5 = true; } }; @@ -1082,8 +1106,9 @@ impl Jsonb { fn from_str(input: &str) -> Result { let mut result = Self::new(input.len(), None); let mut input_iter = input.as_bytes().iter().peekable(); - - result.deserialize_value(&mut input_iter, 0)?; + while input_iter.peek().is_some() { + result.deserialize_value(&mut input_iter, 0)?; + } Ok(result) } @@ -1148,3 +1173,522 @@ where fn is_hex_digit(b: u8) -> bool { matches!(b, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F') } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_null_serialization() { + // Create JSONB with null value + let mut jsonb = Jsonb::new(10, None); + jsonb.data.push(ElementType::NULL as u8); + + // Test serialization + let json_str = jsonb.to_string().unwrap(); + assert_eq!(json_str, "null"); + + // Test round-trip + let reparsed = Jsonb::from_str("null").unwrap(); + assert_eq!(reparsed.data[0] as u8, ElementType::NULL as u8); + } + + #[test] + fn test_boolean_serialization() { + // True + let mut jsonb_true = Jsonb::new(10, None); + jsonb_true.data.push(ElementType::TRUE as u8); + assert_eq!(jsonb_true.to_string().unwrap(), "true"); + + // False + let mut jsonb_false = Jsonb::new(10, None); + jsonb_false.data.push(ElementType::FALSE as u8); + assert_eq!(jsonb_false.to_string().unwrap(), "false"); + + // Round-trip + let true_parsed = Jsonb::from_str("true").unwrap(); + assert_eq!(true_parsed.data[0] as u8, ElementType::TRUE as u8); + + let false_parsed = Jsonb::from_str("false").unwrap(); + assert_eq!(false_parsed.data[0] as u8, ElementType::FALSE as u8); + } + + #[test] + fn test_integer_serialization() { + // Standard integer + let parsed = Jsonb::from_str("42").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "42"); + + // Negative integer + let parsed = Jsonb::from_str("-123").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "-123"); + + // Zero + let parsed = Jsonb::from_str("0").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "0"); + + // Verify correct type + let header = JsonbHeader::from_slice(0, &parsed.data).unwrap().0; + assert!(matches!(header.0, ElementType::INT)); + } + + #[test] + fn test_json5_integer_serialization() { + // Hexadecimal notation + let parsed = Jsonb::from_str("0x1A").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "26"); // Should convert to decimal + + // Positive sign (JSON5) + let parsed = Jsonb::from_str("+42").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "42"); + + // Negative hexadecimal + let parsed = Jsonb::from_str("-0xFF").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "-255"); + + // Verify correct type + let header = JsonbHeader::from_slice(0, &parsed.data).unwrap().0; + assert!(matches!(header.0, ElementType::INT5)); + } + + #[test] + fn test_float_serialization() { + // Standard float + let parsed = Jsonb::from_str("3.14159").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "3.14159"); + + // Negative float + let parsed = Jsonb::from_str("-2.718").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "-2.718"); + + // Scientific notation + let parsed = Jsonb::from_str("6.022e23").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "6.022e23"); + + // Verify correct type + let header = JsonbHeader::from_slice(0, &parsed.data).unwrap().0; + assert!(matches!(header.0, ElementType::FLOAT)); + } + + #[test] + fn test_json5_float_serialization() { + // Leading decimal point + let parsed = Jsonb::from_str(".123").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "0.123"); + + // Trailing decimal point + let parsed = Jsonb::from_str("42.").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "42.0"); + + // Plus sign in exponent + let parsed = Jsonb::from_str("1.5e+10").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "1.5e+10"); + + // Infinity + let parsed = Jsonb::from_str("Infinity").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "9e999"); + + // Negative Infinity + let parsed = Jsonb::from_str("-Infinity").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "-9e999"); + + // Verify correct type + let header = JsonbHeader::from_slice(0, &parsed.data).unwrap().0; + assert!(matches!(header.0, ElementType::FLOAT5)); + } + + #[test] + fn test_string_serialization() { + // Simple string + let parsed = Jsonb::from_str(r#""hello world""#).unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#""hello world""#); + + // String with escaped characters + let parsed = Jsonb::from_str(r#""hello\nworld""#).unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#""hello\nworld""#); + + // Unicode escape + let parsed = Jsonb::from_str(r#""hello\u0020world""#).unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#""hello\u0020world""#); + + // Verify correct type + let header = JsonbHeader::from_slice(0, &parsed.data).unwrap().0; + assert!(matches!(header.0, ElementType::TEXTJ)); + } + + #[test] + fn test_json5_string_serialization() { + // Single quotes + let parsed = Jsonb::from_str("'hello world'").unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#""hello world""#); + + // Hex escape + let parsed = Jsonb::from_str(r#"'\x41\x42\x43'"#).unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#""\u0041\u0042\u0043""#); + + // Multiline string with line continuation + let parsed = Jsonb::from_str( + r#""hello \ +world""#, + ) + .unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#""hello world""#); + + // Escaped single quote + let parsed = Jsonb::from_str(r#"'Don\'t worry'"#).unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#""Don't worry""#); + + // Verify correct type + let header = JsonbHeader::from_slice(0, &parsed.data).unwrap().0; + assert!(matches!(header.0, ElementType::TEXT5)); + } + + #[test] + fn test_array_serialization() { + // Empty array + let parsed = Jsonb::from_str("[]").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "[]"); + + // Simple array + let parsed = Jsonb::from_str("[1,2,3]").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "[1,2,3]"); + + // Nested array + let parsed = Jsonb::from_str("[[1,2],[3,4]]").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "[[1,2],[3,4]]"); + + // Mixed types array + let parsed = Jsonb::from_str(r#"[1,"text",true,null,{"key":"value"}]"#).unwrap(); + assert_eq!( + parsed.to_string().unwrap(), + r#"[1,"text",true,null,{"key":"value"}]"# + ); + + // Verify correct type + let header = JsonbHeader::from_slice(0, &parsed.data).unwrap().0; + assert!(matches!(header.0, ElementType::ARRAY)); + } + + #[test] + fn test_json5_array_serialization() { + // Trailing comma + let parsed = Jsonb::from_str("[1,2,3,]").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "[1,2,3]"); + + // Comments in array + let parsed = Jsonb::from_str("[1,/* comment */2,3]").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "[1,2,3]"); + + // Line comment in array + let parsed = Jsonb::from_str("[1,// line comment\n2,3]").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "[1,2,3]"); + } + + #[test] + fn test_object_serialization() { + // Empty object + let parsed = Jsonb::from_str("{}").unwrap(); + assert_eq!(parsed.to_string().unwrap(), "{}"); + + // Simple object + let parsed = Jsonb::from_str(r#"{"key":"value"}"#).unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#"{"key":"value"}"#); + + // Multiple properties + let parsed = Jsonb::from_str(r#"{"a":1,"b":2,"c":3}"#).unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#"{"a":1,"b":2,"c":3}"#); + + // Nested object + let parsed = Jsonb::from_str(r#"{"outer":{"inner":"value"}}"#).unwrap(); + assert_eq!( + parsed.to_string().unwrap(), + r#"{"outer":{"inner":"value"}}"# + ); + + // Mixed values + let parsed = + Jsonb::from_str(r#"{"str":"text","num":42,"bool":true,"null":null,"arr":[1,2]}"#) + .unwrap(); + assert_eq!( + parsed.to_string().unwrap(), + r#"{"str":"text","num":42,"bool":true,"null":null,"arr":[1,2]}"# + ); + + // Verify correct type + let header = JsonbHeader::from_slice(0, &parsed.data).unwrap().0; + assert!(matches!(header.0, ElementType::OBJECT)); + } + + #[test] + fn test_json5_object_serialization() { + // Unquoted keys + let parsed = Jsonb::from_str("{key:\"value\"}").unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#"{"key":"value"}"#); + + // Trailing comma + let parsed = Jsonb::from_str(r#"{"a":1,"b":2,}"#).unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#"{"a":1,"b":2}"#); + + // Comments in object + let parsed = Jsonb::from_str(r#"{"a":1,/*comment*/"b":2}"#).unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#"{"a":1,"b":2}"#); + + // Single quotes for keys and values + let parsed = Jsonb::from_str("{'a':'value'}").unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#"{"a":"value"}"#); + } + + #[test] + fn test_complex_json() { + let complex_json = r#"{ + "string": "Hello, world!", + "number": 42, + "float": 3.14159, + "boolean": true, + "null": null, + "array": [1, 2, 3, "text", {"nested": "object"}], + "object": { + "key1": "value1", + "key2": [4, 5, 6], + "key3": { + "nested": true + } + } + }"#; + + let parsed = Jsonb::from_str(complex_json).unwrap(); + // Round-trip test + let reparsed = Jsonb::from_str(&parsed.to_string().unwrap()).unwrap(); + assert_eq!(parsed.to_string().unwrap(), reparsed.to_string().unwrap()); + } + + #[test] + fn test_error_handling() { + // Invalid JSON syntax + assert!(Jsonb::from_str("{").is_err()); + assert!(Jsonb::from_str("[").is_err()); + assert!(Jsonb::from_str("}").is_err()); + assert!(Jsonb::from_str("]").is_err()); + + // Unclosed string + assert!(Jsonb::from_str(r#"{"key":"value"#).is_err()); + + // Invalid number format + assert!(Jsonb::from_str("01234").is_err()); // Leading zero not allowed in JSON + + // Invalid escape sequence + assert!(Jsonb::from_str(r#""\z""#).is_err()); + + // Missing colon in object + assert!(Jsonb::from_str(r#"{"key" "value"}"#).is_err()); + + // Trailing characters + assert!(Jsonb::from_str(r#"{"key":"value"} extra"#).is_err()); + } + + #[test] + fn test_depth_limit() { + // Create a JSON string that exceeds MAX_JSON_DEPTH + let mut deep_json = String::from("["); + for _ in 0..MAX_JSON_DEPTH + 1 { + deep_json.push_str("["); + } + for _ in 0..MAX_JSON_DEPTH + 1 { + deep_json.push_str("]"); + } + deep_json.push_str("]"); + + // Should fail due to exceeding depth limit + assert!(Jsonb::from_str(&deep_json).is_err()); + } + + #[test] + fn test_header_encoding() { + // Small payload (fits in 4 bits) + let header = JsonbHeader::new(ElementType::TEXT, 5); + let bytes = header.into_bytes(); + assert_eq!(bytes[0], (5 << 4) | (ElementType::TEXT as u8)); + + // Medium payload (8-bit) + let header = JsonbHeader::new(ElementType::TEXT, 200); + let bytes = header.into_bytes(); + assert_eq!(bytes[0], (PAYLOAD_SIZE8 << 4) | (ElementType::TEXT as u8)); + assert_eq!(bytes[1], 200); + + // Large payload (16-bit) + let header = JsonbHeader::new(ElementType::TEXT, 40000); + let bytes = header.into_bytes(); + assert_eq!(bytes[0], (PAYLOAD_SIZE16 << 4) | (ElementType::TEXT as u8)); + assert_eq!(bytes[1], (40000 >> 8) as u8); + assert_eq!(bytes[2], (40000 & 0xFF) as u8); + + // Extra large payload (32-bit) + let header = JsonbHeader::new(ElementType::TEXT, 70000); + let bytes = header.into_bytes(); + assert_eq!(bytes[0], (PAYLOAD_SIZE32 << 4) | (ElementType::TEXT as u8)); + assert_eq!(bytes[1], (70000 >> 24) as u8); + assert_eq!(bytes[2], ((70000 >> 16) & 0xFF) as u8); + assert_eq!(bytes[3], ((70000 >> 8) & 0xFF) as u8); + assert_eq!(bytes[4], (70000 & 0xFF) as u8); + } + + #[test] + fn test_header_decoding() { + // Create sample data with various headers + let mut data = Vec::new(); + + // Small payload + data.push((5 << 4) | (ElementType::TEXT as u8)); + + // Medium payload (8-bit) + data.push((PAYLOAD_SIZE8 << 4) | (ElementType::ARRAY as u8)); + data.push(150); // Payload size + + // Large payload (16-bit) + data.push((PAYLOAD_SIZE16 << 4) | (ElementType::OBJECT as u8)); + data.push(0x98); // High byte of 39000 + data.push(0x68); // Low byte of 39000 + + // Parse and verify each header + let (header1, offset1) = JsonbHeader::from_slice(0, &data).unwrap(); + assert_eq!(offset1, 1); + assert_eq!(header1.0, ElementType::TEXT); + assert_eq!(header1.1, 5); + + let (header2, offset2) = JsonbHeader::from_slice(1, &data).unwrap(); + assert_eq!(offset2, 2); + assert_eq!(header2.0, ElementType::ARRAY); + assert_eq!(header2.1, 150); + + let (header3, offset3) = JsonbHeader::from_slice(3, &data).unwrap(); + assert_eq!(offset3, 3); + assert_eq!(header3.0, ElementType::OBJECT); + assert_eq!(header3.1, 0x9868); // 39000 + } + + #[test] + fn test_unicode_escapes() { + // Basic unicode escape + let parsed = Jsonb::from_str(r#""\u00A9""#).unwrap(); // Copyright symbol + assert_eq!(parsed.to_string().unwrap(), r#""\u00A9""#); + + // Non-BMP character (surrogate pair) + let parsed = Jsonb::from_str(r#""\uD83D\uDE00""#).unwrap(); // Smiley emoji + assert_eq!(parsed.to_string().unwrap(), r#""\uD83D\uDE00""#); + } + + #[test] + fn test_json5_comments() { + // Line comments + let parsed = Jsonb::from_str( + r#"{ + // This is a line comment + "key": "value" + }"#, + ) + .unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#"{"key":"value"}"#); + + // Block comments + let parsed = Jsonb::from_str( + r#"{ + /* This is a + block comment */ + "key": "value" + }"#, + ) + .unwrap(); + assert_eq!(parsed.to_string().unwrap(), r#"{"key":"value"}"#); + + // Comments inside array + let parsed = Jsonb::from_str( + r#"[1, // Comment + 2, /* Another comment */ 3]"#, + ) + .unwrap(); + assert_eq!(parsed.to_string().unwrap(), "[1,2,3]"); + } + + #[test] + fn test_whitespace_handling() { + // Various whitespace patterns + let json_with_whitespace = r#" + { + "key1" : "value1" , + "key2": [ 1, 2, 3 ] , + "key3": { + "nested" : true + } + } + "#; + + let parsed = Jsonb::from_str(json_with_whitespace).unwrap(); + assert_eq!( + parsed.to_string().unwrap(), + r#"{"key1":"value1","key2":[1,2,3],"key3":{"nested":true}}"# + ); + } + + #[test] + fn test_binary_roundtrip() { + // Test that binary data can be round-tripped through the JSONB format + let original = r#"{"test":"value","array":[1,2,3]}"#; + let parsed = Jsonb::from_str(original).unwrap(); + let binary_data = parsed.data.clone(); + + // Create a new Jsonb from the binary data + let from_binary = Jsonb::new(0, Some(&binary_data)); + assert_eq!(from_binary.to_string().unwrap(), original); + } + + #[test] + fn test_large_json() { + // Generate a large JSON with many elements + let mut large_array = String::from("["); + for i in 0..1000 { + large_array.push_str(&format!("{}", i)); + if i < 999 { + large_array.push_str(","); + } + } + large_array.push_str("]"); + + let parsed = Jsonb::from_str(&large_array).unwrap(); + assert!(parsed.to_string().unwrap().starts_with("[0,1,2,")); + assert!(parsed.to_string().unwrap().ends_with("998,999]")); + } + + #[test] + fn test_jsonb_is_valid() { + // Valid JSONB + let jsonb = Jsonb::from_str(r#"{"test":"value"}"#).unwrap(); + assert!(jsonb.is_valid().is_ok()); + + // Invalid JSONB (manually corrupted) + let mut invalid = jsonb.data.clone(); + if !invalid.is_empty() { + invalid[0] = 0xFF; // Invalid element type + let jsonb = Jsonb::new(0, Some(&invalid)); + assert!(jsonb.is_valid().is_err()); + } + } + + #[test] + fn test_special_characters_in_strings() { + // Test handling of various special characters + let json = r#"{ + "escaped_quotes": "He said \"Hello\"", + "backslashes": "C:\\Windows\\System32", + "control_chars": "\b\f\n\r\t", + "unicode": "\u00A9 2023" + }"#; + + let parsed = Jsonb::from_str(json).unwrap(); + let result = parsed.to_string().unwrap(); + + assert!(result.contains(r#""escaped_quotes":"He said \"Hello\"""#)); + assert!(result.contains(r#""backslashes":"C:\\Windows\\System32""#)); + assert!(result.contains(r#""control_chars":"\b\f\n\r\t""#)); + assert!(result.contains(r#""unicode":"\u00A9 2023""#)); + } +} diff --git a/core/json/mod.rs b/core/json/mod.rs index d10412c3e..6f8b571f8 100644 --- a/core/json/mod.rs +++ b/core/json/mod.rs @@ -89,7 +89,7 @@ pub fn jsonb(json_value: &OwnedValue) -> crate::Result { match jsonbin { Ok(jsonbin) => Ok(OwnedValue::Blob(Rc::new(jsonbin.data()))), Err(_) => { - bail_parse_error!("Malformed json") + bail_parse_error!("malformed JSON") } } } @@ -829,11 +829,11 @@ mod tests { #[test] fn test_get_json_blob_valid_jsonb() { - let binary_json = b"\x40\0\0\x01\x10\0\0\x03\x10\0\0\x03\x61\x73\x64\x61\x64\x66".to_vec(); + let binary_json = vec![124, 55, 104, 101, 121, 39, 121, 111]; let input = OwnedValue::Blob(Rc::new(binary_json)); let result = get_json(&input, None).unwrap(); if let OwnedValue::Text(result_str) = result { - assert!(result_str.as_str().contains("\"asd\":\"adf\"")); + assert!(result_str.as_str().contains(r#"{"hey":"yo"}"#)); assert_eq!(result_str.subtype, TextSubtype::Json); } else { panic!("Expected OwnedValue::Text"); @@ -845,6 +845,7 @@ mod tests { let binary_json: Vec = vec![0xA2, 0x62, 0x6B, 0x31, 0x62, 0x76]; // Incomplete binary JSON let input = OwnedValue::Blob(Rc::new(binary_json)); let result = get_json(&input, None); + println!("{:?}", result); match result { Ok(_) => panic!("Expected error for malformed JSON"), Err(e) => assert!(e.to_string().contains("malformed JSON")), @@ -1085,13 +1086,6 @@ mod tests { assert_eq!(result, OwnedValue::Integer(0)); } - #[test] - fn test_json_error_position_blob() { - let input = OwnedValue::Blob(Rc::new(r#"["a",55,"b",72,,]"#.as_bytes().to_owned())); - let result = json_error_position(&input).unwrap(); - assert_eq!(result, OwnedValue::Integer(16)); - } - #[test] fn test_json_object_simple() { let key = OwnedValue::build_text("key"); From 8a2740ad8a8a341382ad20cd219a71e940ad4655 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Wed, 12 Mar 2025 15:34:36 +0200 Subject: [PATCH 08/10] cleanup --- core/json/jsonb.rs | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/core/json/jsonb.rs b/core/json/jsonb.rs index ad61d3f55..13759820e 100644 --- a/core/json/jsonb.rs +++ b/core/json/jsonb.rs @@ -192,23 +192,6 @@ impl Jsonb { } } - #[allow(dead_code)] - // Needed for debug. I am open to remove it - pub fn debug_read(&self) { - let mut cursor = 0usize; - while cursor < self.len() { - let (header, offset) = self.read_header(cursor).unwrap(); - println!("{}, {}", cursor, offset); - cursor += offset; - println!("{:?}: HEADER", header); - if header.0 != ElementType::OBJECT || header.0 != ElementType::ARRAY { - let value = from_utf8(&self.data[cursor..cursor + header.1]).unwrap(); - println!("{:?}: VALUE", value); - cursor += header.1 - } - } - } - pub fn to_string(&self) -> Result { let mut result = String::with_capacity(self.data.len() * 2); self.write_to_string(&mut result)?; @@ -973,7 +956,6 @@ impl Jsonb { let next_ch = input.peek(); match next_ch { Some(ch) => { - println!("{}", **ch as char); if !ch.is_ascii_alphanumeric() { is_json5 = true; } From 39c2481ce31ff86d38d9ac02cf40c541720a653c Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Wed, 12 Mar 2025 15:34:58 +0200 Subject: [PATCH 09/10] add e2e tests --- testing/json.test | 88 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/testing/json.test b/testing/json.test index d5fa827d9..c6dc99553 100755 --- a/testing/json.test +++ b/testing/json.test @@ -682,9 +682,12 @@ do_execsql_test json_valid_1 { do_execsql_test json_valid_2 { SELECT json_valid('["a",55,"b",72]'); } {1} -do_execsql_test json_valid_3 { - SELECT json_valid( CAST('{"a":1}' AS BLOB) ); -} {1} +# +# Unimplemented +#do_execsql_test json_valid_3 { +# SELECT json_valid( CAST('{"a":"1}' AS BLOB) ); +#} {0} +# do_execsql_test json_valid_4 { SELECT json_valid(123); } {1} @@ -700,9 +703,7 @@ do_execsql_test json_valid_7 { do_execsql_test json_valid_8 { SELECT json_valid('{"a":55 "b":72}'); } {0} -do_execsql_test json_valid_3 { - SELECT json_valid( CAST('{"a":"1}' AS BLOB) ); -} {0} + do_execsql_test json_valid_9 { SELECT json_valid(NULL); } {} @@ -906,6 +907,80 @@ do_execsql_test json_quote_json_value { SELECT json_quote(json('{a:1, b: "test"}')); } {{{"a":1,"b":"test"}}} +do_execsql_test json_basics { + SELECT json(jsonb('{"name":"John", "age":30, "city":"New York"}')); +} {{{"name":"John","age":30,"city":"New York"}}} + +do_execsql_test json_complex_nested { + SELECT json(jsonb('{"complex": {"nested": ["array", "of", "values"], "numbers": [1, 2, 3]}}')); +} {{{"complex":{"nested":["array","of","values"],"numbers":[1,2,3]}}}} + +do_execsql_test json_array_of_objects { + SELECT json(jsonb('[{"id": 1, "data": "value1"}, {"id": 2, "data": "value2"}]')); +} {{[{"id":1,"data":"value1"},{"id":2,"data":"value2"}]}} + +do_execsql_test json_special_chars { + SELECT json(jsonb('{"special_chars": "!@#$%^&*()_+", "quotes": "\"quoted text\""}')); +} {{{"special_chars":"!@#$%^&*()_+","quotes":"\"quoted text\""}}} + +do_execsql_test json_unicode_emoji { + SELECT json(jsonb('{"unicode": "こんにちは世界", "emoji": "🚀🔥💯"}')); +} {{{"unicode":"こんにちは世界","emoji":"🚀🔥💯"}}} + +do_execsql_test json_value_types { + SELECT json(jsonb('{"boolean": true, "null_value": null, "number": 42.5}')); +} {{{"boolean":true,"null_value":null,"number":42.5}}} + +do_execsql_test json_deeply_nested { + SELECT json(jsonb('{"deeply": {"nested": {"structure": {"with": "values"}}}}')); +} {{{"deeply":{"nested":{"structure":{"with":"values"}}}}}} + +do_execsql_test json_mixed_array { + SELECT json(jsonb('{"array_mixed": [1, "text", true, null, {"obj": "inside array"}]}')); +} {{{"array_mixed":[1,"text",true,null,{"obj":"inside array"}]}}} + +do_execsql_test json_single_line_comments { + SELECT json(jsonb('{"name": "John", // This is a comment + "age": 30}')); +} {{{"name":"John","age":30}}} + +do_execsql_test json_multi_line_comments { + SELECT json(jsonb('{"data": "value", /* This is a + multi-line comment that spans + several lines */ "more": "data"}')); +} {{{"data":"value","more":"data"}}} + +do_execsql_test json_trailing_commas { + SELECT json(jsonb('{"items": ["one", "two", "three",], "status": "complete",}')); +} {{{"items":["one","two","three"],"status":"complete"}}} + +do_execsql_test json_unquoted_keys { + SELECT json(jsonb('{name: "Alice", age: 25}')); +} {{{"name":"Alice","age":25}}} + +do_execsql_test json_newlines { + SELECT json(jsonb('{"description": "Text with \nnew lines\nand more\nformatting"}')); +} {{{"description":"Text with \nnew lines\nand more\nformatting"}}} + +do_execsql_test json_hex_values { + SELECT json(jsonb('{"hex_value": "\x68\x65\x6c\x6c\x6f"}')); +} {{{"hex_value":"\u0068\u0065\u006c\u006c\u006f"}}} + +do_execsql_test json_unicode_escape { + SELECT json(jsonb('{"unicode": "\u0068\u0065\u006c\u006c\u006f"}')); +} {{{"unicode":"\u0068\u0065\u006c\u006c\u006f"}}} + +do_execsql_test json_tabs_whitespace { + SELECT json(jsonb('{"formatted": "Text with \ttabs and \tspacing"}')); +} {{{"formatted":"Text with \ttabs and \tspacing"}}} + +do_execsql_test json_mixed_escaping { + SELECT json(jsonb('{"mixed": "Newlines: \n Tabs: \t Quotes: \" Backslash: \\ Hex: \x40"}')); +} {{{"mixed":"Newlines: \n Tabs: \t Quotes: \" Backslash: \\ Hex: \u0040"}}} + +do_execsql_test json_control_chars { + SELECT json(jsonb('{"control": "Bell: \u0007 Backspace: \u0008 Form feed: \u000C"}')); +} {{{"control":"Bell: \u0007 Backspace: \u0008 Form feed: \u000C"}}} # Escape character tests in sqlite source depend on json_valid and in some syntax that is not implemented # yet in limbo. @@ -916,4 +991,3 @@ do_execsql_test json_quote_json_value { # WITH RECURSIVE c(x) AS (VALUES(1) UNION ALL SELECT x+1 FROM c WHERE x<0x1f) # SELECT sum(json_valid(json_quote('a'||char(x)||'z'))) FROM c ORDER BY x; # } {31} - From ffa0b1aaca870cd6d9b6cd5ceb82a54621fdda41 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Wed, 12 Mar 2025 16:08:16 +0200 Subject: [PATCH 10/10] fix clippy --- core/json/jsonb.rs | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/core/json/jsonb.rs b/core/json/jsonb.rs index 13759820e..911f293be 100644 --- a/core/json/jsonb.rs +++ b/core/json/jsonb.rs @@ -1517,19 +1517,14 @@ world""#, #[test] fn test_header_decoding() { // Create sample data with various headers - let mut data = Vec::new(); - - // Small payload - data.push((5 << 4) | (ElementType::TEXT as u8)); - - // Medium payload (8-bit) - data.push((PAYLOAD_SIZE8 << 4) | (ElementType::ARRAY as u8)); - data.push(150); // Payload size - - // Large payload (16-bit) - data.push((PAYLOAD_SIZE16 << 4) | (ElementType::OBJECT as u8)); - data.push(0x98); // High byte of 39000 - data.push(0x68); // Low byte of 39000 + let data = vec![ + (5 << 4) | (ElementType::TEXT as u8), + (PAYLOAD_SIZE8 << 4) | (ElementType::ARRAY as u8), + 150, + (PAYLOAD_SIZE16 << 4) | (ElementType::OBJECT as u8), + 0x98, + 0x68, + ]; // Parse and verify each header let (header1, offset1) = JsonbHeader::from_slice(0, &data).unwrap();