diff --git a/core/Cargo.toml b/core/Cargo.toml index a33f3bba2..a4ce76cd3 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -108,3 +108,7 @@ harness = false [[bench]] name = "mvcc_benchmark" harness = false + +[[bench]] +name = "json_benchmark" +harness = false diff --git a/core/benches/json_benchmark.rs b/core/benches/json_benchmark.rs new file mode 100644 index 000000000..c0b04d5e0 --- /dev/null +++ b/core/benches/json_benchmark.rs @@ -0,0 +1,493 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use limbo_core::{Database, PlatformIO, IO}; +use pprof::{ + criterion::{Output, PProfProfiler}, + flamegraph::Options, +}; +use std::sync::Arc; + +// Title: JSONB Function Benchmarking + +fn rusqlite_open() -> rusqlite::Connection { + let sqlite_conn = rusqlite::Connection::open("../testing/testing.db").unwrap(); + sqlite_conn + .pragma_update(None, "locking_mode", "EXCLUSIVE") + .unwrap(); + sqlite_conn +} + +fn bench(criterion: &mut Criterion) { + // Flag to disable rusqlite benchmarks if needed + let enable_rusqlite = std::env::var("DISABLE_RUSQLITE_BENCHMARK").is_err(); + + #[allow(clippy::arc_with_non_send_sync)] + let io = Arc::new(PlatformIO::new().unwrap()); + let db = Database::open_file(io.clone(), "../testing/testing.db", false).unwrap(); + let limbo_conn = db.connect().unwrap(); + + // Benchmark JSONB with different payload sizes + let json_sizes = [ + ("Small", r#"{"id": 1, "name": "Test"}"#), + ( + "Medium", + r#"{"id": 1, "name": "Test", "attributes": {"color": "blue", "size": "medium", "tags": ["tag1", "tag2", "tag3"]}}"#, + ), + ( + "Large", + r#"[{"metadata":{"title":"Standard JSON Test File","description":"A complex JSON file for testing parsers and serializers (Standard JSON only)","version":"1.0.0","generated":"2025-03-12T12:00:00Z","author":"Claude 
AI"},"primitives":{"null_value":null,"boolean_values":{"true_value":true,"false_value":false},"number_values":{"integer":42,"negative":-273,"zero":0,"large_integer":9007199254740991,"small_integer":-9007199254740991,"decimal":3.14159265358979,"negative_decimal":-2.71828,"exponent_positive":6.022e+23,"exponent_negative":1.602e-19},"string_values":{"empty":"","simple":"Hello, world!","unicode":"你好,世界!😀🌍🚀","quotes":"She said \"Hello!\" to me.","backslash":"C:\\Program Files\\App\\","controls":"Line1\nLine2\tTabbed\rCarriage\bBackspace\fForm-feed","unicode_escapes":"Copyright: ©, Emoji: 😀","all_escapes":"\\b\\f\\n\\r\\t\\\"\\\\"}},"arrays":{"empty_array":[],"homogeneous":[1,2,3,4,5,6,7,8,9,10],"heterogeneous":[null,true,42,"string",{"key":"value"},[1,2,3]],"nested":[[1,2,3],[4,5,6],[7,8,9]],"deep":[[[[[[[[[["Very deep"]]]]]]]]]],"large":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99]},"objects":{"empty_object":{},"simple_object":{"key1":"value1","key2":"value2"},"nested_object":{"level1":{"level2":{"level3":{"level4":{"level5":"Deep nesting"}}}}},"complex_keys":{"simple":"value","with spaces":"value","with-dash":"value","with_underscore":"value","with.dot":"value","with:colon":"value","with@symbol":"value","withUnicode":"value","withEmoji":"value","withQuotes":"value","withBackslashes":"value"}},"edge_cases":{"zero_byte_string":"","one_byte_string":"x","long_string":"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.","almost_too_deep":{"a":{"b":{"c":{"d":{"e":{"f":{"g":{"h":{"i":{"j":{"k":{"l":{"m":{"n":{"o":{"p":{"q":{"r":{"s":{"t":{"u":{"v":{"w":{"x":{"y":{"z":"Deep nesting test"}}}}}}}}}}}}}}}}}}}}}}}}}}},"many_properties":{"prop01":1,"prop02":2,"prop03":3,"prop04":4,"prop05":5,"prop06":6,"prop07":7,"prop08":8,"prop09":9,"prop10":10,"prop11":11,"prop12":12,"prop13":13,"prop14":14,"prop15":15,"prop16":16,"prop17":17,"prop18":18,"prop19":19,"prop20":20,"prop21":21,"prop22":22,"prop23":23,"prop24":24,"prop25":25,"prop26":26,"prop27":27,"prop28":28,"prop29":29,"prop30":30,"prop31":31,"prop32":32,"prop33":33,"prop34":34,"prop35":35,"prop36":36,"prop37":37,"prop38":38,"prop39":39,"prop40":40,"prop41":41,"prop42":42,"prop43":43,"prop44":44,"prop45":45,"prop46":46,"prop47":47,"prop48":48,"prop49":49,"prop50":50}},{"standard_features":{"numeric_literals":{"decimal_integer":12345,"negative_integer":-12345,"decimal_fraction":123.45,"negative_fraction":-123.45,"exponential_positive":123400,"exponential_negative":0.00001234},"string_escapes":{"quotation_mark":"Quote: \"Hello\"","reverse_solidus":"Backslash: \\","solidus":"Slash: / (optional escape)","backspace":"Control: \b","formfeed":"Control: \f","newline":"Control: \n","carriage_return":"Control: \r","tab":"Control: \t","unicode":"Unicode: © € ☃"}},"generated_data":{"people":[{"id":1,"name":"John Smith","email":"john.smith@example.com","age":42,"address":{"street":"123 Main 
St","city":"Anytown","state":"CA","zip":"12345"},"phone_numbers":[{"type":"home","number":"555-1234"},{"type":"work","number":"555-5678"}],"tags":["employee","manager","developer"],"active":true},{"id":2,"name":"Jane Doe","email":"jane.doe@example.com","age":36,"address":{"street":"456 Elm St","city":"Othertown","state":"NY","zip":"67890"},"phone_numbers":[{"type":"mobile","number":"555-9012"}],"tags":["employee","designer"],"active":true},{"id":3,"name":"Bob Johnson","email":"bob.johnson@example.com","age":51,"address":{"street":"789 Oak St","city":"Somewhere","state":"TX","zip":"45678"},"phone_numbers":[{"type":"home","number":"555-3456"},{"type":"work","number":"555-7890"},{"type":"mobile","number":"555-1234"}],"tags":["employee","manager","sales"],"active":false}],"products":[{"id":"P001","name":"Smartphone","category":"Electronics","price":799.99,"features":["5G","Dual Camera","Fast Charging"],"specifications":{"dimensions":{"width":71.5,"height":146.7,"depth":7.4},"weight":174,"display":{"type":"OLED","size":6.1,"resolution":"1170x2532"},"processor":"A14 Bionic","memory":128},"in_stock":true,"release_date":"2023-09-15"},{"id":"P002","name":"Laptop","category":"Electronics","price":1299.99,"features":["16GB RAM","512GB SSD","Retina Display"],"specifications":{"dimensions":{"width":304.1,"height":212.4,"depth":15.6},"weight":1400,"display":{"type":"IPS","size":13.3,"resolution":"2560x1600"},"processor":"Intel Core i7","memory":512},"in_stock":true,"release_date":"2023-06-10"},{"id":"P003","name":"Wireless Headphones","category":"Audio","price":249.99,"features":["Noise Cancellation","20h Battery","Bluetooth 
5.0"],"specifications":{"dimensions":{"width":168,"height":162,"depth":83},"weight":254,"driver":{"type":"Dynamic","size":40},"battery":{"capacity":500,"life":20}},"in_stock":false,"release_date":"2023-03-22"}],"orders":[{"id":"ORD-2023-001","customer_id":1,"date":"2023-01-15T10:30:00Z","items":[{"product_id":"P001","quantity":1,"price":799.99},{"product_id":"P003","quantity":2,"price":249.99}],"total":1299.97,"status":"delivered","shipping":{"address":{"street":"123 Main St","city":"Anytown","state":"CA","zip":"12345"},"method":"express","cost":15.99,"tracking_number":"SHP-123456789"},"payment":{"method":"credit_card","transaction_id":"TRX-987654321","status":"completed"}},{"id":"ORD-2023-002","customer_id":2,"date":"2023-02-20T14:45:00Z","items":[{"product_id":"P002","quantity":1,"price":1299.99}],"total":1299.99,"status":"shipped","shipping":{"address":{"street":"456 Elm St","city":"Othertown","state":"NY","zip":"67890"},"method":"standard","cost":9.99,"tracking_number":"SHP-234567890"},"payment":{"method":"paypal","transaction_id":"TRX-876543210","status":"completed"}},{"id":"ORD-2023-003","customer_id":3,"date":"2023-03-05T09:15:00Z","items":[{"product_id":"P001","quantity":1,"price":799.99},{"product_id":"P002","quantity":1,"price":1299.99},{"product_id":"P003","quantity":1,"price":249.99}],"total":2349.97,"status":"processing","shipping":{"address":{"street":"789 Oak St","city":"Somewhere","state":"TX","zip":"45678"},"method":"express","cost":25.99,"tracking_number":null},"payment":{"method":"bank_transfer","transaction_id":"TRX-765432109","status":"pending"}}]},"repeated_property_names":{"data":{"data":{"data":{"data":"Nested properties with the same 
name"}}}},"large_strings":{"base64_data":"iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAIGNIUk0AAHolAACAgwAA+f8AAIDpAAB1MAAA6mAAADqYAAAXb5JfxUYAAAJ5SURBVHjalFJbSBRRGP7OzO7sbmrqbmo6lom5CYVEkiBIUKEPvRQFRlEEPvTSa0+9By+9BhFkIAhCQSJhWlEpaKVpZaSVpplpbre1vM26u7ObzpyZ0892MJMM6oOf833nO/f/ZwghWE86mOIq7YSQs3r0kqDgr8I2zlKb3k/G57fQz8Z9fOh6HDUsL9AcC5m2Zt4ZBBnLEbBCMfDNVv8s0T8f5sJaMCuW/51sDBQKYOcSUA68JgLhODzT8ToQh1ZeyhKtVMlgXl3NWZbDqcl3aOrtRFPvC/i5XJYk0UTjdiVduTuOleRCUomTiMdho+MxNvX1YlPfSwRMZthsdtllBUAIY27BkRWAbCx+jn+IQJB9NKwMQ0ehFYEVUIMQAh3DQKfRQqPTQR6FAPF4ImuYUQh7eGEuY+MJASMHg8kEvVYLPcsukBACVgiK9iS8ZtMBAQFDMeBp8NJQr9WAp03JOkgHBBZvitg8lkQRbr8fJJHMe5AgYj7iJYcPXPfNexf5rDFK0eFwQM+wYCgWLM2s1IWgYDqPrcVpXJZzUSY+Bp9t82wy6YG5sAJVlRtw0eWCw2RKt5oZiCQSYCkWT5bMOCK/AQgF1eofSDgOa4tKcKCuTnawGTFxLMrZGBpW38Md7SYI6Rts1lrZPRZ0g9d+w5LiiMx2lRVYjO15lTjbfw9sNIw9xdXYXVGBzTq99AVpcPncAz/f0cHnsojQaU3JF4jxhKaFQNUl4PJN4OJ5ICQAeg4jnvmMWWcnCErB5rHyD7Rmf3d1KbH+cOQx2XTkLJ5vPYL+8lrUL30FBjwYtDkw4nBmxNmKPwMAKrUbALKE+vwAAAAASUVORK5CYII=","lorem_ipsum":"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. 
Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?"},"binary_data_sizes":{"small_payload":{"description":"Small payload (0-11 bytes in header)","data":[1,2,3,4,5]},"medium_payload":{"description":"Medium payload (1-byte size in header)","data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50]},"large_payload":{"description":"Simulation of large payload (2-byte size in header)","data_description":"Would normally contain 256-65535 bytes"},"extra_large_payload":{"description":"Simulation of extra large payload (4-byte size in header)","data_description":"Would normally contain >65535 bytes"}},"stress_test":{"recursive_structure":{"name":"Level 1","children":[{"name":"Level 1.1","children":[{"name":"Level 1.1.1","children":[]},{"name":"Level 1.1.2","children":[{"name":"Level 1.1.2.1","children":[]}]}]},{"name":"Level 1.2","children":[{"name":"Level 
1.2.1","children":[]}]}]},"long_array_nested_objects":[{"id":1,"data":{"value":"test1"}},{"id":2,"data":{"value":"test2"}},{"id":3,"data":{"value":"test3"}},{"id":4,"data":{"value":"test4"}},{"id":5,"data":{"value":"test5"}},{"id":6,"data":{"value":"test6"}},{"id":7,"data":{"value":"test7"}},{"id":8,"data":{"value":"test8"}},{"id":9,"data":{"value":"test9"}},{"id":10,"data":{"value":"test10"}},{"id":11,"data":{"value":"test11"}},{"id":12,"data":{"value":"test12"}},{"id":13,"data":{"value":"test13"}},{"id":14,"data":{"value":"test14"}},{"id":15,"data":{"value":"test15"}},{"id":16,"data":{"value":"test16"}},{"id":17,"data":{"value":"test17"}},{"id":18,"data":{"value":"test18"}},{"id":19,"data":{"value":"test19"}},{"id":20,"data":{"value":"test20"}}]}}]"#, + ), // Generate a larger JSON object with 20 nested items + ( + "Real world json #1", + r#"{ + "user": { + "id": "usr_7f8d3a2e", + "name": "Jane Smith", + "email": "jane.smith@example.com", + "verified": true, + "created_at": "2023-05-12T15:42:31Z", + "preferences": { + "theme": "dark", + "notifications": { + "email": true, + "push": false, + "sms": true + }, + "language": "en-US" + }, + "subscription": { + "plan": "premium", + "status": "active", + "next_billing_date": "2024-05-12" + }, + "address": { + "street": "123 Main St", + "city": "Boston", + "state": "MA", + "zip": "02108", + "country": "USA" + } + }, + "meta": { + "request_id": "req_9d7e6c5b4a3f2e1d", + "timestamp": 1683905123 + } + }"#, + ), + ( + "Real world json 2", + r#"{ + "products": [ + { + "id": "p-1001", + "name": "Wireless Headphones", + "price": 79.99, + "currency": "USD", + "in_stock": true, + "quantity": 45, + "categories": ["electronics", "audio", "wireless"], + "ratings": { + "average": 4.7, + "count": 238 + }, + "specs": { + "brand": "SoundMax", + "color": "black", + "connectivity": "Bluetooth 5.0", + "battery_life": "20 hours" + }, + "images": [ + "https://example.com/products/headphones-1.jpg", + 
"https://example.com/products/headphones-2.jpg" + ] + }, + { + "id": "p-1002", + "name": "Smart Watch", + "price": 149.99, + "currency": "USD", + "in_stock": true, + "quantity": 28, + "categories": ["electronics", "wearables", "fitness"], + "ratings": { + "average": 4.3, + "count": 182 + }, + "specs": { + "brand": "TechFit", + "color": "silver", + "display": "AMOLED", + "waterproof": true + }, + "images": [ + "https://example.com/products/smartwatch-1.jpg", + "https://example.com/products/smartwatch-2.jpg" + ] + } + ], + "pagination": { + "total": 237, + "page": 1, + "per_page": 2, + "next_page": 2 + } + } + "#, + ), + ( + "Real world json 3", + r#"{ + "app_name": "DataProcessor", + "version": "2.1.3", + "environment": "production", + "debug": false, + "log_level": "info", + "database": { + "main": { + "host": "db-primary.internal", + "port": 5432, + "name": "app_production", + "user": "app_user", + "max_connections": 50, + "timeout_ms": 5000 + }, + "replica": { + "host": "db-replica.internal", + "port": 5432, + "name": "app_production_replica", + "user": "app_readonly", + "max_connections": 25, + "timeout_ms": 3000 + } + }, + "cache": { + "enabled": true, + "ttl_seconds": 3600, + "max_size_mb": 512 + }, + "api": { + "host": "0.0.0.0", + "port": 8080, + "rate_limit": { + "requests_per_minute": 120, + "burst": 30 + }, + "timeouts": { + "read_ms": 5000, + "write_ms": 10000, + "idle_ms": 60000 + } + }, + "feature_flags": { + "new_dashboard": true, + "beta_analytics": false, + "improved_search": true + } + } + "#, + ), + ( + "Real world json 4", + r#"{ + "app_name": "DataProcessor", + "version": "2.1.3", + "environment": "production", + "debug": false, + "log_level": "info", + "database": { + "main": { + "host": "db-primary.internal", + "port": 5432, + "name": "app_production", + "user": "app_user", + "max_connections": 50, + "timeout_ms": 5000 + }, + "replica": { + "host": "db-replica.internal", + "port": 5432, + "name": "app_production_replica", + "user": 
"app_readonly", + "max_connections": 25, + "timeout_ms": 3000 + } + }, + "cache": { + "enabled": true, + "ttl_seconds": 3600, + "max_size_mb": 512 + }, + "api": { + "host": "0.0.0.0", + "port": 8080, + "rate_limit": { + "requests_per_minute": 120, + "burst": 30 + }, + "timeouts": { + "read_ms": 5000, + "write_ms": 10000, + "idle_ms": 60000 + } + }, + "feature_flags": { + "new_dashboard": true, + "beta_analytics": false, + "improved_search": true + } + }"#, + ), + ( + "Real world json 5", + r#" + { + "app_name": "DataProcessor", + "version": "2.1.3", + "environment": "production", + "debug": false, + "log_level": "info", + "database": { + "main": { + "host": "db-primary.internal", + "port": 5432, + "name": "app_production", + "user": "app_user", + "max_connections": 50, + "timeout_ms": 5000 + }, + "replica": { + "host": "db-replica.internal", + "port": 5432, + "name": "app_production_replica", + "user": "app_readonly", + "max_connections": 25, + "timeout_ms": 3000 + } + }, + "cache": { + "enabled": true, + "ttl_seconds": 3600, + "max_size_mb": 512 + }, + "api": { + "host": "0.0.0.0", + "port": 8080, + "rate_limit": { + "requests_per_minute": 120, + "burst": 30 + }, + "timeouts": { + "read_ms": 5000, + "write_ms": 10000, + "idle_ms": 60000 + } + }, + "feature_flags": { + "new_dashboard": true, + "beta_analytics": false, + "improved_search": true + } + }"#, + ), + ( + "Real world json 6", + r#" + { + "event_id": "evt_0ab1cde23f4g5h6i", + "event_type": "page_view", + "timestamp": "2024-03-12T08:14:27.345Z", + "user": { + "id": "u_789012", + "anonymous_id": "anon_6c7d8e9f0a", + "device_id": "dev_3e4f5g6h7i", + "session_id": "sess_1b2c3d4e5f" + }, + "context": { + "page": { + "url": "https://example.com/products/smart-home", + "title": "Smart Home Products | Example Store", + "referrer": "https://google.com", + "path": "/products/smart-home" + }, + "device": { + "type": "desktop", + "manufacturer": "Apple", + "model": "MacBook Pro", + "screen": { + "width": 1440, + 
"height": 900 + } + }, + "browser": { + "name": "Chrome", + "version": "99.0.4844.51" + }, + "os": { + "name": "macOS", + "version": "12.3" + }, + "location": { + "country": "US", + "region": "CA", + "city": "San Francisco", + "timezone": "America/Los_Angeles" + } + }, + "properties": { + "duration_ms": 5327, + "is_logged_in": true, + "tags": ["homepage", "featured", "promo"], + "utm": { + "source": "newsletter", + "medium": "email", + "campaign": "spring_sale_2024" + } + } + } + "#, + ), + ( + "Deeply nested", + r#"{ + "config": { + "level1": { + "level2": { + "level3": { + "level4": { + "level5": { + "level6": { + "level7": { + "value": "deeply nested value", + "enabled": true, + "numbers": [1, 2, 3, 4, 5], + "settings": { + "mode": "advanced", + "retries": 3 + } + } + } + } + } + } + } + } + } + }"#, + ), + ( + "Array heavy", + r#"{ + "feed": { + "user_id": "u_12345", + "posts": [ + { + "id": "post_001", + "author": "user_789", + "content": "Just launched our new product! Check it out at example.com/new", + "timestamp": "2024-03-13T14:27:32Z", + "likes": 24, + "comments": [ + { + "id": "comment_001", + "author": "user_456", + "content": "Looks amazing! Cant wait to try it.", + "timestamp": "2024-03-13T14:35:12Z", + "likes": 3 + }, + { + "id": "comment_002", + "author": "user_789", + "content": "Thanks! Let me know what you think after youve tried it.", + "timestamp": "2024-03-13T14:42:45Z", + "likes": 1 + } + ], + "tags": ["product", "launch", "technology"] + }, + { + "id": "post_002", + "author": "user_123", + "content": "Beautiful day for hiking! #nature #outdoors", + "timestamp": "2024-03-13T11:15:22Z", + "likes": 57, + "comments": [ + { + "id": "comment_003", + "author": "user_345", + "content": "Where is this? 
So beautiful!", + "timestamp": "2024-03-13T11:22:05Z", + "likes": 2 + }, + { + "id": "comment_004", + "author": "user_123", + "content": "Mount Rainier National Park!", + "timestamp": "2024-03-13T11:30:16Z", + "likes": 3 + } + ], + "tags": ["nature", "outdoors", "hiking"], + "location": { + "name": "Mount Rainier National Park", + "latitude": 46.8800, + "longitude": -121.7269 + } + } + ], + "has_more": true, + "next_cursor": "cursor_xyz123" + } + }"#, + ), + ]; + + for (size_name, json_payload) in json_sizes.iter() { + let query = format!("SELECT jsonb('{}')", json_payload.replace("'", "\\'")); + + let mut group = criterion.benchmark_group(format!("JSONB Size - {}", size_name)); + + group.bench_function("Limbo", |b| { + let mut stmt = limbo_conn.prepare(&query).unwrap(); + let io = io.clone(); + b.iter(|| { + loop { + match stmt.step().unwrap() { + limbo_core::StepResult::Row => {} + limbo_core::StepResult::IO => { + let _ = io.run_once(); + } + limbo_core::StepResult::Done => { + break; + } + limbo_core::StepResult::Interrupt | limbo_core::StepResult::Busy => { + unreachable!(); + } + } + } + stmt.reset(); + }); + }); + + if enable_rusqlite { + let sqlite_conn = rusqlite_open(); + + group.bench_function("Sqlite3", |b| { + let mut stmt = sqlite_conn.prepare(&query).unwrap(); + b.iter(|| { + let mut rows = stmt.raw_query(); + while let Some(row) = rows.next().unwrap() { + black_box(row); + } + }); + }); + } + + group.finish(); + } +} + +criterion_group! 
/// Builds the 256-entry lookup table stored in `CHARACTER_TYPE_OK`.
///
/// Entry `b` has bit value `4` set when byte `b` is printable ASCII
/// (`0x20..=0x7E`) and is neither `"` (`0x22`) nor `\` (`0x5C`) — the two
/// printable characters that need escaping inside a JSON string. Every other
/// entry (control bytes, DEL, and all bytes `>= 0x80`) stays `0`.
///
/// The table is generated from the range instead of being spelled out entry
/// by entry, so the rule is stated once and no byte can be forgotten.
const fn make_character_type_ok_table() -> [u8; 256] {
    let mut table = [0u8; 256];

    // `for` loops are not permitted in `const fn`, so walk the range with `while`.
    let mut byte = 0x20usize;
    while byte <= 0x7E {
        // 0x22 is `"` and 0x5C is `\`: both must be escaped, so they stay unmarked.
        if !matches!(byte, 0x22 | 0x5C) {
            table[byte] |= 4;
        }
        byte += 1;
    }

    table
}

// Evaluated at compile time; indexed by the raw byte value.
static CHARACTER_TYPE_OK: [u8; 256] = make_character_type_ok_table();
u16).to_be_bytes(); - bytes[1] = size_bytes[0]; - bytes[2] = size_bytes[1]; - } else if payload_size <= 0xFFFFFFFF { - bytes[0] = (element_type as u8) | (PAYLOAD_SIZE32 << 4); + match payload_size { + // Small payload (fits in 4 bits) + size if size <= 11 => { + HeaderFormat::Inline([(element_type as u8) | ((size as u8) << 4)]) + } - let size_bytes = (payload_size as u32).to_be_bytes(); + // Medium payload (fits in 1 byte) + size if size <= 0xFF => { + HeaderFormat::OneByte([(element_type as u8) | (SIZE_MARKER_8BIT << 4), size as u8]) + } - bytes[1] = size_bytes[0]; - bytes[2] = size_bytes[1]; - bytes[3] = size_bytes[2]; - bytes[4] = size_bytes[3]; - } else { - panic!("Payload size too large for encoding"); + // Large payload (fits in 2 bytes) + size if size <= 0xFFFF => { + let size_bytes = (size as u16).to_be_bytes(); + HeaderFormat::TwoBytes([ + (element_type as u8) | (SIZE_MARKER_16BIT << 4), + size_bytes[0], + size_bytes[1], + ]) + } + + // Extra large payload (fits in 4 bytes) + size if size <= 0xFFFFFFFF => { + let size_bytes = (size as u32).to_be_bytes(); + HeaderFormat::FourBytes([ + (element_type as u8) | (SIZE_MARKER_32BIT << 4), + size_bytes[0], + size_bytes[1], + size_bytes[2], + size_bytes[3], + ]) + } + + // Payload too large + _ => panic!("Payload size too large for encoding"), } - - bytes } fn get_size_bytes(slice: &[u8], start: usize, count: usize) -> Result<&[u8]> { @@ -315,7 +501,7 @@ impl Jsonb { let ch = word_slice[i]; // Handle normal characters that don't need escaping - if self.is_json_ok(ch) || ch == b'\'' { + if is_json_ok(ch) || ch == b'\'' { string.push(ch as char); i += 1; continue; @@ -448,10 +634,6 @@ impl Jsonb { Ok(cursor + len) } - fn is_json_ok(&self, ch: u8) -> bool { - (0x20..=0x7E).contains(&ch) && ch != b'"' && ch != b'\\' - } - fn serialize_number( &self, string: &mut String, @@ -568,410 +750,445 @@ impl Jsonb { cursor } - fn deserialize_value<'a, I>(&mut self, input: &mut Peekable, depth: usize) -> Result - where - I: 
Iterator, - { + fn deserialize_value(&mut self, input: &[u8], mut pos: usize, depth: usize) -> Result { if depth > MAX_JSON_DEPTH { - bail_parse_error!("Too deep") - }; - let current_depth = depth + 1; - skip_whitespace(input); - match input.peek() { - Some(b'{') => { - input.next(); // consume '{' - self.deserialize_obj(input, current_depth) - } - Some(b'[') => { - input.next(); // consume '[' - self.deserialize_array(input, current_depth) - } - Some(b't') => self.deserialize_true(input), - Some(b'f') => self.deserialize_false(input), - Some(b'n') => self.deserialize_null_or_nan(input), - Some(b'"') => self.deserialize_string(input), - Some(b'\'') => self.deserialize_string(input), - Some(&&c) - if c.is_ascii_digit() - || c == b'-' - || c == b'+' - || c == b'.' - || c.to_ascii_lowercase() == b'i' => - { - self.deserialize_number(input) - } - Some(ch) => bail_parse_error!("Unexpected character: {}", ch), - None => Ok(0), + bail_parse_error!("Too deep"); } + + pos = skip_whitespace(input, pos); + if pos >= input.len() { + bail_parse_error!("Unexpected end of input") + } + + match input[pos] { + b'{' => { + pos += 1; // consume '{' + pos = self.deserialize_obj(input, pos, depth + 1)?; + } + b'[' => { + pos += 1; // consume '[' + pos = self.deserialize_array(input, pos, depth + 1)?; + } + b't' => { + pos = self.deserialize_true(input, pos)?; + } + b'f' => { + pos = self.deserialize_false(input, pos)?; + } + b'n' => { + pos = self.deserialize_null_or_nan(input, pos)?; + } + b'"' | b'\'' => { + pos = self.deserialize_string(input, pos)?; + } + c if (c >= b'0' && c <= b'9') + || c == b'-' + || c == b'+' + || c == b'.' 
+ || c.to_ascii_lowercase() == b'i' => + { + pos = self.deserialize_number(input, pos)?; + } + _ => { + bail_parse_error!("Unexpected character: {}", input[pos] as char); + } + } + + Ok(pos) } - pub fn deserialize_obj<'a, I>(&mut self, input: &mut Peekable, depth: usize) -> Result - where - I: Iterator, - { + fn deserialize_obj(&mut self, input: &[u8], mut pos: usize, depth: usize) -> Result { if depth > MAX_JSON_DEPTH { - bail_parse_error!("Too deep!") + bail_parse_error!("Too deep!"); } + if self.data.capacity() - self.data.len() < 50 { + self.data.reserve(self.data.capacity()); + } + if pos >= input.len() { + bail_parse_error!("Unexpected end of input"); + } + let header_pos = self.len(); self.write_element_header(header_pos, ElementType::OBJECT, 0)?; let obj_start = self.len(); let mut first = true; - let current_depth = depth + 1; - loop { - skip_whitespace(input); - match input.peek() { - Some(&&b'}') => { - input.next(); // consume '}' + loop { + pos = skip_whitespace(input, pos); + if pos >= input.len() { + bail_parse_error!("Unexpected end of input"); + } + + match input[pos] { + b'}' => { + pos += 1; // consume '}' if first { - return Ok(1); // empty header + return Ok(pos); } else { let obj_size = self.len() - obj_start; self.write_element_header(header_pos, ElementType::OBJECT, obj_size)?; - return Ok(obj_size + 2); + return Ok(pos); } } - Some(&&b',') if !first => { - input.next(); // consume ',' - skip_whitespace(input); + b',' if !first => { + pos += 1; // consume ',' + pos = skip_whitespace(input, pos); } - Some(_) => { + _ => { // Parse key (must be string) - self.deserialize_string(input)?; + pos = self.deserialize_string(input, pos)?; - skip_whitespace(input); - - // Expect and consume ':' - if input.next() != Some(&b':') { + pos = skip_whitespace(input, pos); + if pos >= input.len() || input[pos] != b':' { bail_parse_error!("Expected ':' after object key"); } + pos += 1; // consume ':' - skip_whitespace(input); + pos = skip_whitespace(input, 
pos); // Parse value - can be any JSON value including another object - self.deserialize_value(input, current_depth)?; + pos = self.deserialize_value(input, pos, depth)?; first = false; } - None => { - bail_parse_error!("Unexpected end of input!") - } } } } - pub fn deserialize_array<'a, I>( - &mut self, - input: &mut Peekable, - depth: usize, - ) -> Result - where - I: Iterator, - { + fn deserialize_array(&mut self, input: &[u8], mut pos: usize, depth: usize) -> Result { if depth > MAX_JSON_DEPTH { bail_parse_error!("Too deep"); } + let header_pos = self.len(); self.write_element_header(header_pos, ElementType::ARRAY, 0)?; let arr_start = self.len(); let mut first = true; - let current_depth = depth + 1; - loop { - skip_whitespace(input); - match input.peek() { - Some(&&b']') => { - input.next(); + loop { + pos = skip_whitespace(input, pos); + if pos >= input.len() { + bail_parse_error!("Unexpected end of input"); + } + + match input[pos] { + b']' => { + pos += 1; // consume ']' if first { - return Ok(1); + return Ok(pos); } else { let arr_len = self.len() - arr_start; - let header_size = - self.write_element_header(header_pos, ElementType::ARRAY, arr_len)?; - return Ok(arr_len + header_size); + self.write_element_header(header_pos, ElementType::ARRAY, arr_len)?; + return Ok(pos); } } - Some(&&b',') if !first => { - input.next(); // consume ',' - skip_whitespace(input); + b',' if !first => { + pos += 1; // consume ',' + pos = skip_whitespace(input, pos); } - Some(_) => { - skip_whitespace(input); - self.deserialize_value(input, current_depth)?; + _ => { + pos = skip_whitespace(input, pos); + + // Parse array element + pos = self.deserialize_value(input, pos, depth + 1)?; first = false; } - None => { - bail_parse_error!("Unexpected end of input!") - } } } } - fn deserialize_string<'a, I>(&mut self, input: &mut Peekable) -> Result - where - I: Iterator, - { - let string_start = self.len(); - let quote = input.next().unwrap(); // " - let quoted = quote == &b'"' || 
quote == &b'\''; - let mut len = 0; - self.write_element_header(string_start, ElementType::TEXT, 0)?; - let payload_start = self.len(); + fn deserialize_string(&mut self, input: &[u8], mut pos: usize) -> Result { + if pos >= input.len() { + bail_parse_error!("Unexpected end of input"); + } - if input.peek().is_none() { - bail_parse_error!("Unexpected end of input in string handling"); - }; + let string_start = self.len(); + let quote = input[pos]; + pos += 1; // consume quote + + let quoted = quote == b'"' || quote == b'\''; + let mut len = 0; + + // Write placeholder header to be updated later + self.write_element_header(string_start, ElementType::TEXT, 0)?; + + if pos >= input.len() { + bail_parse_error!("Unexpected end of input in string"); + } let mut element_type = ElementType::TEXT; - // This needed to support 1 char unquoted JSON5 keys + + // Special case for unquoted JSON5 keys (identifiers) if !quoted { - self.data.push(*quote); + self.data.push(quote); len += 1; - if let Some(&&c) = input.peek() { - if c == b':' { - self.write_element_header(string_start, element_type, len)?; - return Ok(self.len() - payload_start); - } - } - }; - - while let Some(c) = input.next() { - if c == quote && quoted { - break; - } else if c == &b'\\' { - // Handle escapes - if let Some(&esc) = input.next() { - match esc { - b'b' => { - self.data.push(b'\\'); - self.data.push(b'b'); - len += 2; - element_type = ElementType::TEXTJ; - } - b'f' => { - self.data.push(b'\\'); - self.data.push(b'f'); - len += 2; - element_type = ElementType::TEXTJ; - } - b'n' => { - self.data.push(b'\\'); - self.data.push(b'n'); - len += 2; - element_type = ElementType::TEXTJ; - } - b'r' => { - self.data.push(b'\\'); - self.data.push(b'r'); - len += 2; - element_type = ElementType::TEXTJ; - } - b't' => { - self.data.push(b'\\'); - self.data.push(b't'); - len += 2; - element_type = ElementType::TEXTJ; - } - b'\\' | b'"' | b'/' => { - self.data.push(b'\\'); - self.data.push(esc); - len += 2; - 
element_type = ElementType::TEXTJ; - } - b'u' => { - // Unicode escape - element_type = ElementType::TEXTJ; - self.data.push(b'\\'); - self.data.push(b'u'); - len += 2; - for _ in 0..4 { - if let Some(&h) = input.next() { - if is_hex_digit(h) { - self.data.push(h); - len += 1; - } else { - bail_parse_error!("Incomplete Unicode escape"); - } - } else { - bail_parse_error!("Incomplete Unicode escape"); - } - } - } - // JSON5 extensions - b'\n' => { - element_type = ElementType::TEXT5; - self.data.push(b'\\'); - self.data.push(b'\n'); - len += 2; - } - b'\'' => { - element_type = ElementType::TEXT5; - self.data.push(b'\\'); - self.data.push(b'\''); - len += 2; - } - b'0' => { - element_type = ElementType::TEXT5; - self.data.push(b'\\'); - self.data.push(b'0'); - len += 2; - } - b'v' => { - element_type = ElementType::TEXT5; - self.data.push(b'\\'); - self.data.push(b'v'); - len += 2; - } - b'x' => { - element_type = ElementType::TEXT5; - self.data.push(b'\\'); - self.data.push(b'x'); - len += 2; - for _ in 0..2 { - if let Some(&h) = input.next() { - if is_hex_digit(h) { - self.data.push(h); - len += 1; - } else { - bail_parse_error!("Invalid hex escape sequence"); - } - } else { - bail_parse_error!("Incomplete hex escape sequence"); - } - } - } - _ => { - bail_parse_error!("Invalid escape sequence") - } - } - } else { - bail_parse_error!("Unexpected end of input in escape sequence"); - } - } else if c <= &0x1F { - element_type = ElementType::TEXT5; - self.data.push(*c); - len += 1; - } else { - self.data.push(*c); - len += 1; - } - if let Some(&&c) = input.peek() { - if (c == b':' || c.is_ascii_whitespace()) && !quoted { - break; - } + if pos < input.len() && input[pos] == b':' { + self.write_element_header(string_start, element_type, len)?; + return Ok(pos); } } - // Write header and payload + let mut escape_buffer = [0u8; 6]; // Buffer for escape sequences + + while pos < input.len() { + let c = input[pos]; + pos += 1; + + if quoted && c == quote { + break; // End 
of string + } else if c == b'\\' { + // Handle escape sequences + if pos >= input.len() { + bail_parse_error!("Unexpected end of input in escape sequence"); + } + + let esc = input[pos]; + pos += 1; + + match esc { + b'b' => { + self.data.extend_from_slice(b"\\b"); + len += 2; + element_type = ElementType::TEXTJ; + } + b'f' => { + self.data.extend_from_slice(b"\\f"); + len += 2; + element_type = ElementType::TEXTJ; + } + b'n' => { + self.data.extend_from_slice(b"\\n"); + len += 2; + element_type = ElementType::TEXTJ; + } + b'r' => { + self.data.extend_from_slice(b"\\r"); + len += 2; + element_type = ElementType::TEXTJ; + } + b't' => { + self.data.extend_from_slice(b"\\t"); + len += 2; + element_type = ElementType::TEXTJ; + } + b'\\' | b'"' | b'/' => { + self.data.push(b'\\'); + self.data.push(esc); + len += 2; + element_type = ElementType::TEXTJ; + } + b'u' => { + // Unicode escape sequence + if pos + 4 > input.len() { + bail_parse_error!("Incomplete Unicode escape sequence"); + } + + escape_buffer[0] = b'\\'; + escape_buffer[1] = b'u'; + + for i in 0..4 { + let h = input[pos + i]; + if !is_hex_digit(h) { + bail_parse_error!("Invalid Unicode escape sequence"); + } + escape_buffer[2 + i] = h; + } + + self.data.extend_from_slice(&escape_buffer[0..6]); + len += 6; + pos += 4; + element_type = ElementType::TEXTJ; + } + // JSON5 extensions + b'\n' => { + self.data.extend_from_slice(b"\\\n"); + len += 2; + element_type = ElementType::TEXT5; + } + b'\'' => { + self.data.extend_from_slice(b"\\\'"); + len += 2; + element_type = ElementType::TEXT5; + } + b'0' => { + self.data.extend_from_slice(b"\\0"); + len += 2; + element_type = ElementType::TEXT5; + } + b'v' => { + self.data.extend_from_slice(b"\\v"); + len += 2; + element_type = ElementType::TEXT5; + } + b'x' => { + // Hex escape sequence (JSON5) + if pos + 2 > input.len() { + bail_parse_error!("Incomplete hex escape sequence"); + } + + escape_buffer[0] = b'\\'; + escape_buffer[1] = b'x'; + + for i in 0..2 { + let h = 
input[pos + i]; + if !is_hex_digit(h) { + bail_parse_error!("Invalid hex escape sequence"); + } + escape_buffer[2 + i] = h; + } + + self.data.extend_from_slice(&escape_buffer[0..4]); + len += 4; + pos += 2; + element_type = ElementType::TEXT5; + } + _ => { + bail_parse_error!("Invalid escape sequence: \\{}", esc as char); + } + } + } else if !quoted && (c == b':' || c.is_ascii_whitespace()) { + // End of unquoted identifier + pos -= 1; // Put back the terminating character + break; + } else if c <= 0x1F { + // Control character + element_type = ElementType::TEXT5; + self.data.push(c); + len += 1; + } else { + // Normal character + self.data.push(c); + len += 1; + } + } + + // Write final header with correct type and size self.write_element_header(string_start, element_type, len)?; - Ok(self.len() - payload_start) + Ok(pos) } - pub fn deserialize_number<'a, I>(&mut self, input: &mut Peekable) -> Result - where - I: Iterator, - { + fn deserialize_number(&mut self, input: &[u8], mut pos: usize) -> Result { let num_start = self.len(); + let start_pos = pos; let mut len = 0; let mut is_float = false; let mut is_json5 = false; - // Dummy header + // Write placeholder header self.write_element_header(num_start, ElementType::INT, 0)?; // Handle sign - if input.peek() == Some(&&b'-') || input.peek() == Some(&&b'+') { - if input.peek() == Some(&&b'+') { + if pos < input.len() && (input[pos] == b'-' || input[pos] == b'+') { + if input[pos] == b'+' { is_json5 = true; - input.next(); + pos += 1; } else { - self.data.push(*input.next().unwrap()); + self.data.push(input[pos]); + pos += 1; len += 1; } } - // Handle json5 float number - if input.peek() == Some(&&b'.') { + // Handle JSON5 float starting with dot + if pos < input.len() && input[pos] == b'.' 
{ is_json5 = true; - }; + is_float = true; + } // Check for hex (JSON5) - if input.peek() == Some(&&b'0') { - self.data.push(*input.next().unwrap()); + if pos < input.len() && input[pos] == b'0' && pos + 1 < input.len() { + self.data.push(input[pos]); + pos += 1; len += 1; - let next_ch = input.peek(); - if let Some(&&ch) = next_ch { - if ch == b'x' || ch == b'X' { - self.data.push(*input.next().unwrap()); + + if pos < input.len() && (input[pos] == b'x' || input[pos] == b'X') { + // Hexadecimal number + self.data.push(input[pos]); + pos += 1; + len += 1; + + let mut has_digit = false; + while pos < input.len() && is_hex_digit(input[pos]) { + self.data.push(input[pos]); + pos += 1; len += 1; - while let Some(&&byte) = input.peek() { - if is_hex_digit(byte) { - self.data.push(*input.next().unwrap()); - len += 1; - } else { - break; - } - } - - self.write_element_header(num_start, ElementType::INT5, len)?; - - return Ok(self.len() - num_start); - } else if ch.is_ascii_alphanumeric() { - bail_parse_error!("Leading zero is not allowed") + has_digit = true; } + + if !has_digit { + bail_parse_error!("Invalid hex number: no digits after 0x"); + } + + self.write_element_header(num_start, ElementType::INT5, len)?; + return Ok(pos); + } else if pos < input.len() && input[pos].is_ascii_digit() { + // Leading zero followed by digit is not allowed in standard JSON + bail_parse_error!("Leading zero is not allowed in number"); } } // Check for Infinity - if input.peek().map(|x| x.to_ascii_lowercase()) == Some(b'i') { - for expected in b"infinity" { - if input.next().map(|x| x.to_ascii_lowercase()) != Some(*expected) { - bail_parse_error!("Failed to parse number"); + if pos < input.len() && (input[pos] == b'I' || input[pos] == b'i') { + // Try to match "Infinity" + let infinity = b"infinity"; + let mut i = 0; + + while i < infinity.len() && pos + i < input.len() { + if input[pos + i].to_ascii_lowercase() != infinity[i] { + bail_parse_error!("Invalid number: expected Infinity"); } + 
i += 1; } + + if i < infinity.len() { + bail_parse_error!("Invalid number: incomplete Infinity"); + } + + pos += infinity.len(); + + // Write Infinity as 9e999 + self.data.extend_from_slice(b"9e999"); self.write_element_header( num_start, ElementType::FLOAT5, len + INFINITY_CHAR_COUNT as usize, )?; - self.data.extend_from_slice(b"9e999"); - - return Ok(self.len() - num_start); - }; + return Ok(pos); + } // Regular number parsing - while let Some(&&ch) = input.peek() { - match ch { + while pos < input.len() { + match input[pos] { b'0'..=b'9' => { - self.data.push(*input.next().unwrap()); + self.data.push(input[pos]); + pos += 1; len += 1; } b'.' => { is_float = true; - self.data.push(*input.next().unwrap()); - let next_ch = input.peek(); - match next_ch { - Some(ch) => { - if !ch.is_ascii_alphanumeric() { - is_json5 = true; - } - } - None => { - is_json5 = true; - } - }; + self.data.push(input[pos]); + pos += 1; len += 1; + + // Check for trailing dot + if pos >= input.len() || !input[pos].is_ascii_digit() { + is_json5 = true; + } } b'e' | b'E' => { is_float = true; - self.data.push(*input.next().unwrap()); + self.data.push(input[pos]); + pos += 1; len += 1; - if input.peek() == Some(&&b'+') || input.peek() == Some(&&b'-') { - self.data.push(*input.next().unwrap()); + + // Optional sign after exponent + if pos < input.len() && (input[pos] == b'+' || input[pos] == b'-') { + self.data.push(input[pos]); + pos += 1; len += 1; } } @@ -979,7 +1196,12 @@ impl Jsonb { } } - // Write appropriate header and payload + // No digits found + if len == 0 && (!is_json5 || !is_float) { + bail_parse_error!("Invalid number at position {}", start_pos); + } + + // Determine the appropriate element type let element_type = if is_float { if is_json5 { ElementType::FLOAT5 @@ -996,70 +1218,68 @@ impl Jsonb { self.write_element_header(num_start, element_type, len)?; - Ok(self.len() - num_start) + Ok(pos) } - pub fn deserialize_null_or_nan<'a, I>(&mut self, input: &mut Peekable) -> Result - 
where - I: Iterator, - { - let start = self.len(); - let nul = b"null"; - let nan = b"nan"; - let mut nan_score = 0; - let mut nul_score = 0; - for i in 0..4 { - if nan_score == 3 { - self.data.push(ElementType::NULL as u8); - return Ok(self.len() - start); - }; - let nul_ch = nul.get(i); - let nan_ch = nan.get(i); - let ch = input.next(); - if nan_ch != ch && nul_ch != ch { - bail_parse_error!("expected null or nan"); - } - if nan_ch == ch { - nan_score += 1; - } - if nul_ch == ch { - nul_score += 1; - } - } - if nul_score == 4 { - self.data.push(ElementType::NULL as u8); - Ok(self.len() - start) - } else { - bail_parse_error!("expected null or nan"); - } - } - - pub fn deserialize_true<'a, I>(&mut self, input: &mut Peekable) -> Result - where - I: Iterator, - { - let start = self.len(); - for expected in b"true" { - if input.next() != Some(expected) { + fn deserialize_true(&mut self, input: &[u8], mut pos: usize) -> Result { + let true_lit = b"true"; + for i in 0..true_lit.len() { + if pos + i >= input.len() || input[pos + i] != true_lit[i] { bail_parse_error!("Expected 'true'"); } } + + pos += true_lit.len(); self.data.push(ElementType::TRUE as u8); - Ok(self.len() - start) + + Ok(pos) } - fn deserialize_false<'a, I>(&mut self, input: &mut Peekable) -> Result - where - I: Iterator, - { - let start = self.len(); - for expected in b"false" { - if input.next() != Some(expected) { + fn deserialize_false(&mut self, input: &[u8], mut pos: usize) -> Result { + let false_lit = b"false"; + for i in 0..false_lit.len() { + if pos + i >= input.len() || input[pos + i] != false_lit[i] { bail_parse_error!("Expected 'false'"); } } + + pos += false_lit.len(); self.data.push(ElementType::FALSE as u8); - Ok(self.len() - start) + + Ok(pos) + } + + pub fn deserialize_null_or_nan(&mut self, input: &[u8], mut pos: usize) -> Result { + // First check if we have enough bytes remaining + if pos + 3 >= input.len() { + bail_parse_error!("Unexpected end of input, expected 'null' or 'nan'"); 
+ } + + // Fast path for "null" + if pos + 4 <= input.len() + && input[pos] == b'n' + && input[pos + 1] == b'u' + && input[pos + 2] == b'l' + && input[pos + 3] == b'l' + { + pos += 4; + self.data.push(ElementType::NULL as u8); + return Ok(pos); + } + + // Fast path for "nan" + if pos + 3 <= input.len() + && (input[pos] == b'n' || input[pos] == b'N') + && (input[pos + 1] == b'a' || input[pos + 1] == b'A') + && (input[pos + 2] == b'n' || input[pos + 2] == b'N') + { + pos += 3; + self.data.push(ElementType::NULL as u8); + return Ok(pos); + } + + // If we get here, we didn't match either pattern + bail_parse_error!("Expected 'null' or 'nan'"); } fn write_element_header( @@ -1068,28 +1288,80 @@ impl Jsonb { element_type: ElementType, payload_size: usize, ) -> Result { + if payload_size <= 11 { + let header_byte = (element_type as u8) | ((payload_size as u8) << 4); + if cursor == self.len() { + self.data.push(header_byte); + } else { + self.data[cursor] = header_byte; + } + return Ok(1); + } + let header = JsonbHeader::new(element_type, payload_size).into_bytes(); + + let header_bytes = header.as_bytes(); + let header_len = header_bytes.len(); if cursor == self.len() { - for byte in header { - if byte != 0 { - self.data.push(byte); + self.data.extend_from_slice(header_bytes); + } else { + // Calculate difference in length + let old_len = 1; // We're replacing 1 byte + let new_len = header_bytes.len(); + let diff = new_len as isize - old_len as isize; + + // Resize the Vec if needed + if diff > 0 { + // Need to make room + self.data.resize(self.data.len() + diff as usize, 0); + + // Shift data after cursor to the right + unsafe { + let ptr = self.data.as_mut_ptr(); + std::ptr::copy( + ptr.add(cursor + old_len), + ptr.add(cursor + new_len), + self.data.len() - cursor - new_len, + ); + } + } else if diff < 0 { + // Need to shrink + unsafe { + let ptr = self.data.as_mut_ptr(); + std::ptr::copy( + ptr.add(cursor + old_len), + ptr.add(cursor + new_len), + self.data.len() - 
cursor - old_len, + ); } } - } else { - self.data[cursor] = header[0]; - self.data.splice( - cursor + 1..cursor + 1, - header[1..].iter().filter(|&&x| x != 0).cloned(), - ); + + // Copy the header bytes + for (i, &byte) in header_bytes.iter().enumerate() { + self.data[cursor + i] = byte; + } } - Ok(header.iter().filter(|&&x| x != 0).count()) + Ok(header_len) } fn from_str(input: &str) -> Result { let mut result = Self::new(input.len(), None); - let mut input_iter = input.as_bytes().iter().peekable(); - while input_iter.peek().is_some() { - result.deserialize_value(&mut input_iter, 0)?; + let input = input.as_bytes(); + + if input.is_empty() { + bail_parse_error!("Empty input"); + } + + // Parse the first complete JSON value + let mut pos = 0; + pos = result.deserialize_value(input, pos, 0)?; + + // Skip any trailing whitespace + pos = skip_whitespace(input, pos); + + // Check for any non-whitespace characters after the JSON value + if pos < input.len() { + bail_parse_error!("Unexpected trailing content after JSON value"); } Ok(result) @@ -1108,52 +1380,70 @@ impl std::str::FromStr for Jsonb { } } -pub fn skip_whitespace<'a, I>(input: &mut Peekable) -where - I: Iterator, -{ - while let Some(&ch) = input.peek() { - match ch { - b' ' | b'\t' | b'\n' | b'\r' => { - input.next(); - } - b'/' => { - // Handle JSON5 comments - input.next(); - if let Some(&&next_ch) = input.peek() { - if next_ch == b'/' { - // Line comment - skip until newline - input.next(); - while let Some(&c) = input.next() { - if c == b'\n' { - break; - } - } - } else if next_ch == b'*' { - // Block comment - skip until "*/" - input.next(); - let mut prev = b'\0'; - while let Some(&c) = input.next() { - if prev == b'*' && c == b'/' { - break; - } - prev = c; - } - } else { - // Not a comment, put the '/' back - break; +#[inline] +pub fn skip_whitespace(input: &[u8], mut pos: usize) -> usize { + let len = input.len(); + if pos >= len { + return pos; + } + + // Fast path for non-whitespace, non-comment + 
if (WS_TABLE[input[pos] as usize] & 1) == 0 && input[pos] != b'/' { + return pos; + } + + // Process whitespace and comments + while pos < len { + let ch = input[pos]; + if (WS_TABLE[ch as usize] & 1) != 0 { + // Skip whitespace + pos += 1; + } else if ch == b'/' && pos + 1 < len { + // Handle JSON5 comments + match input[pos + 1] { + b'/' => { + // Line comment - skip until newline + pos += 2; + while pos < len && input[pos] != b'\n' { + pos += 1; } - } else { + if pos < len { + pos += 1; // Skip the newline + } + } + b'*' => { + // Block comment - skip until "*/" + pos += 2; + while pos + 1 < len { + if input[pos] == b'*' && input[pos + 1] == b'/' { + pos += 2; + break; + } + pos += 1; + } + } + _ => { + // Not a comment break; } } - _ => break, + } else { + // Not whitespace or comment + break; } } + + pos } -fn is_hex_digit(b: u8) -> bool { - matches!(b, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F') +#[inline] +fn is_hex_digit(ch: u8) -> bool { + (CHARACTER_TYPE[ch as usize] & 3) == 2 || (CHARACTER_TYPE[ch as usize] & 3) == 3 +} + +#[inline] +fn is_json_ok(ch: u8) -> bool { + (CHARACTER_TYPE_OK[ch as usize] & 4) != 0 } #[cfg(test)] @@ -1488,26 +1778,35 @@ world""#, fn test_header_encoding() { // Small payload (fits in 4 bits) let header = JsonbHeader::new(ElementType::TEXT, 5); - let bytes = header.into_bytes(); + let bytes = header.into_bytes().as_bytes().to_vec(); assert_eq!(bytes[0], (5 << 4) | (ElementType::TEXT as u8)); // Medium payload (8-bit) let header = JsonbHeader::new(ElementType::TEXT, 200); - let bytes = header.into_bytes(); - assert_eq!(bytes[0], (PAYLOAD_SIZE8 << 4) | (ElementType::TEXT as u8)); + let bytes = header.into_bytes().as_bytes().to_vec(); + assert_eq!( + bytes[0], + (SIZE_MARKER_8BIT << 4) | (ElementType::TEXT as u8) + ); assert_eq!(bytes[1], 200); // Large payload (16-bit) let header = JsonbHeader::new(ElementType::TEXT, 40000); - let bytes = header.into_bytes(); - assert_eq!(bytes[0], (PAYLOAD_SIZE16 << 4) | (ElementType::TEXT as u8)); + 
let bytes = header.into_bytes().as_bytes().to_vec(); + assert_eq!( + bytes[0], + (SIZE_MARKER_16BIT << 4) | (ElementType::TEXT as u8) + ); assert_eq!(bytes[1], (40000 >> 8) as u8); assert_eq!(bytes[2], (40000 & 0xFF) as u8); // Extra large payload (32-bit) let header = JsonbHeader::new(ElementType::TEXT, 70000); - let bytes = header.into_bytes(); - assert_eq!(bytes[0], (PAYLOAD_SIZE32 << 4) | (ElementType::TEXT as u8)); + let bytes = header.into_bytes().as_bytes().to_vec(); + assert_eq!( + bytes[0], + (SIZE_MARKER_32BIT << 4) | (ElementType::TEXT as u8) + ); assert_eq!(bytes[1], (70000 >> 24) as u8); assert_eq!(bytes[2], ((70000 >> 16) & 0xFF) as u8); assert_eq!(bytes[3], ((70000 >> 8) & 0xFF) as u8); @@ -1519,9 +1818,9 @@ world""#, // Create sample data with various headers let data = vec![ (5 << 4) | (ElementType::TEXT as u8), - (PAYLOAD_SIZE8 << 4) | (ElementType::ARRAY as u8), + (SIZE_MARKER_8BIT << 4) | (ElementType::ARRAY as u8), 150, - (PAYLOAD_SIZE16 << 4) | (ElementType::OBJECT as u8), + (SIZE_MARKER_16BIT << 4) | (ElementType::OBJECT as u8), 0x98, 0x68, ]; diff --git a/core/json/mod.rs b/core/json/mod.rs index 6f8b571f8..5f63ae2e7 100644 --- a/core/json/mod.rs +++ b/core/json/mod.rs @@ -75,7 +75,7 @@ pub fn get_json(json_value: &OwnedValue, indent: Option<&str>) -> crate::Result< pub fn jsonb(json_value: &OwnedValue) -> crate::Result { let jsonbin = match json_value { OwnedValue::Null | OwnedValue::Integer(_) | OwnedValue::Float(_) | OwnedValue::Text(_) => { - Jsonb::from_str(&json_value.to_string()) + Jsonb::from_str(&json_value.to_text().unwrap()) } OwnedValue::Blob(blob) => { let blob = Jsonb::new(blob.len(), Some(&blob));