From 7c353095ede1e7314e870236dd8070fac2925a65 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 15 Jul 2025 17:37:14 +0300 Subject: [PATCH 1/4] types: fix and unify record header size calculation --- core/types.rs | 122 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 91 insertions(+), 31 deletions(-) diff --git a/core/types.rs b/core/types.rs index cdfa8eec6..ac9c00495 100644 --- a/core/types.rs +++ b/core/types.rs @@ -892,27 +892,7 @@ impl ImmutableRecord { size_values += value_size; } - let mut header_size = size_header; - const MIN_HEADER_SIZE: usize = 126; - if header_size <= MIN_HEADER_SIZE { - // common case - // This case means the header size can be contained by a single byte, therefore - // header_size == size of serial types + 1 byte from the header size - // Since header_size is a varint, and a varint the first bit is used to represent we have more bytes to read, - // header size here will be 126 == (2^7 - 1) - header_size += 1; - } else { - // Rare case of a really large header - let mut temp_buf = [0u8; 9]; - let n_varint = write_varint(&mut temp_buf, header_size as u64); // or however you get varint length - header_size += n_varint; - - // Check if adding the varint bytes changes the varint length - let new_n_varint = write_varint(&mut temp_buf, header_size as u64); - if n_varint < new_n_varint { - header_size += 1; - } - } + let header_size = Record::calc_header_size(size_header); // 1. write header size let mut buf = Vec::new(); @@ -2180,6 +2160,37 @@ impl Record { Self { values } } + /// Calculates the total size needed for a SQLite record header. + /// + /// The record header consists of: + /// 1. A varint encoding the total header size (self-referentially, e.g. a 100 byte header literally has the number '100' in the header suffix) + /// 2. A sequence of varints encoding the serial types + /// + /// For small headers (<=126 bytes), we only need 1 byte to encode the header size, because 127 fits in 7 bits (varint uses 7 bits for the value and 1 continuation bit) + /// For larger headers, we need to account for the variable length of the header size varint. + pub fn calc_header_size(sizeof_serial_types: usize) -> usize { + if sizeof_serial_types < i8::MAX as usize { + return sizeof_serial_types + 1; + } + + let mut header_size = sizeof_serial_types; + // For larger headers, calculate how many bytes we need for the header size varint + let mut temp_buf = [0u8; 9]; + let mut prev_header_size; + + loop { + prev_header_size = header_size; + let varint_len = write_varint(&mut temp_buf, header_size as u64); + header_size = sizeof_serial_types + varint_len; + + if header_size == prev_header_size { + break; + } + } + + header_size + } + pub fn serialize(&self, buf: &mut Vec) { let initial_i = buf.len(); @@ -2219,16 +2230,7 @@ impl Record { } let mut header_bytes_buf: Vec = Vec::new(); - if header_size <= 126 { - // common case - header_size += 1; - } else { - todo!("calculate big header size extra bytes"); - // get header varint len - // header_size += n; - // if( nVarint Date: Tue, 15 Jul 2025 17:57:52 +0300 Subject: [PATCH 2/4] compare_records: fix assumption that header size is 1 byte and serial type is 1 byte --- core/types.rs | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/core/types.rs b/core/types.rs index ac9c00495..963c62906 100644 --- a/core/types.rs +++ b/core/types.rs @@ -1618,7 +1618,6 @@ pub fn get_tie_breaker_from_seek_op(seek_op: SeekOp) -> std::cmp::Ordering { /// /// The function uses the optimized path when ALL of these conditions are met: /// - Payload is at least 2 bytes (header size + first serial type) -/// - Header size ≤ 63 bytes (`payload[0] <= 63`) - safety constraint /// - First serial type indicates integer (`1-6`, `8`, or `9`) /// - First unpacked field is a `RefValue::Integer` /// @@ -1650,7 +1649,7 @@ fn compare_records_int( tie_breaker: std::cmp::Ordering, ) -> Result { let payload = serialized.get_payload(); - if payload.len() < 2 || payload[0] > 63 { + if payload.len() < 2 { return compare_records_generic( serialized, unpacked, @@ -1661,10 +1660,21 @@ fn compare_records_int( ); } - let header_size = payload[0] as usize; - let first_serial_type = payload[1]; + let (header_size, offset_1st_serialtype) = read_varint(payload)?; + let header_size = header_size as usize; - if !matches!(first_serial_type, 1..=6 | 8 | 9) { + if payload.len() < header_size { + return Err(LimboError::Corrupt(format!( + "Record payload too short: claimed header size {} but payload only {} bytes", + header_size, + payload.len() + ))); + } + + let (first_serial_type, _) = read_varint(&payload[offset_1st_serialtype..])?; + + let serialtype_is_integer = matches!(first_serial_type, 1..=6 | 8 | 9); + if !serialtype_is_integer { return compare_records_generic( serialized, unpacked, @@ -1677,7 +1687,7 @@ fn compare_records_int( let data_start = header_size; - let lhs_int = read_integer(&payload[data_start..], first_serial_type)?; + let lhs_int = read_integer(&payload[data_start..], first_serial_type as u8)?; let RefValue::Integer(rhs_int) = unpacked[0] else { return compare_records_generic( serialized, @@ -1768,11 +1778,21 @@ fn compare_records_string( ); } - let header_size = payload[0] as usize; - let first_serial_type = payload[1]; + let (header_size, offset_1st_serialtype) = read_varint(payload)?; + let header_size = header_size as usize; - // Check if serial type is not a string or if its a blob - if first_serial_type < 13 || (first_serial_type & 1) == 0 { + if payload.len() < header_size { + return Err(LimboError::Corrupt(format!( + "Record payload too short: claimed header size {} but payload only {} bytes", + header_size, + payload.len() + ))); + } + + let (first_serial_type, _) = read_varint(&payload[offset_1st_serialtype..])?; + + let serialtype_is_string = first_serial_type >= 13 && (first_serial_type & 1) == 1; + if !serialtype_is_string { return compare_records_generic( serialized, unpacked, @@ -1799,7 +1819,7 @@ fn compare_records_string( debug_assert!(data_start + string_len <= payload.len()); - let serial_type = SerialType::try_from(first_serial_type as u64)?; + let serial_type = SerialType::try_from(first_serial_type)?; let (lhs_value, _) = read_value(&payload[data_start..], serial_type)?; let RefValue::Text(lhs_text) = lhs_value else { From 38183d3b3bf45e2f5066ace2b40d0d96b3f074de Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 15 Jul 2025 18:42:27 +0300 Subject: [PATCH 3/4] tcl: add regression test for large text keys --- testing/insert.test | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/testing/insert.test b/testing/insert.test index 6a5027d25..5ddd09c9e 100755 --- a/testing/insert.test +++ b/testing/insert.test @@ -393,3 +393,20 @@ do_execsql_test_on_specific_db {:memory:} rowid-overflow-random-generation { INSERT INTO q(y) VALUES (3); SELECT COUNT(*) FROM q; } {3} + +# regression test for incorrect processing of record header in the case of large text columns +if {[info exists ::env(SQLITE_EXEC)] && ($::env(SQLITE_EXEC) eq "scripts/limbo-sqlite3-index-experimental" || $::env(SQLITE_EXEC) eq "sqlite3")} { + do_execsql_test_on_specific_db {:memory:} large-text-index-seek { + CREATE TABLE t(x TEXT, y); + CREATE INDEX t_idx ON t(x); + INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'a', 1); + INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'b', 2); + INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'c', 3); + INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'd', 4); + INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'e', 5); + INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'f', 6); + INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'g', 7); + INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'h', 8); + SELECT COUNT(*) FROM t WHERE x >= replace(hex(zeroblob(100)), '00', 'a'); + } {8} +} \ No newline at end of file From fda92d43a29df340202e79d8198e8bca69d914a0 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 15 Jul 2025 18:52:27 +0300 Subject: [PATCH 4/4] adjust comment in header size test --- core/types.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/types.rs b/core/types.rs index 963c62906..f03d27a8a 100644 --- a/core/types.rs +++ b/core/types.rs @@ -2547,7 +2547,7 @@ mod tests { #[test] fn test_calc_header_size() { - // Test 1-byte header size (0 to 127) + // Test 1-byte header size (serial type sizes 0 to 126) const MIN_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER: usize = 0; assert_eq!( Record::calc_header_size(MIN_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER), @@ -2560,7 +2560,7 @@ mod tests { MAX_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER + 1 ); - // Test 2-byte header size (128 to 16383) + // Test 2-byte header size (serial type sizes 127 to 16381) const MIN_SERIALTYPES_SIZE_FOR_2_BYTE_HEADER: usize = MAX_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER + 1; assert_eq!( @@ -2574,7 +2574,7 @@ mod tests { MAX_SERIALTYPES_SIZE_FOR_2_BYTE_HEADER + 2 ); - // Test 3-byte header size (16384 to 2097151) + // Test 3-byte header size (serial type sizes 16382 to 2097148) const MIN_SERIALTYPES_SIZE_FOR_3_BYTE_HEADER: usize = MAX_SERIALTYPES_SIZE_FOR_2_BYTE_HEADER + 1; assert_eq!( @@ -2588,7 +2588,7 @@ mod tests { MAX_SERIALTYPES_SIZE_FOR_3_BYTE_HEADER + 3 ); - // Test 4-byte header size (2097152 to 268435455) + // Test 4-byte header size (serial type sizes 2097149 to 268435451) const MIN_SERIALTYPES_SIZE_FOR_4_BYTE_HEADER: usize = MAX_SERIALTYPES_SIZE_FOR_3_BYTE_HEADER + 1; assert_eq!(