Merge 'fix record header size calculations and incorrect assumptions' from Jussi Saurio

- remove assumptions that record header size fits into 1 byte or serial type fits into 1 byte - add tests for record header size calculation ```sql turso> CREATE TABLE t(x TEXT, y); CREATE INDEX t_idx ON t(x); INSERT INTO t VALUES (replace(zeroblob(1000), x'00', 'a') || 'a', 1); -- 1000 bytes of 'a' INSERT INTO t VALUES (replace(zeroblob(1000), x'00', 'a') || 'b', 2); INSERT INTO t VALUES (replace(zeroblob(1000), x'00', 'a') || 'c', 3); INSERT INTO t VALUES (replace(zeroblob(1000), x'00', 'a') || 'd', 4); INSERT INTO t VALUES (replace(zeroblob(1000), x'00', 'a') || 'e', 5); INSERT INTO t VALUES (replace(zeroblob(1000), x'00', 'a') || 'f', 6); INSERT INTO t VALUES (replace(zeroblob(1000), x'00', 'a') || 'g', 7); INSERT INTO t VALUES (replace(zeroblob(1000), x'00', 'a') || 'h', 8); SELECT COUNT(*) FROM t WHERE x >= replace(hex(zeroblob(100)), '00', 'a'); ┌───────────┐ │ COUNT (*) │ ├───────────┤ │ 8 │ └───────────┘ ``` Fixes #2096 Fixes #2088 Reviewed-by: Nikita Sivukhin (@sivukhin) Closes #2098
2026-02-19 15:05:47 +01:00 · 2025-07-15 19:09:31 +03:00
parent 025ddd98a6 fda92d43a2
commit 86b1b0d009
2 changed files with 139 additions and 42 deletions
--- a/core/types.rs
+++ b/core/types.rs
@@ -892,27 +892,7 @@ impl ImmutableRecord {
            size_values += value_size;
        }

-        let mut header_size = size_header;
-        const MIN_HEADER_SIZE: usize = 126;
-        if header_size <= MIN_HEADER_SIZE {
-            // common case
-            // This case means the header size can be contained by a single byte, therefore
-            // header_size == size of serial types + 1 byte from the header size
-            // Since header_size is a varint, and a varint the first bit is used to represent we have more bytes to read,
-            // header size here will be 126 == (2^7 - 1)
-            header_size += 1;
-        } else {
-            // Rare case of a really large header
-            let mut temp_buf = [0u8; 9];
-            let n_varint = write_varint(&mut temp_buf, header_size as u64); // or however you get varint length
-            header_size += n_varint;
-
-            // Check if adding the varint bytes changes the varint length
-            let new_n_varint = write_varint(&mut temp_buf, header_size as u64);
-            if n_varint < new_n_varint {
-                header_size += 1;
-            }
-        }
+        let header_size = Record::calc_header_size(size_header);

        // 1. write header size
        let mut buf = Vec::new();
@@ -1638,7 +1618,6 @@ pub fn get_tie_breaker_from_seek_op(seek_op: SeekOp) -> std::cmp::Ordering {
 ///
 /// The function uses the optimized path when ALL of these conditions are met:
 /// - Payload is at least 2 bytes (header size + first serial type)
-/// - Header size ≤ 63 bytes (`payload[0] <= 63`) - safety constraint
 /// - First serial type indicates integer (`1-6`, `8`, or `9`)
 /// - First unpacked field is a `RefValue::Integer`
 ///
@@ -1670,7 +1649,7 @@ fn compare_records_int(
    tie_breaker: std::cmp::Ordering,
 ) -> Result<std::cmp::Ordering> {
    let payload = serialized.get_payload();
-    if payload.len() < 2 || payload[0] > 63 {
+    if payload.len() < 2 {
        return compare_records_generic(
            serialized,
            unpacked,
@@ -1681,10 +1660,21 @@ fn compare_records_int(
        );
    }

-    let header_size = payload[0] as usize;
-    let first_serial_type = payload[1];
+    let (header_size, offset_1st_serialtype) = read_varint(payload)?;
+    let header_size = header_size as usize;

-    if !matches!(first_serial_type, 1..=6 | 8 | 9) {
+    if payload.len() < header_size {
+        return Err(LimboError::Corrupt(format!(
+            "Record payload too short: claimed header size {} but payload only {} bytes",
+            header_size,
+            payload.len()
+        )));
+    }
+
+    let (first_serial_type, _) = read_varint(&payload[offset_1st_serialtype..])?;
+
+    let serialtype_is_integer = matches!(first_serial_type, 1..=6 | 8 | 9);
+    if !serialtype_is_integer {
        return compare_records_generic(
            serialized,
            unpacked,
@@ -1697,7 +1687,7 @@ fn compare_records_int(

    let data_start = header_size;

-    let lhs_int = read_integer(&payload[data_start..], first_serial_type)?;
+    let lhs_int = read_integer(&payload[data_start..], first_serial_type as u8)?;
    let RefValue::Integer(rhs_int) = unpacked[0] else {
        return compare_records_generic(
            serialized,
@@ -1788,11 +1778,21 @@ fn compare_records_string(
        );
    }

-    let header_size = payload[0] as usize;
-    let first_serial_type = payload[1];
+    let (header_size, offset_1st_serialtype) = read_varint(payload)?;
+    let header_size = header_size as usize;

-    // Check if serial type is not a string or if its a blob
-    if first_serial_type < 13 || (first_serial_type & 1) == 0 {
+    if payload.len() < header_size {
+        return Err(LimboError::Corrupt(format!(
+            "Record payload too short: claimed header size {} but payload only {} bytes",
+            header_size,
+            payload.len()
+        )));
+    }
+
+    let (first_serial_type, _) = read_varint(&payload[offset_1st_serialtype..])?;
+
+    let serialtype_is_string = first_serial_type >= 13 && (first_serial_type & 1) == 1;
+    if !serialtype_is_string {
        return compare_records_generic(
            serialized,
            unpacked,
@@ -1819,7 +1819,7 @@ fn compare_records_string(

    debug_assert!(data_start + string_len <= payload.len());

-    let serial_type = SerialType::try_from(first_serial_type as u64)?;
+    let serial_type = SerialType::try_from(first_serial_type)?;
    let (lhs_value, _) = read_value(&payload[data_start..], serial_type)?;

    let RefValue::Text(lhs_text) = lhs_value else {
@@ -2180,6 +2180,37 @@ impl Record {
        Self { values }
    }

+    /// Calculates the total size needed for a SQLite record header.
+    ///
+    /// The record header consists of:
+    /// 1. A varint encoding the total header size (self-referentially, e.g. a 100 byte header literally has the number '100' in the header suffix)
+    /// 2. A sequence of varints encoding the serial types
+    ///
+    /// For small headers (<=126 bytes), we only need 1 byte to encode the header size, because 127 fits in 7 bits (varint uses 7 bits for the value and 1 continuation bit)
+    /// For larger headers, we need to account for the variable length of the header size varint.
+    pub fn calc_header_size(sizeof_serial_types: usize) -> usize {
+        if sizeof_serial_types < i8::MAX as usize {
+            return sizeof_serial_types + 1;
+        }
+
+        let mut header_size = sizeof_serial_types;
+        // For larger headers, calculate how many bytes we need for the header size varint
+        let mut temp_buf = [0u8; 9];
+        let mut prev_header_size;
+
+        loop {
+            prev_header_size = header_size;
+            let varint_len = write_varint(&mut temp_buf, header_size as u64);
+            header_size = sizeof_serial_types + varint_len;
+
+            if header_size == prev_header_size {
+                break;
+            }
+        }
+
+        header_size
+    }
+
    pub fn serialize(&self, buf: &mut Vec<u8>) {
        let initial_i = buf.len();

@@ -2219,16 +2250,7 @@ impl Record {
        }

        let mut header_bytes_buf: Vec<u8> = Vec::new();
-        if header_size <= 126 {
-            // common case
-            header_size += 1;
-        } else {
-            todo!("calculate big header size extra bytes");
-            // get header varint len
-            // header_size += n;
-            // if( nVarint<sqlite3VarintLen(nHdr) ) nHdr++;
-        }
-        assert!(header_size <= 126);
+        header_size = Record::calc_header_size(header_size);
        header_bytes_buf.extend(std::iter::repeat_n(0, 9));
        let n = write_varint(header_bytes_buf.as_mut_slice(), header_size as u64);
        header_bytes_buf.truncate(n);
@@ -2523,6 +2545,64 @@ mod tests {
        );
    }

+    #[test]
+    fn test_calc_header_size() {
+        // Test 1-byte header size (serial type sizes 0 to 126)
+        const MIN_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER: usize = 0;
+        assert_eq!(
+            Record::calc_header_size(MIN_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER),
+            MIN_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER + 1
+        );
+        const BITS_7_MAX: usize = (1 << 7) - 1; // varints use 7 bits for the value and 1 continuation bit
+        const MAX_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER: usize = BITS_7_MAX - 1;
+        assert_eq!(
+            Record::calc_header_size(MAX_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER),
+            MAX_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER + 1
+        );
+
+        // Test 2-byte header size (serial type sizes 127 to 16381)
+        const MIN_SERIALTYPES_SIZE_FOR_2_BYTE_HEADER: usize =
+            MAX_SERIALTYPES_SIZE_FOR_1_BYTE_HEADER + 1;
+        assert_eq!(
+            Record::calc_header_size(MIN_SERIALTYPES_SIZE_FOR_2_BYTE_HEADER),
+            MIN_SERIALTYPES_SIZE_FOR_2_BYTE_HEADER + 2
+        );
+        const BITS_14_MAX: usize = (1 << 14) - 1;
+        const MAX_SERIALTYPES_SIZE_FOR_2_BYTE_HEADER: usize = BITS_14_MAX - 2;
+        assert_eq!(
+            Record::calc_header_size(MAX_SERIALTYPES_SIZE_FOR_2_BYTE_HEADER),
+            MAX_SERIALTYPES_SIZE_FOR_2_BYTE_HEADER + 2
+        );
+
+        // Test 3-byte header size (serial type sizes 16382 to 2097148)
+        const MIN_SERIALTYPES_SIZE_FOR_3_BYTE_HEADER: usize =
+            MAX_SERIALTYPES_SIZE_FOR_2_BYTE_HEADER + 1;
+        assert_eq!(
+            Record::calc_header_size(MIN_SERIALTYPES_SIZE_FOR_3_BYTE_HEADER),
+            MIN_SERIALTYPES_SIZE_FOR_3_BYTE_HEADER + 3
+        );
+        const BITS_21_MAX: usize = (1 << 21) - 1;
+        const MAX_SERIALTYPES_SIZE_FOR_3_BYTE_HEADER: usize = BITS_21_MAX - 3;
+        assert_eq!(
+            Record::calc_header_size(MAX_SERIALTYPES_SIZE_FOR_3_BYTE_HEADER),
+            MAX_SERIALTYPES_SIZE_FOR_3_BYTE_HEADER + 3
+        );
+
+        // Test 4-byte header size (serial type sizes 2097149 to 268435451)
+        const MIN_SERIALTYPES_SIZE_FOR_4_BYTE_HEADER: usize =
+            MAX_SERIALTYPES_SIZE_FOR_3_BYTE_HEADER + 1;
+        assert_eq!(
+            Record::calc_header_size(MIN_SERIALTYPES_SIZE_FOR_4_BYTE_HEADER),
+            MIN_SERIALTYPES_SIZE_FOR_4_BYTE_HEADER + 4
+        );
+        const BITS_28_MAX: usize = (1 << 28) - 1;
+        const MAX_SERIALTYPES_SIZE_FOR_4_BYTE_HEADER: usize = BITS_28_MAX - 4;
+        assert_eq!(
+            Record::calc_header_size(MAX_SERIALTYPES_SIZE_FOR_4_BYTE_HEADER),
+            MAX_SERIALTYPES_SIZE_FOR_4_BYTE_HEADER + 4
+        );
+    }
+
    #[test]
    fn test_integer_fast_path() {
        let index_info = create_index_info(2, vec![SortOrder::Asc, SortOrder::Asc]);
--- a/testing/insert.test
+++ b/testing/insert.test
@@ -393,3 +393,20 @@ do_execsql_test_on_specific_db {:memory:} rowid-overflow-random-generation {
    INSERT INTO q(y) VALUES (3);
    SELECT COUNT(*) FROM q;
 } {3}
+
+# regression test for incorrect processing of record header in the case of large text columns
+if {[info exists ::env(SQLITE_EXEC)] && ($::env(SQLITE_EXEC) eq "scripts/limbo-sqlite3-index-experimental" || $::env(SQLITE_EXEC) eq "sqlite3")} {
+    do_execsql_test_on_specific_db {:memory:} large-text-index-seek {
+        CREATE TABLE t(x TEXT, y);
+        CREATE INDEX t_idx ON t(x);
+        INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'a', 1);
+        INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'b', 2);
+        INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'c', 3);
+        INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'd', 4);
+        INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'e', 5);
+        INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'f', 6);
+        INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'g', 7);
+        INSERT INTO t VALUES (replace(hex(zeroblob(1000)), '00', 'a') || 'h', 8);
+        SELECT COUNT(*) FROM t WHERE x >= replace(hex(zeroblob(100)), '00', 'a');
+    } {8}
+}