diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 4fb22df2b..ef49158ea 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -532,6 +532,20 @@ pub struct BTreeCursor { read_overflow_state: RefCell>, /// Contains the current cell_idx for `find_cell` find_cell_state: FindCellState, + /// `RecordCursor` is used to parse SQLite record format data retrieved from B-tree + /// leaf pages. It provides incremental parsing, only deserializing the columns that are + /// actually accessed, which is crucial for performance when dealing with wide tables + /// where only a subset of columns are needed. + /// + /// - Record parsing is logically a read operation from the caller's perspective + /// - But internally requires updating the cursor's cached parsing state + /// - Multiple methods may need to access different columns from the same record + /// + /// # Lifecycle + /// + /// The cursor is invalidated and reset when: + /// - Moving to a different record/row + /// - The underlying `ImmutableRecord` is modified pub record_cursor: RefCell, } diff --git a/core/types.rs b/core/types.rs index 8ab1f2a4e..70cf255c7 100644 --- a/core/types.rs +++ b/core/types.rs @@ -1055,11 +1055,30 @@ impl ImmutableRecord { } } +/// A cursor for lazily parsing SQLite record format data. +/// +/// `RecordCursor` provides incremental parsing of SQLite records, which follow the format: +/// `[header_size][serial_type1][serial_type2]...[data1][data2]...` +/// +/// Instead of parsing the entire record upfront, this cursor parses only what's needed +/// for the requested operations, improving performance for large records where only +/// a few columns are accessed. +/// +/// SQLite records consist of: +/// - **Header size**: Varint indicating total header length +/// - **Serial types**: Variable-length integers describing each field's type and size +/// - **Data section**: The actual field data in the same order as serial types #[derive(Debug, Default)] pub struct RecordCursor { + /// Parsed serial type values for each column. + /// Serial types encode both the data type and size information. pub serial_types: Vec, + /// Byte offsets where each column's data begins in the record payload. + /// Always has one more entry than `serial_types` (the final offset marks the end). pub offsets: Vec, + /// Total size of the record header in bytes. pub header_size: usize, + /// Current parsing position within the header section. pub header_offset: usize, } @@ -1097,6 +1116,29 @@ impl RecordCursor { self.ensure_parsed_upto(record, MAX_COLUMN) } + /// Ensures the header is parsed up to (and including) the target column index. + /// + /// This is the core lazy parsing method. It only parses as much of the header + /// as needed to access the requested column, making it efficient for sparse + /// column access patterns. + /// + /// # Arguments + /// + /// * `record` - The record containing the data to parse + /// * `target_idx` - The column index that needs to be accessible (0-based) + /// + /// # Returns + /// + /// * `Ok(())` - Parsing completed successfully + /// * `Err(LimboError)` - Parsing failed due to corrupt data or I/O error + /// + /// # Behavior + /// + /// - If `target_idx` is already parsed, returns immediately + /// - Parses incrementally from the current position to the target + /// - Handles the initial header size parsing on first call + /// - Calculates and caches data offsets for each parsed column + /// #[inline(always)] pub fn ensure_parsed_upto( &mut self, @@ -1134,6 +1176,25 @@ impl RecordCursor { Ok(()) } + /// Deserializes a specific column without additional parsing. + /// + /// This method assumes the header has already been parsed up to the target + /// column index (via `ensure_parsed_upto`). It extracts the actual data + /// value from the record's data section. + /// + /// # Arguments + /// + /// * `record` - The record containing the data + /// * `idx` - The column index to deserialize (0-based) + /// + /// # Returns + /// + /// * `Ok(RefValue)` - The deserialized value (may reference record data) + /// * `Err(LimboError)` - Deserialization failed + /// + /// # Special Cases + /// + /// - Returns `RefValue::Null` for out-of-bounds indices pub fn deserialize_column(&self, record: &ImmutableRecord, idx: usize) -> Result { if idx >= self.serial_types.len() { return Ok(RefValue::Null); @@ -1162,6 +1223,21 @@ impl RecordCursor { Ok(value) } + /// Gets the value at the specified column index. + /// + /// This is the primary method for accessing record data. It combines + /// lazy parsing with deserialization in a single call. + /// + /// # Arguments + /// + /// * `record` - The record to read from + /// * `idx` - The column index (0-based) + /// + /// # Returns + /// + /// * `Ok(RefValue)` - The value at the specified index + /// * `Err(LimboError)` - Access failed due to invalid record or parsing error + /// #[inline(always)] pub fn get_value(&mut self, record: &ImmutableRecord, idx: usize) -> Result { if record.is_invalidated() { @@ -1172,6 +1248,19 @@ impl RecordCursor { self.deserialize_column(record, idx) } + /// Gets the value at the specified column index, returning `None` on any error. + /// + /// # Arguments + /// + /// * `record` - The record to read from + /// * `idx` - The column index (0-based) + /// + /// # Returns + /// + /// * `Some(Ok(RefValue))` - Successfully read value + /// * `Some(Err(LimboError))` - Parsing succeeded but deserialization failed + /// * `None` - Record is invalid or index is out of bounds + /// pub fn get_value_opt( &mut self, record: &ImmutableRecord, @@ -1188,6 +1277,17 @@ impl RecordCursor { Some(self.deserialize_column(record, idx)) } + /// Returns the number of columns in the record. + /// + /// This method parses the complete header to determine the total + /// column count. The result is cached for subsequent calls. + /// # Arguments + /// + /// * `record` - The record to count columns in + /// + /// # Returns + /// + /// The number of columns, or 0 if the record is invalid. pub fn count(&mut self, record: &ImmutableRecord) -> usize { if record.is_invalidated() { return 0; @@ -1197,10 +1297,33 @@ impl RecordCursor { self.serial_types.len() } + /// Alias for `count()`. Returns the number of columns in the record. + /// + /// # Arguments + /// + /// * `record` - The record to get length of + /// + /// # Returns + /// + /// The number of columns, or 0 if the record is invalid. pub fn len(&mut self, record: &ImmutableRecord) -> usize { self.count(record) } + /// Returns all values in the record as a vector. + /// + /// This method parses the complete header and deserializes all columns. + /// Use this when you need access to most or all columns in the record. + /// + /// # Arguments + /// + /// * `record` - The record to extract all values from + /// + /// # Returns + /// + /// * `Ok(Vec)` - All values in column order + /// * `Err(LimboError)` - Parsing or deserialization failed + /// pub fn get_values(&mut self, record: &ImmutableRecord) -> Result> { if record.is_invalidated() { return Ok(Vec::new()); @@ -1429,37 +1552,6 @@ pub fn compare_immutable( std::cmp::Ordering::Equal } -pub fn compare_immutable_for_testing( - l: &[RefValue], - r: &[RefValue], - index_key_sort_order: IndexKeySortOrder, - collations: &[CollationSeq], - tie_breaker: std::cmp::Ordering, -) -> std::cmp::Ordering { - let min_len = l.len().min(r.len()); - - for i in 0..min_len { - let column_order = index_key_sort_order.get_sort_order_for_col(i); - let collation = collations.get(i).copied().unwrap_or_default(); - - let cmp = match (&l[i], &r[i]) { - (RefValue::Text(left), RefValue::Text(right)) => { - collation.compare_strings(left.as_str(), right.as_str()) - } - _ => l[i].partial_cmp(&r[i]).unwrap_or(std::cmp::Ordering::Equal), - }; - - if cmp != std::cmp::Ordering::Equal { - return match column_order { - SortOrder::Asc => cmp, - SortOrder::Desc => cmp.reverse(), - }; - } - } - - tie_breaker -} - #[derive(Debug, Clone, Copy)] pub enum RecordCompare { Int, @@ -2272,6 +2364,37 @@ mod tests { use super::*; use crate::translate::collate::CollationSeq; + pub fn compare_immutable_for_testing( + l: &[RefValue], + r: &[RefValue], + index_key_sort_order: IndexKeySortOrder, + collations: &[CollationSeq], + tie_breaker: std::cmp::Ordering, + ) -> std::cmp::Ordering { + let min_len = l.len().min(r.len()); + + for i in 0..min_len { + let column_order = index_key_sort_order.get_sort_order_for_col(i); + let collation = collations.get(i).copied().unwrap_or_default(); + + let cmp = match (&l[i], &r[i]) { + (RefValue::Text(left), RefValue::Text(right)) => { + collation.compare_strings(left.as_str(), right.as_str()) + } + _ => l[i].partial_cmp(&r[i]).unwrap_or(std::cmp::Ordering::Equal), + }; + + if cmp != std::cmp::Ordering::Equal { + return match column_order { + SortOrder::Asc => cmp, + SortOrder::Desc => cmp.reverse(), + }; + } + } + + tie_breaker + } + fn create_record(values: Vec) -> ImmutableRecord { let registers: Vec = values.into_iter().map(Register::Value).collect(); ImmutableRecord::from_registers(®isters, registers.len())