- Address some review comments

- Add docs for `RecordCursor`
This commit is contained in:
Krishna Vishal
2025-07-08 15:39:36 +05:30
parent b1f27cad94
commit ea4a4708ea
2 changed files with 168 additions and 31 deletions

View File

@@ -532,6 +532,20 @@ pub struct BTreeCursor {
read_overflow_state: RefCell<Option<ReadPayloadOverflow>>,
/// Contains the current cell_idx for `find_cell`
find_cell_state: FindCellState,
/// `RecordCursor` is used to parse SQLite record format data retrieved from B-tree
/// leaf pages. It provides incremental parsing, only deserializing the columns that are
/// actually accessed, which is crucial for performance when dealing with wide tables
/// where only a subset of columns are needed.
///
/// It is wrapped in a `RefCell` (interior mutability) because:
/// - Record parsing is logically a read operation from the caller's perspective
/// - But internally it requires updating the cursor's cached parsing state
/// - Multiple methods may need to access different columns from the same record
///
/// # Lifecycle
///
/// The cursor is invalidated and reset when:
/// - Moving to a different record/row
/// - The underlying `ImmutableRecord` is modified
pub record_cursor: RefCell<RecordCursor>,
}

View File

@@ -1055,11 +1055,30 @@ impl ImmutableRecord {
}
}
/// A cursor for lazily parsing SQLite record format data.
///
/// `RecordCursor` provides incremental parsing of SQLite records, which follow the format:
/// `[header_size][serial_type1][serial_type2]...[data1][data2]...`
///
/// Instead of parsing the entire record upfront, this cursor parses only what's needed
/// for the requested operations, improving performance for large records where only
/// a few columns are accessed.
///
/// SQLite records consist of:
/// - **Header size**: Varint indicating total header length
/// - **Serial types**: Variable-length integers describing each field's type and size
/// - **Data section**: The actual field data in the same order as serial types
///
/// The cursor does not hold a reference to the record it describes; the record is passed
/// to each method. A `Default`-constructed cursor has empty vectors and zeroed counters.
#[derive(Debug, Default)]
pub struct RecordCursor {
/// Parsed serial type values, one entry per column whose header has been parsed.
/// Serial types encode both the data type and size information.
pub serial_types: Vec<u64>,
/// Byte offsets where each column's data begins in the record payload.
/// Has one more entry than `serial_types` (the final offset marks the end).
/// NOTE(review): a `Default`-constructed cursor starts with both vectors empty, so this
/// presumably only holds once header parsing has started — confirm against
/// `ensure_parsed_upto`.
pub offsets: Vec<usize>,
/// Total size of the record header in bytes.
pub header_size: usize,
/// Current parsing position within the header section.
pub header_offset: usize,
}
@@ -1097,6 +1116,29 @@ impl RecordCursor {
self.ensure_parsed_upto(record, MAX_COLUMN)
}
/// Ensures the header is parsed up to (and including) the target column index.
///
/// This is the core lazy parsing method. It only parses as much of the header
/// as needed to access the requested column, making it efficient for sparse
/// column access patterns.
///
/// # Arguments
///
/// * `record` - The record containing the data to parse
/// * `target_idx` - The column index that needs to be accessible (0-based)
///
/// # Returns
///
/// * `Ok(())` - Parsing completed successfully
/// * `Err(LimboError)` - Parsing failed due to corrupt data or I/O error
///
/// # Behavior
///
/// - If `target_idx` is already parsed, returns immediately
/// - Parses incrementally from the current position to the target
/// - Handles the initial header size parsing on first call
/// - Calculates and caches data offsets for each parsed column
///
#[inline(always)]
pub fn ensure_parsed_upto(
&mut self,
@@ -1134,6 +1176,25 @@ impl RecordCursor {
Ok(())
}
/// Deserializes a specific column without additional parsing.
///
/// This method assumes the header has already been parsed up to the target
/// column index (via `ensure_parsed_upto`). It extracts the actual data
/// value from the record's data section.
///
/// # Arguments
///
/// * `record` - The record containing the data
/// * `idx` - The column index to deserialize (0-based)
///
/// # Returns
///
/// * `Ok(RefValue)` - The deserialized value (may reference record data)
/// * `Err(LimboError)` - Deserialization failed
///
/// # Special Cases
///
/// - Returns `RefValue::Null` for out-of-bounds indices
pub fn deserialize_column(&self, record: &ImmutableRecord, idx: usize) -> Result<RefValue> {
if idx >= self.serial_types.len() {
return Ok(RefValue::Null);
@@ -1162,6 +1223,21 @@ impl RecordCursor {
Ok(value)
}
/// Gets the value at the specified column index.
///
/// This is the primary method for accessing record data. It combines
/// lazy parsing with deserialization in a single call.
///
/// # Arguments
///
/// * `record` - The record to read from
/// * `idx` - The column index (0-based)
///
/// # Returns
///
/// * `Ok(RefValue)` - The value at the specified index
/// * `Err(LimboError)` - Access failed due to invalid record or parsing error
///
#[inline(always)]
pub fn get_value(&mut self, record: &ImmutableRecord, idx: usize) -> Result<RefValue> {
if record.is_invalidated() {
@@ -1172,6 +1248,19 @@ impl RecordCursor {
self.deserialize_column(record, idx)
}
/// Gets the value at the specified column index, or `None` if it is unavailable.
///
/// Deserialization errors are not swallowed: they are returned as `Some(Err(..))`.
///
/// # Arguments
///
/// * `record` - The record to read from
/// * `idx` - The column index (0-based)
///
/// # Returns
///
/// * `Some(Ok(RefValue))` - Successfully read value
/// * `Some(Err(LimboError))` - Parsing succeeded but deserialization failed
/// * `None` - Record is invalid or index is out of bounds
///
pub fn get_value_opt(
&mut self,
record: &ImmutableRecord,
@@ -1188,6 +1277,17 @@ impl RecordCursor {
Some(self.deserialize_column(record, idx))
}
/// Returns the number of columns in the record.
///
/// This method parses the complete header to determine the total
/// column count. The result is cached for subsequent calls.
///
/// # Arguments
///
/// * `record` - The record to count columns in
///
/// # Returns
///
/// The number of columns, or 0 if the record is invalid.
pub fn count(&mut self, record: &ImmutableRecord) -> usize {
if record.is_invalidated() {
return 0;
@@ -1197,10 +1297,33 @@ impl RecordCursor {
self.serial_types.len()
}
/// Alias for [`Self::count`]. Returns the number of columns in the record.
///
/// Takes `&mut self` only because it forwards to `count`, which itself requires
/// `&mut self`.
///
/// # Arguments
///
/// * `record` - The record to get length of
///
/// # Returns
///
/// The number of columns, or 0 if the record is invalid.
pub fn len(&mut self, record: &ImmutableRecord) -> usize {
self.count(record)
}
/// Returns all values in the record as a vector.
///
/// This method parses the complete header and deserializes all columns.
/// Use this when you need access to most or all columns in the record.
///
/// # Arguments
///
/// * `record` - The record to extract all values from
///
/// # Returns
///
/// * `Ok(Vec<RefValue>)` - All values in column order
/// * `Err(LimboError)` - Parsing or deserialization failed
///
pub fn get_values(&mut self, record: &ImmutableRecord) -> Result<Vec<RefValue>> {
if record.is_invalidated() {
return Ok(Vec::new());
@@ -1429,37 +1552,6 @@ pub fn compare_immutable(
std::cmp::Ordering::Equal
}
/// Reference comparison of two value slices, column by column, for use in tests.
///
/// Only the first `min(l.len(), r.len())` columns are examined; any extra trailing
/// values in the longer slice are ignored.
///
/// # Arguments
///
/// * `l` / `r` - The left and right value slices to compare
/// * `index_key_sort_order` - Supplies the sort direction of each column
/// * `collations` - Per-column collations; a column with no entry uses the default
/// * `tie_breaker` - Returned when every examined column compares equal
///
/// # Returns
///
/// The ordering of the first column that differs, reversed for `SortOrder::Desc`
/// columns, or `tie_breaker` if no examined column differs.
pub fn compare_immutable_for_testing(
    l: &[RefValue],
    r: &[RefValue],
    index_key_sort_order: IndexKeySortOrder,
    collations: &[CollationSeq],
    tie_breaker: std::cmp::Ordering,
) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    // `zip` stops at the shorter slice, which bounds the walk to the common prefix.
    for (col, (lhs, rhs)) in l.iter().zip(r.iter()).enumerate() {
        let direction = index_key_sort_order.get_sort_order_for_col(col);
        let collation = collations.get(col).copied().unwrap_or_default();

        // Text vs. text honours the column collation; every other pairing uses the
        // values' own partial order, with incomparable pairs treated as equal.
        let ordering = if let (RefValue::Text(a), RefValue::Text(b)) = (lhs, rhs) {
            collation.compare_strings(a.as_str(), b.as_str())
        } else {
            lhs.partial_cmp(rhs).unwrap_or(Ordering::Equal)
        };

        match (ordering, direction) {
            // Equal columns decide nothing; move on to the next one.
            (Ordering::Equal, _) => {}
            (decided, SortOrder::Asc) => return decided,
            (decided, SortOrder::Desc) => return decided.reverse(),
        }
    }

    tie_breaker
}
#[derive(Debug, Clone, Copy)]
pub enum RecordCompare {
Int,
@@ -2272,6 +2364,37 @@ mod tests {
use super::*;
use crate::translate::collate::CollationSeq;
/// Reference comparison of two value slices, column by column, for use in tests.
///
/// Only the first `min(l.len(), r.len())` columns are examined; any extra trailing
/// values in the longer slice are ignored.
///
/// # Arguments
///
/// * `l` / `r` - The left and right value slices to compare
/// * `index_key_sort_order` - Supplies the sort direction of each column
/// * `collations` - Per-column collations; a column with no entry uses the default
/// * `tie_breaker` - Returned when every examined column compares equal
///
/// # Returns
///
/// The ordering of the first column that differs, reversed for `SortOrder::Desc`
/// columns, or `tie_breaker` if no examined column differs.
pub fn compare_immutable_for_testing(
    l: &[RefValue],
    r: &[RefValue],
    index_key_sort_order: IndexKeySortOrder,
    collations: &[CollationSeq],
    tie_breaker: std::cmp::Ordering,
) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    // `zip` stops at the shorter slice, which bounds the walk to the common prefix.
    for (col, (lhs, rhs)) in l.iter().zip(r.iter()).enumerate() {
        let direction = index_key_sort_order.get_sort_order_for_col(col);
        let collation = collations.get(col).copied().unwrap_or_default();

        // Text vs. text honours the column collation; every other pairing uses the
        // values' own partial order, with incomparable pairs treated as equal.
        let ordering = if let (RefValue::Text(a), RefValue::Text(b)) = (lhs, rhs) {
            collation.compare_strings(a.as_str(), b.as_str())
        } else {
            lhs.partial_cmp(rhs).unwrap_or(Ordering::Equal)
        };

        match (ordering, direction) {
            // Equal columns decide nothing; move on to the next one.
            (Ordering::Equal, _) => {}
            (decided, SortOrder::Asc) => return decided,
            (decided, SortOrder::Desc) => return decided.reverse(),
        }
    }

    tie_breaker
}
fn create_record(values: Vec<Value>) -> ImmutableRecord {
let registers: Vec<Register> = values.into_iter().map(Register::Value).collect();
ImmutableRecord::from_registers(&registers, registers.len())