btree: avoid reading entire cell when only rowid needed

This commit is contained in:
Jussi Saurio
2025-04-17 10:43:02 +03:00
parent ac8ffa645d
commit 017cdb9568
2 changed files with 90 additions and 73 deletions

View File

@@ -1250,62 +1250,42 @@ impl BTreeCursor {
}
let cur_cell_idx = (min + max) / 2;
self.stack.set_cell_index(cur_cell_idx as i32);
let cur_cell = contents.cell_get(
cur_cell_idx as usize,
payload_overflow_threshold_max(
contents.page_type(),
self.usable_space() as u16,
),
payload_overflow_threshold_min(
contents.page_type(),
self.usable_space() as u16,
),
self.usable_space(),
)?;
match &cur_cell {
BTreeCell::TableInteriorCell(TableInteriorCell {
_left_child_page,
_rowid: cell_rowid,
}) => {
// in sqlite btrees left child pages have <= keys.
// table btrees can have a duplicate rowid in the interior cell, so for example if we are looking for rowid=10,
// and we find an interior cell with rowid=10, we need to move to the left page since (due to the <= rule of sqlite btrees)
// the left page may have a rowid=10.
// Logic table for determining if target leaf page is in left subtree
//
// Forwards iteration (looking for first match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// GT | > | go left | First > key is in left subtree
// GT | = or < | go right | First > key is in right subtree
// GE | > or = | go left | First >= key is in left subtree
// GE | < | go right | First >= key is in right subtree
//
// Backwards iteration (looking for last match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// LE | > or = | go left | Last <= key is in left subtree
// LE | < | go right | Last <= key is in right subtree
// LT | > or = | go left | Last < key is in left subtree
// LT | < | go right?| Last < key is in right subtree, except if cell rowid is exactly 1 less
//
// No iteration (point query):
// EQ | > or = | go left | Last = key is in left subtree
// EQ | < | go right | Last = key is in right subtree
let is_on_left = match seek_op {
SeekOp::GT => *cell_rowid > rowid,
SeekOp::GE => *cell_rowid >= rowid,
SeekOp::LE => *cell_rowid >= rowid,
SeekOp::LT => *cell_rowid + 1 >= rowid,
SeekOp::EQ => *cell_rowid >= rowid,
};
if is_on_left {
leftmost_matching_cell = Some(cur_cell_idx as usize);
max = cur_cell_idx - 1;
} else {
min = cur_cell_idx + 1;
}
}
_ => unreachable!("unexpected cell type: {:?}", cur_cell),
let cell_rowid = contents.cell_table_interior_read_rowid(cur_cell_idx as usize)?;
// in sqlite btrees left child pages have <= keys.
// table btrees can have a duplicate rowid in the interior cell, so for example if we are looking for rowid=10,
// and we find an interior cell with rowid=10, we need to move to the left page since (due to the <= rule of sqlite btrees)
// the left page may have a rowid=10.
// Logic table for determining if target leaf page is in left subtree
//
// Forwards iteration (looking for first match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// GT | > | go left | First > key is in left subtree
// GT | = or < | go right | First > key is in right subtree
// GE | > or = | go left | First >= key is in left subtree
// GE | < | go right | First >= key is in right subtree
//
// Backwards iteration (looking for last match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// LE | > or = | go left | Last <= key is in left subtree
// LE | < | go right | Last <= key is in right subtree
// LT | > or = | go left | Last < key is in left subtree
// LT | < | go right?| Last < key is in right subtree, except if cell rowid is exactly 1 less
//
// No iteration (point query):
// EQ | > or = | go left | Last = key is in left subtree
// EQ | < | go right | Last = key is in right subtree
let is_on_left = match seek_op {
SeekOp::GT => cell_rowid > rowid,
SeekOp::GE => cell_rowid >= rowid,
SeekOp::LE => cell_rowid >= rowid,
SeekOp::LT => cell_rowid + 1 >= rowid,
SeekOp::EQ => cell_rowid >= rowid,
};
if is_on_left {
leftmost_matching_cell = Some(cur_cell_idx as usize);
max = cur_cell_idx - 1;
} else {
min = cur_cell_idx + 1;
}
}
}
@@ -1508,23 +1488,7 @@ impl BTreeCursor {
let cur_cell_idx = (min + max) / 2;
self.stack.set_cell_index(cur_cell_idx as i32);
let cur_cell = contents.cell_get(
cur_cell_idx as usize,
payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16),
payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16),
self.usable_space(),
)?;
let BTreeCell::TableLeafCell(TableLeafCell {
_rowid: cell_rowid,
_payload,
first_overflow_page,
payload_size,
..
}) = cur_cell
else {
unreachable!("unexpected cell type: {:?}", cur_cell);
};
let cell_rowid = contents.cell_table_leaf_read_rowid(cur_cell_idx as usize)?;
let cmp = cell_rowid.cmp(&rowid);
@@ -1538,6 +1502,28 @@ impl BTreeCursor {
// rowids are unique, so we can return the rowid immediately
if found && SeekOp::EQ == seek_op {
let cur_cell = contents.cell_get(
cur_cell_idx as usize,
payload_overflow_threshold_max(
contents.page_type(),
self.usable_space() as u16,
),
payload_overflow_threshold_min(
contents.page_type(),
self.usable_space() as u16,
),
self.usable_space(),
)?;
let BTreeCell::TableLeafCell(TableLeafCell {
_rowid: _,
_payload,
first_overflow_page,
payload_size,
..
}) = cur_cell
else {
unreachable!("unexpected cell type: {:?}", cur_cell);
};
return_if_io!(self.read_record_w_possible_overflow(
_payload,
first_overflow_page,

View File

@@ -598,6 +598,37 @@ impl PageContent {
usable_size,
)
}
/// Read only the rowid of the table interior cell at index `idx`, without
/// decoding the rest of the cell.
///
/// The cell's start offset is looked up in the cell pointer array (2 bytes
/// per entry, located right after the 12-byte interior page header). The
/// rowid is the varint that follows the 4-byte left child page number at the
/// start of the cell.
///
/// # Errors
/// Propagates any error returned by `read_varint`.
///
/// # Panics
/// Panics if this page is not a `PageType::TableInterior` page. Slicing the
/// page buffer will also panic if the stored cell pointer lies beyond it.
#[inline(always)]
pub fn cell_table_interior_read_rowid(&self, idx: usize) -> Result<u64> {
    const INTERIOR_PAGE_HEADER_SIZE_BYTES: usize = 12;
    const LEFT_CHILD_PAGE_SIZE_BYTES: usize = 4;

    assert!(self.page_type() == PageType::TableInterior);

    // Slot `idx` of the cell pointer array holds the offset of the cell.
    // NOTE(review): relies on `read_u16` resolving this header-relative
    // position the same way the rest of the page accessors do — confirm.
    let pointer_slot = INTERIOR_PAGE_HEADER_SIZE_BYTES + idx * 2;
    let cell_start = self.read_u16(pointer_slot) as usize;

    // Skip the left child page number; the rowid varint comes next.
    let page = self.as_ptr();
    let rowid_start = cell_start + LEFT_CHILD_PAGE_SIZE_BYTES;
    let (rowid, _) = read_varint(&page[rowid_start..])?;
    Ok(rowid)
}
/// Read only the rowid of the table leaf cell at index `idx`, without
/// decoding the cell's payload.
///
/// The cell's start offset is looked up in the cell pointer array (2 bytes
/// per entry, located right after the 8-byte leaf page header). The cell
/// begins with one varint that is skipped here (presumably the payload size,
/// per the SQLite table leaf cell layout); the rowid is the varint that
/// immediately follows it.
///
/// # Errors
/// Propagates any error returned by `read_varint`.
///
/// # Panics
/// Panics if this page is not a `PageType::TableLeaf` page. Slicing the page
/// buffer will also panic if the stored cell pointer lies beyond it.
#[inline(always)]
pub fn cell_table_leaf_read_rowid(&self, idx: usize) -> Result<u64> {
    const LEAF_PAGE_HEADER_SIZE_BYTES: usize = 8;

    assert!(self.page_type() == PageType::TableLeaf);

    // Slot `idx` of the cell pointer array holds the offset of the cell.
    // NOTE(review): relies on `read_u16` resolving this header-relative
    // position the same way the rest of the page accessors do — confirm.
    let pointer_slot = LEAF_PAGE_HEADER_SIZE_BYTES + idx * 2;
    let cell_start = self.read_u16(pointer_slot) as usize;

    let page = self.as_ptr();
    // The first varint's value is not needed; only its encoded length is
    // used, to locate the rowid varint that comes right after it.
    let (_, leading_len) = read_varint(&page[cell_start..])?;
    let (rowid, _) = read_varint(&page[cell_start + leading_len..])?;
    Ok(rowid)
}
/// The cell pointer array of a b-tree page immediately follows the b-tree page header.
/// Let K be the number of cells on the btree.
/// The cell pointer array consists of K 2-byte integer offsets to the cell contents.