use tracing::{instrument, Level};
use crate::{
schema::Index,
storage::{
header_accessor,
pager::{BtreePageAllocMode, Pager},
sqlite3_ondisk::{
read_u32, read_varint, BTreeCell, PageContent, PageType, TableInteriorCell,
TableLeafCell, CELL_PTR_SIZE_BYTES, INTERIOR_PAGE_HEADER_SIZE_BYTES,
LEAF_PAGE_HEADER_SIZE_BYTES, LEFT_CHILD_PTR_SIZE_BYTES,
},
},
translate::plan::IterationDirection,
turso_assert,
types::{
find_compare, get_tie_breaker_from_seek_op, IndexInfo, ParseRecordState, RecordCompare,
RecordCursor, SeekResult,
},
MvCursor,
};
use crate::{
return_corrupt, return_if_io,
types::{compare_immutable, IOResult, ImmutableRecord, RefValue, SeekKey, SeekOp, Value},
LimboError, Result,
};
use super::{
pager::PageRef,
sqlite3_ondisk::{
write_varint_to_vec, IndexInteriorCell, IndexLeafCell, OverflowCell, DATABASE_HEADER_SIZE,
MINIMUM_CELL_SIZE,
},
};
#[cfg(debug_assertions)]
use std::collections::HashSet;
use std::{
cell::{Cell, Ref, RefCell},
cmp::{Ordering, Reverse},
collections::BinaryHeap,
fmt::Debug,
ops::DerefMut,
pin::Pin,
rc::Rc,
sync::Arc,
};
/// The B-Tree page header is 12 bytes for interior pages and 8 bytes for leaf pages.
///
/// +--------+-----------------+-----------------+-----------------+--------+----- ..... ----+
/// | Page | First Freeblock | Cell Count | Cell Content | Frag. | Right-most |
/// | Type | Offset | | Area Start | Bytes | pointer |
/// +--------+-----------------+-----------------+-----------------+--------+----- ..... ----+
/// 0 1 2 3 4 5 6 7 8 11
///
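/// For example, the cell count lives at bytes 3..5 of the header. A minimal
/// sketch of reading it from a raw page buffer whose header starts at `hdr`
/// (the real accessors live on [PageContent]):
///
/// ```ignore
/// let cell_count = u16::from_be_bytes([
///     buf[hdr + offset::BTREE_CELL_COUNT],
///     buf[hdr + offset::BTREE_CELL_COUNT + 1],
/// ]);
/// ```
///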
pub mod offset {
/// Type of the B-Tree page (u8).
pub const BTREE_PAGE_TYPE: usize = 0;
/// A pointer to the first freeblock (u16).
///
/// This field of the B-Tree page header is an offset to the first freeblock, or zero if
/// there are no freeblocks on the page. A freeblock is a structure used to identify
/// unallocated space within a B-Tree page, organized as a chain.
///
/// Please note that freeblocks do not mean the regular unallocated free space to the left
/// of the cell content area pointer, but instead blocks of at least 4
/// bytes WITHIN the cell content area that are not in use due to e.g.
/// deletions.
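///
/// Each freeblock starts with a 4-byte header: a big-endian u16 offset to the
/// next freeblock in the chain (zero for the last one), followed by a
/// big-endian u16 holding the size of the freeblock itself, header included.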
pub const BTREE_FIRST_FREEBLOCK: usize = 1;
/// The number of cells in the page (u16).
pub const BTREE_CELL_COUNT: usize = 3;
/// A pointer to the first byte of allocated cell content, counted from the top of the page (u16).
///
/// A zero value for this integer is interpreted as 65,536.
/// If a page contains no cells (which is only possible for a root page of a table that
/// contains no rows) then the offset to the cell content area will equal the page size minus
/// the bytes of reserved space. If the database uses a 65536-byte page size and the
/// reserved space is zero (the usual value for reserved space) then the cell content offset of
/// an empty page wants to be 65,536.
///
/// SQLite strives to place cells as far toward the end of the b-tree page as it can, in
/// order to leave space for future growth of the cell pointer array. This means that the
/// cell content area pointer moves leftward as cells are added to the page.
pub const BTREE_CELL_CONTENT_AREA: usize = 5;
/// The number of fragmented bytes (u8).
///
/// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area.
pub const BTREE_FRAGMENTED_BYTES_COUNT: usize = 7;
/// The right-most pointer (saved separately from cells) (u32)
pub const BTREE_RIGHTMOST_PTR: usize = 8;
}
/// Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than
/// this will be declared corrupt. This value is calculated based on a
/// maximum database size of 2^31 pages, a minimum fanout of 2 for a
/// root-node and 3 for all other internal nodes.
///
/// If a tree that appears to be taller than this is encountered, it is
/// assumed that the database is corrupt.
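///
/// Sanity check: a 21-level tree with these minimum fanouts would need at least
/// 2 * 3^19 (~2.3 billion) leaf pages, which already exceeds the 2^31 page limit.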
pub const BTCURSOR_MAX_DEPTH: usize = 20;
/// Maximum number of sibling pages that balancing is performed on.
pub const MAX_SIBLING_PAGES_TO_BALANCE: usize = 3;
/// We need at most 5 new pages to balance 3 pages, because cells from 3 pages are guaranteed to fit in 5 pages.
pub const MAX_NEW_SIBLING_PAGES_AFTER_BALANCE: usize = 5;
/// Return IO if the page is locked.
macro_rules! return_if_locked {
($expr:expr) => {{
if $expr.is_locked() {
return Ok(IOResult::IO);
}
}};
}
/// Validate that the cells in a page are in a valid state. Only runs in debug mode.
macro_rules! debug_validate_cells {
($page_contents:expr, $usable_space:expr) => {
#[cfg(debug_assertions)]
{
debug_validate_cells_core($page_contents, $usable_space);
}
};
}
/// Return IO if the page is locked. If the page is unlocked but not yet loaded, start loading it and return IO.
macro_rules! return_if_locked_maybe_load {
($pager:expr, $btree_page:expr) => {{
if $btree_page.get().is_locked() {
return Ok(IOResult::IO);
}
if !$btree_page.get().is_loaded() {
let page = $pager.read_page($btree_page.get().get().id)?;
$btree_page.page.replace(page);
return Ok(IOResult::IO);
}
}};
}
/// Wrapper around a page reference, so the reference can be swapped out if the page
/// was unloaded and had to be re-read.
pub struct BTreePageInner {
pub page: RefCell<PageRef>,
}
pub type BTreePage = Arc<BTreePageInner>;
unsafe impl Send for BTreePageInner {}
unsafe impl Sync for BTreePageInner {}
/// State machine of destroy operations.
/// Keeps track of traversal so that it can be resumed when IO is encountered.
#[derive(Debug, Clone)]
enum DestroyState {
Start,
LoadPage,
ProcessPage,
ClearOverflowPages { cell: BTreeCell },
FreePage,
}
struct DestroyInfo {
state: DestroyState,
}
#[derive(Debug, Clone)]
enum DeleteSavepoint {
Rowid(i64),
Payload(ImmutableRecord),
}
#[derive(Debug, Clone)]
enum DeleteState {
Start,
DeterminePostBalancingSeekKey,
LoadPage {
post_balancing_seek_key: Option<DeleteSavepoint>,
},
FindCell {
post_balancing_seek_key: Option<DeleteSavepoint>,
},
ClearOverflowPages {
cell_idx: usize,
cell: BTreeCell,
original_child_pointer: Option<u32>,
post_balancing_seek_key: Option<DeleteSavepoint>,
},
InteriorNodeReplacement {
page: PageRef,
/// the btree level of the page where the cell replacement happened.
/// if the replacement causes the page to overflow/underflow, we need to remember it and balance it
/// after the deletion process is otherwise complete.
btree_depth: usize,
cell_idx: usize,
original_child_pointer: Option<u32>,
post_balancing_seek_key: Option<DeleteSavepoint>,
},
CheckNeedsBalancing {
/// same as `InteriorNodeReplacement::btree_depth`
btree_depth: usize,
post_balancing_seek_key: Option<DeleteSavepoint>,
},
WaitForBalancingToComplete {
/// If provided, will also balance an ancestor page at depth `balance_ancestor_at_depth`.
/// If not provided, balancing will stop as soon as a level is encountered where no balancing is required.
balance_ancestor_at_depth: Option<usize>,
target_key: DeleteSavepoint,
},
SeekAfterBalancing {
target_key: DeleteSavepoint,
},
/// If the seek performed in [DeleteState::SeekAfterBalancing] returned a [SeekResult::TryAdvance] we need to call next()/prev() to get to the right location.
/// We need to have this separate state for re-entrancy as calling next()/prev() might yield on IO.
/// FIXME: refactor DeleteState not to have SeekAfterBalancing and instead use save_context() and restore_context()
TryAdvance,
}
#[derive(Clone)]
struct DeleteInfo {
state: DeleteState,
balance_write_info: Option<WriteInfo>,
}
/// State machine of a write operation.
/// May involve balancing due to overflow.
#[derive(Debug, Clone, Copy)]
enum WriteState {
Start,
BalanceStart,
BalanceFreePages {
curr_page: usize,
sibling_count_new: usize,
},
/// Choose which sibling pages to balance (max 3).
/// Generally, the siblings involved will be the page that triggered the balancing and its left and right siblings.
/// The exceptions are:
/// 1. If the leftmost page triggered balancing, up to 3 leftmost pages will be balanced.
/// 2. If the rightmost page triggered balancing, up to 3 rightmost pages will be balanced.
BalanceNonRootPickSiblings,
/// Perform the actual balancing. This will result in 1-5 pages depending on the number of total cells to be distributed
/// from the source pages.
BalanceNonRootDoBalancing,
Finish,
}
struct ReadPayloadOverflow {
payload: Vec<u8>,
next_page: u32,
remaining_to_read: usize,
page: BTreePage,
}
enum PayloadOverflowWithOffset {
SkipOverflowPages {
next_page: u32,
pages_left_to_skip: u32,
page_offset: u32,
amount: u32,
buffer_offset: usize,
is_write: bool,
},
ProcessPage {
next_page: u32,
remaining_to_read: u32,
page: BTreePage,
current_offset: usize,
buffer_offset: usize,
is_write: bool,
},
}
#[derive(Clone, Debug)]
pub enum BTreeKey<'a> {
TableRowId((i64, Option<&'a ImmutableRecord>)),
IndexKey(&'a ImmutableRecord),
}
impl BTreeKey<'_> {
/// Create a new table rowid key from a rowid and an optional immutable record.
/// The record is optional because it may not be available when the key is created.
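///
/// A minimal usage sketch:
///
/// ```ignore
/// // The record payload may not exist yet when the key is built:
/// let key = BTreeKey::new_table_rowid(42, None);
/// assert_eq!(key.maybe_rowid(), Some(42));
/// ```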
pub fn new_table_rowid(rowid: i64, record: Option<&ImmutableRecord>) -> BTreeKey<'_> {
BTreeKey::TableRowId((rowid, record))
}
/// Create a new index key from an immutable record.
pub fn new_index_key(record: &ImmutableRecord) -> BTreeKey<'_> {
BTreeKey::IndexKey(record)
}
/// Get the record, if present. An index key always has a record.
fn get_record(&self) -> Option<&'_ ImmutableRecord> {
match self {
BTreeKey::TableRowId((_, record)) => *record,
BTreeKey::IndexKey(record) => Some(record),
}
}
/// Get the rowid, if present. An index key never has a rowid.
fn maybe_rowid(&self) -> Option<i64> {
match self {
BTreeKey::TableRowId((rowid, _)) => Some(*rowid),
BTreeKey::IndexKey(_) => None,
}
}
/// Assert that the key is an integer rowid and return it.
fn to_rowid(&self) -> i64 {
match self {
BTreeKey::TableRowId((rowid, _)) => *rowid,
BTreeKey::IndexKey(_) => panic!("BTreeKey::to_rowid called on IndexKey"),
}
}
}
#[derive(Clone)]
struct BalanceInfo {
/// Old pages being balanced. At most 3 pages can be balanced at the same time.
pages_to_balance: [Option<BTreePage>; MAX_SIBLING_PAGES_TO_BALANCE],
/// Bookkeeping of the rightmost pointer so the offset::BTREE_RIGHTMOST_PTR can be updated.
rightmost_pointer: *mut u8,
/// Divider cells of old pages. There can be at most 2 divider cells, since at most 3 pages are balanced.
divider_cell_payloads: [Option<Vec<u8>>; MAX_SIBLING_PAGES_TO_BALANCE - 1],
/// Number of siblings being used to balance
sibling_count: usize,
/// First divider cell to remove that marks the first sibling
first_divider_cell: usize,
}
#[derive(Clone)]
struct WriteInfo {
/// State of the write operation state machine.
state: WriteState,
balance_info: RefCell<Option<BalanceInfo>>,
}
impl WriteInfo {
fn new() -> WriteInfo {
WriteInfo {
state: WriteState::Start,
balance_info: RefCell::new(None),
}
}
}
/// Holds the state machine for the operation that was in flight when the cursor
/// was suspended due to IO.
enum CursorState {
None,
ReadWritePayload(PayloadOverflowWithOffset),
Write(WriteInfo),
Destroy(DestroyInfo),
Delete(DeleteInfo),
}
impl CursorState {
fn write_info(&self) -> Option<&WriteInfo> {
match self {
CursorState::Write(x) => Some(x),
_ => None,
}
}
fn mut_write_info(&mut self) -> Option<&mut WriteInfo> {
match self {
CursorState::Write(x) => Some(x),
_ => None,
}
}
fn destroy_info(&self) -> Option<&DestroyInfo> {
match self {
CursorState::Destroy(x) => Some(x),
_ => None,
}
}
fn mut_destroy_info(&mut self) -> Option<&mut DestroyInfo> {
match self {
CursorState::Destroy(x) => Some(x),
_ => None,
}
}
fn delete_info(&self) -> Option<&DeleteInfo> {
match self {
CursorState::Delete(x) => Some(x),
_ => None,
}
}
fn mut_delete_info(&mut self) -> Option<&mut DeleteInfo> {
match self {
CursorState::Delete(x) => Some(x),
_ => None,
}
}
}
impl Debug for CursorState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Delete(..) => write!(f, "Delete"),
Self::Destroy(..) => write!(f, "Destroy"),
Self::None => write!(f, "None"),
Self::ReadWritePayload(..) => write!(f, "ReadWritePayload"),
Self::Write(..) => write!(f, "Write"),
}
}
}
enum OverflowState {
Start,
ProcessPage { next_page: u32 },
Done,
}
/// Holds a Record or RowId, so that these can be transformed into a SeekKey to restore
/// cursor position to its previous location.
pub enum CursorContext {
TableRowId(i64),
/// If we are in an index tree we can then reuse this field to save
/// our cursor information
IndexKeyRowId(ImmutableRecord),
}
/// In the future, we may expand these general validity states
#[derive(Debug, PartialEq, Eq)]
pub enum CursorValidState {
/// Cursor is pointing to an existing location/cell in the Btree
Valid,
/// Cursor may be pointing to a non-existent location/cell. This can happen after balancing operations
RequireSeek,
/// Cursor requires an advance after a seek
RequireAdvance(IterationDirection),
}
#[derive(Debug)]
/// State used for seeking
pub enum CursorSeekState {
Start,
MovingBetweenPages {
eq_seen: Cell<bool>,
},
InteriorPageBinarySearch {
min_cell_idx: Cell<isize>,
max_cell_idx: Cell<isize>,
nearest_matching_cell: Cell<Option<usize>>,
eq_seen: Cell<bool>,
},
FoundLeaf {
eq_seen: Cell<bool>,
},
LeafPageBinarySearch {
min_cell_idx: Cell<isize>,
max_cell_idx: Cell<isize>,
nearest_matching_cell: Cell<Option<usize>>,
/// Indicates if we have seen an exact match during the downwards traversal of the btree.
/// This is only needed in index seeks, in cases where we need to determine whether we call
/// an additional next()/prev() to fetch a matching record from an interior node. We will not
/// do that if both are true:
/// 1. We have not seen an EQ during the traversal
/// 2. We are looking for an exact match ([SeekOp::GE] or [SeekOp::LE] with eq_only: true)
eq_seen: Cell<bool>,
/// In multiple places, we do a seek that checks for an exact match (SeekOp::EQ) in the tree.
/// In those cases, we need to know where to land if we don't find an exact match in the leaf page.
/// For non-eq-only conditions (GT, LT, GE, LE), this is pretty simple:
/// - If we are looking for GT/GE and don't find a match, we should end up beyond the end of the page (idx=cell count).
/// - If we are looking for LT/LE and don't find a match, we should end up before the beginning of the page (idx=-1).
///
/// For eq-only conditions (GE { eq_only: true } or LE { eq_only: true }), we need to know where to land if we don't find an exact match.
/// For GE, we want to land at the first cell that is greater than the seek key.
/// For LE, we want to land at the last cell that is less than the seek key.
/// This is because e.g. when we attempt to insert rowid 666, we first check if it exists.
/// If it doesn't, we want to land in the place where rowid 666 WOULD be inserted.
target_cell_when_not_found: Cell<i32>,
},
}
pub struct BTreeCursor {
/// The multi-version cursor that is used to read and write to the database file.
mv_cursor: Option<Rc<RefCell<MvCursor>>>,
/// The pager that is used to read and write to the database file.
pager: Rc<Pager>,
/// Page id of the root page used to go back up fast.
root_page: usize,
/// Whether the cursor currently points at a record. Rowid and record are stored before being consumed.
has_record: Cell<bool>,
null_flag: bool,
/// Index interior pages are consumed on the way up, so we store a going-upwards flag in case
/// we just moved to a parent page and the parent is an interior index page that still needs
/// to be consumed.
going_upwards: bool,
/// Information maintained across execution attempts when an operation yields due to I/O.
state: CursorState,
/// Information maintained while freeing overflow pages. Maintained separately from cursor state since
/// any method could require freeing overflow pages
overflow_state: Option<OverflowState>,
/// Page stack used to traverse the btree.
/// Each cursor has a stack because each cursor traverses the btree independently.
stack: PageStack,
/// Reusable immutable record, used to allow better allocation strategy.
reusable_immutable_record: RefCell<Option<ImmutableRecord>>,
/// State machine used while parsing a record, kept across invocations for re-entrancy.
parse_record_state: RefCell<ParseRecordState>,
/// Information about the index key structure (sort order, collation, etc)
pub index_info: Option<IndexInfo>,
/// Maintain count of the number of records in the btree. Used for the `Count` opcode
count: usize,
/// Stores the cursor context before rebalancing so that a seek can be done later
context: Option<CursorContext>,
/// Whether the cursor is in a valid state, i.e. whether it is pointing to a valid cell index.
pub valid_state: CursorValidState,
seek_state: CursorSeekState,
/// Separate state to read a record with overflow pages. This separation from `state` is necessary as
/// we can be in a function that relies on `state`, but also needs to process overflow pages
read_overflow_state: RefCell<Option<ReadPayloadOverflow>>,
/// `RecordCursor` is used to parse SQLite record format data retrieved from B-tree
/// leaf pages. It provides incremental parsing, only deserializing the columns that are
/// actually accessed, which is crucial for performance when dealing with wide tables
/// where only a subset of columns are needed.
///
/// - Record parsing is logically a read operation from the caller's perspective
/// - But internally requires updating the cursor's cached parsing state
/// - Multiple methods may need to access different columns from the same record
///
/// # Lifecycle
///
/// The cursor is invalidated and reset when:
/// - Moving to a different record/row
/// - The underlying `ImmutableRecord` is modified
pub record_cursor: RefCell<RecordCursor>,
}
/// We store the cell index and cell count for each page in the stack.
/// The reason we store the cell count is because we need to know when we are at the end of the page,
/// without having to perform IO to get the ancestor pages.
#[derive(Debug, Clone, Copy, Default)]
struct BTreeNodeState {
cell_idx: i32,
cell_count: Option<i32>,
}
impl BTreeNodeState {
/// Check if the current cell index is at the end of the page.
/// This information is used to determine whether a child page should move up to its parent.
/// If the child page is the rightmost leaf page and it has reached the end, this means all of its ancestors have
/// already reached the end, so it should not go up because there are no more records to traverse.
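/// For example, on a page with 3 cells, cell_idx == 3 means the rightmost pointer
/// will be traversed next, while cell_idx == 4 means it has already been traversed,
/// i.e. the page is at its end.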
fn is_at_end(&self) -> bool {
let cell_count = self.cell_count.expect("cell_count is not set");
// cell_idx == cell_count means: we will traverse to the rightmost pointer next.
// cell_idx == cell_count + 1 means: we have already gone down to the rightmost pointer.
self.cell_idx == cell_count + 1
}
}
impl BTreeCursor {
pub fn new(
mv_cursor: Option<Rc<RefCell<MvCursor>>>,
pager: Rc<Pager>,
root_page: usize,
num_columns: usize,
) -> Self {
Self {
mv_cursor,
pager,
root_page,
has_record: Cell::new(false),
null_flag: false,
going_upwards: false,
state: CursorState::None,
overflow_state: None,
stack: PageStack {
current_page: Cell::new(-1),
node_states: RefCell::new([BTreeNodeState::default(); BTCURSOR_MAX_DEPTH + 1]),
stack: RefCell::new([const { None }; BTCURSOR_MAX_DEPTH + 1]),
},
reusable_immutable_record: RefCell::new(None),
index_info: None,
count: 0,
context: None,
valid_state: CursorValidState::Valid,
seek_state: CursorSeekState::Start,
read_overflow_state: RefCell::new(None),
parse_record_state: RefCell::new(ParseRecordState::Init),
record_cursor: RefCell::new(RecordCursor::with_capacity(num_columns)),
}
}
pub fn new_table(
mv_cursor: Option<Rc<RefCell<MvCursor>>>,
pager: Rc<Pager>,
root_page: usize,
num_columns: usize,
) -> Self {
Self::new(mv_cursor, pager, root_page, num_columns)
}
pub fn new_index(
mv_cursor: Option<Rc<RefCell<MvCursor>>>,
pager: Rc<Pager>,
root_page: usize,
index: &Index,
num_columns: usize,
) -> Self {
let mut cursor = Self::new(mv_cursor, pager, root_page, num_columns);
cursor.index_info = Some(IndexInfo::new_from_index(index));
cursor
}
pub fn has_rowid(&self) -> bool {
match &self.index_info {
Some(index_key_info) => index_key_info.has_rowid,
None => true, // currently we don't support WITHOUT ROWID tables
}
}
pub fn get_index_rowid_from_record(&self) -> Option<i64> {
if !self.has_rowid() {
return None;
}
let mut record_cursor_ref = self.record_cursor.borrow_mut();
let record_cursor = record_cursor_ref.deref_mut();
let rowid = match self
.get_immutable_record()
.as_ref()
.unwrap()
.last_value(record_cursor)
{
Some(Ok(RefValue::Integer(rowid))) => rowid,
_ => unreachable!(
"index where has_rowid() is true should have an integer rowid as the last value"
),
};
Some(rowid)
}
/// Check if the table is empty.
/// This is done by checking if the root page has no cells.
#[instrument(skip_all, level = Level::DEBUG)]
fn is_empty_table(&self) -> Result<IOResult<bool>> {
if let Some(mv_cursor) = &self.mv_cursor {
let mv_cursor = mv_cursor.borrow();
return Ok(IOResult::Done(mv_cursor.is_empty()));
}
let page = self.pager.read_page(self.root_page)?;
return_if_locked!(page);
let cell_count = page.get().contents.as_ref().unwrap().cell_count();
Ok(IOResult::Done(cell_count == 0))
}
/// Move the cursor to the previous record and return it.
/// Used in backwards iteration.
#[instrument(skip(self), level = Level::DEBUG, name = "prev")]
fn get_prev_record(&mut self) -> Result<IOResult<bool>> {
loop {
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
let cell_count = contents.cell_count();
let cell_idx = self.stack.current_cell_index();
// If we are at the end of the page and we haven't just come back from the right child,
// we now need to move to the rightmost child.
if self.stack.current_cell_index() == i32::MAX && !self.going_upwards {
let rightmost_pointer = contents.rightmost_pointer();
if let Some(rightmost_pointer) = rightmost_pointer {
let past_rightmost_pointer = cell_count as i32 + 1;
self.stack.set_cell_index(past_rightmost_pointer);
self.stack
.push_backwards(self.read_page(rightmost_pointer as usize)?);
continue;
}
}
if cell_idx >= cell_count as i32 {
self.stack.set_cell_index(cell_count as i32 - 1);
} else if !self.stack.current_cell_index_less_than_min() {
let is_index = page.is_index();
// skip retreat in case we still haven't visited this cell in index
let should_visit_internal_node = is_index && self.going_upwards; // we are going upwards, this means we still need to visit divider cell in an index
let page_type = contents.page_type();
if should_visit_internal_node {
self.going_upwards = false;
return Ok(IOResult::Done(true));
} else if matches!(
page_type,
PageType::IndexLeaf | PageType::TableLeaf | PageType::TableInterior
) {
self.stack.retreat();
}
}
// moved to beginning of current page
// todo: find a better way to flag moved to end or begin of page
if self.stack.current_cell_index_less_than_min() {
loop {
if self.stack.current_cell_index() >= 0 {
break;
}
if self.stack.has_parent() {
self.going_upwards = true;
self.stack.pop();
} else {
// moved to the beginning of the btree
return Ok(IOResult::Done(false));
}
}
// continue to next loop to get record from the new page
continue;
}
let cell_idx = self.stack.current_cell_index() as usize;
let cell = contents.cell_get(cell_idx, self.usable_space())?;
match cell {
BTreeCell::TableInteriorCell(TableInteriorCell {
left_child_page, ..
}) => {
let mem_page = self.read_page(left_child_page as usize)?;
self.stack.push_backwards(mem_page);
continue;
}
BTreeCell::TableLeafCell(TableLeafCell { .. }) => {
return Ok(IOResult::Done(true));
}
BTreeCell::IndexInteriorCell(IndexInteriorCell {
left_child_page, ..
}) => {
if !self.going_upwards {
// In backwards iteration, if we haven't just moved to this interior node from the
// right child, but instead are about to move to the left child, we need to retreat
// so that we don't come back to this node again.
// For example:
// this parent: key 666
// left child has: key 663, key 664, key 665
// we need to move to the previous parent (with e.g. key 662) when iterating backwards.
let mem_page = self.read_page(left_child_page as usize)?;
self.stack.retreat();
self.stack.push_backwards(mem_page);
continue;
}
// Going upwards = we just moved to an interior cell from the right child.
// On the first pass we must take the record from the interior cell (since unlike table btrees, index interior cells have payloads)
// We then mark going_upwards=false so that we go back down the tree on the next invocation.
self.going_upwards = false;
return Ok(IOResult::Done(true));
}
BTreeCell::IndexLeafCell(IndexLeafCell { .. }) => {
return Ok(IOResult::Done(true));
}
}
}
}
/// Reads the record of a cell that has overflow pages. This is a state machine that must be driven until completion, so everything
/// that calls this function must be reentrant.
#[instrument(skip_all, level = Level::DEBUG)]
fn process_overflow_read(
&self,
payload: &'static [u8],
start_next_page: u32,
payload_size: u64,
) -> Result<IOResult<()>> {
if self.read_overflow_state.borrow().is_none() {
let page = self.read_page(start_next_page as usize)?;
*self.read_overflow_state.borrow_mut() = Some(ReadPayloadOverflow {
payload: payload.to_vec(),
next_page: start_next_page,
remaining_to_read: payload_size as usize - payload.len(),
page,
});
return Ok(IOResult::IO);
}
let mut read_overflow_state = self.read_overflow_state.borrow_mut();
let ReadPayloadOverflow {
payload,
next_page,
remaining_to_read,
page: page_btree,
} = read_overflow_state.as_mut().unwrap();
if page_btree.get().is_locked() {
return Ok(IOResult::IO);
}
tracing::debug!(next_page, remaining_to_read, "reading overflow page");
let page = page_btree.get();
let contents = page.get_contents();
// The first four bytes of each overflow page are a big-endian integer which is the page number of the next page in the chain, or zero for the final page in the chain.
let next = contents.read_u32_no_offset(0);
let buf = contents.as_ptr();
let usable_space = self.pager.usable_space();
let to_read = (*remaining_to_read).min(usable_space - 4);
payload.extend_from_slice(&buf[4..4 + to_read]);
*remaining_to_read -= to_read;
if *remaining_to_read != 0 && next != 0 {
let new_page = self.pager.read_page(next as usize).map(|page| {
Arc::new(BTreePageInner {
page: RefCell::new(page),
})
})?;
*page_btree = new_page;
*next_page = next;
return Ok(IOResult::IO);
}
turso_assert!(
*remaining_to_read == 0 && next == 0,
"we can't have more pages to read while also have read everything"
);
let mut payload_swap = Vec::new();
std::mem::swap(payload, &mut payload_swap);
let mut reuse_immutable = self.get_immutable_record_or_create();
reuse_immutable.as_mut().unwrap().invalidate();
reuse_immutable
.as_mut()
.unwrap()
.start_serialization(&payload_swap);
self.record_cursor.borrow_mut().invalidate();
let _ = read_overflow_state.take();
Ok(IOResult::Done(()))
}
/// Calculates how much of a cell's payload should be stored locally vs in overflow pages
///
/// Parameters:
/// - payload_len: Total length of the payload data
/// - page_type: Type of the B-tree page (affects local storage thresholds)
///
/// Returns:
/// - A tuple of (n_local, payload_len) where:
/// - n_local: Amount of payload to store locally on the page
/// - payload_len: Total payload length (unchanged from input)
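///
/// # Example (illustrative, assuming SQLite's standard overflow thresholds)
///
/// With a 4096-byte usable size on a table leaf page, `max_local` is 4061 and
/// `min_local` is 489. A 5000-byte payload exceeds `max_local`, so:
///
/// ```text
/// surplus = 489 + (5000 - 489) % (4096 - 4) = 489 + 419 = 908
/// ```
///
/// Since 908 <= 4061, 908 bytes are stored locally and the remaining 4092
/// bytes spill to overflow pages.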
pub fn parse_cell_info(
&self,
payload_len: usize,
page_type: PageType,
usable_size: usize,
) -> Result<(usize, usize)> {
let max_local = payload_overflow_threshold_max(page_type, usable_size);
let min_local = payload_overflow_threshold_min(page_type, usable_size);
// This matches btreeParseCellAdjustSizeForOverflow logic
let n_local = if payload_len <= max_local {
// Common case - everything fits locally
payload_len
} else {
// For payloads that need overflow pages:
// Calculate how much should be stored locally using the following formula:
// surplus = min_local + (payload_len - min_local) % (usable_space - 4)
//
// This tries to minimize unused space on overflow pages while keeping
// the local storage between min_local and max_local thresholds.
// The (usable_space - 4) factor accounts for overhead in overflow pages.
let surplus = min_local + (payload_len - min_local) % (self.usable_space() - 4);
if surplus <= max_local {
surplus
} else {
min_local
}
};
Ok((n_local, payload_len))
}
/// This function is used to read/write into the payload of the cell that
/// the cursor is pointing to.
/// Parameters:
/// - offset: offset in the payload to start reading/writing
/// - buffer: buffer to read/write into
/// - amount: amount of bytes to read/write
/// - is_write: true if writing, false if reading
///
/// If the cell has overflow pages, it will skip ahead to the overflow page
/// that contains the given offset.
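///
/// A minimal sketch of driving this state machine to completion (the
/// `wait_for_pending_io` helper is hypothetical):
///
/// ```ignore
/// // Read 16 bytes starting at byte 100 of the current cell's payload.
/// let mut buf = Vec::new();
/// while let IOResult::IO = cursor.read_write_payload_with_offset(100, &mut buf, 16, false)? {
///     wait_for_pending_io()?; // hypothetical: block until page reads complete
/// }
/// ```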
#[instrument(skip_all, level = Level::DEBUG)]
pub fn read_write_payload_with_offset(
&mut self,
mut offset: u32,
buffer: &mut Vec<u8>,
mut amount: u32,
is_write: bool,
) -> Result<IOResult<()>> {
if let CursorState::ReadWritePayload(PayloadOverflowWithOffset::SkipOverflowPages {
..
})
| CursorState::ReadWritePayload(PayloadOverflowWithOffset::ProcessPage { .. }) =
&self.state
{
return self.continue_payload_overflow_with_offset(buffer, self.usable_space());
}
let page_btree = self.stack.top();
return_if_locked_maybe_load!(self.pager, page_btree);
let page = page_btree.get();
let contents = page.get().contents.as_ref().unwrap();
let cell_idx = self.stack.current_cell_index() as usize - 1;
if cell_idx >= contents.cell_count() {
return Err(LimboError::Corrupt("Invalid cell index".into()));
}
let usable_size = self.usable_space();
let cell = contents.cell_get(cell_idx, usable_size).unwrap();
let (payload, payload_size, first_overflow_page) = match cell {
BTreeCell::TableLeafCell(cell) => {
(cell.payload, cell.payload_size, cell.first_overflow_page)
}
BTreeCell::IndexLeafCell(cell) => {
(cell.payload, cell.payload_size, cell.first_overflow_page)
}
BTreeCell::IndexInteriorCell(cell) => {
(cell.payload, cell.payload_size, cell.first_overflow_page)
}
BTreeCell::TableInteriorCell(_) => {
return Err(LimboError::Corrupt(
"Cannot access payload of table interior cell".into(),
));
}
};
turso_assert!(
offset + amount <= payload_size as u32,
"offset + amount <= payload_size"
);
let (local_size, _) =
self.parse_cell_info(payload_size as usize, contents.page_type(), usable_size)?;
let mut bytes_processed: u32 = 0;
if offset < local_size as u32 {
let mut local_amount: u32 = amount;
if local_amount + offset > local_size as u32 {
local_amount = local_size as u32 - offset;
}
if is_write {
self.write_payload_to_page(
offset,
local_amount,
payload,
buffer,
page_btree.clone(),
);
} else {
self.read_payload_from_page(offset, local_amount, payload, buffer);
}
offset = 0;
amount -= local_amount;
bytes_processed += local_amount;
} else {
offset -= local_size as u32;
}
if amount > 0 {
if first_overflow_page.is_none() {
return Err(LimboError::Corrupt(
"Expected overflow page but none found".into(),
));
}
let overflow_size = usable_size - 4;
let pages_to_skip = offset / overflow_size as u32;
let page_offset = offset % overflow_size as u32;
self.state =
CursorState::ReadWritePayload(PayloadOverflowWithOffset::SkipOverflowPages {
next_page: first_overflow_page.unwrap(),
pages_left_to_skip: pages_to_skip,
page_offset,
amount,
buffer_offset: bytes_processed as usize,
is_write,
});
return Ok(IOResult::IO);
}
Ok(IOResult::Done(()))
}
#[instrument(skip_all, level = Level::DEBUG)]
pub fn continue_payload_overflow_with_offset(
&mut self,
buffer: &mut Vec<u8>,
usable_space: usize,
) -> Result<IOResult<()>> {
loop {
let mut state = std::mem::replace(&mut self.state, CursorState::None);
match &mut state {
CursorState::ReadWritePayload(PayloadOverflowWithOffset::SkipOverflowPages {
next_page,
pages_left_to_skip,
page_offset,
amount,
buffer_offset,
is_write,
}) => {
if *pages_left_to_skip == 0 {
let page = self.read_page(*next_page as usize)?;
return_if_locked_maybe_load!(self.pager, page);
self.state =
CursorState::ReadWritePayload(PayloadOverflowWithOffset::ProcessPage {
next_page: *next_page,
remaining_to_read: *amount,
page,
current_offset: *page_offset as usize,
buffer_offset: *buffer_offset,
is_write: *is_write,
});
continue;
}
let page = self.read_page(*next_page as usize)?;
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get_contents();
let next = contents.read_u32_no_offset(0);
if next == 0 {
return Err(LimboError::Corrupt(
"Overflow chain ends prematurely".into(),
));
}
*next_page = next;
*pages_left_to_skip -= 1;
self.state = CursorState::ReadWritePayload(
PayloadOverflowWithOffset::SkipOverflowPages {
next_page: next,
pages_left_to_skip: *pages_left_to_skip,
page_offset: *page_offset,
amount: *amount,
buffer_offset: *buffer_offset,
is_write: *is_write,
},
);
return Ok(IOResult::IO);
}
CursorState::ReadWritePayload(PayloadOverflowWithOffset::ProcessPage {
next_page,
remaining_to_read,
page: page_btree,
current_offset,
buffer_offset,
is_write,
}) => {
if page_btree.get().is_locked() {
self.state =
CursorState::ReadWritePayload(PayloadOverflowWithOffset::ProcessPage {
next_page: *next_page,
remaining_to_read: *remaining_to_read,
page: page_btree.clone(),
current_offset: *current_offset,
buffer_offset: *buffer_offset,
is_write: *is_write,
});
return Ok(IOResult::IO);
}
let page = page_btree.get();
let contents = page.get_contents();
let overflow_size = usable_space - 4;
let page_offset = *current_offset;
let bytes_to_process = std::cmp::min(
*remaining_to_read,
overflow_size as u32 - page_offset as u32,
);
let payload_offset = 4 + page_offset;
let page_payload = contents.as_ptr();
if *is_write {
self.write_payload_to_page(
payload_offset as u32,
bytes_to_process,
page_payload,
buffer,
page_btree.clone(),
);
} else {
self.read_payload_from_page(
payload_offset as u32,
bytes_to_process,
page_payload,
buffer,
);
}
*remaining_to_read -= bytes_to_process;
*buffer_offset += bytes_to_process as usize;
if *remaining_to_read == 0 {
self.state = CursorState::None;
return Ok(IOResult::Done(()));
}
let next = contents.read_u32_no_offset(0);
if next == 0 {
return Err(LimboError::Corrupt(
"Overflow chain ends prematurely".into(),
));
}
// Load next page
*next_page = next;
*current_offset = 0; // Reset offset for new page
*page_btree = self.read_page(next as usize)?;
// Return IO to allow other operations
return Ok(IOResult::IO);
}
_ => {
return Err(LimboError::InternalError(
"Invalid state for continue_payload_overflow_with_offset".into(),
))
}
}
}
}
fn read_payload_from_page(
&self,
payload_offset: u32,
num_bytes: u32,
payload: &[u8],
buffer: &mut Vec<u8>,
) {
buffer.extend_from_slice(
&payload[payload_offset as usize..(payload_offset + num_bytes) as usize],
);
}
/// This function writes from a buffer into a page.
/// SAFETY: This function uses unsafe in the write path to write to the page payload directly.
/// - Make sure the page is pointing to valid data, i.e. the page has not been evicted from the page cache.
fn write_payload_to_page(
&mut self,
payload_offset: u32,
num_bytes: u32,
payload: &[u8],
buffer: &mut [u8],
page: BTreePage,
) {
self.pager.add_dirty(&page.get());
// SAFETY: This is safe as long as the page is not evicted from the cache.
let payload_mut =
unsafe { std::slice::from_raw_parts_mut(payload.as_ptr() as *mut u8, payload.len()) };
payload_mut[payload_offset as usize..payload_offset as usize + num_bytes as usize]
.copy_from_slice(&buffer[..num_bytes as usize]);
}
/// Check if any ancestor pages still have cells to iterate.
/// If not, traversing back up to parent is of no use because we are at the end of the tree.
fn ancestor_pages_have_more_children(&self) -> bool {
let node_states = self.stack.node_states.borrow();
(0..self.stack.current())
.rev()
.any(|idx| !node_states[idx].is_at_end())
}
/// Move the cursor to the next record and return it.
/// Used in forwards iteration, which is the default.
#[instrument(skip(self), level = Level::DEBUG, name = "next")]
fn get_next_record(&mut self) -> Result<IOResult<bool>> {
if let Some(mv_cursor) = &self.mv_cursor {
let mut mv_cursor = mv_cursor.borrow_mut();
mv_cursor.forward();
let rowid = mv_cursor.current_row_id();
match rowid {
Some(_rowid) => {
return Ok(IOResult::Done(true));
}
None => return Ok(IOResult::Done(false)),
}
}
loop {
let mem_page_rc = self.stack.top();
return_if_locked_maybe_load!(self.pager, mem_page_rc);
let mem_page = mem_page_rc.get();
let contents = mem_page.get().contents.as_ref().unwrap();
let cell_count = contents.cell_count();
tracing::debug!(
id = mem_page_rc.get().get().id,
cell = self.stack.current_cell_index(),
cell_count,
"current_before_advance",
);
let is_index = mem_page_rc.get().is_index();
let should_skip_advance = is_index
&& self.going_upwards // we are going upwards, this means we still need to visit divider cell in an index
&& self.stack.current_cell_index() >= 0 && self.stack.current_cell_index() < cell_count as i32; // if we weren't on a
// valid cell then it means we will have to move upwards again or move to the right page;
// either way, we won't visit this invalid cell index
if should_skip_advance {
tracing::debug!(
going_upwards = self.going_upwards,
page = mem_page_rc.get().get().id,
cell_idx = self.stack.current_cell_index(),
"skipping advance",
);
self.going_upwards = false;
return Ok(IOResult::Done(true));
}
// Important to advance only after loading the page so that we don't advance more than once
self.stack.advance();
let cell_idx = self.stack.current_cell_index() as usize;
tracing::debug!(id = mem_page_rc.get().get().id, cell = cell_idx, "current");
if cell_idx >= cell_count {
let rightmost_already_traversed = cell_idx > cell_count;
match (contents.rightmost_pointer(), rightmost_already_traversed) {
(Some(right_most_pointer), false) => {
// do rightmost
self.stack.advance();
let mem_page = self.read_page(right_most_pointer as usize)?;
self.stack.push(mem_page);
continue;
}
_ => {
if self.ancestor_pages_have_more_children() {
tracing::trace!("moving simple upwards");
self.going_upwards = true;
self.stack.pop();
continue;
} else {
// If none of the ancestor pages have more children to iterate, that means we are at the end of the btree and should stop iterating.
return Ok(IOResult::Done(false));
}
}
}
}
turso_assert!(
cell_idx < contents.cell_count(),
"cell index out of bounds: cell_idx={}, cell_count={}, page_type={:?} page_id={}",
cell_idx,
contents.cell_count(),
contents.page_type(),
mem_page_rc.get().get().id
);
let cell = contents.cell_get(cell_idx, self.usable_space())?;
match &cell {
BTreeCell::TableInteriorCell(TableInteriorCell {
left_child_page, ..
}) => {
let mem_page = self.read_page(*left_child_page as usize)?;
self.stack.push(mem_page);
continue;
}
BTreeCell::TableLeafCell(TableLeafCell { .. }) => {
return Ok(IOResult::Done(true));
}
BTreeCell::IndexInteriorCell(IndexInteriorCell {
left_child_page, ..
}) => {
if self.going_upwards {
self.going_upwards = false;
return Ok(IOResult::Done(true));
} else {
let mem_page = self.read_page(*left_child_page as usize)?;
self.stack.push(mem_page);
continue;
}
}
BTreeCell::IndexLeafCell(IndexLeafCell { .. }) => {
return Ok(IOResult::Done(true));
}
}
}
}
/// Move the cursor to the record that matches the seek key and seek operation.
/// This may be used to seek to a specific record in a point query (e.g. SELECT * FROM table WHERE col = 10)
/// or e.g. find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10).
/// We don't include the rowid in the comparison, which is why the last value of the record is excluded.
fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result<IOResult<SeekResult>> {
let ret = return_if_io!(match key {
SeekKey::TableRowId(rowid) => {
self.tablebtree_seek(rowid, op)
}
SeekKey::IndexKey(index_key) => {
self.indexbtree_seek(index_key, op)
}
});
self.valid_state = CursorValidState::Valid;
Ok(IOResult::Done(ret))
}
/// Move the cursor to the root page of the btree.
#[instrument(skip_all, level = Level::DEBUG)]
fn move_to_root(&mut self) -> Result<()> {
self.seek_state = CursorSeekState::Start;
self.going_upwards = false;
tracing::trace!(root_page = self.root_page);
let mem_page = self.read_page(self.root_page)?;
self.stack.clear();
self.stack.push(mem_page);
Ok(())
}
/// Move the cursor to the rightmost record in the btree.
#[instrument(skip(self), level = Level::DEBUG)]
fn move_to_rightmost(&mut self) -> Result<IOResult<bool>> {
self.move_to_root()?;
loop {
let mem_page = self.stack.top();
let page_idx = mem_page.get().get().id;
let page = self.read_page(page_idx)?;
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
if contents.is_leaf() {
if contents.cell_count() > 0 {
self.stack.set_cell_index(contents.cell_count() as i32 - 1);
return Ok(IOResult::Done(true));
}
return Ok(IOResult::Done(false));
}
match contents.rightmost_pointer() {
Some(right_most_pointer) => {
self.stack.set_cell_index(contents.cell_count() as i32 + 1);
let mem_page = self.read_page(right_most_pointer as usize)?;
self.stack.push(mem_page);
continue;
}
None => {
unreachable!("interior page should have a rightmost pointer");
}
}
}
}
/// Specialized version of move_to() for table btrees.
#[instrument(skip(self), level = Level::DEBUG)]
fn tablebtree_move_to(&mut self, rowid: i64, seek_op: SeekOp) -> Result<IOResult<()>> {
'outer: loop {
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
if contents.is_leaf() {
self.seek_state = CursorSeekState::FoundLeaf {
eq_seen: Cell::new(false),
};
return Ok(IOResult::Done(()));
}
let cell_count = contents.cell_count();
if matches!(
self.seek_state,
CursorSeekState::Start | CursorSeekState::MovingBetweenPages { .. }
) {
let eq_seen = match &self.seek_state {
CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(),
_ => false,
};
let min_cell_idx = Cell::new(0);
let max_cell_idx = Cell::new(cell_count as isize - 1);
let nearest_matching_cell = Cell::new(None);
self.seek_state = CursorSeekState::InteriorPageBinarySearch {
min_cell_idx,
max_cell_idx,
nearest_matching_cell,
eq_seen: Cell::new(eq_seen),
};
}
let CursorSeekState::InteriorPageBinarySearch {
min_cell_idx,
max_cell_idx,
nearest_matching_cell,
eq_seen,
..
} = &self.seek_state
else {
unreachable!("we must be in an interior binary search state");
};
loop {
let min = min_cell_idx.get();
let max = max_cell_idx.get();
if min > max {
if let Some(nearest_matching_cell) = nearest_matching_cell.get() {
let left_child_page =
contents.cell_interior_read_left_child_page(nearest_matching_cell);
self.stack.set_cell_index(nearest_matching_cell as i32);
let mem_page = self.read_page(left_child_page as usize)?;
self.stack.push(mem_page);
self.seek_state = CursorSeekState::MovingBetweenPages {
eq_seen: Cell::new(eq_seen.get()),
};
continue 'outer;
}
self.stack.set_cell_index(cell_count as i32 + 1);
match contents.rightmost_pointer() {
Some(right_most_pointer) => {
let mem_page = self.read_page(right_most_pointer as usize)?;
self.stack.push(mem_page);
self.seek_state = CursorSeekState::MovingBetweenPages {
eq_seen: Cell::new(eq_seen.get()),
};
continue 'outer;
}
None => {
unreachable!("we shall not go back up! The only way is down the slope");
}
}
}
let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here.
let cell_rowid = contents.cell_table_interior_read_rowid(cur_cell_idx as usize)?;
// in sqlite btrees left child pages have <= keys.
// table btrees can have a duplicate rowid in the interior cell, so for example if we are looking for rowid=10,
// and we find an interior cell with rowid=10, we need to move to the left page since (due to the <= rule of sqlite btrees)
// the left page may have a rowid=10.
// Logic table for determining if target leaf page is in left subtree
//
// Forwards iteration (looking for first match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// GT | > | go left | First > key is in left subtree
// GT | = or < | go right | First > key is in right subtree
// GE | > or = | go left | First >= key is in left subtree
// GE | < | go right | First >= key is in right subtree
//
// Backwards iteration (looking for last match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// LE | > or = | go left | Last <= key is in left subtree
// LE | < | go right | Last <= key is in right subtree
// LT | > or = | go left | Last < key is in left subtree
// LT | < | go right | Last < key is in right subtree, except if the cell rowid is exactly key-1 (then go left)
//
// No iteration (point query):
// EQ | > or = | go left | Last = key is in left subtree
// EQ | < | go right | Last = key is in right subtree
let is_on_left = match seek_op {
SeekOp::GT => cell_rowid > rowid,
SeekOp::GE { .. } => cell_rowid >= rowid,
SeekOp::LE { .. } => cell_rowid >= rowid,
SeekOp::LT => cell_rowid + 1 >= rowid,
};
if is_on_left {
nearest_matching_cell.set(Some(cur_cell_idx as usize));
max_cell_idx.set(cur_cell_idx - 1);
} else {
min_cell_idx.set(cur_cell_idx + 1);
}
}
}
}
/// Specialized version of move_to() for index btrees.
#[instrument(skip(self, index_key), level = Level::DEBUG)]
fn indexbtree_move_to(
&mut self,
index_key: &ImmutableRecord,
cmp: SeekOp,
) -> Result<IOResult<()>> {
let iter_dir = cmp.iteration_direction();
let key_values = index_key.get_values();
let record_comparer = {
let index_info = self
.index_info
.as_ref()
.expect("indexbtree_move_to without index_info");
find_compare(&key_values, index_info)
};
tracing::debug!("Using record comparison strategy: {:?}", record_comparer);
let tie_breaker = get_tie_breaker_from_seek_op(cmp);
'outer: loop {
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
if contents.is_leaf() {
let eq_seen = match &self.seek_state {
CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(),
_ => false,
};
self.seek_state = CursorSeekState::FoundLeaf {
eq_seen: Cell::new(eq_seen),
};
return Ok(IOResult::Done(()));
}
if matches!(
self.seek_state,
CursorSeekState::Start | CursorSeekState::MovingBetweenPages { .. }
) {
let eq_seen = match &self.seek_state {
CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(),
_ => false,
};
let cell_count = contents.cell_count();
let min_cell_idx = Cell::new(0);
let max_cell_idx = Cell::new(cell_count as isize - 1);
let nearest_matching_cell = Cell::new(None);
self.seek_state = CursorSeekState::InteriorPageBinarySearch {
min_cell_idx,
max_cell_idx,
nearest_matching_cell,
eq_seen: Cell::new(eq_seen),
};
}
let CursorSeekState::InteriorPageBinarySearch {
min_cell_idx,
max_cell_idx,
nearest_matching_cell,
eq_seen,
} = &self.seek_state
else {
unreachable!(
"we must be in an interior binary search state, got {:?}",
self.seek_state
);
};
loop {
let min = min_cell_idx.get();
let max = max_cell_idx.get();
if min > max {
let Some(leftmost_matching_cell) = nearest_matching_cell.get() else {
self.stack.set_cell_index(contents.cell_count() as i32 + 1);
match contents.rightmost_pointer() {
Some(right_most_pointer) => {
let mem_page = self.read_page(right_most_pointer as usize)?;
self.stack.push(mem_page);
self.seek_state = CursorSeekState::MovingBetweenPages {
eq_seen: Cell::new(eq_seen.get()),
};
continue 'outer;
}
None => {
unreachable!(
"we shall not go back up! The only way is down the slope"
);
}
}
};
let matching_cell =
contents.cell_get(leftmost_matching_cell, self.usable_space())?;
self.stack.set_cell_index(leftmost_matching_cell as i32);
// we don't advance in case of forward iteration and index tree internal nodes because we will visit this node going up.
// in backwards iteration, we must retreat because otherwise we would unnecessarily visit this node again.
// Example:
// this parent: key 666, and we found the target key in the left child.
// left child has: key 663, key 664, key 665
// we need to move to the previous parent (with e.g. key 662) when iterating backwards so that we don't end up back here again.
if iter_dir == IterationDirection::Backwards {
self.stack.retreat();
}
let BTreeCell::IndexInteriorCell(IndexInteriorCell {
left_child_page, ..
}) = &matching_cell
else {
unreachable!("unexpected cell type: {:?}", matching_cell);
};
turso_assert!(
page.get().id != *left_child_page as usize,
"corrupt: current page and left child page of cell {} are both {}",
leftmost_matching_cell,
page.get().id
);
let mem_page = self.read_page(*left_child_page as usize)?;
self.stack.push(mem_page);
self.seek_state = CursorSeekState::MovingBetweenPages {
eq_seen: Cell::new(eq_seen.get()),
};
continue 'outer;
}
let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here.
self.stack.set_cell_index(cur_cell_idx as i32);
let cell = contents.cell_get(cur_cell_idx as usize, self.usable_space())?;
let BTreeCell::IndexInteriorCell(IndexInteriorCell {
payload,
payload_size,
first_overflow_page,
..
}) = &cell
else {
unreachable!("unexpected cell type: {:?}", cell);
};
if let Some(next_page) = first_overflow_page {
return_if_io!(self.process_overflow_read(payload, *next_page, *payload_size))
} else {
self.get_immutable_record_or_create()
.as_mut()
.unwrap()
.invalidate();
self.get_immutable_record_or_create()
.as_mut()
.unwrap()
.start_serialization(payload);
self.record_cursor.borrow_mut().invalidate();
};
let (target_leaf_page_is_in_left_subtree, is_eq) = {
let record = self.get_immutable_record();
let record = record.as_ref().unwrap();
let interior_cell_vs_index_key = record_comparer
.compare(
record,
&key_values,
self.index_info
.as_ref()
.expect("indexbtree_move_to without index_info"),
0,
tie_breaker,
)
.unwrap();
// in sqlite btrees left child pages have <= keys.
// in general, in forwards iteration we want to find the first key that matches the seek condition.
// in backwards iteration we want to find the last key that matches the seek condition.
//
// Logic table for determining if target leaf page is in left subtree.
// For index b-trees this is a bit more complicated since the interior cells contain payloads (the key is the payload).
// and for non-unique indexes there might be several cells with the same key.
//
// Forwards iteration (looking for first match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// GT | > | go left | First > key could be exactly this one, or in left subtree
// GT | = or < | go right | First > key must be in right subtree
// GE | > | go left | First >= key could be exactly this one, or in left subtree
// GE | = | go left | First >= key could be exactly this one, or in left subtree
// GE | < | go right | First >= key must be in right subtree
//
// Backwards iteration (looking for last match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// LE | > | go left | Last <= key must be in left subtree
// LE | = | go right | Last <= key is either this one, or somewhere to the right of this one. So we need to go right to make sure
// LE | < | go right | Last <= key must be in right subtree
// LT | > | go left | Last < key must be in left subtree
// LT | = | go left | Last < key must be in left subtree since we want strictly less than
// LT | < | go right | Last < key could be exactly this one, or in right subtree
//
// No iteration (point query):
// EQ | > | go left | First = key must be in left subtree
// EQ | = | go left | First = key could be exactly this one, or in left subtree
// EQ | < | go right | First = key must be in right subtree
(
match cmp {
SeekOp::GT => interior_cell_vs_index_key.is_gt(),
SeekOp::GE { .. } => interior_cell_vs_index_key.is_ge(),
SeekOp::LE { .. } => interior_cell_vs_index_key.is_gt(),
SeekOp::LT => interior_cell_vs_index_key.is_ge(),
},
interior_cell_vs_index_key.is_eq(),
)
};
if is_eq {
eq_seen.set(true);
}
if target_leaf_page_is_in_left_subtree {
nearest_matching_cell.set(Some(cur_cell_idx as usize));
max_cell_idx.set(cur_cell_idx - 1);
} else {
min_cell_idx.set(cur_cell_idx + 1);
}
}
}
}
/// Specialized version of do_seek() for table btrees that uses binary search instead
/// of iterating cells in order.
#[instrument(skip_all, level = Level::DEBUG)]
fn tablebtree_seek(&mut self, rowid: i64, seek_op: SeekOp) -> Result<IOResult<SeekResult>> {
turso_assert!(
self.mv_cursor.is_none(),
"attempting to seek with MV cursor"
);
let iter_dir = seek_op.iteration_direction();
if matches!(
self.seek_state,
CursorSeekState::Start
| CursorSeekState::MovingBetweenPages { .. }
| CursorSeekState::InteriorPageBinarySearch { .. }
) {
// No need for another move_to_root; move_to() already moves to the root.
return_if_io!(self.move_to(SeekKey::TableRowId(rowid), seek_op));
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
turso_assert!(
contents.is_leaf(),
"tablebtree_seek() called on non-leaf page"
);
let cell_count = contents.cell_count();
if cell_count == 0 {
self.stack.set_cell_index(0);
return Ok(IOResult::Done(SeekResult::NotFound));
}
let min_cell_idx = Cell::new(0);
let max_cell_idx = Cell::new(cell_count as isize - 1);
// If iter dir is forwards, we want the first cell that matches;
// If iter dir is backwards, we want the last cell that matches.
let nearest_matching_cell = Cell::new(None);
self.seek_state = CursorSeekState::LeafPageBinarySearch {
min_cell_idx,
max_cell_idx,
nearest_matching_cell,
eq_seen: Cell::new(false), // not relevant for table btrees
target_cell_when_not_found: Cell::new(match seek_op.iteration_direction() {
IterationDirection::Forwards => cell_count as i32,
IterationDirection::Backwards => -1,
}),
};
}
let CursorSeekState::LeafPageBinarySearch {
min_cell_idx,
max_cell_idx,
nearest_matching_cell,
target_cell_when_not_found,
..
} = &self.seek_state
else {
unreachable!("we must be in a leaf binary search state");
};
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
loop {
let min = min_cell_idx.get();
let max = max_cell_idx.get();
if min > max {
if let Some(nearest_matching_cell) = nearest_matching_cell.get() {
self.stack.set_cell_index(nearest_matching_cell as i32);
self.has_record.set(true);
return Ok(IOResult::Done(SeekResult::Found));
} else {
// if !eq_only, a matching entry can exist in a neighbouring leaf page.
// This can happen if a key in the interior page was deleted but the divider was kept untouched;
// in that case the BTree can navigate to a leaf that no longer has a matching key for seek_op,
// and the caller must then advance the cursor if necessary.
return Ok(IOResult::Done(if seek_op.eq_only() {
let has_record = target_cell_when_not_found.get() >= 0
&& target_cell_when_not_found.get() < contents.cell_count() as i32;
self.has_record.set(has_record);
self.stack.set_cell_index(target_cell_when_not_found.get());
SeekResult::NotFound
} else {
// set cursor to the position that would hold the op boundary if it were present
self.stack.set_cell_index(target_cell_when_not_found.get());
SeekResult::TryAdvance
}));
};
}
let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here.
let cell_rowid = contents.cell_table_leaf_read_rowid(cur_cell_idx as usize)?;
let cmp = cell_rowid.cmp(&rowid);
let found = match seek_op {
SeekOp::GT => cmp.is_gt(),
SeekOp::GE { eq_only: true } => cmp.is_eq(),
SeekOp::GE { eq_only: false } => cmp.is_ge(),
SeekOp::LE { eq_only: true } => cmp.is_eq(),
SeekOp::LE { eq_only: false } => cmp.is_le(),
SeekOp::LT => cmp.is_lt(),
};
// rowids are unique, so we can return the rowid immediately
if found && seek_op.eq_only() {
self.stack.set_cell_index(cur_cell_idx as i32);
self.has_record.set(true);
return Ok(IOResult::Done(SeekResult::Found));
}
if found {
nearest_matching_cell.set(Some(cur_cell_idx as usize));
match iter_dir {
IterationDirection::Forwards => {
max_cell_idx.set(cur_cell_idx - 1);
}
IterationDirection::Backwards => {
min_cell_idx.set(cur_cell_idx + 1);
}
}
} else if cmp.is_gt() {
if matches!(seek_op, SeekOp::GE { eq_only: true }) {
target_cell_when_not_found
.set(target_cell_when_not_found.get().min(cur_cell_idx as i32));
}
max_cell_idx.set(cur_cell_idx - 1);
} else if cmp.is_lt() {
if matches!(seek_op, SeekOp::LE { eq_only: true }) {
target_cell_when_not_found
.set(target_cell_when_not_found.get().max(cur_cell_idx as i32));
}
min_cell_idx.set(cur_cell_idx + 1);
} else {
match iter_dir {
IterationDirection::Forwards => {
min_cell_idx.set(cur_cell_idx + 1);
}
IterationDirection::Backwards => {
max_cell_idx.set(cur_cell_idx - 1);
}
}
}
}
}
#[instrument(skip_all, level = Level::DEBUG)]
fn indexbtree_seek(
&mut self,
key: &ImmutableRecord,
seek_op: SeekOp,
) -> Result<IOResult<SeekResult>> {
let key_values = key.get_values();
let record_comparer = {
let index_info = self
.index_info
.as_ref()
.expect("indexbtree_seek without index_info");
find_compare(&key_values, index_info)
};
tracing::debug!(
"Using record comparison strategy for seek: {:?}",
record_comparer
);
if matches!(
self.seek_state,
CursorSeekState::Start
| CursorSeekState::MovingBetweenPages { .. }
| CursorSeekState::InteriorPageBinarySearch { .. }
) {
            // No need for another move_to_root here: move_to() already moves to the root.
return_if_io!(self.move_to(SeekKey::IndexKey(key), seek_op));
let CursorSeekState::FoundLeaf { eq_seen } = &self.seek_state else {
unreachable!(
"We must still be in FoundLeaf state after move_to, got: {:?}",
self.seek_state
);
};
let eq_seen = eq_seen.get();
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
let cell_count = contents.cell_count();
if cell_count == 0 {
return Ok(IOResult::Done(SeekResult::NotFound));
}
let min = Cell::new(0);
let max = Cell::new(cell_count as isize - 1);
// If iter dir is forwards, we want the first cell that matches;
// If iter dir is backwards, we want the last cell that matches.
let nearest_matching_cell = Cell::new(None);
self.seek_state = CursorSeekState::LeafPageBinarySearch {
min_cell_idx: min,
max_cell_idx: max,
nearest_matching_cell,
eq_seen: Cell::new(eq_seen),
target_cell_when_not_found: Cell::new(match seek_op.iteration_direction() {
IterationDirection::Forwards => cell_count as i32,
IterationDirection::Backwards => -1,
}),
};
}
let CursorSeekState::LeafPageBinarySearch {
min_cell_idx,
max_cell_idx,
nearest_matching_cell,
eq_seen,
target_cell_when_not_found,
} = &self.seek_state
else {
unreachable!(
"we must be in a leaf binary search state, got: {:?}",
self.seek_state
);
};
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
let iter_dir = seek_op.iteration_direction();
loop {
let min = min_cell_idx.get();
let max = max_cell_idx.get();
if min > max {
if let Some(nearest_matching_cell) = nearest_matching_cell.get() {
self.stack.set_cell_index(nearest_matching_cell as i32);
self.has_record.set(true);
return Ok(IOResult::Done(SeekResult::Found));
} else {
                    // set cursor to the position that would hold the op-boundary if it were present
let target_cell = target_cell_when_not_found.get();
self.stack.set_cell_index(target_cell);
let has_record = target_cell >= 0 && target_cell < contents.cell_count() as i32;
self.has_record.set(has_record);
                    // Similar logic to tablebtree_seek(), but for indexes.
                    // The difference is that since index keys are not necessarily unique, we need to TryAdvance
                    // even when eq_only=true, if we have seen an EQ match higher up in the tree in an interior node.
if seek_op.eq_only() && !eq_seen.get() {
return Ok(IOResult::Done(SeekResult::NotFound));
}
return Ok(IOResult::Done(SeekResult::TryAdvance));
};
}
let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here.
self.stack.set_cell_index(cur_cell_idx as i32);
let cell = contents.cell_get(cur_cell_idx as usize, self.usable_space())?;
let BTreeCell::IndexLeafCell(IndexLeafCell {
payload,
first_overflow_page,
payload_size,
}) = &cell
else {
unreachable!("unexpected cell type: {:?}", cell);
};
if let Some(next_page) = first_overflow_page {
return_if_io!(self.process_overflow_read(payload, *next_page, *payload_size))
} else {
self.get_immutable_record_or_create()
.as_mut()
.unwrap()
.invalidate();
self.get_immutable_record_or_create()
.as_mut()
.unwrap()
.start_serialization(payload);
self.record_cursor.borrow_mut().invalidate();
};
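            // At this point the full cell payload (including any overflow chain) has been
            // loaded into the cursor's reusable record buffer, ready for the key comparison
            // below.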
let (cmp, found) = self.compare_with_current_record(
key_values.as_slice(),
seek_op,
&record_comparer,
self.index_info
.as_ref()
.expect("indexbtree_seek without index_info"),
);
if found {
nearest_matching_cell.set(Some(cur_cell_idx as usize));
match iter_dir {
IterationDirection::Forwards => {
max_cell_idx.set(cur_cell_idx - 1);
}
IterationDirection::Backwards => {
min_cell_idx.set(cur_cell_idx + 1);
}
}
} else if cmp.is_gt() {
if matches!(seek_op, SeekOp::GE { eq_only: true }) {
target_cell_when_not_found
.set(target_cell_when_not_found.get().min(cur_cell_idx as i32));
}
max_cell_idx.set(cur_cell_idx - 1);
} else if cmp.is_lt() {
if matches!(seek_op, SeekOp::LE { eq_only: true }) {
target_cell_when_not_found
.set(target_cell_when_not_found.get().max(cur_cell_idx as i32));
}
min_cell_idx.set(cur_cell_idx + 1);
} else {
match iter_dir {
IterationDirection::Forwards => {
min_cell_idx.set(cur_cell_idx + 1);
}
IterationDirection::Backwards => {
max_cell_idx.set(cur_cell_idx - 1);
}
}
}
}
}
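    /// Compares the record currently loaded into the cursor against the seek key.
    /// Returns the raw ordering (record vs. key, used to steer the binary search)
    /// together with whether the record satisfies `seek_op`.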
fn compare_with_current_record(
&self,
key_values: &[RefValue],
seek_op: SeekOp,
record_comparer: &RecordCompare,
index_info: &IndexInfo,
) -> (Ordering, bool) {
let record = self.get_immutable_record();
let record = record.as_ref().unwrap();
let tie_breaker = get_tie_breaker_from_seek_op(seek_op);
let cmp = record_comparer
.compare(record, key_values, index_info, 0, tie_breaker)
.unwrap();
let found = match seek_op {
SeekOp::GT => cmp.is_gt(),
SeekOp::GE { eq_only: true } => cmp.is_eq(),
SeekOp::GE { eq_only: false } => cmp.is_ge(),
SeekOp::LE { eq_only: true } => cmp.is_eq(),
SeekOp::LE { eq_only: false } => cmp.is_le(),
SeekOp::LT => cmp.is_lt(),
};
(cmp, found)
}
#[instrument(skip_all, level = Level::INFO)]
pub fn move_to(&mut self, key: SeekKey<'_>, cmp: SeekOp) -> Result<IOResult<()>> {
turso_assert!(
self.mv_cursor.is_none(),
"attempting to move with MV cursor"
);
tracing::trace!(?key, ?cmp);
// For a table with N rows, we can find any row by row id in O(log(N)) time by starting at the root page and following the B-tree pointers.
// B-trees consist of interior pages and leaf pages. Interior pages contain pointers to other pages, while leaf pages contain the actual row data.
//
        // Conceptually, each interior cell in an interior page has a rowid and a left child node, and the page itself has a right-most child node.
// Example: consider an interior page that contains cells C1(rowid=10), C2(rowid=20), C3(rowid=30).
// - All rows with rowids <= 10 are in the left child node of C1.
// - All rows with rowids > 10 and <= 20 are in the left child node of C2.
// - All rows with rowids > 20 and <= 30 are in the left child node of C3.
// - All rows with rowids > 30 are in the right-most child node of the page.
//
// There will generally be multiple levels of interior pages before we reach a leaf page,
// so we need to follow the interior page pointers until we reach the leaf page that contains the row we are looking for (if it exists).
//
// Here's a high-level overview of the algorithm:
// 1. Since we start at the root page, its cells are all interior cells.
// 2. We scan the interior cells until we find a cell whose rowid is greater than or equal to the rowid we are looking for.
// 3. Follow the left child pointer of the cell we found in step 2.
// a. In case none of the cells in the page have a rowid greater than or equal to the rowid we are looking for,
// we follow the right-most child pointer of the page instead (since all rows with rowids greater than the rowid we are looking for are in the right-most child node).
// 4. We are now at a new page. If it's another interior page, we repeat the process from step 2. If it's a leaf page, we continue to step 5.
// 5. We scan the leaf cells in the leaf page until we find the cell whose rowid is equal to the rowid we are looking for.
// This cell contains the actual data we are looking for.
// 6. If we find the cell, we return the record. Otherwise, we return an empty result.
// If we are at the beginning/end of seek state, start a new move from the root.
if matches!(
self.seek_state,
// these are stages that happen at the leaf page, so we can consider that the previous seek finished and we can start a new one.
CursorSeekState::LeafPageBinarySearch { .. } | CursorSeekState::FoundLeaf { .. }
) {
self.seek_state = CursorSeekState::Start;
}
if matches!(self.seek_state, CursorSeekState::Start) {
self.move_to_root()?;
}
let ret = match key {
SeekKey::TableRowId(rowid_key) => self.tablebtree_move_to(rowid_key, cmp),
SeekKey::IndexKey(index_key) => self.indexbtree_move_to(index_key, cmp),
};
return_if_io!(ret);
Ok(IOResult::Done(()))
}
/// Insert a record into the btree.
/// If the insert operation overflows the page, it will be split and the btree will be balanced.
#[instrument(skip_all, level = Level::DEBUG)]
fn insert_into_page(&mut self, bkey: &BTreeKey) -> Result<IOResult<()>> {
let record = bkey
.get_record()
.expect("expected record present on insert");
let record_values = record.get_values();
if let CursorState::None = &self.state {
self.state = CursorState::Write(WriteInfo::new());
}
let ret = loop {
let write_state = {
let write_info = self
.state
.mut_write_info()
.expect("can't insert while counting");
write_info.state
};
match write_state {
WriteState::Start => {
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
// get page and find cell
let cell_idx = {
return_if_locked!(page.get());
let page = page.get();
self.pager.add_dirty(&page);
self.stack.current_cell_index()
};
if cell_idx == -1 {
// This might be a brand new table and the cursor hasn't moved yet. Let's advance it to the first slot.
self.stack.set_cell_index(0);
}
let cell_idx = self.stack.current_cell_index() as usize;
tracing::debug!(cell_idx);
                    // If the cell index is less than the total cell count, check whether this is an
                    // existing rowid; if so, we are going to update / overwrite the cell.
if cell_idx < page.get().get_contents().cell_count() {
let cell = page
.get()
.get_contents()
.cell_get(cell_idx, self.usable_space())?;
match cell {
BTreeCell::TableLeafCell(tbl_leaf) => {
if tbl_leaf.rowid == bkey.to_rowid() {
tracing::debug!("TableLeafCell: found exact match with cell_idx={cell_idx}, overwriting");
self.overwrite_cell(page.clone(), cell_idx, record)?;
let write_info = self
.state
.mut_write_info()
.expect("expected write info");
if page.get().get_contents().overflow_cells.is_empty() {
write_info.state = WriteState::Finish;
} else {
write_info.state = WriteState::BalanceStart;
// If we balance, we must save the cursor position and seek to it later.
// FIXME: we shouldn't have both DeleteState::SeekAfterBalancing and
                                        // save_context()/restore_context(), they are practically the same thing.
self.save_context(CursorContext::TableRowId(bkey.to_rowid()));
}
continue;
}
}
BTreeCell::IndexLeafCell(..) | BTreeCell::IndexInteriorCell(..) => {
return_if_io!(self.record());
let cmp = compare_immutable(
record_values.as_slice(),
self.get_immutable_record()
.as_ref()
.unwrap()
.get_values().as_slice(),
&self.index_info.as_ref().unwrap().key_info,
);
if cmp == Ordering::Equal {
tracing::debug!("IndexLeafCell: found exact match with cell_idx={cell_idx}, overwriting");
self.has_record.set(true);
self.overwrite_cell(page.clone(), cell_idx, record)?;
let write_info = self
.state
.mut_write_info()
.expect("expected write info");
if page.get().get_contents().overflow_cells.is_empty() {
write_info.state = WriteState::Finish;
} else {
write_info.state = WriteState::BalanceStart;
// If we balance, we must save the cursor position and seek to it later.
// FIXME: we shouldn't have both DeleteState::SeekAfterBalancing and
                                        // save_context()/restore_context(), they are practically the same thing.
self.save_context(CursorContext::IndexKeyRowId((*record).clone()));
}
continue;
} else {
turso_assert!(
!matches!(cell, BTreeCell::IndexInteriorCell(..)),
"we should not be inserting a new index interior cell. the only valid operation on an index interior cell is an overwrite!"
);
}
}
                            other => panic!("unexpected cell type, expected TableLeaf, IndexLeaf or IndexInterior, found: {other:?}"),
}
}
// insert cell
let mut cell_payload: Vec<u8> = Vec::with_capacity(record_values.len() + 4);
fill_cell_payload(
page.get().get().contents.as_ref().unwrap(),
bkey.maybe_rowid(),
&mut cell_payload,
cell_idx,
record,
self.usable_space(),
self.pager.clone(),
);
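                    // fill_cell_payload serializes the record into on-disk cell format; payloads
                    // too large to fit locally spill onto overflow pages, which is presumably why
                    // the pager handle is passed in.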
// insert
let overflow = {
let page = page.get();
let contents = page.get().contents.as_mut().unwrap();
tracing::debug!(name: "overflow", cell_count = contents.cell_count());
insert_into_cell(
contents,
cell_payload.as_slice(),
cell_idx,
self.usable_space() as u16,
)?;
!contents.overflow_cells.is_empty()
};
self.stack.set_cell_index(cell_idx as i32);
if overflow {
// A balance will happen so save the key we were inserting
tracing::debug!(page = page.get().get().id, cell_idx, "balance triggered:");
self.save_context(match bkey {
BTreeKey::TableRowId(rowid) => CursorContext::TableRowId(rowid.0),
BTreeKey::IndexKey(record) => {
CursorContext::IndexKeyRowId((*record).clone())
}
});
let write_info = self
.state
.mut_write_info()
.expect("can't count while inserting");
write_info.state = WriteState::BalanceStart;
} else {
let write_info = self
.state
.mut_write_info()
.expect("can't count while inserting");
write_info.state = WriteState::Finish;
}
}
WriteState::BalanceStart
| WriteState::BalanceFreePages { .. }
| WriteState::BalanceNonRootPickSiblings
| WriteState::BalanceNonRootDoBalancing => {
return_if_io!(self.balance(None));
}
WriteState::Finish => {
break Ok(IOResult::Done(()));
}
};
};
if matches!(self.state.write_info().unwrap().state, WriteState::Finish) {
// if there was a balance triggered, the cursor position is invalid.
// it's probably not the greatest idea in the world to do this eagerly here,
// but at least it works.
return_if_io!(self.restore_context());
}
self.state = CursorState::None;
ret
}
/// Balance a leaf page.
/// Balancing is done when a page overflows.
/// see e.g. https://en.wikipedia.org/wiki/B-tree
///
/// This is a naive algorithm that doesn't try to distribute cells evenly by content.
/// It will try to split the page in half by keys not by content.
/// Sqlite tries to have a page at least 40% full.
///
/// `balance_ancestor_at_depth` specifies whether to balance an ancestor page at a specific depth.
/// If `None`, balancing stops when a level is encountered that doesn't need balancing.
/// If `Some(depth)`, the page on the stack at depth `depth` will be rebalanced after balancing the current page.
#[instrument(skip(self), level = Level::DEBUG)]
fn balance(&mut self, balance_ancestor_at_depth: Option<usize>) -> Result<IOResult<()>> {
turso_assert!(
matches!(self.state, CursorState::Write(_)),
"Cursor must be in balancing state"
);
loop {
let state = self.state.write_info().expect("must be balancing").state;
match state {
WriteState::BalanceStart => {
assert!(
self.state
.write_info()
.unwrap()
.balance_info
.borrow()
.is_none(),
"BalanceInfo should be empty on start"
);
let current_page = self.stack.top();
let next_balance_depth =
balance_ancestor_at_depth.unwrap_or(self.stack.current());
{
                        // Check whether this level needs balancing at all.
                        // Stop if the current page has no overflow cells AND either:
                        // - it is the root page, OR
                        // - its free space is at most 2/3rds of the total usable space
                        //   on the page
                        //
// https://github.com/sqlite/sqlite/blob/0aa95099f5003dc99f599ab77ac0004950b281ef/src/btree.c#L9064-L9071
let current_page = current_page.get();
let page = current_page.get().contents.as_mut().unwrap();
let usable_space = self.usable_space();
let free_space = compute_free_space(page, usable_space as u16);
let this_level_is_already_balanced = page.overflow_cells.is_empty()
&& (!self.stack.has_parent()
|| free_space as usize * 3 <= usable_space * 2);
if this_level_is_already_balanced {
if self.stack.current() > next_balance_depth {
while self.stack.current() > next_balance_depth {
// Even though this level is already balanced, we know there's an upper level that needs balancing.
// So we pop the stack and continue.
self.stack.pop();
}
continue;
}
// Otherwise, we're done.
let write_info = self.state.mut_write_info().unwrap();
write_info.state = WriteState::Finish;
return Ok(IOResult::Done(()));
}
}
if !self.stack.has_parent() {
self.balance_root()?;
}
let write_info = self.state.mut_write_info().unwrap();
write_info.state = WriteState::BalanceNonRootPickSiblings;
self.stack.pop();
return_if_io!(self.balance_non_root());
}
WriteState::BalanceNonRootPickSiblings
| WriteState::BalanceNonRootDoBalancing
| WriteState::BalanceFreePages { .. } => {
return_if_io!(self.balance_non_root());
}
WriteState::Finish => return Ok(IOResult::Done(())),
_ => panic!("unexpected state on balance {state:?}"),
}
}
}
/// Balance a non root page by trying to balance cells between a maximum of 3 siblings that should be neighboring the page that overflowed/underflowed.
#[instrument(skip_all, level = Level::DEBUG)]
fn balance_non_root(&mut self) -> Result<IOResult<()>> {
turso_assert!(
matches!(self.state, CursorState::Write(_)),
"Cursor must be in balancing state"
);
let state = self.state.write_info().expect("must be balancing").state;
tracing::debug!(?state);
let (next_write_state, result) = match state {
WriteState::Start => todo!(),
WriteState::BalanceStart => todo!(),
WriteState::BalanceNonRootPickSiblings => {
let parent_page = self.stack.top();
return_if_locked_maybe_load!(self.pager, parent_page);
let parent_page = parent_page.get();
let parent_contents = parent_page.get_contents();
let page_type = parent_contents.page_type();
turso_assert!(
matches!(page_type, PageType::IndexInterior | PageType::TableInterior),
"expected index or table interior page"
);
let number_of_cells_in_parent =
parent_contents.cell_count() + parent_contents.overflow_cells.len();
                // If `seek` moved to the rightmost page, the cell index will be out of bounds, i.e. cell_count + 1.
                // In any other case, `seek` leaves the cursor at the correct index.
let past_rightmost_pointer =
self.stack.current_cell_index() as usize == number_of_cells_in_parent + 1;
if past_rightmost_pointer {
self.stack.retreat();
} else if !parent_contents.overflow_cells.is_empty() {
                    // The ONLY way we can have an overflow cell in the parent is if we replaced an interior cell with a cell from the child, and that replacement did not fit.
// This can only happen on index btrees.
if matches!(page_type, PageType::IndexInterior) {
turso_assert!(parent_contents.overflow_cells.len() == 1, "index interior page must have no more than 1 overflow cell, as a result of InteriorNodeReplacement");
} else {
turso_assert!(false, "{page_type:?} must have no overflow cells");
}
let overflow_cell = parent_contents.overflow_cells.first().unwrap();
let parent_page_cell_idx = self.stack.current_cell_index() as usize;
// Parent page must be positioned at the divider cell that overflowed due to the replacement.
turso_assert!(
overflow_cell.index == parent_page_cell_idx,
"overflow cell index must be the result of InteriorNodeReplacement that leaves both child and parent (id={}) unbalanced, and hence parent page's position must = overflow_cell.index. Instead got: parent_page_cell_idx={parent_page_cell_idx} overflow_cell.index={}",
parent_page.get().id,
overflow_cell.index
);
}
self.pager.add_dirty(&parent_page);
let parent_contents = parent_page.get().contents.as_ref().unwrap();
let page_to_balance_idx = self.stack.current_cell_index() as usize;
tracing::debug!(
"balance_non_root(parent_id={} page_to_balance_idx={})",
parent_page.get().id,
page_to_balance_idx
);
// Part 1: Find the sibling pages to balance
let mut pages_to_balance: [Option<BTreePage>; MAX_SIBLING_PAGES_TO_BALANCE] =
[const { None }; MAX_SIBLING_PAGES_TO_BALANCE];
turso_assert!(
page_to_balance_idx <= parent_contents.cell_count(),
"page_to_balance_idx={page_to_balance_idx} is out of bounds for parent cell count {number_of_cells_in_parent}"
);
                // As there will be at most 3 pages used to balance:
                // sibling_pointer is the index representing one of those 3 pages, and we initialize it to the last possible page.
                // next_divider is the first divider that points to the first page of the 3 pages.
let (sibling_pointer, first_cell_divider) = match number_of_cells_in_parent {
n if n < 2 => (number_of_cells_in_parent, 0),
2 => (2, 0),
                    // Here we will have at least 2 cells and one right pointer, therefore we can get 3 siblings.
                    // In the case of 2, we will take all pages to balance.
_ => {
// In case of > 3 we have to check which ones to get
let next_divider = if page_to_balance_idx == 0 {
// first cell, take first 3
0
} else if page_to_balance_idx == number_of_cells_in_parent {
// Page corresponds to right pointer, so take last 3
number_of_cells_in_parent - 2
} else {
// Some cell in the middle, so we want to take sibling on left and right.
page_to_balance_idx - 1
};
(2, next_divider)
}
};
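                // E.g. with 5 cells in the parent and page_to_balance_idx == 3, this picks
                // first_cell_divider = 2 and sibling_pointer = 2: the three siblings are the
                // left children of divider cells 2, 3 and 4 (the page being balanced plus one
                // neighbour on each side).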
let sibling_count = sibling_pointer + 1;
let last_sibling_is_right_pointer = sibling_pointer + first_cell_divider
- parent_contents.overflow_cells.len()
== parent_contents.cell_count();
// Get the right page pointer that we will need to update later
let right_pointer = if last_sibling_is_right_pointer {
parent_contents.rightmost_pointer_raw().unwrap()
} else {
let max_overflow_cells = if matches!(page_type, PageType::IndexInterior) {
1
} else {
0
};
turso_assert!(
parent_contents.overflow_cells.len() <= max_overflow_cells,
"must have at most {max_overflow_cells} overflow cell in the parent"
);
// OVERFLOW CELL ADJUSTMENT:
// Let there be parent with cells [0,1,2,3,4].
// Let's imagine the cell at idx 2 gets replaced with a new payload that causes it to overflow.
// See handling of InteriorNodeReplacement in btree.rs.
//
// In this case the rightmost divider is going to be 3 (2 is the middle one and we pick neighbors 1-3).
// drop_cell(): [0,1,2,3,4] -> [0,1,3,4] <-- cells on right side get shifted left!
// insert_into_cell(): [0,1,3,4] -> [0,1,3,4] + overflow cell (2) <-- crucially, no physical shifting happens, overflow cell is stored separately
//
// This means '3' is actually physically located at index '2'.
// So IF the parent has an overflow cell, we need to subtract 1 to get the actual rightmost divider cell idx to physically read from.
// The formula for the actual cell idx is:
// first_cell_divider + sibling_pointer - parent_contents.overflow_cells.len()
// so in the above case:
// actual_cell_idx = 1 + 2 - 1 = 2
//
// In the case where the last divider cell is the overflow cell, there would be no left-shifting of cells in drop_cell(),
// because they are still positioned correctly (imagine .pop() from a vector).
// However, note that we are always looking for the _rightmost_ child page pointer between the (max 2) dividers, and for any case where the last divider cell is the overflow cell,
// the 'last_sibling_is_right_pointer' condition will also be true (since the overflow cell's left child will be the middle page), so we won't enter this code branch.
//
// Hence: when we enter this branch with overflow_cells.len() == 1, we know that left-shifting has happened and we need to subtract 1.
let actual_cell_idx =
first_cell_divider + sibling_pointer - parent_contents.overflow_cells.len();
let (start_of_cell, _) =
parent_contents.cell_get_raw_region(actual_cell_idx, self.usable_space());
let buf = parent_contents.as_ptr().as_mut_ptr();
unsafe { buf.add(start_of_cell) }
};
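                // Note that right_pointer is a raw pointer into the parent page's buffer, so it
                // is only valid until the parent is defragmented or has cells inserted; this is
                // why the rightmost pointer is rewritten before divider cells are re-inserted
                // later on.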
// load sibling pages
// start loading right page first
let mut pgno: u32 = unsafe { right_pointer.cast::<u32>().read().swap_bytes() };
let current_sibling = sibling_pointer;
for i in (0..=current_sibling).rev() {
let page = self.read_page(pgno as usize)?;
{
// mark as dirty
let sibling_page = page.get();
self.pager.add_dirty(&sibling_page);
}
#[cfg(debug_assertions)]
{
return_if_locked!(page.get());
debug_validate_cells!(
&page.get().get_contents(),
self.usable_space() as u16
);
}
pages_to_balance[i].replace(page);
if i == 0 {
break;
}
let next_cell_divider = i + first_cell_divider - 1;
let divider_is_overflow_cell = parent_contents
.overflow_cells
.first()
.is_some_and(|overflow_cell| overflow_cell.index == next_cell_divider);
if divider_is_overflow_cell {
turso_assert!(
matches!(parent_contents.page_type(), PageType::IndexInterior),
"expected index interior page, got {:?}",
parent_contents.page_type()
);
turso_assert!(
parent_contents.overflow_cells.len() == 1,
"must have a single overflow cell in the parent, as a result of InteriorNodeReplacement"
);
let overflow_cell = parent_contents.overflow_cells.first().unwrap();
pgno = u32::from_be_bytes(overflow_cell.payload[0..4].try_into().unwrap());
} else {
// grep for 'OVERFLOW CELL ADJUSTMENT' for explanation.
                        // here we only subtract 1 if the divider cell has been shifted left, i.e. the overflow cell was placed to the left of
                        // this cell.
let actual_cell_idx =
if let Some(overflow_cell) = parent_contents.overflow_cells.first() {
if next_cell_divider < overflow_cell.index {
next_cell_divider
} else {
next_cell_divider - 1
}
} else {
next_cell_divider
};
pgno =
match parent_contents.cell_get(actual_cell_idx, self.usable_space())? {
BTreeCell::TableInteriorCell(TableInteriorCell {
left_child_page,
..
})
| BTreeCell::IndexInteriorCell(IndexInteriorCell {
left_child_page,
..
}) => left_child_page,
other => {
crate::bail_corrupt_error!(
"expected interior cell, got {:?}",
other
)
}
};
}
}
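                // All (up to 3) sibling pages have now been requested, rightmost first. The
                // reads may complete asynchronously, so the next state re-checks that every
                // sibling is loaded before the actual balancing starts.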
#[cfg(debug_assertions)]
{
let page_type_of_siblings = pages_to_balance[0]
.as_ref()
.unwrap()
.get()
.get_contents()
.page_type();
for page in pages_to_balance.iter().take(sibling_count) {
return_if_locked_maybe_load!(self.pager, page.as_ref().unwrap());
let page = page.as_ref().unwrap().get();
let contents = page.get_contents();
debug_validate_cells!(&contents, self.usable_space() as u16);
assert_eq!(contents.page_type(), page_type_of_siblings);
}
}
self.state
.write_info()
.unwrap()
.balance_info
.replace(Some(BalanceInfo {
pages_to_balance,
rightmost_pointer: right_pointer,
divider_cell_payloads: [const { None }; MAX_SIBLING_PAGES_TO_BALANCE - 1],
sibling_count,
first_divider_cell: first_cell_divider,
}));
(WriteState::BalanceNonRootDoBalancing, Ok(IOResult::IO))
}
WriteState::BalanceNonRootDoBalancing => {
// Ensure all involved pages are in memory.
let write_info = self.state.write_info().unwrap();
let mut balance_info = write_info.balance_info.borrow_mut();
let balance_info = balance_info.as_mut().unwrap();
for page in balance_info
.pages_to_balance
.iter()
.take(balance_info.sibling_count)
{
let page = page.as_ref().unwrap();
return_if_locked_maybe_load!(self.pager, page);
}
// Start balancing.
let parent_page_btree = self.stack.top();
let parent_page = parent_page_btree.get();
let parent_contents = parent_page.get_contents();
let parent_is_root = !self.stack.has_parent();
// 1. Collect cell data from divider cells, and count the total number of cells to be distributed.
// The count includes: all cells and overflow cells from the sibling pages, and divider cells from the parent page,
// excluding the rightmost divider, which will not be dropped from the parent; instead it will be updated at the end.
let mut total_cells_to_redistribute = 0;
let mut pages_to_balance_new: [Option<BTreePage>;
MAX_NEW_SIBLING_PAGES_AFTER_BALANCE] =
[const { None }; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE];
for i in (0..balance_info.sibling_count).rev() {
let sibling_page = balance_info.pages_to_balance[i].as_ref().unwrap();
let sibling_page = sibling_page.get();
turso_assert!(sibling_page.is_loaded(), "sibling page is not loaded");
let sibling_contents = sibling_page.get_contents();
total_cells_to_redistribute += sibling_contents.cell_count();
total_cells_to_redistribute += sibling_contents.overflow_cells.len();
                    // The right pointer is not dropped; we simply update it at the end. This could be a divider cell that points
                    // to the last page in the list of pages to balance, or it could be the parent page's rightmost pointer.
let is_last_sibling = i == balance_info.sibling_count - 1;
if is_last_sibling {
continue;
}
// Since we know we have a left sibling, take the divider that points to left sibling of this page
let cell_idx = balance_info.first_divider_cell + i;
let divider_is_overflow_cell = parent_contents
.overflow_cells
.first()
.is_some_and(|overflow_cell| overflow_cell.index == cell_idx);
let cell_buf = if divider_is_overflow_cell {
turso_assert!(
matches!(parent_contents.page_type(), PageType::IndexInterior),
"expected index interior page, got {:?}",
parent_contents.page_type()
);
turso_assert!(
parent_contents.overflow_cells.len() == 1,
"must have a single overflow cell in the parent, as a result of InteriorNodeReplacement"
);
let overflow_cell = parent_contents.overflow_cells.first().unwrap();
&overflow_cell.payload
} else {
// grep for 'OVERFLOW CELL ADJUSTMENT' for explanation.
// here we can subtract overflow_cells.len() every time, because we are iterating right-to-left,
// so if we are to the left of the overflow cell, it has already been cleared from the parent and overflow_cells.len() is 0.
let actual_cell_idx = cell_idx - parent_contents.overflow_cells.len();
let (cell_start, cell_len) = parent_contents
.cell_get_raw_region(actual_cell_idx, self.usable_space());
let buf = parent_contents.as_ptr();
&buf[cell_start..cell_start + cell_len]
};
// Count the divider cell itself (which will be dropped from the parent)
total_cells_to_redistribute += 1;
tracing::debug!(
"balance_non_root(drop_divider_cell, first_divider_cell={}, divider_cell={}, left_pointer={})",
balance_info.first_divider_cell,
i,
read_u32(cell_buf, 0)
);
// TODO(pere): make this reference and not copy
balance_info.divider_cell_payloads[i].replace(cell_buf.to_vec());
if divider_is_overflow_cell {
tracing::debug!(
"clearing overflow cells from parent cell_idx={}",
cell_idx
);
parent_contents.overflow_cells.clear();
} else {
// grep for 'OVERFLOW CELL ADJUSTMENT' for explanation.
// here we can subtract overflow_cells.len() every time, because we are iterating right-to-left,
// so if we are to the left of the overflow cell, it has already been cleared from the parent and overflow_cells.len() is 0.
let actual_cell_idx = cell_idx - parent_contents.overflow_cells.len();
tracing::trace!(
"dropping divider cell from parent cell_idx={} count={}",
actual_cell_idx,
parent_contents.cell_count()
);
drop_cell(parent_contents, actual_cell_idx, self.usable_space() as u16)?;
}
}
/* 2. Initialize CellArray with all the cells used for distribution, this includes divider cells if !leaf. */
let mut cell_array = CellArray {
cell_payloads: Vec::with_capacity(total_cells_to_redistribute),
cell_count_per_page_cumulative: [0; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE],
};
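                // cell_count_per_page_cumulative is a prefix sum over the flat cell_payloads
                // array: entry i is one past the index of the last cell assigned to sibling i.
                // E.g. if three siblings end up with 3, 4 and 2 cells, the array reads [3, 7, 9].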
let cells_capacity_start = cell_array.cell_payloads.capacity();
let mut total_cells_inserted = 0;
// This is otherwise identical to CellArray.cell_count_per_page_cumulative,
// but we exclusively track what the prefix sums were _before_ we started redistributing cells.
let mut old_cell_count_per_page_cumulative: [u16;
MAX_NEW_SIBLING_PAGES_AFTER_BALANCE] = [0; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE];
let page_type = balance_info.pages_to_balance[0]
.as_ref()
.unwrap()
.get()
.get_contents()
.page_type();
tracing::debug!("balance_non_root(page_type={:?})", page_type);
let is_table_leaf = matches!(page_type, PageType::TableLeaf);
let is_leaf = matches!(page_type, PageType::TableLeaf | PageType::IndexLeaf);
for (i, old_page) in balance_info
.pages_to_balance
.iter()
.take(balance_info.sibling_count)
.enumerate()
{
let old_page = old_page.as_ref().unwrap().get();
let old_page_contents = old_page.get_contents();
debug_validate_cells!(&old_page_contents, self.usable_space() as u16);
for cell_idx in 0..old_page_contents.cell_count() {
let (cell_start, cell_len) =
old_page_contents.cell_get_raw_region(cell_idx, self.usable_space());
let buf = old_page_contents.as_ptr();
let cell_buf = &mut buf[cell_start..cell_start + cell_len];
// TODO(pere): make this reference and not copy
cell_array.cell_payloads.push(to_static_buf(cell_buf));
}
// Insert overflow cells into correct place
let offset = total_cells_inserted;
for overflow_cell in old_page_contents.overflow_cells.iter_mut() {
cell_array.cell_payloads.insert(
offset + overflow_cell.index,
to_static_buf(&mut Pin::as_mut(&mut overflow_cell.payload)),
);
}
old_cell_count_per_page_cumulative[i] = cell_array.cell_payloads.len() as u16;
let mut cells_inserted =
old_page_contents.cell_count() + old_page_contents.overflow_cells.len();
let is_last_sibling = i == balance_info.sibling_count - 1;
if !is_last_sibling && !is_table_leaf {
                        // If we are an index page or an interior table page we need to take the divider cell too.
                        // But we don't need the last divider, as it will remain the same.
let mut divider_cell = balance_info.divider_cell_payloads[i]
.as_mut()
.unwrap()
.as_mut_slice();
                        // TODO(pere): if the old pages are leaf pages (i.e. index leaf pages), we need to strip the page pointers
                        // from the divider cells in the index interior page (parent), because those should not be included.
cells_inserted += 1;
if !is_leaf {
// This divider cell needs to be updated with new left pointer,
let right_pointer = old_page_contents.rightmost_pointer().unwrap();
divider_cell[..LEFT_CHILD_PTR_SIZE_BYTES]
.copy_from_slice(&right_pointer.to_be_bytes());
} else {
// index leaf
turso_assert!(
divider_cell.len() >= LEFT_CHILD_PTR_SIZE_BYTES,
"divider cell is too short"
);
// let's strip the page pointer
divider_cell = &mut divider_cell[LEFT_CHILD_PTR_SIZE_BYTES..];
}
cell_array.cell_payloads.push(to_static_buf(divider_cell));
}
total_cells_inserted += cells_inserted;
}
turso_assert!(
cell_array.cell_payloads.capacity() == cells_capacity_start,
"calculation of max cells was wrong"
);
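                // If cell_payloads never grew past its initial capacity, then
                // total_cells_to_redistribute was a correct upper bound on the number of
                // redistributed cells (including divider and overflow cells).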
// Let's copy all cells for later checks
#[cfg(debug_assertions)]
let mut cells_debug = Vec::new();
#[cfg(debug_assertions)]
{
for cell in &cell_array.cell_payloads {
cells_debug.push(cell.to_vec());
if is_leaf {
assert!(cell[0] != 0)
}
}
}
#[cfg(debug_assertions)]
validate_cells_after_insertion(&cell_array, is_table_leaf);
                /* 3. Initialize the current size of every page, including overflow cells and divider cells that might be included. */
let mut new_page_sizes: [i64; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE] =
[0; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE];
let header_size = if is_leaf {
LEAF_PAGE_HEADER_SIZE_BYTES
} else {
INTERIOR_PAGE_HEADER_SIZE_BYTES
};
                // Number of usable bytes beyond the page header; different from the global
                // usable space, which includes the header.
let usable_space = self.usable_space() - header_size;
for i in 0..balance_info.sibling_count {
cell_array.cell_count_per_page_cumulative[i] =
old_cell_count_per_page_cumulative[i];
let page = &balance_info.pages_to_balance[i].as_ref().unwrap();
let page = page.get();
let page_contents = page.get_contents();
let free_space = compute_free_space(page_contents, self.usable_space() as u16);
new_page_sizes[i] = usable_space as i64 - free_space as i64;
for overflow in &page_contents.overflow_cells {
                        // 2 bytes to account for the cell pointer
new_page_sizes[i] += 2 + overflow.payload.len() as i64;
}
let is_last_sibling = i == balance_info.sibling_count - 1;
if !is_leaf && !is_last_sibling {
// Account for divider cell which is included in this page.
new_page_sizes[i] += cell_array.cell_payloads
[cell_array.cell_count_up_to_page(i)]
.len() as i64;
}
}
                /* 4. Now let's try to move cells to the left, stacking them without exceeding the maximum size of a page.
                There are two cases:
                * If the current page has too many cells, it will move them to the next page.
                * If it still has space and can take a cell from the right, it will take it.
                There is a caveat here. Taking a cell from the right might take cells from page i+1, i+2 or i+3, so not
                necessarily an adjacent page. But we decrease the size of the adjacent page when we move from the right.
                This might cause an intermittent state where a page can have size < 0.
                This also calculates how many pages are required to balance the cells, stored in sibling_count_new.
                */
// Try to pack as many cells to the left
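                // A rough sketch of this packing pass, assuming usable_space = 1000 and initial
                // sibling sizes [1200, 300, 400]: page 0 sheds trailing cells to page 1 until it
                // fits, page 1 may in turn shed to page 2, and if everything still overflows a
                // new sibling is appended (at most 5 in total, per the assert below).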
let mut sibling_count_new = balance_info.sibling_count;
let mut i = 0;
while i < sibling_count_new {
// First try to move cells to the right if they do not fit
while new_page_sizes[i] > usable_space as i64 {
let needs_new_page = i + 1 >= sibling_count_new;
if needs_new_page {
sibling_count_new = i + 2;
turso_assert!(
sibling_count_new <= 5,
"it is corrupt to require more than 5 pages to balance 3 siblings"
);
new_page_sizes[sibling_count_new - 1] = 0;
cell_array.cell_count_per_page_cumulative[sibling_count_new - 1] =
cell_array.cell_payloads.len() as u16;
}
let size_of_cell_to_remove_from_left =
2 + cell_array.cell_payloads[cell_array.cell_count_up_to_page(i) - 1]
.len() as i64;
new_page_sizes[i] -= size_of_cell_to_remove_from_left;
let size_of_cell_to_move_right = if !is_table_leaf {
if cell_array.cell_count_per_page_cumulative[i]
< cell_array.cell_payloads.len() as u16
{
// This means we move to the right page the divider cell and we
// promote left cell to divider
CELL_PTR_SIZE_BYTES as i64
+ cell_array.cell_payloads[cell_array.cell_count_up_to_page(i)]
.len() as i64
} else {
0
}
} else {
size_of_cell_to_remove_from_left
};
new_page_sizes[i + 1] += size_of_cell_to_move_right;
cell_array.cell_count_per_page_cumulative[i] -= 1;
}
// Now try to take from the right if we didn't have enough
while cell_array.cell_count_per_page_cumulative[i]
< cell_array.cell_payloads.len() as u16
{
let size_of_cell_to_remove_from_right = CELL_PTR_SIZE_BYTES as i64
+ cell_array.cell_payloads[cell_array.cell_count_up_to_page(i)].len()
as i64;
                        let would_overflow = new_page_sizes[i] + size_of_cell_to_remove_from_right
                            > usable_space as i64;
                        if would_overflow {
                            break;
                        }
new_page_sizes[i] += size_of_cell_to_remove_from_right;
cell_array.cell_count_per_page_cumulative[i] += 1;
let size_of_cell_to_remove_from_right = if !is_table_leaf {
if cell_array.cell_count_per_page_cumulative[i]
< cell_array.cell_payloads.len() as u16
{
CELL_PTR_SIZE_BYTES as i64
+ cell_array.cell_payloads[cell_array.cell_count_up_to_page(i)]
.len() as i64
} else {
0
}
} else {
size_of_cell_to_remove_from_right
};
new_page_sizes[i + 1] -= size_of_cell_to_remove_from_right;
}
                    // Check if this page already covers up to the last cell. If so, we only need pages up to this one,
                    // so update the number of new pages to i + 1.
let page_completes_all_cells = cell_array.cell_count_per_page_cumulative[i]
>= cell_array.cell_payloads.len() as u16;
if page_completes_all_cells {
sibling_count_new = i + 1;
break;
}
i += 1;
if i >= sibling_count_new {
break;
}
}
tracing::debug!(
"balance_non_root(sibling_count={}, sibling_count_new={}, cells={})",
balance_info.sibling_count,
sibling_count_new,
cell_array.cell_payloads.len()
);
/* 5. Balance pages starting from a left stacked cell state and move them to right trying to maintain a balanced state
where we only move from left to right if it will not unbalance both pages, meaning moving left to right won't make
right page bigger than left page.
*/
// Comment borrowed from SQLite src/btree.c
// The packing computed by the previous block is biased toward the siblings
// on the left side (siblings with smaller keys). The left siblings are
// always nearly full, while the right-most sibling might be nearly empty.
// The next block of code attempts to adjust the packing of siblings to
// get a better balance.
//
// This adjustment is more than an optimization. The packing above might
// be so out of balance as to be illegal. For example, the right-most
// sibling might be completely empty. This adjustment is not optional.
for i in (1..sibling_count_new).rev() {
let mut size_right_page = new_page_sizes[i];
let mut size_left_page = new_page_sizes[i - 1];
let mut cell_left = cell_array.cell_count_per_page_cumulative[i - 1] - 1;
// When table leaves are being balanced, divider cells are not part of the balancing,
                    // because table dividers, unlike index dividers, don't have payloads.
// Hence:
// - For table leaves: the same cell that is removed from left is added to right.
// - For all other page types: the divider cell is added to right, and the last non-divider cell is removed from left;
// the cell removed from the left will later become a new divider cell in the parent page.
// TABLE LEAVES BALANCING:
// =======================
// Before balancing:
// LEFT RIGHT
// +-----+-----+-----+-----+ +-----+-----+
// | C1 | C2 | C3 | C4 | | C5 | C6 |
// +-----+-----+-----+-----+ +-----+-----+
// ^ ^
// (too full) (has space)
// After balancing:
// LEFT RIGHT
// +-----+-----+-----+ +-----+-----+-----+
// | C1 | C2 | C3 | | C4 | C5 | C6 |
// +-----+-----+-----+ +-----+-----+-----+
// ^
// (C4 moved directly)
//
                    // (C3's rowid also becomes the divider cell's rowid in the parent page)
//
// OTHER PAGE TYPES BALANCING:
// ===========================
// Before balancing:
// PARENT: [...|D1|...]
// |
// LEFT RIGHT
// +-----+-----+-----+-----+ +-----+-----+
// | K1 | K2 | K3 | K4 | | K5 | K6 |
// +-----+-----+-----+-----+ +-----+-----+
// ^ ^
// (too full) (has space)
// After balancing:
// PARENT: [...|K4|...] <-- K4 becomes new divider
// |
// LEFT RIGHT
// +-----+-----+-----+ +-----+-----+-----+
// | K1 | K2 | K3 | | D1 | K5 | K6 |
// +-----+-----+-----+ +-----+-----+-----+
// ^
// (old divider D1 added to right)
// Legend:
// - C# = Cell (table leaf)
// - K# = Key cell (index/internal node)
// - D# = Divider cell
let mut cell_right = if is_table_leaf {
cell_left
} else {
cell_left + 1
};
loop {
let cell_left_size = cell_array.cell_size_bytes(cell_left as usize) as i64;
let cell_right_size =
cell_array.cell_size_bytes(cell_right as usize) as i64;
// TODO: add assert nMaxCells
let is_last_sibling = i == sibling_count_new - 1;
let pointer_size = if is_last_sibling {
0
} else {
CELL_PTR_SIZE_BYTES as i64
};
// As mentioned, this step rebalances the siblings so that cells are moved from left to right, since the previous step just
// packed as much as possible to the left. However, if the right-hand-side page would become larger than the left-hand-side page,
// we stop.
let would_not_improve_balance =
size_right_page + cell_right_size + (CELL_PTR_SIZE_BYTES as i64)
> size_left_page - (cell_left_size + pointer_size);
if size_right_page != 0 && would_not_improve_balance {
break;
}
size_left_page -= cell_left_size + (CELL_PTR_SIZE_BYTES as i64);
size_right_page += cell_right_size + (CELL_PTR_SIZE_BYTES as i64);
cell_array.cell_count_per_page_cumulative[i - 1] = cell_left;
if cell_left == 0 {
break;
}
cell_left -= 1;
cell_right -= 1;
}
new_page_sizes[i] = size_right_page;
new_page_sizes[i - 1] = size_left_page;
assert!(
cell_array.cell_count_per_page_cumulative[i - 1]
> if i > 1 {
cell_array.cell_count_per_page_cumulative[i - 2]
} else {
0
}
);
}
// Allocate pages or set dirty if not needed
for i in 0..sibling_count_new {
if i < balance_info.sibling_count {
let page = balance_info.pages_to_balance[i].as_ref().unwrap();
turso_assert!(
page.get().is_dirty(),
"sibling page must be already marked dirty"
);
pages_to_balance_new[i].replace(page.clone());
} else {
// FIXME: handle page cache is full
let page = self.allocate_page(page_type, 0)?;
pages_to_balance_new[i].replace(page);
                        // Since this page didn't exist before, set its old prefix sum to the total
                        // number of cells; as a prefix sum, this marks the page as previously empty.
old_cell_count_per_page_cumulative[i] =
cell_array.cell_payloads.len() as u16;
}
}
// Reassign page numbers in increasing order
{
let mut page_numbers: [usize; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE] =
[0; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE];
for (i, page) in pages_to_balance_new
.iter()
.take(sibling_count_new)
.enumerate()
{
page_numbers[i] = page.as_ref().unwrap().get().get().id;
}
page_numbers.sort();
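                    // Reassigning the existing page numbers in ascending key order keeps sibling
                    // pages roughly sequential on disk (as SQLite does), improving locality;
                    // whenever a page's id changes, its page-cache entry must be rekeyed below.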
for (page, new_id) in pages_to_balance_new
.iter()
.take(sibling_count_new)
.rev()
.zip(page_numbers.iter().rev().take(sibling_count_new))
{
let page = page.as_ref().unwrap();
if *new_id != page.get().get().id {
page.get().get().id = *new_id;
self.pager
.update_dirty_loaded_page_in_cache(*new_id, page.get())?;
}
}
#[cfg(debug_assertions)]
{
tracing::debug!(
"balance_non_root(parent page_id={})",
parent_page.get().id
);
for page in pages_to_balance_new.iter().take(sibling_count_new) {
tracing::debug!(
"balance_non_root(new_sibling page_id={})",
page.as_ref().unwrap().get().get().id
);
}
}
}
                // pages_pointed_to helps us verify in debug builds that we did in fact create divider cells pointing to all the new pages,
                // and that the rightmost pointer points to the last page.
#[cfg(debug_assertions)]
let mut pages_pointed_to = HashSet::new();
                // Write the right pointer in the parent page to point to the new rightmost page. Keep in mind
                // we update the rightmost pointer first because inserting cells could defragment the parent page,
                // therefore invalidating the pointer.
let right_page_id = pages_to_balance_new[sibling_count_new - 1]
.as_ref()
.unwrap()
.get()
.get()
.id as u32;
let rightmost_pointer = balance_info.rightmost_pointer;
let rightmost_pointer =
unsafe { std::slice::from_raw_parts_mut(rightmost_pointer, 4) };
rightmost_pointer[0..4].copy_from_slice(&right_page_id.to_be_bytes());
#[cfg(debug_assertions)]
pages_pointed_to.insert(right_page_id);
tracing::debug!(
"balance_non_root(rightmost_pointer_update, rightmost_pointer={})",
right_page_id
);
                /* 6. Update parent pointers. Update the right pointer and insert divider cells for the newly created distribution of cells. */
                // Ensure the right-child pointer of the right-most new sibling page points to the page
                // that was originally in that place.
let is_leaf_page = matches!(page_type, PageType::TableLeaf | PageType::IndexLeaf);
if !is_leaf_page {
let last_sibling_idx = balance_info.sibling_count - 1;
let last_page = balance_info.pages_to_balance[last_sibling_idx]
.as_ref()
.unwrap();
let right_pointer = last_page.get().get_contents().rightmost_pointer().unwrap();
let new_last_page = pages_to_balance_new[sibling_count_new - 1]
.as_ref()
.unwrap();
new_last_page
.get()
.get_contents()
.write_u32(offset::BTREE_RIGHTMOST_PTR, right_pointer);
}
turso_assert!(
parent_contents.overflow_cells.is_empty(),
"parent page overflow cells should be empty before divider cell reinsertion"
);
// TODO: pointer map update (vacuum support)
// Update divider cells in parent
for (sibling_page_idx, page) in pages_to_balance_new
.iter()
.enumerate()
.take(sibling_count_new - 1)
/* do not take last page */
{
let page = page.as_ref().unwrap();
// e.g. if we have 3 pages and the leftmost child page has 3 cells,
// then the divider cell idx is 3 in the flat cell array.
let divider_cell_idx = cell_array.cell_count_up_to_page(sibling_page_idx);
let mut divider_cell = &mut cell_array.cell_payloads[divider_cell_idx];
                    // FIXME: don't use auxiliary space, could be done without allocations
let mut new_divider_cell = Vec::new();
if !is_leaf_page {
// Interior
// Make this page's rightmost pointer point to pointer of divider cell before modification
let previous_pointer_divider = read_u32(divider_cell, 0);
page.get()
.get_contents()
.write_u32(offset::BTREE_RIGHTMOST_PTR, previous_pointer_divider);
// divider cell now points to this page
new_divider_cell
.extend_from_slice(&(page.get().get().id as u32).to_be_bytes());
// now copy the rest of the divider cell:
// Table Interior page:
// * varint rowid
// Index Interior page:
// * varint payload size
// * payload
// * first overflow page (u32 optional)
new_divider_cell.extend_from_slice(&divider_cell[4..]);
} else if is_table_leaf {
// For table leaves, divider_cell_idx effectively points to the last cell of the old left page.
// The new divider cell's rowid becomes the second-to-last cell's rowid.
// i.e. in the diagram above, the new divider cell's rowid becomes the rowid of C3.
// FIXME: not needed conversion
// FIXME: need to update cell size in order to free correctly?
// insert into cell with correct range should be enough
divider_cell = &mut cell_array.cell_payloads[divider_cell_idx - 1];
let (_, n_bytes_payload) = read_varint(divider_cell)?;
let (rowid, _) = read_varint(&divider_cell[n_bytes_payload..])?;
new_divider_cell
.extend_from_slice(&(page.get().get().id as u32).to_be_bytes());
write_varint_to_vec(rowid, &mut new_divider_cell);
} else {
// Leaf index
new_divider_cell
.extend_from_slice(&(page.get().get().id as u32).to_be_bytes());
new_divider_cell.extend_from_slice(divider_cell);
}
let left_pointer = read_u32(&new_divider_cell[..LEFT_CHILD_PTR_SIZE_BYTES], 0);
turso_assert!(
left_pointer != parent_page.get().id as u32,
"left pointer is the same as parent page id"
);
#[cfg(debug_assertions)]
pages_pointed_to.insert(left_pointer);
tracing::debug!(
"balance_non_root(insert_divider_cell, first_divider_cell={}, divider_cell={}, left_pointer={})",
balance_info.first_divider_cell,
sibling_page_idx,
left_pointer
);
turso_assert!(
left_pointer == page.get().get().id as u32,
"left pointer is not the same as page id"
);
// FIXME: remove this lock
let database_size = header_accessor::get_database_size(&self.pager)?;
turso_assert!(
left_pointer <= database_size,
"invalid page number divider left pointer {} > database number of pages {}",
left_pointer,
database_size
);
// FIXME: defragment shouldn't be needed
// defragment_page(parent_contents, self.usable_space() as u16);
let divider_cell_insert_idx_in_parent =
balance_info.first_divider_cell + sibling_page_idx;
let overflow_cell_count_before = parent_contents.overflow_cells.len();
insert_into_cell(
parent_contents,
&new_divider_cell,
divider_cell_insert_idx_in_parent,
self.usable_space() as u16,
)?;
let overflow_cell_count_after = parent_contents.overflow_cells.len();
let divider_cell_is_overflow_cell =
overflow_cell_count_after > overflow_cell_count_before;
#[cfg(debug_assertions)]
self.validate_balance_non_root_divider_cell_insertion(
balance_info,
parent_contents,
divider_cell_insert_idx_in_parent,
divider_cell_is_overflow_cell,
&page.get(),
);
}
tracing::debug!(
"balance_non_root(parent_overflow={})",
parent_contents.overflow_cells.len()
);
#[cfg(debug_assertions)]
{
// Let's ensure every page is pointed to by the divider cell or the rightmost pointer.
for page in pages_to_balance_new.iter().take(sibling_count_new) {
let page = page.as_ref().unwrap();
assert!(
pages_pointed_to.contains(&(page.get().get().id as u32)),
"page {} not pointed to by divider cell or rightmost pointer",
page.get().get().id
);
}
}
/* 7. Start real movement of cells. Next comment is borrowed from SQLite: */
/* Now update the actual sibling pages. The order in which they are updated
** is important, as this code needs to avoid disrupting any page from which
** cells may still to be read. In practice, this means:
**
** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
** then it is not safe to update page apNew[iPg] until after
** the left-hand sibling apNew[iPg-1] has been updated.
**
** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
** then it is not safe to update page apNew[iPg] until after
** the right-hand sibling apNew[iPg+1] has been updated.
**
** If neither of the above apply, the page is safe to update.
**
** The iPg value in the following loop starts at nNew-1 goes down
** to 0, then back up to nNew-1 again, thus making two passes over
** the pages. On the initial downward pass, only condition (1) above
** needs to be tested because (2) will always be true from the previous
** step. On the upward pass, both conditions are always true, so the
** upwards pass simply processes pages that were missed on the downward
** pass.
*/
let mut done = [false; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE];
let rightmost_page_negative_idx = 1 - sibling_count_new as i64;
let rightmost_page_positive_idx = sibling_count_new as i64 - 1;
for i in rightmost_page_negative_idx..=rightmost_page_positive_idx {
// As mentioned above, we do two passes over the pages:
// 1. Downward pass: Process pages in decreasing order
// 2. Upward pass: Process pages in increasing order
// Hence if we have 3 siblings:
// the order of 'i' will be: -2, -1, 0, 1, 2.
// and the page processing order is: 2, 1, 0, 1, 2.
let page_idx = i.unsigned_abs() as usize;
if done[page_idx] {
continue;
}
// As outlined above, this condition ensures we process pages in the correct order to avoid disrupting cells that still need to be read.
// 1. i >= 0 handles the upward pass where we process any pages not processed in the downward pass.
// - condition (1) is not violated: if cells are moving right-to-left, righthand sibling has not been updated yet.
// - condition (2) is not violated: if cells are moving left-to-right, righthand sibling has already been updated in the downward pass.
// 2. The second condition checks if it's safe to process a page during the downward pass.
// - condition (1) is not violated: if cells are moving right-to-left, we do nothing.
// - condition (2) is not violated: if cells are moving left-to-right, we are allowed to update.
if i >= 0
|| old_cell_count_per_page_cumulative[page_idx - 1]
>= cell_array.cell_count_per_page_cumulative[page_idx - 1]
{
let (start_old_cells, start_new_cells, number_new_cells) = if page_idx == 0
{
(0, 0, cell_array.cell_count_up_to_page(0))
} else {
let this_was_old_page = page_idx < balance_info.sibling_count;
                        // We add !is_table_leaf because we want to skip the divider cell that is encountered between assigned pages.
let start_old_cells = if this_was_old_page {
old_cell_count_per_page_cumulative[page_idx - 1] as usize
+ (!is_table_leaf) as usize
} else {
cell_array.cell_payloads.len()
};
let start_new_cells = cell_array.cell_count_up_to_page(page_idx - 1)
+ (!is_table_leaf) as usize;
(
start_old_cells,
start_new_cells,
cell_array.cell_count_up_to_page(page_idx) - start_new_cells,
)
};
let page = pages_to_balance_new[page_idx].as_ref().unwrap();
let page = page.get();
tracing::debug!("pre_edit_page(page={})", page.get().id);
let page_contents = page.get_contents();
edit_page(
page_contents,
start_old_cells,
start_new_cells,
number_new_cells,
&cell_array,
self.usable_space() as u16,
)?;
debug_validate_cells!(page_contents, self.usable_space() as u16);
tracing::trace!(
"edit_page page={} cells={}",
page.get().id,
page_contents.cell_count()
);
page_contents.overflow_cells.clear();
done[page_idx] = true;
}
}
// TODO: vacuum support
let first_child_page = pages_to_balance_new[0].as_ref().unwrap();
let first_child_page = first_child_page.get();
let first_child_contents = first_child_page.get_contents();
if parent_is_root
&& parent_contents.cell_count() == 0
                    // this check makes sure we do not end up with negative free space
&& parent_contents.offset
<= compute_free_space(first_child_contents, self.usable_space() as u16)
as usize
{
// From SQLite:
// The root page of the b-tree now contains no cells. The only sibling
// page is the right-child of the parent. Copy the contents of the
// child page into the parent, decreasing the overall height of the
// b-tree structure by one. This is described as the "balance-shallower"
// sub-algorithm in some documentation.
assert!(sibling_count_new == 1);
let parent_offset = if parent_page.get().id == 1 {
DATABASE_HEADER_SIZE
} else {
0
};
// From SQLite:
// It is critical that the child page be defragmented before being
// copied into the parent, because if the parent is page 1 then it will
                    // be smaller than the child due to the database header, and so
// all the free space needs to be up front.
defragment_page(first_child_contents, self.usable_space() as u16);
let child_top = first_child_contents.cell_content_area() as usize;
let parent_buf = parent_contents.as_ptr();
let child_buf = first_child_contents.as_ptr();
let content_size = self.usable_space() - child_top;
// Copy cell contents
parent_buf[child_top..child_top + content_size]
.copy_from_slice(&child_buf[child_top..child_top + content_size]);
// Copy header and pointer
// NOTE: don't use .cell_pointer_array_offset_and_size() because of different
// header size
let header_and_pointer_size = first_child_contents.header_size()
+ first_child_contents.cell_pointer_array_size();
parent_buf[parent_offset..parent_offset + header_and_pointer_size]
.copy_from_slice(
&child_buf[first_child_contents.offset
..first_child_contents.offset + header_and_pointer_size],
);
self.stack.set_cell_index(0); // reset cell index, top is already parent
sibling_count_new -= 1; // decrease sibling count for debugging and free at the end
assert!(sibling_count_new < balance_info.sibling_count);
}
#[cfg(debug_assertions)]
self.post_balance_non_root_validation(
&parent_page_btree,
balance_info,
parent_contents,
pages_to_balance_new,
page_type,
is_table_leaf,
cells_debug,
sibling_count_new,
right_page_id,
);
(
WriteState::BalanceFreePages {
curr_page: sibling_count_new,
sibling_count_new,
},
Ok(IOResult::Done(())),
)
}
WriteState::BalanceFreePages {
curr_page,
sibling_count_new,
} => {
let write_info = self.state.write_info().unwrap();
let mut balance_info: std::cell::RefMut<'_, Option<BalanceInfo>> =
write_info.balance_info.borrow_mut();
let balance_info = balance_info.as_mut().unwrap();
// We have to free pages that are not used anymore
if !((sibling_count_new..balance_info.sibling_count).contains(&curr_page)) {
(WriteState::BalanceStart, Ok(IOResult::Done(())))
} else {
let page = balance_info.pages_to_balance[curr_page].as_ref().unwrap();
return_if_io!(self
.pager
.free_page(Some(page.get().clone()), page.get().get().id));
(
WriteState::BalanceFreePages {
curr_page: curr_page + 1,
sibling_count_new,
},
Ok(IOResult::Done(())),
)
}
}
WriteState::Finish => todo!(),
};
if matches!(next_write_state, WriteState::BalanceStart) {
// reset balance state
let _ = self.state.mut_write_info().unwrap().balance_info.take();
}
let write_info = self.state.mut_write_info().unwrap();
write_info.state = next_write_state;
result
}
/// Validates that a divider cell was correctly inserted into the parent page
/// during B-tree balancing and that it points to the correct child page.
#[cfg(debug_assertions)]
fn validate_balance_non_root_divider_cell_insertion(
&self,
balance_info: &mut BalanceInfo,
parent_contents: &mut PageContent,
divider_cell_insert_idx_in_parent: usize,
divider_cell_is_overflow_cell: bool,
child_page: &std::sync::Arc<crate::Page>,
) {
let left_pointer = if divider_cell_is_overflow_cell {
parent_contents.overflow_cells
.iter()
.find(|cell| cell.index == divider_cell_insert_idx_in_parent)
.map(|cell| read_u32(&cell.payload, 0))
.unwrap_or_else(|| {
panic!(
"overflow cell with divider cell was not found (divider_cell_idx={}, balance_info.first_divider_cell={}, overflow_cells.len={})",
divider_cell_insert_idx_in_parent,
balance_info.first_divider_cell,
parent_contents.overflow_cells.len(),
)
})
} else if divider_cell_insert_idx_in_parent < parent_contents.cell_count() {
let (cell_start, cell_len) = parent_contents
.cell_get_raw_region(divider_cell_insert_idx_in_parent, self.usable_space());
read_u32(
&parent_contents.as_ptr()[cell_start..cell_start + cell_len],
0,
)
} else {
panic!(
"divider cell is not in the parent page (divider_cell_idx={}, balance_info.first_divider_cell={}, overflow_cells.len={})",
divider_cell_insert_idx_in_parent,
balance_info.first_divider_cell,
parent_contents.overflow_cells.len(),
)
};
// Verify the left pointer points to the correct page
assert_eq!(
left_pointer,
child_page.get().id as u32,
"the cell we just inserted doesn't point to the correct page. points to {}, should point to {}",
left_pointer,
child_page.get().id as u32
);
}
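/// Debug-only sanity check run after balancing a non-root page: verifies that no divider cell in
/// the parent points back at the parent itself, that every cell captured in `cells_debug` before
/// balancing ended up either in a sibling page or as a divider cell in the parent, and that
/// left-child and rightmost pointers reference the expected pages. Panics if any check fails.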
#[cfg(debug_assertions)]
#[allow(clippy::too_many_arguments)]
fn post_balance_non_root_validation(
&self,
parent_page: &BTreePage,
balance_info: &mut BalanceInfo,
parent_contents: &mut PageContent,
pages_to_balance_new: [Option<BTreePage>; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE],
page_type: PageType,
is_table_leaf: bool,
mut cells_debug: Vec<Vec<u8>>,
sibling_count_new: usize,
right_page_id: u32,
) {
let mut valid = true;
let mut current_index_cell = 0;
for cell_idx in 0..parent_contents.cell_count() {
let cell = parent_contents
.cell_get(cell_idx, self.usable_space())
.unwrap();
match cell {
BTreeCell::TableInteriorCell(table_interior_cell) => {
let left_child_page = table_interior_cell.left_child_page;
if left_child_page == parent_page.get().get().id as u32 {
tracing::error!("balance_non_root(parent_divider_points_to_same_page, page_id={}, cell_left_child_page={})",
parent_page.get().get().id,
left_child_page,
);
valid = false;
}
}
BTreeCell::IndexInteriorCell(index_interior_cell) => {
let left_child_page = index_interior_cell.left_child_page;
if left_child_page == parent_page.get().get().id as u32 {
tracing::error!("balance_non_root(parent_divider_points_to_same_page, page_id={}, cell_left_child_page={})",
parent_page.get().get().id,
left_child_page,
);
valid = false;
}
}
_ => {}
}
}
// Now do an in-depth check that we in fact added every cell somewhere and none were lost
for (page_idx, page) in pages_to_balance_new
.iter()
.take(sibling_count_new)
.enumerate()
{
let page = page.as_ref().unwrap();
let page = page.get();
let contents = page.get_contents();
debug_validate_cells!(contents, self.usable_space() as u16);
// Cells are distributed in order
for cell_idx in 0..contents.cell_count() {
let (cell_start, cell_len) =
contents.cell_get_raw_region(cell_idx, self.usable_space());
let buf = contents.as_ptr();
let cell_buf = to_static_buf(&mut buf[cell_start..cell_start + cell_len]);
let cell_buf_in_array = &cells_debug[current_index_cell];
if cell_buf != cell_buf_in_array {
tracing::error!("balance_non_root(cell_not_found_debug, page_id={}, cell_in_cell_array_idx={})",
page.get().id,
current_index_cell,
);
valid = false;
}
let cell = crate::storage::sqlite3_ondisk::read_btree_cell(
cell_buf,
contents,
0,
self.usable_space(),
)
.unwrap();
match &cell {
BTreeCell::TableInteriorCell(table_interior_cell) => {
let left_child_page = table_interior_cell.left_child_page;
if left_child_page == page.get().id as u32 {
tracing::error!("balance_non_root(child_page_points_same_page, page_id={}, cell_left_child_page={}, page_idx={})",
page.get().id,
left_child_page,
page_idx
);
valid = false;
}
if left_child_page == parent_page.get().get().id as u32 {
tracing::error!("balance_non_root(child_page_points_parent_of_child, page_id={}, cell_left_child_page={}, page_idx={})",
page.get().id,
left_child_page,
page_idx
);
valid = false;
}
}
BTreeCell::IndexInteriorCell(index_interior_cell) => {
let left_child_page = index_interior_cell.left_child_page;
if left_child_page == page.get().id as u32 {
tracing::error!("balance_non_root(child_page_points_same_page, page_id={}, cell_left_child_page={}, page_idx={})",
page.get().id,
left_child_page,
page_idx
);
valid = false;
}
if left_child_page == parent_page.get().get().id as u32 {
tracing::error!("balance_non_root(child_page_points_parent_of_child, page_id={}, cell_left_child_page={}, page_idx={})",
page.get().id,
left_child_page,
page_idx
);
valid = false;
}
}
_ => {}
}
current_index_cell += 1;
}
// Now check divider cells and their pointers.
let parent_buf = parent_contents.as_ptr();
let cell_divider_idx = balance_info.first_divider_cell + page_idx;
if sibling_count_new == 0 {
// Balance-shallower case
// We need to check data in parent page
debug_validate_cells!(parent_contents, self.usable_space() as u16);
if pages_to_balance_new[0].is_none() {
tracing::error!(
"balance_non_root(balance_shallower_incorrect_page, page_idx={})",
0
);
valid = false;
}
for (i, value) in pages_to_balance_new
.iter()
.enumerate()
.take(sibling_count_new)
.skip(1)
{
if value.is_some() {
tracing::error!(
"balance_non_root(balance_shallower_incorrect_page, page_idx={})",
i
);
valid = false;
}
}
if current_index_cell != cells_debug.len()
|| cells_debug.len() != contents.cell_count()
|| contents.cell_count() != parent_contents.cell_count()
{
tracing::error!("balance_non_root(balance_shallower_incorrect_cell_count, current_index_cell={}, cells_debug={}, cell_count={}, parent_cell_count={})",
current_index_cell,
cells_debug.len(),
contents.cell_count(),
parent_contents.cell_count()
);
valid = false;
}
if right_page_id == page.get().id as u32
|| right_page_id == parent_page.get().get().id as u32
{
tracing::error!("balance_non_root(balance_shallower_rightmost_pointer, page_id={}, parent_page_id={}, rightmost={})",
page.get().id,
parent_page.get().get().id,
right_page_id,
);
valid = false;
}
if let Some(rm) = contents.rightmost_pointer() {
if rm != right_page_id {
tracing::error!("balance_non_root(balance_shallower_rightmost_pointer, page_rightmost={}, rightmost={})",
rm,
right_page_id,
);
valid = false;
}
}
if let Some(rm) = parent_contents.rightmost_pointer() {
if rm != right_page_id {
tracing::error!("balance_non_root(balance_shallower_rightmost_pointer, parent_rightmost={}, rightmost={})",
rm,
right_page_id,
);
valid = false;
}
}
if parent_contents.page_type() != page_type {
tracing::error!("balance_non_root(balance_shallower_parent_page_type, page_type={:?}, parent_page_type={:?})",
page_type,
parent_contents.page_type()
);
valid = false;
}
for (parent_cell_idx, cell_buf_in_array) in
cells_debug.iter().enumerate().take(contents.cell_count())
{
let (parent_cell_start, parent_cell_len) =
parent_contents.cell_get_raw_region(parent_cell_idx, self.usable_space());
let (cell_start, cell_len) =
contents.cell_get_raw_region(parent_cell_idx, self.usable_space());
let buf = contents.as_ptr();
let cell_buf = to_static_buf(&mut buf[cell_start..cell_start + cell_len]);
let parent_cell_buf = to_static_buf(
&mut parent_buf[parent_cell_start..parent_cell_start + parent_cell_len],
);
if cell_buf != cell_buf_in_array || cell_buf != parent_cell_buf {
tracing::error!("balance_non_root(balance_shallower_cell_not_found_debug, page_id={}, cell_in_cell_array_idx={})",
page.get().id,
parent_cell_idx,
);
valid = false;
}
}
} else if page_idx == sibling_count_new - 1 {
// We only validate the rightmost pointer of the parent page here. If the divider is a regular cell
// rather than the last pointer, we skip it, because insert_into_cell could've defragmented the page
// and invalidated the pointer. For the rightmost pointer, we just check that it points to this page.
if cell_divider_idx == parent_contents.cell_count()
&& right_page_id != page.get().id as u32
{
tracing::error!("balance_non_root(cell_divider_right_pointer, should point to {}, but points to {})",
page.get().id,
right_page_id
);
valid = false;
}
} else {
// divider cell might be an overflow cell
let mut was_overflow = false;
for overflow_cell in &parent_contents.overflow_cells {
if overflow_cell.index == cell_divider_idx {
let left_pointer = read_u32(&overflow_cell.payload, 0);
if left_pointer != page.get().id as u32 {
tracing::error!("balance_non_root(cell_divider_left_pointer_overflow, should point to page_id={}, but points to {}, divider_cell={}, overflow_cells_parent={})",
page.get().id,
left_pointer,
page_idx,
parent_contents.overflow_cells.len()
);
valid = false;
}
was_overflow = true;
break;
}
}
if was_overflow {
if !is_table_leaf {
// remember to advance the cell index if this cell was moved to the parent
current_index_cell += 1;
}
continue;
}
// The divider was not an overflow cell, so read its left-child pointer directly from the parent page.
let (cell_start, cell_len) =
parent_contents.cell_get_raw_region(cell_divider_idx, self.usable_space());
let cell_left_pointer = read_u32(&parent_buf[cell_start..cell_start + cell_len], 0);
if cell_left_pointer != page.get().id as u32 {
tracing::error!("balance_non_root(cell_divider_left_pointer, should point to page_id={}, but points to {}, divider_cell={}, overflow_cells_parent={})",
page.get().id,
cell_left_pointer,
page_idx,
parent_contents.overflow_cells.len()
);
valid = false;
}
if is_table_leaf {
// If we are in a table leaf page, we just need to check that the cell acting as the divider is present in the parent.
// Leaf cells were already checked above but the parent's copy was not, so we don't advance current_index_cell.
let last_sibling_idx = balance_info.sibling_count - 1;
if page_idx >= last_sibling_idx {
// This means we are in the last page and we don't need to check anything
continue;
}
let cell_buf: &'static mut [u8] =
to_static_buf(&mut cells_debug[current_index_cell - 1]);
let cell = crate::storage::sqlite3_ondisk::read_btree_cell(
cell_buf,
contents,
0,
self.usable_space(),
)
.unwrap();
let parent_cell = parent_contents
.cell_get(cell_divider_idx, self.usable_space())
.unwrap();
let rowid = match cell {
BTreeCell::TableLeafCell(table_leaf_cell) => table_leaf_cell.rowid,
_ => unreachable!(),
};
let rowid_parent = match parent_cell {
BTreeCell::TableInteriorCell(table_interior_cell) => {
table_interior_cell.rowid
}
_ => unreachable!(),
};
if rowid_parent != rowid {
tracing::error!("balance_non_root(cell_divider_rowid, page_id={}, cell_divider_idx={}, rowid_parent={}, rowid={})",
page.get().id,
cell_divider_idx,
rowid_parent,
rowid
);
valid = false;
}
} else {
// In any other case, we need to check that this cell was moved to the parent as a divider cell
let mut was_overflow = false;
for overflow_cell in &parent_contents.overflow_cells {
if overflow_cell.index == cell_divider_idx {
let left_pointer = read_u32(&overflow_cell.payload, 0);
if left_pointer != page.get().id as u32 {
tracing::error!("balance_non_root(cell_divider_divider_cell_overflow should point to page_id={}, but points to {}, divider_cell={}, overflow_cells_parent={})",
page.get().id,
left_pointer,
page_idx,
parent_contents.overflow_cells.len()
);
valid = false;
}
was_overflow = true;
break;
}
}
if was_overflow {
if !is_table_leaf {
// remember to advance the cell index if this cell was moved to the parent
current_index_cell += 1;
}
continue;
}
let (parent_cell_start, parent_cell_len) =
parent_contents.cell_get_raw_region(cell_divider_idx, self.usable_space());
let cell_buf_in_array = &cells_debug[current_index_cell];
let left_pointer = read_u32(
&parent_buf[parent_cell_start..parent_cell_start + parent_cell_len],
0,
);
if left_pointer != page.get().id as u32 {
tracing::error!("balance_non_root(divider_cell_left_pointer_interior should point to page_id={}, but points to {}, divider_cell={}, overflow_cells_parent={})",
page.get().id,
left_pointer,
page_idx,
parent_contents.overflow_cells.len()
);
valid = false;
}
match page_type {
PageType::TableInterior | PageType::IndexInterior => {
let parent_cell_buf =
&parent_buf[parent_cell_start..parent_cell_start + parent_cell_len];
if parent_cell_buf[4..] != cell_buf_in_array[4..] {
tracing::error!("balance_non_root(cell_divider_cell, page_id={}, cell_divider_idx={})",
page.get().id,
cell_divider_idx,
);
valid = false;
}
}
PageType::IndexLeaf => {
let parent_cell_buf =
&parent_buf[parent_cell_start..parent_cell_start + parent_cell_len];
if parent_cell_buf[4..] != cell_buf_in_array[..] {
tracing::error!("balance_non_root(cell_divider_cell_index_leaf, page_id={}, cell_divider_idx={})",
page.get().id,
cell_divider_idx,
);
valid = false;
}
}
_ => {
unreachable!()
}
}
current_index_cell += 1;
}
}
}
assert!(
valid,
"corrupted database, cells were not balanced properly"
);
}
/// Balance the root page.
/// This is done when the root page overflows, and we need to create a new root page.
/// See e.g. https://en.wikipedia.org/wiki/B-tree
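/// A rough sketch of the transformation ("balance deeper"): the old root's cells move into a
/// freshly allocated child, and the root becomes an empty interior page whose rightmost pointer
/// references that child:
/// ```text
/// [root] (overflowing)        [root] (empty interior, rightmost -> child)
///                       ==>      |
///                             [child] (old root's cells and overflow cells)
/// ```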
fn balance_root(&mut self) -> Result<()> {
/* todo: balance deeper, create child and copy contents of root there. Then split root */
/* if we are in root page then we just need to create a new root and push key there */
let is_page_1 = {
let current_root = self.stack.top();
current_root.get().get().id == 1
};
let offset = if is_page_1 { DATABASE_HEADER_SIZE } else { 0 };
let root_btree = self.stack.top();
let root = root_btree.get();
let root_contents = root.get_contents();
// FIXME: handle the case where the page cache is full
let child_btree =
self.pager
.do_allocate_page(root_contents.page_type(), 0, BtreePageAllocMode::Any)?;
tracing::debug!(
"balance_root(root={}, rightmost={}, page_type={:?})",
root.get().id,
child_btree.get().get().id,
root.get_contents().page_type()
);
turso_assert!(root.is_dirty(), "root must be marked dirty");
turso_assert!(
child_btree.get().is_dirty(),
"child must be marked dirty as freshly allocated page"
);
let root_buf = root_contents.as_ptr();
let child = child_btree.get();
let child_contents = child.get_contents();
let child_buf = child_contents.as_ptr();
let (root_pointer_start, root_pointer_len) =
root_contents.cell_pointer_array_offset_and_size();
let (child_pointer_start, _) = child.get_contents().cell_pointer_array_offset_and_size();
let top = root_contents.cell_content_area() as usize;
// 1. Modify child
// Copy pointers
child_buf[child_pointer_start..child_pointer_start + root_pointer_len]
.copy_from_slice(&root_buf[root_pointer_start..root_pointer_start + root_pointer_len]);
// Copy cell contents
child_buf[top..].copy_from_slice(&root_buf[top..]);
// Copy header
child_buf[0..root_contents.header_size()]
.copy_from_slice(&root_buf[offset..offset + root_contents.header_size()]);
// Copy overflow cells
std::mem::swap(
&mut child_contents.overflow_cells,
&mut root_contents.overflow_cells,
);
root_contents.overflow_cells.clear();
// 2. Modify root
let new_root_page_type = match root_contents.page_type() {
PageType::IndexLeaf => PageType::IndexInterior,
PageType::TableLeaf => PageType::TableInterior,
other => other,
} as u8;
// set new page type
root_contents.write_u8(offset::BTREE_PAGE_TYPE, new_root_page_type);
root_contents.write_u32(offset::BTREE_RIGHTMOST_PTR, child.get().id as u32);
root_contents.write_u16(offset::BTREE_CELL_CONTENT_AREA, self.usable_space() as u16);
root_contents.write_u16(offset::BTREE_CELL_COUNT, 0);
root_contents.write_u16(offset::BTREE_FIRST_FREEBLOCK, 0);
root_contents.write_u8(offset::BTREE_FRAGMENTED_BYTES_COUNT, 0);
root_contents.overflow_cells.clear();
self.root_page = root.get().id;
self.stack.clear();
self.stack.push(root_btree.clone());
self.stack.set_cell_index(0); // leave parent pointing at the rightmost pointer (in this case 0, as there are no cells), since we will be balancing the rightmost child page.
self.stack.push(child_btree.clone());
Ok(())
}
fn usable_space(&self) -> usize {
self.pager.usable_space()
}
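/// Position the cursor one past the last cell of the rightmost leaf page, so that a subsequent
/// insert appends to the end of the tree.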
pub fn seek_end(&mut self) -> Result<IOResult<()>> {
assert!(self.mv_cursor.is_none()); // unsure about this -_-
self.move_to_root()?;
loop {
let mem_page = self.stack.top();
let page_id = mem_page.get().get().id;
let page = self.read_page(page_id)?;
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
if contents.is_leaf() {
// set cursor just past the last cell to append
self.stack.set_cell_index(contents.cell_count() as i32);
return Ok(IOResult::Done(()));
}
match contents.rightmost_pointer() {
Some(right_most_pointer) => {
self.stack.set_cell_index(contents.cell_count() as i32 + 1); // invalid on interior
let child = self.read_page(right_most_pointer as usize)?;
self.stack.push(child);
}
None => unreachable!("interior page must have rightmost pointer"),
}
}
}
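/// Move the cursor to the last (rightmost) record of the b-tree; on an empty table the cursor is
/// left without a record.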
#[instrument(skip_all, level = Level::DEBUG)]
pub fn seek_to_last(&mut self) -> Result<IOResult<()>> {
assert!(self.mv_cursor.is_none());
let has_record = return_if_io!(self.move_to_rightmost());
self.invalidate_record();
self.has_record.replace(has_record);
if !has_record {
let is_empty = return_if_io!(self.is_empty_table());
assert!(is_empty);
return Ok(IOResult::Done(()));
}
Ok(IOResult::Done(()))
}
pub fn is_empty(&self) -> bool {
!self.has_record.get()
}
pub fn root_page(&self) -> usize {
self.root_page
}
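/// Position the cursor on the first record of the b-tree (or on the first row of the MVCC cursor
/// when MVCC is enabled).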
#[instrument(skip_all, level = Level::DEBUG)]
pub fn rewind(&mut self) -> Result<IOResult<()>> {
if let Some(mv_cursor) = &self.mv_cursor {
{
let mut mv_cursor = mv_cursor.borrow_mut();
mv_cursor.rewind();
}
let cursor_has_record = return_if_io!(self.get_next_record());
self.invalidate_record();
self.has_record.replace(cursor_has_record);
} else {
self.move_to_root()?;
let cursor_has_record = return_if_io!(self.get_next_record());
self.invalidate_record();
self.has_record.replace(cursor_has_record);
}
Ok(IOResult::Done(()))
}
#[instrument(skip_all, level = Level::DEBUG)]
pub fn last(&mut self) -> Result<IOResult<()>> {
assert!(self.mv_cursor.is_none());
let cursor_has_record = return_if_io!(self.move_to_rightmost());
self.has_record.replace(cursor_has_record);
self.invalidate_record();
Ok(IOResult::Done(()))
}
#[instrument(skip_all, level = Level::DEBUG)]
pub fn next(&mut self) -> Result<IOResult<bool>> {
return_if_io!(self.restore_context());
let cursor_has_record = return_if_io!(self.get_next_record());
self.has_record.replace(cursor_has_record);
self.invalidate_record();
Ok(IOResult::Done(cursor_has_record))
}
fn invalidate_record(&mut self) {
self.get_immutable_record_or_create()
.as_mut()
.unwrap()
.invalidate();
self.record_cursor.borrow_mut().invalidate();
}
#[instrument(skip_all, level = Level::DEBUG)]
pub fn prev(&mut self) -> Result<IOResult<bool>> {
assert!(self.mv_cursor.is_none());
return_if_io!(self.restore_context());
let cursor_has_record = return_if_io!(self.get_prev_record());
self.has_record.replace(cursor_has_record);
self.invalidate_record();
Ok(IOResult::Done(cursor_has_record))
}
#[instrument(skip(self), level = Level::DEBUG)]
pub fn rowid(&mut self) -> Result<IOResult<Option<i64>>> {
if let Some(mv_cursor) = &self.mv_cursor {
if self.has_record.get() {
let mv_cursor = mv_cursor.borrow();
return Ok(IOResult::Done(
mv_cursor.current_row_id().map(|rowid| rowid.row_id),
));
} else {
return Ok(IOResult::Done(None));
}
}
if self.has_record.get() {
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
// load record
let _ = return_if_io!(self.record());
let page_type = page.get().get_contents().page_type();
let page = page.get();
let contents = page.get_contents();
let cell_idx = self.stack.current_cell_index();
let cell = contents.cell_get(cell_idx as usize, self.usable_space())?;
if page_type.is_table() {
let BTreeCell::TableLeafCell(TableLeafCell { rowid, .. }) = cell else {
unreachable!(
"BTreeCursor::rowid(): unexpected page_type: {:?}",
page_type
);
};
Ok(IOResult::Done(Some(rowid)))
} else {
Ok(IOResult::Done(self.get_index_rowid_from_record()))
}
} else {
Ok(IOResult::Done(None))
}
}
#[instrument(skip(self), level = Level::DEBUG)]
pub fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result<IOResult<SeekResult>> {
assert!(self.mv_cursor.is_none());
// Empty trace to capture the span information
tracing::trace!("");
// We need to clear the null flag for the table cursor before seeking,
// because it might have been set by an unmatched left-join row during the previous iteration
// of the outer loop.
self.set_null_flag(false);
let seek_result = return_if_io!(self.do_seek(key, op));
self.invalidate_record();
// Reset seek state
self.seek_state = CursorSeekState::Start;
self.valid_state = CursorValidState::Valid;
Ok(IOResult::Done(seek_result))
}
/// Return a reference to the record the cursor is currently pointing to.
/// If the record has not been parsed yet, we parse it here, yielding control
/// back on I/O.
#[instrument(skip(self), level = Level::DEBUG)]
pub fn record(&self) -> Result<IOResult<Option<Ref<ImmutableRecord>>>> {
if !self.has_record.get() {
return Ok(IOResult::Done(None));
}
let invalidated = self
.reusable_immutable_record
.borrow()
.as_ref()
.is_none_or(|record| record.is_invalidated());
if !invalidated {
*self.parse_record_state.borrow_mut() = ParseRecordState::Init;
let record_ref =
Ref::filter_map(self.reusable_immutable_record.borrow(), |opt| opt.as_ref())
.unwrap();
return Ok(IOResult::Done(Some(record_ref)));
}
if self.mv_cursor.is_some() {
let mv_cursor = self.mv_cursor.as_ref().unwrap().borrow();
let row = mv_cursor.current_row().unwrap().unwrap();
self.get_immutable_record_or_create()
.as_mut()
.unwrap()
.invalidate();
self.get_immutable_record_or_create()
.as_mut()
.unwrap()
.start_serialization(&row.data);
self.record_cursor.borrow_mut().invalidate();
let record_ref =
Ref::filter_map(self.reusable_immutable_record.borrow(), |opt| opt.as_ref())
.unwrap();
return Ok(IOResult::Done(Some(record_ref)));
}
if *self.parse_record_state.borrow() == ParseRecordState::Init {
*self.parse_record_state.borrow_mut() = ParseRecordState::Parsing {
payload: Vec::new(),
};
}
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get_contents();
let cell_idx = self.stack.current_cell_index();
let cell = contents.cell_get(cell_idx as usize, self.usable_space())?;
let (payload, payload_size, first_overflow_page) = match cell {
BTreeCell::TableLeafCell(TableLeafCell {
payload,
payload_size,
first_overflow_page,
..
}) => (payload, payload_size, first_overflow_page),
BTreeCell::IndexInteriorCell(IndexInteriorCell {
payload,
payload_size,
first_overflow_page,
..
}) => (payload, payload_size, first_overflow_page),
BTreeCell::IndexLeafCell(IndexLeafCell {
payload,
first_overflow_page,
payload_size,
}) => (payload, payload_size, first_overflow_page),
_ => unreachable!("unexpected page_type"),
};
if let Some(next_page) = first_overflow_page {
return_if_io!(self.process_overflow_read(payload, next_page, payload_size))
} else {
self.get_immutable_record_or_create()
.as_mut()
.unwrap()
.invalidate();
self.get_immutable_record_or_create()
.as_mut()
.unwrap()
.start_serialization(payload);
self.record_cursor.borrow_mut().invalidate();
};
*self.parse_record_state.borrow_mut() = ParseRecordState::Init;
let record_ref =
Ref::filter_map(self.reusable_immutable_record.borrow(), |opt| opt.as_ref()).unwrap();
Ok(IOResult::Done(Some(record_ref)))
}
#[instrument(skip(self), level = Level::DEBUG)]
pub fn insert(
&mut self,
key: &BTreeKey,
// Indicate whether it's necessary to traverse to find the leaf page
// FIXME: refactor this out into a state machine, these ad-hoc state
// variables are very hard to reason about
mut moved_before: bool,
) -> Result<IOResult<()>> {
tracing::debug!(valid_state = ?self.valid_state, cursor_state = ?self.state, is_write_in_progress = self.is_write_in_progress());
match &self.mv_cursor {
Some(mv_cursor) => match key.maybe_rowid() {
Some(rowid) => {
let row_id = crate::mvcc::database::RowID::new(self.table_id() as u64, rowid);
let record_buf = key.get_record().unwrap().get_payload().to_vec();
let row = crate::mvcc::database::Row::new(row_id, record_buf);
mv_cursor.borrow_mut().insert(row).unwrap();
}
None => todo!("Support mvcc inserts with index btrees"),
},
None => {
match (&self.valid_state, self.is_write_in_progress()) {
(CursorValidState::Valid, _) => {
// consider the current position valid unless the caller explicitly asks us to seek.
}
(CursorValidState::RequireSeek, false) => {
// we must seek.
moved_before = false;
}
(CursorValidState::RequireSeek, true) => {
// illegal to seek during a write no matter what CursorValidState or caller says -- we might e.g. move to the wrong page during balancing
moved_before = true;
}
(CursorValidState::RequireAdvance(direction), _) => {
// FIXME: this is a hack to support the case where we need to advance the cursor after a seek.
// We should have a proper state machine for this.
return_if_io!(match direction {
IterationDirection::Forwards => self.next(),
IterationDirection::Backwards => self.prev(),
});
self.valid_state = CursorValidState::Valid;
self.seek_state = CursorSeekState::Start;
moved_before = true;
}
};
if !moved_before {
let seek_result = match key {
BTreeKey::IndexKey(_) => {
return_if_io!(self.seek(
SeekKey::IndexKey(key.get_record().unwrap()),
SeekOp::GE { eq_only: true }
))
}
BTreeKey::TableRowId(_) => {
return_if_io!(self.seek(
SeekKey::TableRowId(key.to_rowid()),
SeekOp::GE { eq_only: true }
))
}
};
if SeekResult::TryAdvance == seek_result {
self.valid_state =
CursorValidState::RequireAdvance(IterationDirection::Forwards);
return_if_io!(self.next());
}
self.context.take(); // we know where we wanted to move so if there was any saved context, discard it.
self.valid_state = CursorValidState::Valid;
self.seek_state = CursorSeekState::Start;
tracing::debug!(
"seeked to the right place, page is now {:?}",
self.stack.top().get().get().id
);
}
return_if_io!(self.insert_into_page(key));
if key.maybe_rowid().is_some() {
self.has_record.replace(true);
}
}
};
Ok(IOResult::Done(()))
}
/// Delete state machine flow:
/// 1. Start -> check if the rowid to be deleted is present in the page. If not, we return early.
/// 2. DeterminePostBalancingSeekKey -> determine the key to seek to after balancing.
/// 3. LoadPage -> load the page.
/// 4. FindCell -> find the cell to be deleted in the page.
/// 5. ClearOverflowPages -> clear any overflow pages before dropping the cell. If we are in a leaf page, we just drop the cell in place;
///    if we are in an interior page, we need to rotate keys in order to replace the current cell (InteriorNodeReplacement).
/// 6. InteriorNodeReplacement -> copy the left subtree's leaf cell into the deleted interior cell's place.
/// 7. CheckNeedsBalancing / WaitForBalancingToComplete -> perform balancing if needed.
/// 8. SeekAfterBalancing -> adjust the cursor to a node that is closer to the deleted value. Go to Finish.
/// 9. Finish -> the delete operation is done. Return Ok(IOResult::Done(())).
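/// A rough sketch of the transitions (bracketed states are entered only when needed):
/// ```text
/// Start -> DeterminePostBalancingSeekKey -> LoadPage -> FindCell -> ClearOverflowPages
///   -> [InteriorNodeReplacement] -> CheckNeedsBalancing
///   -> [WaitForBalancingToComplete -> SeekAfterBalancing -> [TryAdvance]] -> done
/// ```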
#[instrument(skip(self), level = Level::DEBUG)]
pub fn delete(&mut self) -> Result<IOResult<()>> {
assert!(self.mv_cursor.is_none());
if let CursorState::None = &self.state {
self.state = CursorState::Delete(DeleteInfo {
state: DeleteState::Start,
balance_write_info: None,
})
}
loop {
let delete_state = {
let delete_info = self.state.delete_info().expect("cannot get delete info");
delete_info.state.clone()
};
tracing::debug!(?delete_state);
match delete_state {
DeleteState::Start => {
let page = self.stack.top();
self.pager.add_dirty(&page.get());
if matches!(
page.get().get_contents().page_type(),
PageType::TableLeaf | PageType::TableInterior
) {
if return_if_io!(self.rowid()).is_none() {
self.state = CursorState::None;
return Ok(IOResult::Done(()));
}
} else if self.reusable_immutable_record.borrow().is_none() {
self.state = CursorState::None;
return Ok(IOResult::Done(()));
}
let delete_info = self.state.mut_delete_info().unwrap();
delete_info.state = DeleteState::DeterminePostBalancingSeekKey;
}
DeleteState::DeterminePostBalancingSeekKey => {
// FIXME: skip this work if we determine the deletion won't result in balancing.
// Right now we calculate the key every time for simplicity/debugging,
// since it doesn't affect correctness, which is more important.
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let target_key = if page.get().is_index() {
let record = match return_if_io!(self.record()) {
Some(record) => record.clone(),
None => unreachable!("there should've been a record"),
};
DeleteSavepoint::Payload(record)
} else {
let Some(rowid) = return_if_io!(self.rowid()) else {
panic!("cursor should be pointing to a record with a rowid");
};
DeleteSavepoint::Rowid(rowid)
};
let delete_info = self.state.mut_delete_info().unwrap();
delete_info.state = DeleteState::LoadPage {
post_balancing_seek_key: Some(target_key),
};
}
DeleteState::LoadPage {
post_balancing_seek_key,
} => {
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let delete_info = self.state.mut_delete_info().unwrap();
delete_info.state = DeleteState::FindCell {
post_balancing_seek_key,
};
}
DeleteState::FindCell {
post_balancing_seek_key,
} => {
let page = self.stack.top();
let cell_idx = self.stack.current_cell_index() as usize;
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
if cell_idx >= contents.cell_count() {
return_corrupt!(format!(
"Corrupted page: cell index {} is out of bounds for page with {} cells",
cell_idx,
contents.cell_count()
));
}
tracing::debug!(
"DeleteState::FindCell: page_id: {}, cell_idx: {}",
page.get().id,
cell_idx
);
let cell = contents.cell_get(cell_idx, self.usable_space())?;
let original_child_pointer = match &cell {
BTreeCell::TableInteriorCell(interior) => Some(interior.left_child_page),
BTreeCell::IndexInteriorCell(interior) => Some(interior.left_child_page),
_ => None,
};
let delete_info = self.state.mut_delete_info().unwrap();
delete_info.state = DeleteState::ClearOverflowPages {
cell_idx,
cell,
original_child_pointer,
post_balancing_seek_key,
};
}
DeleteState::ClearOverflowPages {
cell_idx,
cell,
original_child_pointer,
post_balancing_seek_key,
} => {
return_if_io!(self.clear_overflow_pages(&cell));
let page = self.stack.top();
let page = page.get();
let contents = page.get_contents();
let delete_info = self.state.mut_delete_info().unwrap();
if !contents.is_leaf() {
delete_info.state = DeleteState::InteriorNodeReplacement {
page: page.clone(),
btree_depth: self.stack.current(),
cell_idx,
original_child_pointer,
post_balancing_seek_key,
};
} else {
drop_cell(contents, cell_idx, self.usable_space() as u16)?;
let delete_info = self.state.mut_delete_info().unwrap();
delete_info.state = DeleteState::CheckNeedsBalancing {
btree_depth: self.stack.current(),
post_balancing_seek_key,
};
}
}
DeleteState::InteriorNodeReplacement {
page,
btree_depth,
cell_idx,
original_child_pointer,
post_balancing_seek_key,
} => {
// This is an interior node, we need to handle deletion differently.
// 1. Move cursor to the largest key in the left subtree.
// 2. Replace the cell in the interior (parent) node with that key.
// 3. Delete that key from the child page.
// Step 1: Move cursor to the largest key in the left subtree.
// The largest key is always in a leaf, and so this traversal may involve going multiple pages downwards,
// so we store the page we are currently on.
// avoid calling prev() because it internally calls restore_context() which may cause unintended behavior.
return_if_io!(self.get_prev_record());
// Ensure we keep the parent page at the same position as before the replacement.
self.stack
.node_states
.borrow_mut()
.get_mut(btree_depth)
.expect("parent page should be on the stack")
.cell_idx = cell_idx as i32;
let (cell_payload, leaf_cell_idx) = {
let leaf_page_ref = self.stack.top();
let leaf_page = leaf_page_ref.get();
let leaf_contents = leaf_page.get().contents.as_ref().unwrap();
assert!(leaf_contents.is_leaf());
assert!(leaf_contents.cell_count() > 0);
let leaf_cell_idx = leaf_contents.cell_count() - 1;
let last_cell_on_child_page =
leaf_contents.cell_get(leaf_cell_idx, self.usable_space())?;
let mut cell_payload: Vec<u8> = Vec::new();
let child_pointer =
original_child_pointer.expect("there should be a pointer");
// Rewrite the old leaf cell as an interior cell depending on type.
match last_cell_on_child_page {
BTreeCell::TableLeafCell(leaf_cell) => {
// Table interior cells contain the left child pointer and the rowid as varint.
cell_payload.extend_from_slice(&child_pointer.to_be_bytes());
write_varint_to_vec(leaf_cell.rowid as u64, &mut cell_payload);
}
BTreeCell::IndexLeafCell(leaf_cell) => {
// Index interior cells contain:
// 1. The left child pointer
// 2. The payload size as varint
// 3. The payload
// 4. The first overflow page as varint, omitted if no overflow.
cell_payload.extend_from_slice(&child_pointer.to_be_bytes());
write_varint_to_vec(leaf_cell.payload_size, &mut cell_payload);
cell_payload.extend_from_slice(leaf_cell.payload);
if let Some(first_overflow_page) = leaf_cell.first_overflow_page {
cell_payload
.extend_from_slice(&first_overflow_page.to_be_bytes());
}
}
_ => unreachable!("Expected table leaf cell"),
}
(cell_payload, leaf_cell_idx)
};
let leaf_page = self.stack.top();
self.pager.add_dirty(&page);
self.pager.add_dirty(&leaf_page.get());
// Step 2: Replace the cell in the parent (interior) page.
{
let parent_contents = page.get_contents();
let parent_page_id = page.get().id;
let left_child_page = u32::from_be_bytes(
cell_payload[..4].try_into().expect("invalid cell payload"),
);
turso_assert!(
left_child_page as usize != parent_page_id,
"corrupt: current page and left child page of cell {} are both {}",
left_child_page,
parent_page_id
);
// First, drop the old cell that is being replaced.
drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?;
// Then, insert the new cell (the predecessor) in its place.
insert_into_cell(
parent_contents,
&cell_payload,
cell_idx,
self.usable_space() as u16,
)?;
}
// Step 3: Delete the predecessor cell from the leaf page.
{
let leaf_page_ref = leaf_page.get();
let leaf_contents = leaf_page_ref.get_contents();
drop_cell(leaf_contents, leaf_cell_idx, self.usable_space() as u16)?;
}
let delete_info = self.state.mut_delete_info().unwrap();
delete_info.state = DeleteState::CheckNeedsBalancing {
btree_depth,
post_balancing_seek_key,
};
}
DeleteState::CheckNeedsBalancing {
btree_depth,
post_balancing_seek_key,
} => {
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
// Check if either the leaf page we took the replacement cell from underflows, or if the interior page we inserted it into overflows OR underflows.
// If the latter is true, we must always balance that level regardless of whether the leaf page (or any ancestor pages in between) need balancing.
let leaf_underflows = {
let leaf_page = page.get();
let leaf_contents = leaf_page.get_contents();
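// Underflow heuristic: the leaf is considered to underflow when more than
// two thirds of its usable space is free.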
let free_space =
compute_free_space(leaf_contents, self.usable_space() as u16);
free_space as usize * 3 > self.usable_space() * 2
};
let interior_overflows_or_underflows = {
// Invariant: ancestor pages on the stack are pinned to the page cache,
// so we don't need return_if_locked_maybe_load! any ancestor,
// and we already loaded the current page above.
let interior_page = self
.stack
.get_page_at_level(btree_depth)
.expect("ancestor page should be on the stack");
let interior_page = interior_page.get();
let interior_contents = interior_page.get_contents();
let overflows = !interior_contents.overflow_cells.is_empty();
if overflows {
true
} else {
let free_space =
compute_free_space(interior_contents, self.usable_space() as u16);
free_space as usize * 3 > self.usable_space() * 2
}
};
let needs_balancing = leaf_underflows || interior_overflows_or_underflows;
if needs_balancing {
let delete_info = self.state.mut_delete_info().unwrap();
if delete_info.balance_write_info.is_none() {
let mut write_info = WriteInfo::new();
write_info.state = WriteState::BalanceStart;
delete_info.balance_write_info = Some(write_info);
}
let balance_only_ancestor =
!leaf_underflows && interior_overflows_or_underflows;
if balance_only_ancestor {
// Only need to balance the ancestor page; move there immediately.
while self.stack.current() > btree_depth {
self.stack.pop();
}
}
let balance_both = leaf_underflows && interior_overflows_or_underflows;
delete_info.state = DeleteState::WaitForBalancingToComplete {
balance_ancestor_at_depth: if balance_both {
Some(btree_depth)
} else {
None
},
target_key: post_balancing_seek_key.unwrap(),
}
} else {
// No balancing needed, we're done
self.stack.retreat();
self.state = CursorState::None;
return Ok(IOResult::Done(()));
}
}
DeleteState::WaitForBalancingToComplete {
target_key,
balance_ancestor_at_depth,
} => {
let delete_info = self.state.mut_delete_info().unwrap();
// Switch the CursorState to Write state for balancing
let write_info = delete_info.balance_write_info.take().unwrap();
self.state = CursorState::Write(write_info);
match self.balance(balance_ancestor_at_depth)? {
IOResult::Done(()) => {
let write_info = match &self.state {
CursorState::Write(wi) => wi.clone(),
_ => unreachable!("Balance operation changed cursor state"),
};
// Move to seek state
self.state = CursorState::Delete(DeleteInfo {
state: DeleteState::SeekAfterBalancing { target_key },
balance_write_info: Some(write_info),
});
}
IOResult::IO => {
// Move to seek state
// Save balance progress and return IO
let write_info = match &self.state {
CursorState::Write(wi) => wi.clone(),
_ => unreachable!("Balance operation changed cursor state"),
};
self.state = CursorState::Delete(DeleteInfo {
state: DeleteState::WaitForBalancingToComplete {
target_key,
balance_ancestor_at_depth,
},
balance_write_info: Some(write_info),
});
return Ok(IOResult::IO);
}
}
}
DeleteState::SeekAfterBalancing { target_key } => {
let key = match &target_key {
DeleteSavepoint::Rowid(rowid) => SeekKey::TableRowId(*rowid),
DeleteSavepoint::Payload(immutable_record) => {
SeekKey::IndexKey(immutable_record)
}
};
// We want to end up pointing at the row to the left of the position of the row we deleted, so
// that after we call next() in the loop, the next row we delete will again be in the same position as this one.
let seek_result = return_if_io!(self.seek(key, SeekOp::LT));
if let SeekResult::TryAdvance = seek_result {
let CursorState::Delete(delete_info) = &self.state else {
unreachable!("expected delete state");
};
self.state = CursorState::Delete(DeleteInfo {
state: DeleteState::TryAdvance,
balance_write_info: delete_info.balance_write_info.clone(),
});
continue;
}
self.state = CursorState::None;
return Ok(IOResult::Done(()));
}
DeleteState::TryAdvance => {
// we use LT always for post-delete seeks, which uses backwards iteration, so we always call prev() here.
return_if_io!(self.prev());
self.state = CursorState::None;
return Ok(IOResult::Done(()));
}
}
}
}
/// In outer joins, whenever the right-side table has no matching row, the query must still return a row
/// for each left-side row. In order to achieve this, we set the null flag on the right-side table cursor
/// so that it returns NULL for all columns until cleared.
#[inline(always)]
pub fn set_null_flag(&mut self, flag: bool) {
self.null_flag = flag;
}
#[inline(always)]
pub fn get_null_flag(&self) -> bool {
self.null_flag
}
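/// Check whether a table row with the given integer key exists by performing an exact-match seek.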
#[instrument(skip_all, level = Level::DEBUG)]
pub fn exists(&mut self, key: &Value) -> Result<IOResult<bool>> {
assert!(self.mv_cursor.is_none());
let int_key = match key {
Value::Integer(i) => i,
_ => unreachable!("btree tables are indexed by integers!"),
};
let seek_result =
return_if_io!(self.seek(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true }));
let exists = matches!(seek_result, SeekResult::Found);
self.invalidate_record();
Ok(IOResult::Done(exists))
}
/// Clear the overflow pages linked to a specific cell.
/// Uses a state machine to keep track of its operations so that traversal can be
/// resumed from the last point after an IO interruption.
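/// A rough sketch of the chain being freed, where the first 4 bytes of each overflow page hold
/// the number of the next page and 0 terminates the chain:
/// ```text
/// cell -> overflow page -> overflow page -> ... -> 0
/// ```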
#[instrument(skip_all, level = Level::DEBUG)]
fn clear_overflow_pages(&mut self, cell: &BTreeCell) -> Result<IOResult<()>> {
loop {
let state = self.overflow_state.take().unwrap_or(OverflowState::Start);
match state {
OverflowState::Start => {
let first_overflow_page = match cell {
BTreeCell::TableLeafCell(leaf_cell) => leaf_cell.first_overflow_page,
BTreeCell::IndexLeafCell(leaf_cell) => leaf_cell.first_overflow_page,
BTreeCell::IndexInteriorCell(interior_cell) => {
interior_cell.first_overflow_page
}
BTreeCell::TableInteriorCell(_) => return Ok(IOResult::Done(())), // No overflow pages
};
if let Some(page) = first_overflow_page {
self.overflow_state = Some(OverflowState::ProcessPage { next_page: page });
continue;
} else {
self.overflow_state = Some(OverflowState::Done);
}
}
OverflowState::ProcessPage { next_page } => {
if next_page < 2
|| next_page as usize
> header_accessor::get_database_size(&self.pager)? as usize
{
self.overflow_state = None;
return Err(LimboError::Corrupt("Invalid overflow page number".into()));
}
let page = self.read_page(next_page as usize)?;
return_if_locked_maybe_load!(self.pager, page);
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
let next = contents.read_u32(0);
return_if_io!(self.pager.free_page(Some(page), next_page as usize));
if next != 0 {
self.overflow_state = Some(OverflowState::ProcessPage { next_page: next });
} else {
self.overflow_state = Some(OverflowState::Done);
}
}
OverflowState::Done => {
self.overflow_state = None;
return Ok(IOResult::Done(()));
}
};
}
}
/// Destroys a B-tree by freeing all its pages in an iterative depth-first order.
/// This ensures child pages are freed before their parents
/// Uses a state machine to keep track of the operation to ensure IO doesn't cause repeated traversals
///
/// # Example
/// For a B-tree with this structure (where 4' is an overflow page):
/// ```text
/// 1 (root)
/// / \
/// 2 3
/// / \ / \
/// 4' <- 4 5 6 7
/// ```
///
/// The destruction order would be: [4',4,5,2,6,7,3,1]
#[instrument(skip(self), level = Level::DEBUG)]
pub fn btree_destroy(&mut self) -> Result<IOResult<Option<usize>>> {
if let CursorState::None = &self.state {
self.move_to_root()?;
self.state = CursorState::Destroy(DestroyInfo {
state: DestroyState::Start,
});
}
loop {
let destroy_state = {
let destroy_info = self
.state
.destroy_info()
.expect("unable to get a mut reference to destroy state in cursor");
destroy_info.state.clone()
};
match destroy_state {
DestroyState::Start => {
let destroy_info = self
.state
.mut_destroy_info()
.expect("unable to get a mut reference to destroy state in cursor");
destroy_info.state = DestroyState::LoadPage;
}
DestroyState::LoadPage => {
let page = self.stack.top();
return_if_locked_maybe_load!(self.pager, page);
let destroy_info = self
.state
.mut_destroy_info()
.expect("unable to get a mut reference to destroy state in cursor");
destroy_info.state = DestroyState::ProcessPage;
}
DestroyState::ProcessPage => {
let page = self.stack.top();
self.stack.advance();
assert!(page.get().is_loaded()); // page should be loaded at this time
let page = page.get();
let contents = page.get().contents.as_ref().unwrap();
let cell_idx = self.stack.current_cell_index();
// If we've processed all cells in this page, figure out what to do with this page
if cell_idx >= contents.cell_count() as i32 {
match (contents.is_leaf(), cell_idx) {
// Leaf pages with all cells processed
(true, n) if n >= contents.cell_count() as i32 => {
let destroy_info = self.state.mut_destroy_info().expect(
"unable to get a mut reference to destroy state in cursor",
);
destroy_info.state = DestroyState::FreePage;
continue;
}
// Non-leaf page which has processed all children but not its potential right child
(false, n) if n == contents.cell_count() as i32 => {
if let Some(rightmost) = contents.rightmost_pointer() {
let rightmost_page = self.read_page(rightmost as usize)?;
self.stack.push(rightmost_page);
let destroy_info = self.state.mut_destroy_info().expect(
"unable to get a mut reference to destroy state in cursor",
);
destroy_info.state = DestroyState::LoadPage;
} else {
let destroy_info = self.state.mut_destroy_info().expect(
"unable to get a mut reference to destroy state in cursor",
);
destroy_info.state = DestroyState::FreePage;
}
continue;
}
// Non-leaf page which has processed all children and its right child
(false, n) if n > contents.cell_count() as i32 => {
let destroy_info = self.state.mut_destroy_info().expect(
"unable to get a mut reference to destroy state in cursor",
);
destroy_info.state = DestroyState::FreePage;
continue;
}
_ => unreachable!("Invalid cell idx state"),
}
}
// We have not yet processed all cells in this page
// Get the current cell
let cell = contents.cell_get(cell_idx as usize, self.usable_space())?;
match contents.is_leaf() {
// For a leaf cell, clear the overflow pages associated with this cell
true => {
let destroy_info = self
.state
.mut_destroy_info()
.expect("unable to get a mut reference to destroy state in cursor");
destroy_info.state = DestroyState::ClearOverflowPages { cell };
continue;
}
// For interior cells, check the type of cell to determine what to do
false => match &cell {
// For index interior cells, remove the overflow pages
BTreeCell::IndexInteriorCell(_) => {
let destroy_info = self.state.mut_destroy_info().expect(
"unable to get a mut reference to destroy state in cursor",
);
destroy_info.state = DestroyState::ClearOverflowPages { cell };
continue;
}
// For all other interior cells, load the left child page
_ => {
let child_page_id = match &cell {
BTreeCell::TableInteriorCell(cell) => cell.left_child_page,
BTreeCell::IndexInteriorCell(cell) => cell.left_child_page,
_ => panic!("expected interior cell"),
};
let child_page = self.read_page(child_page_id as usize)?;
self.stack.push(child_page);
let destroy_info = self.state.mut_destroy_info().expect(
"unable to get a mut reference to destroy state in cursor",
);
destroy_info.state = DestroyState::LoadPage;
continue;
}
},
}
}
DestroyState::ClearOverflowPages { cell } => {
match self.clear_overflow_pages(&cell)? {
IOResult::Done(_) => match cell {
// For an index interior cell, clear the left child page now that overflow pages have been cleared
BTreeCell::IndexInteriorCell(index_int_cell) => {
let child_page =
self.read_page(index_int_cell.left_child_page as usize)?;
self.stack.push(child_page);
let destroy_info = self.state.mut_destroy_info().expect(
"unable to get a mut reference to destroy state in cursor",
);
destroy_info.state = DestroyState::LoadPage;
continue;
}
// For any leaf cell, advance the index now that overflow pages have been cleared
BTreeCell::TableLeafCell(_) | BTreeCell::IndexLeafCell(_) => {
let destroy_info = self.state.mut_destroy_info().expect(
"unable to get a mut reference to destroy state in cursor",
);
destroy_info.state = DestroyState::LoadPage;
}
_ => panic!("unexpected cell type"),
},
IOResult::IO => return Ok(IOResult::IO),
}
}
DestroyState::FreePage => {
let page = self.stack.top();
let page_id = page.get().get().id;
return_if_io!(self.pager.free_page(Some(page.get()), page_id));
if self.stack.has_parent() {
self.stack.pop();
let destroy_info = self
.state
.mut_destroy_info()
.expect("unable to get a mut reference to destroy state in cursor");
destroy_info.state = DestroyState::ProcessPage;
} else {
self.state = CursorState::None;
// TODO: For now, no-op the result and always return None. This will change once [AUTO_VACUUM](https://www.sqlite.org/lang_vacuum.html) is introduced
// At that point, the last root page(call this x) will be moved into the position of the root page of this table and the value returned will be x
return Ok(IOResult::Done(None));
}
}
}
}
}
pub fn table_id(&self) -> usize {
self.root_page
}
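/// Overwrite the record stored in an existing cell. If the new payload is exactly the same size
/// as the old cell's local region, it is overwritten in place; otherwise the old cell is dropped
/// and a new cell is inserted at the same index.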
pub fn overwrite_cell(
&mut self,
page_ref: BTreePage,
cell_idx: usize,
record: &ImmutableRecord,
) -> Result<IOResult<()>> {
// build the new payload
let page = page_ref.get();
let page_contents = page.get().contents.as_ref().unwrap();
let serial_types_len = self.record_cursor.borrow_mut().len(record);
let mut new_payload = Vec::with_capacity(serial_types_len);
let rowid = return_if_io!(self.rowid());
fill_cell_payload(
page_contents,
rowid,
&mut new_payload,
cell_idx,
record,
self.usable_space(),
self.pager.clone(),
);
// figure out old cell offset & size
let (old_offset, old_local_size) = {
let page_ref = page_ref.get();
let page = page_ref.get().contents.as_ref().unwrap();
page.cell_get_raw_region(cell_idx, self.usable_space())
};
// if the new payload is exactly the same size as the old local cell, do an in-place overwrite
if new_payload.len() == old_local_size {
self.overwrite_content(page_ref.clone(), old_offset, &new_payload)?;
Ok(IOResult::Done(()))
} else {
// doesn't fit, drop it and insert a new one
drop_cell(
page_ref.get().get_contents(),
cell_idx,
self.usable_space() as u16,
)?;
insert_into_cell(
page_ref.get().get_contents(),
&new_payload,
cell_idx,
self.usable_space() as u16,
)?;
Ok(IOResult::Done(()))
}
}
pub fn overwrite_content(
&mut self,
page_ref: BTreePage,
dest_offset: usize,
new_payload: &[u8],
) -> Result<IOResult<()>> {
return_if_locked!(page_ref.get());
let page_ref = page_ref.get();
let buf = page_ref.get().contents.as_mut().unwrap().as_ptr();
buf[dest_offset..dest_offset + new_payload.len()].copy_from_slice(new_payload);
Ok(IOResult::Done(()))
}
fn get_immutable_record_or_create(&self) -> std::cell::RefMut<'_, Option<ImmutableRecord>> {
if self.reusable_immutable_record.borrow().is_none() {
let record = ImmutableRecord::new(4096);
self.reusable_immutable_record.replace(Some(record));
}
self.reusable_immutable_record.borrow_mut()
}
fn get_immutable_record(&self) -> std::cell::RefMut<'_, Option<ImmutableRecord>> {
self.reusable_immutable_record.borrow_mut()
}
pub fn is_write_in_progress(&self) -> bool {
matches!(self.state, CursorState::Write(_))
}
/// Count the number of entries in the b-tree
///
/// Only supposed to be used in the context of a simple COUNT(*) SELECT statement
#[instrument(skip(self), level = Level::DEBUG)]
pub fn count(&mut self) -> Result<IOResult<usize>> {
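// A zero running count means no entries have been tallied yet, so (re)position at the root;
// a non-zero count means we are resuming a traversal after an IO yield.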
if self.count == 0 {
self.move_to_root()?;
}
if let Some(_mv_cursor) = &self.mv_cursor {
todo!("Implement count for mvcc");
}
let mut mem_page_rc;
let mut mem_page;
let mut contents;
loop {
mem_page_rc = self.stack.top();
return_if_locked_maybe_load!(self.pager, mem_page_rc);
mem_page = mem_page_rc.get();
contents = mem_page.get().contents.as_ref().unwrap();
/* If this is a leaf page or the tree is not an int-key tree, then
** this page contains countable entries. Increment the entry counter
** accordingly.
*/
if !matches!(contents.page_type(), PageType::TableInterior) {
self.count += contents.cell_count();
}
self.stack.advance();
let cell_idx = self.stack.current_cell_index() as usize;
// The second condition is necessary in case we returned early (page locked) in the loop below and are now resuming
if contents.is_leaf() || cell_idx > contents.cell_count() {
loop {
if !self.stack.has_parent() {
// All pages of the b-tree have been visited. Return successfully
self.move_to_root()?;
return Ok(IOResult::Done(self.count));
}
// Move to parent
self.stack.pop();
mem_page_rc = self.stack.top();
return_if_locked_maybe_load!(self.pager, mem_page_rc);
mem_page = mem_page_rc.get();
contents = mem_page.get().contents.as_ref().unwrap();
let cell_idx = self.stack.current_cell_index() as usize;
if cell_idx <= contents.cell_count() {
break;
}
}
}
let cell_idx = self.stack.current_cell_index() as usize;
assert!(cell_idx <= contents.cell_count());
assert!(!contents.is_leaf());
if cell_idx == contents.cell_count() {
// Move to right child
// should be safe as contents is not a leaf page
let right_most_pointer = contents.rightmost_pointer().unwrap();
self.stack.advance();
let mem_page = self.read_page(right_most_pointer as usize)?;
self.stack.push(mem_page);
} else {
// Move to child left page
let cell = contents.cell_get(cell_idx, self.usable_space())?;
match cell {
BTreeCell::TableInteriorCell(TableInteriorCell {
left_child_page, ..
})
| BTreeCell::IndexInteriorCell(IndexInteriorCell {
left_child_page, ..
}) => {
self.stack.advance();
let mem_page = self.read_page(left_child_page as usize)?;
self.stack.push(mem_page);
}
_ => unreachable!(),
}
}
}
}
// Save cursor context, to be restored later
pub fn save_context(&mut self, cursor_context: CursorContext) {
self.valid_state = CursorValidState::RequireSeek;
self.context = Some(cursor_context);
}
/// If context is defined, restore it and set it to None on success
#[instrument(skip_all, level = Level::DEBUG)]
fn restore_context(&mut self) -> Result<IOResult<()>> {
if self.context.is_none() || matches!(self.valid_state, CursorValidState::Valid) {
return Ok(IOResult::Done(()));
}
if let CursorValidState::RequireAdvance(direction) = self.valid_state {
let has_record = return_if_io!(match direction {
// Avoid calling next()/prev() directly because they immediately call restore_context()
IterationDirection::Forwards => self.get_next_record(),
IterationDirection::Backwards => self.get_prev_record(),
});
self.has_record.set(has_record);
self.invalidate_record();
self.context = None;
self.valid_state = CursorValidState::Valid;
return Ok(IOResult::Done(()));
}
let ctx = self.context.take().unwrap();
let seek_key = match ctx {
CursorContext::TableRowId(rowid) => SeekKey::TableRowId(rowid),
CursorContext::IndexKeyRowId(ref record) => SeekKey::IndexKey(record),
};
let res = self.seek(seek_key, SeekOp::GE { eq_only: true })?;
match res {
IOResult::Done(res) => {
if let SeekResult::TryAdvance = res {
self.valid_state =
CursorValidState::RequireAdvance(IterationDirection::Forwards);
self.context = Some(ctx);
return Ok(IOResult::IO);
}
self.valid_state = CursorValidState::Valid;
Ok(IOResult::Done(()))
}
IOResult::IO => {
self.context = Some(ctx);
Ok(IOResult::IO)
}
}
}
pub fn read_page(&self, page_idx: usize) -> Result<BTreePage> {
btree_read_page(&self.pager, page_idx)
}
pub fn allocate_page(&self, page_type: PageType, offset: usize) -> Result<BTreePage> {
self.pager
.do_allocate_page(page_type, offset, BtreePageAllocMode::Any)
}
}
#[derive(Debug, thiserror::Error)]
pub enum IntegrityCheckError {
#[error("Cell {cell_idx} in page {page_id} is out of range. cell_range={cell_start}..{cell_end}, content_area={content_area}, usable_space={usable_space}")]
CellOutOfRange {
cell_idx: usize,
page_id: usize,
cell_start: usize,
cell_end: usize,
content_area: usize,
usable_space: usize,
},
#[error("Cell {cell_idx} in page {page_id} extends out of page. cell_range={cell_start}..{cell_end}, content_area={content_area}, usable_space={usable_space}")]
CellOverflowsPage {
cell_idx: usize,
page_id: usize,
cell_start: usize,
cell_end: usize,
content_area: usize,
usable_space: usize,
},
#[error("Page {page_id} cell {cell_idx} has rowid={rowid} in wrong order. Parent cell has parent_rowid={max_intkey} and next_rowid={next_rowid}")]
CellRowidOutOfRange {
page_id: usize,
cell_idx: usize,
rowid: i64,
max_intkey: i64,
next_rowid: i64,
},
#[error("Page {page_id} is at different depth from another leaf page this_page_depth={this_page_depth}, other_page_depth={other_page_depth} ")]
LeafDepthMismatch {
page_id: usize,
this_page_depth: usize,
other_page_depth: usize,
},
#[error("Page {page_id} detected freeblock that extends page start={start} end={end}")]
FreeBlockOutOfRange {
page_id: usize,
start: usize,
end: usize,
},
#[error("Page {page_id} cell overlap detected at position={start} with previous_end={prev_end}. content_area={content_area}, is_free_block={is_free_block}")]
CellOverlap {
page_id: usize,
start: usize,
prev_end: usize,
content_area: usize,
is_free_block: bool,
},
#[error("Page {page_id} unexpected fragmentation got={got}, expected={expected}")]
UnexpectedFragmentation {
page_id: usize,
got: usize,
expected: usize,
},
}
#[derive(Clone)]
struct IntegrityCheckPageEntry {
page_idx: usize,
level: usize,
max_intkey: i64,
}
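/// Reentrant state for [integrity_check]: an explicit stack of pages left to verify replaces
/// recursion, so the check can be suspended on pending IO and resumed later.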
pub struct IntegrityCheckState {
pub current_page: usize,
page_stack: Vec<IntegrityCheckPageEntry>,
first_leaf_level: Option<usize>,
}
impl IntegrityCheckState {
pub fn new(page_idx: usize) -> Self {
Self {
current_page: page_idx,
page_stack: vec![IntegrityCheckPageEntry {
page_idx,
level: 0,
max_intkey: i64::MAX,
}],
first_leaf_level: None,
}
}
}
impl std::fmt::Debug for IntegrityCheckState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("IntegrityCheckState")
.field("current_page", &self.current_page)
.field("first_leaf_level", &self.first_leaf_level)
.finish()
}
}
/// Perform an integrity check on a whole table/index. We check for:
/// 1. Correct order of keys in the case of rowids.
/// 2. No overlap between cells.
/// 3. Cells do not escape their expected range.
/// 4. All leaf pages are at equal depth.
/// 5. Overflow pages are correct (TODO)
///
/// In order to keep this reentrant, we keep a stack of pages we need to check. Ideally, like
/// SQLite, we would have implemented a recursive solution, which would make it easier to check
/// the depth.
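///
/// A minimal sketch of driving the check to completion from within this module (`page_stack`
/// is private, so the emptiness test below assumes module-local access; `root_page_idx` and
/// `pager` are illustrative names):
///
/// ```ignore
/// let mut state = IntegrityCheckState::new(root_page_idx);
/// let mut errors = Vec::new();
/// while !state.page_stack.is_empty() {
///     // Each Done(()) verifies one page and pushes its children onto the stack.
///     if let IOResult::IO = integrity_check(&mut state, &mut errors, &pager)? {
///         // The page read is still in flight; run the event loop and retry.
///         pager.io.run_once()?;
///     }
/// }
/// assert!(errors.is_empty(), "integrity check failed: {errors:?}");
/// ```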
pub fn integrity_check(
state: &mut IntegrityCheckState,
errors: &mut Vec<IntegrityCheckError>,
pager: &Rc<Pager>,
) -> Result<IOResult<()>> {
let Some(IntegrityCheckPageEntry {
page_idx,
level,
max_intkey,
}) = state.page_stack.last().cloned()
else {
return Ok(IOResult::Done(()));
};
let page = btree_read_page(pager, page_idx)?;
return_if_locked_maybe_load!(pager, page);
state.page_stack.pop();
let page = page.get();
let contents = page.get_contents();
let usable_space = pager.usable_space() as u16;
let mut coverage_checker = CoverageChecker::new(page.get().id);
    // Now we check every cell for a few things:
    // 1. Check the cell is in the correct range: it must not extend past the end of the page and
    //    must not start before the cell content area.
    // 2. We add the cell to the coverage checker in order to check that cells do not overlap.
    // 3. We check the order of rowids in the case of table pages. We iterate backwards in order
    //    to check that the current cell's rowid is less than the next cell's. We also check that
    //    the rowid is less than the parent's divider cell. In case this page is the root page,
    //    the max rowid will be i64::MAX.
    // 4. We append child pages to the stack to check later.
    // 5. In the case of a leaf page, check that the current level (depth) is equal to the other
    //    leaf pages we have seen.
let mut next_rowid = max_intkey;
for cell_idx in (0..contents.cell_count()).rev() {
let (cell_start, cell_length) =
contents.cell_get_raw_region(cell_idx, usable_space as usize);
if cell_start < contents.cell_content_area() as usize
|| cell_start > usable_space as usize - 4
{
errors.push(IntegrityCheckError::CellOutOfRange {
cell_idx,
page_id: page.get().id,
cell_start,
cell_end: cell_start + cell_length,
content_area: contents.cell_content_area() as usize,
usable_space: usable_space as usize,
});
}
if cell_start + cell_length > usable_space as usize {
errors.push(IntegrityCheckError::CellOverflowsPage {
cell_idx,
page_id: page.get().id,
cell_start,
cell_end: cell_start + cell_length,
content_area: contents.cell_content_area() as usize,
usable_space: usable_space as usize,
});
}
coverage_checker.add_cell(cell_start, cell_start + cell_length);
let cell = contents.cell_get(cell_idx, usable_space as usize)?;
match cell {
BTreeCell::TableInteriorCell(table_interior_cell) => {
state.page_stack.push(IntegrityCheckPageEntry {
page_idx: table_interior_cell.left_child_page as usize,
level: level + 1,
max_intkey: table_interior_cell.rowid,
});
let rowid = table_interior_cell.rowid;
if rowid > max_intkey || rowid > next_rowid {
errors.push(IntegrityCheckError::CellRowidOutOfRange {
page_id: page.get().id,
cell_idx,
rowid,
max_intkey,
next_rowid,
});
}
next_rowid = rowid;
}
BTreeCell::TableLeafCell(table_leaf_cell) => {
                // check that the depth of this leaf page equals the other leaf pages
if let Some(expected_leaf_level) = state.first_leaf_level {
if expected_leaf_level != level {
errors.push(IntegrityCheckError::LeafDepthMismatch {
page_id: page.get().id,
this_page_depth: level,
other_page_depth: expected_leaf_level,
});
}
} else {
state.first_leaf_level = Some(level);
}
let rowid = table_leaf_cell.rowid;
if rowid > max_intkey || rowid > next_rowid {
errors.push(IntegrityCheckError::CellRowidOutOfRange {
page_id: page.get().id,
cell_idx,
rowid,
max_intkey,
next_rowid,
});
}
next_rowid = rowid;
}
BTreeCell::IndexInteriorCell(index_interior_cell) => {
state.page_stack.push(IntegrityCheckPageEntry {
page_idx: index_interior_cell.left_child_page as usize,
level: level + 1,
max_intkey, // we don't care about intkey in non-table pages
});
}
BTreeCell::IndexLeafCell(_) => {
                // check that the depth of this leaf page equals the other leaf pages
if let Some(expected_leaf_level) = state.first_leaf_level {
if expected_leaf_level != level {
errors.push(IntegrityCheckError::LeafDepthMismatch {
page_id: page.get().id,
this_page_depth: level,
other_page_depth: expected_leaf_level,
});
}
} else {
state.first_leaf_level = Some(level);
}
}
}
}
// Now we add free blocks to the coverage checker
let first_freeblock = contents.first_freeblock();
if first_freeblock > 0 {
let mut pc = first_freeblock;
while pc > 0 {
let next = contents.read_u16_no_offset(pc as usize);
let size = contents.read_u16_no_offset(pc as usize + 2) as usize;
// check it doesn't go out of range
if pc > usable_space - 4 {
errors.push(IntegrityCheckError::FreeBlockOutOfRange {
page_id: page.get().id,
start: pc as usize,
end: pc as usize + size,
});
break;
}
coverage_checker.add_free_block(pc as usize, pc as usize + size);
pc = next;
}
}
// Let's check the overlap of freeblocks and cells now that we have collected them all.
coverage_checker.analyze(
usable_space,
contents.cell_content_area() as usize,
errors,
contents.num_frag_free_bytes() as usize,
);
Ok(IOResult::Done(()))
}
pub fn btree_read_page(pager: &Rc<Pager>, page_idx: usize) -> Result<BTreePage> {
pager.read_page(page_idx).map(|page| {
Arc::new(BTreePageInner {
page: RefCell::new(page),
})
})
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct IntegrityCheckCellRange {
start: usize,
end: usize,
is_free_block: bool,
}
// Implement ordering for min-heap (smallest start address first)
impl Ord for IntegrityCheckCellRange {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.start.cmp(&other.start)
}
}
impl PartialOrd for IntegrityCheckCellRange {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
#[cfg(debug_assertions)]
fn validate_cells_after_insertion(cell_array: &CellArray, leaf_data: bool) {
for cell in &cell_array.cell_payloads {
assert!(cell.len() >= 4);
if leaf_data {
assert!(cell[0] != 0, "payload is {cell:?}");
}
}
}
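/// Checks that cells and freeblocks tile the cell content area without overlapping.
/// Ranges are collected into a min-heap and popped in ascending start order by
/// [CoverageChecker::analyze], which compares each range's start against the previous range's
/// end and sums the gaps in between; the total is then checked against the page's
/// fragmented-bytes counter.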
pub struct CoverageChecker {
/// Min-heap ordered by cell start
heap: BinaryHeap<Reverse<IntegrityCheckCellRange>>,
page_idx: usize,
}
impl CoverageChecker {
pub fn new(page_idx: usize) -> Self {
Self {
heap: BinaryHeap::new(),
page_idx,
}
}
fn add_range(&mut self, cell_start: usize, cell_end: usize, is_free_block: bool) {
self.heap.push(Reverse(IntegrityCheckCellRange {
start: cell_start,
end: cell_end,
is_free_block,
}));
}
pub fn add_cell(&mut self, cell_start: usize, cell_end: usize) {
self.add_range(cell_start, cell_end, false);
}
pub fn add_free_block(&mut self, cell_start: usize, cell_end: usize) {
self.add_range(cell_start, cell_end, true);
}
pub fn analyze(
&mut self,
usable_space: u16,
content_area: usize,
errors: &mut Vec<IntegrityCheckError>,
expected_fragmentation: usize,
) {
let mut fragmentation = 0;
let mut prev_end = content_area;
while let Some(cell) = self.heap.pop() {
let start = cell.0.start;
if prev_end > start {
errors.push(IntegrityCheckError::CellOverlap {
page_id: self.page_idx,
start,
prev_end,
content_area,
is_free_block: cell.0.is_free_block,
});
break;
} else {
fragmentation += start - prev_end;
prev_end = cell.0.end;
}
}
fragmentation += usable_space as usize - prev_end;
if fragmentation != expected_fragmentation {
errors.push(IntegrityCheckError::UnexpectedFragmentation {
page_id: self.page_idx,
got: fragmentation,
expected: expected_fragmentation,
});
}
}
}
/// Stack of pages representing the tree traversal order.
/// current_page represents the current page being used in the tree and current_page - 1 would be
/// the parent. Using current_page + 1 or higher is undefined behaviour.
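///
/// For example, after descending root -> interior -> leaf, the stack holds
/// [root, interior, leaf] with current_page == 2; pop() unpins the leaf and moves back up,
/// leaving current_page == 1 pointing at the interior page.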
struct PageStack {
/// Pointer to the current page being consumed
current_page: Cell<i32>,
    /// List of pages in the stack. The root page will be at index 0.
pub stack: RefCell<[Option<BTreePage>; BTCURSOR_MAX_DEPTH + 1]>,
/// List of cell indices in the stack.
/// node_states[current_page] is the current cell index being consumed. Similarly
/// node_states[current_page-1] is the cell index of the parent of the current page
/// that we save in case of going back up.
/// There are two points that need special attention:
/// If node_states[current_page] = -1, it indicates that the current iteration has reached the start of the current_page
/// If node_states[current_page] = `cell_count`, it means that the current iteration has reached the end of the current_page
node_states: RefCell<[BTreeNodeState; BTCURSOR_MAX_DEPTH + 1]>,
}
impl PageStack {
fn increment_current(&self) {
self.current_page.set(self.current_page.get() + 1);
}
fn decrement_current(&self) {
assert!(self.current_page.get() > 0);
self.current_page.set(self.current_page.get() - 1);
}
/// Push a new page onto the stack.
/// This effectively means traversing to a child page.
#[instrument(skip_all, level = Level::DEBUG, name = "pagestack::push")]
fn _push(&self, page: BTreePage, starting_cell_idx: i32) {
tracing::trace!(
current = self.current_page.get(),
new_page_id = page.get().get().id,
);
'validate: {
let current = self.current_page.get();
if current == -1 {
break 'validate;
}
let stack = self.stack.borrow();
let current_top = stack[current as usize].as_ref();
if let Some(current_top) = current_top {
turso_assert!(
current_top.get().get().id != page.get().get().id,
"about to push page {} twice",
page.get().get().id
);
}
}
self.populate_parent_cell_count();
self.increment_current();
let current = self.current_page.get();
assert!(
current < BTCURSOR_MAX_DEPTH as i32,
"corrupted database, stack is bigger than expected"
);
assert!(current >= 0);
// Pin the page to prevent it from being evicted while on the stack
page.get().pin();
self.stack.borrow_mut()[current as usize] = Some(page);
self.node_states.borrow_mut()[current as usize] = BTreeNodeState {
cell_idx: starting_cell_idx,
            cell_count: None, // we don't know the cell count yet, so we set it to None. Any code pushing a child page onto the stack MUST set the parent page's cell_count.
};
}
/// Populate the parent page's cell count.
    /// This is needed so that we can, from a child page, check an ancestor page's position relative to its cell index
/// without having to perform IO to get the ancestor page contents.
///
/// This rests on the assumption that the parent page is already in memory whenever a child is pushed onto the stack.
/// We currently ensure this by pinning all the pages on [PageStack] to the page cache so that they cannot be evicted.
fn populate_parent_cell_count(&self) {
let stack_empty = self.current_page.get() == -1;
if stack_empty {
return;
}
let current = self.current();
let stack = self.stack.borrow();
let page = stack[current].as_ref().unwrap();
let page = page.get();
turso_assert!(
page.is_pinned(),
"parent page {} is not pinned",
page.get().id
);
turso_assert!(
page.is_loaded(),
"parent page {} is not loaded",
page.get().id
);
let contents = page.get_contents();
let cell_count = contents.cell_count() as i32;
self.node_states.borrow_mut()[current].cell_count = Some(cell_count);
}
fn push(&self, page: BTreePage) {
self._push(page, -1);
}
fn push_backwards(&self, page: BTreePage) {
self._push(page, i32::MAX);
}
/// Pop a page off the stack.
/// This effectively means traversing back up to a parent page.
#[instrument(skip_all, level = Level::DEBUG, name = "pagestack::pop")]
fn pop(&self) {
let current = self.current_page.get();
assert!(current >= 0);
tracing::trace!(current);
// Unpin the page before removing it from the stack
if let Some(page) = &self.stack.borrow()[current as usize] {
page.get().unpin();
}
self.node_states.borrow_mut()[current as usize] = BTreeNodeState::default();
self.stack.borrow_mut()[current as usize] = None;
self.decrement_current();
}
/// Get the top page on the stack.
/// This is the page that is currently being traversed.
#[instrument(skip(self), level = Level::DEBUG, name = "pagestack::top", )]
fn top(&self) -> BTreePage {
let page = self.stack.borrow()[self.current()]
.as_ref()
.unwrap()
.clone();
tracing::trace!(current = self.current(), page_id = page.get().get().id);
page
}
/// Current page pointer being used
fn current(&self) -> usize {
let current = self.current_page.get() as usize;
assert!(self.current_page.get() >= 0);
current
}
/// Cell index of the current page
fn current_cell_index(&self) -> i32 {
let current = self.current();
self.node_states.borrow()[current].cell_idx
}
/// Check if the current cell index is less than 0.
/// This means we have been iterating backwards and have reached the start of the page.
fn current_cell_index_less_than_min(&self) -> bool {
let cell_idx = self.current_cell_index();
cell_idx < 0
}
/// Advance the current cell index of the current page to the next cell.
    /// We usually advance after traversing to a new page.
#[instrument(skip(self), level = Level::DEBUG, name = "pagestack::advance",)]
fn advance(&self) {
let current = self.current();
tracing::trace!(
curr_cell_index = self.node_states.borrow()[current].cell_idx,
node_states = ?self.node_states.borrow().iter().map(|state| state.cell_idx).collect::<Vec<_>>(),
);
self.node_states.borrow_mut()[current].cell_idx += 1;
}
#[instrument(skip(self), level = Level::DEBUG, name = "pagestack::retreat")]
fn retreat(&self) {
let current = self.current();
tracing::trace!(
curr_cell_index = self.node_states.borrow()[current].cell_idx,
node_states = ?self.node_states.borrow().iter().map(|state| state.cell_idx).collect::<Vec<_>>(),
);
self.node_states.borrow_mut()[current].cell_idx -= 1;
}
fn set_cell_index(&self, idx: i32) {
let current = self.current();
self.node_states.borrow_mut()[current].cell_idx = idx;
}
fn has_parent(&self) -> bool {
self.current_page.get() > 0
}
/// Get a page at a specific level in the stack (0 = root, 1 = first child, etc.)
fn get_page_at_level(&self, level: usize) -> Option<BTreePage> {
let stack = self.stack.borrow();
if level < stack.len() {
stack[level].clone()
} else {
None
}
}
fn unpin_all_if_pinned(&self) {
self.stack
.borrow_mut()
.iter_mut()
.flatten()
.for_each(|page| {
let _ = page.get().try_unpin();
});
}
fn clear(&self) {
self.unpin_all_if_pinned();
self.current_page.set(-1);
}
}
impl Drop for PageStack {
fn drop(&mut self) {
self.unpin_all_if_pinned();
}
}
/// Used for redistributing cells during a balance operation.
struct CellArray {
/// The actual cell data.
/// For all other page types except table leaves, this will also contain the associated divider cell from the parent page.
cell_payloads: Vec<&'static mut [u8]>,
/// Prefix sum of cells in each page.
/// For example, if three pages have 1, 2, and 3 cells, respectively,
/// then cell_count_per_page_cumulative will be [1, 3, 6].
cell_count_per_page_cumulative: [u16; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE],
}
impl CellArray {
pub fn cell_size_bytes(&self, cell_idx: usize) -> u16 {
self.cell_payloads[cell_idx].len() as u16
}
/// Returns the number of cells up to and including the given page.
pub fn cell_count_up_to_page(&self, page_idx: usize) -> usize {
self.cell_count_per_page_cumulative[page_idx] as usize
}
}
impl BTreePageInner {
pub fn get(&self) -> PageRef {
self.page.borrow().clone()
}
}
/// Try to find a free block big enough for the requested amount, and allocate from it if found.
fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> Result<usize> {
    // NOTE: the freelist is kept in ascending order of offset (pc)
    // The reserved bytes at the end of the page are unusable, so maxpc is computed by
    // subtracting the requested amount from usable_space.
let mut prev_pc = page_ref.offset + offset::BTREE_FIRST_FREEBLOCK;
let mut pc = page_ref.first_freeblock() as usize;
let maxpc = usable_space as usize - amount;
while pc <= maxpc {
if pc + 4 > usable_space as usize {
return_corrupt!("Free block header extends beyond page");
}
let next = page_ref.read_u16_no_offset(pc);
let size = page_ref.read_u16_no_offset(pc + 2);
if amount <= size as usize {
let new_size = size as usize - amount;
if new_size < 4 {
                // Using a free slot that would leave behind a very small fragment (< 4 bytes)
                // must not cause the total fragmentation to exceed the limit of 60 bytes;
                // see the SQLite docs: https://www.sqlite.org/fileformat.html#:~:text=A%20freeblock%20requires,not%20exceed%2060
if page_ref.num_frag_free_bytes() > 57 {
return Ok(0);
}
// Delete the slot from freelist and update the page's fragment count.
page_ref.write_u16_no_offset(prev_pc, next);
let frag = page_ref.num_frag_free_bytes() + new_size as u8;
page_ref.write_u8(offset::BTREE_FRAGMENTED_BYTES_COUNT, frag);
return Ok(pc);
} else if new_size + pc > maxpc {
return_corrupt!("Free block extends beyond page end");
} else {
// Requested amount fits inside the current free slot so we reduce its size
// to account for newly allocated space.
page_ref.write_u16_no_offset(pc + 2, new_size as u16);
return Ok(pc + new_size);
}
}
prev_pc = pc;
pc = next as usize;
if pc <= prev_pc {
if pc != 0 {
return_corrupt!("Free list not in ascending order");
}
return Ok(0);
}
}
if pc > maxpc + amount - 4 {
return_corrupt!("Free block chain extends beyond page end");
}
Ok(0)
}
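/// Initialize a page as an empty b-tree page of the given type. The header is written at
/// `offset` (non-zero only for page 1, which begins with the database header), and the cell
/// content area pointer is set to `usable_space`, i.e. the end of the page, since an empty
/// page has no cells.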
pub fn btree_init_page(page: &BTreePage, page_type: PageType, offset: usize, usable_space: u16) {
// setup btree page
let contents = page.get();
tracing::debug!(
"btree_init_page(id={}, offset={})",
contents.get().id,
offset
);
let contents = contents.get().contents.as_mut().unwrap();
contents.offset = offset;
let id = page_type as u8;
contents.write_u8(offset::BTREE_PAGE_TYPE, id);
contents.write_u16(offset::BTREE_FIRST_FREEBLOCK, 0);
contents.write_u16(offset::BTREE_CELL_COUNT, 0);
contents.write_u16(offset::BTREE_CELL_CONTENT_AREA, usable_space);
contents.write_u8(offset::BTREE_FRAGMENTED_BYTES_COUNT, 0);
contents.write_u32(offset::BTREE_RIGHTMOST_PTR, 0);
}
fn to_static_buf(buf: &mut [u8]) -> &'static mut [u8] {
unsafe { std::mem::transmute::<&mut [u8], &'static mut [u8]>(buf) }
}
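/// Rewrite the cell set of `page` during balancing. On entry the page holds the cells of
/// `cell_array` starting at index `start_old_cells` (plus any pending overflow cells); on exit
/// it holds exactly `cell_array[start_new_cells..start_new_cells + number_new_cells]`. Cells
/// outside the new window are freed, the page is defragmented, missing cells (including
/// overflow cells that fall inside the window) are inserted, and the header cell count is
/// updated.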
fn edit_page(
page: &mut PageContent,
start_old_cells: usize,
start_new_cells: usize,
number_new_cells: usize,
cell_array: &CellArray,
usable_space: u16,
) -> Result<()> {
tracing::debug!(
"edit_page start_old_cells={} start_new_cells={} number_new_cells={} cell_array={}",
start_old_cells,
start_new_cells,
number_new_cells,
cell_array.cell_payloads.len()
);
let end_old_cells = start_old_cells + page.cell_count() + page.overflow_cells.len();
let end_new_cells = start_new_cells + number_new_cells;
let mut count_cells = page.cell_count();
if start_old_cells < start_new_cells {
debug_validate_cells!(page, usable_space);
let number_to_shift = page_free_array(
page,
start_old_cells,
start_new_cells - start_old_cells,
cell_array,
usable_space,
)?;
// shift pointers left
shift_cells_left(page, count_cells, number_to_shift);
count_cells -= number_to_shift;
debug_validate_cells!(page, usable_space);
}
if end_new_cells < end_old_cells {
debug_validate_cells!(page, usable_space);
let number_tail_removed = page_free_array(
page,
end_new_cells,
end_old_cells - end_new_cells,
cell_array,
usable_space,
)?;
assert!(count_cells >= number_tail_removed);
count_cells -= number_tail_removed;
debug_validate_cells!(page, usable_space);
}
    // TODO: make page_free_array defragment; I'm lazy, so this will work for now.
defragment_page(page, usable_space);
// TODO: add to start
if start_new_cells < start_old_cells {
let count = number_new_cells.min(start_old_cells - start_new_cells);
page_insert_array(page, start_new_cells, count, cell_array, 0, usable_space)?;
count_cells += count;
}
// TODO: overflow cells
debug_validate_cells!(page, usable_space);
for i in 0..page.overflow_cells.len() {
let overflow_cell = &page.overflow_cells[i];
// cell index in context of new list of cells that should be in the page
if start_old_cells + overflow_cell.index >= start_new_cells {
let cell_idx = start_old_cells + overflow_cell.index - start_new_cells;
if cell_idx < number_new_cells {
count_cells += 1;
page_insert_array(
page,
start_new_cells + cell_idx,
1,
cell_array,
cell_idx,
usable_space,
)?;
}
}
}
debug_validate_cells!(page, usable_space);
// TODO: append cells to end
page_insert_array(
page,
start_new_cells + count_cells,
number_new_cells - count_cells,
cell_array,
count_cells,
usable_space,
)?;
debug_validate_cells!(page, usable_space);
// TODO: noverflow
page.write_u16(offset::BTREE_CELL_COUNT, number_new_cells as u16);
Ok(())
}
/// Shifts the cell pointers in the B-tree page to the left by a specified number of positions.
///
/// # Parameters
/// - `page`: A mutable reference to the `PageContent` representing the B-tree page.
/// - `count_cells`: The total number of cells currently in the page.
/// - `number_to_shift`: The number of cell pointers to shift to the left.
///
/// # Behavior
/// This function modifies the cell pointer array within the page by copying memory regions.
/// It shifts the pointers starting from `number_to_shift` to the beginning of the array,
/// effectively removing the first `number_to_shift` pointers.
fn shift_cells_left(page: &mut PageContent, count_cells: usize, number_to_shift: usize) {
let buf = page.as_ptr();
let (start, _) = page.cell_pointer_array_offset_and_size();
buf.copy_within(
start + (number_to_shift * 2)..start + (count_cells * 2),
start,
);
}
fn page_free_array(
page: &mut PageContent,
first: usize,
count: usize,
cell_array: &CellArray,
usable_space: u16,
) -> Result<usize> {
tracing::debug!("page_free_array {}..{}", first, first + count);
let buf = &mut page.as_ptr()[page.offset..usable_space as usize];
let buf_range = buf.as_ptr_range();
let mut number_of_cells_removed = 0;
let mut number_of_cells_buffered = 0;
let mut buffered_cells_offsets: [u16; 10] = [0; 10];
let mut buffered_cells_ends: [u16; 10] = [0; 10];
for i in first..first + count {
let cell = &cell_array.cell_payloads[i];
let cell_pointer = cell.as_ptr_range();
// check if not overflow cell
if cell_pointer.start >= buf_range.start && cell_pointer.start < buf_range.end {
assert!(
cell_pointer.end >= buf_range.start && cell_pointer.end <= buf_range.end,
"whole cell should be inside the page"
);
// TODO: remove pointer too
let offset = (cell_pointer.start as usize - buf_range.start as usize) as u16;
let len = (cell_pointer.end as usize - cell_pointer.start as usize) as u16;
assert!(len > 0, "cell size should be greater than 0");
let end = offset + len;
/* Try to merge the current cell with a contiguous buffered cell to reduce the number of
* `free_cell_range()` operations. Break on the first merge to avoid consuming too much time,
* `free_cell_range()` will try to merge contiguous cells anyway. */
let mut j = 0;
while j < number_of_cells_buffered {
// If the buffered cell is immediately after the current cell
if buffered_cells_offsets[j] == end {
// Merge them by updating the buffered cell's offset to the current cell's offset
buffered_cells_offsets[j] = offset;
break;
// If the buffered cell is immediately before the current cell
} else if buffered_cells_ends[j] == offset {
// Merge them by updating the buffered cell's end offset to the current cell's end offset
buffered_cells_ends[j] = end;
break;
}
j += 1;
}
// If no cells were merged
if j >= number_of_cells_buffered {
// If the buffered cells array is full, flush the buffered cells using `free_cell_range()` to empty the array
if number_of_cells_buffered >= buffered_cells_offsets.len() {
for j in 0..number_of_cells_buffered {
free_cell_range(
page,
buffered_cells_offsets[j],
buffered_cells_ends[j] - buffered_cells_offsets[j],
usable_space,
)?;
}
number_of_cells_buffered = 0; // Reset array counter
}
// Buffer the current cell
buffered_cells_offsets[number_of_cells_buffered] = offset;
buffered_cells_ends[number_of_cells_buffered] = end;
number_of_cells_buffered += 1;
}
number_of_cells_removed += 1;
}
}
for j in 0..number_of_cells_buffered {
free_cell_range(
page,
buffered_cells_offsets[j],
buffered_cells_ends[j] - buffered_cells_offsets[j],
usable_space,
)?;
}
page.write_u16(
offset::BTREE_CELL_COUNT,
page.cell_count() as u16 - number_of_cells_removed as u16,
);
Ok(number_of_cells_removed)
}
fn page_insert_array(
page: &mut PageContent,
first: usize,
count: usize,
cell_array: &CellArray,
mut start_insert: usize,
usable_space: u16,
) -> Result<()> {
// TODO: implement faster algorithm, this is doing extra work that's not needed.
    // See pageInsertArray to understand the faster way.
tracing::debug!(
"page_insert_array(cell_array.cells={}..{}, cell_count={}, page_type={:?})",
first,
first + count,
page.cell_count(),
page.page_type()
);
for i in first..first + count {
insert_into_cell_during_balance(
page,
cell_array.cell_payloads[i],
start_insert,
usable_space,
)?;
start_insert += 1;
}
debug_validate_cells!(page, usable_space);
Ok(())
}
/// Free the range of bytes that a cell occupies.
/// This function also updates the freeblock list in the page.
/// Freeblocks are used to keep track of free space in the page,
/// and are organized as a linked list.
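///
/// For example, freeing the range 1000..1100 when a freeblock already starts at byte 1100
/// coalesces the two into a single freeblock starting at 1000; a gap of up to 3 bytes between
/// the two ranges would be absorbed as well and subtracted from the fragmented-bytes counter.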
fn free_cell_range(
page: &mut PageContent,
mut offset: u16,
len: u16,
usable_space: u16,
) -> Result<()> {
if len < 4 {
return_corrupt!("Minimum cell size is 4");
}
if offset > usable_space.saturating_sub(4) {
return_corrupt!("Start offset beyond usable space");
}
let mut size = len;
let mut end = offset + len;
let mut pointer_to_pc = page.offset as u16 + 1;
// if the freeblock list is empty, we set this block as the first freeblock in the page header.
let pc = if page.first_freeblock() == 0 {
0
} else {
// if the freeblock list is not empty, and the offset is greater than the first freeblock,
// then we need to do some more calculation to figure out where to insert the freeblock
// in the freeblock linked list.
let first_block = page.first_freeblock();
let mut pc = first_block;
while pc < offset {
if pc <= pointer_to_pc {
if pc == 0 {
break;
}
return_corrupt!("free cell range free block not in ascending order");
}
let next = page.read_u16_no_offset(pc as usize);
pointer_to_pc = pc;
pc = next;
}
if pc > usable_space - 4 {
return_corrupt!("Free block beyond usable space");
}
let mut removed_fragmentation = 0;
if pc > 0 && offset + len + 3 >= pc {
removed_fragmentation = (pc - end) as u8;
if end > pc {
return_corrupt!("Invalid block overlap");
}
end = pc + page.read_u16_no_offset(pc as usize + 2);
if end > usable_space {
return_corrupt!("Coalesced block extends beyond page");
}
size = end - offset;
pc = page.read_u16_no_offset(pc as usize);
}
if pointer_to_pc > page.offset as u16 + 1 {
let prev_end = pointer_to_pc + page.read_u16_no_offset(pointer_to_pc as usize + 2);
if prev_end + 3 >= offset {
if prev_end > offset {
return_corrupt!("Invalid previous block overlap");
}
removed_fragmentation += (offset - prev_end) as u8;
size = end - pointer_to_pc;
offset = pointer_to_pc;
}
}
if removed_fragmentation > page.num_frag_free_bytes() {
return_corrupt!(format!(
"Invalid fragmentation count. Had {} and removed {}",
page.num_frag_free_bytes(),
removed_fragmentation
));
}
let frag = page.num_frag_free_bytes() - removed_fragmentation;
page.write_u8(offset::BTREE_FRAGMENTED_BYTES_COUNT, frag);
pc
};
if (offset as u32) <= page.cell_content_area() {
if (offset as u32) < page.cell_content_area() {
return_corrupt!("Free block before content area");
}
if pointer_to_pc != page.offset as u16 + offset::BTREE_FIRST_FREEBLOCK as u16 {
return_corrupt!("Invalid content area merge");
}
page.write_u16(offset::BTREE_FIRST_FREEBLOCK, pc);
page.write_u16(offset::BTREE_CELL_CONTENT_AREA, end);
} else {
page.write_u16_no_offset(pointer_to_pc as usize, offset);
page.write_u16_no_offset(offset as usize, pc);
page.write_u16_no_offset(offset as usize + 2, size);
}
Ok(())
}
/// Defragment a page. This means packing all the cells to the end of the page.
fn defragment_page(page: &PageContent, usable_space: u16) {
debug_validate_cells!(page, usable_space);
tracing::debug!("defragment_page");
let cloned_page = page.clone();
// TODO(pere): usable space should include offset probably
let mut cbrk = usable_space;
// TODO: implement fast algorithm
let last_cell = usable_space - 4;
let first_cell = cloned_page.unallocated_region_start() as u16;
if cloned_page.cell_count() > 0 {
let read_buf = cloned_page.as_ptr();
let write_buf = page.as_ptr();
for i in 0..cloned_page.cell_count() {
let (cell_offset, _) = page.cell_pointer_array_offset_and_size();
let cell_idx = cell_offset + (i * 2);
let pc = cloned_page.read_u16_no_offset(cell_idx);
if pc > last_cell {
unimplemented!("corrupted page");
}
assert!(pc <= last_cell);
let (_, size) = cloned_page.cell_get_raw_region(i, usable_space as usize);
let size = size as u16;
cbrk -= size;
if cbrk < first_cell || pc + size > usable_space {
todo!("corrupt");
}
assert!(cbrk + size <= usable_space && cbrk >= first_cell);
// set new pointer
page.write_u16_no_offset(cell_idx, cbrk);
// copy payload
write_buf[cbrk as usize..cbrk as usize + size as usize]
.copy_from_slice(&read_buf[pc as usize..pc as usize + size as usize]);
}
}
// assert!( nfree >= 0 );
// if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
// return SQLITE_CORRUPT_PAGE(pPage);
// }
assert!(cbrk >= first_cell);
// set new first byte of cell content
page.write_u16(offset::BTREE_CELL_CONTENT_AREA, cbrk);
    // set the first freeblock to 0; unused space can be retrieved from the gap between the end of the cell pointer array and the content area start
page.write_u16(offset::BTREE_FIRST_FREEBLOCK, 0);
page.write_u8(offset::BTREE_FRAGMENTED_BYTES_COUNT, 0);
debug_validate_cells!(page, usable_space);
}
#[cfg(debug_assertions)]
/// Only enabled in debug mode, where we ensure that all cells are valid.
fn debug_validate_cells_core(page: &PageContent, usable_space: u16) {
for i in 0..page.cell_count() {
let (offset, size) = page.cell_get_raw_region(i, usable_space as usize);
let buf = &page.as_ptr()[offset..offset + size];
// E.g. the following table btree cell may just have two bytes:
// Payload size 0 (stored as SerialTypeKind::ConstInt0)
// Rowid 1 (stored as SerialTypeKind::ConstInt1)
assert!(
size >= 2,
"cell size should be at least 2 bytes idx={i}, cell={buf:?}, offset={offset}"
);
if page.is_leaf() {
assert!(page.as_ptr()[offset] != 0);
}
assert!(
offset + size <= usable_space as usize,
"cell spans out of usable space"
);
}
}
/// Insert a record into a cell.
/// If the page has no room for the cell, an overflow cell is created instead.
/// insert_into_cell() is called from insert_into_page(),
/// and the overflow cell count is used to determine if the page overflows,
/// i.e. whether we need to balance the btree after the insert.
fn _insert_into_cell(
page: &mut PageContent,
payload: &[u8],
cell_idx: usize,
usable_space: u16,
allow_regular_insert_despite_overflow: bool, // see [insert_into_cell_during_balance()]
) -> Result<()> {
assert!(
cell_idx <= page.cell_count() + page.overflow_cells.len(),
"attempting to add cell to an incorrect place cell_idx={} cell_count={} page_type={:?}",
cell_idx,
page.cell_count(),
page.page_type()
);
let already_has_overflow = !page.overflow_cells.is_empty();
let enough_space = if already_has_overflow && !allow_regular_insert_despite_overflow {
false
} else {
// otherwise, we need to check if we have enough space
let free = compute_free_space(page, usable_space);
payload.len() + CELL_PTR_SIZE_BYTES <= free as usize
};
if !enough_space {
// add to overflow cell
page.overflow_cells.push(OverflowCell {
index: cell_idx,
payload: Pin::new(Vec::from(payload)),
});
return Ok(());
}
assert!(
cell_idx <= page.cell_count(),
"cell_idx > page.cell_count() without overflow cells"
);
let new_cell_data_pointer = allocate_cell_space(page, payload.len() as u16, usable_space)?;
tracing::debug!(
"insert_into_cell(idx={}, pc={}, size={})",
cell_idx,
new_cell_data_pointer,
payload.len()
);
assert!(new_cell_data_pointer + payload.len() as u16 <= usable_space);
let buf = page.as_ptr();
// copy data
buf[new_cell_data_pointer as usize..new_cell_data_pointer as usize + payload.len()]
.copy_from_slice(payload);
// memmove(pIns+2, pIns, 2*(pPage->nCell - i));
let (cell_pointer_array_start, _) = page.cell_pointer_array_offset_and_size();
let cell_pointer_cur_idx = cell_pointer_array_start + (CELL_PTR_SIZE_BYTES * cell_idx);
// move existing pointers forward by CELL_PTR_SIZE_BYTES...
let n_cells_forward = page.cell_count() - cell_idx;
let n_bytes_forward = CELL_PTR_SIZE_BYTES * n_cells_forward;
if n_bytes_forward > 0 {
buf.copy_within(
cell_pointer_cur_idx..cell_pointer_cur_idx + n_bytes_forward,
cell_pointer_cur_idx + CELL_PTR_SIZE_BYTES,
);
}
// ...and insert new cell pointer at the current index
page.write_u16_no_offset(cell_pointer_cur_idx, new_cell_data_pointer);
// update cell count
let new_n_cells = (page.cell_count() + 1) as u16;
page.write_u16(offset::BTREE_CELL_COUNT, new_n_cells);
debug_validate_cells!(page, usable_space);
Ok(())
}
fn insert_into_cell(
page: &mut PageContent,
payload: &[u8],
cell_idx: usize,
usable_space: u16,
) -> Result<()> {
_insert_into_cell(page, payload, cell_idx, usable_space, false)
}
/// Normally in [insert_into_cell()], if a page already has overflow cells, all
/// new insertions are also added to the overflow cells vector.
/// SQLite doesn't use regular [insert_into_cell()] during balancing,
/// so we have a specialized function for use during balancing that allows regular cell insertion
/// despite the presence of existing overflow cells (overflow cells are one of the reasons we are balancing in the first place).
/// During balancing cells are first repositioned with [edit_page()]
/// and then inserted via [page_insert_array()] which calls [insert_into_cell_during_balance()],
/// and finally the existing overflow cells are cleared.
/// If we did not allow the cell insert to proceed normally despite overflow cells being present,
/// the new insertions would also be added as overflow cells, which would defeat the point of balancing.
fn insert_into_cell_during_balance(
page: &mut PageContent,
payload: &[u8],
cell_idx: usize,
usable_space: u16,
) -> Result<()> {
_insert_into_cell(page, payload, cell_idx, usable_space, true)
}
/// The amount of free space is the sum of:
/// #1. The size of the unallocated region
/// #2. Fragments (isolated 1-3 byte chunks of free space within the cell content area)
/// #3. freeblocks (linked list of blocks of at least 4 bytes within the cell content area that
/// are not in use due to e.g. deletions)
/// The freeblock list can be empty, in which case the "real free space" available for allocation
/// is simply the unallocated region between the end of the cell pointer array and the start of
/// the cell content area.
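///
/// For example, on a leaf page (8-byte header at offset 0) with 10 cells, a cell content area
/// starting at byte 3000, 2 fragmented bytes and a single 100-byte freeblock, the free space
/// is (3000 + 2 + 100) - (8 + 2 * 10) = 3074 bytes.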
#[allow(unused_assignments)]
fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
// TODO(pere): maybe free space is not calculated correctly with offset
    // Usable space is not the same as free space; it simply means the space that is not
    // reserved for extensions by SQLite. Usually reserved_space is 0.
let usable_space = usable_space as usize;
let first_cell = page.offset + page.header_size() + (2 * page.cell_count());
let cell_content_area_start = page.cell_content_area() as usize;
let mut free_space_bytes = cell_content_area_start + page.num_frag_free_bytes() as usize;
// #3 is computed by iterating over the freeblocks linked list
let mut cur_freeblock_ptr = page.first_freeblock() as usize;
if cur_freeblock_ptr > 0 {
if cur_freeblock_ptr < cell_content_area_start {
// Freeblocks exist in the cell content area e.g. after deletions
// They should never exist in the unused area of the page.
todo!("corrupted page");
}
let mut next = 0;
let mut size = 0;
loop {
// TODO: check corruption icellast
next = page.read_u16_no_offset(cur_freeblock_ptr) as usize; // first 2 bytes in freeblock = next freeblock pointer
size = page.read_u16_no_offset(cur_freeblock_ptr + 2) as usize; // next 2 bytes in freeblock = size of current freeblock
free_space_bytes += size;
// Freeblocks are in order from left to right on the page,
            // so the next pointer should be > current pointer + its size, or 0 if no next block exists.
if next <= cur_freeblock_ptr + size + 3 {
break;
}
cur_freeblock_ptr = next;
}
// Next should always be 0 (NULL) at this point since we have reached the end of the freeblocks linked list
assert_eq!(
next, 0,
"corrupted page: freeblocks list not in ascending order"
);
assert!(
cur_freeblock_ptr + size <= usable_space,
"corrupted page: last freeblock extends last page end"
);
}
assert!(
free_space_bytes <= usable_space,
"corrupted page: free space is greater than usable space"
);
free_space_bytes as u16 - first_cell as u16
}
/// Allocate space for a cell on a page, returning the offset of the allocated region.
fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) -> Result<u16> {
let mut amount = amount as usize;
if amount < MINIMUM_CELL_SIZE {
amount = MINIMUM_CELL_SIZE;
}
let (cell_offset, _) = page_ref.cell_pointer_array_offset_and_size();
let gap = cell_offset + 2 * page_ref.cell_count();
let mut top = page_ref.cell_content_area() as usize;
// there are free blocks and enough space
if page_ref.first_freeblock() != 0 && gap + 2 <= top {
// find slot
let pc = find_free_cell(page_ref, usable_space, amount)?;
if pc != 0 {
return Ok(pc as u16);
}
/* fall through, we might need to defragment */
}
if gap + 2 + amount > top {
// defragment
defragment_page(page_ref, usable_space);
top = page_ref.read_u16(offset::BTREE_CELL_CONTENT_AREA) as usize;
}
top -= amount;
page_ref.write_u16(offset::BTREE_CELL_CONTENT_AREA, top as u16);
assert!(top + amount <= usable_space as usize);
Ok(top as u16)
}
/// Fill in the cell payload with the record.
/// If the record is too large to fit in the cell, it will spill onto overflow pages.
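///
/// The local portion of a spilling cell is laid out as
/// [header varints][`space_left` payload bytes][4-byte pointer to the first overflow page],
/// and each overflow page holds a 4-byte next-page pointer followed by payload bytes.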
fn fill_cell_payload(
page_contents: &PageContent,
int_key: Option<i64>,
cell_payload: &mut Vec<u8>,
cell_idx: usize,
record: &ImmutableRecord,
usable_space: usize,
pager: Rc<Pager>,
) {
// TODO: make record raw from start, having to serialize is not good
let record_buf = record.get_payload().to_vec();
let page_type = page_contents.page_type();
// fill in header
if matches!(page_type, PageType::IndexInterior) {
// if a write happened on an index interior page, it is always an overwrite.
// we must copy the left child pointer of the replaced cell to the new cell.
let left_child_page = page_contents.cell_interior_read_left_child_page(cell_idx);
cell_payload.extend_from_slice(&left_child_page.to_be_bytes());
}
if matches!(page_type, PageType::TableLeaf) {
let int_key = int_key.unwrap();
write_varint_to_vec(record_buf.len() as u64, cell_payload);
write_varint_to_vec(int_key as u64, cell_payload);
} else {
write_varint_to_vec(record_buf.len() as u64, cell_payload);
}
let payload_overflow_threshold_max = payload_overflow_threshold_max(page_type, usable_space);
tracing::debug!(
"fill_cell_payload(record_size={}, payload_overflow_threshold_max={})",
record_buf.len(),
payload_overflow_threshold_max
);
if record_buf.len() <= payload_overflow_threshold_max {
// enough allowed space to fit inside a btree page
cell_payload.extend_from_slice(record_buf.as_slice());
return;
}
let payload_overflow_threshold_min = payload_overflow_threshold_min(page_type, usable_space);
// see e.g. https://github.com/sqlite/sqlite/blob/9591d3fe93936533c8c3b0dc4d025ac999539e11/src/dbstat.c#L371
let mut space_left = payload_overflow_threshold_min
+ (record_buf.len() - payload_overflow_threshold_min) % (usable_space - 4);
if space_left > payload_overflow_threshold_max {
space_left = payload_overflow_threshold_min;
}
    // cell_size must equal the initial value of space_left, as those are the bytes copied to the non-overflow page.
let cell_size = space_left + cell_payload.len() + 4; // 4 is the number of bytes of pointer to first overflow page
let mut to_copy_buffer = record_buf.as_slice();
let prev_size = cell_payload.len();
cell_payload.resize(prev_size + space_left + 4, 0);
let mut pointer = unsafe { cell_payload.as_mut_ptr().add(prev_size) };
let mut pointer_to_next = unsafe { cell_payload.as_mut_ptr().add(prev_size + space_left) };
loop {
let to_copy = space_left.min(to_copy_buffer.len());
unsafe { std::ptr::copy(to_copy_buffer.as_ptr(), pointer, to_copy) };
let left = to_copy_buffer.len() - to_copy;
if left == 0 {
break;
}
// we still have bytes to add, we will need to allocate new overflow page
        // FIXME: handle the case where the page cache is full
let overflow_page = pager.allocate_overflow_page();
{
let id = overflow_page.get().id as u32;
let contents = overflow_page.get().contents.as_mut().unwrap();
// TODO: take into account offset here?
let buf = contents.as_ptr();
let as_bytes = id.to_be_bytes();
// update pointer to new overflow page
unsafe { std::ptr::copy(as_bytes.as_ptr(), pointer_to_next, 4) };
pointer = unsafe { buf.as_mut_ptr().add(4) };
pointer_to_next = buf.as_mut_ptr();
space_left = usable_space - 4;
}
to_copy_buffer = &to_copy_buffer[to_copy..];
}
assert_eq!(cell_size, cell_payload.len());
}
/// Returns the maximum payload size (X) that can be stored directly on a b-tree page without spilling to overflow pages.
///
/// For table leaf pages: X = usable_size - 35
/// For index pages: X = ((usable_size - 12) * 64/255) - 23
///
/// The usable size is the total page size less the reserved space at the end of each page.
/// These thresholds are designed to:
/// - Give a minimum fanout of 4 for index b-trees
/// - Ensure enough payload is on the b-tree page that the record header can usually be accessed
/// without consulting an overflow page
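///
/// For example, with a usable size of 4096 bytes, a table leaf keeps up to 4096 - 35 = 4061
/// payload bytes locally, while an index page keeps up to ((4096 - 12) * 64 / 255) - 23 = 1002
/// bytes (integer arithmetic).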
pub fn payload_overflow_threshold_max(page_type: PageType, usable_space: usize) -> usize {
match page_type {
PageType::IndexInterior | PageType::IndexLeaf => {
((usable_space - 12) * 64 / 255) - 23 // Index page formula
}
PageType::TableInterior | PageType::TableLeaf => {
usable_space - 35 // Table leaf page formula
}
}
}
/// Returns the minimum payload size (M) that must be stored on the b-tree page before spilling to overflow pages is allowed.
///
/// For all page types: M = ((usable_size - 12) * 32/255) - 23
///
/// When payload size P exceeds max_local():
/// - If K = M + ((P-M) % (usable_size-4)) <= max_local(): store K bytes on page
/// - Otherwise: store M bytes on page
///
/// The remaining bytes are stored on overflow pages in both cases.
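///
/// For example, with a usable size of 4096 bytes, M = ((4096 - 12) * 32 / 255) - 23 = 489.
/// A 5000-byte payload on a table leaf (max_local = 4061) then keeps
/// K = 489 + ((5000 - 489) % (4096 - 4)) = 908 bytes on the page and spills the remaining
/// 4092 bytes to overflow pages.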
pub fn payload_overflow_threshold_min(_page_type: PageType, usable_space: usize) -> usize {
// Same formula for all page types
((usable_space - 12) * 32 / 255) - 23
}
/// Drop a cell from a page.
/// This is done by freeing the range of bytes that the cell occupies.
fn drop_cell(page: &mut PageContent, cell_idx: usize, usable_space: u16) -> Result<()> {
let (cell_start, cell_len) = page.cell_get_raw_region(cell_idx, usable_space as usize);
free_cell_range(page, cell_start as u16, cell_len as u16, usable_space)?;
if page.cell_count() > 1 {
shift_pointers_left(page, cell_idx);
} else {
page.write_u16(offset::BTREE_CELL_CONTENT_AREA, usable_space);
page.write_u16(offset::BTREE_FIRST_FREEBLOCK, 0);
page.write_u8(offset::BTREE_FRAGMENTED_BYTES_COUNT, 0);
}
page.write_u16(offset::BTREE_CELL_COUNT, page.cell_count() as u16 - 1);
debug_validate_cells!(page, usable_space);
Ok(())
}
/// Shift pointers to the left once starting from a cell position
/// This is useful when we remove a cell and we want to move left the cells from the right to fill
/// the empty space that's not needed
fn shift_pointers_left(page: &mut PageContent, cell_idx: usize) {
assert!(page.cell_count() > 0);
let buf = page.as_ptr();
let (start, _) = page.cell_pointer_array_offset_and_size();
let start = start + (cell_idx * 2) + 2;
let right_cells = page.cell_count() - cell_idx - 1;
let amount_to_shift = right_cells * 2;
buf.copy_within(start..start + amount_to_shift, start - 2);
}
#[cfg(test)]
mod tests {
use rand::{thread_rng, Rng};
use rand_chacha::{
rand_core::{RngCore, SeedableRng},
ChaCha8Rng,
};
use sorted_vec::SortedVec;
use test_log::test;
use turso_sqlite3_parser::ast::SortOrder;
use super::*;
use crate::{
io::{Buffer, MemoryIO, OpenFlags, IO},
schema::IndexColumn,
storage::{
database::DatabaseFile,
page_cache::DumbLruPageCache,
pager::{AtomicDbState, DbState},
},
types::Text,
util::IOExt as _,
vdbe::Register,
BufferPool, Completion, Connection, StepResult, WalFile, WalFileShared,
};
use std::{
cell::RefCell,
collections::HashSet,
mem::transmute,
ops::Deref,
rc::Rc,
sync::{Arc, Mutex},
};
use tempfile::TempDir;
use crate::{
io::BufferData,
storage::{
btree::{compute_free_space, fill_cell_payload, payload_overflow_threshold_max},
sqlite3_ondisk::{BTreeCell, PageContent, PageType},
},
types::Value,
Database, Page, Pager, PlatformIO,
};
use super::{btree_init_page, defragment_page, drop_cell, insert_into_cell};
#[allow(clippy::arc_with_non_send_sync)]
fn get_page(id: usize) -> BTreePage {
let page = Arc::new(Page::new(id));
let drop_fn = Rc::new(|_| {});
let inner = PageContent::new(
0,
Arc::new(RefCell::new(Buffer::new(
BufferData::new(vec![0; 4096]),
drop_fn,
))),
);
page.get().contents.replace(inner);
let page = Arc::new(BTreePageInner {
page: RefCell::new(page),
});
btree_init_page(&page, PageType::TableLeaf, 0, 4096);
page
}
#[allow(clippy::arc_with_non_send_sync)]
fn get_database() -> Arc<Database> {
let mut path = TempDir::new().unwrap().keep();
path.push("test.db");
{
let connection = rusqlite::Connection::open(&path).unwrap();
connection
.pragma_update(None, "journal_mode", "wal")
.unwrap();
}
let io: Arc<dyn IO> = Arc::new(PlatformIO::new().unwrap());
let db = Database::open_file(io.clone(), path.to_str().unwrap(), false, false).unwrap();
db
}
fn ensure_cell(page: &mut PageContent, cell_idx: usize, payload: &Vec<u8>) {
let cell = page.cell_get_raw_region(cell_idx, 4096);
tracing::trace!("cell idx={} start={} len={}", cell_idx, cell.0, cell.1);
let buf = &page.as_ptr()[cell.0..cell.0 + cell.1];
assert_eq!(buf.len(), payload.len());
assert_eq!(buf, payload);
}
fn add_record(
id: usize,
pos: usize,
page: &mut PageContent,
record: ImmutableRecord,
conn: &Arc<Connection>,
) -> Vec<u8> {
let mut payload: Vec<u8> = Vec::new();
fill_cell_payload(
page,
Some(id as i64),
&mut payload,
pos,
&record,
4096,
conn.pager.borrow().clone(),
);
insert_into_cell(page, &payload, pos, 4096).unwrap();
payload
}
#[test]
fn test_insert_cell() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let header_size = 8;
let regs = &[Register::Value(Value::Integer(1))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let payload = add_record(1, 0, page, record, &conn);
assert_eq!(page.cell_count(), 1);
let free = compute_free_space(page, 4096);
assert_eq!(free, 4096 - payload.len() as u16 - 2 - header_size);
let cell_idx = 0;
ensure_cell(page, cell_idx, &payload);
}
struct Cell {
pos: usize,
payload: Vec<u8>,
}
#[test]
fn test_drop_1() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let header_size = 8;
let mut total_size = 0;
let mut cells = Vec::new();
let usable_space = 4096;
for i in 0..3 {
let regs = &[Register::Value(Value::Integer(i as i64))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let payload = add_record(i, i, page, record, &conn);
assert_eq!(page.cell_count(), i + 1);
let free = compute_free_space(page, usable_space);
total_size += payload.len() as u16 + 2;
assert_eq!(free, 4096 - total_size - header_size);
cells.push(Cell { pos: i, payload });
}
for (i, cell) in cells.iter().enumerate() {
ensure_cell(page, i, &cell.payload);
}
cells.remove(1);
drop_cell(page, 1, usable_space).unwrap();
for (i, cell) in cells.iter().enumerate() {
ensure_cell(page, i, &cell.payload);
}
}
fn validate_btree(pager: Rc<Pager>, page_idx: usize) -> (usize, bool) {
let num_columns = 5;
let cursor = BTreeCursor::new_table(None, pager.clone(), page_idx, num_columns);
let page = cursor.read_page(page_idx).unwrap();
while page.get().is_locked() {
pager.io.run_once().unwrap();
}
let page = page.get();
        // Pin the page so it is not dropped while loading child pages.
page.set_dirty();
let contents = page.get().contents.as_ref().unwrap();
let mut previous_key = None;
let mut valid = true;
let mut depth = None;
debug_validate_cells!(contents, pager.usable_space() as u16);
let mut child_pages = Vec::new();
for cell_idx in 0..contents.cell_count() {
let cell = contents.cell_get(cell_idx, cursor.usable_space()).unwrap();
let current_depth = match cell {
BTreeCell::TableLeafCell(..) => 1,
BTreeCell::TableInteriorCell(TableInteriorCell {
left_child_page, ..
}) => {
let child_page = cursor.read_page(left_child_page as usize).unwrap();
while child_page.get().is_locked() {
pager.io.run_once().unwrap();
}
child_pages.push(child_page);
if left_child_page == page.get().id as u32 {
valid = false;
tracing::error!(
"left child page is the same as parent {}",
left_child_page
);
continue;
}
let (child_depth, child_valid) =
validate_btree(pager.clone(), left_child_page as usize);
valid &= child_valid;
child_depth
}
_ => panic!("unsupported btree cell: {cell:?}"),
};
if current_depth >= 100 {
tracing::error!("depth is too big");
page.clear_dirty();
return (100, false);
}
depth = Some(depth.unwrap_or(current_depth + 1));
if depth != Some(current_depth + 1) {
tracing::error!("depth is different for child of page {}", page_idx);
valid = false;
}
match cell {
BTreeCell::TableInteriorCell(TableInteriorCell { rowid, .. })
| BTreeCell::TableLeafCell(TableLeafCell { rowid, .. }) => {
if previous_key.is_some() && previous_key.unwrap() >= rowid {
tracing::error!(
"keys are in bad order: prev={:?}, current={}",
previous_key,
rowid
);
valid = false;
}
previous_key = Some(rowid);
}
_ => panic!("unsupported btree cell: {cell:?}"),
}
}
if let Some(right) = contents.rightmost_pointer() {
let (right_depth, right_valid) = validate_btree(pager.clone(), right as usize);
valid &= right_valid;
depth = Some(depth.unwrap_or(right_depth + 1));
if depth != Some(right_depth + 1) {
tracing::error!("depth is different for child of page {}", page_idx);
valid = false;
}
}
let first_page_type = child_pages.first().map(|p| {
if !p.get().is_loaded() {
let new_page = pager.read_page(p.get().get().id).unwrap();
p.page.replace(new_page);
}
while p.get().is_locked() {
pager.io.run_once().unwrap();
}
p.get().get_contents().page_type()
});
if let Some(child_type) = first_page_type {
for page in child_pages.iter().skip(1) {
if !page.get().is_loaded() {
let new_page = pager.read_page(page.get().get().id).unwrap();
page.page.replace(new_page);
}
while page.get().is_locked() {
pager.io.run_once().unwrap();
}
if page.get().get_contents().page_type() != child_type {
tracing::error!("child pages have different types");
valid = false;
}
}
}
if contents.rightmost_pointer().is_none() && contents.cell_count() == 0 {
valid = false;
}
page.clear_dirty();
(depth.unwrap(), valid)
}
fn format_btree(pager: Rc<Pager>, page_idx: usize, depth: usize) -> String {
let num_columns = 5;
let cursor = BTreeCursor::new_table(None, pager.clone(), page_idx, num_columns);
let page = cursor.read_page(page_idx).unwrap();
while page.get().is_locked() {
pager.io.run_once().unwrap();
}
let page = page.get();
        // Pin the page in order to not drop it in between loading of different pages. Otherwise, contents would be a dangling reference.
page.set_dirty();
let contents = page.get().contents.as_ref().unwrap();
let mut current = Vec::new();
let mut child = Vec::new();
for cell_idx in 0..contents.cell_count() {
let cell = contents.cell_get(cell_idx, cursor.usable_space()).unwrap();
match cell {
BTreeCell::TableInteriorCell(cell) => {
current.push(format!(
"node[rowid:{}, ptr(<=):{}]",
cell.rowid, cell.left_child_page
));
child.push(format_btree(
pager.clone(),
cell.left_child_page as usize,
depth + 2,
));
}
BTreeCell::TableLeafCell(cell) => {
current.push(format!(
"leaf[rowid:{}, len(payload):{}, overflow:{}]",
cell.rowid,
cell.payload.len(),
cell.first_overflow_page.is_some()
));
}
_ => panic!("unsupported btree cell: {cell:?}"),
}
}
if let Some(rightmost) = contents.rightmost_pointer() {
child.push(format_btree(pager.clone(), rightmost as usize, depth + 2));
}
let current = format!(
"{}-page:{}, ptr(right):{}\n{}+cells:{}",
" ".repeat(depth),
page_idx,
contents.rightmost_pointer().unwrap_or(0),
" ".repeat(depth),
current.join(", ")
);
page.clear_dirty();
if child.is_empty() {
current
} else {
current + "\n" + &child.join("\n")
}
}
fn empty_btree() -> (Rc<Pager>, usize, Arc<Database>, Arc<Connection>) {
#[allow(clippy::arc_with_non_send_sync)]
let io: Arc<dyn IO> = Arc::new(MemoryIO::new());
let db = Database::open_file(io.clone(), "test.db", false, false).unwrap();
let conn = db.connect().unwrap();
let pager = conn.pager.borrow().clone();
        // FIXME: handle the case where the page cache is full
let _ = run_until_done(|| pager.allocate_page1(), &pager);
let page2 = pager.allocate_page().unwrap();
let page2 = Arc::new(BTreePageInner {
page: RefCell::new(page2),
});
btree_init_page(&page2, PageType::TableLeaf, 0, 4096);
(pager, page2.get().get().id, db, conn)
}
#[test]
#[ignore]
pub fn btree_insert_fuzz_ex() {
for sequence in [
&[
(777548915, 3364),
(639157228, 3796),
(709175417, 1214),
(390824637, 210),
(906124785, 1481),
(197677875, 1305),
(457946262, 3734),
(956825466, 592),
(835875722, 1334),
(649214013, 1250),
(531143011, 1788),
(765057993, 2351),
(510007766, 1349),
(884516059, 822),
(81604840, 2545),
]
.as_slice(),
&[
(293471650, 2452),
(163608869, 627),
(544576229, 464),
(705823748, 3441),
]
.as_slice(),
&[
(987283511, 2924),
(261851260, 1766),
(343847101, 1657),
(315844794, 572),
]
.as_slice(),
&[
(987283511, 2924),
(261851260, 1766),
(343847101, 1657),
(315844794, 572),
(649272840, 1632),
(723398505, 3140),
(334416967, 3874),
]
.as_slice(),
] {
let (pager, root_page, _, _) = empty_btree();
let num_columns = 5;
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
for (key, size) in sequence.iter() {
run_until_done(
|| {
let key = SeekKey::TableRowId(*key);
cursor.seek(key, SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
let regs = &[Register::Value(Value::Blob(vec![0; *size]))];
let value = ImmutableRecord::from_registers(regs, regs.len());
tracing::info!("insert key:{}", key);
run_until_done(
|| cursor.insert(&BTreeKey::new_table_rowid(*key, Some(&value)), true),
pager.deref(),
)
.unwrap();
tracing::info!(
"=========== btree ===========\n{}\n\n",
format_btree(pager.clone(), root_page, 0)
);
}
for (key, _) in sequence.iter() {
let seek_key = SeekKey::TableRowId(*key);
assert!(
matches!(
cursor.seek(seek_key, SeekOp::GE { eq_only: true }).unwrap(),
IOResult::Done(SeekResult::Found)
),
"key {key} is not found"
);
}
}
}
fn rng_from_time_or_env() -> (ChaCha8Rng, u64) {
let seed = std::env::var("SEED").map_or(
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_millis(),
|v| {
v.parse()
.expect("Failed to parse SEED environment variable as u64")
},
);
let rng = ChaCha8Rng::seed_from_u64(seed as u64);
(rng, seed as u64)
}
fn btree_insert_fuzz_run(
attempts: usize,
inserts: usize,
size: impl Fn(&mut ChaCha8Rng) -> usize,
) {
const VALIDATE_INTERVAL: usize = 1000;
let do_validate_btree = std::env::var("VALIDATE_BTREE")
.is_ok_and(|v| v.parse().expect("validate should be bool"));
let (mut rng, seed) = rng_from_time_or_env();
let mut seen = HashSet::new();
tracing::info!("super seed: {}", seed);
let num_columns = 5;
for _ in 0..attempts {
let (pager, root_page, _db, conn) = empty_btree();
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
let mut keys = SortedVec::new();
tracing::info!("seed: {seed}");
for insert_id in 0..inserts {
let do_validate = do_validate_btree || (insert_id % VALIDATE_INTERVAL == 0);
run_until_done(|| pager.begin_read_tx(), &pager).unwrap();
run_until_done(|| pager.begin_write_tx(), &pager).unwrap();
let size = size(&mut rng);
let key = {
let result;
loop {
let key = (rng.next_u64() % (1 << 30)) as i64;
if seen.contains(&key) {
continue;
} else {
seen.insert(key);
}
result = key;
break;
}
result
};
keys.push(key);
tracing::info!(
"INSERT INTO t VALUES ({}, randomblob({})); -- {}",
key,
size,
insert_id
);
run_until_done(
|| {
let key = SeekKey::TableRowId(key);
cursor.seek(key, SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
let regs = &[Register::Value(Value::Blob(vec![0; size]))];
let value = ImmutableRecord::from_registers(regs, regs.len());
let btree_before = if do_validate {
format_btree(pager.clone(), root_page, 0)
} else {
"".to_string()
};
run_until_done(
|| cursor.insert(&BTreeKey::new_table_rowid(key, Some(&value)), true),
pager.deref(),
)
.unwrap();
loop {
match pager.end_tx(false, false, &conn, false).unwrap() {
IOResult::Done(_) => break,
IOResult::IO => {
pager.io.run_once().unwrap();
}
}
}
run_until_done(|| pager.begin_read_tx(), &pager).unwrap();
// `keys` is a SortedVec, so an in-order scan of the btree should visit them in exactly this order.
cursor.move_to_root().unwrap();
let mut valid = true;
if do_validate {
cursor.move_to_root().unwrap();
for key in keys.iter() {
tracing::trace!("seeking key: {}", key);
run_until_done(|| cursor.next(), pager.deref()).unwrap();
let cursor_rowid = run_until_done(|| cursor.rowid(), pager.deref())
.unwrap()
.unwrap();
if *key != cursor_rowid {
valid = false;
println!("key {key} is not found, got {cursor_rowid}");
break;
}
}
}
// Also validate the btree structure so we can tell where it went wrong.
if do_validate
&& (!valid || matches!(validate_btree(pager.clone(), root_page), (_, false)))
{
let btree_after = format_btree(pager.clone(), root_page, 0);
println!("btree before:\n{btree_before}");
println!("btree after:\n{btree_after}");
panic!("invalid btree");
}
pager.end_read_tx().unwrap();
}
run_until_done(|| pager.begin_read_tx(), &pager).unwrap();
tracing::info!(
"=========== btree ===========\n{}\n\n",
format_btree(pager.clone(), root_page, 0)
);
if matches!(validate_btree(pager.clone(), root_page), (_, false)) {
panic!("invalid btree");
}
cursor.move_to_root().unwrap();
for key in keys.iter() {
tracing::trace!("seeking key: {}", key);
run_until_done(|| cursor.next(), pager.deref()).unwrap();
let cursor_rowid = run_until_done(|| cursor.rowid(), pager.deref())
.unwrap()
.unwrap();
assert_eq!(
*key, cursor_rowid,
"key {key} is not found, got {cursor_rowid}"
);
}
pager.end_read_tx().unwrap();
}
}
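/// Index-btree variant of the insert fuzzer: each attempt inserts `inserts`
/// unique 10-column integer keys, then verifies that every key is reachable
/// via an exact-match seek, that the entry count matches, and that iterating
/// the btree yields the keys in ascending order.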
fn btree_index_insert_fuzz_run(attempts: usize, inserts: usize) {
use crate::storage::pager::CreateBTreeFlags;
// rng_from_time_or_env already honors the SEED environment variable.
let (mut rng, seed) = rng_from_time_or_env();
let mut seen = HashSet::new();
tracing::info!("super seed: {}", seed);
for _ in 0..attempts {
let (pager, _, _db, conn) = empty_btree();
let index_root_page_result =
pager.btree_create(&CreateBTreeFlags::new_index()).unwrap();
let index_root_page = match index_root_page_result {
crate::types::IOResult::Done(id) => id as usize,
crate::types::IOResult::IO => {
panic!("btree_create returned IO in test, unexpected")
}
};
let index_def = Index {
name: "testindex".to_string(),
columns: (0..10)
.map(|i| IndexColumn {
name: format!("test{i}"),
order: SortOrder::Asc,
collation: None,
pos_in_table: i,
default: None,
})
.collect(),
table_name: "test".to_string(),
root_page: index_root_page,
unique: false,
ephemeral: false,
has_rowid: false,
};
let num_columns = index_def.columns.len();
let mut cursor = BTreeCursor::new_index(
None,
pager.clone(),
index_root_page,
&index_def,
num_columns,
);
let mut keys = SortedVec::new();
tracing::info!("seed: {seed}");
for i in 0..inserts {
pager.begin_read_tx().unwrap();
pager.begin_write_tx().unwrap();
let key = loop {
let cols = (0..num_columns)
.map(|_| (rng.next_u64() % (1 << 30)) as i64)
.collect::<Vec<_>>();
// HashSet::insert returns false when this column tuple was already generated.
if seen.insert(cols.clone()) {
break cols;
}
};
tracing::info!("insert {}/{}: {:?}", i + 1, inserts, key);
keys.push(key.clone());
let regs = key
.iter()
.map(|col| Register::Value(Value::Integer(*col)))
.collect::<Vec<_>>();
let value = ImmutableRecord::from_registers(&regs, regs.len());
run_until_done(
|| {
let record = ImmutableRecord::from_registers(&regs, regs.len());
let key = SeekKey::IndexKey(&record);
cursor.seek(key, SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
run_until_done(
|| {
cursor.insert(
&BTreeKey::new_index_key(&value),
cursor.is_write_in_progress(),
)
},
pager.deref(),
)
.unwrap();
cursor.move_to_root().unwrap();
loop {
match pager.end_tx(false, false, &conn, false).unwrap() {
IOResult::Done(_) => break,
IOResult::IO => {
pager.io.run_once().unwrap();
}
}
}
}
// Check that all keys can be found by seeking
pager.begin_read_tx().unwrap();
cursor.move_to_root().unwrap();
for (i, key) in keys.iter().enumerate() {
tracing::info!("seeking key {}/{}: {:?}", i + 1, keys.len(), key);
let exists = run_until_done(
|| {
let regs = key
.iter()
.map(|col| Register::Value(Value::Integer(*col)))
.collect::<Vec<_>>();
cursor.seek(
SeekKey::IndexKey(&ImmutableRecord::from_registers(&regs, regs.len())),
SeekOp::GE { eq_only: true },
)
},
pager.deref(),
)
.unwrap();
let mut found = matches!(exists, SeekResult::Found);
if matches!(exists, SeekResult::TryAdvance) {
found = run_until_done(|| cursor.next(), pager.deref()).unwrap();
}
assert!(found, "key {key:?} is not found");
}
// Check that key count is right
cursor.move_to_root().unwrap();
let mut count = 0;
while run_until_done(|| cursor.next(), pager.deref()).unwrap() {
count += 1;
}
assert_eq!(
count,
keys.len(),
"key count is not right, got {}, expected {}",
count,
keys.len()
);
// Check that all keys can be found in-order, by iterating the btree
cursor.move_to_root().unwrap();
let mut prev = None;
for (i, key) in keys.iter().enumerate() {
tracing::info!("iterating key {}/{}: {:?}", i + 1, keys.len(), key);
run_until_done(|| cursor.next(), pager.deref()).unwrap();
let record = run_until_done(|| cursor.record(), &pager).unwrap();
let record = record.as_ref().unwrap();
let cur = record.get_values().clone();
if let Some(prev) = prev {
if prev >= cur {
println!("Seed: {seed}");
}
assert!(
prev < cur,
"keys are not in ascending order: expected {prev:?} < {cur:?}",
);
}
prev = Some(cur);
}
pager.end_read_tx().unwrap();
}
}
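/// Fuzzes a single-column blob index with a random mix of inserts and
/// deletes; `insert_chance` is the probability (0.0..=1.0) that any given
/// operation is an insert. The keys expected to survive are validated at the
/// end via `validate_expected_keys`.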
fn btree_index_insert_delete_fuzz_run(
attempts: usize,
operations: usize,
size: impl Fn(&mut ChaCha8Rng) -> usize,
insert_chance: f64,
) {
use crate::storage::pager::CreateBTreeFlags;
// rng_from_time_or_env already honors the SEED environment variable.
let (mut rng, seed) = rng_from_time_or_env();
let mut seen = HashSet::new();
tracing::info!("super seed: {}", seed);
for _ in 0..attempts {
let (pager, _, _db, conn) = empty_btree();
let index_root_page_result =
pager.btree_create(&CreateBTreeFlags::new_index()).unwrap();
let index_root_page = match index_root_page_result {
crate::types::IOResult::Done(id) => id as usize,
crate::types::IOResult::IO => {
panic!("btree_create returned IO in test, unexpected")
}
};
let index_def = Index {
name: "testindex".to_string(),
columns: vec![IndexColumn {
name: "testcol".to_string(),
order: SortOrder::Asc,
collation: None,
pos_in_table: 0,
default: None,
}],
table_name: "test".to_string(),
root_page: index_root_page,
unique: false,
ephemeral: false,
has_rowid: false,
};
let mut cursor =
BTreeCursor::new_index(None, pager.clone(), index_root_page, &index_def, 1);
// Track expected keys that should be present in the tree
let mut expected_keys = Vec::new();
tracing::info!("seed: {seed}");
for i in 0..operations {
let print_progress = i % 100 == 0;
pager.begin_read_tx().unwrap();
pager.begin_write_tx().unwrap();
// Decide whether to insert or delete, based on insert_chance.
let is_insert = rng.next_u64() % 100 < (insert_chance * 100.0) as u64;
if is_insert {
// Generate a unique key for insertion
let key = loop {
let sizeof_blob = size(&mut rng);
let blob = (0..sizeof_blob)
.map(|_| (rng.next_u64() % 256) as u8)
.collect::<Vec<_>>();
// HashSet::insert returns false when this blob was already generated.
if seen.insert(blob.clone()) {
break blob;
}
};
if print_progress {
tracing::info!("insert {}/{}, seed: {seed}", i + 1, operations);
}
expected_keys.push(key.clone());
let regs = vec![Register::Value(Value::Blob(key))];
let value = ImmutableRecord::from_registers(&regs, regs.len());
let seek_result = run_until_done(
|| {
let record = ImmutableRecord::from_registers(&regs, regs.len());
let key = SeekKey::IndexKey(&record);
cursor.seek(key, SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
if let SeekResult::TryAdvance = seek_result {
run_until_done(|| cursor.next(), pager.deref()).unwrap();
}
run_until_done(
|| {
cursor.insert(
&BTreeKey::new_index_key(&value),
cursor.is_write_in_progress(),
)
},
pager.deref(),
)
.unwrap();
} else {
// Delete a random existing key
if !expected_keys.is_empty() {
let delete_idx = rng.next_u64() as usize % expected_keys.len();
let key_to_delete = expected_keys[delete_idx].clone();
if print_progress {
tracing::info!("delete {}/{}, seed: {seed}", i + 1, operations);
}
let regs = vec![Register::Value(Value::Blob(key_to_delete.clone()))];
let record = ImmutableRecord::from_registers(&regs, regs.len());
// Seek to the key to delete
let seek_result = run_until_done(
|| {
cursor
.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
let mut found = matches!(seek_result, SeekResult::Found);
if matches!(seek_result, SeekResult::TryAdvance) {
found = run_until_done(|| cursor.next(), pager.deref()).unwrap();
}
assert!(found, "expected key {key_to_delete:?} is not found");
// Delete the key
run_until_done(|| cursor.delete(), pager.deref()).unwrap();
// Remove from expected keys
expected_keys.remove(delete_idx);
}
}
cursor.move_to_root().unwrap();
loop {
match pager.end_tx(false, false, &conn, false).unwrap() {
IOResult::Done(_) => break,
IOResult::IO => {
pager.io.run_once().unwrap();
}
}
}
}
// Final validation
let mut sorted_keys = expected_keys.clone();
sorted_keys.sort();
validate_expected_keys(&pager, &mut cursor, &sorted_keys, seed);
pager.end_read_tx().unwrap();
}
}
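/// Validates the index btree against `expected_keys`, which must be sorted
/// ascending: every key must be reachable via an exact-match seek, the entry
/// count must equal `expected_keys.len()`, and an in-order scan must return
/// the keys in exactly this order.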
fn validate_expected_keys(
pager: &Rc<Pager>,
cursor: &mut BTreeCursor,
expected_keys: &[Vec<u8>],
seed: u64,
) {
// Check that all expected keys can be found by seeking
pager.begin_read_tx().unwrap();
cursor.move_to_root().unwrap();
for (i, key) in expected_keys.iter().enumerate() {
tracing::info!(
"validating key {}/{}, seed: {seed}",
i + 1,
expected_keys.len()
);
let exists = run_until_done(
|| {
let regs = vec![Register::Value(Value::Blob(key.clone()))];
cursor.seek(
SeekKey::IndexKey(&ImmutableRecord::from_registers(&regs, regs.len())),
SeekOp::GE { eq_only: true },
)
},
pager.deref(),
)
.unwrap();
let mut found = matches!(exists, SeekResult::Found);
if matches!(exists, SeekResult::TryAdvance) {
found = run_until_done(|| cursor.next(), pager.deref()).unwrap();
}
assert!(found, "expected key {key:?} is not found");
}
// Check key count
cursor.move_to_root().unwrap();
run_until_done(|| cursor.rewind(), pager.deref()).unwrap();
if !cursor.has_record.get() {
panic!("no keys in tree");
}
let mut count = 1;
loop {
run_until_done(|| cursor.next(), pager.deref()).unwrap();
if !cursor.has_record.get() {
break;
}
count += 1;
}
assert_eq!(
count,
expected_keys.len(),
"key count is not right, got {}, expected {}, seed: {seed}",
count,
expected_keys.len()
);
// Check that all keys can be found in-order, by iterating the btree
cursor.move_to_root().unwrap();
for (i, key) in expected_keys.iter().enumerate() {
run_until_done(|| cursor.next(), pager.deref()).unwrap();
tracing::info!(
"iterating key {}/{}, cursor stack cur idx: {:?}, cursor stack depth: {:?}, seed: {seed}",
i + 1,
expected_keys.len(),
cursor.stack.current_cell_index(),
cursor.stack.current()
);
let record = run_until_done(|| cursor.record(), pager).unwrap();
let record = record.as_ref().unwrap();
let cur = record.get_values().clone();
let cur = cur.first().unwrap();
let RefValue::Blob(ref cur) = cur else {
panic!("expected blob, got {cur:?}");
};
assert_eq!(
cur.to_slice(),
key,
"key {key:?} is not found, seed: {seed}"
);
}
pager.end_read_tx().unwrap();
}
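// Free-space invariant checked by the page-level tests below:
//
//   free == usable_space - page_header(8) - sum(len(cell payload) + 2-byte cell pointer)
//
// Dropping a cell returns both its payload bytes and its pointer-array slot
// to the free pool, so the invariant must keep holding after every drop.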
#[test]
pub fn test_drop_odd() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let header_size = 8;
let mut total_size = 0;
let mut cells = Vec::new();
let usable_space = 4096;
let total_cells = 10;
for i in 0..total_cells {
let regs = &[Register::Value(Value::Integer(i as i64))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let payload = add_record(i, i, page, record, &conn);
assert_eq!(page.cell_count(), i + 1);
let free = compute_free_space(page, usable_space);
total_size += payload.len() as u16 + 2;
assert_eq!(free, 4096 - total_size - header_size);
cells.push(Cell { pos: i, payload });
}
let mut removed = 0;
let mut new_cells = Vec::new();
for cell in cells {
if cell.pos % 2 == 1 {
drop_cell(page, cell.pos - removed, usable_space).unwrap();
removed += 1;
} else {
new_cells.push(cell);
}
}
let cells = new_cells;
for (i, cell) in cells.iter().enumerate() {
ensure_cell(page, i, &cell.payload);
}
for (i, cell) in cells.iter().enumerate() {
ensure_cell(page, i, &cell.payload);
}
}
#[test]
pub fn btree_insert_fuzz_run_equal_size() {
for size in 1..8 {
tracing::info!("======= size:{} =======", size);
btree_insert_fuzz_run(2, 1024, |_| size);
}
}
#[test]
pub fn btree_index_insert_fuzz_run_equal_size() {
btree_index_insert_fuzz_run(2, 1024);
}
#[test]
pub fn btree_index_insert_delete_fuzz_run_test() {
btree_index_insert_delete_fuzz_run(
2,
2000,
|rng| {
let min: u32 = 4;
let size = min + rng.next_u32() % (1024 - min);
size as usize
},
0.65,
);
}
#[test]
pub fn btree_insert_fuzz_run_random() {
btree_insert_fuzz_run(128, 16, |rng| (rng.next_u32() % 4096) as usize);
}
#[test]
pub fn btree_insert_fuzz_run_small() {
btree_insert_fuzz_run(1, 100, |rng| (rng.next_u32() % 128) as usize);
}
#[test]
pub fn btree_insert_fuzz_run_big() {
btree_insert_fuzz_run(64, 32, |rng| 3 * 1024 + (rng.next_u32() % 1024) as usize);
}
#[test]
pub fn btree_insert_fuzz_run_overflow() {
btree_insert_fuzz_run(64, 32, |rng| ((rng.next_u32() % 32) * 1024) as usize);
}
#[test]
#[ignore]
pub fn fuzz_long_btree_insert_fuzz_run_equal_size() {
for size in 1..8 {
tracing::info!("======= size:{} =======", size);
btree_insert_fuzz_run(2, 10_000, |_| size);
}
}
#[test]
#[ignore]
pub fn fuzz_long_btree_index_insert_fuzz_run_equal_size() {
btree_index_insert_fuzz_run(2, 10_000);
}
#[test]
#[ignore]
pub fn fuzz_long_btree_index_insert_delete_fuzz_run() {
btree_index_insert_delete_fuzz_run(
2,
10000,
|rng| {
let min: u32 = 4;
let size = min + rng.next_u32() % (1024 - min);
size as usize
},
0.65,
);
}
#[test]
#[ignore]
pub fn fuzz_long_btree_insert_fuzz_run_random() {
btree_insert_fuzz_run(2, 10_000, |rng| (rng.next_u32() % 4096) as usize);
}
#[test]
#[ignore]
pub fn fuzz_long_btree_insert_fuzz_run_small() {
btree_insert_fuzz_run(2, 10_000, |rng| (rng.next_u32() % 128) as usize);
}
#[test]
#[ignore]
pub fn fuzz_long_btree_insert_fuzz_run_big() {
btree_insert_fuzz_run(2, 10_000, |rng| 3 * 1024 + (rng.next_u32() % 1024) as usize);
}
#[test]
#[ignore]
pub fn fuzz_long_btree_insert_fuzz_run_overflow() {
btree_insert_fuzz_run(2, 5_000, |rng| ((rng.next_u32() % 32) * 1024) as usize);
}
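/// Builds an in-memory test pager: 512-byte pages, a 10-buffer pool, a small
/// LRU page cache, and MemoryIO-backed database and WAL files, with page 1
/// allocated plus `database_size - 1` additional pages.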
#[allow(clippy::arc_with_non_send_sync)]
fn setup_test_env(database_size: u32) -> Rc<Pager> {
let page_size = 512;
let buffer_pool = Arc::new(BufferPool::new(Some(page_size as usize)));
// Initialize buffer pool with correctly sized buffers
for _ in 0..10 {
let vec = vec![0; page_size as usize]; // Initialize with correct length, not just capacity
buffer_pool.put(Pin::new(vec));
}
let io: Arc<dyn IO> = Arc::new(MemoryIO::new());
let db_file = Arc::new(DatabaseFile::new(
io.open_file("test.db", OpenFlags::Create, false).unwrap(),
));
let wal_file = io.open_file("test.wal", OpenFlags::Create, false).unwrap();
let wal_shared = WalFileShared::new_shared(page_size, &io, wal_file).unwrap();
let wal = Rc::new(RefCell::new(WalFile::new(
io.clone(),
wal_shared,
buffer_pool.clone(),
)));
let pager = Rc::new(
Pager::new(
db_file,
wal,
io,
Arc::new(parking_lot::RwLock::new(DumbLruPageCache::new(10))),
buffer_pool,
Arc::new(AtomicDbState::new(DbState::Uninitialized)),
Arc::new(Mutex::new(())),
)
.unwrap(),
);
pager.io.run_once().unwrap();
let _ = run_until_done(|| pager.allocate_page1(), &pager);
for _ in 0..(database_size - 1) {
pager.allocate_page().unwrap();
}
header_accessor::set_page_size(&pager, page_size).unwrap();
pager
}
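// Overflow chain layout exercised below (pages 2 -> 3 -> 4):
//
//   +-----------------+------------------+
//   | next page (u32) | payload bytes... |
//   +-----------------+------------------+
//
// A next-page pointer of 0 terminates the chain.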
#[test]
pub fn test_clear_overflow_pages() -> Result<()> {
let pager = setup_test_env(5);
let num_columns = 5;
let mut cursor = BTreeCursor::new_table(None, pager.clone(), 1, num_columns);
let max_local = payload_overflow_threshold_max(PageType::TableLeaf, 4096);
let usable_size = cursor.usable_space();
// Create a large payload that will definitely trigger overflow
let large_payload = vec![b'A'; max_local + usable_size];
// Setup overflow pages (2, 3, 4) with linking
let mut current_page = 2u32;
while current_page <= 4 {
let drop_fn = Rc::new(|_buf| {});
#[allow(clippy::arc_with_non_send_sync)]
let buf = Arc::new(RefCell::new(Buffer::allocate(
header_accessor::get_page_size(&pager)? as usize,
drop_fn,
)));
let c = Completion::new_write(|_| {});
#[allow(clippy::arc_with_non_send_sync)]
pager
.db_file
.write_page(current_page as usize, buf.clone(), c)?;
pager.io.run_once()?;
let page = cursor.read_page(current_page as usize)?;
while page.get().is_locked() {
cursor.pager.io.run_once()?;
}
{
let page = page.get();
let contents = page.get_contents();
let next_page = if current_page < 4 {
current_page + 1
} else {
0
};
contents.write_u32(0, next_page); // Write pointer to next overflow page
let buf = contents.as_ptr();
buf[4..].fill(b'A');
}
current_page += 1;
}
pager.io.run_once()?;
// Create leaf cell pointing to start of overflow chain
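// The transmute below only extends the payload's lifetime to 'static so it
// can be stored in a BTreeCell for this test; `large_payload` outlives every
// use of `leaf_cell`.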
let leaf_cell = BTreeCell::TableLeafCell(TableLeafCell {
rowid: 1,
payload: unsafe { transmute::<&[u8], &'static [u8]>(large_payload.as_slice()) },
first_overflow_page: Some(2), // Point to first overflow page
payload_size: large_payload.len() as u64,
});
let initial_freelist_pages = header_accessor::get_freelist_pages(&pager)?;
// Clear overflow pages
let clear_result = cursor.clear_overflow_pages(&leaf_cell)?;
match clear_result {
IOResult::Done(_) => {
// Verify proper number of pages were added to freelist
assert_eq!(
header_accessor::get_freelist_pages(&pager)?,
initial_freelist_pages + 3,
"Expected 3 pages to be added to freelist"
);
// If this is first trunk page
let trunk_page_id = header_accessor::get_freelist_trunk_page(&pager)?;
if trunk_page_id > 0 {
// Verify trunk page structure
let trunk_page = cursor.read_page(trunk_page_id as usize)?;
if let Some(contents) = trunk_page.get().get().contents.as_ref() {
// Read number of leaf pages in trunk
let n_leaf = contents.read_u32(4);
assert!(n_leaf > 0, "Trunk page should have leaf entries");
for i in 0..n_leaf {
let leaf_page_id = contents.read_u32(8 + (i as usize * 4));
assert!(
(2..=4).contains(&leaf_page_id),
"Leaf page ID {leaf_page_id} should be in range 2-4"
);
}
}
}
}
IOResult::IO => {
cursor.pager.io.run_once()?;
}
}
Ok(())
}
#[test]
pub fn test_clear_overflow_pages_no_overflow() -> Result<()> {
let pager = setup_test_env(5);
let num_columns = 5;
let mut cursor = BTreeCursor::new_table(None, pager.clone(), 1, num_columns);
let small_payload = vec![b'A'; 10];
// Create leaf cell with no overflow pages
let leaf_cell = BTreeCell::TableLeafCell(TableLeafCell {
rowid: 1,
payload: unsafe { transmute::<&[u8], &'static [u8]>(small_payload.as_slice()) },
first_overflow_page: None,
payload_size: small_payload.len() as u64,
});
let initial_freelist_pages = header_accessor::get_freelist_pages(&pager)?;
// Try to clear non-existent overflow pages
let clear_result = cursor.clear_overflow_pages(&leaf_cell)?;
match clear_result {
IOResult::Done(_) => {
// Verify freelist was not modified
assert_eq!(
header_accessor::get_freelist_pages(&pager)?,
initial_freelist_pages,
"Freelist should not change when no overflow pages exist"
);
// Verify trunk page wasn't created
assert_eq!(
header_accessor::get_freelist_trunk_page(&pager)?,
0,
"No trunk page should be created when no overflow pages exist"
);
}
IOResult::IO => {
cursor.pager.io.run_once()?;
}
}
Ok(())
}
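// btree_destroy should walk the tree and move every page it owns onto the
// freelist. The test below hand-builds a three-page tree (an interior root
// pointing at two leaves) and asserts that exactly those three pages are freed.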
#[test]
fn test_btree_destroy() -> Result<()> {
let initial_size = 1;
let pager = setup_test_env(initial_size);
let num_columns = 5;
let mut cursor = BTreeCursor::new_table(None, pager.clone(), 2, num_columns);
// Initialize page 2 as a root page (interior)
let root_page = cursor.allocate_page(PageType::TableInterior, 0)?;
// Allocate two leaf pages
let page3 = cursor.allocate_page(PageType::TableLeaf, 0)?;
let page4 = cursor.allocate_page(PageType::TableLeaf, 0)?;
// Configure the root page to point to the two leaf pages
{
let root_page = root_page.get();
let contents = root_page.get().contents.as_mut().unwrap();
// Set rightmost pointer to page4
contents.write_u32(offset::BTREE_RIGHTMOST_PTR, page4.get().get().id as u32);
// Create a cell with pointer to page3
let cell_content = vec![
// First 4 bytes: left child pointer (page3)
(page3.get().get().id >> 24) as u8,
(page3.get().get().id >> 16) as u8,
(page3.get().get().id >> 8) as u8,
page3.get().get().id as u8,
// Next byte: rowid as varint (simple value 100)
100,
];
// Insert the cell
insert_into_cell(contents, &cell_content, 0, 512)?;
}
// Add a simple record to each leaf page
for page in [&page3, &page4] {
let page = page.get();
let contents = page.get().contents.as_mut().unwrap();
// Simple record with just a rowid and payload
let record_bytes = vec![
5, // Payload length (varint)
page.get().id as u8, // Rowid (varint)
b'h',
b'e',
b'l',
b'l',
b'o', // Payload
];
insert_into_cell(contents, &record_bytes, 0, 512)?;
}
// Verify structure before destruction
assert_eq!(
header_accessor::get_database_size(&pager)?,
4, // We should have pages 1-4
"Database should have 4 pages total"
);
// Track freelist state before destruction
let initial_free_pages = header_accessor::get_freelist_pages(&pager)?;
assert_eq!(initial_free_pages, 0, "should start with no free pages");
run_until_done(|| cursor.btree_destroy(), pager.deref())?;
let pages_freed = header_accessor::get_freelist_pages(&pager)? - initial_free_pages;
assert_eq!(pages_freed, 3, "should free 3 pages (root + 2 leaves)");
Ok(())
}
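// defragment_page compacts the cell content area: live cells are rewritten
// contiguously and freeblocks disappear, while cell order and payload bytes
// are preserved. The defragment tests assert the payloads are byte-identical
// before and after compaction.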
#[test]
pub fn test_defragment() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let header_size = 8;
let mut total_size = 0;
let mut cells = Vec::new();
let usable_space = 4096;
for i in 0..3 {
let regs = &[Register::Value(Value::Integer(i as i64))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let payload = add_record(i, i, page, record, &conn);
assert_eq!(page.cell_count(), i + 1);
let free = compute_free_space(page, usable_space);
total_size += payload.len() as u16 + 2;
assert_eq!(free, 4096 - total_size - header_size);
cells.push(Cell { pos: i, payload });
}
for (i, cell) in cells.iter().enumerate() {
ensure_cell(page, i, &cell.payload);
}
cells.remove(1);
drop_cell(page, 1, usable_space).unwrap();
for (i, cell) in cells.iter().enumerate() {
ensure_cell(page, i, &cell.payload);
}
defragment_page(page, usable_space);
for (i, cell) in cells.iter().enumerate() {
ensure_cell(page, i, &cell.payload);
}
}
#[test]
pub fn test_drop_odd_with_defragment() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let header_size = 8;
let mut total_size = 0;
let mut cells = Vec::new();
let usable_space = 4096;
let total_cells = 10;
for i in 0..total_cells {
let regs = &[Register::Value(Value::Integer(i as i64))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let payload = add_record(i, i, page, record, &conn);
assert_eq!(page.cell_count(), i + 1);
let free = compute_free_space(page, usable_space);
total_size += payload.len() as u16 + 2;
assert_eq!(free, 4096 - total_size - header_size);
cells.push(Cell { pos: i, payload });
}
let mut removed = 0;
let mut new_cells = Vec::new();
for cell in cells {
if cell.pos % 2 == 1 {
drop_cell(page, cell.pos - removed, usable_space).unwrap();
removed += 1;
} else {
new_cells.push(cell);
}
}
let cells = new_cells;
for (i, cell) in cells.iter().enumerate() {
ensure_cell(page, i, &cell.payload);
}
defragment_page(page, usable_space);
for (i, cell) in cells.iter().enumerate() {
ensure_cell(page, i, &cell.payload);
}
}
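// Randomized mix of page-level operations: 0 = insert at a random cell index,
// 1 = drop a random cell, 2 = defragment, 3 = verify all tracked cells.
// The free-space invariant is re-checked after every operation.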
#[test]
pub fn test_fuzz_drop_defragment_insert() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let header_size = 8;
let mut total_size = 0;
let mut cells = Vec::new();
let usable_space = 4096;
let mut i = 100000;
let seed = thread_rng().gen();
tracing::info!("seed {}", seed);
let mut rng = ChaCha8Rng::seed_from_u64(seed);
while i > 0 {
i -= 1;
match rng.next_u64() % 4 {
0 => {
// allow appends with extra place to insert
let cell_idx = rng.next_u64() as usize % (page.cell_count() + 1);
let free = compute_free_space(page, usable_space);
let regs = &[Register::Value(Value::Integer(i as i64))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let mut payload: Vec<u8> = Vec::new();
fill_cell_payload(
page,
Some(i as i64),
&mut payload,
cell_idx,
&record,
4096,
conn.pager.borrow().clone(),
);
if (free as usize) < payload.len() + 2 {
// do not insert cells that would overflow, since overflow cells require balancing
continue;
}
insert_into_cell(page, &payload, cell_idx, 4096).unwrap();
assert!(page.overflow_cells.is_empty());
total_size += payload.len() as u16 + 2;
cells.insert(cell_idx, Cell { pos: i, payload });
}
1 => {
if page.cell_count() == 0 {
continue;
}
let cell_idx = rng.next_u64() as usize % page.cell_count();
let (_, len) = page.cell_get_raw_region(cell_idx, usable_space as usize);
drop_cell(page, cell_idx, usable_space).unwrap();
total_size -= len as u16 + 2;
cells.remove(cell_idx);
}
2 => {
defragment_page(page, usable_space);
}
3 => {
// check cells
for (i, cell) in cells.iter().enumerate() {
ensure_cell(page, i, &cell.payload);
}
assert_eq!(page.cell_count(), cells.len());
}
_ => unreachable!(),
}
let free = compute_free_space(page, usable_space);
assert_eq!(free, 4096 - total_size - header_size);
}
}
#[test]
pub fn test_fuzz_drop_defragment_insert_issue_1085() {
// This test demonstrates that the issue at https://github.com/tursodatabase/turso/issues/1085
// is fixed.
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let header_size = 8;
let mut total_size = 0;
let mut cells = Vec::new();
let usable_space = 4096;
for seed in [15292777653676891381, 9261043168681395159] {
let mut i = 1000;
tracing::info!("seed {}", seed);
let mut rng = ChaCha8Rng::seed_from_u64(seed);
while i > 0 {
i -= 1;
match rng.next_u64() % 3 {
0 => {
// allow appends with extra place to insert
let cell_idx = rng.next_u64() as usize % (page.cell_count() + 1);
let free = compute_free_space(page, usable_space);
let regs = &[Register::Value(Value::Integer(i))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let mut payload: Vec<u8> = Vec::new();
fill_cell_payload(
page,
Some(i),
&mut payload,
cell_idx,
&record,
4096,
conn.pager.borrow().clone(),
);
if (free as usize) < payload.len() + 2 {
// do not insert cells that would overflow, since overflow cells require balancing
continue;
}
insert_into_cell(page, &payload, cell_idx, 4096).unwrap();
assert!(page.overflow_cells.is_empty());
total_size += payload.len() as u16 + 2;
cells.push(Cell {
pos: i as usize,
payload,
});
}
1 => {
if page.cell_count() == 0 {
continue;
}
let cell_idx = rng.next_u64() as usize % page.cell_count();
let (_, len) = page.cell_get_raw_region(cell_idx, usable_space as usize);
drop_cell(page, cell_idx, usable_space).unwrap();
total_size -= len as u16 + 2;
cells.remove(cell_idx);
}
2 => {
defragment_page(page, usable_space);
}
_ => unreachable!(),
}
let free = compute_free_space(page, usable_space);
assert_eq!(free, 4096 - total_size - header_size);
}
}
}
// this test will create a tree like this:
// -page:2, ptr(right):4
// +cells:node[rowid:14, ptr(<=):3]
// -page:3, ptr(right):0
// +cells:leaf[rowid:11, len(payload):137, overflow:false]
// -page:4, ptr(right):0
// +cells:
#[test]
pub fn test_drop_page_in_balancing_issue_1203() {
let db = get_database();
let conn = db.connect().unwrap();
let queries = vec![
"CREATE TABLE lustrous_petit (awesome_nomous TEXT,ambitious_amargi TEXT,fantastic_daniels BLOB,stupendous_highleyman TEXT,relaxed_crane TEXT,elegant_bromma INTEGER,proficient_castro BLOB,ambitious_liman TEXT,responsible_lusbert BLOB);",
"INSERT INTO lustrous_petit VALUES ('funny_sarambi', 'hardworking_naoumov', X'666561726C6573735F68696C6C', 'elegant_iafd', 'rousing_flag', 681399778772406122, X'706572736F6E61626C655F676F6477696E6772696D6D', 'insightful_anonymous', X'706F77657266756C5F726F636861'), ('personable_holmes', 'diligent_pera', X'686F6E6573745F64696D656E73696F6E', 'energetic_raskin', 'gleaming_federasyon', -2778469859573362611, X'656666696369656E745F6769617A', 'sensible_skirda', X'66616E7461737469635F6B656174696E67'), ('inquisitive_baedan', 'brave_sphinx', X'67656E65726F75735F6D6F6E7473656E79', 'inquisitive_syndicate', 'amiable_room', 6954857961525890638, X'7374756E6E696E675F6E6965747A73636865', 'glowing_coordinator', X'64617A7A6C696E675F7365766572696E65'), ('upbeat_foxtale', 'engaging_aktimon', X'63726561746976655F6875746368696E6773', 'ample_locura', 'creative_barrett', 6413352509911171593, X'6772697070696E675F6D696E7969', 'competitive_parissi', X'72656D61726B61626C655F77696E7374616E6C6579');",
"INSERT INTO lustrous_petit VALUES ('ambitious_berry', 'devoted_marshall', X'696E7175697369746976655F6C6172657661', 'flexible_pramen', 'outstanding_stauch', 6936508362673228293, X'6C6F76696E675F6261756572', 'charming_anonymous', X'68617264776F726B696E675F616E6E6973'), ('enchanting_cohen', 'engaging_rubel', X'686F6E6573745F70726F766F63617A696F6E65', 'humorous_robin', 'imaginative_shuzo', 4762266264295288131, X'726F7573696E675F6261796572', 'vivid_bolling', X'6F7267616E697A65645F7275696E73'), ('affectionate_resistance', 'gripping_rustamova', X'6B696E645F6C61726B696E', 'bright_boulanger', 'upbeat_ashirov', -1726815435854320541, X'61646570745F66646361', 'dazzling_tashjian', X'68617264776F726B696E675F6D6F72656C'), ('zestful_ewald', 'favorable_lewis', X'73747570656E646F75735F7368616C6966', 'bright_combustion', 'blithesome_harding', 8408539013935554176, X'62726176655F737079726F706F756C6F75', 'hilarious_finnegan', X'676976696E675F6F7267616E697A696E67'), ('blithesome_picqueray', 'sincere_william', X'636F75726167656F75735F6D69746368656C6C', 'rousing_atan', 'mirthful_katie', -429232313453215091, X'6C6F76656C795F776174616E616265', 'stupendous_mcmillan', X'666F63757365645F6B61666568'), ('incredible_kid', 'friendly_yvetot', X'706572666563745F617A697A', 'helpful_manhattan', 'shining_horrox', -4318061095860308846, X'616D626974696F75735F726F7765', 'twinkling_anarkiya', X'696D6167696E61746976655F73756D6E6572');",
"INSERT INTO lustrous_petit VALUES ('sleek_graeber', 'approachable_ghazzawi', X'62726176655F6865776974747768697465', 'adaptable_zimmer', 'polite_cohn', -5464225138957223865, X'68756D6F726F75735F736E72', 'adaptable_igualada', X'6C6F76656C795F7A686F75'), ('imaginative_rautiainen', 'magnificent_ellul', X'73706C656E6469645F726F6361', 'responsible_brown', 'upbeat_uruguaya', -1185340834321792223, X'616D706C655F6D6470', 'philosophical_kelly', X'676976696E675F6461676865726D6172676F7369616E'), ('blithesome_darkness', 'creative_newell', X'6C757374726F75735F61706174726973', 'engaging_kids', 'charming_wark', -1752453819873942466, X'76697669645F6162657273', 'independent_barricadas', X'676C697374656E696E675F64686F6E6474'), ('productive_chardronnet', 'optimistic_karnage', X'64696C6967656E745F666F72657374', 'engaging_beggar', 'sensible_wolke', 784341549042407442, X'656E676167696E675F6265726B6F7769637A', 'blithesome_zuzenko', X'6E6963655F70726F766F63617A696F6E65');",
"INSERT INTO lustrous_petit VALUES ('shining_sagris', 'considerate_mother', X'6F70656E5F6D696E6465645F72696F74', 'polite_laufer', 'patient_mink', 2240393952789100851, X'636F75726167656F75735F6D636D696C6C616E', 'glowing_robertson', X'68656C7066756C5F73796D6F6E6473'), ('dazzling_glug', 'stupendous_poznan', X'706572736F6E61626C655F6672616E6B73', 'open_minded_ruins', 'qualified_manes', 2937238916206423261, X'696E736967687466756C5F68616B69656C', 'passionate_borl', X'616D6961626C655F6B7570656E647561'), ('wondrous_parry', 'knowledgeable_giovanni', X'6D6F76696E675F77696E6E', 'shimmering_aberlin', 'affectionate_calhoun', 702116954493913499, X'7265736F7572636566756C5F62726F6D6D61', 'propitious_mezzagarcia', X'746563686E6F6C6F676963616C5F6E6973686974616E69');",
"INSERT INTO lustrous_petit VALUES ('kind_room', 'hilarious_crow', X'6F70656E5F6D696E6465645F6B6F74616E7969', 'hardworking_petit', 'adaptable_zarrow', 2491343172109894986, X'70726F647563746976655F646563616C6F677565', 'willing_sindikalis', X'62726561746874616B696E675F6A6F7264616E');",
"INSERT INTO lustrous_petit VALUES ('confident_etrebilal', 'agreeable_shifu', X'726F6D616E7469635F7363687765697A6572', 'loving_debs', 'gripping_spooner', -3136910055229112693, X'677265676172696F75735F736B726F7A6974736B79', 'ample_ontiveros', X'7175616C69666965645F726F6D616E69656E6B6F'), ('competitive_call', 'technological_egoumenides', X'6469706C6F6D617469635F6D6F6E616768616E', 'willing_stew', 'frank_neal', -5973720171570031332, X'6C6F76696E675F6465737461', 'dazzling_gambone', X'70726F647563746976655F6D656E64656C676C6565736F6E'), ('favorable_delesalle', 'sensible_atterbury', X'666169746866756C5F64617861', 'bountiful_aldred', 'marvelous_malgraith', 5330463874397264493, X'706572666563745F7765726265', 'lustrous_anti', X'6C6F79616C5F626F6F6B6368696E'), ('stellar_corlu', 'loyal_espana', X'6D6F76696E675F7A6167', 'efficient_nelson', 'qualified_shepard', 1015518116803600464, X'737061726B6C696E675F76616E6469766572', 'loving_scoffer', X'686F6E6573745F756C72696368'), ('adaptable_taylor', 'shining_yasushi', X'696D6167696E61746976655F776974746967', 'alluring_blackmore', 'zestful_coeurderoy', -7094136731216188999, X'696D6167696E61746976655F757A63617465677569', 'gleaming_hernandez', X'6672616E6B5F646F6D696E69636B'), ('competitive_luis', 'stellar_fredericks', X'616772656561626C655F6D696368656C', 'optimistic_navarro', 'funny_hamilton', 4003895682491323194, X'6F70656E5F6D696E6465645F62656C6D6173', 'incredible_thorndycraft', X'656C6567616E745F746F6C6B69656E'), ('remarkable_parsons', 'sparkling_ulrich', X'737061726B6C696E675F6D6172696E636561', 'technological_leighlais', 'warmhearted_konok', -5789111414354869563, X'676976696E675F68657272696E67', 'adept_dabtara', X'667269656E646C795F72617070');",
"INSERT INTO lustrous_petit VALUES ('hardworking_norberg', 'approachable_winter', X'62726176655F68617474696E6768', 'imaginative_james', 'open_minded_capital', -5950508516718821688, X'6C757374726F75735F72616E7473', 'warmhearted_limanov', X'696E736967687466756C5F646F637472696E65'), ('generous_shatz', 'generous_finley', X'726176697368696E675F6B757A6E6574736F76', 'stunning_arrigoni', 'favorable_volcano', -8442328990977069526, X'6D6972746866756C5F616C7467656C64', 'thoughtful_zurbrugg', X'6D6972746866756C5F6D6F6E726F65'), ('frank_kerr', 'splendid_swain', X'70617373696F6E6174655F6D6470', 'flexible_dubey', 'sensible_tj', 6352949260574274181, X'656666696369656E745F6B656D736B79', 'vibrant_ege', X'736C65656B5F6272696768746F6E'), ('organized_neal', 'glistening_sugar', X'656E676167696E675F6A6F72616D', 'romantic_krieger', 'qualified_corr', -4774868512022958085, X'706572666563745F6B6F7A6172656B', 'bountiful_zaikowska', X'74686F7567687466756C5F6C6F6767616E73'), ('excellent_lydiettcarrion', 'diligent_denslow', X'666162756C6F75735F6D616E68617474616E', 'confident_tomar', 'glistening_ligt', -1134906665439009896, X'7175616C69666965645F6F6E6B656E', 'remarkable_anarkiya', X'6C6F79616C5F696E64616261'), ('passionate_melis', 'loyal_xsilent', X'68617264776F726B696E675F73637564', 'lustrous_barnes', 'nice_sugako', -4097897163377829983, X'726F6D616E7469635F6461686572', 'bright_imrie', X'73656E7369626C655F6D61726B'), ('giving_mlb', 'breathtaking_fourier', X'736C65656B5F616E61726368697374', 'glittering_malet', 'brilliant_crew', 8791228049111405793, X'626F756E746966756C5F626576656E736565', 'lovely_swords', X'70726F706974696F75735F696E656469746173'), ('honest_wright', 'qualified_rabble', X'736C65656B5F6D6172656368616C', 'shimmering_marius', 'blithesome_mckelvie', -1330737263592370654, X'6F70656E5F6D696E6465645F736D616C6C', 'energetic_gorman', X'70726F706974696F75735F6B6F74616E7969');",
"DELETE FROM lustrous_petit WHERE (ambitious_liman > 'adept_dabtaqu');",
"INSERT INTO lustrous_petit VALUES ('technological_dewey', 'fabulous_st', X'6F7074696D69737469635F73687562', 'considerate_levy', 'adaptable_kernis', 4195134012457716562, X'61646570745F736F6C6964617269646164', 'vibrant_crump', X'6C6F79616C5F72796E6572'), ('super_marjan', 'awesome_gethin', X'736C65656B5F6F737465727765696C', 'diplomatic_loidl', 'qualified_bokani', -2822676417968234733, X'6272696768745F64756E6C6170', 'creative_en', X'6D6972746866756C5F656C6F6666'), ('philosophical_malet', 'unique_garcia', X'76697669645F6E6F7262657267', 'spellbinding_fire', 'faithful_barringtonbush', -7293711848773657758, X'6272696C6C69616E745F6F6B65656665', 'gripping_guillon', X'706572736F6E61626C655F6D61726C696E7370696B65'), ('thoughtful_morefus', 'lustrous_rodriguez', X'636F6E666964656E745F67726F73736D616E726F73686368696E', 'devoted_jackson', 'propitious_karnage', -7802999054396485709, X'63617061626C655F64', 'enchanting_orwell', X'7477696E6B6C696E675F64616C616B6F676C6F75'), ('alluring_guillon', 'brilliant_pinotnoir', X'706572736F6E61626C655F6A6165636B6C65', 'open_minded_azeez', 'courageous_romania', 2126962403055072268, X'746563686E6F6C6F676963616C5F6962616E657A', 'open_minded_rosa', X'6C757374726F75735F6575726F7065'), ('courageous_kolokotronis', 'inquisitive_gahman', X'677265676172696F75735F626172726574', 'ambitious_shakur', 'fantastic_apatris', -1232732971861520864, X'737061726B6C696E675F7761746368', 'captivating_clover', X'636F6E666964656E745F736574686E65737363617374726F'), ('charming_sullivan', 'focused_congress', X'7368696D6D6572696E675F636C7562', 'wondrous_skrbina', 'giving_mendanlioglu', -6837337053772308333, X'636861726D696E675F73616C696E6173', 'rousing_hedva', X'6469706C6F6D617469635F7061796E');",
];
for query in queries {
let mut stmt = conn.query(query).unwrap().unwrap();
loop {
let row = stmt.step().expect("step");
match row {
StepResult::Done => {
break;
}
_ => {
tracing::debug!("row {:?}", row);
}
}
}
}
}
// this test will create a tree like this:
// -page:2, ptr(right):3
// +cells:
// -page:3, ptr(right):0
// +cells:
#[test]
pub fn test_drop_page_in_balancing_issue_1203_2() {
let db = get_database();
let conn = db.connect().unwrap();
let queries = vec![
"CREATE TABLE super_becky (engrossing_berger BLOB,plucky_chai BLOB,mirthful_asbo REAL,bountiful_jon REAL,competitive_petit REAL,engrossing_rexroth REAL);",
"INSERT INTO super_becky VALUES (X'636861726D696E675F6261796572', X'70726F647563746976655F70617269737369', 6847793643.408741, 7330361375.924953, -6586051582.891455, -6921021872.711397), (X'657863656C6C656E745F6F7267616E697A696E67', X'6C757374726F75735F73696E64696B616C6973', 9905774996.48619, 570325205.2246342, 5852346465.53047, 728566012.1968269), (X'7570626561745F73656174746C65', X'62726176655F6661756E', -2202725836.424899, 5424554426.388281, 2625872085.917082, -6657362503.808359), (X'676C6F77696E675F6D617877656C6C', X'7761726D686561727465645F726F77616E', -9610936969.793116, 4886606277.093559, -3414536174.7928505, 6898267795.317778), (X'64796E616D69635F616D616E', X'7374656C6C61725F7374657073', 3918935692.153696, 151068445.947237, 4582065669.356403, -3312668220.4789667), (X'64696C6967656E745F64757272757469', X'7175616C69666965645F6D726163686E696B', 5527271629.262201, 6068855126.044355, 289904657.13490677, 2975774820.0877323), (X'6469706C6F6D617469635F726F76657363696F', X'616C6C7572696E675F626F7474696369', 9844748192.66119, -6180276383.305578, -4137330511.025565, -478754566.79494476), (X'776F6E64726F75735F6173686572', X'6465766F7465645F6176657273696F6E', 2310211470.114773, -6129166761.628184, -2865371645.3145514, 7542428654.8645935), (X'617070726F61636861626C655F6B686F6C61', X'6C757374726F75735F6C696E6E656C6C', -4993113161.458349, 7356727284.362968, -3228937035.568404, -1779334005.5067253);",
"INSERT INTO super_becky VALUES (X'74686F7567687466756C5F726576696577', X'617765736F6D655F63726F73736579', 9401977997.012783, 8428201961.643898, 2822821303.052643, 4555601220.718847), (X'73706563746163756C61725F6B686179617469', X'616772656561626C655F61646F6E696465', 7414547022.041355, 365016845.73330307, 50682963.055828094, -9258802584.962656), (X'6C6F79616C5F656D6572736F6E', X'676C6F77696E675F626174616C6F', -5522070106.765736, 2712536599.6384163, 6631385631.869345, 1242757880.7583427), (X'68617264776F726B696E675F6F6B656C6C79', X'666162756C6F75735F66696C697373', 6682622809.9778805, 4233900041.917185, 9017477903.795563, -756846353.6034946), (X'68617264776F726B696E675F626C61756D616368656E', X'616666656374696F6E6174655F6B6F736D616E', -1146438175.3174362, -7545123696.438596, -6799494012.403366, 5646913977.971333), (X'66616E7461737469635F726F77616E', X'74686F7567687466756C5F7465727269746F72696573', -4414529784.916277, -6209371635.279242, 4491104121.288605, 2590223842.117277);",
"INSERT INTO super_becky VALUES (X'676C697374656E696E675F706F72746572', X'696E7175697369746976655F656D', 2986144164.3676434, 3495899172.5935287, -849280584.9386635, 6869709150.2699375), (X'696D6167696E61746976655F6D65726C696E6F', X'676C6F77696E675F616B74696D6F6E', 8733490615.829357, 6782649864.719433, 6926744218.74107, 1532081022.4379768), (X'6E6963655F726F73736574', X'626C69746865736F6D655F66696C697373', -839304300.0706863, 6155504968.705227, -2951592321.950267, -6254186334.572437), (X'636F6E666964656E745F6C69626574', X'676C696D6D6572696E675F6B6F74616E7969', -5344675223.37533, -8703794729.211002, 3987472096.020382, -7678989974.961197), (X'696D6167696E61746976655F6B61726162756C7574', X'64796E616D69635F6D6367697272', 2028227065.6995697, -7435689525.030833, 7011220815.569796, 5526665697.213846), (X'696E7175697369746976655F636C61726B', X'616666656374696F6E6174655F636C6561766572', 3016598350.546356, -3686782925.383732, 9671422351.958004, 9099319829.078941), (X'63617061626C655F746174616E6B61', X'696E6372656469626C655F6F746F6E6F6D61', 6339989259.432795, -8888997534.102034, 6855868409.475763, -2565348887.290493), (X'676F7267656F75735F6265726E657269', X'65647563617465645F6F6D6F77616C69', 6992467657.527826, -3538089391.748543, -7103111660.146708, 4019283237.3740463), (X'616772656561626C655F63756C74757265', X'73706563746163756C61725F657370616E61', 189387871.06959534, 6211851191.361202, 1786455196.9768047, 7966404387.318119);",
"INSERT INTO super_becky VALUES (X'7068696C6F736F70686963616C5F6C656967686C616973', X'666162756C6F75735F73656D696E61746F7265', 8688321500.141502, -7855144036.024546, -5234949709.573349, -9937638367.366447), (X'617070726F61636861626C655F726F677565', X'676C65616D696E675F6D7574696E79', -5351540099.744092, -3614025150.9013805, -2327775310.276925, 2223379997.077526), (X'676C696D6D6572696E675F63617263686961', X'696D6167696E61746976655F61737379616E6E', 4104832554.8371887, -5531434716.627781, 1652773397.4099865, 3884980522.1830273);",
"DELETE FROM super_becky WHERE (plucky_chai != X'7761726D686561727465645F6877616E67' AND mirthful_asbo != 9537234687.183533 AND bountiful_jon = -3538089391.748543);",
"INSERT INTO super_becky VALUES (X'706C75636B795F6D617263616E74656C', X'696D6167696E61746976655F73696D73', 9535651632.375484, 92270815.0720501, 1299048084.6248207, 6460855331.572151), (X'726F6D616E7469635F706F746C61746368', X'68756D6F726F75735F63686165686F', 9345375719.265533, 7825332230.247925, -7133157299.39028, -6939677879.6597), (X'656666696369656E745F6261676E696E69', X'63726561746976655F67726168616D', -2615470560.1954746, 6790849074.977201, -8081732985.448849, -8133707792.312794), (X'677265676172696F75735F73637564', X'7368696E696E675F67726F7570', -7996394978.2610035, -9734939565.228964, 1108439333.8481388, -5420483517.169478), (X'6C696B61626C655F6B616E6176616C6368796B', X'636F75726167656F75735F7761726669656C64', -1959869609.656724, 4176668769.239971, -8423220404.063669, 9987687878.685959), (X'657863656C6C656E745F68696C6473646F74746572', X'676C6974746572696E675F7472616D7564616E61', -5220160777.908238, 3892402687.8826714, 9803857762.617172, -1065043714.0265541), (X'6D61676E69666963656E745F717565657273', X'73757065725F717565657273', -700932053.2006226, -4706306995.253335, -5286045811.046467, 1954345265.5250092), (X'676976696E675F6275636B65726D616E6E', X'667269656E646C795F70697A7A6F6C61746F', -2186859620.9089565, -6098492099.446075, -7456845586.405931, 8796967674.444252);",
"DELETE FROM super_becky WHERE TRUE;",
"INSERT INTO super_becky VALUES (X'6F7074696D69737469635F6368616E69616C', X'656E657267657469635F6E65677261', 1683345860.4208698, 4163199322.9289455, -4192968616.7868404, -7253371206.571701), (X'616C6C7572696E675F686176656C', X'7477696E6B6C696E675F626965627579636B', -9947019174.287437, 5975899640.893995, 3844707723.8570194, -9699970750.513876), (X'6F7074696D69737469635F7A686F75', X'616D626974696F75735F636F6E6772657373', 4143738484.1081524, -2138255286.170598, 9960750454.03466, 5840575852.80299), (X'73706563746163756C61725F6A6F6E67', X'73656E7369626C655F616269646F72', -1767611042.9716015, -7684260477.580351, 4570634429.188147, -9222640121.140202), (X'706F6C6974655F6B657272', X'696E736967687466756C5F63686F646F726B6F6666', -635016769.5123329, -4359901288.494518, -7531565119.905825, -1180410948.6572971), (X'666C657869626C655F636F6D756E69656C6C6F', X'6E6963655F6172636F73', 8708423014.802425, -6276712625.559328, -771680766.2485523, 8639486874.113342);",
"DELETE FROM super_becky WHERE (mirthful_asbo < 9730384310.536528 AND plucky_chai < X'6E6963655F61726370B2');",
"DELETE FROM super_becky WHERE (mirthful_asbo > 6248699554.426553 AND bountiful_jon > 4124481472.333034);",
"INSERT INTO super_becky VALUES (X'676C696D6D6572696E675F77656C7368', X'64696C6967656E745F636F7262696E', 8217054003.369003, 8745594518.77864, 1928172803.2261295, -8375115534.050233), (X'616772656561626C655F6463', X'6C6F76696E675F666F72656D616E', -5483889804.871533, -8264576639.127487, 4770567289.404846, -3409172927.2573576), (X'6D617276656C6F75735F6173696D616B6F706F756C6F73', X'746563686E6F6C6F676963616C5F6A61637175696572', 2694858779.206814, -1703227425.3442516, -4504989231.263319, -3097265869.5230227), (X'73747570656E646F75735F64757075697364657269', X'68696C6172696F75735F6D75697268656164', 568174708.66469, -4878260547.265669, -9579691520.956625, 73507727.8100338), (X'626C69746865736F6D655F626C6F6B', X'61646570745F6C65696572', 7772117077.916897, 4590608571.321514, -881713470.657032, -9158405774.647465);",
"INSERT INTO super_becky VALUES (X'6772697070696E675F6573736578', X'67656E65726F75735F636875726368696C6C', -4180431825.598956, 7277443000.677654, 2499796052.7878246, -2858339306.235305), (X'756E697175655F6D6172656368616C', X'62726561746874616B696E675F636875726368696C6C', 1401354536.7625294, -611427440.2796707, -4621650430.463729, 1531473111.7482872), (X'657863656C6C656E745F66696E6C6579', X'666169746866756C5F62726F636B', -4020697828.0073624, -2833530733.19637, -7766170050.654022, 8661820959.434689);",
"INSERT INTO super_becky VALUES (X'756E697175655F6C617061797265', X'6C6F76696E675F7374617465', 7063237787.258968, -5425712581.365798, -7750509440.0141945, -7570954710.892544), (X'62726561746874616B696E675F6E65616C', X'636F75726167656F75735F61727269676F6E69', 289862394.2028198, 9690362375.014446, -4712463267.033899, 2474917855.0973473), (X'7477696E6B6C696E675F7368616B7572', X'636F75726167656F75735F636F6D6D6974746565', 5449035403.229155, -2159678989.597906, 3625606019.1150894, -3752010405.4475393);",
"INSERT INTO super_becky VALUES (X'70617373696F6E6174655F73686970776179', X'686F6E6573745F7363687765697A6572', 4193384746.165228, -2232151704.896323, 8615245520.962444, -9789090953.995636);",
"INSERT INTO super_becky VALUES (X'6C696B61626C655F69', X'6661766F7261626C655F6D626168', 6581403690.769894, 3260059398.9544716, -407118859.046051, -3155853965.2700634), (X'73696E636572655F6F72', X'616772656561626C655F617070656C6261756D', 9402938544.308651, -7595112171.758331, -7005316716.211025, -8368210960.419411);",
"INSERT INTO super_becky VALUES (X'6D617276656C6F75735F6B61736864616E', X'6E6963655F636F7272', -5976459640.85817, -3177550476.2092276, 2073318650.736992, -1363247319.9978447);",
"INSERT INTO super_becky VALUES (X'73706C656E6469645F6C616D656E646F6C61', X'677265676172696F75735F766F6E6E65677574', 6898259773.050102, 8973519699.707073, -25070632.280548096, -1845922497.9676847), (X'617765736F6D655F7365766572', X'656E657267657469635F706F746C61746368', -8750678407.717808, 5130907533.668898, -6778425327.111566, 3718982135.202587);",
"INSERT INTO super_becky VALUES (X'70726F706974696F75735F6D616C617465737461', X'657863656C6C656E745F65766572657474', -8846855772.62094, -6168969732.697067, -8796372709.125793, 9983557891.544613), (X'73696E636572655F6C6177', X'696E7175697369746976655F73616E647374726F6D', -6366985697.975358, 3838628702.6652164, 3680621713.3371124, -786796486.8049564), (X'706F6C6974655F676C6561736F6E', X'706C75636B795F677579616E61', -3987946379.104308, -2119148244.413993, -1448660343.6888638, -1264195510.1611118), (X'676C6974746572696E675F6C6975', X'70657273697374656E745F6F6C6976696572', 6741779968.943846, -3239809989.227495, -1026074003.5506897, 4654600514.871752);",
"DELETE FROM super_becky WHERE (engrossing_berger < X'6566651A3C70278D4E200657551D8071A1' AND competitive_petit > 1236742147.9451914);",
"INSERT INTO super_becky VALUES (X'6661766F7261626C655F726569746D616E', X'64657465726D696E65645F726974746572', -7412553243.829927, -7572665195.290464, 7879603411.222157, 3706943306.5691853), (X'70657273697374656E745F6E6F6C616E', X'676C6974746572696E675F73686570617264', 7028261282.277422, -2064164782.3494844, -5244048504.507779, -2399526243.005843), (X'6B6E6F776C6564676561626C655F70617474656E', X'70726F66696369656E745F726F7365627261756768', 3713056763.583538, 3919834206.566164, -6306779387.430006, -9939464323.995546), (X'616461707461626C655F7172757A', X'696E7175697369746976655F68617261776179', 6519349690.299835, -9977624623.820414, 7500579325.440605, -8118341251.362242);",
"INSERT INTO super_becky VALUES (X'636F6E73696465726174655F756E696F6E', X'6E6963655F6573736578', -1497385534.8720198, 9957688503.242973, 9191804202.566128, -179015615.7117195), (X'666169746866756C5F626F776C656773', X'6361707469766174696E675F6D6367697272', 893707300.1576138, 3381656294.246702, 6884723724.381908, 6248331214.701559), (X'6B6E6F776C6564676561626C655F70656E6E61', X'6B696E645F616A697468', -3335162603.6574974, 1812878172.8505402, 5115606679.658335, -5690100280.808182), (X'617765736F6D655F77696E7374616E6C6579', X'70726F706974696F75735F6361726173736F', -7395576292.503981, 4956546102.029215, -1468521769.7486448, -2968223925.60355), (X'636F75726167656F75735F77617266617265', X'74686F7567687466756C5F7361707068697265', 7052982930.566017, -9806098174.104418, -6910398936.377775, -4041963031.766964), (X'657863656C6C656E745F6B62', X'626C69746865736F6D655F666F75747A6F706F756C6F73', 6142173202.994768, 5193126957.544125, -7522202722.983735, -1659088056.594862), (X'7374756E6E696E675F6E6576616461', X'626F756E746966756C5F627572746F6E', -3822097036.7628613, -3458840259.240303, 2544472236.86788, 6928890176.466003);",
"INSERT INTO super_becky VALUES (X'706572736F6E61626C655F646D69747269', X'776F6E64726F75735F6133796F', 2651932559.0077076, 811299402.3174248, -8271909238.671928, 6761098864.189909);",
"INSERT INTO super_becky VALUES (X'726F7573696E675F6B6C6166657461', X'64617A7A6C696E675F6B6E617070', 9370628891.439335, -5923332007.253168, -2763161830.5880013, -9156194881.875952), (X'656666696369656E745F6C6576656C6C6572', X'616C6C7572696E675F706561636F7474', 3102641409.8314342, 2838360181.628153, 2466271662.169607, 1015942181.844162), (X'6469706C6F6D617469635F7065726B696E73', X'726F7573696E675F6172616269', -1551071129.022499, -8079487600.186886, 7832984580.070087, -6785993247.895652), (X'626F756E746966756C5F6D656D62657273', X'706F77657266756C5F70617269737369', 9226031830.72445, 7012021503.536997, -2297349030.108919, -2738320055.4710903), (X'676F7267656F75735F616E6172636F7469636F', X'68656C7066756C5F7765696C616E64', -8394163480.676959, -2978605095.699134, -6439355448.021704, 9137308022.281273), (X'616666656374696F6E6174655F70726F6C65696E666F', X'706C75636B795F73616E7A', 3546758708.3524914, -1870964264.9353771, 338752565.3643894, -3908023657.299715), (X'66756E6E795F706F70756C61697265', X'6F75747374616E64696E675F626576696E67746F6E', -1533858145.408224, 6164225076.710373, 8419445987.622173, 584555253.6852646), (X'76697669645F6D7474', X'7368696D6D6572696E675F70616F6E65737361', 5512251366.193035, -8680583180.123213, -4445968638.153208, -3274009935.4229546);",
"INSERT INTO super_becky VALUES (X'7068696C6F736F70686963616C5F686F7264', X'657863656C6C656E745F67757373656C7370726F757473', -816909447.0240917, -3614686681.8786583, 7701617524.26067, -4541962047.183721), (X'616D6961626C655F69676E6174696576', X'6D61676E69666963656E745F70726F76696E6369616C69', -1318532883.847702, -4918966075.976474, -7601723171.33518, -3515747704.3847466), (X'70726F66696369656E745F32303137', X'66756E6E795F6E77', -1264540201.518032, 8227396547.578808, 6245093925.183641, -8368355328.110817);",
"INSERT INTO super_becky VALUES (X'77696C6C696E675F6E6F6B6B65', X'726F6D616E7469635F677579616E61', 6618610796.3707695, -3814565359.1524105, 1663106272.4565296, -4175107840.768817), (X'72656C617865645F7061766C6F76', X'64657465726D696E65645F63686F646F726B6F6666', -3350029338.034504, -3520837855.4619064, 3375167499.631817, -8866806483.714607), (X'616D706C655F67696464696E6773', X'667269656E646C795F6A6F686E', 1458864959.9942684, 1344208968.0486107, 9335156635.91314, -6180643697.918882), (X'72656C617865645F6C65726F79', X'636F75726167656F75735F6E6F72646772656E', -5164986537.499656, 8820065797.720875, 6146530425.891005, 6949241471.958189), (X'666F63757365645F656D6D61', X'696D6167696E61746976655F6C6F6E67', -9587619060.80035, 6128068142.184402, 6765196076.956905, 800226302.7983418);",
"INSERT INTO super_becky VALUES (X'616D626974696F75735F736F6E67', X'706572666563745F6761686D616E', 4989979180.706432, -9374266591.537058, 314459621.2820797, -3200029490.9553604), (X'666561726C6573735F626C6174', X'676C697374656E696E675F616374696F6E', -8512203612.903147, -7625581186.013805, -9711122307.234787, -301590929.32751083), (X'617765736F6D655F6669646573', X'666169746866756C5F63756E6E696E6768616D', -1428228887.9205084, 7669883854.400173, 5604446195.905277, -1509311057.9653416), (X'68756D6F726F75735F77697468647261776E', X'62726561746874616B696E675F7472617562656C', -7292778713.676636, -6728132503.529593, 2805341768.7252483, 330416975.2300949);",
"INSERT INTO super_becky VALUES (X'677265676172696F75735F696873616E', X'7374656C6C61725F686172746D616E', 8819210651.1988, 5298459883.813452, 7293544377.958424, 460475869.72971725), (X'696E736967687466756C5F62657765726E69747A', X'676C65616D696E675F64656E736C6F77', -6911957282.193239, 1754196756.2193146, -6316860403.693853, -3094020672.236368), (X'6D6972746866756C5F616D6265727261656B656C6C79', X'68756D6F726F75735F6772617665', 1785574023.0269203, -372056983.82761574, 4133719439.9538956, 9374053482.066044), (X'76697669645F736169747461', X'7761726D686561727465645F696E656469746173', 2787071361.6099434, 9663839418.553448, -5934098589.901047, -9774745509.608858), (X'61646570745F6F6375727279', X'6C696B61626C655F726569746D616E', -3098540915.1310825, 5460848322.672174, -6012867197.519758, 6769770087.661135), (X'696E646570656E64656E745F6F', X'656C6567616E745F726F6F726461', 1462542860.3143978, 3360904654.2464733, 5458876201.665213, -5522844849.529962), (X'72656D61726B61626C655F626F6B616E69', X'6F70656E5F6D696E6465645F686F72726F78', 7589481760.867031, 7970075121.546291, 7513467575.5213585, 9663061478.289227), (X'636F6E666964656E745F6C616479', X'70617373696F6E6174655F736B726F7A6974736B79', 8266917234.53915, -7172933478.625412, 309854059.94031143, -8309837814.497616);",
"DELETE FROM super_becky WHERE (competitive_petit != 8725256604.165474 OR engrossing_rexroth > -3607424615.7839313 OR plucky_chai < X'726F7573696E675F6216E20375');",
"INSERT INTO super_becky VALUES (X'7368696E696E675F736F6C69646169726573', X'666561726C6573735F63617264616E', -170727879.20838165, 2744601113.384678, 5676912434.941502, 6757573601.657997), (X'636F75726167656F75735F706C616E636865', X'696E646570656E64656E745F636172736F6E', -6271723086.761938, -180566679.7470188, -1285774632.134449, 1359665735.7842407), (X'677265676172696F75735F7374616D61746F76', X'7374756E6E696E675F77696C64726F6F7473', -6210238866.953484, 2492683045.8287067, -9688894361.68205, 5420275482.048567), (X'696E646570656E64656E745F6F7267616E697A6572', X'676C6974746572696E675F736F72656C', 9291163783.3073, -6843003475.769236, -1320245894.772686, -5023483808.044955), (X'676C6F77696E675F6E65736963', X'676C65616D696E675F746F726D6579', 829526382.8027191, 9365690945.1316, 4761505764.826195, -4149154965.0024815), (X'616C6C7572696E675F646F637472696E65', X'6E6963655F636C6561766572', 3896644979.981762, -288600448.8016701, 9462856570.130062, -909633752.5993862);",
];
for query in queries {
let mut stmt = conn.query(query).unwrap().unwrap();
loop {
let row = stmt.step().expect("step");
match row {
StepResult::Done => {
break;
}
_ => {
tracing::debug!("row {:?}", row);
}
}
}
}
}
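    // Free-space accounting on a fresh leaf page: after inserting one record,
    // compute_free_space must return the usable space (4096) minus the 8-byte
    // leaf page header, the cell payload, and the 2-byte cell pointer slot.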
#[test]
pub fn test_free_space() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let header_size = 8;
let usable_space = 4096;
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let payload = add_record(0, 0, page, record, &conn);
let free = compute_free_space(page, usable_space);
assert_eq!(free, 4096 - payload.len() as u16 - 2 - header_size);
}
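    // Defragmenting a page with a single live cell must preserve the cell
    // count and the exact payload bytes; only the free space is compacted.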
#[test]
pub fn test_defragment_1() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let usable_space = 4096;
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let payload = add_record(0, 0, page, record, &conn);
assert_eq!(page.cell_count(), 1);
defragment_page(page, usable_space);
assert_eq!(page.cell_count(), 1);
let (start, len) = page.cell_get_raw_region(0, usable_space as usize);
let buf = page.as_ptr();
assert_eq!(&payload, &buf[start..start + len]);
}
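    // Drop the only cell on a page, then insert a smaller record in its place;
    // the new payload must read back byte-for-byte from its cell region.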
#[test]
pub fn test_insert_drop_insert() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let usable_space = 4096;
let regs = &[
Register::Value(Value::Integer(0)),
Register::Value(Value::Text(Text::new("aaaaaaaa"))),
];
let record = ImmutableRecord::from_registers(regs, regs.len());
let _ = add_record(0, 0, page, record, &conn);
assert_eq!(page.cell_count(), 1);
drop_cell(page, 0, usable_space).unwrap();
assert_eq!(page.cell_count(), 0);
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let payload = add_record(0, 0, page, record, &conn);
assert_eq!(page.cell_count(), 1);
let (start, len) = page.cell_get_raw_region(0, usable_space as usize);
let buf = page.as_ptr();
assert_eq!(&payload, &buf[start..start + len]);
}
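    // Stress the drop/insert path: 100 cycles of dropping the sole cell and
    // reinserting it, verifying the payload each time (exercises reuse of
    // freed space within the cell content area).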
#[test]
pub fn test_insert_drop_insert_multiple() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let usable_space = 4096;
let regs = &[
Register::Value(Value::Integer(0)),
Register::Value(Value::Text(Text::new("aaaaaaaa"))),
];
let record = ImmutableRecord::from_registers(regs, regs.len());
let _ = add_record(0, 0, page, record, &conn);
for _ in 0..100 {
assert_eq!(page.cell_count(), 1);
drop_cell(page, 0, usable_space).unwrap();
assert_eq!(page.cell_count(), 0);
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let payload = add_record(0, 0, page, record, &conn);
assert_eq!(page.cell_count(), 1);
let (start, len) = page.cell_get_raw_region(0, usable_space as usize);
let buf = page.as_ptr();
assert_eq!(&payload, &buf[start..start + len]);
}
}
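    // Insert three cells, then drop index 1 twice (removing the second and
    // third cells); the first cell's payload must remain intact.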
#[test]
pub fn test_drop_a_few_insert() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let usable_space = 4096;
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let payload = add_record(0, 0, page, record, &conn);
let regs = &[Register::Value(Value::Integer(1))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let _ = add_record(1, 1, page, record, &conn);
let regs = &[Register::Value(Value::Integer(2))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let _ = add_record(2, 2, page, record, &conn);
drop_cell(page, 1, usable_space).unwrap();
drop_cell(page, 1, usable_space).unwrap();
ensure_cell(page, 0, &payload);
}
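    // Regression test: a minimized insert/drop/defragment sequence found by
    // the page fuzzer; passes as long as no step panics or corrupts the page.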
#[test]
pub fn test_fuzz_victim_1() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let page = page.get();
let page = page.get_contents();
let usable_space = 4096;
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let _ = add_record(0, 0, page, record, &conn);
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let _ = add_record(0, 0, page, record, &conn);
drop_cell(page, 0, usable_space).unwrap();
defragment_page(page, usable_space);
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let _ = add_record(0, 1, page, record, &conn);
drop_cell(page, 0, usable_space).unwrap();
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let _ = add_record(0, 1, page, record, &conn);
}
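    // Regression test: another fuzzer-minimized sequence of inserts, drops and
    // defragmentations, finishing with a free-space computation that would
    // trip on a corrupted cell layout.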
#[test]
pub fn test_fuzz_victim_2() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let usable_space = 4096;
let insert = |pos, page| {
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let _ = add_record(0, pos, page, record, &conn);
};
let drop = |pos, page| {
drop_cell(page, pos, usable_space).unwrap();
};
let defragment = |page| {
defragment_page(page, usable_space);
};
let page = page.get();
defragment(page.get_contents());
defragment(page.get_contents());
insert(0, page.get_contents());
drop(0, page.get_contents());
insert(0, page.get_contents());
drop(0, page.get_contents());
insert(0, page.get_contents());
defragment(page.get_contents());
defragment(page.get_contents());
drop(0, page.get_contents());
defragment(page.get_contents());
insert(0, page.get_contents());
drop(0, page.get_contents());
insert(0, page.get_contents());
insert(1, page.get_contents());
insert(1, page.get_contents());
insert(0, page.get_contents());
drop(3, page.get_contents());
drop(2, page.get_contents());
compute_free_space(page.get_contents(), usable_space);
}
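    // Regression test: after this fuzzer-minimized sequence exactly one cell
    // remains, so the free space must equal the usable space minus the page
    // header and that one cell (payload plus its 2-byte pointer).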
#[test]
pub fn test_fuzz_victim_3() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let usable_space = 4096;
let insert = |pos, page| {
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let _ = add_record(0, pos, page, record, &conn);
};
let drop = |pos, page| {
drop_cell(page, pos, usable_space).unwrap();
};
let defragment = |page| {
defragment_page(page, usable_space);
};
let regs = &[Register::Value(Value::Integer(0))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let mut payload: Vec<u8> = Vec::new();
fill_cell_payload(
page.get().get_contents(),
Some(0),
&mut payload,
0,
&record,
4096,
conn.pager.borrow().clone(),
);
let page = page.get();
insert(0, page.get_contents());
defragment(page.get_contents());
insert(0, page.get_contents());
defragment(page.get_contents());
insert(0, page.get_contents());
drop(2, page.get_contents());
drop(0, page.get_contents());
let free = compute_free_space(page.get_contents(), usable_space);
let total_size = payload.len() + 2;
assert_eq!(
free,
usable_space - page.get_contents().header_size() as u16 - total_size as u16
);
dbg!(free);
}
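    // Insert 10,000 sequentially increasing rowids, which forces repeated
    // page splits, then validate the whole tree and look up every key.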
#[test]
pub fn btree_insert_sequential() {
let (pager, root_page, _, _) = empty_btree();
let mut keys = Vec::new();
let num_columns = 5;
for i in 0..10000 {
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
tracing::info!("INSERT INTO t VALUES ({});", i,);
let regs = &[Register::Value(Value::Integer(i))];
let value = ImmutableRecord::from_registers(regs, regs.len());
tracing::trace!("before insert {}", i);
run_until_done(
|| {
let key = SeekKey::TableRowId(i);
cursor.seek(key, SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
run_until_done(
|| cursor.insert(&BTreeKey::new_table_rowid(i, Some(&value)), true),
pager.deref(),
)
.unwrap();
keys.push(i);
}
if matches!(validate_btree(pager.clone(), root_page), (_, false)) {
panic!("invalid btree");
}
tracing::trace!(
"=========== btree ===========\n{}\n\n",
format_btree(pager.clone(), root_page, 0)
);
for key in keys.iter() {
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
let key = Value::Integer(*key);
let exists = run_until_done(|| cursor.exists(&key), pager.deref()).unwrap();
assert!(exists, "key not found {key}");
}
}
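    // Free-space accounting with a large cell: a 3600-byte blob nearly fills
    // the 4096-byte page, and the accounting must still balance to the byte.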
#[test]
pub fn test_big_payload_compute_free() {
let db = get_database();
let conn = db.connect().unwrap();
let page = get_page(2);
let usable_space = 4096;
let regs = &[Register::Value(Value::Blob(vec![0; 3600]))];
let record = ImmutableRecord::from_registers(regs, regs.len());
let mut payload: Vec<u8> = Vec::new();
fill_cell_payload(
page.get().get_contents(),
Some(0),
&mut payload,
0,
&record,
4096,
conn.pager.borrow().clone(),
);
insert_into_cell(page.get().get_contents(), &payload, 0, 4096).unwrap();
let free = compute_free_space(page.get().get_contents(), usable_space);
let total_size = payload.len() + 2;
assert_eq!(
free,
usable_space - page.get().get_contents().header_size() as u16 - total_size as u16
);
dbg!(free);
}
#[test]
pub fn test_delete_balancing() {
        // What this test does:
        // 1. Insert 10,000 rows of ~15 bytes of payload each. That is roughly
        //    40 pages (10,000 * 15 / 4096 ≈ 37) at about 240 rows per page.
        // 2. Delete enough rows to leave empty or nearly empty pages, which
        //    triggers balancing (behavior verified against SQLite).
        // 3. Verify the validity/integrity of the btree after deleting, and
        //    also verify that the deleted values are actually gone.
let (pager, root_page, _, _) = empty_btree();
let num_columns = 5;
        // Insert 10,000 records into the BTree.
for i in 1..=10000 {
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
let regs = &[Register::Value(Value::Text(Text::new("hello world")))];
let value = ImmutableRecord::from_registers(regs, regs.len());
run_until_done(
|| {
let key = SeekKey::TableRowId(i);
cursor.seek(key, SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
run_until_done(
|| cursor.insert(&BTreeKey::new_table_rowid(i, Some(&value)), true),
pager.deref(),
)
.unwrap();
}
if let (_, false) = validate_btree(pager.clone(), root_page) {
panic!("Invalid B-tree after insertion");
}
let num_columns = 5;
// Delete records with 500 <= key <= 3500
for i in 500..=3500 {
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
let seek_key = SeekKey::TableRowId(i);
let seek_result = run_until_done(
|| cursor.seek(seek_key.clone(), SeekOp::GE { eq_only: true }),
pager.deref(),
)
.unwrap();
if matches!(seek_result, SeekResult::Found) {
run_until_done(|| cursor.delete(), pager.deref()).unwrap();
}
}
// Verify that records with key < 500 and key > 3500 still exist in the BTree.
for i in 1..=10000 {
if (500..=3500).contains(&i) {
continue;
}
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
let key = Value::Integer(i);
let exists = run_until_done(|| cursor.exists(&key), pager.deref()).unwrap();
assert!(exists, "Key {i} should exist but doesn't");
}
// Verify the deleted records don't exist.
for i in 500..=3500 {
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
let key = Value::Integer(i);
let exists = run_until_done(|| cursor.exists(&key), pager.deref()).unwrap();
assert!(!exists, "Deleted key {i} still exists");
}
}
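    // Each 8192-character text exceeds the 4096-byte page, so every insert
    // must spill into an overflow page chain; a full forward scan then checks
    // that all rowids come back in order.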
#[test]
pub fn test_overflow_cells() {
let iterations = 10_usize;
let mut huge_texts = Vec::new();
for i in 0..iterations {
let mut huge_text = String::new();
for _j in 0..8192 {
huge_text.push((b'A' + i as u8) as char);
}
huge_texts.push(huge_text);
}
let (pager, root_page, _, _) = empty_btree();
let num_columns = 5;
for (i, huge_text) in huge_texts.iter().enumerate().take(iterations) {
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
tracing::info!("INSERT INTO t VALUES ({});", i,);
let regs = &[Register::Value(Value::Text(Text {
value: huge_text.as_bytes().to_vec(),
subtype: crate::types::TextSubtype::Text,
}))];
let value = ImmutableRecord::from_registers(regs, regs.len());
tracing::trace!("before insert {}", i);
tracing::debug!(
"=========== btree before ===========\n{}\n\n",
format_btree(pager.clone(), root_page, 0)
);
run_until_done(
|| {
let key = SeekKey::TableRowId(i as i64);
cursor.seek(key, SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
run_until_done(
|| cursor.insert(&BTreeKey::new_table_rowid(i as i64, Some(&value)), true),
pager.deref(),
)
.unwrap();
tracing::debug!(
"=========== btree after ===========\n{}\n\n",
format_btree(pager.clone(), root_page, 0)
);
}
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
cursor.move_to_root().unwrap();
for i in 0..iterations {
let has_next = run_until_done(|| cursor.next(), pager.deref()).unwrap();
if !has_next {
panic!("expected Some(rowid) but got {:?}", cursor.has_record.get());
};
let rowid = run_until_done(|| cursor.rowid(), pager.deref())
.unwrap()
.unwrap();
assert_eq!(rowid, i as i64, "got!=expected");
}
}
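    // Read and overwrite payload bytes through the cursor. The offset of 2
    // below assumes the record layout for a single small blob: a 1-byte header
    // size varint followed by a 1-byte serial type, so the blob body starts at
    // byte 2 of the record.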
#[test]
pub fn test_read_write_payload_with_offset() {
let (pager, root_page, _, _) = empty_btree();
let num_columns = 5;
let mut cursor = BTreeCursor::new(None, pager.clone(), root_page, num_columns);
        let offset = 2; // blob data starts at payload offset 2, right after the record header
let initial_text = "hello world";
let initial_blob = initial_text.as_bytes().to_vec();
let regs = &[Register::Value(Value::Blob(initial_blob.clone()))];
let value = ImmutableRecord::from_registers(regs, regs.len());
run_until_done(
|| {
let key = SeekKey::TableRowId(1);
cursor.seek(key, SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
run_until_done(
|| cursor.insert(&BTreeKey::new_table_rowid(1, Some(&value)), true),
pager.deref(),
)
.unwrap();
cursor
.stack
.set_cell_index(cursor.stack.current_cell_index() + 1);
let mut read_buffer = Vec::new();
run_until_done(
|| {
cursor.read_write_payload_with_offset(
offset,
&mut read_buffer,
initial_blob.len() as u32,
false,
)
},
pager.deref(),
)
.unwrap();
assert_eq!(
std::str::from_utf8(&read_buffer).unwrap(),
initial_text,
"Read data doesn't match expected data"
);
let mut modified_hello = "olleh".as_bytes().to_vec();
run_until_done(
|| cursor.read_write_payload_with_offset(offset, &mut modified_hello, 5, true),
pager.deref(),
)
.unwrap();
let mut verification_buffer = Vec::new();
run_until_done(
|| {
cursor.read_write_payload_with_offset(
offset,
&mut verification_buffer,
initial_blob.len() as u32,
false,
)
},
pager.deref(),
)
.unwrap();
assert_eq!(
std::str::from_utf8(&verification_buffer).unwrap(),
"olleh world",
"Modified data doesn't match expected result"
);
}
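    // Same as above, but the payload spans ~10 overflow pages, so reads and
    // writes must walk the overflow chain. The offset arithmetic below assumes
    // a 4-byte record header: a 1-byte header size varint plus a 3-byte serial
    // type varint for a blob this large.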
#[test]
pub fn test_read_write_payload_with_overflow_page() {
let (pager, root_page, _, _) = empty_btree();
let num_columns = 5;
let mut cursor = BTreeCursor::new(None, pager.clone(), root_page, num_columns);
        let mut large_blob = vec![b'A'; 40960 - 11]; // large blob: 40960 bytes = 10 pages' worth of data
let hello_world = b"hello world";
large_blob.extend_from_slice(hello_world);
let regs = &[Register::Value(Value::Blob(large_blob.clone()))];
let value = ImmutableRecord::from_registers(regs, regs.len());
run_until_done(
|| {
let key = SeekKey::TableRowId(1);
cursor.seek(key, SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
run_until_done(
|| cursor.insert(&BTreeKey::new_table_rowid(1, Some(&value)), true),
pager.deref(),
)
.unwrap();
cursor
.stack
.set_cell_index(cursor.stack.current_cell_index() + 1);
        let offset_to_hello_world = 4 + (large_blob.len() - 11) as u32; // skip the 4-byte record header, then all but the trailing 11 bytes
let mut read_buffer = Vec::new();
run_until_done(
|| {
cursor.read_write_payload_with_offset(
offset_to_hello_world,
&mut read_buffer,
11,
false,
)
},
pager.deref(),
)
.unwrap();
assert_eq!(
std::str::from_utf8(&read_buffer).unwrap(),
"hello world",
"Failed to read 'hello world' from overflow page"
);
let mut modified_hello = "olleh".as_bytes().to_vec();
run_until_done(
|| {
cursor.read_write_payload_with_offset(
offset_to_hello_world,
&mut modified_hello,
5,
true,
)
},
pager.deref(),
)
.unwrap();
let mut verification_buffer = Vec::new();
run_until_done(
|| {
cursor.read_write_payload_with_offset(
offset_to_hello_world,
&mut verification_buffer,
hello_world.len() as u32,
false,
)
},
pager.deref(),
)
.unwrap();
assert_eq!(
std::str::from_utf8(&verification_buffer).unwrap(),
"olleh world",
"Modified data doesn't match expected result"
);
}
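    /// Run `action` to completion via the pager's I/O loop, blocking whenever
    /// it yields for I/O instead of returning a final value.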
fn run_until_done<T>(action: impl FnMut() -> Result<IOResult<T>>, pager: &Pager) -> Result<T> {
pager.io.block(action)
}
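    // Fill a fresh leaf page with random-size cells, free a random prefix or
    // suffix of them with page_free_array, and verify that the surviving
    // cells still match the copies taken before the free.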
#[test]
fn test_free_array() {
let (mut rng, seed) = rng_from_time_or_env();
tracing::info!("seed={}", seed);
const ITERATIONS: usize = 10000;
for _ in 0..ITERATIONS {
let mut cell_array = CellArray {
cell_payloads: Vec::new(),
cell_count_per_page_cumulative: [0; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE],
};
let mut cells_cloned = Vec::new();
let (pager, _, _, _) = empty_btree();
let page_type = PageType::TableLeaf;
let page = pager.allocate_page().unwrap();
let page = Arc::new(BTreePageInner {
page: RefCell::new(page),
});
btree_init_page(&page, page_type, 0, pager.usable_space() as u16);
let page = page.get();
let mut size = (rng.next_u64() % 100) as u16;
let mut i = 0;
            // add random-size cells until the next one no longer fits
while compute_free_space(page.get_contents(), pager.usable_space() as u16) >= size + 10
{
insert_cell(i, size, page.get_contents(), pager.clone());
i += 1;
size = (rng.next_u64() % 1024) as u16;
}
            // Build a cell array referencing the inserted cells, keeping owned
            // copies for later comparison
let contents = page.get_contents();
for cell_idx in 0..contents.cell_count() {
let buf = contents.as_ptr();
let (start, len) = contents.cell_get_raw_region(cell_idx, pager.usable_space());
cell_array
.cell_payloads
.push(to_static_buf(&mut buf[start..start + len]));
cells_cloned.push(buf[start..start + len].to_vec());
}
debug_validate_cells!(contents, pager.usable_space() as u16);
            // now free a prefix or suffix of the cells we added
let cells_before_free = contents.cell_count();
let size = rng.next_u64() as usize % cells_before_free;
let prefix = rng.next_u64() % 2 == 0;
let start = if prefix {
0
} else {
contents.cell_count() - size
};
let removed = page_free_array(
contents,
start,
size,
&cell_array,
pager.usable_space() as u16,
)
.unwrap();
            // if a prefix was freed, shift the remaining cell pointers left to close the gap
if prefix {
shift_cells_left(contents, cells_before_free, removed);
}
assert_eq!(removed, size);
assert_eq!(contents.cell_count(), cells_before_free - size);
#[cfg(debug_assertions)]
debug_validate_cells_core(contents, pager.usable_space() as u16);
            // check the remaining cells match the pre-free copies
let mut cell_idx_cloned = if prefix { size } else { 0 };
for cell_idx in 0..contents.cell_count() {
let buf = contents.as_ptr();
let (start, len) = contents.cell_get_raw_region(cell_idx, pager.usable_space());
let cell_in_page = &buf[start..start + len];
let cell_in_array = &cells_cloned[cell_idx_cloned];
assert_eq!(cell_in_page, cell_in_array);
cell_idx_cloned += 1;
}
}
}
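    // Test helper: build a cell holding a zero-filled blob of `size` bytes and
    // insert it at `cell_idx` on the given page.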
fn insert_cell(cell_idx: u64, size: u16, contents: &mut PageContent, pager: Rc<Pager>) {
let mut payload = Vec::new();
let regs = &[Register::Value(Value::Blob(vec![0; size as usize]))];
let record = ImmutableRecord::from_registers(regs, regs.len());
fill_cell_payload(
contents,
Some(cell_idx as i64),
&mut payload,
cell_idx as usize,
&record,
pager.usable_space(),
pager.clone(),
);
insert_into_cell(
contents,
&payload,
cell_idx as usize,
pager.usable_space() as u16,
)
.unwrap();
}
}