Merge 'btree: fix trying to go upwards when we are already at the end of the entire btree' from Jussi Saurio

## What does this fix
This PR fixes an issue with BTree upwards traversal logic where we would
try to go up to a parent node in `next()` even though we are at the very
end of the btree. This behavior can leave the cursor incorrectly
positioned at an interior node when it should be at the right edge of
the rightmost leaf.
## Why doesn't it cause problems on main
This bug is masked on `main` by every table `insert()` (wastefully)
calling `find_cell()`:
- `op_new_rowid` called, let's say the current max rowid is `666`.
Cursor is left pointing at `666`.
- `insert()` is called with rowid `667`, cursor is currently pointing at
`666`, which is incorrect.
- `find_cell()` does a binary search every time, and hence somewhat
accidentally positions the cursor correctly _after_ `666` so that the
insert goes to the correct place
## Why was this issue found
In #1988, I am removing `find_cell()` entirely in favor of always
performing a seek to the correct location - and skipping `seek` when it
is not required, saving us from wasting a binary search on every insert
- but this change means that we need to call `next()` after
`op_new_rowid` to have the cursor positioned correctly at the new
insertion slot. Doing this surfaces this upwards traversal bug in that
PR branch.
## Details of solution
- Store `cell_count` together with `cell_idx` in pagestack, so that
children can know whether their parents have reached their end without
doing IO
- To make this foolproof, pin pages on `PageStack` so the page cache
cannot evict them during tree traversal
- `cell_indices` renamed to `node_states` since it now carries more
information (cell index AND count, instead of just index)

Reviewed-by: Pere Diaz Bou <pere-altea@homail.com>

Closes #2005
This commit is contained in:
Jussi Saurio
2025-07-16 19:44:21 +03:00
3 changed files with 193 additions and 41 deletions

View File

@@ -537,6 +537,28 @@ pub struct BTreeCursor {
pub record_cursor: RefCell<RecordCursor>,
}
/// Per-level iteration state kept in the page stack: the current cell index
/// plus (once known) the page's total cell count.
/// Caching the cell count lets a child page decide whether an ancestor still
/// has cells left to visit without performing IO to re-fetch ancestor pages.
#[derive(Debug, Clone, Copy, Default)]
struct BTreeNodeState {
    cell_idx: i32,
    cell_count: Option<i32>,
}
impl BTreeNodeState {
    /// Returns true once iteration has moved past this page's rightmost pointer.
    ///
    /// Used when deciding whether a child should climb back up to its parent:
    /// if the rightmost leaf has been exhausted, every ancestor is also at its
    /// end, so going up would be pointless — there are no more records to traverse.
    ///
    /// Index semantics:
    /// - `cell_idx == cell_count`     => the rightmost pointer is the next stop.
    /// - `cell_idx == cell_count + 1` => the rightmost pointer was already taken.
    fn is_at_end(&self) -> bool {
        let count = self.cell_count.expect("cell_count is not set");
        self.cell_idx == count + 1
    }
}
impl BTreeCursor {
pub fn new(
mv_cursor: Option<Rc<RefCell<MvCursor>>>,
@@ -556,7 +578,7 @@ impl BTreeCursor {
overflow_state: None,
stack: PageStack {
current_page: Cell::new(-1),
cell_indices: RefCell::new([0; BTCURSOR_MAX_DEPTH + 1]),
node_states: RefCell::new([BTreeNodeState::default(); BTCURSOR_MAX_DEPTH + 1]),
stack: RefCell::new([const { None }; BTCURSOR_MAX_DEPTH + 1]),
},
reusable_immutable_record: RefCell::new(None),
@@ -1151,6 +1173,15 @@ impl BTreeCursor {
.copy_from_slice(&buffer[..num_bytes as usize]);
}
/// Check if any ancestor pages still have cells to iterate.
/// If not, traversing back up to parent is of no use because we are at the end of the tree.
///
/// Walks the stack entries below the current page (its ancestors), relying on
/// the `cell_count` cached in each `BTreeNodeState`, so no IO is needed to
/// inspect the ancestor pages themselves.
fn ancestor_pages_have_more_children(&self) -> bool {
let node_states = self.stack.node_states.borrow();
// Scan from the nearest ancestor towards the root; any ancestor not at its
// end means there are still records left somewhere in the tree.
(0..self.stack.current())
.rev()
.any(|idx| !node_states[idx].is_at_end())
}
/// Move the cursor to the next record and return it.
/// Used in forwards iteration, which is the default.
#[instrument(skip(self), level = Level::INFO, name = "next")]
@@ -1196,47 +1227,44 @@ impl BTreeCursor {
self.going_upwards = false;
return Ok(IOResult::Done(true));
}
// Important to advance only after loading the page in order to not advance > 1 times
self.stack.advance();
let cell_idx = self.stack.current_cell_index() as usize;
tracing::debug!(id = mem_page_rc.get().get().id, cell = cell_idx, "current");
if cell_idx == cell_count {
// do rightmost
let has_parent = self.stack.has_parent();
match contents.rightmost_pointer() {
Some(right_most_pointer) => {
if cell_idx >= cell_count {
let rightmost_already_traversed = cell_idx > cell_count;
match (contents.rightmost_pointer(), rightmost_already_traversed) {
(Some(right_most_pointer), false) => {
// do rightmost
self.stack.advance();
let mem_page = self.read_page(right_most_pointer as usize)?;
self.stack.push(mem_page);
continue;
}
None => {
if has_parent {
_ => {
if self.ancestor_pages_have_more_children() {
tracing::trace!("moving simple upwards");
self.going_upwards = true;
self.stack.pop();
continue;
} else {
// If none of the ancestor pages have more children to iterate, that means we are at the end of the btree and should stop iterating.
return Ok(IOResult::Done(false));
}
}
}
}
if cell_idx > contents.cell_count() {
// end
let has_parent = self.stack.current() > 0;
if has_parent {
tracing::debug!("moving upwards");
self.going_upwards = true;
self.stack.pop();
continue;
} else {
return Ok(IOResult::Done(false));
}
}
turso_assert!(cell_idx < contents.cell_count(), "cell index out of bounds");
turso_assert!(
cell_idx < contents.cell_count(),
"cell index out of bounds: cell_idx={}, cell_count={}, page_type={:?} page_id={}",
cell_idx,
contents.cell_count(),
contents.page_type(),
mem_page_rc.get().get().id
);
let cell = contents.cell_get(cell_idx, self.usable_space())?;
match &cell {
@@ -5435,15 +5463,15 @@ struct PageStack {
/// Pointer to the current page being consumed
current_page: Cell<i32>,
/// List of pages in the stack. Root page will be in index 0
stack: RefCell<[Option<BTreePage>; BTCURSOR_MAX_DEPTH + 1]>,
pub stack: RefCell<[Option<BTreePage>; BTCURSOR_MAX_DEPTH + 1]>,
/// List of cell indices in the stack.
/// cell_indices[current_page] is the current cell index being consumed. Similarly
/// cell_indices[current_page-1] is the cell index of the parent of the current page
/// node_states[current_page] is the current cell index being consumed. Similarly
/// node_states[current_page-1] is the cell index of the parent of the current page
/// that we save in case of going back up.
/// There are two points that need special attention:
/// If cell_indices[current_page] = -1, it indicates that the current iteration has reached the start of the current_page
/// If cell_indices[current_page] = `cell_count`, it means that the current iteration has reached the end of the current_page
cell_indices: RefCell<[i32; BTCURSOR_MAX_DEPTH + 1]>,
/// If node_states[current_page] = -1, it indicates that the current iteration has reached the start of the current_page
/// If node_states[current_page] = `cell_count`, it means that the current iteration has reached the end of the current_page
node_states: RefCell<[BTreeNodeState; BTCURSOR_MAX_DEPTH + 1]>,
}
impl PageStack {
@@ -5477,6 +5505,7 @@ impl PageStack {
);
}
}
self.populate_parent_cell_count();
self.increment_current();
let current = self.current_page.get();
assert!(
@@ -5484,8 +5513,45 @@ impl PageStack {
"corrupted database, stack is bigger than expected"
);
assert!(current >= 0);
// Pin the page to prevent it from being evicted while on the stack
page.get().pin();
self.stack.borrow_mut()[current as usize] = Some(page);
self.cell_indices.borrow_mut()[current as usize] = starting_cell_idx;
self.node_states.borrow_mut()[current as usize] = BTreeNodeState {
cell_idx: starting_cell_idx,
cell_count: None, // we don't know the cell count yet, so we set it to None. any code pushing a child page onto the stack MUST set the parent page's cell_count.
};
}
/// Populate the parent page's cell count.
/// This is needed so that we can, from a child page, check the ancestor pages' position relative to their cell count
/// without having to perform IO to get the ancestor page contents.
///
/// This rests on the assumption that the parent page is already in memory whenever a child is pushed onto the stack.
/// We currently ensure this by pinning all the pages on [PageStack] to the page cache so that they cannot be evicted.
fn populate_parent_cell_count(&self) {
// Called before `increment_current` on the push path, so `current` still
// refers to the page that is about to become the parent of the pushed child.
let stack_empty = self.current_page.get() == -1;
if stack_empty {
// Pushing the root: there is no parent whose count needs recording.
return;
}
let current = self.current();
let stack = self.stack.borrow();
let page = stack[current].as_ref().unwrap();
let page = page.get();
// Both asserts guard the in-memory assumption above: a pinned, loaded page
// can be read without IO.
turso_assert!(
page.is_pinned(),
"parent page {} is not pinned",
page.get().id
);
turso_assert!(
page.is_loaded(),
"parent page {} is not loaded",
page.get().id
);
let contents = page.get_contents();
let cell_count = contents.cell_count() as i32;
self.node_states.borrow_mut()[current].cell_count = Some(cell_count);
}
fn push(&self, page: BTreePage) {
@@ -5503,7 +5569,13 @@ impl PageStack {
let current = self.current_page.get();
assert!(current >= 0);
tracing::trace!(current);
self.cell_indices.borrow_mut()[current as usize] = 0;
// Unpin the page before removing it from the stack
if let Some(page) = &self.stack.borrow()[current as usize] {
page.get().unpin();
}
self.node_states.borrow_mut()[current as usize] = BTreeNodeState::default();
self.stack.borrow_mut()[current as usize] = None;
self.decrement_current();
}
@@ -5530,7 +5602,7 @@ impl PageStack {
/// Cell index of the current page, i.e. the cell currently being consumed
/// on the top-of-stack page.
fn current_cell_index(&self) -> i32 {
let current = self.current();
self.node_states.borrow()[current].cell_idx
}
/// Check if the current cell index is less than 0.
@@ -5546,36 +5618,54 @@ impl PageStack {
/// Advance the current page's cell index by one (forward iteration step).
fn advance(&self) {
let current = self.current();
tracing::trace!(
curr_cell_index = self.node_states.borrow()[current].cell_idx,
node_states = ?self.node_states.borrow().iter().map(|state| state.cell_idx).collect::<Vec<_>>(),
);
self.node_states.borrow_mut()[current].cell_idx += 1;
}
/// Move the current page's cell index back by one (backward iteration step).
#[instrument(skip(self), level = Level::INFO, name = "pagestack::retreat")]
fn retreat(&self) {
let current = self.current();
tracing::trace!(
curr_cell_index = self.node_states.borrow()[current].cell_idx,
node_states = ?self.node_states.borrow().iter().map(|state| state.cell_idx).collect::<Vec<_>>(),
);
self.node_states.borrow_mut()[current].cell_idx -= 1;
}
/// Set the current page's cell index to an absolute position.
fn set_cell_index(&self, idx: i32) {
let current = self.current();
self.node_states.borrow_mut()[current].cell_idx = idx;
}
/// Whether the current page has a parent, i.e. it is not the root (stack index 0).
fn has_parent(&self) -> bool {
self.current_page.get() > 0
}
/// Unpin every page still held on the stack; entries that are already
/// unpinned are left alone (`try_unpin` is a no-op at pin count 0).
/// Used on wholesale teardown (`clear` / `Drop`), where some entries may
/// have been unpinned individually already.
fn unpin_all_if_pinned(&self) {
self.stack
.borrow_mut()
.iter_mut()
.flatten()
.for_each(|page| {
let _ = page.get().try_unpin();
});
}
/// Mark the stack empty. Unpins all held pages first so the page cache is
/// free to evict them again. Note: stale `stack` entries are left in place
/// and simply overwritten by subsequent pushes.
fn clear(&self) {
self.unpin_all_if_pinned();
self.current_page.set(-1);
}
}
/// Ensure no pages stay pinned in the page cache if the stack is dropped
/// mid-traversal; a leaked pin would make the page unevictable.
impl Drop for PageStack {
fn drop(&mut self) {
self.unpin_all_if_pinned();
}
}
/// Used for redistributing cells during a balance operation.
struct CellArray {
/// The actual cell data.

View File

@@ -47,6 +47,7 @@ pub enum CacheError {
InternalError(String),
Locked,
Dirty { pgno: usize },
Pinned { pgno: usize },
ActiveRefs,
Full,
KeyExists,
@@ -134,6 +135,7 @@ impl DumbLruPageCache {
}
let ptr = *self.map.borrow().get(&key).unwrap();
// Try to detach from LRU list first, can fail
self.detach(ptr, clean_page)?;
let ptr = self.map.borrow_mut().remove(&key).unwrap();
@@ -177,10 +179,11 @@ impl DumbLruPageCache {
}
}
fn detach(
fn _detach(
&mut self,
mut entry: NonNull<PageCacheEntry>,
clean_page: bool,
allow_detach_pinned: bool,
) -> Result<(), CacheError> {
let entry_mut = unsafe { entry.as_mut() };
if entry_mut.page.is_locked() {
@@ -191,6 +194,11 @@ impl DumbLruPageCache {
pgno: entry_mut.page.get().id,
});
}
if entry_mut.page.is_pinned() && !allow_detach_pinned {
return Err(CacheError::Pinned {
pgno: entry_mut.page.get().id,
});
}
if clean_page {
entry_mut.page.clear_loaded();
@@ -201,6 +209,22 @@ impl DumbLruPageCache {
Ok(())
}
/// Detach an entry from the LRU list, refusing with `CacheError::Pinned`
/// if the page is pinned (i.e. held by an active btree traversal).
fn detach(
&mut self,
entry: NonNull<PageCacheEntry>,
clean_page: bool,
) -> Result<(), CacheError> {
self._detach(entry, clean_page, false)
}
/// Detach an entry from the LRU list even if the page is pinned.
// NOTE(review): the visible caller is a cache-teardown path that also removes
// the entry from the map and asserts the page is not dirty — confirm pinned
// pages are genuinely safe to drop on that path.
fn detach_even_if_pinned(
&mut self,
entry: NonNull<PageCacheEntry>,
clean_page: bool,
) -> Result<(), CacheError> {
self._detach(entry, clean_page, true)
}
fn unlink(&mut self, mut entry: NonNull<PageCacheEntry>) {
let (next, prev) = unsafe {
let c = entry.as_mut();
@@ -276,7 +300,8 @@ impl DumbLruPageCache {
while need_to_evict > 0 && current_opt.is_some() {
let current = current_opt.unwrap();
let entry = unsafe { current.as_ref() };
current_opt = entry.prev; // Pick prev before modifying entry
// Pick prev before modifying entry
current_opt = entry.prev;
match self.delete(entry.key.clone()) {
Err(_) => {}
Ok(_) => need_to_evict -= 1,
@@ -296,7 +321,7 @@ impl DumbLruPageCache {
self.map.borrow_mut().remove(&current_entry.as_ref().key);
}
let next = unsafe { current_entry.as_ref().next };
self.detach(current_entry, true)?;
self.detach_even_if_pinned(current_entry, true)?;
unsafe {
assert!(!current_entry.as_ref().page.is_dirty());
}

View File

@@ -7,7 +7,7 @@ use crate::storage::sqlite3_ondisk::{self, DatabaseHeader, PageContent, PageType
use crate::storage::wal::{CheckpointResult, Wal};
use crate::types::IOResult;
use crate::{return_if_io, Completion};
use crate::{Buffer, Connection, LimboError, Result};
use crate::{turso_assert, Buffer, Connection, LimboError, Result};
use parking_lot::RwLock;
use std::cell::{Cell, OnceCell, RefCell, UnsafeCell};
use std::collections::HashSet;
@@ -28,6 +28,7 @@ pub struct PageInner {
pub flags: AtomicUsize,
pub contents: Option<PageContent>,
pub id: usize,
pub pin_count: AtomicUsize,
}
#[derive(Debug)]
@@ -57,6 +58,7 @@ impl Page {
flags: AtomicUsize::new(0),
contents: None,
id,
pin_count: AtomicUsize::new(0),
}),
}
}
@@ -139,6 +141,41 @@ impl Page {
PageType::TableLeaf | PageType::TableInterior => false,
}
}
/// Pin the page to prevent it from being evicted from the page cache.
/// Pins nest: each `pin` must be balanced by one `unpin`.
// NOTE(review): the increment uses Ordering::Relaxed while try_unpin/is_pinned
// use SeqCst — presumably fine if pinning only happens from the owning cursor,
// but worth confirming the intended memory-ordering contract.
pub fn pin(&self) {
self.get().pin_count.fetch_add(1, Ordering::Relaxed);
}
/// Unpin the page to allow it to be evicted from the page cache.
/// Asserts (via `turso_assert`) that the page was actually pinned — an
/// unbalanced unpin indicates broken pin bookkeeping.
pub fn unpin(&self) {
let was_pinned = self.try_unpin();
turso_assert!(
was_pinned,
"Attempted to unpin page {} that was not pinned",
self.get().id
);
}
/// Try to unpin the page if it's pinned, otherwise do nothing.
/// Returns true if the page was originally pinned.
/// Uses a CAS loop (`fetch_update`) so the counter can never underflow
/// below zero even under concurrent unpin attempts.
pub fn try_unpin(&self) -> bool {
self.get()
.pin_count
.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |current| {
if current == 0 {
None
} else {
Some(current - 1)
}
})
.is_ok()
}
/// Whether at least one pin is currently held on this page.
pub fn is_pinned(&self) -> bool {
self.get().pin_count.load(Ordering::SeqCst) > 0
}
}
#[derive(Clone, Copy, Debug)]
/// The state of the current pager cache flush.