diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 112e3bfaa..f5a3ed98c 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -537,6 +537,28 @@ pub struct BTreeCursor { pub record_cursor: RefCell, } +/// We store the cell index and cell count for each page in the stack. +/// The reason we store the cell count is because we need to know when we are at the end of the page, +/// without having to perform IO to get the ancestor pages. +#[derive(Debug, Clone, Copy, Default)] +struct BTreeNodeState { + cell_idx: i32, + cell_count: Option, +} + +impl BTreeNodeState { + /// Check if the current cell index is at the end of the page. + /// This information is used to determine whether a child page should move up to its parent. + /// If the child page is the rightmost leaf page and it has reached the end, this means all of its ancestors have + /// already reached the end, so it should not go up because there are no more records to traverse. + fn is_at_end(&self) -> bool { + let cell_count = self.cell_count.expect("cell_count is not set"); + // cell_idx == cell_count means: we will traverse to the rightmost pointer next. + // cell_idx == cell_count + 1 means: we have already gone down to the rightmost pointer. + self.cell_idx == cell_count + 1 + } +} + impl BTreeCursor { pub fn new( mv_cursor: Option>>, @@ -556,7 +578,7 @@ impl BTreeCursor { overflow_state: None, stack: PageStack { current_page: Cell::new(-1), - cell_indices: RefCell::new([0; BTCURSOR_MAX_DEPTH + 1]), + node_states: RefCell::new([BTreeNodeState::default(); BTCURSOR_MAX_DEPTH + 1]), stack: RefCell::new([const { None }; BTCURSOR_MAX_DEPTH + 1]), }, reusable_immutable_record: RefCell::new(None), @@ -1151,6 +1173,15 @@ impl BTreeCursor { .copy_from_slice(&buffer[..num_bytes as usize]); } + /// Check if any ancestor pages still have cells to iterate. + /// If not, traversing back up to parent is of no use because we are at the end of the tree. 
+ fn ancestor_pages_have_more_children(&self) -> bool { + let node_states = self.stack.node_states.borrow(); + (0..self.stack.current()) + .rev() + .any(|idx| !node_states[idx].is_at_end()) + } + /// Move the cursor to the next record and return it. /// Used in forwards iteration, which is the default. #[instrument(skip(self), level = Level::INFO, name = "next")] @@ -1196,47 +1227,44 @@ impl BTreeCursor { self.going_upwards = false; return Ok(IOResult::Done(true)); } + // Important to advance only after loading the page in order to not advance > 1 times self.stack.advance(); let cell_idx = self.stack.current_cell_index() as usize; tracing::debug!(id = mem_page_rc.get().get().id, cell = cell_idx, "current"); - if cell_idx == cell_count { - // do rightmost - let has_parent = self.stack.has_parent(); - match contents.rightmost_pointer() { - Some(right_most_pointer) => { + if cell_idx >= cell_count { + let rightmost_already_traversed = cell_idx > cell_count; + match (contents.rightmost_pointer(), rightmost_already_traversed) { + (Some(right_most_pointer), false) => { + // do rightmost self.stack.advance(); let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); continue; } - None => { - if has_parent { + _ => { + if self.ancestor_pages_have_more_children() { tracing::trace!("moving simple upwards"); self.going_upwards = true; self.stack.pop(); continue; } else { + // If none of the ancestor pages have more children to iterate, that means we are at the end of the btree and should stop iterating. 
return Ok(IOResult::Done(false)); } } } } - if cell_idx > contents.cell_count() { - // end - let has_parent = self.stack.current() > 0; - if has_parent { - tracing::debug!("moving upwards"); - self.going_upwards = true; - self.stack.pop(); - continue; - } else { - return Ok(IOResult::Done(false)); - } - } - turso_assert!(cell_idx < contents.cell_count(), "cell index out of bounds"); + turso_assert!( + cell_idx < contents.cell_count(), + "cell index out of bounds: cell_idx={}, cell_count={}, page_type={:?} page_id={}", + cell_idx, + contents.cell_count(), + contents.page_type(), + mem_page_rc.get().get().id + ); let cell = contents.cell_get(cell_idx, self.usable_space())?; match &cell { @@ -5435,15 +5463,15 @@ struct PageStack { /// Pointer to the current page being consumed current_page: Cell, /// List of pages in the stack. Root page will be in index 0 - stack: RefCell<[Option; BTCURSOR_MAX_DEPTH + 1]>, + pub stack: RefCell<[Option; BTCURSOR_MAX_DEPTH + 1]>, /// List of cell indices in the stack. - /// cell_indices[current_page] is the current cell index being consumed. Similarly - /// cell_indices[current_page-1] is the cell index of the parent of the current page + /// node_states[current_page] is the current cell index being consumed. Similarly + /// node_states[current_page-1] is the cell index of the parent of the current page /// that we save in case of going back up. 
/// There are two points that need special attention: - /// If cell_indices[current_page] = -1, it indicates that the current iteration has reached the start of the current_page - /// If cell_indices[current_page] = `cell_count`, it means that the current iteration has reached the end of the current_page - cell_indices: RefCell<[i32; BTCURSOR_MAX_DEPTH + 1]>, + /// If node_states[current_page].cell_idx = -1, it indicates that the current iteration has reached the start of the current_page + /// If node_states[current_page].cell_idx = `cell_count`, it means that the current iteration has reached the end of the current_page + node_states: RefCell<[BTreeNodeState; BTCURSOR_MAX_DEPTH + 1]>, } impl PageStack { @@ -5477,6 +5505,7 @@ impl PageStack { ); } } + self.populate_parent_cell_count(); self.increment_current(); let current = self.current_page.get(); assert!( @@ -5484,8 +5513,45 @@ impl PageStack { "corrupted database, stack is bigger than expected" ); assert!(current >= 0); + + // Pin the page to prevent it from being evicted while on the stack + page.get().pin(); + self.stack.borrow_mut()[current as usize] = Some(page); - self.cell_indices.borrow_mut()[current as usize] = starting_cell_idx; + self.node_states.borrow_mut()[current as usize] = BTreeNodeState { + cell_idx: starting_cell_idx, + cell_count: None, // we don't know the cell count yet, so we set it to None. any code pushing a child page onto the stack MUST set the parent page's cell_count. + }; + } + + /// Populate the parent page's cell count. + /// This is needed so that we can, from a child page, check ancestor pages' positions relative to their cell indices + /// without having to perform IO to get the ancestor page contents. + /// + /// This rests on the assumption that the parent page is already in memory whenever a child is pushed onto the stack. + /// We currently ensure this by pinning all the pages on [PageStack] to the page cache so that they cannot be evicted. 
+ fn populate_parent_cell_count(&self) { + let stack_empty = self.current_page.get() == -1; + if stack_empty { + return; + } + let current = self.current(); + let stack = self.stack.borrow(); + let page = stack[current].as_ref().unwrap(); + let page = page.get(); + turso_assert!( + page.is_pinned(), + "parent page {} is not pinned", + page.get().id + ); + turso_assert!( + page.is_loaded(), + "parent page {} is not loaded", + page.get().id + ); + let contents = page.get_contents(); + let cell_count = contents.cell_count() as i32; + self.node_states.borrow_mut()[current].cell_count = Some(cell_count); } fn push(&self, page: BTreePage) { @@ -5503,7 +5569,13 @@ impl PageStack { let current = self.current_page.get(); assert!(current >= 0); tracing::trace!(current); - self.cell_indices.borrow_mut()[current as usize] = 0; + + // Unpin the page before removing it from the stack + if let Some(page) = &self.stack.borrow()[current as usize] { + page.get().unpin(); + } + + self.node_states.borrow_mut()[current as usize] = BTreeNodeState::default(); self.stack.borrow_mut()[current as usize] = None; self.decrement_current(); } @@ -5530,7 +5602,7 @@ impl PageStack { /// Cell index of the current page fn current_cell_index(&self) -> i32 { let current = self.current(); - self.cell_indices.borrow()[current] + self.node_states.borrow()[current].cell_idx } /// Check if the current cell index is less than 0. 
@@ -5546,36 +5618,54 @@ impl PageStack { fn advance(&self) { let current = self.current(); tracing::trace!( - curr_cell_index = self.cell_indices.borrow()[current], - cell_indices = ?self.cell_indices, + curr_cell_index = self.node_states.borrow()[current].cell_idx, + node_states = ?self.node_states.borrow().iter().map(|state| state.cell_idx).collect::<Vec<_>>(), ); - self.cell_indices.borrow_mut()[current] += 1; + self.node_states.borrow_mut()[current].cell_idx += 1; } #[instrument(skip(self), level = Level::INFO, name = "pagestack::retreat")] fn retreat(&self) { let current = self.current(); tracing::trace!( - curr_cell_index = self.cell_indices.borrow()[current], - cell_indices = ?self.cell_indices, + curr_cell_index = self.node_states.borrow()[current].cell_idx, + node_states = ?self.node_states.borrow().iter().map(|state| state.cell_idx).collect::<Vec<_>>(), ); - self.cell_indices.borrow_mut()[current] -= 1; + self.node_states.borrow_mut()[current].cell_idx -= 1; } fn set_cell_index(&self, idx: i32) { let current = self.current(); - self.cell_indices.borrow_mut()[current] = idx; + self.node_states.borrow_mut()[current].cell_idx = idx; } fn has_parent(&self) -> bool { self.current_page.get() > 0 } + fn unpin_all_if_pinned(&self) { + self.stack + .borrow_mut() + .iter_mut() + .flatten() + .for_each(|page| { + let _ = page.get().try_unpin(); + }); + } + fn clear(&self) { + self.unpin_all_if_pinned(); + self.current_page.set(-1); } } +impl Drop for PageStack { + fn drop(&mut self) { + self.unpin_all_if_pinned(); + } +} + /// Used for redistributing cells during a balance operation. struct CellArray { /// The actual cell data. 
diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 93016ab35..3e69c1cf9 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -47,6 +47,7 @@ pub enum CacheError { InternalError(String), Locked, Dirty { pgno: usize }, + Pinned { pgno: usize }, ActiveRefs, Full, KeyExists, @@ -134,6 +135,7 @@ impl DumbLruPageCache { } let ptr = *self.map.borrow().get(&key).unwrap(); + // Try to detach from LRU list first, can fail self.detach(ptr, clean_page)?; let ptr = self.map.borrow_mut().remove(&key).unwrap(); @@ -177,10 +179,11 @@ impl DumbLruPageCache { } } - fn detach( + fn _detach( &mut self, mut entry: NonNull, clean_page: bool, + allow_detach_pinned: bool, ) -> Result<(), CacheError> { let entry_mut = unsafe { entry.as_mut() }; if entry_mut.page.is_locked() { @@ -191,6 +194,11 @@ impl DumbLruPageCache { pgno: entry_mut.page.get().id, }); } + if entry_mut.page.is_pinned() && !allow_detach_pinned { + return Err(CacheError::Pinned { + pgno: entry_mut.page.get().id, + }); + } if clean_page { entry_mut.page.clear_loaded(); @@ -201,6 +209,22 @@ impl DumbLruPageCache { Ok(()) } + fn detach( + &mut self, + entry: NonNull, + clean_page: bool, + ) -> Result<(), CacheError> { + self._detach(entry, clean_page, false) + } + + fn detach_even_if_pinned( + &mut self, + entry: NonNull, + clean_page: bool, + ) -> Result<(), CacheError> { + self._detach(entry, clean_page, true) + } + fn unlink(&mut self, mut entry: NonNull) { let (next, prev) = unsafe { let c = entry.as_mut(); @@ -276,7 +300,8 @@ impl DumbLruPageCache { while need_to_evict > 0 && current_opt.is_some() { let current = current_opt.unwrap(); let entry = unsafe { current.as_ref() }; - current_opt = entry.prev; // Pick prev before modifying entry + // Pick prev before modifying entry + current_opt = entry.prev; match self.delete(entry.key.clone()) { Err(_) => {} Ok(_) => need_to_evict -= 1, @@ -296,7 +321,7 @@ impl DumbLruPageCache { 
self.map.borrow_mut().remove(¤t_entry.as_ref().key); } let next = unsafe { current_entry.as_ref().next }; - self.detach(current_entry, true)?; + self.detach_even_if_pinned(current_entry, true)?; unsafe { assert!(!current_entry.as_ref().page.is_dirty()); } diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 9276363e0..065809792 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -7,7 +7,7 @@ use crate::storage::sqlite3_ondisk::{self, DatabaseHeader, PageContent, PageType use crate::storage::wal::{CheckpointResult, Wal}; use crate::types::IOResult; use crate::{return_if_io, Completion}; -use crate::{Buffer, Connection, LimboError, Result}; +use crate::{turso_assert, Buffer, Connection, LimboError, Result}; use parking_lot::RwLock; use std::cell::{Cell, OnceCell, RefCell, UnsafeCell}; use std::collections::HashSet; @@ -28,6 +28,7 @@ pub struct PageInner { pub flags: AtomicUsize, pub contents: Option, pub id: usize, + pub pin_count: AtomicUsize, } #[derive(Debug)] @@ -57,6 +58,7 @@ impl Page { flags: AtomicUsize::new(0), contents: None, id, + pin_count: AtomicUsize::new(0), }), } } @@ -139,6 +141,41 @@ impl Page { PageType::TableLeaf | PageType::TableInterior => false, } } + + /// Pin the page to prevent it from being evicted from the page cache. + pub fn pin(&self) { + self.get().pin_count.fetch_add(1, Ordering::Relaxed); + } + + /// Unpin the page to allow it to be evicted from the page cache. + pub fn unpin(&self) { + let was_pinned = self.try_unpin(); + + turso_assert!( + was_pinned, + "Attempted to unpin page {} that was not pinned", + self.get().id + ); + } + + /// Try to unpin the page if it's pinned, otherwise do nothing. + /// Returns true if the page was originally pinned. 
+ pub fn try_unpin(&self) -> bool { + self.get() + .pin_count + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |current| { + if current == 0 { + None + } else { + Some(current - 1) + } + }) + .is_ok() + } + + pub fn is_pinned(&self) -> bool { + self.get().pin_count.load(Ordering::SeqCst) > 0 + } } #[derive(Clone, Copy, Debug)] /// The state of the current pager cache flush.