Merge 'Minor refactoring of btree' from meteorgan

Reviewed-by: Jussi Saurio <jussi.saurio@gmail.com>

Closes #1917
This commit is contained in:
Jussi Saurio
2025-07-09 19:13:39 +03:00
2 changed files with 70 additions and 94 deletions

View File

@@ -68,7 +68,14 @@ pub mod offset {
/// The number of cells in the page (u16).
pub const BTREE_CELL_COUNT: usize = 3;
/// A pointer to first byte of cell allocated content from top (u16).
/// A pointer to the first byte of cell allocated content from top (u16).
///
/// A zero value for this integer is interpreted as 65,536.
/// If a page contains no cells (which is only possible for a root page of a table that
/// contains no rows) then the offset to the cell content area will equal the page size minus
/// the bytes of reserved space. If the database uses a 65536-byte page size and the
/// reserved space is zero (the usual value for reserved space) then the cell content offset of
/// an empty page wants to be 6,5536
///
/// SQLite strives to place cells as far toward the end of the b-tree page as it can, in
/// order to leave space for future growth of the cell pointer array. This means that the
@@ -2218,10 +2225,10 @@ impl BTreeCursor {
cell_idx,
self.usable_space() as u16,
)?;
contents.overflow_cells.len()
!contents.overflow_cells.is_empty()
};
self.stack.set_cell_index(cell_idx as i32);
if overflow > 0 {
if overflow {
// A balance will happen so save the key we were inserting
tracing::debug!(page = page.get().get().id, cell_idx, "balance triggered:");
self.save_context(match bkey {
@@ -4280,13 +4287,10 @@ impl BTreeCursor {
page.get().get_contents().page_type(),
PageType::TableLeaf | PageType::TableInterior
) {
let _target_rowid = match return_if_io!(self.rowid()) {
Some(rowid) => rowid,
_ => {
self.state = CursorState::None;
return Ok(CursorResult::Ok(()));
}
};
if return_if_io!(self.rowid()).is_none() {
self.state = CursorState::None;
return Ok(CursorResult::Ok(()));
}
} else if self.reusable_immutable_record.borrow().is_none() {
self.state = CursorState::None;
return Ok(CursorResult::Ok(()));
@@ -4395,8 +4399,6 @@ impl BTreeCursor {
let page = page.get();
let contents = page.get_contents();
let is_last_cell = cell_idx == contents.cell_count().saturating_sub(1);
let delete_info = self.state.mut_delete_info().unwrap();
if !contents.is_leaf() {
delete_info.state = DeleteState::InteriorNodeReplacement {
@@ -4405,7 +4407,7 @@ impl BTreeCursor {
post_balancing_seek_key,
};
} else {
let contents = page.get().contents.as_mut().unwrap();
let is_last_cell = cell_idx == contents.cell_count().saturating_sub(1);
drop_cell(contents, cell_idx, self.usable_space() as u16)?;
let delete_info = self.state.mut_delete_info().unwrap();
@@ -6062,8 +6064,8 @@ fn free_cell_range(
pc
};
if offset <= page.cell_content_area() {
if offset < page.cell_content_area() {
if (offset as u32) <= page.cell_content_area() {
if (offset as u32) < page.cell_content_area() {
return_corrupt!("Free block before content area");
}
if pointer_to_pc != page.offset as u16 + offset::BTREE_FIRST_FREEBLOCK as u16 {
@@ -6238,8 +6240,13 @@ fn insert_into_cell(
Ok(())
}
/// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte
/// and end of cell pointer area.
/// The amount of free space is the sum of:
/// #1. The size of the unallocated region
/// #2. Fragments (isolated 1-3 byte chunks of free space within the cell content area)
/// #3. freeblocks (linked list of blocks of at least 4 bytes within the cell content area that
/// are not in use due to e.g. deletions)
/// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected
/// to be between first cell byte and end of cell pointer area.
#[allow(unused_assignments)]
fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
// TODO(pere): maybe free space is not calculated correctly with offset
@@ -6248,38 +6255,14 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
// space that is not reserved for extensions by sqlite. Usually reserved_space is 0.
let usable_space = usable_space as usize;
let mut cell_content_area_start = page.cell_content_area();
// A zero value for the cell content area pointer is interpreted as 65536.
// See https://www.sqlite.org/fileformat.html
// The max page size for a sqlite database is 64kiB i.e. 65536 bytes.
// 65536 is u16::MAX + 1, and since cell content grows from right to left, this means
// the cell content area pointer is at the end of the page,
// i.e.
// 1. the page size is 64kiB
// 2. there are no cells on the page
// 3. there is no reserved space at the end of the page
if cell_content_area_start == 0 {
cell_content_area_start = u16::MAX;
}
// The amount of free space is the sum of:
// #1. the size of the unallocated region
// #2. fragments (isolated 1-3 byte chunks of free space within the cell content area)
// #3. freeblocks (linked list of blocks of at least 4 bytes within the cell content area that are not in use due to e.g. deletions)
let pointer_size = if matches!(page.page_type(), PageType::TableLeaf | PageType::IndexLeaf) {
0
} else {
4
};
let first_cell = page.offset + 8 + pointer_size + (2 * page.cell_count());
let mut free_space_bytes =
cell_content_area_start as usize + page.num_frag_free_bytes() as usize;
let first_cell = page.offset + page.header_size() + (2 * page.cell_count());
let cell_content_area_start = page.cell_content_area() as usize;
let mut free_space_bytes = cell_content_area_start + page.num_frag_free_bytes() as usize;
// #3 is computed by iterating over the freeblocks linked list
let mut cur_freeblock_ptr = page.first_freeblock() as usize;
if cur_freeblock_ptr > 0 {
if cur_freeblock_ptr < cell_content_area_start as usize {
if cur_freeblock_ptr < cell_content_area_start {
// Freeblocks exist in the cell content area e.g. after deletions
// They should never exist in the unused area of the page.
todo!("corrupted page");
@@ -6293,7 +6276,7 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
size = page.read_u16_no_offset(cur_freeblock_ptr + 2) as usize; // next 2 bytes in freeblock = size of current freeblock
free_space_bytes += size;
// Freeblocks are in order from left to right on the page,
// so next pointer should > current pointer + its size, or 0 if no next block exists.
// so the next pointer should > current pointer + its size, or 0 if no next block exists.
if next <= cur_freeblock_ptr + size + 3 {
break;
}
@@ -6301,8 +6284,8 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
}
// Next should always be 0 (NULL) at this point since we have reached the end of the freeblocks linked list
assert!(
next == 0,
assert_eq!(
next, 0,
"corrupted page: freeblocks list not in ascending order"
);
@@ -6317,10 +6300,6 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
"corrupted page: free space is greater than usable space"
);
// if( nFree>usableSize || nFree<iCellFirst ){
// return SQLITE_CORRUPT_PAGE(pPage);
// }
free_space_bytes as u16 - first_cell as u16
}
@@ -6414,7 +6393,6 @@ fn fill_cell_payload(
cell_payload.resize(prev_size + space_left + 4, 0);
let mut pointer = unsafe { cell_payload.as_mut_ptr().add(prev_size) };
let mut pointer_to_next = unsafe { cell_payload.as_mut_ptr().add(prev_size + space_left) };
let mut overflow_pages = Vec::new();
loop {
let to_copy = space_left.min(to_copy_buffer.len());
@@ -6428,7 +6406,6 @@ fn fill_cell_payload(
// we still have bytes to add, we will need to allocate new overflow page
// FIXME: handle page cache is full
let overflow_page = pager.allocate_overflow_page();
overflow_pages.push(overflow_page.clone());
{
let id = overflow_page.get().id as u32;
let contents = overflow_page.get().contents.as_mut().unwrap();

View File

@@ -45,11 +45,17 @@
use tracing::{instrument, Level};
use super::pager::PageRef;
use super::wal::LimboRwLock;
use crate::error::LimboError;
use crate::fast_lock::SpinLock;
use crate::io::{
Buffer, Complete, Completion, CompletionType, ReadCompletion, SyncCompletion, WriteCompletion,
};
use crate::storage::btree::offset::{
BTREE_CELL_CONTENT_AREA, BTREE_CELL_COUNT, BTREE_FIRST_FREEBLOCK, BTREE_FRAGMENTED_BYTES_COUNT,
BTREE_PAGE_TYPE, BTREE_RIGHTMOST_PTR,
};
use crate::storage::buffer_pool::BufferPool;
use crate::storage::database::DatabaseStorage;
use crate::storage::pager::Pager;
@@ -65,9 +71,6 @@ use std::rc::Rc;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::Arc;
use super::pager::PageRef;
use super::wal::LimboRwLock;
/// The size of the database header in bytes.
pub const DATABASE_HEADER_SIZE: usize = 100;
// DEFAULT_CACHE_SIZE negative values mean that we store the amount of pages a XKiB of memory can hold.
@@ -360,6 +363,8 @@ pub struct OverflowCell {
#[derive(Debug)]
pub struct PageContent {
/// the position where page content starts. it's 100 for page 1(database file header is 100 bytes),
/// 0 for all other pages.
pub offset: usize,
pub buffer: Arc<RefCell<Buffer>>,
pub overflow_cells: Vec<OverflowCell>,
@@ -376,6 +381,7 @@ impl Clone for PageContent {
}
}
const CELL_POINTER_SIZE_BYTES: usize = 2;
impl PageContent {
pub fn new(offset: usize, buffer: Arc<RefCell<Buffer>>) -> Self {
Self {
@@ -386,7 +392,7 @@ impl PageContent {
}
pub fn page_type(&self) -> PageType {
self.read_u8(0).try_into().unwrap()
self.read_u8(BTREE_PAGE_TYPE).try_into().unwrap()
}
pub fn maybe_page_type(&self) -> Option<PageType> {
@@ -455,19 +461,14 @@ impl PageContent {
buf[self.offset + pos..self.offset + pos + 4].copy_from_slice(&value.to_be_bytes());
}
/// The second field of the b-tree page header is the offset of the first freeblock, or zero if there are no freeblocks on the page.
/// A freeblock is a structure used to identify unallocated space within a b-tree page.
/// Freeblocks are organized as a chain.
///
/// To be clear, freeblocks do not mean the regular unallocated free space to the left of the cell content area pointer, but instead
/// blocks of at least 4 bytes WITHIN the cell content area that are not in use due to e.g. deletions.
/// The offset of the first freeblock, or zero if there are no freeblocks on the page.
pub fn first_freeblock(&self) -> u16 {
self.read_u16(1)
self.read_u16(BTREE_FIRST_FREEBLOCK)
}
/// The number of cells on the page.
pub fn cell_count(&self) -> usize {
self.read_u16(3) as usize
self.read_u16(BTREE_CELL_COUNT) as usize
}
/// The size of the cell pointer array in bytes.
@@ -489,11 +490,13 @@ impl PageContent {
}
/// The start of the cell content area.
/// SQLite strives to place cells as far toward the end of the b-tree page as it can,
/// in order to leave space for future growth of the cell pointer array.
/// = the cell content area pointer moves leftward as cells are added to the page
pub fn cell_content_area(&self) -> u16 {
self.read_u16(5)
pub fn cell_content_area(&self) -> u32 {
let offset = self.read_u16(BTREE_CELL_CONTENT_AREA);
if offset == 0 {
MAX_PAGE_SIZE
} else {
offset as u32
}
}
/// The size of the page header in bytes.
@@ -507,16 +510,15 @@ impl PageContent {
}
}
/// The total number of bytes in all fragments is stored in the fifth field of the b-tree page header.
/// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area.
/// The total number of bytes in all fragments
pub fn num_frag_free_bytes(&self) -> u8 {
self.read_u8(7)
self.read_u8(BTREE_FRAGMENTED_BYTES_COUNT)
}
pub fn rightmost_pointer(&self) -> Option<u32> {
match self.page_type() {
PageType::IndexInterior => Some(self.read_u32(8)),
PageType::TableInterior => Some(self.read_u32(8)),
PageType::IndexInterior => Some(self.read_u32(BTREE_RIGHTMOST_PTR)),
PageType::TableInterior => Some(self.read_u32(BTREE_RIGHTMOST_PTR)),
PageType::IndexLeaf => None,
PageType::TableLeaf => None,
}
@@ -524,9 +526,11 @@ impl PageContent {
pub fn rightmost_pointer_raw(&self) -> Option<*mut u8> {
match self.page_type() {
PageType::IndexInterior | PageType::TableInterior => {
Some(unsafe { self.as_ptr().as_mut_ptr().add(self.offset + 8) })
}
PageType::IndexInterior | PageType::TableInterior => Some(unsafe {
self.as_ptr()
.as_mut_ptr()
.add(self.offset + BTREE_RIGHTMOST_PTR)
}),
PageType::IndexLeaf => None,
PageType::TableLeaf => None,
}
@@ -543,16 +547,14 @@ impl PageContent {
let buf = self.as_ptr();
let ncells = self.cell_count();
// the page header is 12 bytes for interior pages, 8 bytes for leaf pages
// this is because the 4 last bytes in the interior page's header are used for the rightmost pointer.
let cell_pointer_array_start = self.header_size();
assert!(
idx < ncells,
"cell_get: idx out of bounds: idx={}, ncells={}",
idx,
ncells
);
let cell_pointer = cell_pointer_array_start + (idx * 2);
let cell_pointer_array_start = self.header_size();
let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES);
let cell_pointer = self.read_u16(cell_pointer) as usize;
// SAFETY: this buffer is valid as long as the page is alive. We could store the page in the cell and do some lifetime magic
@@ -573,9 +575,8 @@ impl PageContent {
pub fn cell_table_interior_read_rowid(&self, idx: usize) -> Result<i64> {
debug_assert!(self.page_type() == PageType::TableInterior);
let buf = self.as_ptr();
const INTERIOR_PAGE_HEADER_SIZE_BYTES: usize = 12;
let cell_pointer_array_start = INTERIOR_PAGE_HEADER_SIZE_BYTES;
let cell_pointer = cell_pointer_array_start + (idx * 2);
let cell_pointer_array_start = self.header_size();
let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES);
let cell_pointer = self.read_u16(cell_pointer) as usize;
const LEFT_CHILD_PAGE_SIZE_BYTES: usize = 4;
let (rowid, _) = read_varint(&buf[cell_pointer + LEFT_CHILD_PAGE_SIZE_BYTES..])?;
@@ -590,9 +591,8 @@ impl PageContent {
|| self.page_type() == PageType::IndexInterior
);
let buf = self.as_ptr();
const INTERIOR_PAGE_HEADER_SIZE_BYTES: usize = 12;
let cell_pointer_array_start = INTERIOR_PAGE_HEADER_SIZE_BYTES;
let cell_pointer = cell_pointer_array_start + (idx * 2);
let cell_pointer_array_start = self.header_size();
let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES);
let cell_pointer = self.read_u16(cell_pointer) as usize;
u32::from_be_bytes([
buf[cell_pointer],
@@ -607,9 +607,8 @@ impl PageContent {
pub fn cell_table_leaf_read_rowid(&self, idx: usize) -> Result<i64> {
debug_assert!(self.page_type() == PageType::TableLeaf);
let buf = self.as_ptr();
const LEAF_PAGE_HEADER_SIZE_BYTES: usize = 8;
let cell_pointer_array_start = LEAF_PAGE_HEADER_SIZE_BYTES;
let cell_pointer = cell_pointer_array_start + (idx * 2);
let cell_pointer_array_start = self.header_size();
let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES);
let cell_pointer = self.read_u16(cell_pointer) as usize;
let mut pos = cell_pointer;
let (_, nr) = read_varint(&buf[pos..])?;
@@ -629,7 +628,7 @@ impl PageContent {
(self.offset + header_size, self.cell_pointer_array_size())
}
/// Get region of a cell's payload
/// Get region(start end length) of a cell's payload
pub fn cell_get_raw_region(
&self,
idx: usize,
@@ -641,7 +640,7 @@ impl PageContent {
let ncells = self.cell_count();
let (cell_pointer_array_start, _) = self.cell_pointer_array_offset_and_size();
assert!(idx < ncells, "cell_get: idx out of bounds");
let cell_pointer = cell_pointer_array_start + (idx * 2); // pointers are 2 bytes each
let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES);
let cell_pointer = self.read_u16_no_offset(cell_pointer) as usize;
let start = cell_pointer;
let len = match self.page_type() {