diff --git a/core/storage/btree.rs b/core/storage/btree.rs index b7dffedaa..d7532da61 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -7,7 +7,8 @@ use crate::{ pager::{BtreePageAllocMode, Pager}, sqlite3_ondisk::{ read_u32, read_varint, BTreeCell, PageContent, PageType, TableInteriorCell, - TableLeafCell, + TableLeafCell, CELL_PTR_SIZE_BYTES, INTERIOR_PAGE_HEADER_SIZE_BYTES, + LEAF_PAGE_HEADER_SIZE_BYTES, LEFT_CHILD_PTR_SIZE_BYTES, }, }, translate::{collate::CollationSeq, plan::IterationDirection}, @@ -100,6 +101,12 @@ pub mod offset { /// assumed that the database is corrupt. pub const BTCURSOR_MAX_DEPTH: usize = 20; +/// Maximum number of sibling pages that balancing is performed on. +pub const MAX_SIBLING_PAGES_TO_BALANCE: usize = 3; + +/// We only need maximum 5 pages to balance 3 pages, because we can guarantee that cells from 3 pages will fit in 5 pages. +pub const MAX_NEW_SIBLING_PAGES_AFTER_BALANCE: usize = 5; + /// Evaluate a Result>, if IO return IO. macro_rules! return_if_io { ($expr:expr) => { @@ -309,11 +316,11 @@ impl BTreeKey<'_> { #[derive(Clone)] struct BalanceInfo { /// Old pages being balanced. We can have maximum 3 pages being balanced at the same time. - pages_to_balance: [Option; 3], + pages_to_balance: [Option; MAX_SIBLING_PAGES_TO_BALANCE], /// Bookkeeping of the rightmost pointer so the offset::BTREE_RIGHTMOST_PTR can be updated. rightmost_pointer: *mut u8, /// Divider cells of old pages. We can have maximum 2 divider cells because of 3 pages. - divider_cells: [Option>; 2], + divider_cell_payloads: [Option>; MAX_SIBLING_PAGES_TO_BALANCE - 1], /// Number of siblings being used to balance sibling_count: usize, /// First divider cell to remove that marks the first sibling @@ -2338,7 +2345,8 @@ impl BTreeCursor { "expected index or table interior page" ); // Part 1: Find the sibling pages to balance - let mut pages_to_balance: [Option; 3] = [const { None }; 3]; + let mut pages_to_balance: [Option; MAX_SIBLING_PAGES_TO_BALANCE] = + [const { None }; MAX_SIBLING_PAGES_TO_BALANCE]; let number_of_cells_in_parent = parent_contents.cell_count() + parent_contents.overflow_cells.len(); @@ -2457,7 +2465,7 @@ impl BTreeCursor { .replace(Some(BalanceInfo { pages_to_balance, rightmost_pointer: right_pointer, - divider_cells: [const { None }; 2], + divider_cell_payloads: [const { None }; MAX_SIBLING_PAGES_TO_BALANCE - 1], sibling_count, first_divider_cell: first_cell_divider, })); @@ -2492,8 +2500,9 @@ impl BTreeCursor { // The count includes: all cells and overflow cells from the sibling pages, and divider cells from the parent page, // excluding the rightmost divider, which will not be dropped from the parent; instead it will be updated at the end. let mut total_cells_to_redistribute = 0; - // We only need maximum 5 pages to balance 3 pages, because we can guarantee that cells from 3 pages will fit in 5 pages. - let mut pages_to_balance_new: [Option; 5] = [const { None }; 5]; + let mut pages_to_balance_new: [Option; + MAX_NEW_SIBLING_PAGES_AFTER_BALANCE] = + [const { None }; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE]; for i in (0..balance_info.sibling_count).rev() { let sibling_page = balance_info.pages_to_balance[i].as_ref().unwrap(); let sibling_page = sibling_page.get(); @@ -2504,7 +2513,8 @@ impl BTreeCursor { // Right pointer is not dropped, we simply update it at the end. This could be a divider cell that points // to the last page in the list of pages to balance or this could be the rightmost pointer that points to a page. - if i == balance_info.sibling_count - 1 { + let is_last_sibling = i == balance_info.sibling_count - 1; + if is_last_sibling { continue; } // Since we know we have a left sibling, take the divider that points to left sibling of this page @@ -2524,7 +2534,7 @@ impl BTreeCursor { ); // TODO(pere): make this reference and not copy - balance_info.divider_cells[i].replace(cell_buf.to_vec()); + balance_info.divider_cell_payloads[i].replace(cell_buf.to_vec()); tracing::trace!( "dropping divider cell from parent cell_idx={} count={}", cell_idx, @@ -2536,14 +2546,15 @@ impl BTreeCursor { /* 2. Initialize CellArray with all the cells used for distribution, this includes divider cells if !leaf. */ let mut cell_array = CellArray { cell_payloads: Vec::with_capacity(total_cells_to_redistribute), - cell_count_per_page_cumulative: [0; 5], + cell_count_per_page_cumulative: [0; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE], }; let cells_capacity_start = cell_array.cell_payloads.capacity(); let mut total_cells_inserted = 0; // This is otherwise identical to CellArray.cell_count_per_page_cumulative, // but we exclusively track what the prefix sums were _before_ we started redistributing cells. - let mut old_cell_count_per_page_cumulative: [u16; 5] = [0; 5]; + let mut old_cell_count_per_page_cumulative: [u16; + MAX_NEW_SIBLING_PAGES_AFTER_BALANCE] = [0; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE]; let page_type = balance_info.pages_to_balance[0] .as_ref() @@ -2585,10 +2596,11 @@ impl BTreeCursor { let mut cells_inserted = old_page_contents.cell_count() + old_page_contents.overflow_cells.len(); - if i < balance_info.sibling_count - 1 && !is_table_leaf { + let is_last_sibling = i == balance_info.sibling_count - 1; + if !is_last_sibling && !is_table_leaf { // If we are a index page or a interior table page we need to take the divider cell too. // But we don't need the last divider as it will remain the same. - let mut divider_cell = balance_info.divider_cells[i] + let mut divider_cell = balance_info.divider_cell_payloads[i] .as_mut() .unwrap() .as_mut_slice(); @@ -2598,12 +2610,16 @@ impl BTreeCursor { if !is_leaf { // This divider cell needs to be updated with new left pointer, let right_pointer = old_page_contents.rightmost_pointer().unwrap(); - divider_cell[..4].copy_from_slice(&right_pointer.to_be_bytes()); + divider_cell[..LEFT_CHILD_PTR_SIZE_BYTES] + .copy_from_slice(&right_pointer.to_be_bytes()); } else { // index leaf - turso_assert!(divider_cell.len() >= 4, "divider cell is too short"); + turso_assert!( + divider_cell.len() >= LEFT_CHILD_PTR_SIZE_BYTES, + "divider cell is too short" + ); // let's strip the page pointer - divider_cell = &mut divider_cell[4..]; + divider_cell = &mut divider_cell[LEFT_CHILD_PTR_SIZE_BYTES..]; } cell_array.cell_payloads.push(to_static_buf(divider_cell)); } @@ -2632,11 +2648,16 @@ impl BTreeCursor { validate_cells_after_insertion(&cell_array, is_table_leaf); /* 3. Initiliaze current size of every page including overflow cells and divider cells that might be included. */ - let mut new_page_sizes: [i64; 5] = [0; 5]; - let leaf_correction = if is_leaf { 4 } else { 0 }; + let mut new_page_sizes: [i64; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE] = + [0; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE]; + let header_size = if is_leaf { + LEAF_PAGE_HEADER_SIZE_BYTES + } else { + INTERIOR_PAGE_HEADER_SIZE_BYTES + }; // number of bytes beyond header, different from global usableSapce which includes // header - let usable_space = self.usable_space() - 12 + leaf_correction; + let usable_space = self.usable_space() - header_size; for i in 0..balance_info.sibling_count { cell_array.cell_count_per_page_cumulative[i] = old_cell_count_per_page_cumulative[i]; @@ -2650,7 +2671,8 @@ impl BTreeCursor { // 2 to account of pointer new_page_sizes[i] += 2 + overflow.payload.len() as i64; } - if !is_leaf && i < balance_info.sibling_count - 1 { + let is_last_sibling = i == balance_info.sibling_count - 1; + if !is_leaf && !is_last_sibling { // Account for divider cell which is included in this page. new_page_sizes[i] += cell_array.cell_payloads [cell_array.cell_count_up_to_page(i)] @@ -2695,8 +2717,9 @@ impl BTreeCursor { { // This means we move to the right page the divider cell and we // promote left cell to divider - 2 + cell_array.cell_payloads[cell_array.cell_count_up_to_page(i)] - .len() as i64 + CELL_PTR_SIZE_BYTES as i64 + + cell_array.cell_payloads[cell_array.cell_count_up_to_page(i)] + .len() as i64 } else { 0 } @@ -2711,8 +2734,8 @@ impl BTreeCursor { while cell_array.cell_count_per_page_cumulative[i] < cell_array.cell_payloads.len() as u16 { - let size_of_cell_to_remove_from_right = - 2 + cell_array.cell_payloads[cell_array.cell_count_up_to_page(i)].len() + let size_of_cell_to_remove_from_right = CELL_PTR_SIZE_BYTES as i64 + + cell_array.cell_payloads[cell_array.cell_count_up_to_page(i)].len() as i64; let can_take = new_page_sizes[i] + size_of_cell_to_remove_from_right > usable_space as i64; @@ -2726,8 +2749,9 @@ impl BTreeCursor { if cell_array.cell_count_per_page_cumulative[i] < cell_array.cell_payloads.len() as u16 { - 2 + cell_array.cell_payloads[cell_array.cell_count_up_to_page(i)] - .len() as i64 + CELL_PTR_SIZE_BYTES as i64 + + cell_array.cell_payloads[cell_array.cell_count_up_to_page(i)] + .len() as i64 } else { 0 } @@ -2837,15 +2861,24 @@ impl BTreeCursor { cell_array.cell_size_bytes(cell_right as usize) as i64; // TODO: add assert nMaxCells - let pointer_size = if i == sibling_count_new - 1 { 0 } else { 2 }; - let would_not_improve_balance = size_right_page + cell_right_size + 2 - > size_left_page - (cell_left_size + pointer_size); + let is_last_sibling = i == sibling_count_new - 1; + let pointer_size = if is_last_sibling { + 0 + } else { + CELL_PTR_SIZE_BYTES as i64 + }; + // As mentioned, this step rebalances the siblings so that cells are moved from left to right, since the previous step just + // packed as much as possible to the left. However, if the right-hand-side page would become larger than the left-hand-side page, + // we stop. + let would_not_improve_balance = + size_right_page + cell_right_size + (CELL_PTR_SIZE_BYTES as i64) + > size_left_page - (cell_left_size + pointer_size); if size_right_page != 0 && would_not_improve_balance { break; } - size_left_page -= cell_left_size + 2; - size_right_page += cell_right_size + 2; + size_left_page -= cell_left_size + (CELL_PTR_SIZE_BYTES as i64); + size_right_page += cell_right_size + (CELL_PTR_SIZE_BYTES as i64); cell_array.cell_count_per_page_cumulative[i - 1] = cell_left; if cell_left == 0 { @@ -2886,7 +2919,8 @@ impl BTreeCursor { // Reassign page numbers in increasing order { - let mut page_numbers: [usize; 5] = [0; 5]; + let mut page_numbers: [usize; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE] = + [0; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE]; for (i, page) in pages_to_balance_new .iter() .take(sibling_count_new) @@ -2955,7 +2989,8 @@ impl BTreeCursor { // that was originally on that place. let is_leaf_page = matches!(page_type, PageType::TableLeaf | PageType::IndexLeaf); if !is_leaf_page { - let last_page = balance_info.pages_to_balance[balance_info.sibling_count - 1] + let last_sibling_idx = balance_info.sibling_count - 1; + let last_page = balance_info.pages_to_balance[last_sibling_idx] .as_ref() .unwrap(); let right_pointer = last_page.get().get_contents().rightmost_pointer().unwrap(); @@ -3018,7 +3053,7 @@ impl BTreeCursor { new_divider_cell.extend_from_slice(divider_cell); } - let left_pointer = read_u32(&new_divider_cell[..4], 0); + let left_pointer = read_u32(&new_divider_cell[..LEFT_CHILD_PTR_SIZE_BYTES], 0); turso_assert!( left_pointer != parent_page.get().id as u32, "left pointer is the same as parent page id" @@ -3098,8 +3133,10 @@ impl BTreeCursor { ** upwards pass simply processes pages that were missed on the downward ** pass. */ - let mut done = [false; 5]; - for i in (1 - sibling_count_new as i64)..sibling_count_new as i64 { + let mut done = [false; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE]; + let rightmost_page_negative_idx = 1 - sibling_count_new as i64; + let rightmost_page_positive_idx = sibling_count_new as i64 - 1; + for i in rightmost_page_negative_idx..=rightmost_page_positive_idx { // As mentioned above, we do two passes over the pages: // 1. Downward pass: Process pages in decreasing order // 2. Upward pass: Process pages in increasing order @@ -3298,7 +3335,7 @@ impl BTreeCursor { parent_page: &BTreePage, balance_info: &mut BalanceInfo, parent_contents: &mut PageContent, - pages_to_balance_new: [Option; 5], + pages_to_balance_new: [Option; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE], page_type: PageType, leaf_data: bool, mut cells_debug: Vec>, @@ -3572,7 +3609,8 @@ impl BTreeCursor { if leaf_data { // If we are in a table leaf page, we just need to check that this cell that should be a divider cell is in the parent // This means we already check cell in leaf pages but not on parent so we don't advance current_index_cell - if page_idx >= balance_info.sibling_count - 1 { + let last_sibling_idx = balance_info.sibling_count - 1; + if page_idx >= last_sibling_idx { // This means we are in the last page and we don't need to check anything continue; } @@ -5485,7 +5523,7 @@ struct CellArray { /// Prefix sum of cells in each page. /// For example, if three pages have 1, 2, and 3 cells, respectively, /// then cell_count_per_page_cumulative will be [1, 3, 6]. - cell_count_per_page_cumulative: [u16; 5], + cell_count_per_page_cumulative: [u16; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE], } impl CellArray { @@ -6012,8 +6050,7 @@ fn insert_into_cell( page.cell_count() ); let free = compute_free_space(page, usable_space); - const CELL_POINTER_SIZE_BYTES: usize = 2; - let enough_space = payload.len() + CELL_POINTER_SIZE_BYTES <= free as usize; + let enough_space = payload.len() + CELL_PTR_SIZE_BYTES <= free as usize; if !enough_space { // add to overflow cell page.overflow_cells.push(OverflowCell { @@ -6038,15 +6075,15 @@ fn insert_into_cell( .copy_from_slice(payload); // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); let (cell_pointer_array_start, _) = page.cell_pointer_array_offset_and_size(); - let cell_pointer_cur_idx = cell_pointer_array_start + (CELL_POINTER_SIZE_BYTES * cell_idx); + let cell_pointer_cur_idx = cell_pointer_array_start + (CELL_PTR_SIZE_BYTES * cell_idx); - // move existing pointers forward by CELL_POINTER_SIZE_BYTES... + // move existing pointers forward by CELL_PTR_SIZE_BYTES... let n_cells_forward = page.cell_count() - cell_idx; - let n_bytes_forward = CELL_POINTER_SIZE_BYTES * n_cells_forward; + let n_bytes_forward = CELL_PTR_SIZE_BYTES * n_cells_forward; if n_bytes_forward > 0 { buf.copy_within( cell_pointer_cur_idx..cell_pointer_cur_idx + n_bytes_forward, - cell_pointer_cur_idx + CELL_POINTER_SIZE_BYTES, + cell_pointer_cur_idx + CELL_PTR_SIZE_BYTES, ); } // ...and insert new cell pointer at the current index @@ -8364,7 +8401,7 @@ mod tests { for _ in 0..ITERATIONS { let mut cell_array = CellArray { cell_payloads: Vec::new(), - cell_count_per_page_cumulative: [0; 5], + cell_count_per_page_cumulative: [0; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE], }; let mut cells_cloned = Vec::new(); let (pager, _, _, _) = empty_btree(); diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index 4c694c56c..20bc5ff9a 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -95,6 +95,11 @@ pub const DATABASE_HEADER_PAGE_ID: usize = 1; /// The minimum size of a cell in bytes. pub const MINIMUM_CELL_SIZE: usize = 4; +pub const CELL_PTR_SIZE_BYTES: usize = 2; +pub const INTERIOR_PAGE_HEADER_SIZE_BYTES: usize = 12; +pub const LEAF_PAGE_HEADER_SIZE_BYTES: usize = 8; +pub const LEFT_CHILD_PTR_SIZE_BYTES: usize = 4; + /// The database header. /// The first 100 bytes of the database file comprise the database file header. /// The database file header is divided into fields as shown by the table below. @@ -382,7 +387,6 @@ impl Clone for PageContent { } } -const CELL_POINTER_SIZE_BYTES: usize = 2; impl PageContent { pub fn new(offset: usize, buffer: Arc>) -> Self { Self { @@ -475,8 +479,7 @@ impl PageContent { /// The size of the cell pointer array in bytes. /// 2 bytes per cell pointer pub fn cell_pointer_array_size(&self) -> usize { - const CELL_POINTER_SIZE_BYTES: usize = 2; - self.cell_count() * CELL_POINTER_SIZE_BYTES + self.cell_count() * CELL_PTR_SIZE_BYTES } /// The start of the unallocated region. @@ -504,10 +507,8 @@ impl PageContent { /// 8 bytes for leaf pages, 12 bytes for interior pages (due to storing rightmost child pointer) pub fn header_size(&self) -> usize { match self.page_type() { - PageType::IndexInterior => 12, - PageType::TableInterior => 12, - PageType::IndexLeaf => 8, - PageType::TableLeaf => 8, + PageType::IndexInterior | PageType::TableInterior => INTERIOR_PAGE_HEADER_SIZE_BYTES, + PageType::IndexLeaf | PageType::TableLeaf => LEAF_PAGE_HEADER_SIZE_BYTES, } } @@ -549,7 +550,7 @@ impl PageContent { ncells ); let cell_pointer_array_start = self.header_size(); - let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES); + let cell_pointer = cell_pointer_array_start + (idx * CELL_PTR_SIZE_BYTES); let cell_pointer = self.read_u16(cell_pointer) as usize; // SAFETY: this buffer is valid as long as the page is alive. We could store the page in the cell and do some lifetime magic @@ -564,7 +565,7 @@ impl PageContent { debug_assert!(self.page_type() == PageType::TableInterior); let buf = self.as_ptr(); let cell_pointer_array_start = self.header_size(); - let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES); + let cell_pointer = cell_pointer_array_start + (idx * CELL_PTR_SIZE_BYTES); let cell_pointer = self.read_u16(cell_pointer) as usize; const LEFT_CHILD_PAGE_SIZE_BYTES: usize = 4; let (rowid, _) = read_varint(&buf[cell_pointer + LEFT_CHILD_PAGE_SIZE_BYTES..])?; @@ -580,7 +581,7 @@ impl PageContent { ); let buf = self.as_ptr(); let cell_pointer_array_start = self.header_size(); - let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES); + let cell_pointer = cell_pointer_array_start + (idx * CELL_PTR_SIZE_BYTES); let cell_pointer = self.read_u16(cell_pointer) as usize; u32::from_be_bytes([ buf[cell_pointer], @@ -596,7 +597,7 @@ impl PageContent { debug_assert!(self.page_type() == PageType::TableLeaf); let buf = self.as_ptr(); let cell_pointer_array_start = self.header_size(); - let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES); + let cell_pointer = cell_pointer_array_start + (idx * CELL_PTR_SIZE_BYTES); let cell_pointer = self.read_u16(cell_pointer) as usize; let mut pos = cell_pointer; let (_, nr) = read_varint(&buf[pos..])?; @@ -622,7 +623,7 @@ impl PageContent { let ncells = self.cell_count(); let (cell_pointer_array_start, _) = self.cell_pointer_array_offset_and_size(); assert!(idx < ncells, "cell_get: idx out of bounds"); - let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES); + let cell_pointer = cell_pointer_array_start + (idx * CELL_PTR_SIZE_BYTES); let cell_pointer = self.read_u16_no_offset(cell_pointer) as usize; let start = cell_pointer; let payload_overflow_threshold_max =