From 1266994a0a5f38173cb2359baf11f8f30dc43b7c Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Sun, 9 Feb 2025 10:55:07 +0100
Subject: [PATCH 01/33] balance sibling pages

---
 core/storage/btree.rs          | 524 +++++++++++++++++++++++++++------
 core/storage/pager.rs          |   4 +
 core/storage/sqlite3_ondisk.rs |  10 +
 3 files changed, 451 insertions(+), 87 deletions(-)

diff --git a/core/storage/btree.rs b/core/storage/btree.rs
index 40d9c59d8..2cdbe37fd 100644
--- a/core/storage/btree.rs
+++ b/core/storage/btree.rs
@@ -9,7 +9,7 @@ use crate::storage::sqlite3_ondisk::{
 use crate::types::{CursorResult, OwnedValue, Record, SeekKey, SeekOp};
 use crate::{LimboError, Result};
 
-use std::cell::{Ref, RefCell};
+use std::cell::{self, Ref, RefCell};
 use std::pin::Pin;
 use std::rc::Rc;
 
@@ -81,6 +81,7 @@ enum WriteState {
     Start,
     BalanceStart,
     BalanceNonRoot,
+    BalanceNonRootWaitLoadPages,
     BalanceGetParentPage,
     BalanceMoveUp,
     Finish,
@@ -89,27 +90,35 @@ enum WriteState {
 struct WriteInfo {
     /// State of the write operation state machine.
     state: WriteState,
-    /// Pages involved in the split of the page due to balancing (splits_pages[0] is the balancing page, while other - fresh allocated pages)
-    split_pages: RefCell<Vec<PageRef>>,
-    /// Amount of cells from balancing page for every split page
-    split_pages_cells_count: RefCell<Vec<usize>>,
+    /// Old pages being balanced.
+    pages_to_balance: RefCell<Vec<PageRef>>,
+    /// Pages allocated during the write operation due to balancing.
+    new_pages: RefCell<Vec<PageRef>>,
     /// Scratch space used during balancing.
-    scratch_cells: RefCell<Vec<&'static [u8]>>,
+    scratch_cells: RefCell<Vec<Vec<u8>>>,
     /// Bookkeeping of the rightmost pointer so the PAGE_HEADER_OFFSET_RIGHTMOST_PTR can be updated.
     rightmost_pointer: RefCell<Option<u32>>,
     /// Copy of the current page needed for buffer references.
     page_copy: RefCell<Option<PageContent>>,
+    /// Divider cells of the old pages.
+    divider_cells: RefCell<Vec<Vec<u8>>>,
+    /// Number of siblings being used to balance.
+    sibling_count: RefCell<usize>,
+    /// First divider cell to remove that marks the first sibling.
+    first_divider_cell: RefCell<usize>,
 }
 
 impl WriteInfo {
     fn new() -> WriteInfo {
         WriteInfo {
             state: WriteState::Start,
-            split_pages: RefCell::new(Vec::with_capacity(4)),
-            split_pages_cells_count: RefCell::new(Vec::with_capacity(4)),
             scratch_cells: RefCell::new(Vec::new()),
             rightmost_pointer: RefCell::new(None),
             page_copy: RefCell::new(None),
+            pages_to_balance: RefCell::new(Vec::new()),
+            divider_cells: RefCell::new(Vec::new()),
+            sibling_count: RefCell::new(0),
+            first_divider_cell: RefCell::new(0),
         }
     }
 }
@@ -173,6 +182,12 @@ struct PageStack {
     cell_indices: RefCell<[i32; BTCURSOR_MAX_DEPTH + 1]>,
 }
 
+struct CellArray {
+    cells: Vec<Vec<u8>>, // TODO(pere): make this with references
+
+    number_of_cells_per_page: Vec<u16>, // number of cells in each page
+}
+
 impl BTreeCursor {
     pub fn new(pager: Rc<Pager>, root_page: usize) -> Self {
         Self {
@@ -768,6 +783,7 @@ impl BTreeCursor {
                 }
                 WriteState::BalanceStart
                 | WriteState::BalanceNonRoot
+                | WriteState::BalanceNonRootWaitLoadPages
                 | WriteState::BalanceMoveUp
                 | WriteState::BalanceGetParentPage => {
                     return_if_io!(self.balance());
@@ -1078,6 +1094,8 @@ impl BTreeCursor {
                 let write_info = self.state.mut_write_info().unwrap();
                 write_info.state = WriteState::BalanceNonRoot;
+                self.stack.pop();
+
                 return_if_io!(self.balance_non_root());
             }
             WriteState::BalanceNonRoot
             | WriteState::BalanceGetParentPage
@@ -1090,6 +1108,7 @@
         }
     }
 
+    /// Balance a non-root page by redistributing cells between at most 3 sibling pages that neighbor the page that overflowed/underflowed.
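+    /// The flow mirrors SQLite's balance_nonroot(): collect the sibling pages' cells together
+    /// with the divider cells that separate them in the parent, repack them from left to right
+    /// into as few pages as possible, and finally write updated divider cells back into the parent.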
fn balance_non_root(&mut self) -> Result> { assert!( matches!(self.state, CursorState::Write(_)), @@ -1105,97 +1124,392 @@ impl BTreeCursor { // Right pointer means cell that points to the last page, as we don't really want to drop this one. This one // can be a "rightmost pointer" or a "cell". // we always asumme there is a parent - let current_page = self.stack.top(); - debug!("balance_non_root(page={})", current_page.get().id); - - let current_page_inner = current_page.get(); - let current_page_contents = &mut current_page_inner.contents; - let current_page_contents = current_page_contents.as_mut().unwrap(); - // Copy of page used to reference cell bytes. - // This needs to be saved somewhere safe so that references still point to here, - // this will be store in write_info below - let page_copy = current_page_contents.clone(); - current_page_contents.overflow_cells.clear(); - - // In memory in order copy of all cells in pages we want to balance. For now let's do a 2 page split. - // Right pointer in interior cells should be converted to regular cells if more than 2 pages are used for balancing. - let write_info = self.state.write_info().unwrap(); - let mut scratch_cells = write_info.scratch_cells.borrow_mut(); - scratch_cells.clear(); - - let usable_space = self.usable_space(); - for cell_idx in 0..page_copy.cell_count() { - let (start, len) = page_copy.cell_get_raw_region( - cell_idx, - self.payload_overflow_threshold_max(page_copy.page_type()), - self.payload_overflow_threshold_min(page_copy.page_type()), - usable_space, - ); - let cell_buffer = to_static_buf(&page_copy.as_ptr()[start..start + len]); - scratch_cells.push(cell_buffer); + let parent_page = self.stack.top(); + if parent_page.is_locked() { + return Ok(CursorResult::IO); } - // overflow_cells are stored in order - so we need to insert them in reverse order - for cell in page_copy.overflow_cells.iter().rev() { - scratch_cells.insert(cell.index, to_static_buf(&cell.payload)); + return_if_locked!(parent_page); + if !parent_page.is_loaded() { + self.pager.load_page(parent_page.clone())?; + return Ok(CursorResult::IO); } - - // amount of cells for pages involved in split - // the algorithm accumulate cells in greedy manner with 2 conditions for split: - // 1. new cell will overflow single cell (accumulated + new > usable_space - header_size) - // 2. 
accumulated size already reach >50% of content_usable_size
-                // second condition is necessary, otherwise in case of small cells we will create a lot of almost empty pages
-                //
-                // if we have single overflow cell in a table leaf node - we still can have 3 split pages
-                //
-                // for example, if current page has 4 entries with size ~1/4 page size, and new cell has size ~page size
-                // then we will need 3 pages to distribute cells between them
-                let split_pages_cells_count = &mut write_info.split_pages_cells_count.borrow_mut();
-                split_pages_cells_count.clear();
-                let mut last_page_cells_count = 0;
-                let mut last_page_cells_size = 0;
-                let content_usable_space = usable_space - page_copy.header_size();
-                for scratch_cell in scratch_cells.iter() {
-                    let cell_size = scratch_cell.len() + 2; // + cell pointer size (u16)
-                    if last_page_cells_size + cell_size > content_usable_space
-                        || 2 * last_page_cells_size > content_usable_space
-                    {
-                        split_pages_cells_count.push(last_page_cells_count);
-                        last_page_cells_count = 0;
-                        last_page_cells_size = 0;
-                    }
-                    last_page_cells_count += 1;
-                    last_page_cells_size += cell_size;
-                    assert!(last_page_cells_size <= content_usable_space);
-                }
-                split_pages_cells_count.push(last_page_cells_count);
-                let new_pages_count = split_pages_cells_count.len();
+                let parent_contents = parent_page.get().contents.as_ref().unwrap();
+                let page_to_balance_idx = self.stack.current_cell_index() as usize;
                 debug!(
-                    "splitting left={} new_pages={}, cells_count={:?}",
-                    current_page.get().id,
-                    new_pages_count - 1,
-                    split_pages_cells_count
+                    "balance_non_root(parent_id={} page_to_balance_idx={})",
+                    parent_page.get().id,
+                    page_to_balance_idx
                 );
+                // Part 1: Find the sibling pages to balance
+                self.write_info.new_pages.borrow_mut().clear();
+                self.write_info.pages_to_balance.borrow_mut().clear();
+                self.write_info.divider_cells.borrow_mut().clear();
+                let number_of_cells_in_parent =
+                    parent_contents.cell_count() + parent_contents.overflow_cells.len();
 
-                *write_info.rightmost_pointer.borrow_mut() = page_copy.rightmost_pointer();
-                write_info.page_copy.replace(Some(page_copy));
+                // As there will be at maximum 3 pages used to balance:
+                // sibling_pointer is the index representing one of those 3 pages, and we initialize it to the last possible page.
+                // next_divider is the first divider that contains the first page of the 3 pages.
+                let (sibling_pointer, first_cell_divider) = if number_of_cells_in_parent < 2 {
+                    (number_of_cells_in_parent, 0)
+                } else if number_of_cells_in_parent == 2 {
+                    // Here we will have at least 2 cells and one right pointer, therefore we can get 3 siblings.
+                    // In case of 2 we will have all pages to balance.
+                    (2, 0)
+                } else {
+                    // In case of 3 or more cells we have to check which ones to take
+                    let next_divider = if page_to_balance_idx == 0 {
+                        // first cell, take first 3
+                        0
+                    } else if page_to_balance_idx == number_of_cells_in_parent {
+                        // Page corresponds to right pointer, so take last 3
+                        number_of_cells_in_parent - 2
+                    } else {
+                        // Some cell in the middle, so we want to take sibling on left and right.
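+                        // (i.e. take the left sibling, the overflowing page itself, and the right sibling)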
+                        page_to_balance_idx - 1
+                    };
+                    (2, next_divider)
+                };
+                self.write_info.sibling_count.replace(sibling_pointer + 1);
+                self.write_info
+                    .first_divider_cell
+                    .replace(first_cell_divider);
 
-                let page = current_page.get().contents.as_mut().unwrap();
-                let page_type = page.page_type();
+                let last_sibling_is_right_pointer = sibling_pointer + first_cell_divider
+                    - parent_contents.overflow_cells.len()
+                    == parent_contents.cell_count();
+                // Get the right page pointer that we will need to update later
+                let right_pointer = if last_sibling_is_right_pointer {
+                    parent_contents.rightmost_pointer_raw().unwrap()
+                } else {
+                    let pointer_area = parent_contents.cell_pointer_array_offset_and_size();
+                    let buf = parent_contents.as_ptr().as_mut_ptr();
+                    let last_divider_offset = (first_cell_divider + sibling_pointer) * 2;
+                    unsafe { buf.add(pointer_area.0 + last_divider_offset) }
+                };
+
+                // load sibling pages
+                // start loading right page first
+                let mut pgno: u32 = unsafe { right_pointer.cast::<u32>().read() };
+                let mut current_sibling = sibling_pointer;
+                for i in (0..=current_sibling).rev() {
+                    let page = self.pager.read_page(pgno as usize)?;
+                    page.set_dirty();
+                    self.pager.add_dirty(page.get().id);
+                    self.write_info.pages_to_balance.borrow_mut().push(page);
+                    assert_eq!(
+                        parent_contents.overflow_cells.len(),
+                        0,
+                        "overflow in parent is not yet implemented while balancing it"
+                    );
+                    let next_cell_divider = i + first_cell_divider;
+                    pgno = match parent_contents.cell_get(
+                        next_cell_divider,
+                        self.pager.clone(),
+                        self.payload_overflow_threshold_max(parent_contents.page_type()),
+                        self.payload_overflow_threshold_min(parent_contents.page_type()),
+                        self.usable_space(),
+                    )? {
+                        BTreeCell::TableInteriorCell(table_interior_cell) => {
+                            table_interior_cell._left_child_page
+                        }
+                        BTreeCell::IndexInteriorCell(index_interior_cell) => {
+                            index_interior_cell.left_child_page
+                        }
+                        BTreeCell::TableLeafCell(..) | BTreeCell::IndexLeafCell(..) 
=> { + unreachable!() + } + }; + } + // Reverse in order to keep the right order + self.state + .write_info() + .unwrap() + .pages_to_balance + .borrow_mut() + .reverse(); + self.state.write_info().unwrap().state = WriteState::BalanceNonRootWaitLoadPages; + return Ok(CursorResult::IO); + } + WriteState::BalanceNonRootWaitLoadPages => { + let write_info = self.state.write_info().unwrap(); + let all_loaded = write_info + .pages_to_balance + .borrow() + .iter() + .all(|page| page.is_locked()); + if !all_loaded { + return Ok(CursorResult::IO); + } + let parent_page = self.stack.top(); + let parent_contents = parent_page.get_contents(); assert!( - matches!(page_type, PageType::TableLeaf | PageType::TableInterior), - "indexes still not supported" + parent_contents.overflow_cells.len() == 0, + "overflow parent not yet implemented" ); + let sibling_count = *write_info.sibling_count.borrow(); + let first_divider_cell = *write_info.first_divider_cell.borrow(); - write_info.split_pages.borrow_mut().clear(); - write_info.split_pages.borrow_mut().push(current_page); - // allocate new pages - for _ in 1..new_pages_count { - let new_page = self.allocate_page(page_type, 0); - write_info.split_pages.borrow_mut().push(new_page); + // Get divider cells and max_cells + let mut max_cells = 0; + let pages_to_balance = write_info.pages_to_balance.borrow(); + for i in (0..sibling_count).rev() { + let sibling_page = &pages_to_balance[i]; + let sibling_contents = sibling_page.get_contents(); + max_cells += sibling_contents.cell_count(); + if i == 0 { + // we don't have left sibling from this one so we break + break; + } + // Since we know we have a left sibling, take the divider that points to left sibling of this page + let cell_idx = first_divider_cell - i - 1; + let (cell_start, cell_len) = parent_contents.cell_get_raw_region( + cell_idx, + self.payload_overflow_threshold_max(parent_contents.page_type()), + self.payload_overflow_threshold_min(parent_contents.page_type()), + self.usable_space(), + ); + let buf = parent_contents.as_ptr(); + let cell_buf = &buf[cell_start..cell_start + cell_len]; + + // TODO(pere): make this reference and not copy + write_info + .divider_cells + .borrow_mut() + .push(cell_buf.to_vec()); + self.drop_cell(parent_contents, cell_idx); + } + assert_eq!( + write_info.divider_cells.borrow().len(), + sibling_count - 1, + "the number of pages balancing must be divided by one less divider" + ); + // Reverse divider cells to be in order + write_info.divider_cells.borrow_mut().reverse(); + + write_info + .scratch_cells + .replace(Vec::with_capacity(max_cells)); + + let scratch_cells = write_info.scratch_cells.borrow_mut(); + let mut cell_array = CellArray { + cells: Vec::new(), + number_of_cells_per_page: Vec::new(), + }; + + let mut total_cells_inserted = 0; + // count_cells_in_old_pages is the prefix sum of cells of each page + let mut count_cells_in_old_pages = Vec::new(); + let leaf_data = matches!( + pages_to_balance[0].get_contents().page_type(), + PageType::TableLeaf + ); + for (i, old_page) in pages_to_balance.iter().enumerate() { + let old_page_contents = old_page.get_contents(); + let old_page_type = old_page_contents.page_type(); + for cell_idx in 0..old_page_contents.cell_count() { + let (cell_start, cell_len) = old_page_contents.cell_get_raw_region( + cell_idx, + self.payload_overflow_threshold_max(old_page_contents.page_type()), + self.payload_overflow_threshold_min(old_page_contents.page_type()), + self.usable_space(), + ); + let buf = old_page_contents.as_ptr(); + let cell_buf 
= &buf[cell_start..cell_start + cell_len]; + // TODO(pere): make this reference and not copy + cell_array.cells.push(cell_buf.to_vec()); + } + // Insert overflow cells into correct place + let mut offset = total_cells_inserted; + assert_eq!( + old_page_contents.overflow_cells.len(), + 1, + "todo: check this works for more than one overflow cell" + ); + for overflow_cell in &old_page_contents.overflow_cells { + cell_array + .cells + .insert(offset + overflow_cell.index, overflow_cell.payload.to_vec()); + } + + count_cells_in_old_pages.push(cell_array.cells.len() as u16); + + let mut cells_inserted = + old_page_contents.cell_count() + old_page_contents.overflow_cells.len(); + + if i < pages_to_balance.len() - 1 && !leaf_data { + // If we are a index page or a interior table page we need to take the divider cell too. + // But we don't need the last divider as it will remain the same. + let divider_cell = write_info.divider_cells.borrow()[i].clone(); + // TODO(pere): in case of old pages are leaf pages, so index leaf page, we need to strip page pointers + // from divider cells in index interior pages (parent) because those should not be included. + cells_inserted += 1; + cell_array.cells.push(divider_cell); + } + total_cells_inserted += cells_inserted; } - (WriteState::BalanceGetParentPage, Ok(CursorResult::Ok(()))) + // calculate how many pages to allocate + let mut new_page_sizes = Vec::new(); + let mut k = 0; + // todo: add leaf correction + let usable_space = self.usable_space() - 12; + for i in 0..sibling_count { + cell_array + .number_of_cells_per_page + .push(count_cells_in_old_pages[i]); + let page = pages_to_balance[i]; + let page_contents = page.get_contents(); + let free_space = + self.compute_free_space(&page_contents, self.database_header.borrow()); + + // If we have an empty page of cells, we ignore it + if k > 0 + && cell_array.number_of_cells_per_page[k - 1] + == cell_array.number_of_cells_per_page[k] + { + k -= 1; + } + if !leaf_data { + k += 1; + } + new_page_sizes.push(usable_space as u16 - free_space); + for overflow in &page_contents.overflow_cells { + let size = new_page_sizes.last_mut().unwrap(); + // 2 to account of pointer + *size += 2 + overflow.payload.len() as u16; + } + k += 1; + } + // Try to pack as many cells to the left + let mut sibling_count_new = sibling_count; + let mut i = 0; + while i < sibling_count_new { + // First try to move cells to the right if they do not fit + while new_page_sizes[i] > usable_space as u16 { + let needs_new_page = i + 2 >= sibling_count_new; + if needs_new_page { + sibling_count_new += 1; + assert!( + sibling_count_new <= 5, + "it is corrupt to require more than 5 pages to balance 3 siblings" + ); + } + let size_of_cell_to_remove_from_left = + 2 + cell_array.cells[cell_array.cell_count(i) - 1].len() as u16; + new_page_sizes[i] -= size_of_cell_to_remove_from_left; + let size_of_cell_to_move_right = if !leaf_data { + if cell_array.number_of_cells_per_page[i] + < cell_array.cells.len() as u16 + { + // This means we move to the right page the divider cell and we + // promote left cell to divider + 2 + cell_array.cells[cell_array.cell_count(i)].len() as u16 + } else { + 0 + } + } else { + size_of_cell_to_remove_from_left + }; + new_page_sizes[i + 1] -= size_of_cell_to_move_right; + cell_array.number_of_cells_per_page[i] -= 1; + } + + // Now try to take from the right if we didn't have enough + while cell_array.number_of_cells_per_page[i] < cell_array.cells.len() as u16 { + let size_of_cell_to_remove_from_right = + 2 + 
cell_array.cells[cell_array.cell_count(i)].len() as u16; + let can_take = new_page_sizes[i] + size_of_cell_to_remove_from_right + > usable_space as u16; + if can_take { + break; + } + new_page_sizes[i] += size_of_cell_to_remove_from_right; + cell_array.number_of_cells_per_page[i] += 1; + + let size_of_cell_to_remove_from_right = if !leaf_data { + if cell_array.number_of_cells_per_page[i] + < cell_array.cells.len() as u16 + { + 2 + cell_array.cells[cell_array.cell_count(i)].len() as u16 + } else { + 0 + } + } else { + size_of_cell_to_remove_from_right + }; + + new_page_sizes[i + 1] += size_of_cell_to_remove_from_right; + } + + let we_still_need_another_page = + cell_array.number_of_cells_per_page[i] >= cell_array.cells.len() as u16; + if we_still_need_another_page { + sibling_count_new += 1; + } + i += 1; + } + + // Comment borrowed from SQLite src/btree.c + // The packing computed by the previous block is biased toward the siblings + // on the left side (siblings with smaller keys). The left siblings are + // always nearly full, while the right-most sibling might be nearly empty. + // The next block of code attempts to adjust the packing of siblings to + // get a better balance. + // + // This adjustment is more than an optimization. The packing above might + // be so out of balance as to be illegal. For example, the right-most + // sibling might be completely empty. This adjustment is not optional. + for i in (1..sibling_count_new).rev() { + let mut size_right_page = new_page_sizes[i]; + let mut size_left_page = new_page_sizes[i - 1]; + let mut cell_left = cell_array.number_of_cells_per_page[i - 1] - 1; + // if leaf_data means we don't have divider, so the one we move from left is + // the same we add to right (we don't add divider to right). + let mut cell_right = cell_left + 1 - leaf_data as u16; + loop { + let cell_left_size = cell_array.cell_size(cell_left as usize); + let cell_right_size = cell_array.cell_size(cell_right as usize); + // TODO: add assert nMaxCells + + let pointer_size = if i == sibling_count_new - 1 { 0 } else { 2 }; + let would_not_improve_balance = size_right_page + cell_right_size + 2 + > size_left_page - (cell_left_size + pointer_size); + if size_right_page != 0 && would_not_improve_balance { + break; + } + + size_left_page -= cell_left_size + 2; + size_right_page += cell_right_size + 2; + cell_array.number_of_cells_per_page[i - 1] = cell_left; + + if cell_left == 0 { + break; + } + cell_left -= 1; + cell_right -= 1; + } + + new_page_sizes[i] = size_right_page; + new_page_sizes[i - 1] = size_left_page; + assert!( + cell_array.number_of_cells_per_page[i - 1] + > if i > 1 { + cell_array.number_of_cells_per_page[i - 2] + } else { + 0 + } + ); + } + + // TODO: allocate pages + // TODO: reassign page numbers + // TODO: insert divider cells in parent + // TODO: update pages + // TODO: balance root + + return Ok(CursorResult::IO); } WriteState::BalanceGetParentPage => { let parent = self.stack.parent(); @@ -2333,6 +2647,42 @@ impl PageStack { } } +impl CellArray { + pub fn cell_size(&self, cell_idx: usize) -> u16 { + self.cells[cell_idx].len() as u16 + } + + pub fn cell_count(&self, page_idx: usize) -> usize { + self.number_of_cells_per_page[page_idx] as usize + } +} + +fn find_free_cell(page_ref: &PageContent, db_header: Ref, amount: usize) -> usize { + // NOTE: freelist is in ascending order of keys and pc + // unuse_space is reserved bytes at the end of page, therefore we must substract from maxpc + let mut pc = page_ref.first_freeblock() as usize; + + let buf = 
page_ref.as_ptr(); + + let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize; + let maxpc = usable_space - amount; + let mut found = false; + while pc <= maxpc { + let next = u16::from_be_bytes(buf[pc..pc + 2].try_into().unwrap()); + let size = u16::from_be_bytes(buf[pc + 2..pc + 4].try_into().unwrap()); + if amount <= size as usize { + found = true; + break; + } + pc = next as usize; + } + if !found { + 0 + } else { + pc + } +} + pub fn btree_init_page( page: &PageRef, page_type: PageType, diff --git a/core/storage/pager.rs b/core/storage/pager.rs index a67e4dd3f..20e7f9e6b 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -56,6 +56,10 @@ impl Page { unsafe { &mut *self.inner.get() } } + pub fn get_contents(&self) -> &mut PageContent { + self.get().contents.as_mut().unwrap() + } + pub fn is_uptodate(&self) -> bool { self.get().flags.load(Ordering::SeqCst) & PAGE_UPTODATE != 0 } diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index f27140fe8..fa9413807 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -534,6 +534,16 @@ impl PageContent { } } + pub fn rightmost_pointer_raw(&self) -> Option<*mut u8> { + match self.page_type() { + PageType::IndexInterior | PageType::TableInterior => { + Some(unsafe { self.as_ptr().as_mut_ptr().add(self.offset + 8) }) + } + PageType::IndexLeaf => None, + PageType::TableLeaf => None, + } + } + pub fn cell_get( &self, idx: usize, From 0035b9d1bd921e140d794a8a5514ac3b66d657b7 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 9 Feb 2025 17:56:51 +0100 Subject: [PATCH 02/33] up to edit --- core/storage/btree.rs | 659 ++++++++++++++++++--------------- core/storage/sqlite3_ondisk.rs | 11 +- 2 files changed, 361 insertions(+), 309 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 2cdbe37fd..f8ebbef24 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2,14 +2,14 @@ use tracing::debug; use crate::storage::pager::Pager; use crate::storage::sqlite3_ondisk::{ - read_btree_cell, read_varint, BTreeCell, DatabaseHeader, PageContent, PageType, + read_btree_cell, read_u32, read_varint, BTreeCell, DatabaseHeader, PageContent, PageType, TableInteriorCell, TableLeafCell, }; use crate::types::{CursorResult, OwnedValue, Record, SeekKey, SeekOp}; use crate::{LimboError, Result}; -use std::cell::{self, Ref, RefCell}; +use std::cell::{Ref, RefCell}; use std::pin::Pin; use std::rc::Rc; @@ -119,6 +119,7 @@ impl WriteInfo { divider_cells: RefCell::new(Vec::new()), sibling_count: RefCell::new(0), first_divider_cell: RefCell::new(0), + new_pages: RefCell::new(Vec::new()), } } } @@ -183,7 +184,7 @@ struct PageStack { } struct CellArray { - cells: Vec>, // TODO(pere): make this with references + cells: Vec<&'static mut [u8]>, // TODO(pere): make this with references number_of_cells_per_page: Vec, // number of cells in each page } @@ -847,122 +848,6 @@ impl BTreeCursor { page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, new_n_cells); } - /// Free the range of bytes that a cell occupies. - /// This function also updates the freeblock list in the page. - /// Freeblocks are used to keep track of free space in the page, - /// and are organized as a linked list. 
- fn free_cell_range(&self, page: &mut PageContent, offset: u16, len: u16) -> Result<()> { - let mut cell_block_end = offset as u32 + len as u32; - let mut fragments_reduced = 0; - let mut cell_length = len; - let mut cell_block_start = offset; - let mut next_free_block_ptr = PAGE_HEADER_OFFSET_FIRST_FREEBLOCK as u16; - - let usable_size = { - let db_header = self.pager.db_header.borrow(); - (db_header.page_size - db_header.reserved_space as u16) as u32 - }; - - assert!( - cell_block_end <= usable_size, - "cell block end is out of bounds" - ); - assert!(cell_block_start >= 4, "minimum cell size is 4"); - assert!( - cell_block_start <= self.usable_space() as u16 - 4, - "offset is out of page bounds" - ); - - // Check for empty freelist fast path - let mut next_free_block = if page.read_u8(next_free_block_ptr as usize + 1) == 0 - && page.read_u8(next_free_block_ptr as usize) == 0 - { - 0 // Fast path for empty freelist - } else { - // Find position in free list - let mut block = page.read_u16(next_free_block_ptr as usize); - while block != 0 && block < cell_block_start { - if block <= next_free_block_ptr { - if block == 0 { - break; // Handle corruption test case - } - return Err(LimboError::Corrupt("Free block list not ascending".into())); - } - next_free_block_ptr = block; - block = page.read_u16(block as usize); - } - block - }; - - if next_free_block as u32 > usable_size - 4 { - return Err(LimboError::Corrupt("Free block beyond usable space".into())); - } - - // Coalesce with next block if adjacent - if next_free_block != 0 && cell_block_end + 3 >= next_free_block as u32 { - fragments_reduced = (next_free_block as u32 - cell_block_end) as u8; - if cell_block_end > next_free_block as u32 { - return Err(LimboError::Corrupt("Invalid block overlap".into())); - } - - let next_block_size = page.read_u16(next_free_block as usize + 2) as u32; - cell_block_end = next_free_block as u32 + next_block_size; - if cell_block_end > usable_size { - return Err(LimboError::Corrupt( - "Coalesced block extends beyond page".into(), - )); - } - - cell_length = cell_block_end as u16 - cell_block_start; - next_free_block = page.read_u16(next_free_block as usize); - } - - // Coalesce with previous block if adjacent - if next_free_block_ptr > PAGE_HEADER_OFFSET_FIRST_FREEBLOCK as u16 { - let prev_block_end = - next_free_block_ptr as u32 + page.read_u16(next_free_block_ptr as usize + 2) as u32; - - if prev_block_end + 3 >= cell_block_start as u32 { - if prev_block_end > cell_block_start as u32 { - return Err(LimboError::Corrupt("Invalid previous block overlap".into())); - } - fragments_reduced += (cell_block_start as u32 - prev_block_end) as u8; - cell_length = (cell_block_end - next_free_block_ptr as u32) as u16; - cell_block_start = next_free_block_ptr; - } - } - - // Update frag count - let current_frags = page.read_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT); - if fragments_reduced > current_frags { - return Err(LimboError::Corrupt("Invalid fragmentation count".into())); - } - page.write_u8( - PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, - current_frags - fragments_reduced, - ); - - let content_area_start = page.cell_content_area(); - if cell_block_start <= content_area_start { - if cell_block_start < content_area_start { - return Err(LimboError::Corrupt("Free block before content area".into())); - } - if next_free_block_ptr != PAGE_HEADER_OFFSET_FIRST_FREEBLOCK as u16 { - return Err(LimboError::Corrupt("Invalid content area merge".into())); - } - // Extend content area - 
page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, next_free_block); - page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cell_block_end as u16); - } else { - // Insert in free list - page.write_u16(next_free_block_ptr as usize, cell_block_start); - page.write_u16(cell_block_start as usize, next_free_block); - page.write_u16(cell_block_start as usize + 2, cell_length); - } - - Ok(()) - } - /// Drop a cell from a page. /// This is done by freeing the range of bytes that the cell occupies. fn drop_cell(&self, page: &mut PageContent, cell_idx: usize) { @@ -973,90 +858,13 @@ impl BTreeCursor { self.payload_overflow_threshold_min(page.page_type()), self.usable_space(), ); - - self.free_cell_range(page, cell_start as u16, cell_len as u16) - .expect("Failed to free cell range"); - - let new_cell_count = cell_count - 1; - page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, new_cell_count as u16); - - if new_cell_count == 0 { - page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); - page.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); - page.write_u16( - PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, - self.usable_space() as u16, - ); - } else { - let (pointer_array_start, _) = page.cell_pointer_array_offset_and_size(); - let buf = page.as_ptr(); - buf.copy_within( - pointer_array_start + (2 * (cell_idx + 1)) // src - ..pointer_array_start + (2 * cell_count), - pointer_array_start + (2 * cell_idx), // dst - ); - } - } - - fn find_free_cell( - &self, - page_ref: &PageContent, - amount: usize, - db_header: Ref, - ) -> Result { - // NOTE: freelist is in ascending order of keys and pc - // unused_space is reserved bytes at the end of page, therefore we must subtract from maxpc - let mut free_list_pointer_addr = 1; - let mut pc = page_ref.first_freeblock() as usize; - - let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize; - let maxpc = usable_space - amount; - - if pc == 0 { - return Ok(0); - } - - while pc <= maxpc { - let size = page_ref.read_u16(pc + 2) as usize; - - if let Some(x) = size.checked_sub(amount) { - if x < 4 { - if page_ref.read_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT) > 57 { - return Ok(0); - } - - let next_ptr = page_ref.read_u16(pc); - page_ref.write_u16(free_list_pointer_addr, next_ptr); - - let frag_count = page_ref.read_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT); - page_ref.write_u8( - PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, - frag_count + x as u8, - ); - return Ok(pc); - } else if x + pc > maxpc { - return Err(LimboError::Corrupt("Free block extends beyond page".into())); - } else { - page_ref.write_u16(pc + 2, x as u16); - return Ok(pc + x); - } - } - - free_list_pointer_addr = pc; - pc = page_ref.read_u16(pc) as usize; - if pc <= free_list_pointer_addr && pc != 0 { - return Err(LimboError::Corrupt( - "Free list not in ascending order".into(), - )); - } - } - - if pc > maxpc + amount - 4 { - return Err(LimboError::Corrupt( - "Free block chain extends beyond page end".into(), - )); - } - Ok(0) + free_cell_range( + page, + cell_start as u16, + cell_len as u16, + self.usable_space() as u16, + ); + page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1); } /// Balance a leaf page. 
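A note on the redistribution performed by this series: as in SQLite's balance_nonroot(), cells are first packed into sibling pages greedily from the left, and a later right-to-left pass moves cells back so the right-most sibling is not left illegally empty. Below is a minimal sketch of the greedy pass only; pack_left, cell_sizes and usable are hypothetical stand-ins for CellArray and the real page accounting (each cell also pays for a 2-byte slot in the cell pointer array):

    // Greedily assign cells to pages from the left; close the current page
    // whenever the next cell (payload + 2-byte cell pointer) would overflow it.
    fn pack_left(cell_sizes: &[u16], usable: u32) -> Vec<usize> {
        let mut cells_per_page = Vec::new();
        let (mut count, mut used) = (0usize, 0u32);
        for &size in cell_sizes {
            let need = size as u32 + 2;
            if count > 0 && used + need > usable {
                cells_per_page.push(count); // current page is full
                count = 0;
                used = 0;
            }
            count += 1;
            used += need;
        }
        cells_per_page.push(count); // close the last page
        cells_per_page
    }

This left bias is exactly why the adjustment loop borrowed from SQLite src/btree.c below is not optional: without it the right-most sibling could end up completely empty.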
@@ -1254,6 +1062,7 @@ impl BTreeCursor { // Get divider cells and max_cells let mut max_cells = 0; let pages_to_balance = write_info.pages_to_balance.borrow(); + let pages_to_balance_new = write_info.pages_to_balance.borrow(); for i in (0..sibling_count).rev() { let sibling_page = &pages_to_balance[i]; let sibling_contents = sibling_page.get_contents(); @@ -1301,10 +1110,10 @@ impl BTreeCursor { let mut total_cells_inserted = 0; // count_cells_in_old_pages is the prefix sum of cells of each page let mut count_cells_in_old_pages = Vec::new(); - let leaf_data = matches!( - pages_to_balance[0].get_contents().page_type(), - PageType::TableLeaf - ); + let mut divider_cells = Vec::new(); + + let page_type = pages_to_balance[0].get_contents().page_type(); + let leaf_data = matches!(page_type, PageType::TableLeaf); for (i, old_page) in pages_to_balance.iter().enumerate() { let old_page_contents = old_page.get_contents(); let old_page_type = old_page_contents.page_type(); @@ -1318,7 +1127,7 @@ impl BTreeCursor { let buf = old_page_contents.as_ptr(); let cell_buf = &buf[cell_start..cell_start + cell_len]; // TODO(pere): make this reference and not copy - cell_array.cells.push(cell_buf.to_vec()); + cell_array.cells.push(cell_buf); } // Insert overflow cells into correct place let mut offset = total_cells_inserted; @@ -1330,7 +1139,7 @@ impl BTreeCursor { for overflow_cell in &old_page_contents.overflow_cells { cell_array .cells - .insert(offset + overflow_cell.index, overflow_cell.payload.to_vec()); + .insert(offset + overflow_cell.index, &overflow_cell.payload); } count_cells_in_old_pages.push(cell_array.cells.len() as u16); @@ -1345,7 +1154,8 @@ impl BTreeCursor { // TODO(pere): in case of old pages are leaf pages, so index leaf page, we need to strip page pointers // from divider cells in index interior pages (parent) because those should not be included. cells_inserted += 1; - cell_array.cells.push(divider_cell); + divider_cells.push(divider_cell); + cell_array.cells.push(÷r_cells.last().unwrap()); } total_cells_inserted += cells_inserted; } @@ -1503,10 +1313,109 @@ impl BTreeCursor { ); } - // TODO: allocate pages - // TODO: reassign page numbers + // Allocate pages or set dirty if not needed + for i in 0..sibling_count_new { + if i < sibling_count { + pages_to_balance[i].set_dirty(); + pages_to_balance_new.push(pages_to_balance[i].clone()); + } else { + let page = self.allocate_page(page_type, 0); + pages_to_balance_new.push(page); + } + } + + // Reassign page numbers in increasing order + let mut page_numbers = Vec::new(); + for page in pages_to_balance_new.iter() { + page_numbers.push(page.get().id); + } + page_numbers.sort(); + for (page, new_id) in pages_to_balance_new.iter().zip(page_numbers) { + if new_id != page.get().id { + page.get().id = new_id; + self.pager.put_loaded_page(new_id, page.clone()); + } + } + + // Ensure right-child pointer of the right-most new sibling pge points to the page + // that was originally on that place. 
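+                // (leaf pages have no right-child pointer, so this only applies to interior pages)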
+ let is_leaf_page = + matches!(page_type, PageType::TableInterior | PageType::IndexInterior); + if !is_leaf_page { + let last_page = pages_to_balance.last().unwrap(); + let right_pointer = last_page.get_contents().rightmost_pointer().unwrap(); + let new_last_page = pages_to_balance_new.last().unwrap(); + new_last_page + .get_contents() + .write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, right_pointer); + } + // TODO: pointer map update (vacuum support) // TODO: insert divider cells in parent + for i in 0..sibling_count_new - 1 + /* do not take last page */ + { + let divider_cell_idx = cell_array.cell_count(i); + let divider_cell = &mut cell_array.cells[divider_cell_idx]; + let page = &pages_to_balance_new[i]; + if !is_leaf_page { + // Interior + page.get_contents() + .write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, page.get().id as u32); + } else if leaf_data { + // Leaf table + // FIXME: not needed conversion + // FIXME: need to update cell size in order to free correctly? + // insert into cell with correct range should be enough + let rowid = read_u32(divider_cell, 4); + divider_cell[0..4].copy_from_slice(&(page.get().id as u32).to_be_bytes()); + divider_cell[4..8].copy_from_slice(&rowid.to_be_bytes()); + } else { + // Leaf index + divider_cell[0..4].copy_from_slice(&(page.get().id as u32).to_be_bytes()); + } + self.insert_into_cell(parent_contents, ÷r_cell, first_divider_cell + i); + } // TODO: update pages + let mut done = vec![false; sibling_count_new]; + for i in (1 as i64 - sibling_count_new as i64)..sibling_count_new as i64 { + let page_idx = i.abs() as usize; + if done[page_idx] { + continue; + } + if i >= 0 + || count_cells_in_old_pages[page_idx - 1] + >= cell_array.number_of_cells_per_page[page_idx - 1] + { + let (start_old_cells, start_new_cells, number_new_cells) = if page_idx == 0 + { + (0, 0, cell_array.cell_count(0)) + } else { + let this_was_old_page = page_idx < sibling_count; + let start_old_cells = if this_was_old_page { + count_cells_in_old_pages[page_idx - 1] as usize + + (!leaf_data) as usize + } else { + cell_array.cells.len() + }; + ( + start_old_cells, + cell_array.cell_count(page_idx - 1) + (!leaf_data) as usize, + cell_array.cell_count(0), + ) + }; + let page = pages_to_balance_new[page_idx].get_contents(); + edit_page( + page, + start_old_cells, + start_new_cells, + number_new_cells, + &cell_array, + usable_space as u16, + ); + + done[page_idx] = true; + } + } // TODO: balance root return Ok(CursorResult::IO); @@ -1825,7 +1734,7 @@ impl BTreeCursor { if gap + 2 + amount > top { // defragment - self.defragment_page(page_ref, RefCell::borrow(&self.pager.db_header)); + defragment_page(page_ref, self.usable_space() as u16); top = page_ref.read_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA) as usize; assert!(gap + 2 + amount <= top); } @@ -1840,96 +1749,6 @@ impl BTreeCursor { Ok(top as u16) } - /// Defragment a page. This means packing all the cells to the end of the page. 
- fn defragment_page(&self, page: &PageContent, db_header: Ref) { - debug!("defragment_page"); - let cloned_page = page.clone(); - // TODO(pere): usable space should include offset probably - let usable_space = (db_header.page_size - db_header.reserved_space as u16) as u64; - let mut cbrk = usable_space; - - // TODO: implement fast algorithm - - let last_cell = usable_space - 4; - let first_cell = cloned_page.unallocated_region_start() as u64; - - if cloned_page.cell_count() > 0 { - let page_type = page.page_type(); - let read_buf = cloned_page.as_ptr(); - let write_buf = page.as_ptr(); - - for i in 0..cloned_page.cell_count() { - let cell_offset = page.offset + 8; - let cell_idx = cell_offset + i * 2; - - let pc = u16::from_be_bytes([read_buf[cell_idx], read_buf[cell_idx + 1]]) as u64; - if pc > last_cell { - unimplemented!("corrupted page"); - } - - assert!(pc <= last_cell); - - let size = match page_type { - PageType::TableInterior => { - let (_, nr_key) = match read_varint(&read_buf[pc as usize ..]) { - Ok(v) => v, - Err(_) => todo!( - "error while parsing varint from cell, probably treat this as corruption?" - ), - }; - 4 + nr_key as u64 - } - PageType::TableLeaf => { - let (payload_size, nr_payload) = match read_varint(&read_buf[pc as usize..]) { - Ok(v) => v, - Err(_) => todo!( - "error while parsing varint from cell, probably treat this as corruption?" - ), - }; - let (_, nr_key) = match read_varint(&read_buf[pc as usize + nr_payload..]) { - Ok(v) => v, - Err(_) => todo!( - "error while parsing varint from cell, probably treat this as corruption?" - ), - }; - // TODO: add overflow page calculation - payload_size + nr_payload as u64 + nr_key as u64 - } - PageType::IndexInterior => todo!(), - PageType::IndexLeaf => todo!(), - }; - cbrk -= size; - if cbrk < first_cell || pc + size > usable_space { - todo!("corrupt"); - } - assert!(cbrk + size <= usable_space && cbrk >= first_cell); - // set new pointer - write_buf[cell_idx..cell_idx + 2].copy_from_slice(&(cbrk as u16).to_be_bytes()); - // copy payload - write_buf[cbrk as usize..cbrk as usize + size as usize] - .copy_from_slice(&read_buf[pc as usize..pc as usize + size as usize]); - } - } - - // assert!( nfree >= 0 ); - // if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){ - // return SQLITE_CORRUPT_PAGE(pPage); - // } - assert!(cbrk >= first_cell); - let write_buf = page.as_ptr(); - - // set new first byte of cell content - page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cbrk as u16); - // set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start - page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); - // set fragmented bytes counter to zero - page.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); - // set unused space to 0 - let first_cell = cloned_page.cell_content_area() as u64; - assert!(first_cell <= cbrk); - write_buf[first_cell as usize..cbrk as usize].fill(0); - } - /// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte /// and end of cell pointer area. 
#[allow(unused_assignments)] @@ -2713,6 +2532,240 @@ fn to_static_buf(buf: &[u8]) -> &'static [u8] { unsafe { std::mem::transmute::<&[u8], &'static [u8]>(buf) } } +pub fn edit_page( + page: &mut PageContent, + start_old_cells: usize, + start_new_cells: usize, + number_new_cells: usize, + cell_array: &CellArray, + usable_space: u16, +) { + let end_old_cells = start_old_cells + page.cell_count() + page.overflow_cells.len(); + let end_new_cells = start_new_cells + number_new_cells; + let mut count_cells = page.cell_count(); + if start_old_cells < start_new_cells { + let number_to_shift = page_free_array( + page, + start_old_cells, + start_new_cells - start_old_cells, + cell_array, + usable_space, + ); + count_cells -= number_to_shift; + // TODO: shift + } + if end_new_cells < end_old_cells { + let number_tail_removed = page_free_array( + page, + end_new_cells, + end_old_cells - end_new_cells, + cell_array, + usable_space, + ); + assert!(page.cell_count() >= number_tail_removed); + count_cells -= number_tail_removed; + } + // TODO: make page_free_array defragment, for now I'm lazy so this will work for now. + defragment_page(page, usable_space); + // TODO: add to start + // TODO: overflow cells + // TODO: append cells to end + // TODO: update ncell, noverflow + // TODO: update ncell +} + +pub fn page_free_array( + page: &mut PageContent, + first: usize, + count: usize, + cell_array: &CellArray, + usable_space: u16, +) -> usize { + let buf = &mut page.as_ptr()[page.offset..usable_space as usize]; + let buf_range = buf.as_ptr_range(); + let mut number_of_cells_removed = 0; + // TODO: implement fancy smart free block coalescing procedure instead of dumb free to + // then defragment + for i in first..first + count { + let cell = &cell_array.cells[i]; + let cell_pointer = cell.as_ptr_range(); + // check if not overflow cell + if cell_pointer.start >= buf_range.start && cell_pointer.start < buf_range.end { + assert!( + cell_pointer.end >= buf_range.start && cell_pointer.end < buf_range.end, + "whole cell should be inside the page" + ); + let offset = (cell_pointer.start as usize - buf_range.start as usize) as u16; + let len = (cell_pointer.end as usize - buf_range.start as usize) as u16; + free_cell_range(page, offset, len, usable_space); + number_of_cells_removed += 1; + } + } + number_of_cells_removed +} +pub fn page_insert_array( + page: &mut PageContent, + first: usize, + count: usize, + cell_array: &CellArray, + usable_space: u16, +) { +} + +/// Free the range of bytes that a cell occupies. +/// This function also updates the freeblock list in the page. +/// Freeblocks are used to keep track of free space in the page, +/// and are organized as a linked list. +fn free_cell_range(page: &mut PageContent, offset: u16, len: u16, usable_space: u16) { + // if the freeblock list is empty, we set this block as the first freeblock in the page header. 
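+    // A freeblock stores its own bookkeeping in the first 4 freed bytes, as two big-endian u16s:
+    //   bytes 0..2: offset of the next freeblock in the page (0 terminates the list)
+    //   bytes 2..4: total size of this freeblock in bytes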
+ if page.first_freeblock() == 0 { + page.write_u16(offset as usize, 0); // next freeblock = null + page.write_u16(offset as usize + 2, len); // size of this freeblock + page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, offset); // first freeblock in page = this block + return; + } + let first_block = page.first_freeblock(); + + // if the freeblock list is not empty, and the offset is less than the first freeblock, + // we insert this block at the head of the list + if offset < first_block { + page.write_u16(offset as usize, first_block); // next freeblock = previous first freeblock + page.write_u16(offset as usize + 2, len); // size of this freeblock + page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, offset); // first freeblock in page = this block + return; + } + + // if we clear space that is at the start of the cell content area, + // we need to update the cell content area pointer forward to account for the removed space + // FIXME: is offset ever < cell_content_area? cell content area grows leftwards and the pointer + // is to the start of the last allocated cell. should we assert!(offset >= page.cell_content_area()) + // and change this to if offset == page.cell_content_area()? + if offset <= page.cell_content_area() { + // FIXME: remove the line directly below this, it does not change anything. + page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, page.first_freeblock()); + page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, offset + len); + return; + } + + // if the freeblock list is not empty, and the offset is greater than the first freeblock, + // then we need to do some more calculation to figure out where to insert the freeblock + // in the freeblock linked list. + let maxpc = usable_space; + + let mut pc = first_block; + let mut prev = first_block; + + while pc <= maxpc && pc < offset { + let next = page.read_u16(pc as usize); + prev = pc; + pc = next; + } + + if pc >= maxpc { + // insert into tail + let offset = offset as usize; + let prev = prev as usize; + page.write_u16(prev, offset as u16); + page.write_u16(offset, 0); + page.write_u16(offset + 2, len); + } else { + // insert in between + let next = page.read_u16(pc as usize); + let offset = offset as usize; + let prev = prev as usize; + page.write_u16(prev, offset as u16); + page.write_u16(offset, next); + page.write_u16(offset + 2, len); + } +} + +/// Defragment a page. This means packing all the cells to the end of the page. +fn defragment_page(page: &PageContent, usable_space: u16) { + log::debug!("defragment_page"); + let cloned_page = page.clone(); + // TODO(pere): usable space should include offset probably + let usable_space = usable_space as u64; + let mut cbrk = usable_space; + + // TODO: implement fast algorithm + + let last_cell = usable_space - 4; + let first_cell = cloned_page.unallocated_region_start() as u64; + + if cloned_page.cell_count() > 0 { + let page_type = page.page_type(); + let read_buf = cloned_page.as_ptr(); + let write_buf = page.as_ptr(); + + for i in 0..cloned_page.cell_count() { + let cell_offset = page.offset + 8; + let cell_idx = cell_offset + i * 2; + + let pc = u16::from_be_bytes([read_buf[cell_idx], read_buf[cell_idx + 1]]) as u64; + if pc > last_cell { + unimplemented!("corrupted page"); + } + + assert!(pc <= last_cell); + + let size = match page_type { + PageType::TableInterior => { + let (_, nr_key) = match read_varint(&read_buf[pc as usize ..]) { + Ok(v) => v, + Err(_) => todo!( + "error while parsing varint from cell, probably treat this as corruption?" 
+ ), + }; + 4 + nr_key as u64 + } + PageType::TableLeaf => { + let (payload_size, nr_payload) = match read_varint(&read_buf[pc as usize..]) { + Ok(v) => v, + Err(_) => todo!( + "error while parsing varint from cell, probably treat this as corruption?" + ), + }; + let (_, nr_key) = match read_varint(&read_buf[pc as usize + nr_payload..]) { + Ok(v) => v, + Err(_) => todo!( + "error while parsing varint from cell, probably treat this as corruption?" + ), + }; + // TODO: add overflow page calculation + payload_size + nr_payload as u64 + nr_key as u64 + } + PageType::IndexInterior => todo!(), + PageType::IndexLeaf => todo!(), + }; + cbrk -= size; + if cbrk < first_cell || pc + size > usable_space { + todo!("corrupt"); + } + assert!(cbrk + size <= usable_space && cbrk >= first_cell); + // set new pointer + write_buf[cell_idx..cell_idx + 2].copy_from_slice(&(cbrk as u16).to_be_bytes()); + // copy payload + write_buf[cbrk as usize..cbrk as usize + size as usize] + .copy_from_slice(&read_buf[pc as usize..pc as usize + size as usize]); + } + } + + // assert!( nfree >= 0 ); + // if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){ + // return SQLITE_CORRUPT_PAGE(pPage); + // } + assert!(cbrk >= first_cell); + let write_buf = page.as_ptr(); + + // set new first byte of cell content + page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cbrk as u16); + // set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start + page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); + // set unused space to 0 + let first_cell = cloned_page.cell_content_area() as u64; + assert!(first_cell <= cbrk); + write_buf[first_cell as usize..cbrk as usize].fill(0); +} #[cfg(test)] mod tests { use rand_chacha::rand_core::RngCore; diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index fa9413807..c08333aef 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -441,12 +441,7 @@ impl PageContent { pub fn read_u32(&self, pos: usize) -> u32 { let buf = self.as_ptr(); - u32::from_be_bytes([ - buf[self.offset + pos], - buf[self.offset + pos + 1], - buf[self.offset + pos + 2], - buf[self.offset + pos + 3], - ]) + read_u32(buf, self.offset + pos) } pub fn write_u8(&self, pos: usize, value: u8) { @@ -1384,6 +1379,10 @@ impl WalHeader { } } +pub fn read_u32(buf: &[u8], pos: usize) -> u32 { + u32::from_be_bytes([buf[pos], buf[pos + 1], buf[pos + 2], buf[pos + 3]]) +} + #[cfg(test)] mod tests { use super::*; From 05ca716f82014e927ce101706d0bf0f4695ff1fd Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 9 Feb 2025 21:59:23 +0100 Subject: [PATCH 03/33] up to finish without fixing stuff --- core/storage/btree.rs | 660 ++++++++++++++++-------------------------- 1 file changed, 257 insertions(+), 403 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index f8ebbef24..362b440c2 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2,8 +2,8 @@ use tracing::debug; use crate::storage::pager::Pager; use crate::storage::sqlite3_ondisk::{ - read_btree_cell, read_u32, read_varint, BTreeCell, DatabaseHeader, PageContent, PageType, - TableInteriorCell, TableLeafCell, + read_u32, read_varint, BTreeCell, DatabaseHeader, PageContent, PageType, TableInteriorCell, + TableLeafCell, }; use crate::types::{CursorResult, OwnedValue, Record, SeekKey, SeekOp}; @@ -82,8 +82,6 @@ enum WriteState { BalanceStart, BalanceNonRoot, BalanceNonRootWaitLoadPages, - BalanceGetParentPage, - BalanceMoveUp, Finish, } @@ -763,14 +761,18 @@ impl 
BTreeCursor { // insert let overflow = { let contents = page.get().contents.as_mut().unwrap(); - self.insert_into_cell(contents, cell_payload.as_slice(), cell_idx); - let overflow_cells = contents.overflow_cells.len(); debug!( - "insert_into_page(overflow, cell_count={}, overflow_cells={})", - contents.cell_count(), - overflow_cells + "insert_into_page(overflow, cell_count={})", + contents.cell_count() ); - overflow_cells + + insert_into_cell( + contents, + cell_payload.as_slice(), + cell_idx, + self.usable_space() as u16, + ); + contents.overflow_cells.len() }; let write_info = self .state @@ -784,9 +786,7 @@ impl BTreeCursor { } WriteState::BalanceStart | WriteState::BalanceNonRoot - | WriteState::BalanceNonRootWaitLoadPages - | WriteState::BalanceMoveUp - | WriteState::BalanceGetParentPage => { + | WriteState::BalanceNonRootWaitLoadPages => { return_if_io!(self.balance()); } WriteState::Finish => { @@ -798,56 +798,6 @@ impl BTreeCursor { return ret; } - /// Insert a record into a cell. - /// If the cell overflows, an overflow cell is created. - /// insert_into_cell() is called from insert_into_page(), - /// and the overflow cell count is used to determine if the page overflows, - /// i.e. whether we need to balance the btree after the insert. - fn insert_into_cell(&self, page: &mut PageContent, payload: &[u8], cell_idx: usize) { - let free = self.compute_free_space(page, RefCell::borrow(&self.pager.db_header)); - const CELL_POINTER_SIZE_BYTES: usize = 2; - let enough_space = payload.len() + CELL_POINTER_SIZE_BYTES <= free as usize; - if !enough_space { - // add to overflow cells - page.overflow_cells.push(OverflowCell { - index: cell_idx, - payload: Pin::new(Vec::from(payload)), - }); - return; - } - - // TODO: insert into cell payload in internal page - let new_cell_data_pointer = self - .allocate_cell_space(page, payload.len() as u16) - .unwrap(); - let buf = page.as_ptr(); - - // Copy cell data - buf[new_cell_data_pointer as usize..new_cell_data_pointer as usize + payload.len()] - .copy_from_slice(payload); - // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); - let (cell_pointer_array_start, _) = page.cell_pointer_array_offset_and_size(); - let cell_pointer_cur_idx = cell_pointer_array_start + (CELL_POINTER_SIZE_BYTES * cell_idx); - - let cell_count = page.cell_count(); - - // Move existing pointers if needed - let n_bytes_forward = CELL_POINTER_SIZE_BYTES * (cell_count - cell_idx); - if n_bytes_forward > 0 { - buf.copy_within( - cell_pointer_cur_idx..cell_pointer_cur_idx + n_bytes_forward, - cell_pointer_cur_idx + CELL_POINTER_SIZE_BYTES, - ); - } - - // Insert new cell pointer at the current cell index - page.write_u16(cell_pointer_cur_idx - page.offset, new_cell_data_pointer); - - // Update cell count - let new_n_cells = (page.cell_count() + 1) as u16; - page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, new_n_cells); - } - /// Drop a cell from a page. /// This is done by freeing the range of bytes that the cell occupies. 
fn drop_cell(&self, page: &mut PageContent, cell_idx: usize) { @@ -905,9 +855,7 @@ impl BTreeCursor { self.stack.pop(); return_if_io!(self.balance_non_root()); } - WriteState::BalanceNonRoot - | WriteState::BalanceGetParentPage - | WriteState::BalanceMoveUp => { + WriteState::BalanceNonRoot | WriteState::BalanceNonRootWaitLoadPages => { return_if_io!(self.balance_non_root()); } @@ -927,11 +875,6 @@ impl BTreeCursor { WriteState::Start => todo!(), WriteState::BalanceStart => todo!(), WriteState::BalanceNonRoot => { - // drop divider cells and find right pointer - // NOTE: since we are doing a simple split we only finding the pointer we want to update (right pointer). - // Right pointer means cell that points to the last page, as we don't really want to drop this one. This one - // can be a "rightmost pointer" or a "cell". - // we always asumme there is a parent let parent_page = self.stack.top(); if parent_page.is_locked() { return Ok(CursorResult::IO); @@ -1050,6 +993,7 @@ impl BTreeCursor { if !all_loaded { return Ok(CursorResult::IO); } + // Now do real balancing let parent_page = self.stack.top(); let parent_contents = parent_page.get_contents(); assert!( @@ -1062,7 +1006,7 @@ impl BTreeCursor { // Get divider cells and max_cells let mut max_cells = 0; let pages_to_balance = write_info.pages_to_balance.borrow(); - let pages_to_balance_new = write_info.pages_to_balance.borrow(); + let mut pages_to_balance_new = Vec::new(); for i in (0..sibling_count).rev() { let sibling_page = &pages_to_balance[i]; let sibling_contents = sibling_page.get_contents(); @@ -1101,7 +1045,6 @@ impl BTreeCursor { .scratch_cells .replace(Vec::with_capacity(max_cells)); - let scratch_cells = write_info.scratch_cells.borrow_mut(); let mut cell_array = CellArray { cells: Vec::new(), number_of_cells_per_page: Vec::new(), @@ -1116,7 +1059,6 @@ impl BTreeCursor { let leaf_data = matches!(page_type, PageType::TableLeaf); for (i, old_page) in pages_to_balance.iter().enumerate() { let old_page_contents = old_page.get_contents(); - let old_page_type = old_page_contents.page_type(); for cell_idx in 0..old_page_contents.cell_count() { let (cell_start, cell_len) = old_page_contents.cell_get_raw_region( cell_idx, @@ -1125,9 +1067,9 @@ impl BTreeCursor { self.usable_space(), ); let buf = old_page_contents.as_ptr(); - let cell_buf = &buf[cell_start..cell_start + cell_len]; + let cell_buf = &mut buf[cell_start..cell_start + cell_len]; // TODO(pere): make this reference and not copy - cell_array.cells.push(cell_buf); + cell_array.cells.push(to_static_buf(cell_buf)); } // Insert overflow cells into correct place let mut offset = total_cells_inserted; @@ -1136,10 +1078,11 @@ impl BTreeCursor { 1, "todo: check this works for more than one overflow cell" ); - for overflow_cell in &old_page_contents.overflow_cells { - cell_array - .cells - .insert(offset + overflow_cell.index, &overflow_cell.payload); + for overflow_cell in old_page_contents.overflow_cells.iter_mut() { + cell_array.cells.insert( + offset + overflow_cell.index, + to_static_buf(&mut Pin::as_mut(&mut overflow_cell.payload)), + ); } count_cells_in_old_pages.push(cell_array.cells.len() as u16); @@ -1155,7 +1098,9 @@ impl BTreeCursor { // from divider cells in index interior pages (parent) because those should not be included. 
cells_inserted += 1; divider_cells.push(divider_cell); - cell_array.cells.push(÷r_cells.last().unwrap()); + cell_array + .cells + .push(to_static_buf(divider_cells.last_mut().unwrap().as_mut())); } total_cells_inserted += cells_inserted; } @@ -1169,10 +1114,9 @@ impl BTreeCursor { cell_array .number_of_cells_per_page .push(count_cells_in_old_pages[i]); - let page = pages_to_balance[i]; + let page = &pages_to_balance[i]; let page_contents = page.get_contents(); - let free_space = - self.compute_free_space(&page_contents, self.database_header.borrow()); + let free_space = compute_free_space(&page_contents, self.usable_space() as u16); // If we have an empty page of cells, we ignore it if k > 0 @@ -1319,7 +1263,7 @@ impl BTreeCursor { pages_to_balance[i].set_dirty(); pages_to_balance_new.push(pages_to_balance[i].clone()); } else { - let page = self.allocate_page(page_type, 0); + let page = self.allocate_page(page_type.clone(), 0); pages_to_balance_new.push(page); } } @@ -1373,7 +1317,12 @@ impl BTreeCursor { // Leaf index divider_cell[0..4].copy_from_slice(&(page.get().id as u32).to_be_bytes()); } - self.insert_into_cell(parent_contents, ÷r_cell, first_divider_cell + i); + insert_into_cell( + parent_contents, + ÷r_cell, + first_divider_cell + i, + self.usable_space() as u16, + ); } // TODO: update pages let mut done = vec![false; sibling_count_new]; @@ -1412,196 +1361,15 @@ impl BTreeCursor { &cell_array, usable_space as u16, ); + page.overflow_cells.clear(); done[page_idx] = true; } } // TODO: balance root - - return Ok(CursorResult::IO); - } - WriteState::BalanceGetParentPage => { - let parent = self.stack.parent(); - let loaded = parent.is_loaded(); - return_if_locked!(parent); - - if !loaded { - debug!("balance_leaf(loading page)"); - self.pager.load_page(parent.clone())?; - return Ok(CursorResult::IO); - } - parent.set_dirty(); - (WriteState::BalanceMoveUp, Ok(CursorResult::Ok(()))) - } - WriteState::BalanceMoveUp => { - let parent = self.stack.parent(); - - let (page_type, current_idx) = { - let current_page = self.stack.top(); - let contents = current_page.get().contents.as_ref().unwrap(); - (contents.page_type().clone(), current_page.get().id) - }; - - parent.set_dirty(); - self.pager.add_dirty(parent.get().id); - let parent_contents = parent.get().contents.as_mut().unwrap(); - // if this isn't empty next loop won't work - assert_eq!(parent_contents.overflow_cells.len(), 0); - - // Right page pointer is u32 in right most pointer, and in cell is u32 too, so we can use a *u32 to hold where we want to change this value - let mut right_pointer = PAGE_HEADER_OFFSET_RIGHTMOST_PTR; - for cell_idx in 0..parent_contents.cell_count() { - let cell = parent_contents.cell_get( - cell_idx, - self.pager.clone(), - self.payload_overflow_threshold_max(page_type.clone()), - self.payload_overflow_threshold_min(page_type.clone()), - self.usable_space(), - )?; - let found = match cell { - BTreeCell::TableInteriorCell(interior) => { - interior._left_child_page as usize == current_idx - } - _ => unreachable!("Parent should always be an interior page"), - }; - if found { - let (start, _len) = parent_contents.cell_get_raw_region( - cell_idx, - self.payload_overflow_threshold_max(page_type.clone()), - self.payload_overflow_threshold_min(page_type.clone()), - self.usable_space(), - ); - right_pointer = start; - break; - } - } - - let write_info = self.state.write_info().unwrap(); - let mut split_pages = write_info.split_pages.borrow_mut(); - let split_pages_len = split_pages.len(); - let scratch_cells = 
write_info.scratch_cells.borrow(); - - // reset pages - for page in split_pages.iter() { - assert!(page.is_dirty()); - let contents = page.get().contents.as_mut().unwrap(); - - contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); - contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); - - contents.write_u16( - PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, - self.usable_space() as u16, - ); - - contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); - if !contents.is_leaf() { - contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, 0); - } - } - - let mut current_cell_index = 0_usize; - /* index to scratch cells that will be used as dividers in order */ - let mut divider_cells_index = Vec::with_capacity(split_pages.len()); - - debug!("balance_leaf::distribute(cells={})", scratch_cells.len()); - - for (i, page) in split_pages.iter_mut().enumerate() { - let page_id = page.get().id; - let contents = page.get().contents.as_mut().unwrap(); - - let cells_to_copy = write_info.split_pages_cells_count.borrow()[i]; - debug!( - "balance_leaf::distribute(page={}, cells_to_copy={})", - page_id, cells_to_copy - ); - - let cell_index_range = current_cell_index..current_cell_index + cells_to_copy; - for (j, cell_idx) in cell_index_range.enumerate() { - debug!("balance_leaf::distribute_in_page(page={}, cells_to_copy={}, j={}, cell_idx={})", page_id, cells_to_copy, j, cell_idx); - - let cell = scratch_cells[cell_idx]; - self.insert_into_cell(contents, cell, j); - } - divider_cells_index.push(current_cell_index + cells_to_copy - 1); - current_cell_index += cells_to_copy; - } - - let is_leaf = { - let page = self.stack.top(); - let page = page.get().contents.as_ref().unwrap(); - page.is_leaf() - }; - - // update rightmost pointer for each page if we are in interior page - if !is_leaf { - for page in split_pages.iter_mut().take(split_pages_len - 1) { - let contents = page.get().contents.as_mut().unwrap(); - - assert!(contents.cell_count() >= 1); - let last_cell = contents.cell_get( - contents.cell_count() - 1, - self.pager.clone(), - self.payload_overflow_threshold_max(contents.page_type()), - self.payload_overflow_threshold_min(contents.page_type()), - self.usable_space(), - )?; - let last_cell_pointer = match last_cell { - BTreeCell::TableInteriorCell(interior) => interior._left_child_page, - _ => unreachable!(), - }; - self.drop_cell(contents, contents.cell_count() - 1); - contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, last_cell_pointer); - } - // last page right most pointer points to previous right most pointer before splitting - let last_page = split_pages.last().unwrap(); - let last_page_contents = last_page.get().contents.as_mut().unwrap(); - last_page_contents.write_u32( - PAGE_HEADER_OFFSET_RIGHTMOST_PTR, - write_info.rightmost_pointer.borrow().unwrap(), - ); - } - - // insert dividers in parent - // we can consider dividers the first cell of each page starting from the second page - for (page_id_index, page) in - split_pages.iter_mut().take(split_pages_len - 1).enumerate() - { - let contents = page.get().contents.as_mut().unwrap(); - let divider_cell_index = divider_cells_index[page_id_index]; - let cell_payload = scratch_cells[divider_cell_index]; - let cell = read_btree_cell( - cell_payload, - &contents.page_type(), - 0, - self.pager.clone(), - self.payload_overflow_threshold_max(contents.page_type()), - self.payload_overflow_threshold_min(contents.page_type()), - self.usable_space(), - )?; - - let key = match cell { - BTreeCell::TableLeafCell(TableLeafCell { _rowid, .. 
})
-                        | BTreeCell::TableInteriorCell(TableInteriorCell { _rowid, .. }) => _rowid,
-                        _ => unreachable!(),
-                    };
-
-                    let mut divider_cell = Vec::with_capacity(4 + 9); // 4 - page id, 9 - max length of varint
-                    divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes());
-                    write_varint_to_vec(key, &mut divider_cell);
-
-                    let parent_cell_idx = self.find_cell(parent_contents, key);
-                    self.insert_into_cell(parent_contents, &divider_cell, parent_cell_idx);
-                }
-
-                {
-                    // copy last page id to right pointer
-                    let last_pointer = split_pages.last().unwrap().get().id as u32;
-                    parent_contents.write_u32(right_pointer, last_pointer);
-                }

                self.stack.pop();
-                let _ = write_info.page_copy.take();
-                (WriteState::BalanceStart, Ok(CursorResult::Ok(())))
+                // TODO: free pages
+                return Ok(CursorResult::IO);
            }
            WriteState::Finish => todo!(),
        };
@@ -1711,133 +1479,6 @@ impl BTreeCursor {
        page
    }

-    /// Allocate space for a cell on a page.
-    fn allocate_cell_space(&self, page_ref: &mut PageContent, amount: u16) -> Result<u16> {
-        let amount = amount as usize;
-        let (cell_offset, _) = page_ref.cell_pointer_array_offset_and_size();
-        let gap = cell_offset + 2 * page_ref.cell_count();
-        let mut top = page_ref.cell_content_area() as usize;
-
-        if page_ref.first_freeblock() != 0 && gap + 2 <= top {
-            let db_header = RefCell::borrow(&self.pager.db_header);
-            let pc = self.find_free_cell(page_ref, amount, db_header)?;
-            if pc != 0 {
-                // Corruption check
-                if pc <= gap {
-                    return Err(LimboError::Corrupt(
-                        "Corrupted page: free block overlaps cell pointer array".into(),
-                    ));
-                }
-                return Ok(pc as u16);
-            }
-        }
-
-        if gap + 2 + amount > top {
-            // defragment
-            defragment_page(page_ref, self.usable_space() as u16);
-            top = page_ref.read_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA) as usize;
-            assert!(gap + 2 + amount <= top);
-        }
-
-        top -= amount;
-        page_ref.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, top as u16);
-
-        let db_header = RefCell::borrow(&self.pager.db_header);
-        let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
-        assert!(top + amount <= usable_space);
-
-        Ok(top as u16)
-    }
-
-    /// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte
-    /// and end of cell pointer area.
-    #[allow(unused_assignments)]
-    fn compute_free_space(&self, page: &PageContent, db_header: Ref<DatabaseHeader>) -> u16 {
-        // TODO(pere): maybe free space is not calculated correctly with offset
-
-        // Usable space, not the same as free space, simply means:
-        // space that is not reserved for extensions by sqlite. Usually reserved_space is 0.
-        let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
-
-        let mut cell_content_area_start = page.cell_content_area();
-        // A zero value for the cell content area pointer is interpreted as 65536.
-        // See https://www.sqlite.org/fileformat.html
-        // The max page size for a sqlite database is 64kiB i.e. 65536 bytes.
-        // 65536 is u16::MAX + 1, and since cell content grows from right to left, this means
-        // the cell content area pointer is at the end of the page,
-        // i.e.
-        // 1. the page size is 64kiB
-        // 2. there are no cells on the page
-        // 3. there is no reserved space at the end of the page
-        if cell_content_area_start == 0 {
-            cell_content_area_start = u16::MAX;
-        }
-
-        // The amount of free space is the sum of:
-        // #1. the size of the unallocated region
-        // #2. fragments (isolated 1-3 byte chunks of free space within the cell content area)
-        // #3.
freeblocks (linked list of blocks of at least 4 bytes within the cell content area that are not in use due to e.g. deletions)
-
-        let mut free_space_bytes =
-            page.unallocated_region_size() + page.num_frag_free_bytes() as usize;
-
-        // #3 is computed by iterating over the freeblocks linked list
-        let mut cur_freeblock_ptr = page.first_freeblock() as usize;
-        let page_buf = page.as_ptr();
-        if cur_freeblock_ptr > 0 {
-            if cur_freeblock_ptr < cell_content_area_start as usize {
-                // Freeblocks exist in the cell content area e.g. after deletions
-                // They should never exist in the unused area of the page.
-                todo!("corrupted page");
-            }
-
-            let mut next = 0;
-            let mut size = 0;
-            loop {
-                // TODO: check corruption icellast
-                next = u16::from_be_bytes(
-                    page_buf[cur_freeblock_ptr..cur_freeblock_ptr + 2]
-                        .try_into()
-                        .unwrap(),
-                ) as usize; // first 2 bytes in freeblock = next freeblock pointer
-                size = u16::from_be_bytes(
-                    page_buf[cur_freeblock_ptr + 2..cur_freeblock_ptr + 4]
-                        .try_into()
-                        .unwrap(),
-                ) as usize; // next 2 bytes in freeblock = size of current freeblock
-                free_space_bytes += size;
-                // Freeblocks are in order from left to right on the page,
-                // so next pointer should > current pointer + its size, or 0 if no next block exists.
-                if next <= cur_freeblock_ptr + size + 3 {
-                    break;
-                }
-                cur_freeblock_ptr = next;
-            }
-
-            // Next should always be 0 (NULL) at this point since we have reached the end of the freeblocks linked list
-            assert_eq!(
-                next, 0,
-                "corrupted page: freeblocks list not in ascending order"
-            );
-
-            assert!(
-                cur_freeblock_ptr + size <= usable_space,
-                "corrupted page: last freeblock extends last page end"
-            );
-        }
-
-        assert!(
-            free_space_bytes <= usable_space,
-            "corrupted page: free space is greater than usable space"
-        );
-
-        // if( nFree>usableSize || nFree<iCellFirst ){
-        //   return SQLITE_CORRUPT_PAGE(pPage);
-        // }
-
-        free_space_bytes as u16
-    }
-
-fn find_free_cell(page_ref: &PageContent, db_header: Ref<DatabaseHeader>, amount: usize) -> usize {
+fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> usize {
    // NOTE: freelist is in ascending order of keys and pc
    // unuse_space is reserved bytes at the end of page, therefore we must subtract from maxpc
    let mut pc = page_ref.first_freeblock() as usize;
    let buf = page_ref.as_ptr();
-    let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
+    let usable_space = usable_space as usize;
    let maxpc = usable_space - amount;
    let mut found = false;
    while pc <= maxpc {
@@ -2528,8 +2169,8 @@ pub fn btree_init_page(
    contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, 0);
}

-fn to_static_buf(buf: &[u8]) -> &'static [u8] {
-    unsafe { std::mem::transmute::<&[u8], &'static [u8]>(buf) }
+fn to_static_buf(buf: &mut [u8]) -> &'static mut [u8] {
+    unsafe { std::mem::transmute::<&mut [u8], &'static mut [u8]>(buf) }
}

pub fn edit_page(
@@ -2551,6 +2192,13 @@ pub fn edit_page(
            cell_array,
            usable_space,
        );
+        // shift pointers left
+        let buf = page.as_ptr();
+        let (start, _) = page.cell_pointer_array_offset_and_size();
+        buf.copy_within(
+            start + (number_to_shift * 2)..start + (count_cells * 2),
+            start,
+        );
        count_cells -= number_to_shift;
        // TODO: shift
    }
@@ -2568,10 +2216,39 @@ pub fn edit_page(
    // TODO: make page_free_array defragment, for now I'm lazy so this will work for now.
defragment_page(page, usable_space); // TODO: add to start + if start_new_cells < start_old_cells { + let count = number_new_cells.min(start_old_cells - start_new_cells); + page_insert_array(page, start_new_cells, count, cell_array, 0, usable_space); + count_cells += count; + } // TODO: overflow cells + for i in 0..page.overflow_cells.len() { + let overflow_cell = &page.overflow_cells[i]; + // cell index in context of new list of cells that should be in the page + let cell_idx = start_old_cells + overflow_cell.index - start_new_cells; + if cell_idx >= 0 && cell_idx < start_new_cells { + count_cells += 1; + page_insert_array( + page, + cell_idx + start_new_cells, + 1, + cell_array, + cell_idx, + usable_space, + ); + } + } // TODO: append cells to end - // TODO: update ncell, noverflow - // TODO: update ncell + page_insert_array( + page, + start_new_cells + count_cells, + number_new_cells - count_cells, + cell_array, + count_cells, + usable_space, + ); + // TODO: noverflow + page.write_u32(PAGE_HEADER_OFFSET_CELL_COUNT, count_cells as u32); } pub fn page_free_array( @@ -2608,8 +2285,15 @@ pub fn page_insert_array( first: usize, count: usize, cell_array: &CellArray, + mut start_insert: usize, usable_space: u16, ) { + // TODO: implement faster algorithm, this is doing extra work that's not needed. + // See pageInsertArray to understand faster way. + for i in first..first + count { + insert_into_cell(page, cell_array.cells[i], start_insert, usable_space); + start_insert += 1; + } } /// Free the range of bytes that a cell occupies. @@ -2766,6 +2450,176 @@ fn defragment_page(page: &PageContent, usable_space: u16) { assert!(first_cell <= cbrk); write_buf[first_cell as usize..cbrk as usize].fill(0); } + +/// Insert a record into a cell. +/// If the cell overflows, an overflow cell is created. +/// insert_into_cell() is called from insert_into_page(), +/// and the overflow cell count is used to determine if the page overflows, +/// i.e. whether we need to balance the btree after the insert. +fn insert_into_cell(page: &mut PageContent, payload: &[u8], cell_idx: usize, usable_space: u16) { + let free = compute_free_space(page, usable_space); + const CELL_POINTER_SIZE_BYTES: usize = 2; + let enough_space = payload.len() + CELL_POINTER_SIZE_BYTES <= free as usize; + if !enough_space { + // add to overflow cell + page.overflow_cells.push(OverflowCell { + index: cell_idx, + payload: Pin::new(Vec::from(payload)), + }); + return; + } + + // TODO: insert into cell payload in internal page + let new_cell_data_pointer = allocate_cell_space(page, payload.len() as u16, usable_space); + let buf = page.as_ptr(); + + // copy data + buf[new_cell_data_pointer as usize..new_cell_data_pointer as usize + payload.len()] + .copy_from_slice(payload); + // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); + let (cell_pointer_array_start, _) = page.cell_pointer_array_offset_and_size(); + let cell_pointer_cur_idx = cell_pointer_array_start + (CELL_POINTER_SIZE_BYTES * cell_idx); + + // move existing pointers forward by CELL_POINTER_SIZE_BYTES... 
+ let n_cells_forward = page.cell_count() - cell_idx; + let n_bytes_forward = CELL_POINTER_SIZE_BYTES * n_cells_forward; + if n_bytes_forward > 0 { + buf.copy_within( + cell_pointer_cur_idx..cell_pointer_cur_idx + n_bytes_forward, + cell_pointer_cur_idx + CELL_POINTER_SIZE_BYTES, + ); + } + // ...and insert new cell pointer at the current index + page.write_u16(cell_pointer_cur_idx - page.offset, new_cell_data_pointer); + + // update first byte of content area (cell data always appended to the left, so cell content area pointer moves to point to the new cell data) + page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, new_cell_data_pointer); + + // update cell count + let new_n_cells = (page.cell_count() + 1) as u16; + page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, new_n_cells); +} + +/// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte +/// and end of cell pointer area. +#[allow(unused_assignments)] +fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { + // TODO(pere): maybe free space is not calculated correctly with offset + + // Usable space, not the same as free space, simply means: + // space that is not reserved for extensions by sqlite. Usually reserved_space is 0. + let usable_space = usable_space as usize; + + let mut cell_content_area_start = page.cell_content_area(); + // A zero value for the cell content area pointer is interpreted as 65536. + // See https://www.sqlite.org/fileformat.html + // The max page size for a sqlite database is 64kiB i.e. 65536 bytes. + // 65536 is u16::MAX + 1, and since cell content grows from right to left, this means + // the cell content area pointer is at the end of the page, + // i.e. + // 1. the page size is 64kiB + // 2. there are no cells on the page + // 3. there is no reserved space at the end of the page + if cell_content_area_start == 0 { + cell_content_area_start = u16::MAX; + } + + // The amount of free space is the sum of: + // #1. the size of the unallocated region + // #2. fragments (isolated 1-3 byte chunks of free space within the cell content area) + // #3. freeblocks (linked list of blocks of at least 4 bytes within the cell content area that are not in use due to e.g. deletions) + + let mut free_space_bytes = page.unallocated_region_size() + page.num_frag_free_bytes() as usize; + + // #3 is computed by iterating over the freeblocks linked list + let mut cur_freeblock_ptr = page.first_freeblock() as usize; + let page_buf = page.as_ptr(); + if cur_freeblock_ptr > 0 { + if cur_freeblock_ptr < cell_content_area_start as usize { + // Freeblocks exist in the cell content area e.g. after deletions + // They should never exist in the unused area of the page. + todo!("corrupted page"); + } + + let mut next = 0; + let mut size = 0; + loop { + // TODO: check corruption icellast + next = u16::from_be_bytes( + page_buf[cur_freeblock_ptr..cur_freeblock_ptr + 2] + .try_into() + .unwrap(), + ) as usize; // first 2 bytes in freeblock = next freeblock pointer + size = u16::from_be_bytes( + page_buf[cur_freeblock_ptr + 2..cur_freeblock_ptr + 4] + .try_into() + .unwrap(), + ) as usize; // next 2 bytes in freeblock = size of current freeblock + free_space_bytes += size; + // Freeblocks are in order from left to right on the page, + // so next pointer should > current pointer + its size, or 0 if no next block exists. 
+                if next <= cur_freeblock_ptr + size + 3 {
+                    break;
+                }
+                cur_freeblock_ptr = next;
+            }
+
+            // Next should always be 0 (NULL) at this point since we have reached the end of the freeblocks linked list
+            assert!(
+                next == 0,
+                "corrupted page: freeblocks list not in ascending order"
+            );
+
+            assert!(
+                cur_freeblock_ptr + size <= usable_space,
+                "corrupted page: last freeblock extends last page end"
+            );
+        }
+
+        assert!(
+            free_space_bytes <= usable_space,
+            "corrupted page: free space is greater than usable space"
+        );
+
+        // if( nFree>usableSize || nFree<iCellFirst ){
+        //   return SQLITE_CORRUPT_PAGE(pPage);
+        // }
+
+    free_space_bytes as u16
+}
+
+/// Allocate space for a cell on a page.
+fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) -> u16 {
+    let amount = amount as usize;
+
+    let (cell_offset, _) = page_ref.cell_pointer_array_offset_and_size();
+    let gap = cell_offset + 2 * page_ref.cell_count();
+    let mut top = page_ref.cell_content_area() as usize;
+
+    // there are free blocks and enough space
+    if page_ref.first_freeblock() != 0 && gap + 2 <= top {
+        // find slot
+        let pc = find_free_cell(page_ref, usable_space, amount);
+        if pc != 0 {
+            return pc as u16;
+        }
+        /* fall through, we might need to defragment */
+    }
+
+    if gap + 2 + amount > top {
+        // defragment
+        defragment_page(page_ref, usable_space as u16);
+        top = page_ref.read_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA) as usize;
+    }
+
+    top -= amount;
+
+    page_ref.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, top as u16);
+
+    assert!(top + amount <= usable_space as usize);
+    top as u16
+}
+
 #[cfg(test)]
 mod tests {
     use rand_chacha::rand_core::RngCore;

From 0c015e43a2ddbc38484fdc68b50ffd4f8572727f Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Mon, 10 Feb 2025 12:00:23 +0100
Subject: [PATCH 04/33] some fixes

---
 core/storage/btree.rs | 172 +++++++++++++++++++++++++++++-------------
 1 file changed, 119 insertions(+), 53 deletions(-)

diff --git a/core/storage/btree.rs b/core/storage/btree.rs
index 362b440c2..e55ba8490 100644
--- a/core/storage/btree.rs
+++ b/core/storage/btree.rs
@@ -304,7 +304,7 @@ impl BTreeCursor {
        let mem_page_rc = self.stack.top();
        let cell_idx = self.stack.current_cell_index() as usize;

-        debug!("current id={} cell={}", mem_page_rc.get().id, cell_idx);
+        // debug!("current id={} cell={}", mem_page_rc.get().id, cell_idx);
        return_if_locked!(mem_page_rc);
        if !mem_page_rc.is_loaded() {
            self.pager.load_page(mem_page_rc.clone())?;
@@ -870,11 +870,18 @@ impl BTreeCursor {
            matches!(self.state, CursorState::Write(_)),
            "Cursor must be in balancing state"
        );
-        let state = self.state.write_info().expect("must be balancing").state;
+        let state = self
+            .state
+            .write_info()
+            .expect("must be balancing")
+            .state
+            .clone();
+        tracing::trace!("balance_non_root(state={:?})", state);

        let (next_write_state, result) = match state {
            WriteState::Start => todo!(),
            WriteState::BalanceStart => todo!(),
            WriteState::BalanceNonRoot => {
+                let write_info = self.state.write_info().unwrap();
                let parent_page = self.stack.top();
                if parent_page.is_locked() {
                    return Ok(CursorResult::IO);
@@ -884,6 +891,8 @@ impl BTreeCursor {
                    self.pager.load_page(parent_page.clone())?;
                    return Ok(CursorResult::IO);
                }
+                parent_page.set_dirty();
+                self.pager.add_dirty(parent_page.get().id);

                let parent_contents = parent_page.get().contents.as_ref().unwrap();
                let page_to_balance_idx = self.stack.current_cell_index() as usize;
@@ -893,9 +902,9 @@ impl BTreeCursor {
                    page_to_balance_idx
                );
                // Part 1: Find the sibling pages to balance
-                self.write_info.new_pages.borrow_mut().clear();
-                self.write_info.pages_to_balance.borrow_mut().clear();
-                self.write_info.divider_cells.borrow_mut().clear();
+                write_info.new_pages.borrow_mut().clear();
+
write_info.pages_to_balance.borrow_mut().clear();
+                write_info.divider_cells.borrow_mut().clear();

                let number_of_cells_in_parent =
                    parent_contents.cell_count() + parent_contents.overflow_cells.len();
@@ -922,10 +931,8 @@ impl BTreeCursor {
                    };
                    (2, next_divider)
                };
-                self.write_info.sibling_count.replace(sibling_pointer + 1);
-                self.write_info
-                    .first_divider_cell
-                    .replace(first_cell_divider);
+                write_info.sibling_count.replace(sibling_pointer + 1);
+                write_info.first_divider_cell.replace(first_cell_divider);

                let last_sibling_is_right_pointer = sibling_pointer + first_cell_divider
                    - parent_contents.overflow_cells.len()
@@ -942,19 +949,21 @@ impl BTreeCursor {

                // load sibling pages
                // start loading right page first
-                let mut pgno: u32 = unsafe { right_pointer.cast::<u32>().read() };
+                let mut pgno: u32 = unsafe { right_pointer.cast::<u32>().read().swap_bytes() };
+                dbg!(pgno);
                let mut current_sibling = sibling_pointer;
                for i in (0..=current_sibling).rev() {
                    let page = self.pager.read_page(pgno as usize)?;
-                    page.set_dirty();
-                    self.pager.add_dirty(page.get().id);
-                    self.write_info.pages_to_balance.borrow_mut().push(page);
+                    write_info.pages_to_balance.borrow_mut().push(page);
                    assert_eq!(
                        parent_contents.overflow_cells.len(),
                        0,
                        "overflow in parent is not yet implemented while balancing it"
                    );
-                    let next_cell_divider = i + first_cell_divider;
+                    if i == 0 {
+                        break;
+                    }
+                    let next_cell_divider = i + first_cell_divider - 1;
                    pgno = match parent_contents.cell_get(
                        next_cell_divider,
                        self.pager.clone(),
@@ -989,7 +998,7 @@ impl BTreeCursor {
                    .pages_to_balance
                    .borrow()
                    .iter()
-                    .all(|page| page.is_locked());
+                    .all(|page| !page.is_locked());
                if !all_loaded {
                    return Ok(CursorResult::IO);
                }
@@ -1010,13 +1019,15 @@ impl BTreeCursor {
                for i in (0..sibling_count).rev() {
                    let sibling_page = &pages_to_balance[i];
                    let sibling_contents = sibling_page.get_contents();
+                    sibling_page.set_dirty();
+                    self.pager.add_dirty(sibling_page.get().id);
                    max_cells += sibling_contents.cell_count();
                    if i == 0 {
                        // we don't have left sibling from this one so we break
                        break;
                    }
                    // Since we know we have a left sibling, take the divider that points to left sibling of this page
-                    let cell_idx = first_divider_cell - i - 1;
+                    let cell_idx = first_divider_cell + i - 1;
                    let (cell_start, cell_len) = parent_contents.cell_get_raw_region(
                        cell_idx,
                        self.payload_overflow_threshold_max(parent_contents.page_type()),
@@ -1073,9 +1084,8 @@ impl BTreeCursor {
                    }
                    // Insert overflow cells into correct place
                    let mut offset = total_cells_inserted;
-                    assert_eq!(
-                        old_page_contents.overflow_cells.len(),
-                        1,
+                    assert!(
+                        old_page_contents.overflow_cells.len() <= 1,
                        "todo: check this works for more than one overflow cell"
                    );
                    for overflow_cell in old_page_contents.overflow_cells.iter_mut() {
@@ -1109,6 +1119,8 @@ impl BTreeCursor {
                let mut new_page_sizes = Vec::new();
                let mut k = 0;
                // todo: add leaf correction
+                // number of bytes beyond the header, different from the global usable_space
+                // which includes the header
                let usable_space = self.usable_space() - 12;
                for i in 0..sibling_count {
                    cell_array
@@ -1136,15 +1148,28 @@ impl BTreeCursor {
                    }
                    k += 1;
                }
+
+                for n in cell_array.number_of_cells_per_page.iter().enumerate() {
+                    println!("init count page={}, n={}", n.0, n.1);
+                }
                // Try to pack as many cells to the left
                let mut sibling_count_new = sibling_count;
                let mut i = 0;
                while i < sibling_count_new {
+                    for n in cell_array.number_of_cells_per_page.iter().enumerate() {
+                        println!("start count i={} page={}, n={}", i, n.0, n.1);
+                    }
                    // First try to move cells to the right
if they do not fit while new_page_sizes[i] > usable_space as u16 { - let needs_new_page = i + 2 >= sibling_count_new; + println!("moving right {}", i); + let needs_new_page = i + 1 >= sibling_count_new; if needs_new_page { sibling_count_new += 1; + println!("adding new page"); + new_page_sizes.push(0); + cell_array + .number_of_cells_per_page + .push(cell_array.cells.len() as u16); assert!( sibling_count_new <= 5, "it is corrupt to require more than 5 pages to balance 3 siblings" @@ -1166,12 +1191,13 @@ impl BTreeCursor { } else { size_of_cell_to_remove_from_left }; - new_page_sizes[i + 1] -= size_of_cell_to_move_right; + new_page_sizes[i + 1] += size_of_cell_to_move_right; cell_array.number_of_cells_per_page[i] -= 1; } // Now try to take from the right if we didn't have enough while cell_array.number_of_cells_per_page[i] < cell_array.cells.len() as u16 { + println!("moving left {}", i); let size_of_cell_to_remove_from_right = 2 + cell_array.cells[cell_array.cell_count(i)].len() as u16; let can_take = new_page_sizes[i] + size_of_cell_to_remove_from_right @@ -1194,15 +1220,22 @@ impl BTreeCursor { size_of_cell_to_remove_from_right }; - new_page_sizes[i + 1] += size_of_cell_to_remove_from_right; + new_page_sizes[i + 1] -= size_of_cell_to_remove_from_right; } let we_still_need_another_page = cell_array.number_of_cells_per_page[i] >= cell_array.cells.len() as u16; if we_still_need_another_page { - sibling_count_new += 1; + dbg!("we need"); + sibling_count_new = i + 1; } i += 1; + if i >= sibling_count_new { + break; + } + } + for n in cell_array.number_of_cells_per_page.iter().enumerate() { + println!("start count page={}, n={}", n.0, n.1); } // Comment borrowed from SQLite src/btree.c @@ -1222,6 +1255,7 @@ impl BTreeCursor { // if leaf_data means we don't have divider, so the one we move from left is // the same we add to right (we don't add divider to right). let mut cell_right = cell_left + 1 - leaf_data as u16; + log::trace!("start cell_left={}", cell_left); loop { let cell_left_size = cell_array.cell_size(cell_left as usize); let cell_right_size = cell_array.cell_size(cell_right as usize); @@ -1244,9 +1278,13 @@ impl BTreeCursor { cell_left -= 1; cell_right -= 1; } + tracing::trace!("end cell_left={}", cell_left); new_page_sizes[i] = size_right_page; new_page_sizes[i - 1] = size_left_page; + for n in cell_array.number_of_cells_per_page.iter().enumerate() { + println!("new count page={}, n={}", n.0, n.1); + } assert!( cell_array.number_of_cells_per_page[i - 1] > if i > 1 { @@ -1281,10 +1319,15 @@ impl BTreeCursor { } } + // Write right pointer in parent page to point to new rightmost page + parent_contents.write_u32( + PAGE_HEADER_OFFSET_RIGHTMOST_PTR, + pages_to_balance_new.last().unwrap().get().id as u32, + ); + // Ensure right-child pointer of the right-most new sibling pge points to the page // that was originally on that place. 
-        let is_leaf_page =
-            matches!(page_type, PageType::TableInterior | PageType::IndexInterior);
+        let is_leaf_page = matches!(page_type, PageType::TableLeaf | PageType::IndexLeaf);
        if !is_leaf_page {
            let last_page = pages_to_balance.last().unwrap();
            let right_pointer = last_page.get_contents().rightmost_pointer().unwrap();
@@ -1301,25 +1344,29 @@ impl BTreeCursor {
                    let divider_cell_idx = cell_array.cell_count(i);
                    let divider_cell = &mut cell_array.cells[divider_cell_idx];
                    let page = &pages_to_balance_new[i];
+                    // FIXME: dont use auxiliary space, could be done without allocations
+                    let mut new_divider_cell = Vec::new();
                    if !is_leaf_page {
                        // Interior
                        page.get_contents()
                            .write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, page.get().id as u32);
+                        new_divider_cell.extend_from_slice(divider_cell);
                    } else if leaf_data {
                        // Leaf table
                        // FIXME: not needed conversion
                        // FIXME: need to update cell size in order to free correctly?
                        //        insert into cell with correct range should be enough
-                        let rowid = read_u32(divider_cell, 4);
-                        divider_cell[0..4].copy_from_slice(&(page.get().id as u32).to_be_bytes());
-                        divider_cell[4..8].copy_from_slice(&rowid.to_be_bytes());
+                        let (_, n_bytes_payload) = read_varint(divider_cell)?;
+                        let (rowid, _) = read_varint(&divider_cell[n_bytes_payload..])?;
+                        new_divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes());
+                        write_varint_to_vec(rowid, &mut new_divider_cell);
                    } else {
                        // Leaf index
                        new_divider_cell.extend_from_slice(divider_cell);
                    }
                    insert_into_cell(
                        parent_contents,
-                        &divider_cell,
+                        &new_divider_cell,
                        first_divider_cell + i,
                        self.usable_space() as u16,
                    );
@@ -1346,10 +1393,12 @@ impl BTreeCursor {
                        } else {
                            cell_array.cells.len()
                        };
+                        let start_new_cells =
+                            cell_array.cell_count(page_idx - 1) + (!leaf_data) as usize;
                        (
                            start_old_cells,
-                            cell_array.cell_count(page_idx - 1) + (!leaf_data) as usize,
-                            cell_array.cell_count(0),
+                            start_new_cells,
+                            cell_array.cell_count(page_idx) - start_new_cells,
                        )
                    };
                    let page = pages_to_balance_new[page_idx].get_contents();
@@ -1359,7 +1408,12 @@ impl BTreeCursor {
                        start_new_cells,
                        number_new_cells,
                        &cell_array,
-                        usable_space as u16,
+                        self.usable_space() as u16,
                    );
+                    tracing::trace!(
+                        "edit_page page={} cells={}",
+                        pages_to_balance_new[page_idx].get().id,
+                        page.cell_count()
+                    );

                    page.overflow_cells.clear();
@@ -1369,7 +1423,8 @@ impl BTreeCursor {
                // TODO: balance root
                self.stack.pop();
                // TODO: free pages
-                return Ok(CursorResult::IO);
+                write_info.state = WriteState::Finish;
+                return Ok(CursorResult::Ok(()));
            }
            WriteState::Finish => todo!(),
        };
@@ -2014,11 +2069,11 @@ impl PageStack {
    /// Push a new page onto the stack.
    /// This effectively means traversing to a child page.
    fn push(&self, page: PageRef) {
-        debug!(
-            "pagestack::push(current={}, new_page_id={})",
-            self.current_page.borrow(),
-            page.get().id
-        );
+        // debug!(
+        //     "pagestack::push(current={}, new_page_id={})",
+        //     self.current_page.borrow(),
+        //     page.get().id
+        // );
        *self.current_page.borrow_mut() += 1;
        let current = *self.current_page.borrow();
        assert!(
@@ -2033,7 +2088,7 @@ impl PageStack {
    /// This effectively means traversing back up to a parent page.
fn pop(&self) { let current = *self.current_page.borrow(); - debug!("pagestack::pop(current={})", current); + // debug!("pagestack::pop(current={})", current); self.cell_indices.borrow_mut()[current as usize] = 0; self.stack.borrow_mut()[current as usize] = None; *self.current_page.borrow_mut() -= 1; @@ -2047,11 +2102,11 @@ impl PageStack { .as_ref() .unwrap() .clone(); - debug!( - "pagestack::top(current={}, page_id={})", - current, - page.get().id - ); + // debug!( + // "pagestack::top(current={}, page_id={})", + // current, + // page.get().id + // ); page } @@ -2160,6 +2215,7 @@ pub fn btree_init_page( contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); let cell_content_area_start = db_header.page_size - db_header.reserved_space as u16; + contents.write_u16( PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cell_content_area_start, @@ -2200,7 +2256,6 @@ pub fn edit_page( start, ); count_cells -= number_to_shift; - // TODO: shift } if end_new_cells < end_old_cells { let number_tail_removed = page_free_array( @@ -2248,7 +2303,7 @@ pub fn edit_page( usable_space, ); // TODO: noverflow - page.write_u32(PAGE_HEADER_OFFSET_CELL_COUNT, count_cells as u32); + page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, number_new_cells as u16); } pub fn page_free_array( @@ -2269,12 +2324,15 @@ pub fn page_free_array( // check if not overflow cell if cell_pointer.start >= buf_range.start && cell_pointer.start < buf_range.end { assert!( - cell_pointer.end >= buf_range.start && cell_pointer.end < buf_range.end, + cell_pointer.end >= buf_range.start && cell_pointer.end <= buf_range.end, "whole cell should be inside the page" ); + // TODO: remove pointer too let offset = (cell_pointer.start as usize - buf_range.start as usize) as u16; - let len = (cell_pointer.end as usize - buf_range.start as usize) as u16; + let len = (cell_pointer.end as usize - cell_pointer.start as usize) as u16; + println!("removing {}~{}", offset, len); free_cell_range(page, offset, len, usable_space); + page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1); number_of_cells_removed += 1; } } @@ -2288,6 +2346,7 @@ pub fn page_insert_array( mut start_insert: usize, usable_space: u16, ) { + dbg!(count); // TODO: implement faster algorithm, this is doing extra work that's not needed. // See pageInsertArray to understand faster way. for i in first..first + count { @@ -2439,6 +2498,7 @@ fn defragment_page(page: &PageContent, usable_space: u16) { // return SQLITE_CORRUPT_PAGE(pPage); // } assert!(cbrk >= first_cell); + dbg!(cbrk, first_cell); let write_buf = page.as_ptr(); // set new first byte of cell content @@ -2446,8 +2506,7 @@ fn defragment_page(page: &PageContent, usable_space: u16) { // set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); // set unused space to 0 - let first_cell = cloned_page.cell_content_area() as u64; - assert!(first_cell <= cbrk); + dbg!(cbrk, first_cell); write_buf[first_cell as usize..cbrk as usize].fill(0); } @@ -2529,7 +2588,14 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { // #2. fragments (isolated 1-3 byte chunks of free space within the cell content area) // #3. freeblocks (linked list of blocks of at least 4 bytes within the cell content area that are not in use due to e.g. 
deletions)

-    let mut free_space_bytes = page.unallocated_region_size() + page.num_frag_free_bytes() as usize;
+    let pointer_size = if matches!(page.page_type(), PageType::TableLeaf | PageType::IndexLeaf) {
+        0
+    } else {
+        4
+    };
+    let first_cell = page.offset + 8 + pointer_size + (2 * page.cell_count());
+    let mut free_space_bytes =
+        cell_content_area_start as usize + page.num_frag_free_bytes() as usize;

    // #3 is computed by iterating over the freeblocks linked list
    let mut cur_freeblock_ptr = page.first_freeblock() as usize;
@@ -2585,7 +2651,7 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
    //   return SQLITE_CORRUPT_PAGE(pPage);
    // }

-    free_space_bytes as u16
+    free_space_bytes as u16 - first_cell as u16
}

/// Allocate space for a cell on a page.

From 915bb4241242fd6c69dd7b4c304098807691df9f Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Mon, 10 Feb 2025 12:14:19 +0100
Subject: [PATCH 05/33] fix rightmost pointer update

---
 core/storage/btree.rs | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/core/storage/btree.rs b/core/storage/btree.rs
index e55ba8490..e8aacab1c 100644
--- a/core/storage/btree.rs
+++ b/core/storage/btree.rs
@@ -95,7 +95,7 @@ struct WriteInfo {
     /// Scratch space used during balancing.
     scratch_cells: RefCell<Vec<&'static mut [u8]>>,
     /// Bookkeeping of the rightmost pointer so the PAGE_HEADER_OFFSET_RIGHTMOST_PTR can be updated.
-    rightmost_pointer: RefCell<Option<u32>>,
+    rightmost_pointer: RefCell<Option<*mut u8>>,
     /// Copy of the current page needed for buffer references.
     page_copy: RefCell<Option<PageContent>>,
     /// Divider cells of old pages
@@ -950,7 +950,9 @@ impl BTreeCursor {
                 // load sibling pages
                 // start loading right page first
                 let mut pgno: u32 = unsafe { right_pointer.cast::<u32>().read().swap_bytes() };
-                dbg!(pgno);
+                self.write_info
+                    .rightmost_pointer
+                    .replace(Some(right_pointer));
                 let mut current_sibling = sibling_pointer;
                 for i in (0..=current_sibling).rev() {
                     let page = self.pager.read_page(pgno as usize)?;
@@ -1320,10 +1322,11 @@ impl BTreeCursor {
                 }

                 // Write right pointer in parent page to point to new rightmost page
-                parent_contents.write_u32(
-                    PAGE_HEADER_OFFSET_RIGHTMOST_PTR,
-                    pages_to_balance_new.last().unwrap().get().id as u32,
-                );
+                let right_page_id = pages_to_balance_new.last().unwrap().get().id as u32;
+                let rightmost_pointer = self.write_info.rightmost_pointer.borrow_mut().unwrap();
+                let rightmost_pointer =
+                    unsafe { std::slice::from_raw_parts_mut(rightmost_pointer, 4) };
+                rightmost_pointer[0..4].copy_from_slice(&right_page_id.to_be_bytes());

                 // Ensure right-child pointer of the right-most new sibling page points to the page
                 // that was originally on that place.
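[Editor's note: the free-space arithmetic that patch 04 converges on is easier to follow outside the diff. Below is a minimal, self-contained sketch, not the crate's API: `PageSnapshot` and its fields are hypothetical stand-ins for the header reads that `PageContent` performs. Free space is the unallocated gap between the end of the cell pointer array and the start of the cell content area, plus the fragmented-bytes counter, plus the sizes in the freeblock list.

    // Hypothetical model of SQLite page free-space accounting.
    struct PageSnapshot {
        header_offset: usize,            // 100 on page 1, 0 elsewhere
        is_leaf: bool,                   // leaf header is 8 bytes, interior is 12
        cell_count: usize,               // entries in the cell pointer array (2 bytes each)
        cell_content_area: usize,        // header offset 5; a stored 0 is read as 65536
        frag_bytes: usize,               // header offset 7: fragmented free bytes
        freeblocks: Vec<(usize, usize)>, // (offset, size) pairs, ascending by offset
    }

    fn free_space(p: &PageSnapshot) -> usize {
        let header_size = if p.is_leaf { 8 } else { 12 };
        // First byte past the cell pointer array ("first_cell" in the patch).
        let first_cell = p.header_offset + header_size + 2 * p.cell_count;
        let content_start = if p.cell_content_area == 0 { 65536 } else { p.cell_content_area };
        // Unallocated gap + fragments + freeblock list.
        let gap = content_start - first_cell;
        let freeblock_total: usize = p.freeblocks.iter().map(|(_, size)| *size).sum();
        gap + p.frag_bytes + freeblock_total
    }

This matches the patch's `free_space_bytes as u16 - first_cell as u16` return value: it first accumulates the content-area start plus fragments plus freeblocks, and only subtracts the end of the cell pointer array at the very end.]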
@@ -1364,6 +1367,8 @@ impl BTreeCursor {
                         // Leaf index
                         new_divider_cell.extend_from_slice(divider_cell);
                     }
+                    // FIXME: defragment shouldn't be needed
+                    defragment_page(parent_contents, self.usable_space() as u16);
                     insert_into_cell(
                         parent_contents,
                         &new_divider_cell,
@@ -2330,7 +2335,6 @@ pub fn page_free_array(
             // TODO: remove pointer too
             let offset = (cell_pointer.start as usize - buf_range.start as usize) as u16;
             let len = (cell_pointer.end as usize - cell_pointer.start as usize) as u16;
-            println!("removing {}~{}", offset, len);
             free_cell_range(page, offset, len, usable_space);
             page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1);

From 7e3470fb8eba5e99063c0ef67716e76f84c8ea89 Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Mon, 10 Feb 2025 12:30:34 +0100
Subject: [PATCH 06/33] fix rebase

---
 core/storage/btree.rs | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/core/storage/btree.rs b/core/storage/btree.rs
index e8aacab1c..25256d57b 100644
--- a/core/storage/btree.rs
+++ b/core/storage/btree.rs
@@ -950,9 +950,7 @@ impl BTreeCursor {
                 // load sibling pages
                 // start loading right page first
                 let mut pgno: u32 = unsafe { right_pointer.cast::<u32>().read().swap_bytes() };
-                self.write_info
-                    .rightmost_pointer
-                    .replace(Some(right_pointer));
+                write_info.rightmost_pointer.replace(Some(right_pointer));
                 let mut current_sibling = sibling_pointer;
                 for i in (0..=current_sibling).rev() {
                     let page = self.pager.read_page(pgno as usize)?;
@@ -989,8 +989,10 @@ impl BTreeCursor {
                     .pages_to_balance
                     .borrow_mut()
                     .reverse();
-                self.state.write_info().unwrap().state = WriteState::BalanceNonRootWaitLoadPages;
-                return Ok(CursorResult::IO);
+                (
+                    WriteState::BalanceNonRootWaitLoadPages,
+                    Ok(CursorResult::IO),
+                )
             }
             WriteState::BalanceNonRootWaitLoadPages => {
                 let write_info = self.state.write_info().unwrap();
@@ -1323,7 +1323,7 @@ impl BTreeCursor {
                 // Write right pointer in parent page to point to new rightmost page
                 let right_page_id = pages_to_balance_new.last().unwrap().get().id as u32;
-                let rightmost_pointer = self.write_info.rightmost_pointer.borrow_mut().unwrap();
+                let rightmost_pointer = write_info.rightmost_pointer.borrow_mut().unwrap();
                 let rightmost_pointer =
                     unsafe { std::slice::from_raw_parts_mut(rightmost_pointer, 4) };
                 rightmost_pointer[0..4].copy_from_slice(&right_page_id.to_be_bytes());
@@ -1428,8 +1428,7 @@ impl BTreeCursor {
                 // TODO: balance root
                 self.stack.pop();
                 // TODO: free pages
-                write_info.state = WriteState::Finish;
-                return Ok(CursorResult::Ok(()));
+                (WriteState::Finish, Ok(CursorResult::Ok(())))
             }
             WriteState::Finish => todo!(),
         };

From c4c5a74e16218351713549a8b6dfcfc5e68cadd9 Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Tue, 11 Feb 2025 09:20:18 +0100
Subject: [PATCH 07/33] add a bit of testing

---
 core/storage/btree.rs | 549 ++++++++++++++++++++++++++++--------------
 1 file changed, 367 insertions(+), 182 deletions(-)

diff --git a/core/storage/btree.rs b/core/storage/btree.rs
index 25256d57b..957153a8c 100644
--- a/core/storage/btree.rs
+++ b/core/storage/btree.rs
@@ -265,8 +265,8 @@ impl BTreeCursor {
         let cell = contents.cell_get(
             cell_idx,
             self.pager.clone(),
-            self.payload_overflow_threshold_max(contents.page_type()),
-            self.payload_overflow_threshold_min(contents.page_type()),
+            payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16),
+            payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16),
             self.usable_space(),
         )?;

@@ -304,7
+304,7 @@ impl BTreeCursor { let mem_page_rc = self.stack.top(); let cell_idx = self.stack.current_cell_index() as usize; - // debug!("current id={} cell={}", mem_page_rc.get().id, cell_idx); + debug!("current id={} cell={}", mem_page_rc.get().id, cell_idx); return_if_locked!(mem_page_rc); if !mem_page_rc.is_loaded() { self.pager.load_page(mem_page_rc.clone())?; @@ -354,8 +354,8 @@ impl BTreeCursor { let cell = contents.cell_get( cell_idx, self.pager.clone(), - self.payload_overflow_threshold_max(contents.page_type()), - self.payload_overflow_threshold_min(contents.page_type()), + payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), self.usable_space(), )?; match &cell { @@ -475,8 +475,14 @@ impl BTreeCursor { let cell = contents.cell_get( cell_idx, self.pager.clone(), - self.payload_overflow_threshold_max(contents.page_type()), - self.payload_overflow_threshold_min(contents.page_type()), + payload_overflow_threshold_max( + contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + contents.page_type(), + self.usable_space() as u16, + ), self.usable_space(), )?; match &cell { @@ -635,8 +641,14 @@ impl BTreeCursor { match &contents.cell_get( cell_idx, self.pager.clone(), - self.payload_overflow_threshold_max(contents.page_type()), - self.payload_overflow_threshold_min(contents.page_type()), + payload_overflow_threshold_max( + contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + contents.page_type(), + self.usable_space() as u16, + ), self.usable_space(), )? { BTreeCell::TableInteriorCell(TableInteriorCell { @@ -756,7 +768,14 @@ impl BTreeCursor { // insert cell let mut cell_payload: Vec = Vec::new(); - self.fill_cell_payload(page_type, Some(int_key), &mut cell_payload, record); + fill_cell_payload( + page_type, + Some(int_key), + &mut cell_payload, + record, + self.usable_space() as u16, + self.pager.clone(), + ); // insert let overflow = { @@ -804,8 +823,8 @@ impl BTreeCursor { let cell_count = page.cell_count(); let (cell_start, cell_len) = page.cell_get_raw_region( cell_idx, - self.payload_overflow_threshold_max(page.page_type()), - self.payload_overflow_threshold_min(page.page_type()), + payload_overflow_threshold_max(page.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(page.page_type(), self.usable_space() as u16), self.usable_space(), ); free_cell_range( @@ -967,8 +986,14 @@ impl BTreeCursor { pgno = match parent_contents.cell_get( next_cell_divider, self.pager.clone(), - self.payload_overflow_threshold_max(parent_contents.page_type()), - self.payload_overflow_threshold_min(parent_contents.page_type()), + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), self.usable_space(), )? 
{ BTreeCell::TableInteriorCell(table_interior_cell) => { @@ -1032,8 +1057,14 @@ impl BTreeCursor { let cell_idx = first_divider_cell + i - 1; let (cell_start, cell_len) = parent_contents.cell_get_raw_region( cell_idx, - self.payload_overflow_threshold_max(parent_contents.page_type()), - self.payload_overflow_threshold_min(parent_contents.page_type()), + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), self.usable_space(), ); let buf = parent_contents.as_ptr(); @@ -1075,8 +1106,14 @@ impl BTreeCursor { for cell_idx in 0..old_page_contents.cell_count() { let (cell_start, cell_len) = old_page_contents.cell_get_raw_region( cell_idx, - self.payload_overflow_threshold_max(old_page_contents.page_type()), - self.payload_overflow_threshold_min(old_page_contents.page_type()), + payload_overflow_threshold_max( + old_page_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + old_page_contents.page_type(), + self.usable_space() as u16, + ), self.usable_space(), ); let buf = old_page_contents.as_ptr(); @@ -1319,6 +1356,7 @@ impl BTreeCursor { page.get().id = new_id; self.pager.put_loaded_page(new_id, page.clone()); } + dbg!(page.get().id); } // Write right pointer in parent page to point to new rightmost page @@ -1326,7 +1364,9 @@ impl BTreeCursor { let rightmost_pointer = write_info.rightmost_pointer.borrow_mut().unwrap(); let rightmost_pointer = unsafe { std::slice::from_raw_parts_mut(rightmost_pointer, 4) }; + dbg!(&rightmost_pointer); rightmost_pointer[0..4].copy_from_slice(&right_page_id.to_be_bytes()); + dbg!(&rightmost_pointer); // Ensure right-child pointer of the right-most new sibling pge points to the page // that was originally on that place. 
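[Editor's note: across patches 03-07 the balancing control flow settles into a small state machine. The sketch below is a condensed model of the transitions only, under the assumption that the state bodies behave as the hunks above describe; every I/O wait returns `CursorResult::IO` and re-enters the same state on the next call. It is not the crate's code, just a map of it.

    // Condensed sketch; mirrors the WriteState variants used in this series.
    enum WriteState {
        Start,
        BalanceStart,
        BalanceNonRoot,              // pick up to 3 siblings via parent dividers, start reads
        BalanceNonRootWaitLoadPages, // all siblings loaded: redistribute cells
        Finish,
    }

    fn next_state(state: WriteState) -> WriteState {
        match state {
            WriteState::BalanceStart => WriteState::BalanceNonRoot,
            // Collect sibling page numbers from the parent's divider cells and
            // the rightmost pointer, then wait for the page reads to complete.
            WriteState::BalanceNonRoot => WriteState::BalanceNonRootWaitLoadPages,
            // Gather cells (plus dividers for table-interior pages) into the
            // CellArray, repack them left to right, rewrite the dividers in the
            // parent, and fix the rightmost pointer before finishing.
            WriteState::BalanceNonRootWaitLoadPages => WriteState::Finish,
            other => other,
        }
    }

The dividers written back for table leaves are rebuilt as a 4-byte big-endian left-child page number followed by the rowid varint, which is what the `new_divider_cell` hunk in patch 04 does.]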
@@ -1367,8 +1407,45 @@ impl BTreeCursor { // Leaf index new_divider_cell.extend_from_slice(divider_cell); } + dbg!(&new_divider_cell); // FIXME: defragment shouldn't be needed + println!("cells before fragment"); + for cell_idx in 0..parent_contents.cell_count() { + let cell = parent_contents.cell_get( + cell_idx, + self.pager.clone(), + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + )?; + dbg!(cell); + } + println!("cells end"); defragment_page(parent_contents, self.usable_space() as u16); + println!("cells"); + for cell_idx in 0..parent_contents.cell_count() { + let cell = parent_contents.cell_get( + cell_idx, + self.pager.clone(), + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + )?; + dbg!(cell); + } + println!("cells end"); insert_into_cell( parent_contents, &new_divider_cell, @@ -1376,6 +1453,24 @@ impl BTreeCursor { self.usable_space() as u16, ); } + println!("cells"); + for cell_idx in 0..parent_contents.cell_count() { + let cell = parent_contents.cell_get( + cell_idx, + self.pager.clone(), + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + )?; + dbg!(cell); + } + println!("cells end"); // TODO: update pages let mut done = vec![false; sibling_count_new]; for i in (1 as i64 - sibling_count_new as i64)..sibling_count_new as i64 { @@ -1521,151 +1616,10 @@ impl BTreeCursor { /// This marks the page as dirty and writes the page header. fn allocate_page(&self, page_type: PageType, offset: usize) -> PageRef { let page = self.pager.allocate_page().unwrap(); - btree_init_page(&page, page_type, &self.pager.db_header.borrow(), offset); + btree_init_page(&page, page_type, offset, self.usable_space() as u16); page } - /// Allocate a new overflow page. - /// This is done when a cell overflows and new space is needed. - fn allocate_overflow_page(&self) -> PageRef { - let page = self.pager.allocate_page().unwrap(); - - // setup overflow page - let contents = page.get().contents.as_mut().unwrap(); - let buf = contents.as_ptr(); - buf.fill(0); - - page - } - - /// Fill in the cell payload with the record. - /// If the record is too large to fit in the cell, it will spill onto overflow pages. 
- fn fill_cell_payload( - &self, - page_type: PageType, - int_key: Option, - cell_payload: &mut Vec, - record: &Record, - ) { - assert!(matches!( - page_type, - PageType::TableLeaf | PageType::IndexLeaf - )); - // TODO: make record raw from start, having to serialize is not good - let mut record_buf = Vec::new(); - record.serialize(&mut record_buf); - - // fill in header - if matches!(page_type, PageType::TableLeaf) { - let int_key = int_key.unwrap(); - write_varint_to_vec(record_buf.len() as u64, cell_payload); - write_varint_to_vec(int_key, cell_payload); - } else { - write_varint_to_vec(record_buf.len() as u64, cell_payload); - } - - let payload_overflow_threshold_max = self.payload_overflow_threshold_max(page_type.clone()); - debug!( - "fill_cell_payload(record_size={}, payload_overflow_threshold_max={})", - record_buf.len(), - payload_overflow_threshold_max - ); - if record_buf.len() <= payload_overflow_threshold_max { - // enough allowed space to fit inside a btree page - cell_payload.extend_from_slice(record_buf.as_slice()); - return; - } - debug!("fill_cell_payload(overflow)"); - - let payload_overflow_threshold_min = self.payload_overflow_threshold_min(page_type); - // see e.g. https://github.com/sqlite/sqlite/blob/9591d3fe93936533c8c3b0dc4d025ac999539e11/src/dbstat.c#L371 - let mut space_left = payload_overflow_threshold_min - + (record_buf.len() - payload_overflow_threshold_min) % (self.usable_space() - 4); - - if space_left > payload_overflow_threshold_max { - space_left = payload_overflow_threshold_min; - } - - // cell_size must be equal to first value of space_left as this will be the bytes copied to non-overflow page. - let cell_size = space_left + cell_payload.len() + 4; // 4 is the number of bytes of pointer to first overflow page - let mut to_copy_buffer = record_buf.as_slice(); - - let prev_size = cell_payload.len(); - cell_payload.resize(prev_size + space_left + 4, 0); - let mut pointer = unsafe { cell_payload.as_mut_ptr().add(prev_size) }; - let mut pointer_to_next = unsafe { cell_payload.as_mut_ptr().add(prev_size + space_left) }; - let mut overflow_pages = Vec::new(); - - loop { - let to_copy = space_left.min(to_copy_buffer.len()); - unsafe { std::ptr::copy(to_copy_buffer.as_ptr(), pointer, to_copy) }; - - let left = to_copy_buffer.len() - to_copy; - if left == 0 { - break; - } - - // we still have bytes to add, we will need to allocate new overflow page - let overflow_page = self.allocate_overflow_page(); - overflow_pages.push(overflow_page.clone()); - { - let id = overflow_page.get().id as u32; - let contents = overflow_page.get().contents.as_mut().unwrap(); - - // TODO: take into account offset here? - let buf = contents.as_ptr(); - let as_bytes = id.to_be_bytes(); - // update pointer to new overflow page - unsafe { std::ptr::copy(as_bytes.as_ptr(), pointer_to_next, 4) }; - - pointer = unsafe { buf.as_mut_ptr().add(4) }; - pointer_to_next = buf.as_mut_ptr(); - space_left = self.usable_space() - 4; - } - - to_copy_buffer = &to_copy_buffer[to_copy..]; - } - - assert_eq!(cell_size, cell_payload.len()); - } - - /// Returns the maximum payload size (X) that can be stored directly on a b-tree page without spilling to overflow pages. - /// - /// For table leaf pages: X = usable_size - 35 - /// For index pages: X = ((usable_size - 12) * 64/255) - 23 - /// - /// The usable size is the total page size less the reserved space at the end of each page. 
- /// These thresholds are designed to: - /// - Give a minimum fanout of 4 for index b-trees - /// - Ensure enough payload is on the b-tree page that the record header can usually be accessed - /// without consulting an overflow page - fn payload_overflow_threshold_max(&self, page_type: PageType) -> usize { - let usable_size = self.usable_space(); - match page_type { - PageType::IndexInterior | PageType::IndexLeaf => { - ((usable_size - 12) * 64 / 255) - 23 // Index page formula - } - PageType::TableInterior | PageType::TableLeaf => { - usable_size - 35 // Table leaf page formula - } - } - } - - /// Returns the minimum payload size (M) that must be stored on the b-tree page before spilling to overflow pages is allowed. - /// - /// For all page types: M = ((usable_size - 12) * 32/255) - 23 - /// - /// When payload size P exceeds max_local(): - /// - If K = M + ((P-M) % (usable_size-4)) <= max_local(): store K bytes on page - /// - Otherwise: store M bytes on page - /// - /// The remaining bytes are stored on overflow pages in both cases. - fn payload_overflow_threshold_min(&self, _page_type: PageType) -> usize { - let usable_size = self.usable_space(); - // Same formula for all page types - ((usable_size - 12) * 32 / 255) - 23 - } - /// The "usable size" of a database page is the page size specified by the 2-byte integer at offset 16 /// in the header, minus the "reserved" space size recorded in the 1-byte integer at offset 20 in the header. /// The usable size of a page might be an odd number. However, the usable size is not allowed to be less than 480. @@ -1685,8 +1639,8 @@ impl BTreeCursor { .cell_get( cell_idx, self.pager.clone(), - self.payload_overflow_threshold_max(page.page_type()), - self.payload_overflow_threshold_min(page.page_type()), + payload_overflow_threshold_max(page.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(page.page_type(), self.usable_space() as u16), self.usable_space(), ) .unwrap() @@ -1957,8 +1911,8 @@ impl BTreeCursor { let equals = match &contents.cell_get( cell_idx, self.pager.clone(), - self.payload_overflow_threshold_max(contents.page_type()), - self.payload_overflow_threshold_min(contents.page_type()), + payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), self.usable_space(), )? { BTreeCell::TableLeafCell(l) => l._rowid == int_key, @@ -2073,11 +2027,11 @@ impl PageStack { /// Push a new page onto the stack. /// This effectively means traversing to a child page. fn push(&self, page: PageRef) { - // debug!( - // "pagestack::push(current={}, new_page_id={})", - // self.current_page.borrow(), - // page.get().id - // ); + debug!( + "pagestack::push(current={}, new_page_id={})", + self.current_page.borrow(), + page.get().id + ); *self.current_page.borrow_mut() += 1; let current = *self.current_page.borrow(); assert!( @@ -2092,7 +2046,7 @@ impl PageStack { /// This effectively means traversing back up to a parent page. 
fn pop(&self) { let current = *self.current_page.borrow(); - // debug!("pagestack::pop(current={})", current); + debug!("pagestack::pop(current={})", current); self.cell_indices.borrow_mut()[current as usize] = 0; self.stack.borrow_mut()[current as usize] = None; *self.current_page.borrow_mut() -= 1; @@ -2202,12 +2156,7 @@ fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> u } } -pub fn btree_init_page( - page: &PageRef, - page_type: PageType, - db_header: &DatabaseHeader, - offset: usize, -) { +pub fn btree_init_page(page: &PageRef, page_type: PageType, offset: usize, usable_space: u16) { // setup btree page let contents = page.get(); debug!("btree_init_page(id={}, offset={})", contents.id, offset); @@ -2218,12 +2167,7 @@ pub fn btree_init_page( contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); - let cell_content_area_start = db_header.page_size - db_header.reserved_space as u16; - - contents.write_u16( - PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, - cell_content_area_start, - ); + contents.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, usable_space); contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, 0); @@ -2427,6 +2371,7 @@ fn free_cell_range(page: &mut PageContent, offset: u16, len: u16, usable_space: /// Defragment a page. This means packing all the cells to the end of the page. fn defragment_page(page: &PageContent, usable_space: u16) { + // TODO: test this log::debug!("defragment_page"); let cloned_page = page.clone(); // TODO(pere): usable space should include offset probably @@ -2689,6 +2634,147 @@ fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) - top as u16 } +/// Fill in the cell payload with the record. +/// If the record is too large to fit in the cell, it will spill onto overflow pages. +fn fill_cell_payload( + page_type: PageType, + int_key: Option, + cell_payload: &mut Vec, + record: &OwnedRecord, + usable_space: u16, + pager: Rc, +) { + assert!(matches!( + page_type, + PageType::TableLeaf | PageType::IndexLeaf + )); + // TODO: make record raw from start, having to serialize is not good + let mut record_buf = Vec::new(); + record.serialize(&mut record_buf); + + // fill in header + if matches!(page_type, PageType::TableLeaf) { + let int_key = int_key.unwrap(); + write_varint_to_vec(record_buf.len() as u64, cell_payload); + write_varint_to_vec(int_key, cell_payload); + } else { + write_varint_to_vec(record_buf.len() as u64, cell_payload); + } + + let payload_overflow_threshold_max = + payload_overflow_threshold_max(page_type.clone(), usable_space); + debug!( + "fill_cell_payload(record_size={}, payload_overflow_threshold_max={})", + record_buf.len(), + payload_overflow_threshold_max + ); + if record_buf.len() <= payload_overflow_threshold_max { + // enough allowed space to fit inside a btree page + cell_payload.extend_from_slice(record_buf.as_slice()); + return; + } + debug!("fill_cell_payload(overflow)"); + + let payload_overflow_threshold_min = payload_overflow_threshold_min(page_type, usable_space); + // see e.g. 
https://github.com/sqlite/sqlite/blob/9591d3fe93936533c8c3b0dc4d025ac999539e11/src/dbstat.c#L371
+    let mut space_left = payload_overflow_threshold_min
+        + (record_buf.len() - payload_overflow_threshold_min) % (usable_space as usize - 4);
+
+    if space_left > payload_overflow_threshold_max {
+        space_left = payload_overflow_threshold_min;
+    }
+
+    // cell_size must be equal to first value of space_left as this will be the bytes copied to non-overflow page.
+    let cell_size = space_left + cell_payload.len() + 4; // 4 is the number of bytes of pointer to first overflow page
+    let mut to_copy_buffer = record_buf.as_slice();
+
+    let prev_size = cell_payload.len();
+    cell_payload.resize(prev_size + space_left + 4, 0);
+    let mut pointer = unsafe { cell_payload.as_mut_ptr().add(prev_size) };
+    let mut pointer_to_next = unsafe { cell_payload.as_mut_ptr().add(prev_size + space_left) };
+    let mut overflow_pages = Vec::new();
+
+    loop {
+        let to_copy = space_left.min(to_copy_buffer.len());
+        unsafe { std::ptr::copy(to_copy_buffer.as_ptr(), pointer, to_copy) };
+
+        let left = to_copy_buffer.len() - to_copy;
+        if left == 0 {
+            break;
+        }
+
+        // we still have bytes to add, we will need to allocate new overflow page
+        let overflow_page = allocate_overflow_page(pager.clone());
+        overflow_pages.push(overflow_page.clone());
+        {
+            let id = overflow_page.get().id as u32;
+            let contents = overflow_page.get().contents.as_mut().unwrap();
+
+            // TODO: take into account offset here?
+            let buf = contents.as_ptr();
+            let as_bytes = id.to_be_bytes();
+            // update pointer to new overflow page
+            unsafe { std::ptr::copy(as_bytes.as_ptr(), pointer_to_next, 4) };
+
+            pointer = unsafe { buf.as_mut_ptr().add(4) };
+            pointer_to_next = buf.as_mut_ptr();
+            space_left = usable_space as usize - 4;
+        }
+
+        to_copy_buffer = &to_copy_buffer[to_copy..];
+    }
+
+    assert_eq!(cell_size, cell_payload.len());
+}
+
+/// Allocate a new overflow page.
+/// This is done when a cell overflows and new space is needed.
+fn allocate_overflow_page(pager: Rc<Pager>) -> PageRef {
+    let page = pager.allocate_page().unwrap();
+
+    // setup overflow page
+    let contents = page.get().contents.as_mut().unwrap();
+    let buf = contents.as_ptr();
+    buf.fill(0);
+
+    page
+}
+
+/// Returns the maximum payload size (X) that can be stored directly on a b-tree page without spilling to overflow pages.
+///
+/// For table leaf pages: X = usable_size - 35
+/// For index pages: X = ((usable_size - 12) * 64/255) - 23
+///
+/// The usable size is the total page size less the reserved space at the end of each page.
+/// These thresholds are designed to:
+/// - Give a minimum fanout of 4 for index b-trees
+/// - Ensure enough payload is on the b-tree page that the record header can usually be accessed
+///   without consulting an overflow page
+fn payload_overflow_threshold_max(page_type: PageType, usable_space: u16) -> usize {
+    match page_type {
+        PageType::IndexInterior | PageType::IndexLeaf => {
+            ((usable_space as usize - 12) * 64 / 255) - 23 // Index page formula
+        }
+        PageType::TableInterior | PageType::TableLeaf => {
+            usable_space as usize - 35 // Table leaf page formula
+        }
+    }
+}
+
+/// Returns the minimum payload size (M) that must be stored on the b-tree page before spilling to overflow pages is allowed.
+/// +/// For all page types: M = ((usable_size - 12) * 32/255) - 23 +/// +/// When payload size P exceeds max_local(): +/// - If K = M + ((P-M) % (usable_size-4)) <= max_local(): store K bytes on page +/// - Otherwise: store M bytes on page +/// +/// The remaining bytes are stored on overflow pages in both cases. +fn payload_overflow_threshold_min(_page_type: PageType, usable_space: u16) -> usize { + // Same formula for all page types + ((usable_space as usize - 12) * 32 / 255) - 23 +} + #[cfg(test)] mod tests { use rand_chacha::rand_core::RngCore; @@ -2702,8 +2788,107 @@ mod tests { use crate::storage::sqlite3_ondisk; use crate::{BufferPool, DatabaseStorage, WalFile, WalFileShared, WriteCompletion}; use std::cell::RefCell; + use std::rc::Rc; use std::sync::Arc; + use tempfile::TempDir; + + use crate::{ + io::BufferData, + storage::{ + btree::{ + compute_free_space, fill_cell_payload, payload_overflow_threshold_max, + payload_overflow_threshold_min, + }, + pager::PageRef, + sqlite3_ondisk::{BTreeCell, PageContent, PageType}, + }, + types::{OwnedValue, Record}, + Database, Page, Pager, PlatformIO, + }; + + use super::{btree_init_page, insert_into_cell}; + + fn get_page(id: usize) -> PageRef { + let page = Arc::new(Page::new(2)); + + let drop_fn = Rc::new(|_| {}); + let inner = PageContent { + offset: 0, + buffer: Rc::new(RefCell::new(Buffer::new( + BufferData::new(vec![0; 4096]), + drop_fn, + ))), + overflow_cells: Vec::new(), + }; + page.get().contents.replace(inner); + + btree_init_page(&page, PageType::TableLeaf, 0, 4096); + page + } + + fn get_database() -> Arc { + let mut path = TempDir::new().unwrap().into_path(); + path.push("test.db"); + { + let connection = rusqlite::Connection::open(&path).unwrap(); + connection + .pragma_update(None, "journal_mode", "wal") + .unwrap(); + } + let io: Arc = Arc::new(PlatformIO::new().unwrap()); + let db = Database::open_file(io.clone(), path.to_str().unwrap()).unwrap(); + + db + } + + #[test] + fn test_insert_cell() { + let db = get_database(); + let page = get_page(2); + let page = page.get_contents(); + let header_size = 8; + let record = OwnedRecord::new([OwnedValue::Integer(1)].to_vec()); + let mut payload: Vec = Vec::new(); + fill_cell_payload( + page.page_type(), + Some(1), + &mut payload, + &record, + 4096, + db.pager.clone(), + ); + insert_into_cell(page, &payload, 0, 4096); + assert_eq!(page.cell_count(), 1); + let free = compute_free_space(page, 4096); + assert_eq!(free, 4096 - payload.len() as u16 - 2 - header_size); + + let cell = page.cell_get_raw_region( + 0, + payload_overflow_threshold_max(page.page_type(), 4096), + payload_overflow_threshold_min(page.page_type(), 4096), + 4096, + ); + let buf = &page.as_ptr()[cell.0..cell.0 + cell.1]; + assert_eq!(buf.len(), payload.len()); + assert_eq!(buf, &payload); + } + + #[test] + fn test_multiple_insert_cell() { + let page = get_page(2); + let page = page.get_contents(); + let header_size = 8; + let mut payload = Vec::new(); + for i in 0..16 { + payload.push(i); + } + insert_into_cell(page, &payload, 0, 4096); + assert_eq!(page.cell_count(), 1); + let free = compute_free_space(page, 4096); + assert_eq!(free, 4096 - payload.len() as u16 - 2 - header_size); + } + fn validate_btree(pager: Rc, page_idx: usize) -> (usize, bool) { let cursor = BTreeCursor::new(pager.clone(), page_idx); let page = pager.read_page(page_idx).unwrap(); From 48f0fe0904634e08569ff879781cf9091040a512 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 11 Feb 2025 09:26:32 +0100 Subject: [PATCH 08/33] multiple cell 
insert test --- core/storage/btree.rs | 65 ++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 957153a8c..ae17942ab 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2842,13 +2842,19 @@ mod tests { db } - #[test] - fn test_insert_cell() { - let db = get_database(); - let page = get_page(2); - let page = page.get_contents(); - let header_size = 8; - let record = OwnedRecord::new([OwnedValue::Integer(1)].to_vec()); + fn ensure_cell(page: &mut PageContent, cell_idx: usize, payload: &Vec) { + let cell = page.cell_get_raw_region( + cell_idx, + payload_overflow_threshold_max(page.page_type(), 4096), + payload_overflow_threshold_min(page.page_type(), 4096), + 4096, + ); + let buf = &page.as_ptr()[cell.0..cell.0 + cell.1]; + assert_eq!(buf.len(), payload.len()); + assert_eq!(buf, payload); + } + + fn add_record(page: &mut PageContent, record: OwnedRecord, db: &Arc) -> Vec { let mut payload: Vec = Vec::new(); fill_cell_payload( page.page_type(), @@ -2859,34 +2865,47 @@ mod tests { db.pager.clone(), ); insert_into_cell(page, &payload, 0, 4096); + payload + } + + #[test] + fn test_insert_cell() { + let db = get_database(); + let page = get_page(2); + let page = page.get_contents(); + let header_size = 8; + let record = OwnedRecord::new([OwnedValue::Integer(1)].to_vec()); + let payload = add_record(page, record, &db); assert_eq!(page.cell_count(), 1); let free = compute_free_space(page, 4096); assert_eq!(free, 4096 - payload.len() as u16 - 2 - header_size); - let cell = page.cell_get_raw_region( - 0, - payload_overflow_threshold_max(page.page_type(), 4096), - payload_overflow_threshold_min(page.page_type(), 4096), - 4096, - ); - let buf = &page.as_ptr()[cell.0..cell.0 + cell.1]; - assert_eq!(buf.len(), payload.len()); - assert_eq!(buf, &payload); + let cell_idx = 0; + ensure_cell(page, cell_idx, &payload); } #[test] fn test_multiple_insert_cell() { + let db = get_database(); let page = get_page(2); let page = page.get_contents(); let header_size = 8; - let mut payload = Vec::new(); - for i in 0..16 { - payload.push(i); + + let mut total_size = 0; + let mut payloads = Vec::new(); + for i in 0..10 { + let record = OwnedRecord::new([OwnedValue::Integer(1)].to_vec()); + let payload = add_record(page, record, &db); + assert_eq!(page.cell_count(), i + 1); + let free = compute_free_space(page, 4096); + total_size += payload.len() as u16 + 2; + assert_eq!(free, 4096 - total_size - header_size); + payloads.push(payload); + } + + for (i, payload) in payloads.iter().enumerate() { + ensure_cell(page, i, payload); } - insert_into_cell(page, &payload, 0, 4096); - assert_eq!(page.cell_count(), 1); - let free = compute_free_space(page, 4096); - assert_eq!(free, 4096 - payload.len() as u16 - 2 - header_size); } fn validate_btree(pager: Rc, page_idx: usize) -> (usize, bool) { From 0aa70929f2fb401346c0d00e8f68fc4cbcc84555 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 11 Feb 2025 09:26:46 +0100 Subject: [PATCH 09/33] fix usable size btree init --- core/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/lib.rs b/core/lib.rs index 00889db98..3062f1137 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -229,8 +229,8 @@ pub fn maybe_init_database_file(file: &Rc, io: &Arc) -> Result btree_init_page( &page1, storage::sqlite3_ondisk::PageType::TableLeaf, - &db_header, DATABASE_HEADER_SIZE, + db_header.page_size - db_header.reserved_space as u16, ); let contents = 
page1.get().contents.as_mut().unwrap(); From 44857cdd17335a649d43540028df2dda98c7fe98 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 11 Feb 2025 09:30:55 +0100 Subject: [PATCH 10/33] fix insert test --- core/storage/btree.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index ae17942ab..79f0e5166 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2854,17 +2854,23 @@ mod tests { assert_eq!(buf, payload); } - fn add_record(page: &mut PageContent, record: OwnedRecord, db: &Arc) -> Vec { + fn add_record( + id: usize, + pos: usize, + page: &mut PageContent, + record: OwnedRecord, + db: &Arc, + ) -> Vec { let mut payload: Vec = Vec::new(); fill_cell_payload( page.page_type(), - Some(1), + Some(id as u64), &mut payload, &record, 4096, db.pager.clone(), ); - insert_into_cell(page, &payload, 0, 4096); + insert_into_cell(page, &payload, pos, 4096); payload } @@ -2875,7 +2881,7 @@ mod tests { let page = page.get_contents(); let header_size = 8; let record = OwnedRecord::new([OwnedValue::Integer(1)].to_vec()); - let payload = add_record(page, record, &db); + let payload = add_record(1, 0, page, record, &db); assert_eq!(page.cell_count(), 1); let free = compute_free_space(page, 4096); assert_eq!(free, 4096 - payload.len() as u16 - 2 - header_size); @@ -2894,8 +2900,8 @@ mod tests { let mut total_size = 0; let mut payloads = Vec::new(); for i in 0..10 { - let record = OwnedRecord::new([OwnedValue::Integer(1)].to_vec()); - let payload = add_record(page, record, &db); + let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); + let payload = add_record(i, i, page, record, &db); assert_eq!(page.cell_count(), i + 1); let free = compute_free_space(page, 4096); total_size += payload.len() as u16 + 2; From 0274f74f323a63b37b94605dac9d6c6efe780e81 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 12 Feb 2025 21:25:55 +0100 Subject: [PATCH 11/33] fix drop cell and add some tests --- core/storage/btree.rs | 172 +++++++++++++++++++++++++++------ core/storage/sqlite3_ondisk.rs | 3 +- 2 files changed, 146 insertions(+), 29 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 79f0e5166..1ee2a326a 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -817,25 +817,6 @@ impl BTreeCursor { return ret; } - /// Drop a cell from a page. - /// This is done by freeing the range of bytes that the cell occupies. - fn drop_cell(&self, page: &mut PageContent, cell_idx: usize) { - let cell_count = page.cell_count(); - let (cell_start, cell_len) = page.cell_get_raw_region( - cell_idx, - payload_overflow_threshold_max(page.page_type(), self.usable_space() as u16), - payload_overflow_threshold_min(page.page_type(), self.usable_space() as u16), - self.usable_space(), - ); - free_cell_range( - page, - cell_start as u16, - cell_len as u16, - self.usable_space() as u16, - ); - page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1); - } - /// Balance a leaf page. /// Balancing is done when a page overflows. /// see e.g. 
https://en.wikipedia.org/wiki/B-tree @@ -1075,7 +1056,7 @@ impl BTreeCursor { .divider_cells .borrow_mut() .push(cell_buf.to_vec()); - self.drop_cell(parent_contents, cell_idx); + drop_cell(parent_contents, cell_idx, self.usable_space() as u16); } assert_eq!( write_info.divider_cells.borrow().len(), @@ -2775,6 +2756,35 @@ fn payload_overflow_threshold_min(_page_type: PageType, usable_space: u16) -> us ((usable_space as usize - 12) * 32 / 255) - 23 } +/// Drop a cell from a page. +/// This is done by freeing the range of bytes that the cell occupies. +fn drop_cell(page: &mut PageContent, cell_idx: usize, usable_space: u16) { + let (cell_start, cell_len) = page.cell_get_raw_region( + cell_idx, + payload_overflow_threshold_max(page.page_type(), usable_space), + payload_overflow_threshold_min(page.page_type(), usable_space), + usable_space as usize, + ); + free_cell_range(page, cell_start as u16, cell_len as u16, usable_space); + if page.cell_count() > 0 { + shift_pointers_left(page, cell_idx); + } + page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1); +} + +/// Shift pointers to the left once starting from a cell position +/// This is useful when we remove a cell and we want to move left the cells from the right to fill +/// the empty space that's not needed +fn shift_pointers_left(page: &mut PageContent, cell_idx: usize) { + assert!(page.cell_count() > 0); + let buf = page.as_ptr(); + let (start, _) = page.cell_pointer_array_offset_and_size(); + let start = start + (cell_idx * 2) + 2; + let right_cells = page.cell_count() - cell_idx - 1; + let amount_to_shift = right_cells * 2; + buf.copy_within(start..start + amount_to_shift, start - 2); +} + #[cfg(test)] mod tests { use rand_chacha::rand_core::RngCore; @@ -2790,6 +2800,7 @@ mod tests { use std::cell::RefCell; use std::rc::Rc; use std::sync::Arc; + use std::{cell::RefCell, panic, rc::Rc, sync::Arc}; use tempfile::TempDir; @@ -2807,7 +2818,7 @@ mod tests { Database, Page, Pager, PlatformIO, }; - use super::{btree_init_page, insert_into_cell}; + use super::{btree_init_page, defragment_page, drop_cell, insert_into_cell}; fn get_page(id: usize) -> PageRef { let page = Arc::new(Page::new(2)); @@ -2849,6 +2860,7 @@ mod tests { payload_overflow_threshold_min(page.page_type(), 4096), 4096, ); + log::trace!("cell idx={} start={} len={}", cell_idx, cell.0, cell.1); let buf = &page.as_ptr()[cell.0..cell.0 + cell.1]; assert_eq!(buf.len(), payload.len()); assert_eq!(buf, payload); @@ -2890,27 +2902,41 @@ mod tests { ensure_cell(page, cell_idx, &payload); } + struct Cell { + pos: usize, + payload: Vec, + } + #[test] - fn test_multiple_insert_cell() { + fn test_drop_1() { + set_breakpoint_panic(); let db = get_database(); + let page = get_page(2); let page = page.get_contents(); let header_size = 8; let mut total_size = 0; - let mut payloads = Vec::new(); - for i in 0..10 { + let mut cells = Vec::new(); + let usable_space = 4096; + for i in 0..3 { let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); let payload = add_record(i, i, page, record, &db); assert_eq!(page.cell_count(), i + 1); - let free = compute_free_space(page, 4096); + let free = compute_free_space(page, usable_space); total_size += payload.len() as u16 + 2; assert_eq!(free, 4096 - total_size - header_size); - payloads.push(payload); + cells.push(Cell { pos: i, payload }); } - for (i, payload) in payloads.iter().enumerate() { - ensure_cell(page, i, payload); + for (i, cell) in cells.iter().enumerate() { + ensure_cell(page, i, &cell.payload); + } + 
cells.remove(1); + drop_cell(page, 1, usable_space); + + for (i, cell) in cells.iter().enumerate() { + ensure_cell(page, i, &cell.payload); } } @@ -3189,6 +3215,46 @@ mod tests { key ); } + #[test] + fn test_drop_odd() { + set_breakpoint_panic(); + let db = get_database(); + + let page = get_page(2); + let page = page.get_contents(); + let header_size = 8; + + let mut total_size = 0; + let mut cells = Vec::new(); + let usable_space = 4096; + let total_cells = 10; + for i in 0..total_cells { + let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); + let payload = add_record(i, i, page, record, &db); + assert_eq!(page.cell_count(), i + 1); + let free = compute_free_space(page, usable_space); + total_size += payload.len() as u16 + 2; + assert_eq!(free, 4096 - total_size - header_size); + cells.push(Cell { pos: i, payload }); + } + + let mut removed = 0; + let mut new_cells = Vec::new(); + for cell in cells { + if cell.pos % 2 == 1 { + drop_cell(page, cell.pos - removed, usable_space); + removed += 1; + } else { + new_cells.push(cell); + } + } + let cells = new_cells; + for (i, cell) in cells.iter().enumerate() { + ensure_cell(page, i, &cell.payload); + } + + for (i, cell) in cells.iter().enumerate() { + ensure_cell(page, i, &cell.payload); } } @@ -3416,4 +3482,54 @@ mod tests { Ok(()) } + #[test] + fn test_defragment() { + set_breakpoint_panic(); + let db = get_database(); + + let page = get_page(2); + let page = page.get_contents(); + let header_size = 8; + + let mut total_size = 0; + let mut cells = Vec::new(); + let usable_space = 4096; + for i in 0..3 { + let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); + let payload = add_record(i, i, page, record, &db); + assert_eq!(page.cell_count(), i + 1); + let free = compute_free_space(page, usable_space); + total_size += payload.len() as u16 + 2; + assert_eq!(free, 4096 - total_size - header_size); + cells.push(Cell { pos: i, payload }); + } + + for (i, cell) in cells.iter().enumerate() { + ensure_cell(page, i, &cell.payload); + } + cells.remove(1); + drop_cell(page, 1, usable_space); + + for (i, cell) in cells.iter().enumerate() { + ensure_cell(page, i, &cell.payload); + } + + defragment_page(page, usable_space); + + for (i, cell) in cells.iter().enumerate() { + ensure_cell(page, i, &cell.payload); + } + } + + fn set_breakpoint_panic() { + // Set custom panic hook at start of program + panic::set_hook(Box::new(|panic_info| { + unsafe { + std::arch::asm!("brk #0"); + } + + // Optionally print the panic info + eprintln!("Panic occurred: {:?}", panic_info); + })); + } } diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index c08333aef..ddd36decf 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -579,7 +579,7 @@ impl PageContent { (self.offset + header_size, self.cell_pointer_array_size()) } - /* Get region of a cell's payload */ + /// Get region of a cell's payload pub fn cell_get_raw_region( &self, idx: usize, @@ -893,6 +893,7 @@ fn read_payload(unread: &[u8], payload_size: usize, pager: Rc) -> (Vec 0); let page; loop { + // FIXME(pere): this looks terrible, what did i do lmao let page_ref = pager.read_page(next_overflow as usize); if let Ok(p) = page_ref { page = p; From ea77902e24d8182a58d7cff96aa9cf0dae8d932f Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 12 Feb 2025 21:29:10 +0100 Subject: [PATCH 12/33] add defragment test after deletion of odd cells Signed-off-by: Pere Diaz Bou --- core/storage/btree.rs | 46 
++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 1ee2a326a..4d4f57c00 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2427,7 +2427,6 @@ fn defragment_page(page: &PageContent, usable_space: u16) { // return SQLITE_CORRUPT_PAGE(pPage); // } assert!(cbrk >= first_cell); - dbg!(cbrk, first_cell); let write_buf = page.as_ptr(); // set new first byte of cell content @@ -3521,6 +3520,51 @@ mod tests { } } + #[test] + fn test_drop_odd_with_defragment() { + set_breakpoint_panic(); + let db = get_database(); + + let page = get_page(2); + let page = page.get_contents(); + let header_size = 8; + + let mut total_size = 0; + let mut cells = Vec::new(); + let usable_space = 4096; + let total_cells = 10; + for i in 0..total_cells { + let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); + let payload = add_record(i, i, page, record, &db); + assert_eq!(page.cell_count(), i + 1); + let free = compute_free_space(page, usable_space); + total_size += payload.len() as u16 + 2; + assert_eq!(free, 4096 - total_size - header_size); + cells.push(Cell { pos: i, payload }); + } + + let mut removed = 0; + let mut new_cells = Vec::new(); + for cell in cells { + if cell.pos % 2 == 1 { + drop_cell(page, cell.pos - removed, usable_space); + removed += 1; + } else { + new_cells.push(cell); + } + } + let cells = new_cells; + for (i, cell) in cells.iter().enumerate() { + ensure_cell(page, i, &cell.payload); + } + + defragment_page(page, usable_space); + + for (i, cell) in cells.iter().enumerate() { + ensure_cell(page, i, &cell.payload); + } + } + fn set_breakpoint_panic() { // Set custom panic hook at start of program panic::set_hook(Box::new(|panic_info| { From 78e7364f452b314d08424294cc9135ac31d0baf3 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 12 Feb 2025 22:56:41 +0100 Subject: [PATCH 13/33] fuzz test --- core/storage/btree.rs | 62 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 4d4f57c00..1f8ca12be 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2801,6 +2801,8 @@ mod tests { use std::sync::Arc; use std::{cell::RefCell, panic, rc::Rc, sync::Arc}; + use rand::{thread_rng, Rng, SeedableRng}; + use rand_chacha::ChaCha8Rng; use tempfile::TempDir; use crate::{ @@ -3565,6 +3567,66 @@ mod tests { } } + #[test] + fn test_fuzz_drop_defragment_insert() { + set_breakpoint_panic(); + let db = get_database(); + + let page = get_page(2); + let page = page.get_contents(); + let header_size = 8; + + let mut total_size = 0; + let mut cells = Vec::new(); + let usable_space = 4096; + let total_cells = 10; + let mut ticks = 0; + let mut rng = ChaCha8Rng::seed_from_u64(0); + while ticks > 0 { + ticks -= 1; + } + let mut total_size = 0; + for i in 0..total_cells { + match rng.gen_range(0..3) { + 0 => { + // allow appends with extra place to insert + let cell_idx = rng.gen_range(0..=page.cell_count()); + let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); + let payload = add_record(i, cell_idx, page, record, &db); + let free = compute_free_space(page, usable_space); + if (free as usize) < payload.len() - 2 { + // do not try to insert overflow pages because they require balancing + continue; + } + assert!(page.overflow_cells.is_empty()); + total_size += payload.len() as u16 + 2; + cells.push(Cell { pos: i, payload }); + } + 1 => { + if page.cell_count() == 0 
{ + continue; + } + let cell_idx = rng.gen_range(0..page.cell_count()); + let (_, len) = page.cell_get_raw_region( + cell_idx, + payload_overflow_threshold_max(page.page_type(), 4096), + payload_overflow_threshold_min(page.page_type(), 4096), + usable_space as usize, + ); + drop_cell(page, cell_idx, usable_space); + total_size -= len as u16 + 2; + cells.remove(cell_idx); + } + 2 => { + defragment_page(page, usable_space); + } + _ => unreachable!(), + } + let free = compute_free_space(page, usable_space); + assert_eq!(free, 4096 - total_size - header_size); + } + } + fn set_breakpoint_panic() { // Set custom panic hook at start of program panic::set_hook(Box::new(|panic_info| { From 5ff66b8c621e3b05328b6ad3870568ce16b7c047 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 13 Feb 2025 10:33:21 +0100 Subject: [PATCH 14/33] fix reset content area in drop_cell --- core/storage/btree.rs | 181 +++++++++++++++++++++++++++++++++--------- 1 file changed, 145 insertions(+), 36 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 1f8ca12be..59ecb6540 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1056,6 +1056,11 @@ impl BTreeCursor { .divider_cells .borrow_mut() .push(cell_buf.to_vec()); + log::trace!( + "dropping divider cell from parent cell_idx={} count={}", + cell_idx, + parent_contents.cell_count() + ); drop_cell(parent_contents, cell_idx, self.usable_space() as u16); } assert_eq!( @@ -1217,7 +1222,6 @@ impl BTreeCursor { // Now try to take from the right if we didn't have enough while cell_array.number_of_cells_per_page[i] < cell_array.cells.len() as u16 { - println!("moving left {}", i); let size_of_cell_to_remove_from_right = 2 + cell_array.cells[cell_array.cell_count(i)].len() as u16; let can_take = new_page_sizes[i] + size_of_cell_to_remove_from_right @@ -1314,6 +1318,9 @@ impl BTreeCursor { } ); } + for n in cell_array.number_of_cells_per_page.iter().enumerate() { + println!("end count page={}, n={}", n.0, n.1); + } // Allocate pages or set dirty if not needed for i in 0..sibling_count_new { @@ -1408,25 +1415,6 @@ impl BTreeCursor { dbg!(cell); } println!("cells end"); - defragment_page(parent_contents, self.usable_space() as u16); - println!("cells"); - for cell_idx in 0..parent_contents.cell_count() { - let cell = parent_contents.cell_get( - cell_idx, - self.pager.clone(), - payload_overflow_threshold_max( - parent_contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - parent_contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - dbg!(cell); - } - println!("cells end"); insert_into_cell( parent_contents, &new_divider_cell, @@ -2111,30 +2099,45 @@ impl CellArray { } } +/// Try to find a free block available and allocate it if found fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> usize { // NOTE: freelist is in ascending order of keys and pc // unuse_space is reserved bytes at the end of page, therefore we must substract from maxpc let mut pc = page_ref.first_freeblock() as usize; + let mut prev_pc = page_ref.offset + PAGE_HEADER_OFFSET_FIRST_FREEBLOCK; let buf = page_ref.as_ptr(); let usable_space = usable_space as usize; let maxpc = usable_space - amount; - let mut found = false; while pc <= maxpc { let next = u16::from_be_bytes(buf[pc..pc + 2].try_into().unwrap()); let size = u16::from_be_bytes(buf[pc + 2..pc + 4].try_into().unwrap()); if amount <= size as usize { - found = true; - break; + if amount == size as usize { + // delete whole 
thing + page_ref.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, next); + } else { + // take only the part we are interested in by reducing the size + let new_size = size - amount as u16; + // size includes 4 bytes of freeblock + // we need to leave the free block at least + if new_size >= 4 { + buf[pc + 2..pc + 4].copy_from_slice(&new_size.to_be_bytes()); + } else { + // increase fragment size and delete entry from free list + buf[prev_pc..prev_pc + 2].copy_from_slice(&next.to_be_bytes()); + let frag = page_ref.num_frag_free_bytes() + new_size as u8; + page_ref.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, frag); + } + pc += new_size as usize; + return pc; + } } + prev_pc = pc; pc = next as usize; } - if !found { - 0 - } else { - pc - } + 0 } pub fn btree_init_page(page: &PageRef, page_type: PageType, offset: usize, usable_space: u16) { @@ -2166,6 +2169,13 @@ pub fn edit_page( cell_array: &CellArray, usable_space: u16, ) { + log::trace!( + "edit_page start_old_cells={} start_new_cells={} number_new_cells={} cell_array={}", + start_old_cells, + start_new_cells, + number_new_cells, + cell_array.cells.len() + ); let end_old_cells = start_old_cells + page.cell_count() + page.overflow_cells.len(); let end_new_cells = start_new_cells + number_new_cells; let mut count_cells = page.cell_count(); @@ -2326,13 +2336,15 @@ fn free_cell_range(page: &mut PageContent, offset: u16, len: u16, usable_space: let mut pc = first_block; let mut prev = first_block; - while pc <= maxpc && pc < offset { + dbg!(pc, prev, offset); + while pc <= maxpc && pc < offset && pc != 0 { let next = page.read_u16(pc as usize); prev = pc; pc = next; + dbg!(pc, prev); } - if pc >= maxpc { + if pc == 0 || pc >= maxpc { // insert into tail let offset = offset as usize; let prev = prev as usize; @@ -2479,9 +2491,6 @@ fn insert_into_cell(page: &mut PageContent, payload: &[u8], cell_idx: usize, usa // ...and insert new cell pointer at the current index page.write_u16(cell_pointer_cur_idx - page.offset, new_cell_data_pointer); - // update first byte of content area (cell data always appended to the left, so cell content area pointer moves to point to the new cell data) - page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, new_cell_data_pointer); - // update cell count let new_n_cells = (page.cell_count() + 1) as u16; page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, new_n_cells); @@ -2550,6 +2559,7 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { .unwrap(), ) as usize; // next 2 bytes in freeblock = size of current freeblock free_space_bytes += size; + dbg!(cur_freeblock_ptr, next, size); // Freeblocks are in order from left to right on the page, // so next pointer should > current pointer + its size, or 0 if no next block exists. 
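        // Illustration (numbers assumed, not from the patch): a freeblock at offset
        // 3000 with size 100 can only be followed by next == 0 or next > 3103; any
        // smaller non-zero next either overlaps this block or leaves a gap below the
        // 4-byte freeblock minimum that free_cell_range would have coalesced, so the
        // check below treats the page as corrupt.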
if next <= cur_freeblock_ptr + size + 3 { @@ -2765,8 +2775,11 @@ fn drop_cell(page: &mut PageContent, cell_idx: usize, usable_space: u16) { usable_space as usize, ); free_cell_range(page, cell_start as u16, cell_len as u16, usable_space); - if page.cell_count() > 0 { + if page.cell_count() > 1 { shift_pointers_left(page, cell_idx); + } else { + page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, usable_space); + page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); } page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1); } @@ -2815,8 +2828,8 @@ mod tests { pager::PageRef, sqlite3_ondisk::{BTreeCell, PageContent, PageType}, }, - types::{OwnedValue, Record}, - Database, Page, Pager, PlatformIO, + types::{LimboText, OwnedRecord, OwnedValue, Record}, + Buffer, Database, Page, Pager, PlatformIO, Value, DATABASE_VERSION, IO, }; use super::{btree_init_page, defragment_page, drop_cell, insert_into_cell}; @@ -3627,6 +3640,102 @@ mod tests { } } + #[test] + fn test_defragment_1() { + let db = get_database(); + + let page = get_page(2); + let page = page.get_contents(); + let usable_space = 4096; + + let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let payload = add_record(0, 0, page, record, &db); + + assert_eq!(page.cell_count(), 1); + defragment_page(page, usable_space); + assert_eq!(page.cell_count(), 1); + let (start, len) = page.cell_get_raw_region( + 0, + payload_overflow_threshold_max(page.page_type(), 4096), + payload_overflow_threshold_min(page.page_type(), 4096), + usable_space as usize, + ); + let buf = page.as_ptr(); + assert_eq!(&payload, &buf[start..start + len]); + } + + #[test] + fn test_insert_drop_insert() { + let db = get_database(); + + let page = get_page(2); + let page = page.get_contents(); + let usable_space = 4096; + + let record = OwnedRecord::new( + [ + OwnedValue::Integer(0 as i64), + OwnedValue::Text(LimboText::new(Rc::new("aaaaaaaa".to_string()))), + ] + .to_vec(), + ); + let payload = add_record(0, 0, page, record, &db); + + assert_eq!(page.cell_count(), 1); + drop_cell(page, 0, usable_space); + assert_eq!(page.cell_count(), 0); + + let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let payload = add_record(0, 0, page, record, &db); + assert_eq!(page.cell_count(), 1); + + let (start, len) = page.cell_get_raw_region( + 0, + payload_overflow_threshold_max(page.page_type(), 4096), + payload_overflow_threshold_min(page.page_type(), 4096), + usable_space as usize, + ); + let buf = page.as_ptr(); + assert_eq!(&payload, &buf[start..start + len]); + } + + #[test] + fn test_insert_drop_insert_multiple() { + let db = get_database(); + + let page = get_page(2); + let page = page.get_contents(); + let usable_space = 4096; + + let record = OwnedRecord::new( + [ + OwnedValue::Integer(0 as i64), + OwnedValue::Text(LimboText::new(Rc::new("aaaaaaaa".to_string()))), + ] + .to_vec(), + ); + let payload = add_record(0, 0, page, record, &db); + + for i in 0..100 { + assert_eq!(page.cell_count(), 1); + drop_cell(page, 0, usable_space); + assert_eq!(page.cell_count(), 0); + + let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let payload = add_record(0, 0, page, record, &db); + assert_eq!(page.cell_count(), 1); + + let (start, len) = page.cell_get_raw_region( + 0, + payload_overflow_threshold_max(page.page_type(), 4096), + payload_overflow_threshold_min(page.page_type(), 4096), + usable_space as usize, + ); + let buf = page.as_ptr(); + assert_eq!(&payload, &buf[start..start + len]); + } + } + fn 
    fn set_breakpoint_panic() {
        // Set custom panic hook at start of program
        panic::set_hook(Box::new(|panic_info| {

From 4907de6e21f92bcd2cba88f22dfeea6894cd7425 Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Thu, 13 Feb 2025 15:22:19 +0100
Subject: [PATCH 15/33] fix right pointer load + some stuff

---
 core/storage/btree.rs | 69 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 6 deletions(-)

diff --git a/core/storage/btree.rs b/core/storage/btree.rs
index 59ecb6540..94fe30750 100644
--- a/core/storage/btree.rs
+++ b/core/storage/btree.rs
@@ -853,13 +853,13 @@ impl BTreeCursor {
                 let write_info = self.state.mut_write_info().unwrap();
                 write_info.state = WriteState::BalanceNonRoot;
                 self.stack.pop();
+                self.stack.retreat();
                 return_if_io!(self.balance_non_root());
             }
             WriteState::BalanceNonRoot | WriteState::BalanceNonRootWaitLoadPages => {
                 return_if_io!(self.balance_non_root());
             }
-            _ => unreachable!("invalid balance leaf state {:?}", state),
         }
     }
 }
@@ -908,6 +908,11 @@ impl BTreeCursor {
         let number_of_cells_in_parent =
             parent_contents.cell_count() + parent_contents.overflow_cells.len();
+        assert!(
+            parent_contents.overflow_cells.is_empty(),
+            "balancing child page with overflowed parent not yet implemented"
+        );
+        assert!(page_to_balance_idx <= parent_contents.cell_count());
         // As there will be at maximum 3 pages used to balance:
         // sibling_pointer is the index representing one of those 3 pages, and we initialize it to the last possible page.
         // next_divider is the first divider that contains the first page of the 3 pages.
@@ -934,17 +939,50 @@ impl BTreeCursor {
         write_info.sibling_count.replace(sibling_pointer + 1);
         write_info.first_divider_cell.replace(first_cell_divider);
+        dbg!(page_to_balance_idx);
+        for p in 0..parent_contents.cell_count() {
+            let cell = parent_contents
+                .cell_get(
+                    p,
+                    self.pager.clone(),
+                    payload_overflow_threshold_max(
+                        parent_contents.page_type(),
+                        self.usable_space() as u16,
+                    ),
+                    payload_overflow_threshold_min(
+                        parent_contents.page_type(),
+                        self.usable_space() as u16,
+                    ),
+                    self.usable_space(),
+                )
+                .unwrap();
+            dbg!(cell);
+        }
         let last_sibling_is_right_pointer = sibling_pointer + first_cell_divider
             - parent_contents.overflow_cells.len()
             == parent_contents.cell_count();
         // Get the right page pointer that we will need to update later
+        dbg!(last_sibling_is_right_pointer);
+        dbg!(sibling_pointer, first_cell_divider);
+        dbg!(parent_contents.rightmost_pointer());
+        dbg!(number_of_cells_in_parent, page_to_balance_idx);
         let right_pointer = if last_sibling_is_right_pointer {
             parent_contents.rightmost_pointer_raw().unwrap()
         } else {
-            let pointer_area = parent_contents.cell_pointer_array_offset_and_size();
+            let (start_of_cell, _) = parent_contents.cell_get_raw_region(
+                first_cell_divider + sibling_pointer,
+                payload_overflow_threshold_max(
+                    parent_contents.page_type(),
+                    self.usable_space() as u16,
+                ),
+                payload_overflow_threshold_min(
+                    parent_contents.page_type(),
+                    self.usable_space() as u16,
+                ),
+                self.usable_space(),
+            );
             let buf = parent_contents.as_ptr().as_mut_ptr();
-            let last_divider_offset = (first_cell_divider + sibling_pointer) * 2;
-            unsafe { buf.add(pointer_area.0 + last_divider_offset) }
+            unsafe { buf.add(start_of_cell) }
         };

         // load sibling pages
@@ -953,6 +991,7 @@ impl BTreeCursor {
         write_info.rightmost_pointer.replace(Some(right_pointer));
         let mut current_sibling = sibling_pointer;
         for i in (0..=current_sibling).rev() {
+            dbg!(pgno);
             let page = self.pager.read_page(pgno as usize)?;
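            // NOTE (assumption, inferred from the states above): read_page can hand
            // back a page that is still being fetched from disk, and the
            // BalanceNonRootWaitLoadPages state appears to exist precisely so this
            // function can yield CursorResult::IO and be re-entered once every
            // sibling is actually in memory.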
            write_info.pages_to_balance.borrow_mut().push(page);
            assert_eq!(
@@ -1574,6 +1613,8 @@ impl BTreeCursor {
         self.root_page = root_id;
         self.stack.clear();
         self.stack.push(root.clone());
+        // advance in order to maintain semantics
+        self.stack.advance();
         self.stack.push(child.clone());

         self.pager.put_loaded_page(root_id, root);
@@ -2065,6 +2106,7 @@ impl PageStack {
     }

     /// Advance the current cell index of the current page to the next cell.
+    /// We usually advance after traversing to a new page.
     fn advance(&self) {
         let current = self.current();
         self.cell_indices.borrow_mut()[current] += 1;
@@ -2220,11 +2262,18 @@ pub fn edit_page(
         let overflow_cell = &page.overflow_cells[i];
         // cell index in context of new list of cells that should be in the page
         let cell_idx = start_old_cells + overflow_cell.index - start_new_cells;
-        if cell_idx >= 0 && cell_idx < start_new_cells {
+        dbg!(
+            cell_idx,
+            start_old_cells,
+            start_new_cells,
+            overflow_cell.index,
+            number_new_cells
+        );
+        if cell_idx >= 0 && cell_idx < number_new_cells {
             count_cells += 1;
             page_insert_array(
                 page,
-                cell_idx + start_new_cells,
+                start_new_cells + cell_idx,
                 1,
                 cell_array,
                 cell_idx,
@@ -2232,6 +2281,7 @@
             );
         }
     }
+    dbg!(number_new_cells, count_cells);
     // TODO: append cells to end
     page_insert_array(
         page,
@@ -2456,6 +2506,12 @@ fn insert_into_cell(page: &mut PageContent, payload: &[u8], cell_idx: usize, usa
+    assert!(
+        cell_idx <= page.cell_count(),
+        "attempting to add cell to an incorrect place cell_idx={} cell_count={}",
+        cell_idx,
+        page.cell_count()
+    );
     let free = compute_free_space(page, usable_space);
     const CELL_POINTER_SIZE_BYTES: usize = 2;
     let enough_space = payload.len() + CELL_POINTER_SIZE_BYTES <= free as usize;
@@ -2480,6 +2536,7 @@
     let cell_pointer_cur_idx = cell_pointer_array_start + (CELL_POINTER_SIZE_BYTES * cell_idx);

     // move existing pointers forward by CELL_POINTER_SIZE_BYTES...
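    // Hypothetical walk-through (not from the patch): with cell_count == 3 and
    // cell_idx == 1, the pointers in slots 1 and 2 are copied two bytes to the
    // right, slot 1 is rewritten to point at the newly allocated payload, and
    // the header cell count becomes 4.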
+    dbg!(page.cell_count(), cell_idx);
     let n_cells_forward = page.cell_count() - cell_idx;
     let n_bytes_forward = CELL_POINTER_SIZE_BYTES * n_cells_forward;
     if n_bytes_forward > 0 {

From 9d2aaea439c42aaf67e120c098ce706bbe92bcd6 Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Thu, 13 Feb 2025 20:19:10 +0100
Subject: [PATCH 16/33] add no offset write and read u16 from page

---
 core/storage/sqlite3_ondisk.rs | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs
index ddd36decf..ee84ccee4 100644
--- a/core/storage/sqlite3_ondisk.rs
+++ b/core/storage/sqlite3_ondisk.rs
@@ -439,6 +439,11 @@ impl PageContent {
         u16::from_be_bytes([buf[self.offset + pos], buf[self.offset + pos + 1]])
     }

+    pub fn read_u16_no_offset(&self, pos: usize) -> u16 {
+        let buf = self.as_ptr();
+        u16::from_be_bytes([buf[pos], buf[pos + 1]])
+    }
+
     pub fn read_u32(&self, pos: usize) -> u32 {
         let buf = self.as_ptr();
         read_u32(buf, self.offset + pos)
@@ -456,6 +461,12 @@ impl PageContent {
         buf[self.offset + pos..self.offset + pos + 2].copy_from_slice(&value.to_be_bytes());
     }

+    pub fn write_u16_no_offset(&self, pos: usize, value: u16) {
+        log::debug!("write_u16(pos={}, value={})", pos, value);
+        let buf = self.as_ptr();
+        buf[pos..pos + 2].copy_from_slice(&value.to_be_bytes());
+    }
+
     pub fn write_u32(&self, pos: usize, value: u32) {
         tracing::debug!("write_u32(pos={}, value={})", pos, value);
         let buf = self.as_ptr();

From c65dce6cfde0965cf6accc2901ba0af81b65cbb2 Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Thu, 13 Feb 2025 20:20:05 +0100
Subject: [PATCH 17/33] re-implement free cell range

---
 core/storage/btree.rs | 192 +++++++++++++++++++++++++-----------------
 1 file changed, 113 insertions(+), 79 deletions(-)

diff --git a/core/storage/btree.rs b/core/storage/btree.rs
index 94fe30750..e66f70510 100644
--- a/core/storage/btree.rs
+++ b/core/storage/btree.rs
@@ -2155,6 +2155,7 @@ fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> u
     while pc <= maxpc {
         let next = u16::from_be_bytes(buf[pc..pc + 2].try_into().unwrap());
         let size = u16::from_be_bytes(buf[pc + 2..pc + 4].try_into().unwrap());
+        dbg!(next, size);
         if amount <= size as usize {
             if amount == size as usize {
                 // delete whole thing
@@ -2347,68 +2348,57 @@ pub fn page_insert_array(
 /// This function also updates the freeblock list in the page.
 /// Freeblocks are used to keep track of free space in the page,
 /// and are organized as a linked list.
-fn free_cell_range(page: &mut PageContent, offset: u16, len: u16, usable_space: u16) {
+fn free_cell_range(page: &mut PageContent, mut offset: u16, len: u16, usable_space: u16) {
+    let mut size = len;
+    let mut end = offset + len;
+    let mut pointer_to_pc = page.offset as u16 + 1;
     // if the freeblock list is empty, we set this block as the first freeblock in the page header.
- if page.first_freeblock() == 0 { - page.write_u16(offset as usize, 0); // next freeblock = null - page.write_u16(offset as usize + 2, len); // size of this freeblock - page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, offset); // first freeblock in page = this block - return; - } - let first_block = page.first_freeblock(); - - // if the freeblock list is not empty, and the offset is less than the first freeblock, - // we insert this block at the head of the list - if offset < first_block { - page.write_u16(offset as usize, first_block); // next freeblock = previous first freeblock - page.write_u16(offset as usize + 2, len); // size of this freeblock - page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, offset); // first freeblock in page = this block - return; - } - - // if we clear space that is at the start of the cell content area, - // we need to update the cell content area pointer forward to account for the removed space - // FIXME: is offset ever < cell_content_area? cell content area grows leftwards and the pointer - // is to the start of the last allocated cell. should we assert!(offset >= page.cell_content_area()) - // and change this to if offset == page.cell_content_area()? - if offset <= page.cell_content_area() { - // FIXME: remove the line directly below this, it does not change anything. - page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, page.first_freeblock()); - page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, offset + len); - return; - } - - // if the freeblock list is not empty, and the offset is greater than the first freeblock, - // then we need to do some more calculation to figure out where to insert the freeblock - // in the freeblock linked list. - let maxpc = usable_space; - - let mut pc = first_block; - let mut prev = first_block; - - dbg!(pc, prev, offset); - while pc <= maxpc && pc < offset && pc != 0 { - let next = page.read_u16(pc as usize); - prev = pc; - pc = next; - dbg!(pc, prev); - } - - if pc == 0 || pc >= maxpc { - // insert into tail - let offset = offset as usize; - let prev = prev as usize; - page.write_u16(prev, offset as u16); - page.write_u16(offset, 0); - page.write_u16(offset + 2, len); + let pc = if page.first_freeblock() == 0 { + 0 } else { - // insert in between - let next = page.read_u16(pc as usize); - let offset = offset as usize; - let prev = prev as usize; - page.write_u16(prev, offset as u16); - page.write_u16(offset, next); - page.write_u16(offset + 2, len); + // if the freeblock list is not empty, and the offset is greater than the first freeblock, + // then we need to do some more calculation to figure out where to insert the freeblock + // in the freeblock linked list. 
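+        // Worked example (values assumed, not from the patch): with an existing
+        // freeblock [2960, 3000) and a freed range [3002, 3032), the 2-byte gap is
+        // below the 4-byte freeblock minimum, so the logic below folds everything
+        // into one freeblock [2960, 3032) and subtracts those 2 bytes from the
+        // fragmented-bytes counter.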
+ let first_block = page.first_freeblock(); + let maxpc = usable_space; + + let mut pc = first_block; + + dbg!(pc, pointer_to_pc, offset); + while pc <= maxpc && pc < offset && pc != 0 { + let next = page.read_u16_no_offset(pc as usize); + pointer_to_pc = pc; + pc = next; + dbg!(pc, pointer_to_pc); + } + let mut removed_fragmentation = 0; + if pc > 0 && offset + len + 3 >= pc { + removed_fragmentation = (pc - end) as u8; + end = pc + page.read_u16_no_offset(pc as usize); + size = end - offset; + } + + if pointer_to_pc > page.offset as u16 + 1 { + let prev_end = pointer_to_pc + page.read_u16_no_offset(pointer_to_pc as usize + 2); + if prev_end + 3 >= offset { + removed_fragmentation += (offset - prev_end) as u8; + size = end - pointer_to_pc; + offset = pointer_to_pc; + } + } + let frag = page.num_frag_free_bytes() - removed_fragmentation; + page.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, frag); + + pc + }; + + if offset < page.cell_content_area() { + page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, pc); + page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, offset + len); + } else { + page.write_u16_no_offset(pointer_to_pc as usize, offset); + page.write_u16_no_offset(offset as usize, pc); + page.write_u16_no_offset(offset as usize + 2, size); } } @@ -2495,6 +2485,7 @@ fn defragment_page(page: &PageContent, usable_space: u16) { page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cbrk as u16); // set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); + page.write_u16(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); // set unused space to 0 dbg!(cbrk, first_cell); write_buf[first_cell as usize..cbrk as usize].fill(0); @@ -2546,7 +2537,7 @@ fn insert_into_cell(page: &mut PageContent, payload: &[u8], cell_idx: usize, usa ); } // ...and insert new cell pointer at the current index - page.write_u16(cell_pointer_cur_idx - page.offset, new_cell_data_pointer); + page.write_u16_no_offset(cell_pointer_cur_idx, new_cell_data_pointer); // update cell count let new_n_cells = (page.cell_count() + 1) as u16; @@ -2593,7 +2584,6 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { // #3 is computed by iterating over the freeblocks linked list let mut cur_freeblock_ptr = page.first_freeblock() as usize; - let page_buf = page.as_ptr(); if cur_freeblock_ptr > 0 { if cur_freeblock_ptr < cell_content_area_start as usize { // Freeblocks exist in the cell content area e.g. 
after deletions @@ -2605,16 +2595,8 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { let mut size = 0; loop { // TODO: check corruption icellast - next = u16::from_be_bytes( - page_buf[cur_freeblock_ptr..cur_freeblock_ptr + 2] - .try_into() - .unwrap(), - ) as usize; // first 2 bytes in freeblock = next freeblock pointer - size = u16::from_be_bytes( - page_buf[cur_freeblock_ptr + 2..cur_freeblock_ptr + 4] - .try_into() - .unwrap(), - ) as usize; // next 2 bytes in freeblock = size of current freeblock + next = page.read_u16_no_offset(cur_freeblock_ptr) as usize; // first 2 bytes in freeblock = next freeblock pointer + size = page.read_u16_no_offset(cur_freeblock_ptr + 2) as usize; // next 2 bytes in freeblock = size of current freeblock free_space_bytes += size; dbg!(cur_freeblock_ptr, next, size); // Freeblocks are in order from left to right on the page, @@ -2657,6 +2639,7 @@ fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) - let gap = cell_offset + 2 * page_ref.cell_count(); let mut top = page_ref.cell_content_area() as usize; + dbg!("allocate_cell_space"); // there are free blocks and enough space if page_ref.first_freeblock() != 0 && gap + 2 <= top { // find slot @@ -3650,17 +3633,16 @@ mod tests { let mut cells = Vec::new(); let usable_space = 4096; let total_cells = 10; - let mut ticks = 0; - let mut rng = ChaCha8Rng::seed_from_u64(0); - while ticks > 0 { - ticks -= 1; - } - let mut total_size = 0; - for i in 0..total_cells { + let mut i = 1000; + let seed = thread_rng().gen(); + let mut rng = ChaCha8Rng::seed_from_u64(seed); + while i > 0 { + i -= 1; match rng.gen_range(0..3) { 0 => { // allow appends with extra place to insert let cell_idx = rng.gen_range(0..=page.cell_count()); + println!("insert {}", cell_idx); let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); let payload = add_record(i, cell_idx, page, record, &db); let free = compute_free_space(page, usable_space); @@ -3673,10 +3655,12 @@ mod tests { cells.push(Cell { pos: i, payload }); } 1 => { + dbg!("drop"); if page.cell_count() == 0 { continue; } let cell_idx = rng.gen_range(0..page.cell_count()); + println!("drop {}", cell_idx); let (_, len) = page.cell_get_raw_region( cell_idx, payload_overflow_threshold_max(page.page_type(), 4096), @@ -3688,10 +3672,12 @@ mod tests { cells.remove(cell_idx); } 2 => { + println!("defragment_page"); defragment_page(page, usable_space); } _ => unreachable!(), } + dbg!("compute free"); let free = compute_free_space(page, usable_space); assert_eq!(free, 4096 - total_size - header_size); } @@ -3793,6 +3779,54 @@ mod tests { } } + #[test] + fn test_drop_a_few_insert() { + let db = get_database(); + + let page = get_page(2); + let page = page.get_contents(); + let usable_space = 4096; + + let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let payload = add_record(0, 0, page, record, &db); + let record = OwnedRecord::new([OwnedValue::Integer(1 as i64)].to_vec()); + let _ = add_record(1, 1, page, record, &db); + let record = OwnedRecord::new([OwnedValue::Integer(2 as i64)].to_vec()); + let _ = add_record(2, 2, page, record, &db); + + drop_cell(page, 1, usable_space); + drop_cell(page, 1, usable_space); + + ensure_cell(page, 0, &payload); + } + + #[test] + fn test_fuzz_victim_1() { + set_breakpoint_panic(); + let db = get_database(); + + let page = get_page(2); + let page = page.get_contents(); + let usable_space = 4096; + + let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + 
let _ = add_record(0, 0, page, record, &db); + + let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let _ = add_record(0, 0, page, record, &db); + drop_cell(page, 0, usable_space); + + defragment_page(page, usable_space); + + let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let _ = add_record(0, 1, page, record, &db); + + drop_cell(page, 0, usable_space); + + let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let _ = add_record(0, 1, page, record, &db); + } + fn set_breakpoint_panic() { // Set custom panic hook at start of program panic::set_hook(Box::new(|panic_info| { From 7e55f46b26fd439b26bac4bc616743d83e00a161 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 13 Feb 2025 22:21:40 +0100 Subject: [PATCH 18/33] fix defragment --- core/storage/btree.rs | 14 +++++--------- core/storage/sqlite3_ondisk.rs | 4 ++-- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index e66f70510..c27510890 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2422,10 +2422,10 @@ fn defragment_page(page: &PageContent, usable_space: u16) { let write_buf = page.as_ptr(); for i in 0..cloned_page.cell_count() { - let cell_offset = page.offset + 8; - let cell_idx = cell_offset + i * 2; + let (cell_offset, _) = page.cell_pointer_array_offset_and_size(); + let cell_idx = cell_offset + (i * 2); - let pc = u16::from_be_bytes([read_buf[cell_idx], read_buf[cell_idx + 1]]) as u64; + let pc = cloned_page.read_u16_no_offset(cell_idx) as u64; if pc > last_cell { unimplemented!("corrupted page"); } @@ -2467,7 +2467,7 @@ fn defragment_page(page: &PageContent, usable_space: u16) { } assert!(cbrk + size <= usable_space && cbrk >= first_cell); // set new pointer - write_buf[cell_idx..cell_idx + 2].copy_from_slice(&(cbrk as u16).to_be_bytes()); + page.write_u16_no_offset(cell_idx, cbrk as u16); // copy payload write_buf[cbrk as usize..cbrk as usize + size as usize] .copy_from_slice(&read_buf[pc as usize..pc as usize + size as usize]); @@ -2479,16 +2479,12 @@ fn defragment_page(page: &PageContent, usable_space: u16) { // return SQLITE_CORRUPT_PAGE(pPage); // } assert!(cbrk >= first_cell); - let write_buf = page.as_ptr(); // set new first byte of cell content page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cbrk as u16); // set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); - page.write_u16(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); - // set unused space to 0 - dbg!(cbrk, first_cell); - write_buf[first_cell as usize..cbrk as usize].fill(0); + page.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); } /// Insert a record into a cell. 
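The defragment_page changes above are easier to check against a toy layout. The sketch below is not the crate's code: PageContent, the header constants, and the per-cell size computation are deliberately reduced to a bare 4096-byte buffer, an assumed pointer-array offset, and precomputed cell sizes. It only mirrors the core move the real loop performs: read each cell pointer, copy the payload against a downward-growing write head, rewrite the pointer, and return the new content-area start.

// Standalone sketch of the defragment loop over a simplified page: a 2-byte
// big-endian pointer array at `ptr_array` and payloads of known `sizes[i]`.
fn defragment_toy(buf: &mut [u8; 4096], ptr_array: usize, sizes: &[u16], usable_space: u16) -> u16 {
    let mut cbrk = usable_space; // write head, grows downward like the real cbrk
    for (i, &size) in sizes.iter().enumerate() {
        let slot = ptr_array + 2 * i;
        let pc = u16::from_be_bytes([buf[slot], buf[slot + 1]]) as usize;
        cbrk -= size; // payloads end up packed against the end of the page
        buf.copy_within(pc..pc + size as usize, cbrk as usize);
        buf[slot..slot + 2].copy_from_slice(&cbrk.to_be_bytes());
    }
    cbrk // becomes the new cell-content-area pointer; freeblocks reset to 0
}

fn main() {
    let mut buf = [0u8; 4096];
    // two 16-byte payloads at 3000 and 3100, with holes around them
    buf[8..10].copy_from_slice(&3000u16.to_be_bytes());
    buf[10..12].copy_from_slice(&3100u16.to_be_bytes());
    let cbrk = defragment_toy(&mut buf, 8, &[16, 16], 4096);
    assert_eq!(cbrk, 4064); // 4096 - 16 - 16: both payloads now contiguous
}

On the toy page, the two payloads end up packed back-to-back at 4064..4096, the same kind of end state the test_defragment tests in this series check through ensure_cell.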
diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index ee84ccee4..21c6da4a3 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -600,10 +600,10 @@ impl PageContent { ) -> (usize, usize) { let buf = self.as_ptr(); let ncells = self.cell_count(); - let cell_pointer_array_start = self.header_size(); + let (cell_pointer_array_start, _) = self.cell_pointer_array_offset_and_size(); assert!(idx < ncells, "cell_get: idx out of bounds"); let cell_pointer = cell_pointer_array_start + (idx * 2); // pointers are 2 bytes each - let cell_pointer = self.read_u16(cell_pointer) as usize; + let cell_pointer = self.read_u16_no_offset(cell_pointer) as usize; let start = cell_pointer; let len = match self.page_type() { PageType::IndexInterior => { From 1b96bd8a30796ba4d788557641680d563dbf1c3e Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 13 Feb 2025 22:35:52 +0100 Subject: [PATCH 19/33] fix free_cell_area offset check --- core/storage/btree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index c27510890..c5100367e 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2392,7 +2392,7 @@ fn free_cell_range(page: &mut PageContent, mut offset: u16, len: u16, usable_spa pc }; - if offset < page.cell_content_area() { + if offset <= page.cell_content_area() { page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, pc); page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, offset + len); } else { From 8ba7022e0bc3ee97cd78bc4f674ac24bd89fd4f2 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 13 Feb 2025 22:36:02 +0100 Subject: [PATCH 20/33] remove dbg prints --- core/storage/btree.rs | 107 +----------------------------------------- 1 file changed, 2 insertions(+), 105 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index c5100367e..9de6220fe 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -859,7 +859,6 @@ impl BTreeCursor { WriteState::BalanceNonRoot | WriteState::BalanceNonRootWaitLoadPages => { return_if_io!(self.balance_non_root()); } - } } } @@ -939,33 +938,10 @@ impl BTreeCursor { write_info.sibling_count.replace(sibling_pointer + 1); write_info.first_divider_cell.replace(first_cell_divider); - dbg!(page_to_balance_idx); - for p in 0..parent_contents.cell_count() { - let cell = parent_contents - .cell_get( - p, - self.pager.clone(), - payload_overflow_threshold_max( - parent_contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - parent_contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - ) - .unwrap(); - dbg!(cell); - } let last_sibling_is_right_pointer = sibling_pointer + first_cell_divider - parent_contents.overflow_cells.len() == parent_contents.cell_count(); // Get the right page pointer that we will need to update later - dbg!(last_sibling_is_right_pointer); - dbg!(sibling_pointer, first_cell_divider); - dbg!(parent_contents.rightmost_pointer()); - dbg!(number_of_cells_in_parent, page_to_balance_idx); let right_pointer = if last_sibling_is_right_pointer { parent_contents.rightmost_pointer_raw().unwrap() } else { @@ -991,7 +967,6 @@ impl BTreeCursor { write_info.rightmost_pointer.replace(Some(right_pointer)); let mut current_sibling = sibling_pointer; for i in (0..=current_sibling).rev() { - dbg!(pgno); let page = self.pager.read_page(pgno as usize)?; write_info.pages_to_balance.borrow_mut().push(page); assert_eq!( @@ -1213,23 +1188,15 @@ impl BTreeCursor { 
             k += 1;
         }
-        for n in cell_array.number_of_cells_per_page.iter().enumerate() {
-            println!("init count page={}, n={}", n.0, n.1);
-        }
         // Try to pack as many cells to the left
         let mut sibling_count_new = sibling_count;
         let mut i = 0;
         while i < sibling_count_new {
-            for n in cell_array.number_of_cells_per_page.iter().enumerate() {
-                println!("start count i={} page={}, n={}", i, n.0, n.1);
-            }
             // First try to move cells to the right if they do not fit
             while new_page_sizes[i] > usable_space as u16 {
-                println!("moving right {}", i);
                 let needs_new_page = i + 1 >= sibling_count_new;
                 if needs_new_page {
                     sibling_count_new += 1;
-                    println!("adding new page");
                     new_page_sizes.push(0);
                     cell_array
                         .number_of_cells_per_page
@@ -1289,7 +1256,6 @@ impl BTreeCursor {
             let we_still_need_another_page =
                 cell_array.number_of_cells_per_page[i] >= cell_array.cells.len() as u16;
             if we_still_need_another_page {
-                dbg!("we need");
                 sibling_count_new = i + 1;
             }
             i += 1;
@@ -1297,9 +1263,6 @@ impl BTreeCursor {
                 break;
             }
         }
-        for n in cell_array.number_of_cells_per_page.iter().enumerate() {
-            println!("start count page={}, n={}", n.0, n.1);
-        }

         // Comment borrowed from SQLite src/btree.c
         // The packing computed by the previous block is biased toward the siblings
@@ -1318,7 +1281,6 @@ impl BTreeCursor {
             // if leaf_data means we don't have divider, so the one we move from left is
             // the same we add to right (we don't add divider to right).
             let mut cell_right = cell_left + 1 - leaf_data as u16;
-            log::trace!("start cell_left={}", cell_left);
             loop {
                 let cell_left_size = cell_array.cell_size(cell_left as usize);
                 let cell_right_size = cell_array.cell_size(cell_right as usize);
@@ -1341,13 +1303,9 @@ impl BTreeCursor {
                 cell_left -= 1;
                 cell_right -= 1;
             }
-            tracing::trace!("end cell_left={}", cell_left);
             new_page_sizes[i] = size_right_page;
             new_page_sizes[i - 1] = size_left_page;

-            for n in cell_array.number_of_cells_per_page.iter().enumerate() {
-                println!("new count page={}, n={}", n.0, n.1);
-            }
             assert!(
                 cell_array.number_of_cells_per_page[i - 1]
                     > if i > 1 {
@@ -1357,9 +1315,6 @@ impl BTreeCursor {
                 }
             );
         }
-        for n in cell_array.number_of_cells_per_page.iter().enumerate() {
-            println!("end count page={}, n={}", n.0, n.1);
-        }

         // Allocate pages or set dirty if not needed
         for i in 0..sibling_count_new {
@@ -1383,7 +1338,6 @@ impl BTreeCursor {
                 page.get().id = new_id;
                 self.pager.put_loaded_page(new_id, page.clone());
             }
-            dbg!(page.get().id);
         }

         // Write right pointer in parent page to point to new rightmost page
@@ -1391,9 +1345,7 @@ impl BTreeCursor {
         let rightmost_pointer = write_info.rightmost_pointer.borrow_mut().unwrap();
         let rightmost_pointer =
             unsafe { std::slice::from_raw_parts_mut(rightmost_pointer, 4) };
-        dbg!(&rightmost_pointer);
         rightmost_pointer[0..4].copy_from_slice(&right_page_id.to_be_bytes());
-        dbg!(&rightmost_pointer);

         // Ensure right-child pointer of the right-most new sibling page points to the page
         // that was originally in that place.
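The re-packing these removed printlns were tracing can be modelled compactly. The following is a hedged sketch, not the crate's implementation: it collapses CellArray, divider cells, and page headers into a plain slice of cell sizes, and it shows only the second phase described in the comment above, where cells migrate from an over-full left sibling to its right neighbour until the pair is balanced.

// `cell_sizes` holds every cell's size after the greedy left-biased packing;
// `split` is the index of the first cell that landed on the right sibling.
fn rebalance_pair(cell_sizes: &[u16], mut split: usize) -> usize {
    let total = |r: std::ops::Range<usize>| -> u32 {
        cell_sizes[r].iter().map(|&s| s as u32).sum()
    };
    let (mut left, mut right) = (total(0..split), total(split..cell_sizes.len()));
    while split > 1 {
        let moved = cell_sizes[split - 1] as u32;
        // stop once handing the left page's last cell to the right page would
        // make the right side the bigger one
        if right + moved > left - moved {
            break;
        }
        split -= 1;
        left -= moved;
        right += moved;
    }
    split
}

// e.g. four equally sized cells first packed 3|1 come out 2|2:
// assert_eq!(rebalance_pair(&[100, 100, 100, 100], 3), 2);

With four equally sized cells first packed 3|1 by the greedy phase, the loop settles on a 2|2 split, which is the left-bias correction the borrowed SQLite comment describes.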
@@ -1434,26 +1386,7 @@ impl BTreeCursor { // Leaf index new_divider_cell.extend_from_slice(divider_cell); } - dbg!(&new_divider_cell); // FIXME: defragment shouldn't be needed - println!("cells before fragment"); - for cell_idx in 0..parent_contents.cell_count() { - let cell = parent_contents.cell_get( - cell_idx, - self.pager.clone(), - payload_overflow_threshold_max( - parent_contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - parent_contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - dbg!(cell); - } - println!("cells end"); insert_into_cell( parent_contents, &new_divider_cell, @@ -1461,24 +1394,6 @@ impl BTreeCursor { self.usable_space() as u16, ); } - println!("cells"); - for cell_idx in 0..parent_contents.cell_count() { - let cell = parent_contents.cell_get( - cell_idx, - self.pager.clone(), - payload_overflow_threshold_max( - parent_contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - parent_contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - dbg!(cell); - } - println!("cells end"); // TODO: update pages let mut done = vec![false; sibling_count_new]; for i in (1 as i64 - sibling_count_new as i64)..sibling_count_new as i64 { @@ -2155,7 +2070,6 @@ fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> u while pc <= maxpc { let next = u16::from_be_bytes(buf[pc..pc + 2].try_into().unwrap()); let size = u16::from_be_bytes(buf[pc + 2..pc + 4].try_into().unwrap()); - dbg!(next, size); if amount <= size as usize { if amount == size as usize { // delete whole thing @@ -2263,13 +2177,6 @@ pub fn edit_page( let overflow_cell = &page.overflow_cells[i]; // cell index in context of new list of cells that should be in the page let cell_idx = start_old_cells + overflow_cell.index - start_new_cells; - dbg!( - cell_idx, - start_old_cells, - start_new_cells, - overflow_cell.index, - number_new_cells - ); if cell_idx >= 0 && cell_idx < number_new_cells { count_cells += 1; page_insert_array( @@ -2282,7 +2189,6 @@ pub fn edit_page( ); } } - dbg!(number_new_cells, count_cells); // TODO: append cells to end page_insert_array( page, @@ -2335,7 +2241,6 @@ pub fn page_insert_array( mut start_insert: usize, usable_space: u16, ) { - dbg!(count); // TODO: implement faster algorithm, this is doing extra work that's not needed. // See pageInsertArray to understand faster way. for i in first..first + count { @@ -2364,12 +2269,10 @@ fn free_cell_range(page: &mut PageContent, mut offset: u16, len: u16, usable_spa let mut pc = first_block; - dbg!(pc, pointer_to_pc, offset); while pc <= maxpc && pc < offset && pc != 0 { let next = page.read_u16_no_offset(pc as usize); pointer_to_pc = pc; pc = next; - dbg!(pc, pointer_to_pc); } let mut removed_fragmentation = 0; if pc > 0 && offset + len + 3 >= pc { @@ -2523,7 +2426,6 @@ fn insert_into_cell(page: &mut PageContent, payload: &[u8], cell_idx: usize, usa let cell_pointer_cur_idx = cell_pointer_array_start + (CELL_POINTER_SIZE_BYTES * cell_idx); // move existing pointers forward by CELL_POINTER_SIZE_BYTES... 
- dbg!(page.cell_count(), cell_idx); let n_cells_forward = page.cell_count() - cell_idx; let n_bytes_forward = CELL_POINTER_SIZE_BYTES * n_cells_forward; if n_bytes_forward > 0 { @@ -2594,7 +2496,6 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { next = page.read_u16_no_offset(cur_freeblock_ptr) as usize; // first 2 bytes in freeblock = next freeblock pointer size = page.read_u16_no_offset(cur_freeblock_ptr + 2) as usize; // next 2 bytes in freeblock = size of current freeblock free_space_bytes += size; - dbg!(cur_freeblock_ptr, next, size); // Freeblocks are in order from left to right on the page, // so next pointer should be > current pointer + its size, or 0 if no next block exists. if next <= cur_freeblock_ptr + size + 3 { @@ -2635,7 +2536,6 @@ fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) -> u16 { let gap = cell_offset + 2 * page_ref.cell_count(); let mut top = page_ref.cell_content_area() as usize; - dbg!("allocate_cell_space"); // there are free blocks and enough space if page_ref.first_freeblock() != 0 && gap + 2 <= top { // find slot @@ -3265,6 +3165,8 @@ mod tests { key ); } + } + } #[test] fn test_drop_odd() { set_breakpoint_panic(); @@ -3638,7 +3540,6 @@ mod tests { 0 => { // allow appends with extra place to insert let cell_idx = rng.gen_range(0..=page.cell_count()); - println!("insert {}", cell_idx); let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); let payload = add_record(i, cell_idx, page, record, &db); let free = compute_free_space(page, usable_space); @@ -3651,12 +3552,10 @@ mod tests { cells.push(Cell { pos: i, payload }); } 1 => { - dbg!("drop"); if page.cell_count() == 0 { continue; } let cell_idx = rng.gen_range(0..page.cell_count()); - println!("drop {}", cell_idx); let (_, len) = page.cell_get_raw_region( cell_idx, payload_overflow_threshold_max(page.page_type(), 4096), @@ -3668,12 +3567,10 @@ mod tests { cells.remove(cell_idx); } 2 => { - println!("defragment_page"); defragment_page(page, usable_space); } _ => unreachable!(), } - dbg!("compute free"); let free = compute_free_space(page, usable_space); assert_eq!(free, 4096 - total_size - header_size); } From d3574f1e55a345d0ad1d64b4d028d76491eb03e2 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 13 Feb 2025 22:41:17 +0100 Subject: [PATCH 21/33] fix root offset handling --- core/storage/btree.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 9de6220fe..db984d1da 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1507,9 +1507,9 @@ impl BTreeCursor { let (cell_pointer_offset, _) = contents.cell_pointer_array_offset_and_size(); // change cell pointers for cell_idx in 0..contents.cell_count() { - let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx) - offset; - let pc = contents.read_u16(cell_pointer_offset); - contents.write_u16(cell_pointer_offset, pc - offset as u16); + let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx); + let pc = contents.read_u16_no_offset(cell_pointer_offset); + contents.write_u16_no_offset(cell_pointer_offset, pc as u16); } contents.offset = 0; From f9d979575b2cc7cdd4f32c48132d396c14404cae Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 13 Feb 2025 22:48:30 +0100 Subject: [PATCH 22/33] fix defragment_page cell payload size calculation --- core/storage/btree.rs | 44 ++++++++++--------------------------------- 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/core/storage/btree.rs
b/core/storage/btree.rs index db984d1da..12ec3b597 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2311,16 +2311,14 @@ fn defragment_page(page: &PageContent, usable_space: u16) { log::debug!("defragment_page"); let cloned_page = page.clone(); // TODO(pere): usable space should include offset probably - let usable_space = usable_space as u64; let mut cbrk = usable_space; // TODO: implement fast algorithm let last_cell = usable_space - 4; - let first_cell = cloned_page.unallocated_region_start() as u64; + let first_cell = cloned_page.unallocated_region_start() as u16; if cloned_page.cell_count() > 0 { - let page_type = page.page_type(); let read_buf = cloned_page.as_ptr(); let write_buf = page.as_ptr(); @@ -2328,49 +2326,27 @@ fn defragment_page(page: &PageContent, usable_space: u16) { let (cell_offset, _) = page.cell_pointer_array_offset_and_size(); let cell_idx = cell_offset + (i * 2); - let pc = cloned_page.read_u16_no_offset(cell_idx) as u64; + let pc = cloned_page.read_u16_no_offset(cell_idx); if pc > last_cell { unimplemented!("corrupted page"); } assert!(pc <= last_cell); - let size = match page_type { - PageType::TableInterior => { - let (_, nr_key) = match read_varint(&read_buf[pc as usize ..]) { - Ok(v) => v, - Err(_) => todo!( - "error while parsing varint from cell, probably treat this as corruption?" - ), - }; - 4 + nr_key as u64 - } - PageType::TableLeaf => { - let (payload_size, nr_payload) = match read_varint(&read_buf[pc as usize..]) { - Ok(v) => v, - Err(_) => todo!( - "error while parsing varint from cell, probably treat this as corruption?" - ), - }; - let (_, nr_key) = match read_varint(&read_buf[pc as usize + nr_payload..]) { - Ok(v) => v, - Err(_) => todo!( - "error while parsing varint from cell, probably treat this as corruption?" 
- ), - }; - // TODO: add overflow page calculation - payload_size + nr_payload as u64 + nr_key as u64 - } - PageType::IndexInterior => todo!(), - PageType::IndexLeaf => todo!(), - }; + let (_, size) = cloned_page.cell_get_raw_region( + i, + payload_overflow_threshold_max(page.page_type(), usable_space), + payload_overflow_threshold_min(page.page_type(), usable_space), + usable_space as usize, + ); + let size = size as u16; cbrk -= size; if cbrk < first_cell || pc + size > usable_space { todo!("corrupt"); } assert!(cbrk + size <= usable_space && cbrk >= first_cell); // set new pointer - page.write_u16_no_offset(cell_idx, cbrk as u16); + page.write_u16_no_offset(cell_idx, cbrk); // copy payload write_buf[cbrk as usize..cbrk as usize + size as usize] .copy_from_slice(&read_buf[pc as usize..pc as usize + size as usize]); From 177dbcd403420f6fe675a4e5bddd58e38756b9ee Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 16 Feb 2025 18:01:33 +0100 Subject: [PATCH 23/33] simplify balance_root --- core/storage/btree.rs | 110 ++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 59 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 12ec3b597..a7cad9f41 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1071,6 +1071,7 @@ impl BTreeCursor { .borrow_mut() .push(cell_buf.to_vec()); log::trace!( + tracing::trace!( "dropping divider cell from parent cell_idx={} count={}", cell_idx, parent_contents.cell_count() @@ -1468,73 +1469,64 @@ impl BTreeCursor { }; let offset = if is_page_1 { DATABASE_HEADER_SIZE } else { 0 }; - let new_root_page = self.allocate_page(PageType::TableInterior, offset); - { - let current_root = self.stack.top(); - let current_root_contents = current_root.get().contents.as_ref().unwrap(); - let new_root_page_contents = new_root_page.get().contents.as_mut().unwrap(); - if is_page_1 { - // Copy header - let current_root_buf = current_root_contents.as_ptr(); - let new_root_buf = new_root_page_contents.as_ptr(); - new_root_buf[0..DATABASE_HEADER_SIZE] - .copy_from_slice(¤t_root_buf[0..DATABASE_HEADER_SIZE]); - } - // point new root right child to previous root - new_root_page_contents.write_u32( - PAGE_HEADER_OFFSET_RIGHTMOST_PTR, - current_root.get().id as u32, - ); - new_root_page_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); - } + let root = self.stack.top(); + let root_contents = root.get_contents(); + let child = self.allocate_page(root_contents.page_type(), 0); - /* swap split page buffer with new root buffer so we don't have to update page idx */ - { - let (root_id, child_id, child) = { - let page_ref = self.stack.top(); - let child = page_ref.clone(); + tracing::debug!( + "Balancing root. 
root={}, rightmost={}", + root.get().id, + child.get().id + ); - // Swap the entire Page structs - std::mem::swap(&mut child.get().id, &mut new_root_page.get().id); - // TODO:: shift bytes by offset to left on child because now child has offset 100 - // and header bytes - // Also change the offset of page - // - if is_page_1 { - // Remove header from child and set offset to 0 - let contents = child.get().contents.as_mut().unwrap(); - let (cell_pointer_offset, _) = contents.cell_pointer_array_offset_and_size(); - // change cell pointers - for cell_idx in 0..contents.cell_count() { - let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx); - let pc = contents.read_u16_no_offset(cell_pointer_offset); - contents.write_u16_no_offset(cell_pointer_offset, pc as u16); - } + self.pager.add_dirty(root.get().id); + self.pager.add_dirty(child.get().id); - contents.offset = 0; - let buf = contents.as_ptr(); - buf.copy_within(DATABASE_HEADER_SIZE.., 0); - } + let root_buf = root_contents.as_ptr(); + let child_contents = child.get_contents(); + let child_buf = child_contents.as_ptr(); + let (root_pointer_start, root_pointer_len) = + root_contents.cell_pointer_array_offset_and_size(); + let (child_pointer_start, _) = child.get_contents().cell_pointer_array_offset_and_size(); - self.pager.add_dirty(new_root_page.get().id); - self.pager.add_dirty(child.get().id); - (new_root_page.get().id, child.get().id, child) - }; + let top = root_contents.cell_content_area() as usize; - debug!("Balancing root. root={}, rightmost={}", root_id, child_id); - let root = new_root_page.clone(); + // 1. Modify child + // Copy pointers + child_buf[child_pointer_start..child_pointer_start + root_pointer_len] + .copy_from_slice(&root_buf[root_pointer_start..root_pointer_start + root_pointer_len]); + // Copy cell contents + child_buf[top..].copy_from_slice(&root_buf[top..]); + // Copy header + child_buf[0..root_contents.header_size()] + .copy_from_slice(&root_buf[offset..offset + root_contents.header_size()]); + // Copy overflow cells + child_contents.overflow_cells = root_contents.overflow_cells.clone(); - self.root_page = root_id; - self.stack.clear(); - self.stack.push(root.clone()); - // advance in order to maintain semantics - self.stack.advance(); - self.stack.push(child.clone()); + // 2. Modify root + let new_root_page_type = match root_contents.page_type() { + PageType::IndexLeaf => PageType::IndexInterior, + PageType::TableLeaf => PageType::TableInterior, + _ => unreachable!("invalid root non leaf page type"), + } as u8; + // set new page type + root_contents.write_u8(PAGE_HEADER_OFFSET_PAGE_TYPE, new_root_page_type); + root_contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, child.get().id as u32); + root_contents.write_u16( + PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, + self.usable_space() as u16, + ); + root_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); + root_contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); - self.pager.put_loaded_page(root_id, root); - self.pager.put_loaded_page(child_id, child); - } + root_contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); + root_contents.overflow_cells.clear(); + self.root_page = root.get().id; + self.stack.clear(); + self.stack.push(root.clone()); + self.stack.advance(); + self.stack.push(child.clone()); } /// Allocate a new page to the btree via the pager. 
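Taken as a whole, [PATCH 23/33] trades the old buffer-swap trick for a straight copy: the overflowing root's cell pointers and cell content are copied into a freshly allocated child, and the root is then rewritten in place as an empty interior page whose rightmost pointer names that child. A minimal standalone sketch of the leaf-to-interior promotion this relies on (the enum below is an illustrative stand-in whose discriminants mirror the SQLite on-disk page-type bytes, not the crate's actual definition):

// Stand-in page types; discriminants are the on-disk b-tree page-type bytes.
#[derive(Clone, Copy, Debug, PartialEq)]
enum PageType {
    IndexInterior = 2,
    TableInterior = 5,
    IndexLeaf = 10,
    TableLeaf = 13,
}

// Mirrors the match in the patch: after its cells move to the child, the old
// root keeps its page id but is promoted to the interior variant of its type.
fn promoted_root_type(old_root: PageType) -> PageType {
    match old_root {
        PageType::IndexLeaf => PageType::IndexInterior,
        PageType::TableLeaf => PageType::TableInterior,
        _ => unreachable!("as in the patch, a non-leaf root is not expected here"),
    }
}

fn main() {
    assert_eq!(promoted_root_type(PageType::TableLeaf), PageType::TableInterior);
    assert_eq!(promoted_root_type(PageType::IndexLeaf), PageType::IndexInterior);
}

Keeping the root's page id stable is the point of the in-place rewrite: anything that recorded the table's root page number never needs to learn a new id after the split.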
From b5ec5186eae374b682624c70dca148c201d63c8d Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 16 Feb 2025 18:30:42 +0100 Subject: [PATCH 24/33] fix divider cell on leaf data --- core/storage/btree.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index a7cad9f41..9d436d3f3 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1360,12 +1360,11 @@ impl BTreeCursor { .write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, right_pointer); } // TODO: pointer map update (vacuum support) - // TODO: insert divider cells in parent for i in 0..sibling_count_new - 1 /* do not take last page */ { let divider_cell_idx = cell_array.cell_count(i); - let divider_cell = &mut cell_array.cells[divider_cell_idx]; + let mut divider_cell = &mut cell_array.cells[divider_cell_idx]; let page = &pages_to_balance_new[i]; // FIXME: dont use auxiliary space, could be done without allocations let mut new_divider_cell = Vec::new(); @@ -1379,6 +1378,7 @@ impl BTreeCursor { // FIXME: not needed conversion // FIXME: need to update cell size in order to free correctly? // insert into cell with correct range should be enough + divider_cell = &mut cell_array.cells[divider_cell_idx - 1]; let (_, n_bytes_payload) = read_varint(divider_cell)?; let (rowid, _) = read_varint(÷r_cell[n_bytes_payload..])?; new_divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes()); From b64cc769b6e137b930fbb34dfefca5098b70e1ac Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 16 Feb 2025 18:31:03 +0100 Subject: [PATCH 25/33] fix rebase --- core/storage/btree.rs | 208 ++++++++++++++++++++++----------- core/storage/sqlite3_ondisk.rs | 2 +- core/vdbe/mod.rs | 3 + 3 files changed, 142 insertions(+), 71 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 9d436d3f3..3ee65673a 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2,8 +2,7 @@ use tracing::debug; use crate::storage::pager::Pager; use crate::storage::sqlite3_ondisk::{ - read_u32, read_varint, BTreeCell, DatabaseHeader, PageContent, PageType, TableInteriorCell, - TableLeafCell, + read_varint, BTreeCell, PageContent, PageType, TableInteriorCell, TableLeafCell, }; use crate::types::{CursorResult, OwnedValue, Record, SeekKey, SeekOp}; @@ -96,8 +95,6 @@ struct WriteInfo { scratch_cells: RefCell>>, /// Bookkeeping of the rightmost pointer so the PAGE_HEADER_OFFSET_RIGHTMOST_PTR can be updated. rightmost_pointer: RefCell>, - /// Copy of the current page needed for buffer references. 
- page_copy: RefCell>, /// Divider cells of old pages divider_cells: RefCell>>, /// Number of siblings being used to balance @@ -112,7 +109,6 @@ impl WriteInfo { state: WriteState::Start, scratch_cells: RefCell::new(Vec::new()), rightmost_pointer: RefCell::new(None), - page_copy: RefCell::new(None), pages_to_balance: RefCell::new(Vec::new()), divider_cells: RefCell::new(Vec::new()), sibling_count: RefCell::new(0), @@ -847,7 +843,6 @@ impl BTreeCursor { if !self.stack.has_parent() { self.balance_root(); - return Ok(CursorResult::Ok(())); } let write_info = self.state.mut_write_info().unwrap(); @@ -859,6 +854,8 @@ impl BTreeCursor { WriteState::BalanceNonRoot | WriteState::BalanceNonRootWaitLoadPages => { return_if_io!(self.balance_non_root()); } + WriteState::Finish => return Ok(CursorResult::Ok(())), + _ => panic!("unexpected state on balance {:?}", state), } } } @@ -875,7 +872,7 @@ impl BTreeCursor { .expect("must be balancing") .state .clone(); - tracing::trace!("balance_non_root(state={:?})", state); + tracing::debug!("balance_non_root(state={:?})", state); let (next_write_state, result) = match state { WriteState::Start => todo!(), WriteState::BalanceStart => todo!(), @@ -900,6 +897,10 @@ impl BTreeCursor { parent_page.get().id, page_to_balance_idx ); + assert!(matches!( + parent_contents.page_type(), + PageType::IndexInterior | PageType::TableInterior + )); // Part 1: Find the sibling pages to balance write_info.new_pages.borrow_mut().clear(); write_info.pages_to_balance.borrow_mut().clear(); @@ -965,7 +966,7 @@ impl BTreeCursor { // start loading right page first let mut pgno: u32 = unsafe { right_pointer.cast::().read().swap_bytes() }; write_info.rightmost_pointer.replace(Some(right_pointer)); - let mut current_sibling = sibling_pointer; + let current_sibling = sibling_pointer; for i in (0..=current_sibling).rev() { let page = self.pager.read_page(pgno as usize)?; write_info.pages_to_balance.borrow_mut().push(page); @@ -1070,7 +1071,6 @@ impl BTreeCursor { .divider_cells .borrow_mut() .push(cell_buf.to_vec()); - log::trace!( tracing::trace!( "dropping divider cell from parent cell_idx={} count={}", cell_idx, @@ -1697,8 +1697,8 @@ impl BTreeCursor { let cell = contents.cell_get( idx, self.pager.clone(), - self.payload_overflow_threshold_max(contents.page_type()), - self.payload_overflow_threshold_min(contents.page_type()), + payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), self.usable_space(), )?; @@ -1719,8 +1719,8 @@ impl BTreeCursor { let cell = contents.cell_get( cell_idx, self.pager.clone(), - self.payload_overflow_threshold_max(contents.page_type()), - self.payload_overflow_threshold_min(contents.page_type()), + payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), self.usable_space(), )?; @@ -1768,8 +1768,14 @@ impl BTreeCursor { let predecessor_cell = leaf_contents.cell_get( leaf_cell_idx, self.pager.clone(), - self.payload_overflow_threshold_max(leaf_contents.page_type()), - self.payload_overflow_threshold_min(leaf_contents.page_type()), + payload_overflow_threshold_max( + leaf_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + leaf_contents.page_type(), + self.usable_space() as u16, + ), self.usable_space(), )?; @@ -1785,11 +1791,16 @@ impl BTreeCursor { } _ => unreachable!("Expected table leaf cell"), } - 
self.insert_into_cell(contents, &cell_payload, cell_idx); - self.drop_cell(contents, cell_idx); + insert_into_cell( + contents, + &cell_payload, + cell_idx, + self.usable_space() as u16, + ); + drop_cell(contents, cell_idx, self.usable_space() as u16); } else { // For leaf nodes, simply remove the cell - self.drop_cell(contents, cell_idx); + drop_cell(contents, cell_idx, self.usable_space() as u16); } // TODO(Krishna): Implement balance after delete. I will implement after balance_nonroot is extended. @@ -1811,6 +1822,7 @@ impl BTreeCursor { }; return_if_io!(self.move_to(SeekKey::TableRowId(*int_key as u64), SeekOp::EQ)); let page = self.stack.top(); + dbg!(page.get().id); // TODO(pere): request load return_if_locked!(page); @@ -1917,8 +1929,10 @@ impl BTreeCursor { payload_len: usize, page_type: PageType, ) -> Result> { - let max_local = self.payload_overflow_threshold_max(page_type.clone()); - let min_local = self.payload_overflow_threshold_min(page_type.clone()); + let max_local = + payload_overflow_threshold_max(page_type.clone(), self.usable_space() as u16); + let min_local = + payload_overflow_threshold_min(page_type.clone(), self.usable_space() as u16); let usable_size = self.usable_space(); let (_, local_size) = payload_overflows(payload_len, max_local, min_local, usable_size); @@ -1944,7 +1958,7 @@ impl PageStack { /// Push a new page onto the stack. /// This effectively means traversing to a child page. fn push(&self, page: PageRef) { - debug!( + tracing::trace!( "pagestack::push(current={}, new_page_id={})", self.current_page.borrow(), page.get().id @@ -1963,7 +1977,7 @@ impl PageStack { /// This effectively means traversing back up to a parent page. fn pop(&self) { let current = *self.current_page.borrow(); - debug!("pagestack::pop(current={})", current); + tracing::trace!("pagestack::pop(current={})", current); self.cell_indices.borrow_mut()[current as usize] = 0; self.stack.borrow_mut()[current as usize] = None; *self.current_page.borrow_mut() -= 1; @@ -1977,11 +1991,11 @@ impl PageStack { .as_ref() .unwrap() .clone(); - // debug!( - // "pagestack::top(current={}, page_id={})", - // current, - // page.get().id - // ); + tracing::trace!( + "pagestack::top(current={}, page_id={})", + current, + page.get().id + ); page } @@ -2016,11 +2030,13 @@ impl PageStack { /// We usually advance after going traversing a new page fn advance(&self) { let current = self.current(); + tracing::trace!("advance {}", self.cell_indices.borrow()[current]); self.cell_indices.borrow_mut()[current] += 1; } fn retreat(&self) { let current = self.current(); + tracing::trace!("retreat {}", self.cell_indices.borrow()[current]); self.cell_indices.borrow_mut()[current] -= 1; } @@ -2110,7 +2126,7 @@ fn to_static_buf(buf: &mut [u8]) -> &'static mut [u8] { unsafe { std::mem::transmute::<&mut [u8], &'static mut [u8]>(buf) } } -pub fn edit_page( +fn edit_page( page: &mut PageContent, start_old_cells: usize, start_new_cells: usize, @@ -2118,7 +2134,7 @@ pub fn edit_page( cell_array: &CellArray, usable_space: u16, ) { - log::trace!( + tracing::trace!( "edit_page start_old_cells={} start_new_cells={} number_new_cells={} cell_array={}", start_old_cells, start_new_cells, @@ -2169,7 +2185,7 @@ pub fn edit_page( let overflow_cell = &page.overflow_cells[i]; // cell index in context of new list of cells that should be in the page let cell_idx = start_old_cells + overflow_cell.index - start_new_cells; - if cell_idx >= 0 && cell_idx < number_new_cells { + if cell_idx < number_new_cells { count_cells += 1; page_insert_array( 
page, @@ -2194,7 +2210,7 @@ pub fn edit_page( page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, number_new_cells as u16); } -pub fn page_free_array( +fn page_free_array( page: &mut PageContent, first: usize, count: usize, @@ -2300,7 +2316,7 @@ fn free_cell_range(page: &mut PageContent, mut offset: u16, len: u16, usable_spa /// Defragment a page. This means packing all the cells to the end of the page. fn defragment_page(page: &PageContent, usable_space: u16) { // TODO: test this - log::debug!("defragment_page"); + tracing::debug!("defragment_page"); let cloned_page = page.clone(); // TODO(pere): usable space should include offset probably let mut cbrk = usable_space; @@ -2534,7 +2550,7 @@ fn fill_cell_payload( page_type: PageType, int_key: Option, cell_payload: &mut Vec, - record: &OwnedRecord, + record: &Record, usable_space: u16, pager: Rc, ) { @@ -2712,14 +2728,15 @@ mod tests { use crate::storage::database::FileStorage; use crate::storage::page_cache::DumbLruPageCache; use crate::storage::sqlite3_ondisk; + use crate::storage::sqlite3_ondisk::DatabaseHeader; + use crate::types::Text; use crate::{BufferPool, DatabaseStorage, WalFile, WalFileShared, WriteCompletion}; use std::cell::RefCell; + use std::panic; use std::rc::Rc; use std::sync::Arc; - use std::{cell::RefCell, panic, rc::Rc, sync::Arc}; - use rand::{thread_rng, Rng, SeedableRng}; - use rand_chacha::ChaCha8Rng; + use rand::{thread_rng, Rng}; use tempfile::TempDir; use crate::{ @@ -2732,14 +2749,14 @@ mod tests { pager::PageRef, sqlite3_ondisk::{BTreeCell, PageContent, PageType}, }, - types::{LimboText, OwnedRecord, OwnedValue, Record}, - Buffer, Database, Page, Pager, PlatformIO, Value, DATABASE_VERSION, IO, + types::{OwnedValue, Record}, + Database, Page, Pager, PlatformIO, }; use super::{btree_init_page, defragment_page, drop_cell, insert_into_cell}; fn get_page(id: usize) -> PageRef { - let page = Arc::new(Page::new(2)); + let page = Arc::new(Page::new(id)); let drop_fn = Rc::new(|_| {}); let inner = PageContent { @@ -2778,7 +2795,7 @@ mod tests { payload_overflow_threshold_min(page.page_type(), 4096), 4096, ); - log::trace!("cell idx={} start={} len={}", cell_idx, cell.0, cell.1); + tracing::trace!("cell idx={} start={} len={}", cell_idx, cell.0, cell.1); let buf = &page.as_ptr()[cell.0..cell.0 + cell.1]; assert_eq!(buf.len(), payload.len()); assert_eq!(buf, payload); @@ -2788,7 +2805,7 @@ mod tests { id: usize, pos: usize, page: &mut PageContent, - record: OwnedRecord, + record: Record, db: &Arc, ) -> Vec { let mut payload: Vec = Vec::new(); @@ -2810,7 +2827,7 @@ mod tests { let page = get_page(2); let page = page.get_contents(); let header_size = 8; - let record = OwnedRecord::new([OwnedValue::Integer(1)].to_vec()); + let record = Record::new([OwnedValue::Integer(1)].to_vec()); let payload = add_record(1, 0, page, record, &db); assert_eq!(page.cell_count(), 1); let free = compute_free_space(page, 4096); @@ -2838,7 +2855,7 @@ mod tests { let mut cells = Vec::new(); let usable_space = 4096; for i in 0..3 { - let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(i as i64)].to_vec()); let payload = add_record(i, i, page, record, &db); assert_eq!(page.cell_count(), i + 1); let free = compute_free_space(page, usable_space); @@ -2872,8 +2889,8 @@ mod tests { .cell_get( cell_idx, pager.clone(), - cursor.payload_overflow_threshold_max(page_type), - cursor.payload_overflow_threshold_min(page_type), + payload_overflow_threshold_max(page_type, 4096), + 
payload_overflow_threshold_min(page_type, 4096), cursor.usable_space(), ) .unwrap(); @@ -2935,8 +2952,8 @@ mod tests { .cell_get( cell_idx, pager.clone(), - cursor.payload_overflow_threshold_max(page_type), - cursor.payload_overflow_threshold_min(page_type), + payload_overflow_threshold_max(page_type, 4096), + payload_overflow_threshold_min(page_type, 4096), cursor.usable_space(), ) .unwrap(); @@ -3002,7 +3019,7 @@ mod tests { }; let pager = Rc::new(pager); let page1 = pager.allocate_page().unwrap(); - btree_init_page(&page1, PageType::TableLeaf, &db_header, 0); + btree_init_page(&page1, PageType::TableLeaf, 0, 4096); (pager, page1.get().id) } @@ -3149,7 +3166,7 @@ mod tests { let usable_space = 4096; let total_cells = 10; for i in 0..total_cells { - let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(i as i64)].to_vec()); let payload = add_record(i, i, page, record, &db); assert_eq!(page.cell_count(), i + 1); let free = compute_free_space(page, usable_space); @@ -3269,7 +3286,7 @@ mod tests { let (pager, db_header) = setup_test_env(5); let cursor = BTreeCursor::new(pager.clone(), 1); - let max_local = cursor.payload_overflow_threshold_max(PageType::TableLeaf); + let max_local = payload_overflow_threshold_max(PageType::TableLeaf, 4096); let usable_size = cursor.usable_space(); // Create a large payload that will definitely trigger overflow @@ -3415,7 +3432,7 @@ mod tests { let mut cells = Vec::new(); let usable_space = 4096; for i in 0..3 { - let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(i as i64)].to_vec()); let payload = add_record(i, i, page, record, &db); assert_eq!(page.cell_count(), i + 1); let free = compute_free_space(page, usable_space); @@ -3455,7 +3472,7 @@ mod tests { let usable_space = 4096; let total_cells = 10; for i in 0..total_cells { - let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(i as i64)].to_vec()); let payload = add_record(i, i, page, record, &db); assert_eq!(page.cell_count(), i + 1); let free = compute_free_space(page, usable_space); @@ -3504,11 +3521,11 @@ mod tests { let mut rng = ChaCha8Rng::seed_from_u64(seed); while i > 0 { i -= 1; - match rng.gen_range(0..3) { + match rng.next_u64() % 3 { 0 => { // allow appends with extra place to insert - let cell_idx = rng.gen_range(0..=page.cell_count()); - let record = OwnedRecord::new([OwnedValue::Integer(i as i64)].to_vec()); + let cell_idx = rng.next_u64() as usize % (page.cell_count() + 1); + let record = Record::new([OwnedValue::Integer(i as i64)].to_vec()); let payload = add_record(i, cell_idx, page, record, &db); let free = compute_free_space(page, usable_space); if (free as usize) < payload.len() - 2 { @@ -3523,7 +3540,7 @@ mod tests { if page.cell_count() == 0 { continue; } - let cell_idx = rng.gen_range(0..page.cell_count()); + let cell_idx = rng.next_u64() as usize % page.cell_count(); let (_, len) = page.cell_get_raw_region( cell_idx, payload_overflow_threshold_max(page.page_type(), 4096), @@ -3552,7 +3569,7 @@ mod tests { let page = page.get_contents(); let usable_space = 4096; - let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let payload = add_record(0, 0, page, record, &db); assert_eq!(page.cell_count(), 1); @@ -3576,10 +3593,10 @@ mod tests { let page = page.get_contents(); let usable_space = 4096; 
- let record = OwnedRecord::new( + let record = Record::new( [ OwnedValue::Integer(0 as i64), - OwnedValue::Text(LimboText::new(Rc::new("aaaaaaaa".to_string()))), + OwnedValue::Text(Text::new("aaaaaaaa")), ] .to_vec(), ); @@ -3589,7 +3606,7 @@ mod tests { drop_cell(page, 0, usable_space); assert_eq!(page.cell_count(), 0); - let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let payload = add_record(0, 0, page, record, &db); assert_eq!(page.cell_count(), 1); @@ -3611,10 +3628,10 @@ mod tests { let page = page.get_contents(); let usable_space = 4096; - let record = OwnedRecord::new( + let record = Record::new( [ OwnedValue::Integer(0 as i64), - OwnedValue::Text(LimboText::new(Rc::new("aaaaaaaa".to_string()))), + OwnedValue::Text(Text::new("aaaaaaaa")), ] .to_vec(), ); @@ -3625,7 +3642,7 @@ mod tests { drop_cell(page, 0, usable_space); assert_eq!(page.cell_count(), 0); - let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let payload = add_record(0, 0, page, record, &db); assert_eq!(page.cell_count(), 1); @@ -3648,11 +3665,11 @@ mod tests { let page = page.get_contents(); let usable_space = 4096; - let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let payload = add_record(0, 0, page, record, &db); - let record = OwnedRecord::new([OwnedValue::Integer(1 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(1 as i64)].to_vec()); let _ = add_record(1, 1, page, record, &db); - let record = OwnedRecord::new([OwnedValue::Integer(2 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(2 as i64)].to_vec()); let _ = add_record(2, 2, page, record, &db); drop_cell(page, 1, usable_space); @@ -3670,24 +3687,75 @@ mod tests { let page = page.get_contents(); let usable_space = 4096; - let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let _ = add_record(0, 0, page, record, &db); - let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let _ = add_record(0, 0, page, record, &db); drop_cell(page, 0, usable_space); defragment_page(page, usable_space); - let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let _ = add_record(0, 1, page, record, &db); drop_cell(page, 0, usable_space); - let record = OwnedRecord::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let _ = add_record(0, 1, page, record, &db); } + #[test] + fn btree_insert_sequential() { + let (pager, root_page) = empty_btree(); + let mut keys = Vec::new(); + for i in 0..10000 { + let mut cursor = BTreeCursor::new(pager.clone(), root_page); + tracing::info!("INSERT INTO t VALUES ({});", i,); + let key = OwnedValue::Integer(i); + let value = Record::new(vec![OwnedValue::Integer(i)]); + tracing::trace!("before insert {}", i); + loop { + let key = SeekKey::TableRowId(i as u64); + match cursor.move_to(key, SeekOp::EQ).unwrap() { + CursorResult::Ok(_) => break, + CursorResult::IO => { + pager.io.run_once().unwrap(); + } + } + } + loop { + match cursor.insert(&key, &value, true).unwrap() { + CursorResult::Ok(_) => break, + 
CursorResult::IO => { + pager.io.run_once().unwrap(); + } + } + } + keys.push(i); + } + if matches!(validate_btree(pager.clone(), root_page), (_, false)) { + panic!("invalid btree"); + } + tracing::trace!( + "=========== btree ===========\n{}\n\n", + format_btree(pager.clone(), root_page, 0) + ); + for key in keys.iter() { + let mut cursor = BTreeCursor::new(pager.clone(), root_page); + let key = OwnedValue::Integer(*key); + loop { + match cursor.exists(&key).unwrap() { + CursorResult::Ok(exists) => { + assert!(exists, "key {} is not found", key); + break; + } + CursorResult::IO => pager.io.run_once().unwrap(), + } + } + } + } + fn set_breakpoint_panic() { // Set custom panic hook at start of program panic::set_hook(Box::new(|panic_info| { diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index 21c6da4a3..b5b8d3f2b 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -462,7 +462,7 @@ impl PageContent { } pub fn write_u16_no_offset(&self, pos: usize, value: u16) { - log::debug!("write_u16(pos={}, value={})", pos, value); + tracing::debug!("write_u16(pos={}, value={})", pos, value); let buf = self.as_ptr(); buf[pos..pos + 2].copy_from_slice(&value.to_be_bytes()); } diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 5fa53dd07..1f1614595 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -2467,6 +2467,9 @@ impl Program { _ => unreachable!("Not a record! Cannot insert a non record value."), }; let key = &state.registers[*key_reg]; + // NOTE(pere): Sending moved_before == true is okay because we moved before but + // if we were to set to false after starting a balance procedure, it might + // leave undefined state. return_if_io!(cursor.insert(key, record, true)); state.pc += 1; } From 1687072d7780f86a059ec29cfb186dc4ca32e7d6 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 16 Feb 2025 18:31:19 +0100 Subject: [PATCH 26/33] remove dbg --- core/storage/btree.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 3ee65673a..aa70651d8 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1822,7 +1822,6 @@ impl BTreeCursor { }; return_if_io!(self.move_to(SeekKey::TableRowId(*int_key as u64), SeekOp::EQ)); let page = self.stack.top(); - dbg!(page.get().id); // TODO(pere): request load return_if_locked!(page); From aea4560422207264608f838828cffe9a6fc5322b Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 16 Feb 2025 18:51:09 +0100 Subject: [PATCH 27/33] bring back corrupt errors --- core/storage/btree.rs | 137 +++++++++++++++++++++++++++++------------- 1 file changed, 95 insertions(+), 42 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index aa70651d8..c018f75ed 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -786,7 +786,8 @@ impl BTreeCursor { cell_payload.as_slice(), cell_idx, self.usable_space() as u16, - ); + ) + .unwrap(); contents.overflow_cells.len() }; let write_info = self @@ -1076,7 +1077,7 @@ impl BTreeCursor { cell_idx, parent_contents.cell_count() ); - drop_cell(parent_contents, cell_idx, self.usable_space() as u16); + drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?; } assert_eq!( write_info.divider_cells.borrow().len(), @@ -1393,7 +1394,8 @@ impl BTreeCursor { &new_divider_cell, first_divider_cell + i, self.usable_space() as u16, - ); + ) + .unwrap(); } // TODO: update pages let mut done = vec![false; sibling_count_new]; @@ -1796,11 +1798,12 @@ impl BTreeCursor { &cell_payload, cell_idx, 
self.usable_space() as u16, - ); - drop_cell(contents, cell_idx, self.usable_space() as u16); + ) + .unwrap(); + drop_cell(contents, cell_idx, self.usable_space() as u16)?; } else { // For leaf nodes, simply remove the cell - drop_cell(contents, cell_idx, self.usable_space() as u16); + drop_cell(contents, cell_idx, self.usable_space() as u16)?; } // TODO(Krishna): Implement balance after delete. I will implement after balance_nonroot is extended. @@ -2064,7 +2067,7 @@ impl CellArray { } /// Try to find a free block available and allocate it if found -fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> usize { +fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> Result { // NOTE: freelist is in ascending order of keys and pc // unuse_space is reserved bytes at the end of page, therefore we must substract from maxpc let mut pc = page_ref.first_freeblock() as usize; @@ -2095,13 +2098,23 @@ fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> u page_ref.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, frag); } pc += new_size as usize; - return pc; + return Ok(pc); } } prev_pc = pc; + if pc <= prev_pc && pc != 0 { + return Err(LimboError::Corrupt( + "Free list not in ascending order".into(), + )); + } pc = next as usize; } - 0 + if pc > maxpc + amount - 4 { + return Err(LimboError::Corrupt( + "Free block chain extends beyond page end".into(), + )); + } + Ok(0) } pub fn btree_init_page(page: &PageRef, page_type: PageType, offset: usize, usable_space: u16) { @@ -2132,7 +2145,7 @@ fn edit_page( number_new_cells: usize, cell_array: &CellArray, usable_space: u16, -) { +) -> Result<()> { tracing::trace!( "edit_page start_old_cells={} start_new_cells={} number_new_cells={} cell_array={}", start_old_cells, @@ -2150,7 +2163,7 @@ fn edit_page( start_new_cells - start_old_cells, cell_array, usable_space, - ); + )?; // shift pointers left let buf = page.as_ptr(); let (start, _) = page.cell_pointer_array_offset_and_size(); @@ -2167,7 +2180,7 @@ fn edit_page( end_old_cells - end_new_cells, cell_array, usable_space, - ); + )?; assert!(page.cell_count() >= number_tail_removed); count_cells -= number_tail_removed; } @@ -2176,7 +2189,7 @@ fn edit_page( // TODO: add to start if start_new_cells < start_old_cells { let count = number_new_cells.min(start_old_cells - start_new_cells); - page_insert_array(page, start_new_cells, count, cell_array, 0, usable_space); + page_insert_array(page, start_new_cells, count, cell_array, 0, usable_space)?; count_cells += count; } // TODO: overflow cells @@ -2193,7 +2206,7 @@ fn edit_page( cell_array, cell_idx, usable_space, - ); + )?; } } // TODO: append cells to end @@ -2204,9 +2217,10 @@ fn edit_page( cell_array, count_cells, usable_space, - ); + )?; // TODO: noverflow page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, number_new_cells as u16); + Ok(()) } fn page_free_array( @@ -2215,7 +2229,7 @@ fn page_free_array( count: usize, cell_array: &CellArray, usable_space: u16, -) -> usize { +) -> Result { let buf = &mut page.as_ptr()[page.offset..usable_space as usize]; let buf_range = buf.as_ptr_range(); let mut number_of_cells_removed = 0; @@ -2233,12 +2247,12 @@ fn page_free_array( // TODO: remove pointer too let offset = (cell_pointer.start as usize - buf_range.start as usize) as u16; let len = (cell_pointer.end as usize - cell_pointer.start as usize) as u16; - free_cell_range(page, offset, len, usable_space); + free_cell_range(page, offset, len, usable_space)?; 
page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1); number_of_cells_removed += 1; } } - number_of_cells_removed + Ok(number_of_cells_removed) } pub fn page_insert_array( page: &mut PageContent, @@ -2247,20 +2261,26 @@ pub fn page_insert_array( cell_array: &CellArray, mut start_insert: usize, usable_space: u16, -) { +) -> Result<()> { // TODO: implement faster algorithm, this is doing extra work that's not needed. // See pageInsertArray to understand faster way. for i in first..first + count { - insert_into_cell(page, cell_array.cells[i], start_insert, usable_space); + insert_into_cell(page, cell_array.cells[i], start_insert, usable_space)?; start_insert += 1; } + Ok(()) } /// Free the range of bytes that a cell occupies. /// This function also updates the freeblock list in the page. /// Freeblocks are used to keep track of free space in the page, /// and are organized as a linked list. -fn free_cell_range(page: &mut PageContent, mut offset: u16, len: u16, usable_space: u16) { +fn free_cell_range( + page: &mut PageContent, + mut offset: u16, + len: u16, + usable_space: u16, +) -> Result<()> { let mut size = len; let mut end = offset + len; let mut pointer_to_pc = page.offset as u16 + 1; @@ -2281,21 +2301,40 @@ fn free_cell_range(page: &mut PageContent, mut offset: u16, len: u16, usable_spa pointer_to_pc = pc; pc = next; } + + if pc > usable_space - 4 { + return Err(LimboError::Corrupt("Free block beyond usable space".into())); + } let mut removed_fragmentation = 0; if pc > 0 && offset + len + 3 >= pc { removed_fragmentation = (pc - end) as u8; + + if end > pc { + return Err(LimboError::Corrupt("Invalid block overlap".into())); + } end = pc + page.read_u16_no_offset(pc as usize); + if end > usable_space { + return Err(LimboError::Corrupt( + "Coalesced block extends beyond page".into(), + )); + } size = end - offset; } if pointer_to_pc > page.offset as u16 + 1 { let prev_end = pointer_to_pc + page.read_u16_no_offset(pointer_to_pc as usize + 2); if prev_end + 3 >= offset { + if prev_end > offset { + return Err(LimboError::Corrupt("Invalid previous block overlap".into())); + } removed_fragmentation += (offset - prev_end) as u8; size = end - pointer_to_pc; offset = pointer_to_pc; } } + if removed_fragmentation > page.num_frag_free_bytes() { + return Err(LimboError::Corrupt("Invalid fragmentation count".into())); + } let frag = page.num_frag_free_bytes() - removed_fragmentation; page.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, frag); @@ -2303,6 +2342,12 @@ fn free_cell_range(page: &mut PageContent, mut offset: u16, len: u16, usable_spa }; if offset <= page.cell_content_area() { + if offset < page.cell_content_area() { + return Err(LimboError::Corrupt("Free block before content area".into())); + } + if offset != PAGE_HEADER_OFFSET_FIRST_FREEBLOCK as u16 { + return Err(LimboError::Corrupt("Invalid content area merge".into())); + } page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, pc); page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, offset + len); } else { @@ -2310,6 +2355,7 @@ fn free_cell_range(page: &mut PageContent, mut offset: u16, len: u16, usable_spa page.write_u16_no_offset(offset as usize, pc); page.write_u16_no_offset(offset as usize + 2, size); } + Ok(()) } /// Defragment a page. This means packing all the cells to the end of the page. 
@@ -2378,7 +2424,12 @@ fn defragment_page(page: &PageContent, usable_space: u16) { /// insert_into_cell() is called from insert_into_page(), /// and the overflow cell count is used to determine if the page overflows, /// i.e. whether we need to balance the btree after the insert. -fn insert_into_cell(page: &mut PageContent, payload: &[u8], cell_idx: usize, usable_space: u16) { +fn insert_into_cell( + page: &mut PageContent, + payload: &[u8], + cell_idx: usize, + usable_space: u16, +) -> Result<()> { assert!( cell_idx <= page.cell_count(), "attempting to add cell to an incorrect place cell_idx={} cell_count={}", @@ -2394,11 +2445,11 @@ fn insert_into_cell(page: &mut PageContent, payload: &[u8], cell_idx: usize, usa index: cell_idx, payload: Pin::new(Vec::from(payload)), }); - return; + return Ok(()); } // TODO: insert into cell payload in internal page - let new_cell_data_pointer = allocate_cell_space(page, payload.len() as u16, usable_space); + let new_cell_data_pointer = allocate_cell_space(page, payload.len() as u16, usable_space)?; let buf = page.as_ptr(); // copy data @@ -2423,6 +2474,7 @@ fn insert_into_cell(page: &mut PageContent, payload: &[u8], cell_idx: usize, usa // update cell count let new_n_cells = (page.cell_count() + 1) as u16; page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, new_n_cells); + Ok(()) } /// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte @@ -2512,7 +2564,7 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { } /// Allocate space for a cell on a page. -fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) -> u16 { +fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) -> Result { let amount = amount as usize; let (cell_offset, _) = page_ref.cell_pointer_array_offset_and_size(); @@ -2522,9 +2574,9 @@ fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) - // there are free blocks and enough space if page_ref.first_freeblock() != 0 && gap + 2 <= top { // find slot - let pc = find_free_cell(page_ref, usable_space, amount); + let pc = find_free_cell(page_ref, usable_space, amount)?; if pc != 0 { - return pc as u16; + return Ok(pc as u16); } /* fall through, we might need to defragment */ } @@ -2540,7 +2592,7 @@ fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) - page_ref.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, top as u16); assert!(top + amount <= usable_space as usize); - top as u16 + Ok(top as u16) } /// Fill in the cell payload with the record. @@ -2686,14 +2738,14 @@ fn payload_overflow_threshold_min(_page_type: PageType, usable_space: u16) -> us /// Drop a cell from a page. /// This is done by freeing the range of bytes that the cell occupies. 
-fn drop_cell(page: &mut PageContent, cell_idx: usize, usable_space: u16) { +fn drop_cell(page: &mut PageContent, cell_idx: usize, usable_space: u16) -> Result<()> { let (cell_start, cell_len) = page.cell_get_raw_region( cell_idx, payload_overflow_threshold_max(page.page_type(), usable_space), payload_overflow_threshold_min(page.page_type(), usable_space), usable_space as usize, ); - free_cell_range(page, cell_start as u16, cell_len as u16, usable_space); + free_cell_range(page, cell_start as u16, cell_len as u16, usable_space)?; if page.cell_count() > 1 { shift_pointers_left(page, cell_idx); } else { @@ -2701,6 +2753,7 @@ fn drop_cell(page: &mut PageContent, cell_idx: usize, usable_space: u16) { page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); } page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1); + Ok(()) } /// Shift pointers to the left once starting from a cell position @@ -2816,7 +2869,7 @@ mod tests { 4096, db.pager.clone(), ); - insert_into_cell(page, &payload, pos, 4096); + insert_into_cell(page, &payload, pos, 4096).unwrap(); payload } @@ -2867,7 +2920,7 @@ mod tests { ensure_cell(page, i, &cell.payload); } cells.remove(1); - drop_cell(page, 1, usable_space); + drop_cell(page, 1, usable_space).unwrap(); for (i, cell) in cells.iter().enumerate() { ensure_cell(page, i, &cell.payload); @@ -3178,7 +3231,7 @@ mod tests { let mut new_cells = Vec::new(); for cell in cells { if cell.pos % 2 == 1 { - drop_cell(page, cell.pos - removed, usable_space); + drop_cell(page, cell.pos - removed, usable_space).unwrap(); removed += 1; } else { new_cells.push(cell); @@ -3444,7 +3497,7 @@ mod tests { ensure_cell(page, i, &cell.payload); } cells.remove(1); - drop_cell(page, 1, usable_space); + drop_cell(page, 1, usable_space).unwrap(); for (i, cell) in cells.iter().enumerate() { ensure_cell(page, i, &cell.payload); @@ -3484,7 +3537,7 @@ mod tests { let mut new_cells = Vec::new(); for cell in cells { if cell.pos % 2 == 1 { - drop_cell(page, cell.pos - removed, usable_space); + drop_cell(page, cell.pos - removed, usable_space).unwrap(); removed += 1; } else { new_cells.push(cell); @@ -3546,7 +3599,7 @@ mod tests { payload_overflow_threshold_min(page.page_type(), 4096), usable_space as usize, ); - drop_cell(page, cell_idx, usable_space); + drop_cell(page, cell_idx, usable_space).unwrap(); total_size -= len as u16 + 2; cells.remove(cell_idx); } @@ -3602,7 +3655,7 @@ mod tests { let payload = add_record(0, 0, page, record, &db); assert_eq!(page.cell_count(), 1); - drop_cell(page, 0, usable_space); + drop_cell(page, 0, usable_space).unwrap(); assert_eq!(page.cell_count(), 0); let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); @@ -3638,7 +3691,7 @@ mod tests { for i in 0..100 { assert_eq!(page.cell_count(), 1); - drop_cell(page, 0, usable_space); + drop_cell(page, 0, usable_space).unwrap(); assert_eq!(page.cell_count(), 0); let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); @@ -3671,8 +3724,8 @@ mod tests { let record = Record::new([OwnedValue::Integer(2 as i64)].to_vec()); let _ = add_record(2, 2, page, record, &db); - drop_cell(page, 1, usable_space); - drop_cell(page, 1, usable_space); + drop_cell(page, 1, usable_space).unwrap(); + drop_cell(page, 1, usable_space).unwrap(); ensure_cell(page, 0, &payload); } @@ -3691,14 +3744,14 @@ mod tests { let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let _ = add_record(0, 0, page, record, &db); - drop_cell(page, 0, usable_space); + drop_cell(page, 0, usable_space).unwrap(); 
defragment_page(page, usable_space); let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let _ = add_record(0, 1, page, record, &db); - drop_cell(page, 0, usable_space); + drop_cell(page, 0, usable_space).unwrap(); let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); let _ = add_record(0, 1, page, record, &db); From 286cb8c5bc9aff6404103b7cf005e03c7f5348bd Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 16 Feb 2025 19:10:15 +0100 Subject: [PATCH 28/33] fix some free_cell_range pointer reads --- core/storage/btree.rs | 48 ++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index c018f75ed..e2b286d4c 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1435,7 +1435,7 @@ impl BTreeCursor { number_new_cells, &cell_array, self.usable_space() as u16, - ); + )?; tracing::trace!( "edit_page page={} cells={}", pages_to_balance_new[page_idx].get().id, @@ -2312,7 +2312,7 @@ fn free_cell_range( if end > pc { return Err(LimboError::Corrupt("Invalid block overlap".into())); } - end = pc + page.read_u16_no_offset(pc as usize); + end = pc + page.read_u16_no_offset(pc as usize + 2); if end > usable_space { return Err(LimboError::Corrupt( "Coalesced block extends beyond page".into(), @@ -2345,7 +2345,7 @@ fn free_cell_range( if offset < page.cell_content_area() { return Err(LimboError::Corrupt("Free block before content area".into())); } - if offset != PAGE_HEADER_OFFSET_FIRST_FREEBLOCK as u16 { + if pointer_to_pc != page.offset as u16 + PAGE_HEADER_OFFSET_FIRST_FREEBLOCK as u16 { return Err(LimboError::Corrupt("Invalid content area merge".into())); } page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, pc); @@ -2896,7 +2896,6 @@ mod tests { #[test] fn test_drop_1() { - set_breakpoint_panic(); let db = get_database(); let page = get_page(2); @@ -3205,8 +3204,7 @@ mod tests { } } #[test] - fn test_drop_odd() { - set_breakpoint_panic(); + pub fn test_drop_odd() { let db = get_database(); let page = get_page(2); @@ -3334,7 +3332,7 @@ mod tests { } #[test] - fn test_clear_overflow_pages() -> Result<()> { + pub fn test_clear_overflow_pages() -> Result<()> { let (pager, db_header) = setup_test_env(5); let cursor = BTreeCursor::new(pager.clone(), 1); @@ -3431,7 +3429,7 @@ mod tests { } #[test] - fn test_clear_overflow_pages_no_overflow() -> Result<()> { + pub fn test_clear_overflow_pages_no_overflow() -> Result<()> { let (pager, db_header) = setup_test_env(5); let cursor = BTreeCursor::new(pager.clone(), 1); @@ -3472,8 +3470,7 @@ mod tests { Ok(()) } #[test] - fn test_defragment() { - set_breakpoint_panic(); + pub fn test_defragment() { let db = get_database(); let page = get_page(2); @@ -3511,8 +3508,7 @@ mod tests { } #[test] - fn test_drop_odd_with_defragment() { - set_breakpoint_panic(); + pub fn test_drop_odd_with_defragment() { let db = get_database(); let page = get_page(2); @@ -3556,8 +3552,7 @@ mod tests { } #[test] - fn test_fuzz_drop_defragment_insert() { - set_breakpoint_panic(); + pub fn test_fuzz_drop_defragment_insert() { let db = get_database(); let page = get_page(2); @@ -3614,7 +3609,7 @@ mod tests { } #[test] - fn test_defragment_1() { + pub fn test_defragment_1() { let db = get_database(); let page = get_page(2); @@ -3638,7 +3633,7 @@ mod tests { } #[test] - fn test_insert_drop_insert() { + pub fn test_insert_drop_insert() { let db = get_database(); let page = get_page(2); @@ -3673,7 +3668,7 @@ mod tests { } #[test] - fn 
test_insert_drop_insert_multiple() { + pub fn test_insert_drop_insert_multiple() { let db = get_database(); let page = get_page(2); @@ -3710,7 +3705,7 @@ mod tests { } #[test] - fn test_drop_a_few_insert() { + pub fn test_drop_a_few_insert() { let db = get_database(); let page = get_page(2); @@ -3731,8 +3726,7 @@ mod tests { } #[test] - fn test_fuzz_victim_1() { - set_breakpoint_panic(); + pub fn test_fuzz_victim_1() { let db = get_database(); let page = get_page(2); @@ -3758,7 +3752,7 @@ mod tests { } #[test] - fn btree_insert_sequential() { + pub fn btree_insert_sequential() { let (pager, root_page) = empty_btree(); let mut keys = Vec::new(); for i in 0..10000 { @@ -3807,16 +3801,4 @@ mod tests { } } } - - fn set_breakpoint_panic() { - // Set custom panic hook at start of program - panic::set_hook(Box::new(|panic_info| { - unsafe { - std::arch::asm!("brk #0"); - } - - // Optionally print the panic info - eprintln!("Panic occurred: {:?}", panic_info); - })); - } } From a6f5bcbaf4ac53e6f3c2790c1c1cf5d1f5fc9c35 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 16 Feb 2025 19:21:37 +0100 Subject: [PATCH 29/33] fix return find_free_cell --- core/storage/btree.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index e2b286d4c..5d9a488c0 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2098,16 +2098,16 @@ fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> R page_ref.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, frag); } pc += new_size as usize; - return Ok(pc); } + return Ok(pc); } prev_pc = pc; + pc = next as usize; if pc <= prev_pc && pc != 0 { return Err(LimboError::Corrupt( "Free list not in ascending order".into(), )); } - pc = next as usize; } if pc > maxpc + amount - 4 { return Err(LimboError::Corrupt( From d276c22a0c34df365b364b21b8df41ab32ea71c9 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 16 Feb 2025 19:25:41 +0100 Subject: [PATCH 30/33] clippy --- core/storage/btree.rs | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5d9a488c0..f64800973 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1124,7 +1124,7 @@ impl BTreeCursor { cell_array.cells.push(to_static_buf(cell_buf)); } // Insert overflow cells into correct place - let mut offset = total_cells_inserted; + let offset = total_cells_inserted; assert!( old_page_contents.overflow_cells.len() <= 1, "todo: check this works for more than one overflow cell" @@ -1399,8 +1399,8 @@ impl BTreeCursor { } // TODO: update pages let mut done = vec![false; sibling_count_new]; - for i in (1 as i64 - sibling_count_new as i64)..sibling_count_new as i64 { - let page_idx = i.abs() as usize; + for i in (1 - sibling_count_new as i64)..sibling_count_new as i64 { + let page_idx = i.unsigned_abs() as usize; if done[page_idx] { continue; } @@ -2001,15 +2001,6 @@ impl PageStack { page } - /// Get the parent page of the current page. 
- fn parent(&self) -> PageRef { - let current = *self.current_page.borrow(); - self.stack.borrow()[current as usize - 1] - .as_ref() - .unwrap() - .clone() - } - /// Current page pointer being used fn current(&self) -> usize { *self.current_page.borrow() as usize @@ -2254,7 +2245,7 @@ fn page_free_array( } Ok(number_of_cells_removed) } -pub fn page_insert_array( +fn page_insert_array( page: &mut PageContent, first: usize, count: usize, From 4d6843d1cc9a3f95ab09be7467766b6501674bba Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 16 Feb 2025 19:27:34 +0100 Subject: [PATCH 31/33] clippy --- core/storage/btree.rs | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index f64800973..f2db4784b 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -3553,7 +3553,6 @@ mod tests { let mut total_size = 0; let mut cells = Vec::new(); let usable_space = 4096; - let total_cells = 10; let mut i = 1000; let seed = thread_rng().gen(); let mut rng = ChaCha8Rng::seed_from_u64(seed); @@ -3633,12 +3632,12 @@ mod tests { let record = Record::new( [ - OwnedValue::Integer(0 as i64), + OwnedValue::Integer(0), OwnedValue::Text(Text::new("aaaaaaaa")), ] .to_vec(), ); - let payload = add_record(0, 0, page, record, &db); + let _ = add_record(0, 0, page, record, &db); assert_eq!(page.cell_count(), 1); drop_cell(page, 0, usable_space).unwrap(); @@ -3668,19 +3667,19 @@ mod tests { let record = Record::new( [ - OwnedValue::Integer(0 as i64), + OwnedValue::Integer(0), OwnedValue::Text(Text::new("aaaaaaaa")), ] .to_vec(), ); - let payload = add_record(0, 0, page, record, &db); + let _ = add_record(0, 0, page, record, &db); - for i in 0..100 { + for _ in 0..100 { assert_eq!(page.cell_count(), 1); drop_cell(page, 0, usable_space).unwrap(); assert_eq!(page.cell_count(), 0); - let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0)].to_vec()); let payload = add_record(0, 0, page, record, &db); assert_eq!(page.cell_count(), 1); @@ -3703,11 +3702,11 @@ mod tests { let page = page.get_contents(); let usable_space = 4096; - let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0)].to_vec()); let payload = add_record(0, 0, page, record, &db); - let record = Record::new([OwnedValue::Integer(1 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(1)].to_vec()); let _ = add_record(1, 1, page, record, &db); - let record = Record::new([OwnedValue::Integer(2 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(2)].to_vec()); let _ = add_record(2, 2, page, record, &db); drop_cell(page, 1, usable_space).unwrap(); @@ -3724,21 +3723,21 @@ mod tests { let page = page.get_contents(); let usable_space = 4096; - let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0)].to_vec()); let _ = add_record(0, 0, page, record, &db); - let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0)].to_vec()); let _ = add_record(0, 0, page, record, &db); drop_cell(page, 0, usable_space).unwrap(); defragment_page(page, usable_space); - let record = Record::new([OwnedValue::Integer(0 as i64)].to_vec()); + let record = Record::new([OwnedValue::Integer(0)].to_vec()); let _ = add_record(0, 1, page, record, &db); drop_cell(page, 0, usable_space).unwrap(); - let record = 
Record::new([OwnedValue::Integer(0 as i64)].to_vec());
+        let record = Record::new([OwnedValue::Integer(0)].to_vec());
         let _ = add_record(0, 1, page, record, &db);
     }
 
From ddbfada8bdaa4323b493a555d29bab2ee995e459 Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Mon, 17 Feb 2025 08:47:55 +0100
Subject: [PATCH 32/33] fix wrong usage of insert in fuzz tests

---
 core/storage/btree.rs | 59 +++++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/core/storage/btree.rs b/core/storage/btree.rs
index f2db4784b..b33109f5b 100644
--- a/core/storage/btree.rs
+++ b/core/storage/btree.rs
@@ -2775,6 +2775,7 @@ mod tests {
     use crate::types::Text;
     use crate::{BufferPool, DatabaseStorage, WalFile, WalFileShared, WriteCompletion};
     use std::cell::RefCell;
+    use std::ops::Deref;
     use std::panic;
     use std::rc::Rc;
     use std::sync::Arc;
@@ -3170,9 +3171,18 @@ mod tests {
                 size,
                 insert_id
             );
+            run_until_done(
+                || {
+                    let key = SeekKey::TableRowId(key as u64);
+                    cursor.move_to(key, SeekOp::EQ)
+                },
+                pager.deref(),
+            )
+            .unwrap();
+
             let key = OwnedValue::Integer(key);
             let value = Record::new(vec![OwnedValue::Blob(Rc::new(vec![0; size]))]);
-            cursor.insert(&key, &value, false).unwrap();
+            run_until_done(|| cursor.insert(&key, &value, true), pager.deref()).unwrap();
         }
         tracing::info!(
             "=========== btree ===========\n{}\n\n",
@@ -3751,23 +3761,15 @@ mod tests {
             let key = OwnedValue::Integer(i);
             let value = Record::new(vec![OwnedValue::Integer(i)]);
             tracing::trace!("before insert {}", i);
-            loop {
-                let key = SeekKey::TableRowId(i as u64);
-                match cursor.move_to(key, SeekOp::EQ).unwrap() {
-                    CursorResult::Ok(_) => break,
-                    CursorResult::IO => {
-                        pager.io.run_once().unwrap();
-                    }
-                }
-            }
-            loop {
-                match cursor.insert(&key, &value, true).unwrap() {
-                    CursorResult::Ok(_) => break,
-                    CursorResult::IO => {
-                        pager.io.run_once().unwrap();
-                    }
-                }
-            }
+            run_until_done(
+                || {
+                    let key = SeekKey::TableRowId(i as u64);
+                    cursor.move_to(key, SeekOp::EQ)
+                },
+                pager.deref(),
+            )
+            .unwrap();
+            run_until_done(|| cursor.insert(&key, &value, true), pager.deref()).unwrap();
             keys.push(i);
         }
         if matches!(validate_btree(pager.clone(), root_page), (_, false)) {
@@ -3780,14 +3782,21 @@
         for key in keys.iter() {
             let mut cursor = BTreeCursor::new(pager.clone(), root_page);
             let key = OwnedValue::Integer(*key);
-            loop {
-                match cursor.exists(&key).unwrap() {
-                    CursorResult::Ok(exists) => {
-                        assert!(exists, "key {} is not found", key);
-                        break;
-                    }
-                    CursorResult::IO => pager.io.run_once().unwrap(),
+            let exists = run_until_done(|| cursor.exists(&key), pager.deref()).unwrap();
+            assert!(exists, "key not found {}", key);
+        }
+    }
+
+    fn run_until_done<T>(
+        mut action: impl FnMut() -> Result<CursorResult<T>>,
+        pager: &Pager,
+    ) -> Result<T> {
+        loop {
+            match action()? {
+                CursorResult::Ok(res) => {
+                    return Ok(res);
+                }
+                CursorResult::IO => pager.io.run_once().unwrap(),
+            }
+        }
+    }
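
The run_until_done helper added in this patch is the driver for the re-entrant
cursor API: any cursor operation can return CursorResult::IO when a page it
needs is not yet resident, and the caller must pump the pager's event loop and
retry. The seek-before-insert change above matters for the same reason: move_to
may itself take several I/O round trips before the cursor is positioned, and
the boolean flipped to true in insert appears to signal that positioning has
already happened. A minimal, self-contained sketch of the retry pattern, with a
hypothetical FakeIo standing in for the real pager (the names below are
illustrative, not Limbo's API):

    // Retry-on-IO driver: invoke `action` until it completes, running one
    // I/O step between attempts. Mirrors run_until_done above, minus the
    // Result plumbing.
    enum CursorResult<T> {
        Ok(T),
        IO, // the operation is blocked on an I/O completion
    }

    struct FakeIo {
        pending: u32,
    }

    impl FakeIo {
        // Stand-in for pager.io.run_once(): complete one pending I/O.
        fn run_once(&mut self) {
            self.pending = self.pending.saturating_sub(1);
        }
    }

    fn run_until_done<T>(
        mut action: impl FnMut(&FakeIo) -> CursorResult<T>,
        io: &mut FakeIo,
    ) -> T {
        loop {
            match action(io) {
                CursorResult::Ok(res) => return res,
                CursorResult::IO => io.run_once(),
            }
        }
    }

    fn main() {
        let mut io = FakeIo { pending: 3 };
        // The "cursor operation" succeeds only once all its I/O has completed.
        let res = run_until_done(
            |io| {
                if io.pending == 0 {
                    CursorResult::Ok(42)
                } else {
                    CursorResult::IO
                }
            },
            &mut io,
        );
        assert_eq!(res, 42);
    }
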
From e25272adc00cdf103cd6a29ff2c515ce45e64abc Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Mon, 17 Feb 2025 11:50:15 +0100
Subject: [PATCH 33/33] fix free_cell_range extend content area

---
 core/storage/btree.rs | 151 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 132 insertions(+), 19 deletions(-)

diff --git a/core/storage/btree.rs b/core/storage/btree.rs
index b33109f5b..ebe83174e 100644
--- a/core/storage/btree.rs
+++ b/core/storage/btree.rs
@@ -2172,7 +2172,7 @@ fn edit_page(
             cell_array,
             usable_space,
         )?;
-        assert!(page.cell_count() >= number_tail_removed);
+        assert!(count_cells >= number_tail_removed);
         count_cells -= number_tail_removed;
     }
     // TODO: make page_free_array defragment, for now I'm lazy so this will work for now.
@@ -2187,17 +2187,19 @@ fn edit_page(
     for i in 0..page.overflow_cells.len() {
         let overflow_cell = &page.overflow_cells[i];
         // cell index in context of new list of cells that should be in the page
-        let cell_idx = start_old_cells + overflow_cell.index - start_new_cells;
-        if cell_idx < number_new_cells {
-            count_cells += 1;
-            page_insert_array(
-                page,
-                start_new_cells + cell_idx,
-                1,
-                cell_array,
-                cell_idx,
-                usable_space,
-            )?;
+        if start_old_cells + overflow_cell.index >= start_new_cells {
+            let cell_idx = start_old_cells + overflow_cell.index - start_new_cells;
+            if cell_idx < number_new_cells {
+                count_cells += 1;
+                page_insert_array(
+                    page,
+                    start_new_cells + cell_idx,
+                    1,
+                    cell_array,
+                    cell_idx,
+                    usable_space,
+                )?;
+            }
         }
     }
     // TODO: append cells to end
@@ -2283,11 +2285,19 @@ fn free_cell_range(
     // then we need to do some more calculation to figure out where to insert the freeblock
     // in the freeblock linked list.
     let first_block = page.first_freeblock();
-    let maxpc = usable_space;
     let mut pc = first_block;
 
-    while pc <= maxpc && pc < offset && pc != 0 {
+    while pc < offset {
+        if pc <= pointer_to_pc {
+            if pc == 0 {
+                break;
+            }
+            return Err(LimboError::Corrupt(
+                "free cell range free block not in ascending order".into(),
+            ));
+        }
+
         let next = page.read_u16_no_offset(pc as usize);
         pointer_to_pc = pc;
         pc = next;
@@ -2310,6 +2320,7 @@ fn free_cell_range(
             ));
         }
         size = end - offset;
+        pc = page.read_u16_no_offset(pc as usize);
     }
 
     if pointer_to_pc > page.offset as u16 + 1 {
@@ -2340,7 +2351,7 @@ fn free_cell_range(
             return Err(LimboError::Corrupt("Invalid content area merge".into()));
         }
         page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, pc);
-        page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, offset + len);
+        page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, end);
     } else {
         page.write_u16_no_offset(pointer_to_pc as usize, offset);
         page.write_u16_no_offset(offset as usize, pc);
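
Context for the free_cell_range changes above: on a SQLite-format page, freed
ranges live in a singly-linked freeblock list kept in ascending offset order,
and each freeblock begins with a 2-byte pointer to the next block followed by
a 2-byte size (which is why PATCH 28 reads the size at pc + 2). The loop walks
that list to find the insertion point and merges the freed range with adjacent
blocks; when the merged block sits flush against the start of the cell content
area, the content area is extended past it instead of threading a new
freeblock. After coalescing, the merged block can end beyond offset + len,
which is why the header write above uses end. A small sketch of the coalescing
arithmetic on plain (offset, size) pairs, not Limbo's page layout:

    // Merge a freed range into a set of freeblocks, growing a block whenever
    // its predecessor ends exactly where it starts.
    fn coalesce(mut blocks: Vec<(u16, u16)>, offset: u16, len: u16) -> Vec<(u16, u16)> {
        blocks.push((offset, len));
        // Freeblocks must stay in ascending offset order; the checks above
        // reject pages whose on-disk list violates this.
        blocks.sort_unstable();
        let mut merged: Vec<(u16, u16)> = Vec::new();
        for (off, size) in blocks {
            match merged.last_mut() {
                // The previous block ends exactly where this one starts.
                Some((prev_off, prev_size)) if *prev_off + *prev_size == off => {
                    *prev_size += size;
                }
                _ => merged.push((off, size)),
            }
        }
        merged
    }

    fn main() {
        // Freeing 16 bytes at offset 4000 next to an existing freeblock at
        // 4016..4036 leaves one block 4000..4036, so end = 4036, while
        // offset + len = 4016 would lose the merged tail.
        assert_eq!(coalesce(vec![(4016, 20)], 4000, 16), vec![(4000, 36)]);
    }
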
@@ -2351,7 +2362,6 @@ fn free_cell_range(
 
 /// Defragment a page. This means packing all the cells to the end of the page.
 fn defragment_page(page: &PageContent, usable_space: u16) {
-    // TODO: test this
     tracing::debug!("defragment_page");
     let cloned_page = page.clone();
     // TODO(pere): usable space should include offset probably
@@ -2574,7 +2584,7 @@ fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) -
 
     if gap + 2 + amount > top {
         // defragment
-        defragment_page(page_ref, usable_space as u16);
+        defragment_page(page_ref, usable_space);
         top = page_ref.read_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA) as usize;
     }
 
@@ -2742,6 +2752,7 @@ fn drop_cell(page: &mut PageContent, cell_idx: usize, usable_space: u16) -> Resu
     } else {
         page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, usable_space);
         page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
+        page.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0);
     }
     page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1);
     Ok(())
@@ -3572,13 +3583,22 @@ mod tests {
             0 => {
                 // allow appends with extra place to insert
                 let cell_idx = rng.next_u64() as usize % (page.cell_count() + 1);
-                let record = Record::new([OwnedValue::Integer(i as i64)].to_vec());
-                let payload = add_record(i, cell_idx, page, record, &db);
                 let free = compute_free_space(page, usable_space);
+                let record = Record::new([OwnedValue::Integer(i as i64)].to_vec());
+                let mut payload: Vec<u8> = Vec::new();
+                fill_cell_payload(
+                    page.page_type(),
+                    Some(i as u64),
+                    &mut payload,
+                    &record,
+                    4096,
+                    db.pager.clone(),
+                );
                 if (free as usize) < payload.len() - 2 {
                     // do not try to insert overflow pages because they require balancing
                     continue;
                 }
+                insert_into_cell(page, &payload, cell_idx, 4096).unwrap();
                 assert!(page.overflow_cells.is_empty());
                 total_size += payload.len() as u16 + 2;
                 cells.push(Cell { pos: i, payload });
@@ -3608,6 +3628,20 @@ mod tests {
         }
     }
 
+    #[test]
+    pub fn test_free_space() {
+        let db = get_database();
+        let page = get_page(2);
+        let page = page.get_contents();
+        let header_size = 8;
+        let usable_space = 4096;
+
+        let record = Record::new([OwnedValue::Integer(0)].to_vec());
+        let payload = add_record(0, 0, page, record, &db);
+        let free = compute_free_space(page, usable_space);
+        assert_eq!(free, 4096 - payload.len() as u16 - 2 - header_size);
+    }
+
     #[test]
     pub fn test_defragment_1() {
         let db = get_database();
@@ -3751,6 +3785,85 @@ mod tests {
         let _ = add_record(0, 1, page, record, &db);
     }
 
+    #[test]
+    pub fn test_fuzz_victim_2() {
+        let db = get_database();
+
+        let page = get_page(2);
+        let usable_space = 4096;
+        let insert = |pos, page| {
+            let record = Record::new([OwnedValue::Integer(0)].to_vec());
+            let _ = add_record(0, pos, page, record, &db);
+        };
+        let drop = |pos, page| {
+            drop_cell(page, pos, usable_space).unwrap();
+        };
+        let defragment = |page| {
+            defragment_page(page, usable_space);
+        };
+        defragment(page.get_contents());
+        defragment(page.get_contents());
+        insert(0, page.get_contents());
+        drop(0, page.get_contents());
+        insert(0, page.get_contents());
+        drop(0, page.get_contents());
+        insert(0, page.get_contents());
+        defragment(page.get_contents());
+        defragment(page.get_contents());
+        drop(0, page.get_contents());
+        defragment(page.get_contents());
+        insert(0, page.get_contents());
+        drop(0, page.get_contents());
+        insert(0, page.get_contents());
+        insert(1, page.get_contents());
+        insert(1, page.get_contents());
+        insert(0, page.get_contents());
+        drop(3, page.get_contents());
+        drop(2, page.get_contents());
+        compute_free_space(page.get_contents(), usable_space);
+    }
+
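
test_fuzz_victim_2 above and test_fuzz_victim_3 below both drive the page into
states where compute_free_space must still satisfy the accounting that
test_free_space checks directly: free space equals usable_space minus the page
header minus, for each remaining cell, its payload plus its 2-byte
pointer-array slot. A sketch of that arithmetic, assuming the 8-byte leaf-page
header these tests use:

    // Expected free bytes on a page holding cells with the given payload sizes.
    fn expected_free(usable_space: u16, header_size: u16, payload_sizes: &[u16]) -> u16 {
        let cells: u16 = payload_sizes.iter().map(|p| p + 2).sum();
        usable_space - header_size - cells
    }

    fn main() {
        // One 11-byte cell on a 4096-byte leaf page: 4096 - 8 - (11 + 2) = 4075.
        assert_eq!(expected_free(4096, 8, &[11]), 4075);
    }
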
+    #[test]
+    pub fn test_fuzz_victim_3() {
+        let db = get_database();
+
+        let page = get_page(2);
+        let usable_space = 4096;
+        let insert = |pos, page| {
+            let record = Record::new([OwnedValue::Integer(0)].to_vec());
+            let _ = add_record(0, pos, page, record, &db);
+        };
+        let drop = |pos, page| {
+            drop_cell(page, pos, usable_space).unwrap();
+        };
+        let defragment = |page| {
+            defragment_page(page, usable_space);
+        };
+        let record = Record::new([OwnedValue::Integer(0)].to_vec());
+        let mut payload: Vec<u8> = Vec::new();
+        fill_cell_payload(
+            page.get_contents().page_type(),
+            Some(0),
+            &mut payload,
+            &record,
+            4096,
+            db.pager.clone(),
+        );
+        insert(0, page.get_contents());
+        defragment(page.get_contents());
+        insert(0, page.get_contents());
+        defragment(page.get_contents());
+        insert(0, page.get_contents());
+        drop(2, page.get_contents());
+        drop(0, page.get_contents());
+        let free = compute_free_space(page.get_contents(), usable_space);
+        let total_size = payload.len() + 2;
+        assert_eq!(
+            free,
+            usable_space - page.get_contents().header_size() as u16 - total_size as u16
+        );
+        dbg!(free);
+    }
     #[test]
     pub fn btree_insert_sequential() {
         let (pager, root_page) = empty_btree();