mirror of
https://github.com/aljazceru/turso.git
synced 2026-01-05 09:14:24 +01:00
Merge 'Fix various bugs in B-Tree handling' from Nikita Sivukhin
This PR introduces a simple fuzz test for the BTree insertion algorithm and fixes a few bugs found by the fuzzer:
- The BTree algorithm returned early even though there were overflow pages on the stack and more rebalances were needed.
- The BTree balancing algorithm worked under the assumption that a single page would be enough for a rebalance, which is not always true (if a page is tightly packed with relatively big cells, inserting a new very big cell can require 3 split pages to distribute the content between them).
- `overflow_cells` wasn't cleared properly during rebalancing.
- Insertions of dividers into the parent node were implemented incorrectly.
- `defragment_page` didn't reset the `PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT` field, which can lead to suboptimal usage of pages.

Closes #951
This commit is contained in:
6
Cargo.lock
generated
6
Cargo.lock
generated
@@ -737,6 +737,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
|
||||
dependencies = [
|
||||
"log",
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -768,7 +769,10 @@ version = "0.11.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0"
|
||||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"env_filter",
|
||||
"humantime",
|
||||
"log",
|
||||
]
|
||||
|
||||
@@ -1606,6 +1610,7 @@ dependencies = [
|
||||
"chrono",
|
||||
"criterion",
|
||||
"crossbeam-skiplist",
|
||||
"env_logger 0.11.6",
|
||||
"fallible-iterator 0.3.0",
|
||||
"getrandom 0.2.15",
|
||||
"hex",
|
||||
@@ -1634,6 +1639,7 @@ dependencies = [
|
||||
"quickcheck",
|
||||
"quickcheck_macros",
|
||||
"rand 0.8.5",
|
||||
"rand_chacha 0.9.0",
|
||||
"regex",
|
||||
"regex-syntax",
|
||||
"rstest",
|
||||
|
||||
@@ -91,7 +91,9 @@ rusqlite = "0.29.0"
|
||||
tempfile = "3.8.0"
|
||||
quickcheck = { version = "1.0", default-features = false }
|
||||
quickcheck_macros = { version = "1.0", default-features = false }
|
||||
rand = "0.8" # Required for quickcheck
|
||||
rand = "0.8.5" # Required for quickcheck
|
||||
rand_chacha = "0.9.0"
|
||||
env_logger = "0.11.6"
|
||||
|
||||
[[bench]]
|
||||
name = "benchmark"
|
||||
|
||||
@@ -2,7 +2,7 @@ use log::debug;
|
||||
|
||||
use crate::storage::pager::Pager;
|
||||
use crate::storage::sqlite3_ondisk::{
|
||||
read_btree_cell, read_varint, write_varint, BTreeCell, DatabaseHeader, PageContent, PageType,
|
||||
read_btree_cell, read_varint, BTreeCell, DatabaseHeader, PageContent, PageType,
|
||||
TableInteriorCell, TableLeafCell,
|
||||
};
|
||||
|
||||
@@ -76,7 +76,7 @@ macro_rules! return_if_locked {
|
||||
|
||||
/// State machine of a write operation.
|
||||
/// May involve balancing due to overflow.
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
enum WriteState {
|
||||
Start,
|
||||
BalanceStart,
|
||||
@@ -89,8 +89,10 @@ enum WriteState {
|
||||
struct WriteInfo {
|
||||
/// State of the write operation state machine.
|
||||
state: WriteState,
|
||||
/// Pages allocated during the write operation due to balancing.
|
||||
new_pages: RefCell<Vec<PageRef>>,
|
||||
/// Pages involved in the split of the page due to balancing (split_pages[0] is the page being balanced, the others are freshly allocated pages)
|
||||
split_pages: RefCell<Vec<PageRef>>,
|
||||
/// Amount of cells from balancing page for every split page
|
||||
split_pages_cells_count: RefCell<Vec<usize>>,
|
||||
/// Scratch space used during balancing.
|
||||
scratch_cells: RefCell<Vec<&'static [u8]>>,
|
||||
/// Bookkeeping of the rightmost pointer so the PAGE_HEADER_OFFSET_RIGHTMOST_PTR can be updated.
|
||||
@@ -103,7 +105,8 @@ impl WriteInfo {
|
||||
fn new() -> WriteInfo {
|
||||
WriteInfo {
|
||||
state: WriteState::Start,
|
||||
new_pages: RefCell::new(Vec::with_capacity(4)),
|
||||
split_pages: RefCell::new(Vec::with_capacity(4)),
|
||||
split_pages_cells_count: RefCell::new(Vec::with_capacity(4)),
|
||||
scratch_cells: RefCell::new(Vec::new()),
|
||||
rightmost_pointer: RefCell::new(None),
|
||||
page_copy: RefCell::new(None),
|
||||
@@ -742,13 +745,14 @@ impl BTreeCursor {
|
||||
// insert
|
||||
let overflow = {
|
||||
let contents = page.get().contents.as_mut().unwrap();
|
||||
debug!(
|
||||
"insert_into_page(overflow, cell_count={})",
|
||||
contents.cell_count()
|
||||
);
|
||||
|
||||
self.insert_into_cell(contents, cell_payload.as_slice(), cell_idx);
|
||||
contents.overflow_cells.len()
|
||||
let overflow_cells = contents.overflow_cells.len();
|
||||
debug!(
|
||||
"insert_into_page(overflow, cell_count={}, overflow_cells={})",
|
||||
contents.cell_count(),
|
||||
overflow_cells
|
||||
);
|
||||
overflow_cells
|
||||
};
|
||||
let write_info = self
|
||||
.state
|
||||
@@ -1049,45 +1053,38 @@ impl BTreeCursor {
|
||||
matches!(self.state, CursorState::Write(_)),
|
||||
"Cursor must be in balancing state"
|
||||
);
|
||||
let state = self
|
||||
.state
|
||||
.write_info()
|
||||
.expect("must be balancing")
|
||||
.state
|
||||
.clone();
|
||||
match state {
|
||||
WriteState::BalanceStart => {
|
||||
// drop divider cells and find right pointer
|
||||
// NOTE: since we are doing a simple split we only finding the pointer we want to update (right pointer).
|
||||
// Right pointer means cell that points to the last page, as we don't really want to drop this one. This one
|
||||
// can be a "rightmost pointer" or a "cell".
|
||||
// we always asumme there is a parent
|
||||
let current_page = self.stack.top();
|
||||
{
|
||||
// check if we don't need to balance
|
||||
// don't continue if there are no overflow cells
|
||||
let page = current_page.get().contents.as_mut().unwrap();
|
||||
if page.overflow_cells.is_empty() {
|
||||
let write_info = self.state.mut_write_info().unwrap();
|
||||
write_info.state = WriteState::Finish;
|
||||
loop {
|
||||
let state = self.state.write_info().expect("must be balancing").state;
|
||||
match state {
|
||||
WriteState::BalanceStart => {
|
||||
let current_page = self.stack.top();
|
||||
{
|
||||
// check if we don't need to balance
|
||||
// don't continue if there are no overflow cells
|
||||
let page = current_page.get().contents.as_mut().unwrap();
|
||||
if page.overflow_cells.is_empty() {
|
||||
let write_info = self.state.mut_write_info().unwrap();
|
||||
write_info.state = WriteState::Finish;
|
||||
return Ok(CursorResult::Ok(()));
|
||||
}
|
||||
}
|
||||
|
||||
if !self.stack.has_parent() {
|
||||
self.balance_root();
|
||||
return Ok(CursorResult::Ok(()));
|
||||
}
|
||||
|
||||
let write_info = self.state.mut_write_info().unwrap();
|
||||
write_info.state = WriteState::BalanceNonRoot;
|
||||
}
|
||||
WriteState::BalanceNonRoot
|
||||
| WriteState::BalanceGetParentPage
|
||||
| WriteState::BalanceMoveUp => {
|
||||
return_if_io!(self.balance_non_root());
|
||||
}
|
||||
|
||||
if !self.stack.has_parent() {
|
||||
self.balance_root();
|
||||
return Ok(CursorResult::Ok(()));
|
||||
}
|
||||
|
||||
let write_info = self.state.mut_write_info().unwrap();
|
||||
write_info.state = WriteState::BalanceNonRoot;
|
||||
self.balance_non_root()
|
||||
_ => unreachable!("invalid balance leaf state {:?}", state),
|
||||
}
|
||||
WriteState::BalanceNonRoot
|
||||
| WriteState::BalanceGetParentPage
|
||||
| WriteState::BalanceMoveUp => self.balance_non_root(),
|
||||
|
||||
_ => unreachable!("invalid balance leaf state {:?}", state),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1096,12 +1093,7 @@ impl BTreeCursor {
|
||||
matches!(self.state, CursorState::Write(_)),
|
||||
"Cursor must be in balancing state"
|
||||
);
|
||||
let state = self
|
||||
.state
|
||||
.write_info()
|
||||
.expect("must be balancing")
|
||||
.state
|
||||
.clone();
|
||||
let state = self.state.write_info().expect("must be balancing").state;
|
||||
let (next_write_state, result) = match state {
|
||||
WriteState::Start => todo!(),
|
||||
WriteState::BalanceStart => todo!(),
|
||||
@@ -1114,10 +1106,14 @@ impl BTreeCursor {
|
||||
let current_page = self.stack.top();
|
||||
debug!("balance_non_root(page={})", current_page.get().id);
|
||||
|
||||
let current_page_inner = current_page.get();
|
||||
let current_page_contents = &mut current_page_inner.contents;
|
||||
let current_page_contents = current_page_contents.as_mut().unwrap();
|
||||
// Copy of page used to reference cell bytes.
|
||||
// This needs to be saved somewhere safe so taht references still point to here,
|
||||
// This needs to be saved somewhere safe so that references still point to here,
|
||||
// this will be store in write_info below
|
||||
let page_copy = current_page.get().contents.as_ref().unwrap().clone();
|
||||
let page_copy = current_page_contents.clone();
|
||||
current_page_contents.overflow_cells.clear();
|
||||
|
||||
// In memory in order copy of all cells in pages we want to balance. For now let's do a 2 page split.
|
||||
// Right pointer in interior cells should be converted to regular cells if more than 2 pages are used for balancing.
|
||||
@@ -1125,47 +1121,77 @@ impl BTreeCursor {
|
||||
let mut scratch_cells = write_info.scratch_cells.borrow_mut();
|
||||
scratch_cells.clear();
|
||||
|
||||
let usable_space = self.usable_space();
|
||||
for cell_idx in 0..page_copy.cell_count() {
|
||||
let (start, len) = page_copy.cell_get_raw_region(
|
||||
cell_idx,
|
||||
self.payload_overflow_threshold_max(page_copy.page_type()),
|
||||
self.payload_overflow_threshold_min(page_copy.page_type()),
|
||||
self.usable_space(),
|
||||
usable_space,
|
||||
);
|
||||
let buf = page_copy.as_ptr();
|
||||
scratch_cells.push(to_static_buf(&buf[start..start + len]));
|
||||
let cell_buffer = to_static_buf(&page_copy.as_ptr()[start..start + len]);
|
||||
scratch_cells.push(cell_buffer);
|
||||
}
|
||||
for overflow_cell in &page_copy.overflow_cells {
|
||||
scratch_cells
|
||||
.insert(overflow_cell.index, to_static_buf(&overflow_cell.payload));
|
||||
// overflow_cells are stored in order - so we need to insert them in reverse order
|
||||
for cell in page_copy.overflow_cells.iter().rev() {
|
||||
scratch_cells.insert(cell.index, to_static_buf(&cell.payload));
|
||||
}
|
||||
|
||||
// amount of cells for pages involved in split
|
||||
// the algorithm accumulate cells in greedy manner with 2 conditions for split:
|
||||
// 1. new cell will overflow single page (accumulated + new > usable_space - header_size)
|
||||
// 2. accumulated size already reach >50% of content_usable_size
|
||||
// second condition is necessary, otherwise in case of small cells we will create a lot of almost empty pages
|
||||
//
|
||||
// if we have single overflow cell in a table leaf node - we still can have 3 split pages
|
||||
//
|
||||
// for example, if current page has 4 entries with size ~1/4 page size, and new cell has size ~page size
|
||||
// then we will need 3 pages to distribute cells between them
|
||||
let split_pages_cells_count = &mut write_info.split_pages_cells_count.borrow_mut();
|
||||
split_pages_cells_count.clear();
|
||||
let mut last_page_cells_count = 0;
|
||||
let mut last_page_cells_size = 0;
|
||||
let content_usable_space = usable_space - page_copy.header_size();
|
||||
for scratch_cell in scratch_cells.iter() {
|
||||
let cell_size = scratch_cell.len() + 2; // + cell pointer size (u16)
|
||||
if last_page_cells_size + cell_size > content_usable_space
|
||||
|| 2 * last_page_cells_size > content_usable_space
|
||||
{
|
||||
split_pages_cells_count.push(last_page_cells_count);
|
||||
last_page_cells_count = 0;
|
||||
last_page_cells_size = 0;
|
||||
}
|
||||
last_page_cells_count += 1;
|
||||
last_page_cells_size += cell_size;
|
||||
assert!(last_page_cells_size <= content_usable_space);
|
||||
}
|
||||
split_pages_cells_count.push(last_page_cells_count);
|
||||
let new_pages_count = split_pages_cells_count.len();
|
||||
|
||||
debug!(
|
||||
"splitting left={} new_pages={}, cells_count={:?}",
|
||||
current_page.get().id,
|
||||
new_pages_count - 1,
|
||||
split_pages_cells_count
|
||||
);
|
||||
|
||||
*write_info.rightmost_pointer.borrow_mut() = page_copy.rightmost_pointer();
|
||||
write_info.page_copy.replace(Some(page_copy));
|
||||
|
||||
// allocate new pages and move cells to those new pages
|
||||
// split procedure
|
||||
let page = current_page.get().contents.as_mut().unwrap();
|
||||
let page_type = page.page_type();
|
||||
assert!(
|
||||
matches!(
|
||||
page.page_type(),
|
||||
PageType::TableLeaf | PageType::TableInterior
|
||||
),
|
||||
"indexes still not supported "
|
||||
matches!(page_type, PageType::TableLeaf | PageType::TableInterior),
|
||||
"indexes still not supported"
|
||||
);
|
||||
|
||||
let right_page = self.allocate_page(page.page_type(), 0);
|
||||
let right_page_id = right_page.get().id;
|
||||
|
||||
write_info.new_pages.borrow_mut().clear();
|
||||
write_info.new_pages.borrow_mut().push(current_page.clone());
|
||||
write_info.new_pages.borrow_mut().push(right_page.clone());
|
||||
|
||||
debug!(
|
||||
"splitting left={} right={}",
|
||||
current_page.get().id,
|
||||
right_page_id
|
||||
);
|
||||
write_info.split_pages.borrow_mut().clear();
|
||||
write_info.split_pages.borrow_mut().push(current_page);
|
||||
// allocate new pages
|
||||
for _ in 1..new_pages_count {
|
||||
let new_page = self.allocate_page(page_type, 0);
|
||||
write_info.split_pages.borrow_mut().push(new_page);
|
||||
}
|
||||
|
||||
(WriteState::BalanceGetParentPage, Ok(CursorResult::Ok(())))
|
||||
}
|
||||
@@ -1211,7 +1237,7 @@ impl BTreeCursor {
|
||||
BTreeCell::TableInteriorCell(interior) => {
|
||||
interior._left_child_page as usize == current_idx
|
||||
}
|
||||
_ => unreachable!("Parent should always be a "),
|
||||
_ => unreachable!("Parent should always be an interior page"),
|
||||
};
|
||||
if found {
|
||||
let (start, _len) = parent_contents.cell_get_raw_region(
|
||||
@@ -1226,23 +1252,21 @@ impl BTreeCursor {
|
||||
}
|
||||
|
||||
let write_info = self.state.write_info().unwrap();
|
||||
let mut new_pages = write_info.new_pages.borrow_mut();
|
||||
let mut split_pages = write_info.split_pages.borrow_mut();
|
||||
let split_pages_len = split_pages.len();
|
||||
let scratch_cells = write_info.scratch_cells.borrow();
|
||||
|
||||
// reset pages
|
||||
for page in new_pages.iter() {
|
||||
for page in split_pages.iter() {
|
||||
assert!(page.is_dirty());
|
||||
let contents = page.get().contents.as_mut().unwrap();
|
||||
|
||||
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
|
||||
contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
|
||||
|
||||
let db_header = RefCell::borrow(&self.pager.db_header);
|
||||
let cell_content_area_start =
|
||||
db_header.page_size - db_header.reserved_space as u16;
|
||||
contents.write_u16(
|
||||
PAGE_HEADER_OFFSET_CELL_CONTENT_AREA,
|
||||
cell_content_area_start,
|
||||
self.usable_space() as u16,
|
||||
);
|
||||
|
||||
contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0);
|
||||
@@ -1251,29 +1275,17 @@ impl BTreeCursor {
|
||||
}
|
||||
}
|
||||
|
||||
// distribute cells
|
||||
let new_pages_len = new_pages.len();
|
||||
let cells_per_page = scratch_cells.len() / new_pages.len();
|
||||
let mut current_cell_index = 0_usize;
|
||||
let mut divider_cells_index = Vec::new(); /* index to scratch cells that will be used as dividers in order */
|
||||
/* index to scratch cells that will be used as dividers in order */
|
||||
let mut divider_cells_index = Vec::with_capacity(split_pages.len());
|
||||
|
||||
debug!(
|
||||
"balance_leaf::distribute(cells={}, cells_per_page={})",
|
||||
scratch_cells.len(),
|
||||
cells_per_page
|
||||
);
|
||||
debug!("balance_leaf::distribute(cells={})", scratch_cells.len());
|
||||
|
||||
for (i, page) in new_pages.iter_mut().enumerate() {
|
||||
for (i, page) in split_pages.iter_mut().enumerate() {
|
||||
let page_id = page.get().id;
|
||||
let contents = page.get().contents.as_mut().unwrap();
|
||||
|
||||
let last_page = i == new_pages_len - 1;
|
||||
let cells_to_copy = if last_page {
|
||||
// last cells is remaining pages if division was odd
|
||||
scratch_cells.len() - current_cell_index
|
||||
} else {
|
||||
cells_per_page
|
||||
};
|
||||
let cells_to_copy = write_info.split_pages_cells_count.borrow()[i];
|
||||
debug!(
|
||||
"balance_leaf::distribute(page={}, cells_to_copy={})",
|
||||
page_id, cells_to_copy
|
||||
@@ -1289,6 +1301,7 @@ impl BTreeCursor {
|
||||
divider_cells_index.push(current_cell_index + cells_to_copy - 1);
|
||||
current_cell_index += cells_to_copy;
|
||||
}
|
||||
|
||||
let is_leaf = {
|
||||
let page = self.stack.top();
|
||||
let page = page.get().contents.as_ref().unwrap();
|
||||
@@ -1297,10 +1310,10 @@ impl BTreeCursor {
|
||||
|
||||
// update rightmost pointer for each page if we are in interior page
|
||||
if !is_leaf {
|
||||
for page in new_pages.iter_mut().take(new_pages_len - 1) {
|
||||
for page in split_pages.iter_mut().take(split_pages_len - 1) {
|
||||
let contents = page.get().contents.as_mut().unwrap();
|
||||
|
||||
assert_eq!(contents.cell_count(), 1);
|
||||
assert!(contents.cell_count() >= 1);
|
||||
let last_cell = contents.cell_get(
|
||||
contents.cell_count() - 1,
|
||||
self.pager.clone(),
|
||||
@@ -1316,7 +1329,7 @@ impl BTreeCursor {
|
||||
contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, last_cell_pointer);
|
||||
}
|
||||
// last page right most pointer points to previous right most pointer before splitting
|
||||
let last_page = new_pages.last().unwrap();
|
||||
let last_page = split_pages.last().unwrap();
|
||||
let last_page_contents = last_page.get().contents.as_mut().unwrap();
|
||||
last_page_contents.write_u32(
|
||||
PAGE_HEADER_OFFSET_RIGHTMOST_PTR,
|
||||
@@ -1327,7 +1340,7 @@ impl BTreeCursor {
|
||||
// insert dividers in parent
|
||||
// we can consider dividers the first cell of each page starting from the second page
|
||||
for (page_id_index, page) in
|
||||
new_pages.iter_mut().take(new_pages_len - 1).enumerate()
|
||||
split_pages.iter_mut().take(split_pages_len - 1).enumerate()
|
||||
{
|
||||
let contents = page.get().contents.as_mut().unwrap();
|
||||
let divider_cell_index = divider_cells_index[page_id_index];
|
||||
@@ -1342,38 +1355,23 @@ impl BTreeCursor {
|
||||
self.usable_space(),
|
||||
)?;
|
||||
|
||||
if is_leaf {
|
||||
// create a new divider cell and push
|
||||
let key = match cell {
|
||||
BTreeCell::TableLeafCell(leaf) => leaf._rowid,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let mut divider_cell = Vec::new();
|
||||
divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes());
|
||||
divider_cell.extend(std::iter::repeat(0).take(9));
|
||||
let n = write_varint(&mut divider_cell.as_mut_slice()[4..], key);
|
||||
divider_cell.truncate(4 + n);
|
||||
let parent_cell_idx = self.find_cell(parent_contents, key);
|
||||
self.insert_into_cell(
|
||||
parent_contents,
|
||||
divider_cell.as_slice(),
|
||||
parent_cell_idx,
|
||||
);
|
||||
} else {
|
||||
// move cell
|
||||
let key = match cell {
|
||||
BTreeCell::TableInteriorCell(interior) => interior._rowid,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let parent_cell_idx = self.find_cell(contents, key);
|
||||
self.insert_into_cell(parent_contents, cell_payload, parent_cell_idx);
|
||||
// self.drop_cell(*page, 0);
|
||||
}
|
||||
let key = match cell {
|
||||
BTreeCell::TableLeafCell(TableLeafCell { _rowid, .. })
|
||||
| BTreeCell::TableInteriorCell(TableInteriorCell { _rowid, .. }) => _rowid,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
let mut divider_cell = Vec::with_capacity(4 + 9); // 4 - page id, 9 - max length of varint
|
||||
divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes());
|
||||
write_varint_to_vec(key, &mut divider_cell);
|
||||
|
||||
let parent_cell_idx = self.find_cell(parent_contents, key);
|
||||
self.insert_into_cell(parent_contents, &divider_cell, parent_cell_idx);
|
||||
}
|
||||
|
||||
{
|
||||
// copy last page id to right pointer
|
||||
let last_pointer = new_pages.last().unwrap().get().id as u32;
|
||||
let last_pointer = split_pages.last().unwrap().get().id as u32;
|
||||
parent_contents.write_u32(right_pointer, last_pointer);
|
||||
}
|
||||
self.stack.pop();
|
||||
@@ -1405,7 +1403,6 @@ impl BTreeCursor {
|
||||
let current_root = self.stack.top();
|
||||
let current_root_contents = current_root.get().contents.as_ref().unwrap();
|
||||
|
||||
let new_root_page_id = new_root_page.get().id;
|
||||
let new_root_page_contents = new_root_page.get().contents.as_mut().unwrap();
|
||||
if is_page_1 {
|
||||
// Copy header
|
||||
@@ -1415,8 +1412,10 @@ impl BTreeCursor {
|
||||
.copy_from_slice(&current_root_buf[0..DATABASE_HEADER_SIZE]);
|
||||
}
|
||||
// point new root right child to previous root
|
||||
new_root_page_contents
|
||||
.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, new_root_page_id as u32);
|
||||
new_root_page_contents.write_u32(
|
||||
PAGE_HEADER_OFFSET_RIGHTMOST_PTR,
|
||||
current_root.get().id as u32,
|
||||
);
|
||||
new_root_page_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
|
||||
}
|
||||
|
||||
@@ -1607,6 +1606,8 @@ impl BTreeCursor {
|
||||
page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cbrk as u16);
|
||||
// set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start
|
||||
page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
|
||||
// set fragmented bytes counter to zero
|
||||
page.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0);
|
||||
// set unused space to 0
|
||||
let first_cell = cloned_page.cell_content_area() as u64;
|
||||
assert!(first_cell <= cbrk);
|
||||
@@ -2362,6 +2363,10 @@ fn to_static_buf(buf: &[u8]) -> &'static [u8] {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand_chacha::rand_core::RngCore;
|
||||
use rand_chacha::rand_core::SeedableRng;
|
||||
use rand_chacha::ChaCha8Rng;
|
||||
|
||||
use super::*;
|
||||
use crate::io::{Buffer, Completion, MemoryIO, OpenFlags, IO};
|
||||
use crate::storage::database::FileStorage;
|
||||
@@ -2371,6 +2376,312 @@ mod tests {
|
||||
use std::cell::RefCell;
|
||||
use std::sync::Arc;
|
||||
|
||||
fn validate_btree(pager: Rc<Pager>, page_idx: usize) -> (usize, bool) {
|
||||
let cursor = BTreeCursor::new(pager.clone(), page_idx);
|
||||
let page = pager.read_page(page_idx).unwrap();
|
||||
let page = page.get();
|
||||
let contents = page.contents.as_ref().unwrap();
|
||||
let page_type = contents.page_type();
|
||||
let mut previous_key = None;
|
||||
let mut valid = true;
|
||||
let mut depth = None;
|
||||
for cell_idx in 0..contents.cell_count() {
|
||||
let cell = contents
|
||||
.cell_get(
|
||||
cell_idx,
|
||||
pager.clone(),
|
||||
cursor.payload_overflow_threshold_max(page_type),
|
||||
cursor.payload_overflow_threshold_min(page_type),
|
||||
cursor.usable_space(),
|
||||
)
|
||||
.unwrap();
|
||||
let current_depth = match cell {
|
||||
BTreeCell::TableLeafCell(..) => 1,
|
||||
BTreeCell::TableInteriorCell(TableInteriorCell {
|
||||
_left_child_page, ..
|
||||
}) => {
|
||||
let (child_depth, child_valid) =
|
||||
validate_btree(pager.clone(), _left_child_page as usize);
|
||||
valid &= child_valid;
|
||||
child_depth
|
||||
}
|
||||
_ => panic!("unsupported btree cell: {:?}", cell),
|
||||
};
|
||||
depth = Some(depth.unwrap_or(current_depth + 1));
|
||||
if depth != Some(current_depth + 1) {
|
||||
log::error!("depth is different for child of page {}", page_idx);
|
||||
valid = false;
|
||||
}
|
||||
match cell {
|
||||
BTreeCell::TableInteriorCell(TableInteriorCell { _rowid, .. })
|
||||
| BTreeCell::TableLeafCell(TableLeafCell { _rowid, .. }) => {
|
||||
if previous_key.is_some() && previous_key.unwrap() >= _rowid {
|
||||
log::error!(
|
||||
"keys are in bad order: prev={:?}, current={}",
|
||||
previous_key,
|
||||
_rowid
|
||||
);
|
||||
valid = false;
|
||||
}
|
||||
previous_key = Some(_rowid);
|
||||
}
|
||||
_ => panic!("unsupported btree cell: {:?}", cell),
|
||||
}
|
||||
}
|
||||
if let Some(right) = contents.rightmost_pointer() {
|
||||
let (right_depth, right_valid) = validate_btree(pager.clone(), right as usize);
|
||||
valid &= right_valid;
|
||||
depth = Some(depth.unwrap_or(right_depth + 1));
|
||||
if depth != Some(right_depth + 1) {
|
||||
log::error!("depth is different for child of page {}", page_idx);
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
(depth.unwrap(), valid)
|
||||
}
|
||||
|
||||
fn format_btree(pager: Rc<Pager>, page_idx: usize, depth: usize) -> String {
|
||||
let cursor = BTreeCursor::new(pager.clone(), page_idx);
|
||||
let page = pager.read_page(page_idx).unwrap();
|
||||
let page = page.get();
|
||||
let contents = page.contents.as_ref().unwrap();
|
||||
let page_type = contents.page_type();
|
||||
let mut current = Vec::new();
|
||||
let mut child = Vec::new();
|
||||
for cell_idx in 0..contents.cell_count() {
|
||||
let cell = contents
|
||||
.cell_get(
|
||||
cell_idx,
|
||||
pager.clone(),
|
||||
cursor.payload_overflow_threshold_max(page_type),
|
||||
cursor.payload_overflow_threshold_min(page_type),
|
||||
cursor.usable_space(),
|
||||
)
|
||||
.unwrap();
|
||||
match cell {
|
||||
BTreeCell::TableInteriorCell(cell) => {
|
||||
current.push(format!(
|
||||
"node[rowid:{}, ptr(<=):{}]",
|
||||
cell._rowid, cell._left_child_page
|
||||
));
|
||||
child.push(format_btree(
|
||||
pager.clone(),
|
||||
cell._left_child_page as usize,
|
||||
depth + 2,
|
||||
));
|
||||
}
|
||||
BTreeCell::TableLeafCell(cell) => {
|
||||
current.push(format!(
|
||||
"leaf[rowid:{}, len(payload):{}, overflow:{}]",
|
||||
cell._rowid,
|
||||
cell._payload.len(),
|
||||
cell.first_overflow_page.is_some()
|
||||
));
|
||||
}
|
||||
_ => panic!("unsupported btree cell: {:?}", cell),
|
||||
}
|
||||
}
|
||||
if let Some(rightmost) = contents.rightmost_pointer() {
|
||||
child.push(format_btree(pager.clone(), rightmost as usize, depth + 2));
|
||||
}
|
||||
let current = format!(
|
||||
"{}-page:{}, ptr(right):{}\n{}+cells:{}",
|
||||
" ".repeat(depth),
|
||||
page_idx,
|
||||
contents.rightmost_pointer().unwrap_or(0),
|
||||
" ".repeat(depth),
|
||||
current.join(", ")
|
||||
);
|
||||
if child.is_empty() {
|
||||
current
|
||||
} else {
|
||||
current + "\n" + &child.join("\n")
|
||||
}
|
||||
}
|
||||
|
||||
fn empty_btree() -> (Rc<Pager>, usize) {
|
||||
let db_header = DatabaseHeader::default();
|
||||
let page_size = db_header.page_size as usize;
|
||||
|
||||
#[allow(clippy::arc_with_non_send_sync)]
|
||||
let io: Arc<dyn IO> = Arc::new(MemoryIO::new().unwrap());
|
||||
let io_file = io.open_file("test.db", OpenFlags::Create, false).unwrap();
|
||||
let page_io = Rc::new(FileStorage::new(io_file));
|
||||
|
||||
let buffer_pool = Rc::new(BufferPool::new(db_header.page_size as usize));
|
||||
let wal_shared = WalFileShared::open_shared(&io, "test.wal", db_header.page_size).unwrap();
|
||||
let wal_file = WalFile::new(io.clone(), page_size, wal_shared, buffer_pool.clone());
|
||||
let wal = Rc::new(RefCell::new(wal_file));
|
||||
|
||||
let page_cache = Arc::new(parking_lot::RwLock::new(DumbLruPageCache::new(10)));
|
||||
let pager = {
|
||||
let db_header = Rc::new(RefCell::new(db_header.clone()));
|
||||
Pager::finish_open(db_header, page_io, wal, io, page_cache, buffer_pool).unwrap()
|
||||
};
|
||||
let pager = Rc::new(pager);
|
||||
let page1 = pager.allocate_page().unwrap();
|
||||
btree_init_page(&page1, PageType::TableLeaf, &db_header, 0);
|
||||
(pager, page1.get().id)
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn btree_insert_fuzz_ex() {
|
||||
for sequence in [
|
||||
&[
|
||||
(777548915, 3364),
|
||||
(639157228, 3796),
|
||||
(709175417, 1214),
|
||||
(390824637, 210),
|
||||
(906124785, 1481),
|
||||
(197677875, 1305),
|
||||
(457946262, 3734),
|
||||
(956825466, 592),
|
||||
(835875722, 1334),
|
||||
(649214013, 1250),
|
||||
(531143011, 1788),
|
||||
(765057993, 2351),
|
||||
(510007766, 1349),
|
||||
(884516059, 822),
|
||||
(81604840, 2545),
|
||||
]
|
||||
.as_slice(),
|
||||
&[
|
||||
(293471650, 2452),
|
||||
(163608869, 627),
|
||||
(544576229, 464),
|
||||
(705823748, 3441),
|
||||
]
|
||||
.as_slice(),
|
||||
&[
|
||||
(987283511, 2924),
|
||||
(261851260, 1766),
|
||||
(343847101, 1657),
|
||||
(315844794, 572),
|
||||
]
|
||||
.as_slice(),
|
||||
&[
|
||||
(987283511, 2924),
|
||||
(261851260, 1766),
|
||||
(343847101, 1657),
|
||||
(315844794, 572),
|
||||
(649272840, 1632),
|
||||
(723398505, 3140),
|
||||
(334416967, 3874),
|
||||
]
|
||||
.as_slice(),
|
||||
] {
|
||||
let (pager, root_page) = empty_btree();
|
||||
let mut cursor = BTreeCursor::new(pager.clone(), root_page);
|
||||
for (key, size) in sequence.iter() {
|
||||
let key = OwnedValue::Integer(*key);
|
||||
let value = Record::new(vec![OwnedValue::Blob(Rc::new(vec![0; *size]))]);
|
||||
log::info!("insert key:{}", key);
|
||||
cursor.insert(&key, &value, false).unwrap();
|
||||
log::info!(
|
||||
"=========== btree ===========\n{}\n\n",
|
||||
format_btree(pager.clone(), root_page, 0)
|
||||
);
|
||||
}
|
||||
for (key, _) in sequence.iter() {
|
||||
let seek_key = SeekKey::TableRowId(*key as u64);
|
||||
assert!(
|
||||
matches!(
|
||||
cursor.seek(seek_key, SeekOp::EQ).unwrap(),
|
||||
CursorResult::Ok(true)
|
||||
),
|
||||
"key {} is not found",
|
||||
key
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn rng_from_time() -> (ChaCha8Rng, u64) {
|
||||
let seed = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
let rng = ChaCha8Rng::seed_from_u64(seed);
|
||||
(rng, seed)
|
||||
}
|
||||
|
||||
fn btree_insert_fuzz_run(
|
||||
attempts: usize,
|
||||
inserts: usize,
|
||||
size: impl Fn(&mut ChaCha8Rng) -> usize,
|
||||
) {
|
||||
let (mut rng, seed) = rng_from_time();
|
||||
log::info!("super seed: {}", seed);
|
||||
for _ in 0..attempts {
|
||||
let (pager, root_page) = empty_btree();
|
||||
let mut cursor = BTreeCursor::new(pager.clone(), root_page);
|
||||
let mut keys = Vec::new();
|
||||
let seed = rng.next_u64();
|
||||
log::info!("seed: {}", seed);
|
||||
let mut rng = ChaCha8Rng::seed_from_u64(seed);
|
||||
for insert_id in 0..inserts {
|
||||
let size = size(&mut rng);
|
||||
let key = (rng.next_u64() % (1 << 30)) as i64;
|
||||
keys.push(key);
|
||||
log::info!(
|
||||
"INSERT INTO t VALUES ({}, randomblob({})); -- {}",
|
||||
key,
|
||||
size,
|
||||
insert_id
|
||||
);
|
||||
let key = OwnedValue::Integer(key);
|
||||
let value = Record::new(vec![OwnedValue::Blob(Rc::new(vec![0; size]))]);
|
||||
cursor.insert(&key, &value, false).unwrap();
|
||||
}
|
||||
log::info!(
|
||||
"=========== btree ===========\n{}\n\n",
|
||||
format_btree(pager.clone(), root_page, 0)
|
||||
);
|
||||
if matches!(validate_btree(pager.clone(), root_page), (_, false)) {
|
||||
panic!("invalid btree");
|
||||
}
|
||||
for key in keys.iter() {
|
||||
let seek_key = SeekKey::TableRowId(*key as u64);
|
||||
assert!(
|
||||
matches!(
|
||||
cursor.seek(seek_key, SeekOp::EQ).unwrap(),
|
||||
CursorResult::Ok(true)
|
||||
),
|
||||
"key {} is not found",
|
||||
key
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn btree_insert_fuzz_run_equal_size() {
|
||||
for size in 1..8 {
|
||||
log::info!("======= size:{} =======", size);
|
||||
btree_insert_fuzz_run(2, 1024, |_| size);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn btree_insert_fuzz_run_random() {
|
||||
btree_insert_fuzz_run(128, 16, |rng| (rng.next_u32() % 4096) as usize);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn btree_insert_fuzz_run_small() {
|
||||
btree_insert_fuzz_run(1, 1024, |rng| (rng.next_u32() % 128) as usize);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn btree_insert_fuzz_run_big() {
|
||||
btree_insert_fuzz_run(64, 32, |rng| 3 * 1024 + (rng.next_u32() % 1024) as usize);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn btree_insert_fuzz_run_overflow() {
|
||||
btree_insert_fuzz_run(64, 32, |rng| (rng.next_u32() % 32 * 1024) as usize);
|
||||
}
|
||||
|
||||
#[allow(clippy::arc_with_non_send_sync)]
|
||||
fn setup_test_env(database_size: u32) -> (Rc<Pager>, Rc<RefCell<DatabaseHeader>>) {
|
||||
let page_size = 512;
|
||||
|
||||
@@ -362,7 +362,7 @@ pub fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) {
|
||||
}
|
||||
|
||||
#[repr(u8)]
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
#[derive(Debug, PartialEq, Clone, Copy)]
|
||||
pub enum PageType {
|
||||
IndexInterior = 2,
|
||||
TableInterior = 5,
|
||||
@@ -1129,11 +1129,9 @@ pub fn write_varint(buf: &mut [u8], value: u64) -> usize {
|
||||
}
|
||||
|
||||
pub fn write_varint_to_vec(value: u64, payload: &mut Vec<u8>) {
|
||||
let mut varint: Vec<u8> = vec![0; 9];
|
||||
let n = write_varint(&mut varint.as_mut_slice()[0..9], value);
|
||||
write_varint(&mut varint, value);
|
||||
varint.truncate(n);
|
||||
payload.extend_from_slice(&varint);
|
||||
let mut varint = [0u8; 9];
|
||||
let n = write_varint(&mut varint, value);
|
||||
payload.extend_from_slice(&varint[0..n]);
|
||||
}
|
||||
|
||||
pub fn begin_read_wal_header(io: &Rc<dyn File>) -> Result<Arc<RwLock<WalHeader>>> {
|
||||
|
||||
@@ -727,6 +727,7 @@ impl Cursor {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum CursorResult<T> {
|
||||
Ok(T),
|
||||
IO,
|
||||
|
||||
Reference in New Issue
Block a user