Merge 'remove io.blocks from btree balancing code' from Nikita Sivukhin

This PR removes `io.block` usage from B-Tree balancing code (similarly
as in the #3179)

Reviewed-by: Preston Thorpe <preston@turso.tech>

Closes #3194
This commit is contained in:
Preston Thorpe
2025-09-18 07:24:51 -04:00
committed by GitHub

View File

@@ -206,10 +206,35 @@ pub enum OverwriteCellState {
},
}
#[derive(Debug, PartialEq)]
struct BalanceContext {
pages_to_balance_new: [Option<Arc<Page>>; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE],
sibling_count_new: usize,
cell_array: CellArray,
old_cell_count_per_page_cumulative: [u16; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE],
#[cfg(debug_assertions)]
cells_debug: Vec<Vec<u8>>,
}
impl std::fmt::Debug for BalanceContext {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("BalanceContext")
.field("pages_to_balance_new", &self.pages_to_balance_new)
.field("sibling_count_new", &self.sibling_count_new)
.field("cell_array", &self.cell_array)
.field(
"old_cell_count_per_page_cumulative",
&self.old_cell_count_per_page_cumulative,
)
.finish()
}
}
#[derive(Debug)]
/// State machine of a btree rebalancing operation.
enum BalanceSubState {
Start,
BalanceRoot,
Decide,
Quick,
/// Choose which sibling pages to balance (max 3).
/// Generally, the siblings involved will be the page that triggered the balancing and its left and right siblings.
@@ -220,6 +245,13 @@ enum BalanceSubState {
/// Perform the actual balancing. This will result in 1-5 pages depending on the number of total cells to be distributed
/// from the source pages.
NonRootDoBalancing,
NonRootDoBalancingAllocate {
i: usize,
context: Option<BalanceContext>,
},
NonRootDoBalancingFinish {
context: BalanceContext,
},
/// Free pages that are not used anymore after balancing.
FreePages {
curr_page: usize,
@@ -2345,7 +2377,7 @@ impl BTreeCursor {
let overflows = !page.get_contents().overflow_cells.is_empty();
if overflows {
*write_state = WriteState::Balancing;
assert!(self.balance_state.sub_state == BalanceSubState::Start, "There should be no balancing operation in progress when insert state is {:?}, got: {:?}", self.state, self.balance_state.sub_state);
assert!(matches!(self.balance_state.sub_state, BalanceSubState::Start), "There should be no balancing operation in progress when insert state is {:?}, got: {:?}", self.state, self.balance_state.sub_state);
// If we balance, we must save the cursor position and seek to it later.
self.save_context(CursorContext::seek_eq_only(bkey));
} else {
@@ -2388,7 +2420,7 @@ impl BTreeCursor {
};
if overflows || underflows {
*write_state = WriteState::Balancing;
assert!(self.balance_state.sub_state == BalanceSubState::Start, "There should be no balancing operation in progress when overwrite state is {:?}, got: {:?}", self.state, self.balance_state.sub_state);
assert!(matches!(self.balance_state.sub_state, BalanceSubState::Start), "There should be no balancing operation in progress when overwrite state is {:?}, got: {:?}", self.state, self.balance_state.sub_state);
// If we balance, we must save the cursor position and seek to it later.
self.save_context(CursorContext::seek_eq_only(bkey));
} else {
@@ -2473,11 +2505,19 @@ impl BTreeCursor {
return Ok(IOResult::Done(()));
}
}
if !self.stack.has_parent() {
let _res = self.balance_root()?;
*sub_state = BalanceSubState::BalanceRoot;
} else {
*sub_state = BalanceSubState::Decide;
}
}
BalanceSubState::BalanceRoot => {
return_if_io!(self.balance_root());
let BalanceState { sub_state, .. } = &mut self.balance_state;
*sub_state = BalanceSubState::Decide;
}
BalanceSubState::Decide => {
let cur_page = self.stack.top_ref();
let cur_page_contents = cur_page.get_contents();
@@ -2522,6 +2562,8 @@ impl BTreeCursor {
}
BalanceSubState::NonRootPickSiblings
| BalanceSubState::NonRootDoBalancing
| BalanceSubState::NonRootDoBalancingAllocate { .. }
| BalanceSubState::NonRootDoBalancingFinish { .. }
| BalanceSubState::FreePages { .. } => {
return_if_io!(self.balance_non_root());
}
@@ -2623,7 +2665,10 @@ impl BTreeCursor {
tracing::debug!(?sub_state);
match sub_state {
BalanceSubState::Start | BalanceSubState::Quick => {
BalanceSubState::Start
| BalanceSubState::BalanceRoot
| BalanceSubState::Decide
| BalanceSubState::Quick => {
panic!("balance_non_root: unexpected state {sub_state:?}")
}
BalanceSubState::NonRootPickSiblings => {
@@ -2880,13 +2925,12 @@ impl BTreeCursor {
// Start balancing.
let parent_page = self.stack.top_ref();
let parent_contents = parent_page.get_contents();
let parent_is_root = !self.stack.has_parent();
// 1. Collect cell data from divider cells, and count the total number of cells to be distributed.
// The count includes: all cells and overflow cells from the sibling pages, and divider cells from the parent page,
// excluding the rightmost divider, which will not be dropped from the parent; instead it will be updated at the end.
let mut total_cells_to_redistribute = 0;
let mut pages_to_balance_new: [Option<Arc<Page>>;
let pages_to_balance_new: [Option<Arc<Page>>;
MAX_NEW_SIBLING_PAGES_AFTER_BALANCE] =
[const { None }; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE];
for i in (0..balance_info.sibling_count).rev() {
@@ -3335,31 +3379,84 @@ impl BTreeCursor {
);
}
*sub_state = BalanceSubState::NonRootDoBalancingAllocate {
i: 0,
context: Some(BalanceContext {
pages_to_balance_new,
sibling_count_new,
cell_array,
old_cell_count_per_page_cumulative,
#[cfg(debug_assertions)]
cells_debug,
}),
};
}
BalanceSubState::NonRootDoBalancingAllocate { i, context } => {
let BalanceContext {
pages_to_balance_new,
old_cell_count_per_page_cumulative,
cell_array,
sibling_count_new,
..
} = context.as_mut().unwrap();
let pager = self.pager.clone();
let mut balance_info = balance_info.borrow_mut();
let balance_info = balance_info.as_mut().unwrap();
let page_type = balance_info.pages_to_balance[0]
.as_ref()
.unwrap()
.get_contents()
.page_type();
// Allocate pages or set dirty if not needed
for i in 0..sibling_count_new {
if i < balance_info.sibling_count {
let page = balance_info.pages_to_balance[i].as_ref().unwrap();
turso_assert!(
page.is_dirty(),
"sibling page must be already marked dirty"
);
pages_to_balance_new[i].replace(page.clone());
} else {
// FIXME: handle page cache is full
// FIXME: add new state machine state instead of this sync IO hack
let page = pager.io.block(|| {
pager.do_allocate_page(page_type, 0, BtreePageAllocMode::Any)
})?;
pages_to_balance_new[i].replace(page);
// Since this page didn't exist before, we can set it to cells length as it
// marks them as empty since it is a prefix sum of cells.
old_cell_count_per_page_cumulative[i] =
cell_array.cell_payloads.len() as u16;
}
if *i < balance_info.sibling_count {
let page = balance_info.pages_to_balance[*i].as_ref().unwrap();
turso_assert!(page.is_dirty(), "sibling page must be already marked dirty");
pages_to_balance_new[*i].replace(page.clone());
} else {
// FIXME: handle page cache is full
let page = return_if_io!(pager.do_allocate_page(
page_type,
0,
BtreePageAllocMode::Any
));
pages_to_balance_new[*i].replace(page);
// Since this page didn't exist before, we can set it to cells length as it
// marks them as empty since it is a prefix sum of cells.
old_cell_count_per_page_cumulative[*i] =
cell_array.cell_payloads.len() as u16;
}
if *i + 1 < *sibling_count_new {
*i += 1;
continue;
} else {
*sub_state = BalanceSubState::NonRootDoBalancingFinish {
context: context.take().unwrap(),
};
}
}
BalanceSubState::NonRootDoBalancingFinish {
context:
BalanceContext {
pages_to_balance_new,
sibling_count_new,
cell_array,
old_cell_count_per_page_cumulative,
#[cfg(debug_assertions)]
cells_debug,
},
} => {
let mut balance_info = balance_info.borrow_mut();
let balance_info = balance_info.as_mut().unwrap();
let page_type = balance_info.pages_to_balance[0]
.as_ref()
.unwrap()
.get_contents()
.page_type();
let parent_is_root = !self.stack.has_parent();
let parent_page = self.stack.top_ref();
let parent_contents = parent_page.get_contents();
let mut sibling_count_new = *sibling_count_new;
let is_table_leaf = matches!(page_type, PageType::TableLeaf);
// Reassign page numbers in increasing order
{
let mut page_numbers: [usize; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE] =
@@ -3652,7 +3749,7 @@ impl BTreeCursor {
start_old_cells,
start_new_cells,
number_new_cells,
&cell_array,
cell_array,
usable_space,
)?;
debug_validate_cells!(page_contents, usable_space);
@@ -3831,10 +3928,10 @@ impl BTreeCursor {
parent_page: &PageRef,
balance_info: &BalanceInfo,
parent_contents: &mut PageContent,
pages_to_balance_new: [Option<PageRef>; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE],
pages_to_balance_new: &[Option<PageRef>; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE],
page_type: PageType,
is_table_leaf: bool,
mut cells_debug: Vec<Vec<u8>>,
cells_debug: &mut [Vec<u8>],
sibling_count_new: usize,
right_page_id: u32,
usable_space: usize,
@@ -4228,15 +4325,16 @@ impl BTreeCursor {
let _ = self.move_to_right_state.1.take();
let root = self.stack.top();
let is_page_1 = root.get().id == 1;
let offset = if is_page_1 { DatabaseHeader::SIZE } else { 0 };
let root_contents = root.get_contents();
// FIXME: handle page cache is full
// FIXME: remove sync IO hack
let child = self.pager.io.block(|| {
self.pager
.do_allocate_page(root_contents.page_type(), 0, BtreePageAllocMode::Any)
})?;
let child = return_if_io!(self.pager.do_allocate_page(
root_contents.page_type(),
0,
BtreePageAllocMode::Any
));
let is_page_1 = root.get().id == 1;
let offset = if is_page_1 { DatabaseHeader::SIZE } else { 0 };
tracing::debug!(
"balance_root(root={}, rightmost={}, page_type={:?})",
@@ -4961,7 +5059,7 @@ impl BTreeCursor {
}
}
let balance_both = leaf_underflows && interior_overflows_or_underflows;
assert!(self.balance_state.sub_state == BalanceSubState::Start, "There should be no balancing operation in progress when delete state is {:?}, got: {:?}", self.state, self.balance_state.sub_state);
assert!(matches!(self.balance_state.sub_state, BalanceSubState::Start), "There should be no balancing operation in progress when delete state is {:?}, got: {:?}", self.state, self.balance_state.sub_state);
let post_balancing_seek_key = post_balancing_seek_key
.take()
.expect("post_balancing_seek_key should be Some");
@@ -6430,6 +6528,12 @@ struct CellArray {
cell_count_per_page_cumulative: [u16; MAX_NEW_SIBLING_PAGES_AFTER_BALANCE],
}
impl std::fmt::Debug for CellArray {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CellArray").finish()
}
}
impl CellArray {
pub fn cell_size_bytes(&self, cell_idx: usize) -> u16 {
self.cell_payloads[cell_idx].len() as u16