Merge 'Refactor: use regular save/restore context mechanism for delete balancing' from Jussi Saurio

This is the first in a series of PRs whose end goal is to close the
#2004 issue, but I don't want to make that PR too bloated.
- Removes special `DeleteSavepoint` and uses the existing cursor
restoration mechanism.
- This required some restructuring of `DeleteState` to avoid cloning it,
i.e. some negotations with the borrow checker.
- CursorContext now takes a SeekOp as well to allow retaining the
behavior that we use LT for seeking after a delete-induced rebalancing.
This behavior will probably be removed as part of fixing #2004, but here
I am trying to preserve the current semantics.

Reviewed-by: Pere Diaz Bou <pere-altea@homail.com>

Closes #2638
This commit is contained in:
Jussi Saurio
2025-08-18 13:38:48 +03:00
committed by GitHub

View File

@@ -150,27 +150,21 @@ struct DestroyInfo {
state: DestroyState,
}
#[derive(Debug, Clone)]
enum DeleteSavepoint {
Rowid(i64),
Payload(ImmutableRecord),
}
#[derive(Debug, Clone)]
#[derive(Debug)]
enum DeleteState {
Start,
DeterminePostBalancingSeekKey,
LoadPage {
post_balancing_seek_key: Option<DeleteSavepoint>,
post_balancing_seek_key: Option<CursorContext>,
},
FindCell {
post_balancing_seek_key: Option<DeleteSavepoint>,
post_balancing_seek_key: Option<CursorContext>,
},
ClearOverflowPages {
cell_idx: usize,
cell: BTreeCell,
original_child_pointer: Option<u32>,
post_balancing_seek_key: Option<DeleteSavepoint>,
post_balancing_seek_key: Option<CursorContext>,
},
InteriorNodeReplacement {
page: PageRef,
@@ -180,26 +174,19 @@ enum DeleteState {
btree_depth: usize,
cell_idx: usize,
original_child_pointer: Option<u32>,
post_balancing_seek_key: Option<DeleteSavepoint>,
post_balancing_seek_key: Option<CursorContext>,
},
CheckNeedsBalancing {
/// same as `InteriorNodeReplacement::btree_depth`
btree_depth: usize,
post_balancing_seek_key: Option<DeleteSavepoint>,
post_balancing_seek_key: Option<CursorContext>,
},
Balancing {
/// If provided, will also balance an ancestor page at depth `balance_ancestor_at_depth`.
/// If not provided, balancing will stop as soon as a level is encountered where no balancing is required.
balance_ancestor_at_depth: Option<usize>,
target_key: DeleteSavepoint,
},
SeekAfterBalancing {
target_key: DeleteSavepoint,
},
/// If the seek performed in [DeleteState::SeekAfterBalancing] returned a [SeekResult::TryAdvance] we need to call next()/prev() to get to the right location.
/// We need to have this separate state for re-entrancy as calling next()/prev() might yield on IO.
/// FIXME: refactor DeleteState not to have SeekAfterBalancing and instead use save_context() and restore_context()
TryAdvance,
RestoreContextAfterBalancing,
}
#[derive(Debug)]
@@ -403,7 +390,8 @@ enum OverflowState {
/// Holds a Record or RowId, so that these can be transformed into a SeekKey to restore
/// cursor position to its previous location.
pub enum CursorContext {
#[derive(Debug)]
pub enum CursorContextKey {
TableRowId(i64),
/// If we are in an index tree we can then reuse this field to save
@@ -411,6 +399,30 @@ pub enum CursorContext {
IndexKeyRowId(ImmutableRecord),
}
#[derive(Debug)]
pub struct CursorContext {
pub key: CursorContextKey,
pub seek_op: SeekOp,
}
impl CursorContext {
fn seek_eq_only(key: &BTreeKey<'_>) -> Self {
Self {
key: key.into(),
seek_op: SeekOp::GE { eq_only: true },
}
}
}
impl From<&BTreeKey<'_>> for CursorContextKey {
fn from(key: &BTreeKey<'_>) -> Self {
match key {
BTreeKey::TableRowId((rowid, _)) => CursorContextKey::TableRowId(*rowid),
BTreeKey::IndexKey(record) => CursorContextKey::IndexKeyRowId((*record).clone()),
}
}
}
/// In the future, we may expand these general validity states
#[derive(Debug, PartialEq, Eq)]
pub enum CursorValidState {
@@ -2241,14 +2253,7 @@ impl BTreeCursor {
*write_state = WriteState::Balancing;
assert!(self.balance_state.sub_state == BalanceSubState::Start, "There should be no balancing operation in progress when insert state is {:?}, got: {:?}", self.state, self.balance_state.sub_state);
// If we balance, we must save the cursor position and seek to it later.
// FIXME: we shouldn't have both DeleteState::SeekAfterBalancing and
// save_context()/restore/context(), they are practically the same thing.
self.save_context(match bkey {
BTreeKey::TableRowId(rowid) => CursorContext::TableRowId(rowid.0),
BTreeKey::IndexKey(record) => {
CursorContext::IndexKeyRowId((*record).clone())
}
});
self.save_context(CursorContext::seek_eq_only(bkey));
} else {
*write_state = WriteState::Finish;
}
@@ -2296,14 +2301,7 @@ impl BTreeCursor {
*write_state = WriteState::Balancing;
assert!(self.balance_state.sub_state == BalanceSubState::Start, "There should be no balancing operation in progress when overwrite state is {:?}, got: {:?}", self.state, self.balance_state.sub_state);
// If we balance, we must save the cursor position and seek to it later.
// FIXME: we shouldn't have both DeleteState::SeekAfterBalancing and
// save_context()/restore/context(), they are practically the same thing.
self.save_context(match bkey {
BTreeKey::TableRowId(rowid) => CursorContext::TableRowId(rowid.0),
BTreeKey::IndexKey(record) => {
CursorContext::IndexKeyRowId((*record).clone())
}
});
self.save_context(CursorContext::seek_eq_only(bkey));
} else {
*write_state = WriteState::Finish;
}
@@ -4516,8 +4514,9 @@ impl BTreeCursor {
}
loop {
let delete_state = match &self.state {
CursorState::Delete(x) => x.clone(),
let usable_space = self.usable_space();
let delete_state = match &mut self.state {
CursorState::Delete(x) => x,
_ => unreachable!("expected delete state"),
};
tracing::debug!(?delete_state);
@@ -4552,12 +4551,18 @@ impl BTreeCursor {
Some(record) => record.clone(),
None => unreachable!("there should've been a record"),
};
DeleteSavepoint::Payload(record)
CursorContext {
key: CursorContextKey::IndexKeyRowId(record),
seek_op: SeekOp::LT,
}
} else {
let Some(rowid) = return_if_io!(self.rowid()) else {
panic!("cursor should be pointing to a record with a rowid");
};
DeleteSavepoint::Rowid(rowid)
CursorContext {
key: CursorContextKey::TableRowId(rowid),
seek_op: SeekOp::LT,
}
};
self.state = CursorState::Delete(DeleteState::LoadPage {
@@ -4571,7 +4576,7 @@ impl BTreeCursor {
let _page: Arc<BTreePageInner> = self.stack.top();
self.state = CursorState::Delete(DeleteState::FindCell {
post_balancing_seek_key,
post_balancing_seek_key: post_balancing_seek_key.take(),
});
}
@@ -4597,7 +4602,7 @@ impl BTreeCursor {
cell_idx
);
let cell = contents.cell_get(cell_idx, self.usable_space())?;
let cell = contents.cell_get(cell_idx, usable_space)?;
let original_child_pointer = match &cell {
BTreeCell::TableInteriorCell(interior) => Some(interior.left_child_page),
@@ -4609,18 +4614,24 @@ impl BTreeCursor {
cell_idx,
cell,
original_child_pointer,
post_balancing_seek_key,
post_balancing_seek_key: post_balancing_seek_key.take(),
});
}
DeleteState::ClearOverflowPages {
cell_idx,
cell,
original_child_pointer,
post_balancing_seek_key,
} => {
DeleteState::ClearOverflowPages { cell, .. } => {
let cell = cell.clone();
return_if_io!(self.clear_overflow_pages(&cell));
let CursorState::Delete(DeleteState::ClearOverflowPages {
cell_idx,
original_child_pointer,
ref mut post_balancing_seek_key,
..
}) = self.state
else {
unreachable!("expected clear overflow pages state");
};
let page = self.stack.top();
let page = page.get();
let contents = page.get_contents();
@@ -4631,25 +4642,19 @@ impl BTreeCursor {
btree_depth: self.stack.current(),
cell_idx,
original_child_pointer,
post_balancing_seek_key,
post_balancing_seek_key: post_balancing_seek_key.take(),
});
} else {
drop_cell(contents, cell_idx, self.usable_space())?;
drop_cell(contents, cell_idx, usable_space)?;
self.state = CursorState::Delete(DeleteState::CheckNeedsBalancing {
btree_depth: self.stack.current(),
post_balancing_seek_key,
post_balancing_seek_key: post_balancing_seek_key.take(),
});
}
}
DeleteState::InteriorNodeReplacement {
page,
btree_depth,
cell_idx,
original_child_pointer,
post_balancing_seek_key,
} => {
DeleteState::InteriorNodeReplacement { .. } => {
// This is an interior node, we need to handle deletion differently.
// 1. Move cursor to the largest key in the left subtree.
// 2. Replace the cell in the interior (parent) node with that key.
@@ -4662,6 +4667,18 @@ impl BTreeCursor {
// avoid calling prev() because it internally calls restore_context() which may cause unintended behavior.
return_if_io!(self.get_prev_record());
let CursorState::Delete(DeleteState::InteriorNodeReplacement {
ref page,
btree_depth,
cell_idx,
original_child_pointer,
ref mut post_balancing_seek_key,
..
}) = self.state
else {
unreachable!("expected interior node replacement state");
};
// Ensure we keep the parent page at the same position as before the replacement.
self.stack
.node_states
@@ -4677,7 +4694,7 @@ impl BTreeCursor {
assert!(leaf_contents.cell_count() > 0);
let leaf_cell_idx = leaf_contents.cell_count() - 1;
let last_cell_on_child_page =
leaf_contents.cell_get(leaf_cell_idx, self.usable_space())?;
leaf_contents.cell_get(leaf_cell_idx, usable_space)?;
let mut cell_payload: Vec<u8> = Vec::new();
let child_pointer =
@@ -4710,7 +4727,7 @@ impl BTreeCursor {
let leaf_page = self.stack.top();
self.pager.add_dirty(&page);
self.pager.add_dirty(page);
self.pager.add_dirty(&leaf_page.get());
// Step 2: Replace the cell in the parent (interior) page.
@@ -4728,33 +4745,25 @@ impl BTreeCursor {
);
// First, drop the old cell that is being replaced.
drop_cell(parent_contents, cell_idx, self.usable_space())?;
drop_cell(parent_contents, cell_idx, usable_space)?;
// Then, insert the new cell (the predecessor) in its place.
insert_into_cell(
parent_contents,
&cell_payload,
cell_idx,
self.usable_space(),
)?;
insert_into_cell(parent_contents, &cell_payload, cell_idx, usable_space)?;
}
// Step 3: Delete the predecessor cell from the leaf page.
{
let leaf_page_ref = leaf_page.get();
let leaf_contents = leaf_page_ref.get_contents();
drop_cell(leaf_contents, leaf_cell_idx, self.usable_space())?;
drop_cell(leaf_contents, leaf_cell_idx, usable_space)?;
}
self.state = CursorState::Delete(DeleteState::CheckNeedsBalancing {
btree_depth,
post_balancing_seek_key,
post_balancing_seek_key: post_balancing_seek_key.take(),
});
}
DeleteState::CheckNeedsBalancing {
btree_depth,
post_balancing_seek_key,
} => {
DeleteState::CheckNeedsBalancing { btree_depth, .. } => {
let page = self.stack.top();
// Check if either the leaf page we took the replacement cell from underflows, or if the interior page we inserted it into overflows OR underflows.
// If the latter is true, we must always balance that level regardless of whether the leaf page (or any ancestor pages in between) need balancing.
@@ -4762,8 +4771,8 @@ impl BTreeCursor {
let leaf_underflows = {
let leaf_page = page.get();
let leaf_contents = leaf_page.get_contents();
let free_space = compute_free_space(leaf_contents, self.usable_space());
free_space * 3 > self.usable_space() * 2
let free_space = compute_free_space(leaf_contents, usable_space);
free_space * 3 > usable_space * 2
};
let interior_overflows_or_underflows = {
@@ -4772,7 +4781,7 @@ impl BTreeCursor {
// and we already loaded the current page above.
let interior_page = self
.stack
.get_page_at_level(btree_depth)
.get_page_at_level(*btree_depth)
.expect("ancestor page should be on the stack");
let interior_page = interior_page.get();
let interior_contents = interior_page.get_contents();
@@ -4780,14 +4789,22 @@ impl BTreeCursor {
if overflows {
true
} else {
let free_space =
compute_free_space(interior_contents, self.usable_space());
free_space * 3 > self.usable_space() * 2
let free_space = compute_free_space(interior_contents, usable_space);
free_space * 3 > usable_space * 2
}
};
let needs_balancing = leaf_underflows || interior_overflows_or_underflows;
let CursorState::Delete(DeleteState::CheckNeedsBalancing {
btree_depth,
ref mut post_balancing_seek_key,
..
}) = self.state
else {
unreachable!("expected check needs balancing state");
};
if needs_balancing {
let balance_only_ancestor =
!leaf_underflows && interior_overflows_or_underflows;
@@ -4799,13 +4816,16 @@ impl BTreeCursor {
}
let balance_both = leaf_underflows && interior_overflows_or_underflows;
assert!(self.balance_state.sub_state == BalanceSubState::Start, "There should be no balancing operation in progress when delete state is {:?}, got: {:?}", self.state, self.balance_state.sub_state);
let post_balancing_seek_key = post_balancing_seek_key
.take()
.expect("post_balancing_seek_key should be Some");
self.save_context(post_balancing_seek_key);
self.state = CursorState::Delete(DeleteState::Balancing {
balance_ancestor_at_depth: if balance_both {
Some(btree_depth)
} else {
None
},
target_key: post_balancing_seek_key.unwrap(),
});
} else {
// No balancing needed, we're done
@@ -4816,36 +4836,14 @@ impl BTreeCursor {
}
DeleteState::Balancing {
target_key,
balance_ancestor_at_depth,
} => {
let balance_ancestor_at_depth = *balance_ancestor_at_depth;
return_if_io!(self.balance(balance_ancestor_at_depth));
self.state =
CursorState::Delete(DeleteState::SeekAfterBalancing { target_key });
self.state = CursorState::Delete(DeleteState::RestoreContextAfterBalancing);
}
DeleteState::SeekAfterBalancing { target_key } => {
let key = match &target_key {
DeleteSavepoint::Rowid(rowid) => SeekKey::TableRowId(*rowid),
DeleteSavepoint::Payload(immutable_record) => {
SeekKey::IndexKey(immutable_record)
}
};
// We want to end up pointing at the row to the left of the position of the row we deleted, so
// that after we call next() in the loop,the next row we delete will again be the same position as this one.
let seek_result = return_if_io!(self.seek(key, SeekOp::LT));
if let SeekResult::TryAdvance = seek_result {
self.state = CursorState::Delete(DeleteState::TryAdvance);
continue;
}
self.state = CursorState::None;
return Ok(IOResult::Done(()));
}
DeleteState::TryAdvance => {
// we use LT always for post-delete seeks, which uses backwards iteration, so we always call prev() here.
return_if_io!(self.prev());
DeleteState::RestoreContextAfterBalancing => {
return_if_io!(self.restore_context());
self.state = CursorState::None;
return Ok(IOResult::Done(()));
}
@@ -5409,16 +5407,16 @@ impl BTreeCursor {
return Ok(IOResult::Done(()));
}
let ctx = self.context.take().unwrap();
let seek_key = match ctx {
CursorContext::TableRowId(rowid) => SeekKey::TableRowId(rowid),
CursorContext::IndexKeyRowId(ref record) => SeekKey::IndexKey(record),
let seek_key = match ctx.key {
CursorContextKey::TableRowId(rowid) => SeekKey::TableRowId(rowid),
CursorContextKey::IndexKeyRowId(ref record) => SeekKey::IndexKey(record),
};
let res = self.seek(seek_key, SeekOp::GE { eq_only: true })?;
let res = self.seek(seek_key, ctx.seek_op)?;
match res {
IOResult::Done(res) => {
if let SeekResult::TryAdvance = res {
self.valid_state =
CursorValidState::RequireAdvance(IterationDirection::Forwards);
CursorValidState::RequireAdvance(ctx.seek_op.iteration_direction());
self.context = Some(ctx);
io_yield_one!(Completion::new_dummy());
}