Merge 'btree/balance: support case where immediate parent page of unbalanced child page also overflows' from Jussi Saurio

Closes #2241
## What
When an index interior cell is deleted, the btree steals the leaf cell with
the largest key in the interior cell's left subtree, drops the old interior
cell, and inserts the stolen cell in its place. This preserves the
binary-search-tree ordering of the btree. However, it can leave the tree in
a state where both of the following are true (see the toy sketch after this
list):
1. The leaf page is now UNDERFULL and must be rebalanced
2. The leaf's IMMEDIATE parent page is now OVERFULL and must be
rebalanced
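A toy sketch of the mechanics (hypothetical types, not the real `turso_core`
code): the stolen leaf cell can be larger than the divider it replaces, so
the replacement may not fit back into the parent page and is kept aside as an
overflow cell.

```rust
// Toy model of InteriorNodeReplacement. A replacement cell that no longer
// fits is kept aside as an overflow cell instead of being written into the
// page body.
#[derive(Debug)]
struct Cell {
    payload: Vec<u8>,
}

#[derive(Debug)]
struct Page {
    cells: Vec<Cell>,    // cells physically stored on the page
    overflow: Vec<Cell>, // cells that did not fit
    free_bytes: usize,   // remaining space on the page
}

impl Page {
    // drop_cell(): remove the old divider and reclaim its space.
    fn drop_cell(&mut self, idx: usize) {
        let cell = self.cells.remove(idx);
        self.free_bytes += cell.payload.len();
    }

    // insert_into_cell(): insert the replacement; if it does not fit,
    // it becomes an overflow cell and the page is now OVERFULL.
    fn insert_cell(&mut self, idx: usize, cell: Cell) {
        if cell.payload.len() <= self.free_bytes {
            self.free_bytes -= cell.payload.len();
            self.cells.insert(idx, cell);
        } else {
            self.overflow.push(cell);
        }
    }
}

fn main() {
    let mut parent = Page {
        cells: vec![Cell { payload: vec![1, 2] }],
        overflow: vec![],
        free_bytes: 2,
    };
    // The stolen leaf cell (largest key in the left subtree) is bigger
    // than the divider it replaces, so it does not fit back in:
    parent.drop_cell(0);
    parent.insert_cell(0, Cell { payload: vec![9; 8] });
    assert_eq!(parent.overflow.len(), 1); // parent is now OVERFULL
    // ...while the leaf the cell was stolen from may now be UNDERFULL.
    println!("{parent:?}");
}
```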
## Why this is a problem
We simply didn't support the case where:
- Leaf page P is unbalanced and rebalancing starts on it, and
- its immediate parent is ALSO unbalanced and _overflows_.
We had an assertion against this happening (see #2241).
## The fix
Allow exactly one overflow cell in the parent under two very particular
conditions (distilled in the sketch after this list):
1. The parent page must be an index interior page
2. The parent must be positioned exactly at the divider cell whose left
child page underflows
This is the _only_ case where the immediate parent of a page about to
undergo rebalancing can have overflow cells.
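As a distilled sketch of that precondition (toy types; the real checks in the
diff below use `turso_assert!` and `turso_core`'s page types):

```rust
#[derive(Debug, PartialEq)]
enum PageType {
    IndexInterior,
    TableInterior,
}

struct OverflowCell {
    index: usize, // cell index the overflow cell would occupy
}

fn check_parent_overflow(
    page_type: &PageType,
    overflow_cells: &[OverflowCell],
    parent_cell_idx: usize,
) {
    if overflow_cells.is_empty() {
        return; // the common case: no overflow in the parent
    }
    // Condition 1: only an index interior page may carry the overflow cell...
    assert_eq!(*page_type, PageType::IndexInterior);
    // ...and only exactly one, produced by InteriorNodeReplacement.
    assert_eq!(overflow_cells.len(), 1);
    // Condition 2: the parent must sit on the divider that overflowed.
    assert_eq!(overflow_cells[0].index, parent_cell_idx);
}

fn main() {
    let overflow = [OverflowCell { index: 2 }];
    check_parent_overflow(&PageType::IndexInterior, &overflow, 2); // passes
}
```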
## Implementation details
The parent overflow cell is folded into `cell_array` fairly early on, and
`parent.overflow_cells` is cleared. However, we need to be careful with
`cell_idx` for dividers other than the overflow cell, because those cells
get physically shifted left on the page by `drop_cell()`. I've added a long
comment about this in the code.
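As a plain-arithmetic illustration of that comment (toy values, not the real
`turso_core` code):

```rust
// Parent cells [0,1,2,3,4]; the divider at idx 2 is replaced and overflows.
fn main() {
    let first_cell_divider = 1; // leftmost divider of the sibling window
    let sibling_pointer = 2;    // offset of the rightmost divider
    let overflow_cells_len = 1; // the replaced divider did not fit back

    // drop_cell(): [0,1,2,3,4] -> [0,1,3,4]; cells right of idx 2 shift
    // left, so logical divider '3' now physically lives at index 2.
    let actual_cell_idx = first_cell_divider + sibling_pointer - overflow_cells_len;
    assert_eq!(actual_cell_idx, 2);
    println!("read divider from physical index {actual_cell_idx}");
}
```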
## Testing
Adds a fuzz test that performs inserts and deletes on an index btree and
asserts that all expected keys are found, in order, at the end. The test
hits this case frequently, which made it possible to verify the fix.
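For reference, a minimal sketch of the replay hook the fuzz test relies on
(it mirrors the `SEED` handling in the test code below; assumes the `rand`
and `rand_chacha` crates, and the fallback value here is made up):

```rust
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;

// Setting the SEED env var reruns the exact same insert/delete sequence,
// e.g. `SEED=1721834916 cargo test btree_index_insert_delete_fuzz_run_test`.
fn seeded_rng() -> (ChaCha8Rng, u64) {
    let seed = std::env::var("SEED")
        .ok()
        .and_then(|s| s.parse::<u64>().ok())
        .unwrap_or(0xC0FFEE); // the real test falls back to a time-based seed
    (ChaCha8Rng::seed_from_u64(seed), seed)
}

fn main() {
    let (_rng, seed) = seeded_rng();
    println!("fuzzing with seed {seed}");
}
```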

Reviewed-by: Pere Diaz Bou <pere-altea@homail.com>

Closes #2243
Jussi Saurio
2025-07-24 18:48:36 +03:00


@@ -2376,16 +2376,38 @@ impl BTreeCursor {
let parent_page = self.stack.top();
return_if_locked_maybe_load!(self.pager, parent_page);
let parent_page = parent_page.get();
// If `move_to` moved to rightmost page, cell index will be out of bounds. Meaning cell_count+1.
// In any other case, `move_to` will stay in the correct index.
if self.stack.current_cell_index() as usize
== parent_page.get_contents().cell_count() + 1
{
let parent_contents = parent_page.get_contents();
let page_type = parent_contents.page_type();
turso_assert!(
matches!(page_type, PageType::IndexInterior | PageType::TableInterior),
"expected index or table interior page"
);
let number_of_cells_in_parent =
parent_contents.cell_count() + parent_contents.overflow_cells.len();
// If `seek` moved to rightmost page, cell index will be out of bounds. Meaning cell_count+1.
// In any other case, `seek` will stay in the correct index.
let past_rightmost_pointer =
self.stack.current_cell_index() as usize == number_of_cells_in_parent + 1;
if past_rightmost_pointer {
self.stack.retreat();
} else if self.stack.current_cell_index() == -1 {
// We might've retreated in CheckRequiresBalancing, so advance to the next cell
// to prevent panic in the asserts below due to -1 index
self.stack.advance();
} else if !parent_contents.overflow_cells.is_empty() {
// The ONLY way we can have an overflow cell in the parent is if we replaced an interior cell from a cell in the child, and that replacement did not fit.
// This can only happen on index btrees.
if matches!(page_type, PageType::IndexInterior) {
turso_assert!(parent_contents.overflow_cells.len() == 1, "index interior page must have no more than 1 overflow cell, as a result of InteriorNodeReplacement");
} else {
turso_assert!(false, "{page_type:?} must have no overflow cells");
}
let overflow_cell = parent_contents.overflow_cells.first().unwrap();
let parent_page_cell_idx = self.stack.current_cell_index() as usize;
// Parent page must be positioned at the divider cell that overflowed due to the replacement.
turso_assert!(
overflow_cell.index == parent_page_cell_idx,
"overflow cell index must be the result of InteriorNodeReplacement that leaves both child and parent (id={}) unbalanced, and hence parent page's position must = overflow_cell.index. Instead got: parent_page_cell_idx={parent_page_cell_idx} overflow_cell.index={}",
parent_page.get().id,
overflow_cell.index
);
}
self.pager.add_dirty(&parent_page);
let parent_contents = parent_page.get().contents.as_ref().unwrap();
@@ -2396,23 +2418,9 @@ impl BTreeCursor {
parent_page.get().id,
page_to_balance_idx
);
turso_assert!(
matches!(
parent_contents.page_type(),
PageType::IndexInterior | PageType::TableInterior
),
"expected index or table interior page"
);
// Part 1: Find the sibling pages to balance
let mut pages_to_balance: [Option<BTreePage>; MAX_SIBLING_PAGES_TO_BALANCE] =
[const { None }; MAX_SIBLING_PAGES_TO_BALANCE];
let number_of_cells_in_parent =
parent_contents.cell_count() + parent_contents.overflow_cells.len();
turso_assert!(
parent_contents.overflow_cells.is_empty(),
"balancing child page with overflowed parent not yet implemented"
);
turso_assert!(
page_to_balance_idx <= parent_contents.cell_count(),
"page_to_balance_idx={page_to_balance_idx} is out of bounds for parent cell count {number_of_cells_in_parent}"
@@ -2449,10 +2457,41 @@ impl BTreeCursor {
let right_pointer = if last_sibling_is_right_pointer {
parent_contents.rightmost_pointer_raw().unwrap()
} else {
let (start_of_cell, _) = parent_contents.cell_get_raw_region(
first_cell_divider + sibling_pointer,
self.usable_space(),
let max_overflow_cells = if matches!(page_type, PageType::IndexInterior) {
1
} else {
0
};
turso_assert!(
parent_contents.overflow_cells.len() <= max_overflow_cells,
"must have at most {max_overflow_cells} overflow cell in the parent"
);
// OVERFLOW CELL ADJUSTMENT:
// Let there be parent with cells [0,1,2,3,4].
// Let's imagine the cell at idx 2 gets replaced with a new payload that causes it to overflow.
// See handling of InteriorNodeReplacement in btree.rs.
//
// In this case the rightmost divider is going to be 3 (2 is the middle one and we pick neighbors 1-3).
// drop_cell(): [0,1,2,3,4] -> [0,1,3,4] <-- cells on right side get shifted left!
// insert_into_cell(): [0,1,3,4] -> [0,1,3,4] + overflow cell (2) <-- crucially, no physical shifting happens, overflow cell is stored separately
//
// This means '3' is actually physically located at index '2'.
// So IF the parent has an overflow cell, we need to subtract 1 to get the actual rightmost divider cell idx to physically read from.
// The formula for the actual cell idx is:
// first_cell_divider + sibling_pointer - parent_contents.overflow_cells.len()
// so in the above case:
// actual_cell_idx = 1 + 2 - 1 = 2
//
// In the case where the last divider cell is the overflow cell, there would be no left-shifting of cells in drop_cell(),
// because they are still positioned correctly (imagine .pop() from a vector).
// However, note that we are always looking for the _rightmost_ child page pointer between the (max 2) dividers, and for any case where the last divider cell is the overflow cell,
// the 'last_sibling_is_right_pointer' condition will also be true (since the overflow cell's left child will be the middle page), so we won't enter this code branch.
//
// Hence: when we enter this branch with overflow_cells.len() == 1, we know that left-shifting has happened and we need to subtract 1.
let actual_cell_idx =
first_cell_divider + sibling_pointer - parent_contents.overflow_cells.len();
let (start_of_cell, _) =
parent_contents.cell_get_raw_region(actual_cell_idx, self.usable_space());
let buf = parent_contents.as_ptr().as_mut_ptr();
unsafe { buf.add(start_of_cell) }
};
@@ -2477,25 +2516,58 @@ impl BTreeCursor {
);
}
pages_to_balance[i].replace(page);
turso_assert!(
parent_contents.overflow_cells.is_empty(),
"overflow in parent is not yet implented while balancing it"
);
if i == 0 {
break;
}
let next_cell_divider = i + first_cell_divider - 1;
pgno = match parent_contents.cell_get(next_cell_divider, self.usable_space())? {
BTreeCell::TableInteriorCell(TableInteriorCell {
left_child_page, ..
})
| BTreeCell::IndexInteriorCell(IndexInteriorCell {
left_child_page, ..
}) => left_child_page,
other => {
crate::bail_corrupt_error!("expected interior cell, got {:?}", other)
}
};
let divider_is_overflow_cell = parent_contents
.overflow_cells
.first()
.is_some_and(|overflow_cell| overflow_cell.index == next_cell_divider);
if divider_is_overflow_cell {
turso_assert!(
matches!(parent_contents.page_type(), PageType::IndexInterior),
"expected index interior page, got {:?}",
parent_contents.page_type()
);
turso_assert!(
parent_contents.overflow_cells.len() == 1,
"must have a single overflow cell in the parent, as a result of InteriorNodeReplacement"
);
let overflow_cell = parent_contents.overflow_cells.first().unwrap();
pgno = u32::from_be_bytes(overflow_cell.payload[0..4].try_into().unwrap());
} else {
// grep for 'OVERFLOW CELL ADJUSTMENT' for explanation.
// here we only subtract 1 if the divider cell has been shifted left, i.e. the overflow cell was placed to the left
// of this cell.
let actual_cell_idx =
if let Some(overflow_cell) = parent_contents.overflow_cells.first() {
if next_cell_divider < overflow_cell.index {
next_cell_divider
} else {
next_cell_divider - 1
}
} else {
next_cell_divider
};
pgno =
match parent_contents.cell_get(actual_cell_idx, self.usable_space())? {
BTreeCell::TableInteriorCell(TableInteriorCell {
left_child_page,
..
})
| BTreeCell::IndexInteriorCell(IndexInteriorCell {
left_child_page,
..
}) => left_child_page,
other => {
crate::bail_corrupt_error!(
"expected interior cell, got {:?}",
other
)
}
};
}
}
#[cfg(debug_assertions)]
@@ -2547,11 +2619,6 @@ impl BTreeCursor {
let parent_contents = parent_page.get_contents();
let parent_is_root = !self.stack.has_parent();
turso_assert!(
parent_contents.overflow_cells.is_empty(),
"overflow parent not yet implemented"
);
// 1. Collect cell data from divider cells, and count the total number of cells to be distributed.
// The count includes: all cells and overflow cells from the sibling pages, and divider cells from the parent page,
// excluding the rightmost divider, which will not be dropped from the parent; instead it will be updated at the end.
@@ -2575,10 +2642,33 @@ impl BTreeCursor {
}
// Since we know we have a left sibling, take the divider that points to left sibling of this page
let cell_idx = balance_info.first_divider_cell + i;
let (cell_start, cell_len) =
parent_contents.cell_get_raw_region(cell_idx, self.usable_space());
let buf = parent_contents.as_ptr();
let cell_buf = &buf[cell_start..cell_start + cell_len];
let divider_is_overflow_cell = parent_contents
.overflow_cells
.first()
.is_some_and(|overflow_cell| overflow_cell.index == cell_idx);
let cell_buf = if divider_is_overflow_cell {
turso_assert!(
matches!(parent_contents.page_type(), PageType::IndexInterior),
"expected index interior page, got {:?}",
parent_contents.page_type()
);
turso_assert!(
parent_contents.overflow_cells.len() == 1,
"must have a single overflow cell in the parent, as a result of InteriorNodeReplacement"
);
let overflow_cell = parent_contents.overflow_cells.first().unwrap();
&overflow_cell.payload
} else {
// grep for 'OVERFLOW CELL ADJUSTMENT' for explanation.
// here we can subtract overflow_cells.len() every time, because we are iterating right-to-left,
// so if we are to the left of the overflow cell, it has already been cleared from the parent and overflow_cells.len() is 0.
let actual_cell_idx = cell_idx - parent_contents.overflow_cells.len();
let (cell_start, cell_len) = parent_contents
.cell_get_raw_region(actual_cell_idx, self.usable_space());
let buf = parent_contents.as_ptr();
&buf[cell_start..cell_start + cell_len]
};
// Count the divider cell itself (which will be dropped from the parent)
total_cells_to_redistribute += 1;
@@ -2591,12 +2681,24 @@ impl BTreeCursor {
// TODO(pere): make this reference and not copy
balance_info.divider_cell_payloads[i].replace(cell_buf.to_vec());
tracing::trace!(
"dropping divider cell from parent cell_idx={} count={}",
cell_idx,
parent_contents.cell_count()
);
drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?;
if divider_is_overflow_cell {
tracing::debug!(
"clearing overflow cells from parent cell_idx={}",
cell_idx
);
parent_contents.overflow_cells.clear();
} else {
// grep for 'OVERFLOW CELL ADJUSTMENT' for explanation.
// here we can subtract overflow_cells.len() every time, because we are iterating right-to-left,
// so if we are to the left of the overflow cell, it has already been cleared from the parent and overflow_cells.len() is 0.
let actual_cell_idx = cell_idx - parent_contents.overflow_cells.len();
tracing::trace!(
"dropping divider cell from parent cell_idx={} count={}",
actual_cell_idx,
parent_contents.cell_count()
);
drop_cell(parent_contents, actual_cell_idx, self.usable_space() as u16)?;
}
}
/* 2. Initialize CellArray with all the cells used for distribution, this includes divider cells if !leaf. */
@@ -4428,7 +4530,17 @@ impl BTreeCursor {
// Step 1: Move cursor to the largest key in the left subtree.
// The largest key is always in a leaf, and so this traversal may involve going down multiple pages,
// so we store the page we are currently on.
return_if_io!(self.get_prev_record()); // avoid calling prev() because it internally calls restore_context() which may cause unintended behavior.
// avoid calling prev() because it internally calls restore_context() which may cause unintended behavior.
return_if_io!(self.get_prev_record());
// Ensure we keep the parent page at the same position as before the replacement.
self.stack
.node_states
.borrow_mut()
.get_mut(btree_depth)
.expect("parent page should be on the stack")
.cell_idx = cell_idx as i32;
let (cell_payload, leaf_cell_idx) = {
let leaf_page_ref = self.stack.top();
let leaf_page = leaf_page_ref.get();
@@ -7385,6 +7497,253 @@ mod tests {
}
}
fn btree_index_insert_delete_fuzz_run(
attempts: usize,
operations: usize,
size: impl Fn(&mut ChaCha8Rng) -> usize,
insert_chance: f64,
) {
use crate::storage::pager::CreateBTreeFlags;
let (mut rng, seed) = if std::env::var("SEED").is_ok() {
let seed = std::env::var("SEED").unwrap();
let seed = seed.parse::<u64>().unwrap();
let rng = ChaCha8Rng::seed_from_u64(seed);
(rng, seed)
} else {
rng_from_time_or_env()
};
let mut seen = HashSet::new();
tracing::info!("super seed: {}", seed);
for _ in 0..attempts {
let (pager, _, _db, conn) = empty_btree();
let index_root_page_result =
pager.btree_create(&CreateBTreeFlags::new_index()).unwrap();
let index_root_page = match index_root_page_result {
crate::types::IOResult::Done(id) => id as usize,
crate::types::IOResult::IO => {
panic!("btree_create returned IO in test, unexpected")
}
};
let index_def = Index {
name: "testindex".to_string(),
columns: vec![IndexColumn {
name: "testcol".to_string(),
order: SortOrder::Asc,
collation: None,
pos_in_table: 0,
default: None,
}],
table_name: "test".to_string(),
root_page: index_root_page,
unique: false,
ephemeral: false,
has_rowid: false,
};
let mut cursor =
BTreeCursor::new_index(None, pager.clone(), index_root_page, &index_def, 1);
// Track expected keys that should be present in the tree
let mut expected_keys = Vec::new();
tracing::info!("seed: {seed}");
for i in 0..operations {
let print_progress = i % 100 == 0;
pager.begin_read_tx().unwrap();
pager.begin_write_tx().unwrap();
// Decide whether to insert or delete (insert with probability `insert_chance`)
let is_insert = rng.next_u64() % 100 < (insert_chance * 100.0) as u64;
if is_insert {
// Generate a unique key for insertion
let key = {
let result;
loop {
let sizeof_blob = size(&mut rng);
let blob = (0..sizeof_blob)
.map(|_| (rng.next_u64() % 256) as u8)
.collect::<Vec<_>>();
if seen.contains(&blob) {
continue;
} else {
seen.insert(blob.clone());
}
result = blob;
break;
}
result
};
if print_progress {
tracing::info!("insert {}/{}, seed: {seed}", i + 1, operations);
}
expected_keys.push(key.clone());
let regs = vec![Register::Value(Value::Blob(key))];
let value = ImmutableRecord::from_registers(&regs, regs.len());
let seek_result = run_until_done(
|| {
let record = ImmutableRecord::from_registers(&regs, regs.len());
let key = SeekKey::IndexKey(&record);
cursor.seek(key, SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
if let SeekResult::TryAdvance = seek_result {
run_until_done(|| cursor.next(), pager.deref()).unwrap();
}
run_until_done(
|| {
cursor.insert(
&BTreeKey::new_index_key(&value),
cursor.is_write_in_progress(),
)
},
pager.deref(),
)
.unwrap();
} else {
// Delete a random existing key
if !expected_keys.is_empty() {
let delete_idx = rng.next_u64() as usize % expected_keys.len();
let key_to_delete = expected_keys[delete_idx].clone();
if print_progress {
tracing::info!("delete {}/{}, seed: {seed}", i + 1, operations);
}
let regs = vec![Register::Value(Value::Blob(key_to_delete.clone()))];
let record = ImmutableRecord::from_registers(&regs, regs.len());
// Seek to the key to delete
let seek_result = run_until_done(
|| {
cursor
.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true })
},
pager.deref(),
)
.unwrap();
let mut found = matches!(seek_result, SeekResult::Found);
if matches!(seek_result, SeekResult::TryAdvance) {
found = run_until_done(|| cursor.next(), pager.deref()).unwrap();
}
assert!(found, "expected key {key_to_delete:?} is not found");
// Delete the key
run_until_done(|| cursor.delete(), pager.deref()).unwrap();
// Remove from expected keys
expected_keys.remove(delete_idx);
}
}
cursor.move_to_root().unwrap();
loop {
match pager.end_tx(false, false, &conn, false).unwrap() {
IOResult::Done(_) => break,
IOResult::IO => {
pager.io.run_once().unwrap();
}
}
}
}
// Final validation
let mut sorted_keys = expected_keys.clone();
sorted_keys.sort();
validate_expected_keys(&pager, &mut cursor, &sorted_keys, seed);
pager.end_read_tx().unwrap();
}
}
fn validate_expected_keys(
pager: &Rc<Pager>,
cursor: &mut BTreeCursor,
expected_keys: &[Vec<u8>],
seed: u64,
) {
// Check that all expected keys can be found by seeking
pager.begin_read_tx().unwrap();
cursor.move_to_root().unwrap();
for (i, key) in expected_keys.iter().enumerate() {
tracing::info!(
"validating key {}/{}, seed: {seed}",
i + 1,
expected_keys.len()
);
let exists = run_until_done(
|| {
let regs = vec![Register::Value(Value::Blob(key.clone()))];
cursor.seek(
SeekKey::IndexKey(&ImmutableRecord::from_registers(&regs, regs.len())),
SeekOp::GE { eq_only: true },
)
},
pager.deref(),
)
.unwrap();
let mut found = matches!(exists, SeekResult::Found);
if matches!(exists, SeekResult::TryAdvance) {
found = run_until_done(|| cursor.next(), pager.deref()).unwrap();
}
assert!(found, "expected key {key:?} is not found");
}
// Check key count
cursor.move_to_root().unwrap();
run_until_done(|| cursor.rewind(), pager.deref()).unwrap();
if !cursor.has_record.get() {
panic!("no keys in tree");
}
let mut count = 1;
loop {
run_until_done(|| cursor.next(), pager.deref()).unwrap();
if !cursor.has_record.get() {
break;
}
count += 1;
}
assert_eq!(
count,
expected_keys.len(),
"key count is not right, got {}, expected {}, seed: {seed}",
count,
expected_keys.len()
);
// Check that all keys can be found in-order, by iterating the btree
cursor.move_to_root().unwrap();
for (i, key) in expected_keys.iter().enumerate() {
run_until_done(|| cursor.next(), pager.deref()).unwrap();
tracing::info!(
"iterating key {}/{}, cursor stack cur idx: {:?}, cursor stack depth: {:?}, seed: {seed}",
i + 1,
expected_keys.len(),
cursor.stack.current_cell_index(),
cursor.stack.current()
);
let record = run_until_done(|| cursor.record(), pager).unwrap();
let record = record.as_ref().unwrap();
let cur = record.get_values().clone();
let cur = cur.first().unwrap();
let RefValue::Blob(ref cur) = cur else {
panic!("expected blob, got {cur:?}");
};
assert_eq!(
cur.to_slice(),
key,
"key {key:?} is not found, seed: {seed}"
);
}
pager.end_read_tx().unwrap();
}
#[test]
pub fn test_drop_odd() {
let db = get_database();
@@ -7443,6 +7802,20 @@ mod tests {
btree_index_insert_fuzz_run(2, 1024);
}
#[test]
pub fn btree_index_insert_delete_fuzz_run_test() {
btree_index_insert_delete_fuzz_run(
2,
2000,
|rng| {
let min: u32 = 4;
let size = min + rng.next_u32() % (1024 - min);
size as usize
},
0.65,
);
}
#[test]
pub fn btree_insert_fuzz_run_random() {
btree_insert_fuzz_run(128, 16, |rng| (rng.next_u32() % 4096) as usize);
@@ -7478,6 +7851,21 @@ mod tests {
btree_index_insert_fuzz_run(2, 10_000);
}
#[test]
#[ignore]
pub fn fuzz_long_btree_index_insert_delete_fuzz_run() {
btree_index_insert_delete_fuzz_run(
2,
10000,
|rng| {
let min: u32 = 4;
let size = min + rng.next_u32() % (1024 - min);
size as usize
},
0.65,
);
}
#[test]
#[ignore]
pub fn fuzz_long_btree_insert_fuzz_run_random() {