diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5e33ad610..723b059ec 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2376,16 +2376,38 @@ impl BTreeCursor { let parent_page = self.stack.top(); return_if_locked_maybe_load!(self.pager, parent_page); let parent_page = parent_page.get(); - // If `move_to` moved to rightmost page, cell index will be out of bounds. Meaning cell_count+1. - // In any other case, `move_to` will stay in the correct index. - if self.stack.current_cell_index() as usize - == parent_page.get_contents().cell_count() + 1 - { + let parent_contents = parent_page.get_contents(); + let page_type = parent_contents.page_type(); + turso_assert!( + matches!(page_type, PageType::IndexInterior | PageType::TableInterior), + "expected index or table interior page" + ); + let number_of_cells_in_parent = + parent_contents.cell_count() + parent_contents.overflow_cells.len(); + + // If `seek` moved to rightmost page, cell index will be out of bounds. Meaning cell_count+1. + // In any other case, `seek` will stay in the correct index. + let past_rightmost_pointer = + self.stack.current_cell_index() as usize == number_of_cells_in_parent + 1; + if past_rightmost_pointer { self.stack.retreat(); - } else if self.stack.current_cell_index() == -1 { - // We might've retreated in CheckRequiresBalancing, so advance to the next cell - // to prevent panic in the asserts below due to -1 index - self.stack.advance(); + } else if !parent_contents.overflow_cells.is_empty() { + // The ONLY way we can have an overflow cell in the parent is if we replaced an interior cell from a cell in the child, and that replacement did not fit. + // This can only happen on index btrees. + if matches!(page_type, PageType::IndexInterior) { + turso_assert!(parent_contents.overflow_cells.len() == 1, "index interior page must have no more than 1 overflow cell, as a result of InteriorNodeReplacement"); + } else { + turso_assert!(false, "{page_type:?} must have no overflow cells"); + } + let overflow_cell = parent_contents.overflow_cells.first().unwrap(); + let parent_page_cell_idx = self.stack.current_cell_index() as usize; + // Parent page must be positioned at the divider cell that overflowed due to the replacement. + turso_assert!( + overflow_cell.index == parent_page_cell_idx, + "overflow cell index must be the result of InteriorNodeReplacement that leaves both child and parent (id={}) unbalanced, and hence parent page's position must = overflow_cell.index. Instead got: parent_page_cell_idx={parent_page_cell_idx} overflow_cell.index={}", + parent_page.get().id, + overflow_cell.index + ); } self.pager.add_dirty(&parent_page); let parent_contents = parent_page.get().contents.as_ref().unwrap(); @@ -2396,23 +2418,9 @@ impl BTreeCursor { parent_page.get().id, page_to_balance_idx ); - turso_assert!( - matches!( - parent_contents.page_type(), - PageType::IndexInterior | PageType::TableInterior - ), - "expected index or table interior page" - ); // Part 1: Find the sibling pages to balance let mut pages_to_balance: [Option; MAX_SIBLING_PAGES_TO_BALANCE] = [const { None }; MAX_SIBLING_PAGES_TO_BALANCE]; - let number_of_cells_in_parent = - parent_contents.cell_count() + parent_contents.overflow_cells.len(); - - turso_assert!( - parent_contents.overflow_cells.is_empty(), - "balancing child page with overflowed parent not yet implemented" - ); turso_assert!( page_to_balance_idx <= parent_contents.cell_count(), "page_to_balance_idx={page_to_balance_idx} is out of bounds for parent cell count {number_of_cells_in_parent}" @@ -2449,10 +2457,41 @@ impl BTreeCursor { let right_pointer = if last_sibling_is_right_pointer { parent_contents.rightmost_pointer_raw().unwrap() } else { - let (start_of_cell, _) = parent_contents.cell_get_raw_region( - first_cell_divider + sibling_pointer, - self.usable_space(), + let max_overflow_cells = if matches!(page_type, PageType::IndexInterior) { + 1 + } else { + 0 + }; + turso_assert!( + parent_contents.overflow_cells.len() <= max_overflow_cells, + "must have at most {max_overflow_cells} overflow cell in the parent" ); + // OVERFLOW CELL ADJUSTMENT: + // Let there be parent with cells [0,1,2,3,4]. + // Let's imagine the cell at idx 2 gets replaced with a new payload that causes it to overflow. + // See handling of InteriorNodeReplacement in btree.rs. + // + // In this case the rightmost divider is going to be 3 (2 is the middle one and we pick neighbors 1-3). + // drop_cell(): [0,1,2,3,4] -> [0,1,3,4] <-- cells on right side get shifted left! + // insert_into_cell(): [0,1,3,4] -> [0,1,3,4] + overflow cell (2) <-- crucially, no physical shifting happens, overflow cell is stored separately + // + // This means '3' is actually physically located at index '2'. + // So IF the parent has an overflow cell, we need to subtract 1 to get the actual rightmost divider cell idx to physically read from. + // The formula for the actual cell idx is: + // first_cell_divider + sibling_pointer - parent_contents.overflow_cells.len() + // so in the above case: + // actual_cell_idx = 1 + 2 - 1 = 2 + // + // In the case where the last divider cell is the overflow cell, there would be no left-shifting of cells in drop_cell(), + // because they are still positioned correctly (imagine .pop() from a vector). + // However, note that we are always looking for the _rightmost_ child page pointer between the (max 2) dividers, and for any case where the last divider cell is the overflow cell, + // the 'last_sibling_is_right_pointer' condition will also be true (since the overflow cell's left child will be the middle page), so we won't enter this code branch. + // + // Hence: when we enter this branch with overflow_cells.len() == 1, we know that left-shifting has happened and we need to subtract 1. + let actual_cell_idx = + first_cell_divider + sibling_pointer - parent_contents.overflow_cells.len(); + let (start_of_cell, _) = + parent_contents.cell_get_raw_region(actual_cell_idx, self.usable_space()); let buf = parent_contents.as_ptr().as_mut_ptr(); unsafe { buf.add(start_of_cell) } }; @@ -2477,25 +2516,58 @@ impl BTreeCursor { ); } pages_to_balance[i].replace(page); - turso_assert!( - parent_contents.overflow_cells.is_empty(), - "overflow in parent is not yet implented while balancing it" - ); if i == 0 { break; } let next_cell_divider = i + first_cell_divider - 1; - pgno = match parent_contents.cell_get(next_cell_divider, self.usable_space())? { - BTreeCell::TableInteriorCell(TableInteriorCell { - left_child_page, .. - }) - | BTreeCell::IndexInteriorCell(IndexInteriorCell { - left_child_page, .. - }) => left_child_page, - other => { - crate::bail_corrupt_error!("expected interior cell, got {:?}", other) - } - }; + let divider_is_overflow_cell = parent_contents + .overflow_cells + .first() + .is_some_and(|overflow_cell| overflow_cell.index == next_cell_divider); + if divider_is_overflow_cell { + turso_assert!( + matches!(parent_contents.page_type(), PageType::IndexInterior), + "expected index interior page, got {:?}", + parent_contents.page_type() + ); + turso_assert!( + parent_contents.overflow_cells.len() == 1, + "must have a single overflow cell in the parent, as a result of InteriorNodeReplacement" + ); + let overflow_cell = parent_contents.overflow_cells.first().unwrap(); + pgno = u32::from_be_bytes(overflow_cell.payload[0..4].try_into().unwrap()); + } else { + // grep for 'OVERFLOW CELL ADJUSTMENT' for explanation. + // here we only subtract 1 if the divider cell has been shifted left, i.e. the overflow cell was placed to the left + // this cell. + let actual_cell_idx = + if let Some(overflow_cell) = parent_contents.overflow_cells.first() { + if next_cell_divider < overflow_cell.index { + next_cell_divider + } else { + next_cell_divider - 1 + } + } else { + next_cell_divider + }; + pgno = + match parent_contents.cell_get(actual_cell_idx, self.usable_space())? { + BTreeCell::TableInteriorCell(TableInteriorCell { + left_child_page, + .. + }) + | BTreeCell::IndexInteriorCell(IndexInteriorCell { + left_child_page, + .. + }) => left_child_page, + other => { + crate::bail_corrupt_error!( + "expected interior cell, got {:?}", + other + ) + } + }; + } } #[cfg(debug_assertions)] @@ -2547,11 +2619,6 @@ impl BTreeCursor { let parent_contents = parent_page.get_contents(); let parent_is_root = !self.stack.has_parent(); - turso_assert!( - parent_contents.overflow_cells.is_empty(), - "overflow parent not yet implemented" - ); - // 1. Collect cell data from divider cells, and count the total number of cells to be distributed. // The count includes: all cells and overflow cells from the sibling pages, and divider cells from the parent page, // excluding the rightmost divider, which will not be dropped from the parent; instead it will be updated at the end. @@ -2575,10 +2642,33 @@ impl BTreeCursor { } // Since we know we have a left sibling, take the divider that points to left sibling of this page let cell_idx = balance_info.first_divider_cell + i; - let (cell_start, cell_len) = - parent_contents.cell_get_raw_region(cell_idx, self.usable_space()); - let buf = parent_contents.as_ptr(); - let cell_buf = &buf[cell_start..cell_start + cell_len]; + let divider_is_overflow_cell = parent_contents + .overflow_cells + .first() + .is_some_and(|overflow_cell| overflow_cell.index == cell_idx); + let cell_buf = if divider_is_overflow_cell { + turso_assert!( + matches!(parent_contents.page_type(), PageType::IndexInterior), + "expected index interior page, got {:?}", + parent_contents.page_type() + ); + turso_assert!( + parent_contents.overflow_cells.len() == 1, + "must have a single overflow cell in the parent, as a result of InteriorNodeReplacement" + ); + let overflow_cell = parent_contents.overflow_cells.first().unwrap(); + &overflow_cell.payload + } else { + // grep for 'OVERFLOW CELL ADJUSTMENT' for explanation. + // here we can subtract overflow_cells.len() every time, because we are iterating right-to-left, + // so if we are to the left of the overflow cell, it has already been cleared from the parent and overflow_cells.len() is 0. + let actual_cell_idx = cell_idx - parent_contents.overflow_cells.len(); + let (cell_start, cell_len) = parent_contents + .cell_get_raw_region(actual_cell_idx, self.usable_space()); + let buf = parent_contents.as_ptr(); + &buf[cell_start..cell_start + cell_len] + }; + // Count the divider cell itself (which will be dropped from the parent) total_cells_to_redistribute += 1; @@ -2591,12 +2681,24 @@ impl BTreeCursor { // TODO(pere): make this reference and not copy balance_info.divider_cell_payloads[i].replace(cell_buf.to_vec()); - tracing::trace!( - "dropping divider cell from parent cell_idx={} count={}", - cell_idx, - parent_contents.cell_count() - ); - drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?; + if divider_is_overflow_cell { + tracing::debug!( + "clearing overflow cells from parent cell_idx={}", + cell_idx + ); + parent_contents.overflow_cells.clear(); + } else { + // grep for 'OVERFLOW CELL ADJUSTMENT' for explanation. + // here we can subtract overflow_cells.len() every time, because we are iterating right-to-left, + // so if we are to the left of the overflow cell, it has already been cleared from the parent and overflow_cells.len() is 0. + let actual_cell_idx = cell_idx - parent_contents.overflow_cells.len(); + tracing::trace!( + "dropping divider cell from parent cell_idx={} count={}", + actual_cell_idx, + parent_contents.cell_count() + ); + drop_cell(parent_contents, actual_cell_idx, self.usable_space() as u16)?; + } } /* 2. Initialize CellArray with all the cells used for distribution, this includes divider cells if !leaf. */ @@ -4428,7 +4530,17 @@ impl BTreeCursor { // Step 1: Move cursor to the largest key in the left subtree. // The largest key is always in a leaf, and so this traversal may involvegoing multiple pages downwards, // so we store the page we are currently on. - return_if_io!(self.get_prev_record()); // avoid calling prev() because it internally calls restore_context() which may cause unintended behavior. + + // avoid calling prev() because it internally calls restore_context() which may cause unintended behavior. + return_if_io!(self.get_prev_record()); + + // Ensure we keep the parent page at the same position as before the replacement. + self.stack + .node_states + .borrow_mut() + .get_mut(btree_depth) + .expect("parent page should be on the stack") + .cell_idx = cell_idx as i32; let (cell_payload, leaf_cell_idx) = { let leaf_page_ref = self.stack.top(); let leaf_page = leaf_page_ref.get(); @@ -7385,6 +7497,253 @@ mod tests { } } + fn btree_index_insert_delete_fuzz_run( + attempts: usize, + operations: usize, + size: impl Fn(&mut ChaCha8Rng) -> usize, + insert_chance: f64, + ) { + use crate::storage::pager::CreateBTreeFlags; + + let (mut rng, seed) = if std::env::var("SEED").is_ok() { + let seed = std::env::var("SEED").unwrap(); + let seed = seed.parse::().unwrap(); + let rng = ChaCha8Rng::seed_from_u64(seed); + (rng, seed) + } else { + rng_from_time_or_env() + }; + let mut seen = HashSet::new(); + tracing::info!("super seed: {}", seed); + + for _ in 0..attempts { + let (pager, _, _db, conn) = empty_btree(); + let index_root_page_result = + pager.btree_create(&CreateBTreeFlags::new_index()).unwrap(); + let index_root_page = match index_root_page_result { + crate::types::IOResult::Done(id) => id as usize, + crate::types::IOResult::IO => { + panic!("btree_create returned IO in test, unexpected") + } + }; + let index_def = Index { + name: "testindex".to_string(), + columns: vec![IndexColumn { + name: "testcol".to_string(), + order: SortOrder::Asc, + collation: None, + pos_in_table: 0, + default: None, + }], + table_name: "test".to_string(), + root_page: index_root_page, + unique: false, + ephemeral: false, + has_rowid: false, + }; + let mut cursor = + BTreeCursor::new_index(None, pager.clone(), index_root_page, &index_def, 1); + + // Track expected keys that should be present in the tree + let mut expected_keys = Vec::new(); + + tracing::info!("seed: {seed}"); + for i in 0..operations { + let print_progress = i % 100 == 0; + pager.begin_read_tx().unwrap(); + pager.begin_write_tx().unwrap(); + + // Decide whether to insert or delete (80% chance of insert) + let is_insert = rng.next_u64() % 100 < (insert_chance * 100.0) as u64; + + if is_insert { + // Generate a unique key for insertion + let key = { + let result; + loop { + let sizeof_blob = size(&mut rng); + let blob = (0..sizeof_blob) + .map(|_| (rng.next_u64() % 256) as u8) + .collect::>(); + if seen.contains(&blob) { + continue; + } else { + seen.insert(blob.clone()); + } + result = blob; + break; + } + result + }; + + if print_progress { + tracing::info!("insert {}/{}, seed: {seed}", i + 1, operations); + } + expected_keys.push(key.clone()); + + let regs = vec![Register::Value(Value::Blob(key))]; + let value = ImmutableRecord::from_registers(®s, regs.len()); + + let seek_result = run_until_done( + || { + let record = ImmutableRecord::from_registers(®s, regs.len()); + let key = SeekKey::IndexKey(&record); + cursor.seek(key, SeekOp::GE { eq_only: true }) + }, + pager.deref(), + ) + .unwrap(); + if let SeekResult::TryAdvance = seek_result { + run_until_done(|| cursor.next(), pager.deref()).unwrap(); + } + run_until_done( + || { + cursor.insert( + &BTreeKey::new_index_key(&value), + cursor.is_write_in_progress(), + ) + }, + pager.deref(), + ) + .unwrap(); + } else { + // Delete a random existing key + if !expected_keys.is_empty() { + let delete_idx = rng.next_u64() as usize % expected_keys.len(); + let key_to_delete = expected_keys[delete_idx].clone(); + + if print_progress { + tracing::info!("delete {}/{}, seed: {seed}", i + 1, operations); + } + + let regs = vec![Register::Value(Value::Blob(key_to_delete.clone()))]; + let record = ImmutableRecord::from_registers(®s, regs.len()); + + // Seek to the key to delete + let seek_result = run_until_done( + || { + cursor + .seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true }) + }, + pager.deref(), + ) + .unwrap(); + let mut found = matches!(seek_result, SeekResult::Found); + if matches!(seek_result, SeekResult::TryAdvance) { + found = run_until_done(|| cursor.next(), pager.deref()).unwrap(); + } + assert!(found, "expected key {key_to_delete:?} is not found"); + + // Delete the key + run_until_done(|| cursor.delete(), pager.deref()).unwrap(); + + // Remove from expected keys + expected_keys.remove(delete_idx); + } + } + + cursor.move_to_root().unwrap(); + loop { + match pager.end_tx(false, false, &conn, false).unwrap() { + IOResult::Done(_) => break, + IOResult::IO => { + pager.io.run_once().unwrap(); + } + } + } + } + + // Final validation + let mut sorted_keys = expected_keys.clone(); + sorted_keys.sort(); + validate_expected_keys(&pager, &mut cursor, &sorted_keys, seed); + + pager.end_read_tx().unwrap(); + } + } + + fn validate_expected_keys( + pager: &Rc, + cursor: &mut BTreeCursor, + expected_keys: &[Vec], + seed: u64, + ) { + // Check that all expected keys can be found by seeking + pager.begin_read_tx().unwrap(); + cursor.move_to_root().unwrap(); + for (i, key) in expected_keys.iter().enumerate() { + tracing::info!( + "validating key {}/{}, seed: {seed}", + i + 1, + expected_keys.len() + ); + let exists = run_until_done( + || { + let regs = vec![Register::Value(Value::Blob(key.clone()))]; + cursor.seek( + SeekKey::IndexKey(&ImmutableRecord::from_registers(®s, regs.len())), + SeekOp::GE { eq_only: true }, + ) + }, + pager.deref(), + ) + .unwrap(); + let mut found = matches!(exists, SeekResult::Found); + if matches!(exists, SeekResult::TryAdvance) { + found = run_until_done(|| cursor.next(), pager.deref()).unwrap(); + } + assert!(found, "expected key {key:?} is not found"); + } + + // Check key count + cursor.move_to_root().unwrap(); + run_until_done(|| cursor.rewind(), pager.deref()).unwrap(); + if !cursor.has_record.get() { + panic!("no keys in tree"); + } + let mut count = 1; + loop { + run_until_done(|| cursor.next(), pager.deref()).unwrap(); + if !cursor.has_record.get() { + break; + } + count += 1; + } + assert_eq!( + count, + expected_keys.len(), + "key count is not right, got {}, expected {}, seed: {seed}", + count, + expected_keys.len() + ); + + // Check that all keys can be found in-order, by iterating the btree + cursor.move_to_root().unwrap(); + for (i, key) in expected_keys.iter().enumerate() { + run_until_done(|| cursor.next(), pager.deref()).unwrap(); + tracing::info!( + "iterating key {}/{}, cursor stack cur idx: {:?}, cursor stack depth: {:?}, seed: {seed}", + i + 1, + expected_keys.len(), + cursor.stack.current_cell_index(), + cursor.stack.current() + ); + let record = run_until_done(|| cursor.record(), pager).unwrap(); + let record = record.as_ref().unwrap(); + let cur = record.get_values().clone(); + let cur = cur.first().unwrap(); + let RefValue::Blob(ref cur) = cur else { + panic!("expected blob, got {cur:?}"); + }; + assert_eq!( + cur.to_slice(), + key, + "key {key:?} is not found, seed: {seed}" + ); + } + pager.end_read_tx().unwrap(); + } + #[test] pub fn test_drop_odd() { let db = get_database(); @@ -7443,6 +7802,20 @@ mod tests { btree_index_insert_fuzz_run(2, 1024); } + #[test] + pub fn btree_index_insert_delete_fuzz_run_test() { + btree_index_insert_delete_fuzz_run( + 2, + 2000, + |rng| { + let min: u32 = 4; + let size = min + rng.next_u32() % (1024 - min); + size as usize + }, + 0.65, + ); + } + #[test] pub fn btree_insert_fuzz_run_random() { btree_insert_fuzz_run(128, 16, |rng| (rng.next_u32() % 4096) as usize); @@ -7478,6 +7851,21 @@ mod tests { btree_index_insert_fuzz_run(2, 10_000); } + #[test] + #[ignore] + pub fn fuzz_long_btree_index_insert_delete_fuzz_run() { + btree_index_insert_delete_fuzz_run( + 2, + 10000, + |rng| { + let min: u32 = 4; + let size = min + rng.next_u32() % (1024 - min); + size as usize + }, + 0.65, + ); + } + #[test] #[ignore] pub fn fuzz_long_btree_insert_fuzz_run_random() {