From d773a7924d1dfa1ce047677c50edf1de541c3a4e Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 23 Jul 2025 18:42:31 +0300 Subject: [PATCH] fix/btree/balance: allow exactly 1 parent overflow cell for index balancing --- core/storage/btree.rs | 226 +++++++++++++++++++++++++++++++----------- 1 file changed, 169 insertions(+), 57 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5e33ad610..1a5022093 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2376,16 +2376,38 @@ impl BTreeCursor { let parent_page = self.stack.top(); return_if_locked_maybe_load!(self.pager, parent_page); let parent_page = parent_page.get(); - // If `move_to` moved to rightmost page, cell index will be out of bounds. Meaning cell_count+1. - // In any other case, `move_to` will stay in the correct index. - if self.stack.current_cell_index() as usize - == parent_page.get_contents().cell_count() + 1 - { + let parent_contents = parent_page.get_contents(); + let page_type = parent_contents.page_type(); + turso_assert!( + matches!(page_type, PageType::IndexInterior | PageType::TableInterior), + "expected index or table interior page" + ); + let number_of_cells_in_parent = + parent_contents.cell_count() + parent_contents.overflow_cells.len(); + + // If `seek` moved to rightmost page, cell index will be out of bounds. Meaning cell_count+1. + // In any other case, `seek` will stay in the correct index. + let past_rightmost_pointer = + self.stack.current_cell_index() as usize == number_of_cells_in_parent + 1; + if past_rightmost_pointer { self.stack.retreat(); - } else if self.stack.current_cell_index() == -1 { - // We might've retreated in CheckRequiresBalancing, so advance to the next cell - // to prevent panic in the asserts below due to -1 index - self.stack.advance(); + } else if !parent_contents.overflow_cells.is_empty() { + // The ONLY way we can have an overflow cell in the parent is if we replaced an interior cell from a cell in the child, and that replacement did not fit. + // This can only happen on index btrees. + if matches!(page_type, PageType::IndexInterior) { + turso_assert!(parent_contents.overflow_cells.len() == 1, "index interior page must have no more than 1 overflow cell, as a result of InteriorNodeReplacement"); + } else { + turso_assert!(false, "{page_type:?} must have no overflow cells"); + } + let overflow_cell = parent_contents.overflow_cells.first().unwrap(); + let parent_page_cell_idx = self.stack.current_cell_index() as usize; + // Parent page must be positioned at the divider cell that overflowed due to the replacement. + turso_assert!( + overflow_cell.index == parent_page_cell_idx, + "overflow cell index must be the result of InteriorNodeReplacement that leaves both child and parent (id={}) unbalanced, and hence parent page's position must = overflow_cell.index. Instead got: parent_page_cell_idx={parent_page_cell_idx} overflow_cell.index={}", + parent_page.get().id, + overflow_cell.index + ); } self.pager.add_dirty(&parent_page); let parent_contents = parent_page.get().contents.as_ref().unwrap(); @@ -2396,23 +2418,9 @@ impl BTreeCursor { parent_page.get().id, page_to_balance_idx ); - turso_assert!( - matches!( - parent_contents.page_type(), - PageType::IndexInterior | PageType::TableInterior - ), - "expected index or table interior page" - ); // Part 1: Find the sibling pages to balance let mut pages_to_balance: [Option; MAX_SIBLING_PAGES_TO_BALANCE] = [const { None }; MAX_SIBLING_PAGES_TO_BALANCE]; - let number_of_cells_in_parent = - parent_contents.cell_count() + parent_contents.overflow_cells.len(); - - turso_assert!( - parent_contents.overflow_cells.is_empty(), - "balancing child page with overflowed parent not yet implemented" - ); turso_assert!( page_to_balance_idx <= parent_contents.cell_count(), "page_to_balance_idx={page_to_balance_idx} is out of bounds for parent cell count {number_of_cells_in_parent}" @@ -2449,10 +2457,41 @@ impl BTreeCursor { let right_pointer = if last_sibling_is_right_pointer { parent_contents.rightmost_pointer_raw().unwrap() } else { - let (start_of_cell, _) = parent_contents.cell_get_raw_region( - first_cell_divider + sibling_pointer, - self.usable_space(), + let max_overflow_cells = if matches!(page_type, PageType::IndexInterior) { + 1 + } else { + 0 + }; + turso_assert!( + parent_contents.overflow_cells.len() <= max_overflow_cells, + "must have at most {max_overflow_cells} overflow cell in the parent" ); + // OVERFLOW CELL ADJUSTMENT: + // Let there be parent with cells [0,1,2,3,4]. + // Let's imagine the cell at idx 2 gets replaced with a new payload that causes it to overflow. + // See handling of InteriorNodeReplacement in btree.rs. + // + // In this case the rightmost divider is going to be 3 (2 is the middle one and we pick neighbors 1-3). + // drop_cell(): [0,1,2,3,4] -> [0,1,3,4] <-- cells on right side get shifted left! + // insert_into_cell(): [0,1,3,4] -> [0,1,3,4] + overflow cell (2) <-- crucially, no physical shifting happens, overflow cell is stored separately + // + // This means '3' is actually physically located at index '2'. + // So IF the parent has an overflow cell, we need to subtract 1 to get the actual rightmost divider cell idx to physically read from. + // The formula for the actual cell idx is: + // first_cell_divider + sibling_pointer - parent_contents.overflow_cells.len() + // so in the above case: + // actual_cell_idx = 1 + 2 - 1 = 2 + // + // In the case where the last divider cell is the overflow cell, there would be no left-shifting of cells in drop_cell(), + // because they are still positioned correctly (imagine .pop() from a vector). + // However, note that we are always looking for the _rightmost_ child page pointer between the (max 2) dividers, and for any case where the last divider cell is the overflow cell, + // the 'last_sibling_is_right_pointer' condition will also be true (since the overflow cell's left child will be the middle page), so we won't enter this code branch. + // + // Hence: when we enter this branch with overflow_cells.len() == 1, we know that left-shifting has happened and we need to subtract 1. + let actual_cell_idx = + first_cell_divider + sibling_pointer - parent_contents.overflow_cells.len(); + let (start_of_cell, _) = + parent_contents.cell_get_raw_region(actual_cell_idx, self.usable_space()); let buf = parent_contents.as_ptr().as_mut_ptr(); unsafe { buf.add(start_of_cell) } }; @@ -2477,25 +2516,58 @@ impl BTreeCursor { ); } pages_to_balance[i].replace(page); - turso_assert!( - parent_contents.overflow_cells.is_empty(), - "overflow in parent is not yet implented while balancing it" - ); if i == 0 { break; } let next_cell_divider = i + first_cell_divider - 1; - pgno = match parent_contents.cell_get(next_cell_divider, self.usable_space())? { - BTreeCell::TableInteriorCell(TableInteriorCell { - left_child_page, .. - }) - | BTreeCell::IndexInteriorCell(IndexInteriorCell { - left_child_page, .. - }) => left_child_page, - other => { - crate::bail_corrupt_error!("expected interior cell, got {:?}", other) - } - }; + let divider_is_overflow_cell = parent_contents + .overflow_cells + .first() + .is_some_and(|overflow_cell| overflow_cell.index == next_cell_divider); + if divider_is_overflow_cell { + turso_assert!( + matches!(parent_contents.page_type(), PageType::IndexInterior), + "expected index interior page, got {:?}", + parent_contents.page_type() + ); + turso_assert!( + parent_contents.overflow_cells.len() == 1, + "must have a single overflow cell in the parent, as a result of InteriorNodeReplacement" + ); + let overflow_cell = parent_contents.overflow_cells.first().unwrap(); + pgno = u32::from_be_bytes(overflow_cell.payload[0..4].try_into().unwrap()); + } else { + // grep for 'OVERFLOW CELL ADJUSTMENT' for explanation. + // here we only subtract 1 if the divider cell has been shifted left, i.e. the overflow cell was placed to the left + // this cell. + let actual_cell_idx = + if let Some(overflow_cell) = parent_contents.overflow_cells.first() { + if next_cell_divider < overflow_cell.index { + next_cell_divider + } else { + next_cell_divider - 1 + } + } else { + next_cell_divider + }; + pgno = + match parent_contents.cell_get(actual_cell_idx, self.usable_space())? { + BTreeCell::TableInteriorCell(TableInteriorCell { + left_child_page, + .. + }) + | BTreeCell::IndexInteriorCell(IndexInteriorCell { + left_child_page, + .. + }) => left_child_page, + other => { + crate::bail_corrupt_error!( + "expected interior cell, got {:?}", + other + ) + } + }; + } } #[cfg(debug_assertions)] @@ -2547,11 +2619,6 @@ impl BTreeCursor { let parent_contents = parent_page.get_contents(); let parent_is_root = !self.stack.has_parent(); - turso_assert!( - parent_contents.overflow_cells.is_empty(), - "overflow parent not yet implemented" - ); - // 1. Collect cell data from divider cells, and count the total number of cells to be distributed. // The count includes: all cells and overflow cells from the sibling pages, and divider cells from the parent page, // excluding the rightmost divider, which will not be dropped from the parent; instead it will be updated at the end. @@ -2575,10 +2642,33 @@ impl BTreeCursor { } // Since we know we have a left sibling, take the divider that points to left sibling of this page let cell_idx = balance_info.first_divider_cell + i; - let (cell_start, cell_len) = - parent_contents.cell_get_raw_region(cell_idx, self.usable_space()); - let buf = parent_contents.as_ptr(); - let cell_buf = &buf[cell_start..cell_start + cell_len]; + let divider_is_overflow_cell = parent_contents + .overflow_cells + .first() + .is_some_and(|overflow_cell| overflow_cell.index == cell_idx); + let cell_buf = if divider_is_overflow_cell { + turso_assert!( + matches!(parent_contents.page_type(), PageType::IndexInterior), + "expected index interior page, got {:?}", + parent_contents.page_type() + ); + turso_assert!( + parent_contents.overflow_cells.len() == 1, + "must have a single overflow cell in the parent, as a result of InteriorNodeReplacement" + ); + let overflow_cell = parent_contents.overflow_cells.first().unwrap(); + &overflow_cell.payload + } else { + // grep for 'OVERFLOW CELL ADJUSTMENT' for explanation. + // here we can subtract overflow_cells.len() every time, because we are iterating right-to-left, + // so if we are to the left of the overflow cell, it has already been cleared from the parent and overflow_cells.len() is 0. + let actual_cell_idx = cell_idx - parent_contents.overflow_cells.len(); + let (cell_start, cell_len) = parent_contents + .cell_get_raw_region(actual_cell_idx, self.usable_space()); + let buf = parent_contents.as_ptr(); + &buf[cell_start..cell_start + cell_len] + }; + // Count the divider cell itself (which will be dropped from the parent) total_cells_to_redistribute += 1; @@ -2591,12 +2681,24 @@ impl BTreeCursor { // TODO(pere): make this reference and not copy balance_info.divider_cell_payloads[i].replace(cell_buf.to_vec()); - tracing::trace!( - "dropping divider cell from parent cell_idx={} count={}", - cell_idx, - parent_contents.cell_count() - ); - drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?; + if divider_is_overflow_cell { + tracing::debug!( + "clearing overflow cells from parent cell_idx={}", + cell_idx + ); + parent_contents.overflow_cells.clear(); + } else { + // grep for 'OVERFLOW CELL ADJUSTMENT' for explanation. + // here we can subtract overflow_cells.len() every time, because we are iterating right-to-left, + // so if we are to the left of the overflow cell, it has already been cleared from the parent and overflow_cells.len() is 0. + let actual_cell_idx = cell_idx - parent_contents.overflow_cells.len(); + tracing::trace!( + "dropping divider cell from parent cell_idx={} count={}", + actual_cell_idx, + parent_contents.cell_count() + ); + drop_cell(parent_contents, actual_cell_idx, self.usable_space() as u16)?; + } } /* 2. Initialize CellArray with all the cells used for distribution, this includes divider cells if !leaf. */ @@ -4428,7 +4530,17 @@ impl BTreeCursor { // Step 1: Move cursor to the largest key in the left subtree. // The largest key is always in a leaf, and so this traversal may involvegoing multiple pages downwards, // so we store the page we are currently on. - return_if_io!(self.get_prev_record()); // avoid calling prev() because it internally calls restore_context() which may cause unintended behavior. + + // avoid calling prev() because it internally calls restore_context() which may cause unintended behavior. + return_if_io!(self.get_prev_record()); + + // Ensure we keep the parent page at the same position as before the replacement. + self.stack + .node_states + .borrow_mut() + .get_mut(btree_depth) + .expect("parent page should be on the stack") + .cell_idx = cell_idx as i32; let (cell_payload, leaf_cell_idx) = { let leaf_page_ref = self.stack.top(); let leaf_page = leaf_page_ref.get();