diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 8b37dc64b..8380cc2f4 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -698,18 +698,24 @@ impl BTreeCursor { #[instrument(skip(self), level = Level::DEBUG, name = "prev")] fn get_prev_record(&mut self) -> Result> { loop { - let page = self.stack.top(); - let contents = page.get_contents(); - let page_type = contents.page_type(); - let is_index = page.is_index(); + let (old_top_idx, page_type, is_index, is_leaf, cell_count) = { + let page = self.stack.top_ref(); + let contents = page.get_contents(); + ( + self.stack.current(), + contents.page_type(), + page.is_index(), + contents.is_leaf(), + contents.cell_count(), + ) + }; - let cell_count = contents.cell_count(); let cell_idx = self.stack.current_cell_index(); // If we are at the end of the page and we haven't just come back from the right child, // we now need to move to the rightmost child. - if self.stack.current_cell_index() == i32::MAX && !self.going_upwards { - let rightmost_pointer = contents.rightmost_pointer(); + if cell_idx == i32::MAX && !self.going_upwards { + let rightmost_pointer = self.stack.top_ref().get_contents().rightmost_pointer(); if let Some(rightmost_pointer) = rightmost_pointer { let past_rightmost_pointer = cell_count as i32 + 1; self.stack.set_cell_index(past_rightmost_pointer); @@ -756,7 +762,7 @@ impl BTreeCursor { // continue to next loop to get record from the new page continue; } - if contents.is_leaf() { + if is_leaf { return Ok(IOResult::Done(true)); } @@ -768,7 +774,11 @@ impl BTreeCursor { } let cell_idx = self.stack.current_cell_index() as usize; - let left_child_page = contents.cell_interior_read_left_child_page(cell_idx); + let left_child_page = self + .stack + .get_page_contents_at_level(old_top_idx) + .unwrap() + .cell_interior_read_left_child_page(cell_idx); if page_type == PageType::IndexInterior { // In backwards iteration, if we haven't just moved to this interior node from the @@ -932,7 +942,7 @@ impl BTreeCursor { return self.continue_payload_overflow_with_offset(buffer, self.usable_space()); } - let page = self.stack.top(); + let page = self.stack.top_ref(); let contents = page.get_contents(); let cell_idx = self.stack.current_cell_index() as usize - 1; @@ -1339,7 +1349,7 @@ impl BTreeCursor { MoveToRightState::Start => { if let Some(rightmost_page_id) = rightmost_page_id { // If we know the rightmost page and are already on it, we can skip a seek. - let current_page = self.stack.top(); + let current_page = self.stack.top_ref(); if current_page.get().id == *rightmost_page_id { let contents = current_page.get_contents(); let cell_count = contents.cell_count(); @@ -1355,7 +1365,7 @@ impl BTreeCursor { } } MoveToRightState::ProcessPage => { - let mem_page = self.stack.top(); + let mem_page = self.stack.top_ref(); let page_idx = mem_page.get().id; let contents = mem_page.get_contents(); if contents.is_leaf() { @@ -1390,16 +1400,23 @@ impl BTreeCursor { #[instrument(skip(self), level = Level::DEBUG)] fn tablebtree_move_to(&mut self, rowid: i64, seek_op: SeekOp) -> Result> { loop { - let page = self.stack.top(); - let contents = page.get_contents(); - if contents.is_leaf() { + let (old_top_idx, is_leaf, cell_count) = { + let page = self.stack.top_ref(); + let contents = page.get_contents(); + ( + self.stack.current(), + contents.is_leaf(), + contents.cell_count(), + ) + }; + + if is_leaf { self.seek_state = CursorSeekState::FoundLeaf { eq_seen: Cell::new(false), }; return Ok(IOResult::Done(())); } - let cell_count = contents.cell_count(); if matches!( self.seek_state, CursorSeekState::Start | CursorSeekState::MovingBetweenPages { .. } @@ -1435,8 +1452,11 @@ impl BTreeCursor { let max = max_cell_idx.get(); if min > max { if let Some(nearest_matching_cell) = nearest_matching_cell.get() { - let left_child_page = - contents.cell_interior_read_left_child_page(nearest_matching_cell); + let left_child_page = self + .stack + .get_page_contents_at_level(old_top_idx) + .unwrap() + .cell_interior_read_left_child_page(nearest_matching_cell); self.stack.set_cell_index(nearest_matching_cell as i32); let (mem_page, c) = self.read_page(left_child_page as usize)?; self.stack.push(mem_page); @@ -1449,7 +1469,12 @@ impl BTreeCursor { continue; } self.stack.set_cell_index(cell_count as i32 + 1); - match contents.rightmost_pointer() { + match self + .stack + .get_page_contents_at_level(old_top_idx) + .unwrap() + .rightmost_pointer() + { Some(right_most_pointer) => { let (mem_page, c) = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); @@ -1467,7 +1492,11 @@ impl BTreeCursor { } } let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here. - let cell_rowid = contents.cell_table_interior_read_rowid(cur_cell_idx as usize)?; + let cell_rowid = self + .stack + .get_page_contents_at_level(old_top_idx) + .unwrap() + .cell_table_interior_read_rowid(cur_cell_idx as usize)?; // in sqlite btrees left child pages have <= keys. // table btrees can have a duplicate rowid in the interior cell, so for example if we are looking for rowid=10, // and we find an interior cell with rowid=10, we need to move to the left page since (due to the <= rule of sqlite btrees) @@ -1527,9 +1556,17 @@ impl BTreeCursor { let tie_breaker = get_tie_breaker_from_seek_op(cmp); loop { - let page = self.stack.top(); - let contents = page.get_contents(); - if contents.is_leaf() { + let (old_top_idx, is_leaf, cell_count) = { + let page = self.stack.top_ref(); + let contents = page.get_contents(); + ( + self.stack.current(), + contents.is_leaf(), + contents.cell_count(), + ) + }; + + if is_leaf { let eq_seen = match &self.seek_state { CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(), _ => false, @@ -1548,7 +1585,6 @@ impl BTreeCursor { CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(), _ => false, }; - let cell_count = contents.cell_count(); let min_cell_idx = Cell::new(0); let max_cell_idx = Cell::new(cell_count as isize - 1); let nearest_matching_cell = Cell::new(None); @@ -1578,8 +1614,13 @@ impl BTreeCursor { let max = max_cell_idx.get(); if min > max { let Some(leftmost_matching_cell) = nearest_matching_cell.get() else { - self.stack.set_cell_index(contents.cell_count() as i32 + 1); - match contents.rightmost_pointer() { + self.stack.set_cell_index(cell_count as i32 + 1); + match self + .stack + .get_page_contents_at_level(old_top_idx) + .unwrap() + .rightmost_pointer() + { Some(right_most_pointer) => { let (mem_page, c) = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); @@ -1596,8 +1637,11 @@ impl BTreeCursor { } } }; - let matching_cell = - contents.cell_get(leftmost_matching_cell, self.usable_space())?; + let matching_cell = self + .stack + .get_page_contents_at_level(old_top_idx) + .unwrap() + .cell_get(leftmost_matching_cell, self.usable_space())?; self.stack.set_cell_index(leftmost_matching_cell as i32); // we don't advance in case of forward iteration and index tree internal nodes because we will visit this node going up. // in backwards iteration, we must retreat because otherwise we would unnecessarily visit this node again. @@ -1615,12 +1659,15 @@ impl BTreeCursor { unreachable!("unexpected cell type: {:?}", matching_cell); }; - turso_assert!( - page.get().id != *left_child_page as usize, - "corrupt: current page and left child page of cell {} are both {}", - leftmost_matching_cell, - page.get().id - ); + { + let page = self.stack.get_page_at_level(old_top_idx).unwrap(); + turso_assert!( + page.get().id != *left_child_page as usize, + "corrupt: current page and left child page of cell {} are both {}", + leftmost_matching_cell, + page.get().id + ); + } let (mem_page, c) = self.read_page(*left_child_page as usize)?; self.stack.push(mem_page); @@ -1635,7 +1682,11 @@ impl BTreeCursor { let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here. self.stack.set_cell_index(cur_cell_idx as i32); - let cell = contents.cell_get(cur_cell_idx as usize, self.usable_space())?; + let cell = self + .stack + .get_page_contents_at_level(old_top_idx) + .unwrap() + .cell_get(cur_cell_idx as usize, self.usable_space())?; let BTreeCell::IndexInteriorCell(IndexInteriorCell { payload, payload_size, @@ -1747,7 +1798,7 @@ impl BTreeCursor { ) { // No need for another move_to_root. Move_to already moves to root return_if_io!(self.move_to(SeekKey::TableRowId(rowid), seek_op)); - let page = self.stack.top(); + let page = self.stack.top_ref(); let contents = page.get_contents(); turso_assert!( contents.is_leaf(), @@ -1789,7 +1840,7 @@ impl BTreeCursor { unreachable!("we must be in a leaf binary search state"); }; - let page = self.stack.top(); + let page = self.stack.top_ref(); let contents = page.get_contents(); loop { @@ -1910,7 +1961,7 @@ impl BTreeCursor { ); }; let eq_seen = eq_seen.get(); - let page = self.stack.top(); + let page = self.stack.top_ref(); let contents = page.get_contents(); let cell_count = contents.cell_count(); @@ -1951,8 +2002,7 @@ impl BTreeCursor { ); }; - let page = self.stack.top(); - let contents = page.get_contents(); + let old_top_idx = self.stack.current(); let iter_dir = seek_op.iteration_direction(); @@ -1968,7 +2018,13 @@ impl BTreeCursor { // set cursor to the position where which would hold the op-boundary if it were present let target_cell = target_cell_when_not_found.get(); self.stack.set_cell_index(target_cell); - let has_record = target_cell >= 0 && target_cell < contents.cell_count() as i32; + let has_record = target_cell >= 0 + && target_cell + < self + .stack + .get_page_contents_at_level(old_top_idx) + .unwrap() + .cell_count() as i32; self.has_record.set(has_record); // Similar logic as in tablebtree_seek(), but for indexes. @@ -1984,7 +2040,11 @@ impl BTreeCursor { let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here. self.stack.set_cell_index(cur_cell_idx as i32); - let cell = contents.cell_get(cur_cell_idx as usize, self.usable_space())?; + let cell = self + .stack + .get_page_contents_at_level(old_top_idx) + .unwrap() + .cell_get(cur_cell_idx as usize, self.usable_space())?; let BTreeCell::IndexLeafCell(IndexLeafCell { payload, first_overflow_page, @@ -2183,7 +2243,7 @@ impl BTreeCursor { tracing::debug!("TableLeafCell: found exact match with cell_idx={cell_idx}, overwriting"); self.has_record.set(true); *write_state = WriteState::Overwrite { - page: page.clone(), + page: page, cell_idx, state: Some(OverwriteCellState::AllocatePayload), }; @@ -2207,7 +2267,7 @@ impl BTreeCursor { panic!("expected write state"); }; *write_state = WriteState::Overwrite { - page: page.clone(), + page: page, cell_idx, state: Some(OverwriteCellState::AllocatePayload), }; @@ -2227,7 +2287,7 @@ impl BTreeCursor { panic!("expected write state"); }; *write_state = WriteState::Insert { - page: page.clone(), + page: page, cell_idx, new_payload: Vec::with_capacity(record_values.len() + 4), fill_cell_payload_state: FillCellPayloadState::Start, @@ -2241,7 +2301,7 @@ impl BTreeCursor { ref mut fill_cell_payload_state, } => { return_if_io!(fill_cell_payload( - page.clone(), + page, bkey.maybe_rowid(), new_payload, *cell_idx, @@ -2287,13 +2347,13 @@ impl BTreeCursor { let mut state = state.take().expect("state should be present"); let cell_idx = *cell_idx; if let IOResult::IO(io) = - self.overwrite_cell(page.clone(), cell_idx, record, &mut state)? + self.overwrite_cell(&page, cell_idx, record, &mut state)? { let CursorState::Write(write_state) = &mut self.state else { panic!("expected write state"); }; *write_state = WriteState::Overwrite { - page, + page: page, cell_idx, state: Some(state), }; @@ -2364,7 +2424,7 @@ impl BTreeCursor { balance_info.borrow().is_none(), "BalanceInfo should be empty on start" ); - let current_page = self.stack.top(); + let current_page = self.stack.top_ref(); let next_balance_depth = balance_ancestor_at_depth.unwrap_or(self.stack.current()); { @@ -2430,15 +2490,23 @@ impl BTreeCursor { BalanceSubState::NonRootPickSiblings => { // Since we are going to change the btree structure, let's forget our cached knowledge of the rightmost page. let _ = self.move_to_right_state.1.take(); - let parent_page = self.stack.top(); - let parent_contents = parent_page.get_contents(); - let page_type = parent_contents.page_type(); + + let (parent_page_idx, page_type, cell_count, over_cell_count) = { + let parent_page = self.stack.top_ref(); + let parent_contents = parent_page.get_contents(); + ( + self.stack.current(), + parent_contents.page_type(), + parent_contents.cell_count(), + parent_contents.overflow_cells.len(), + ) + }; + turso_assert!( matches!(page_type, PageType::IndexInterior | PageType::TableInterior), "expected index or table interior page" ); - let number_of_cells_in_parent = - parent_contents.cell_count() + parent_contents.overflow_cells.len(); + let number_of_cells_in_parent = cell_count + over_cell_count; // If `seek` moved to rightmost page, cell index will be out of bounds. Meaning cell_count+1. // In any other case, `seek` will stay in the correct index. @@ -2446,7 +2514,11 @@ impl BTreeCursor { self.stack.current_cell_index() as usize == number_of_cells_in_parent + 1; if past_rightmost_pointer { self.stack.retreat(); - } else if !parent_contents.overflow_cells.is_empty() { + } + + let parent_page = self.stack.get_page_at_level(parent_page_idx).unwrap(); + let parent_contents = parent_page.get_contents(); + if !past_rightmost_pointer && over_cell_count > 0 { // The ONLY way we can have an overflow cell in the parent is if we replaced an interior cell from a cell in the child, and that replacement did not fit. // This can only happen on index btrees. if matches!(page_type, PageType::IndexInterior) { @@ -2665,7 +2737,7 @@ impl BTreeCursor { } } // Start balancing. - let parent_page = self.stack.top(); + let parent_page = self.stack.top_ref(); let parent_contents = parent_page.get_contents(); let parent_is_root = !self.stack.has_parent(); @@ -3498,14 +3570,13 @@ impl BTreeCursor { ..first_child_contents.offset + header_and_pointer_size], ); - self.stack.set_cell_index(0); // reset cell index, top is already parent sibling_count_new -= 1; // decrease sibling count for debugging and free at the end assert!(sibling_count_new < balance_info.sibling_count); } #[cfg(debug_assertions)] BTreeCursor::post_balance_non_root_validation( - &parent_page, + parent_page, balance_info, parent_contents, pages_to_balance_new, @@ -3516,6 +3587,12 @@ impl BTreeCursor { right_page_id, usable_space, ); + + // Balance-shallower case + if sibling_count_new == 0 { + self.stack.set_cell_index(0); // reset cell index, top is already parent + } + *sub_state = BalanceSubState::FreePages { curr_page: sibling_count_new, sibling_count_new, @@ -4004,14 +4081,9 @@ impl BTreeCursor { // Since we are going to change the btree structure, let's forget our cached knowledge of the rightmost page. let _ = self.move_to_right_state.1.take(); - let is_page_1 = { - let current_root = self.stack.top(); - current_root.get().id == 1 - }; - - let offset = if is_page_1 { DatabaseHeader::SIZE } else { 0 }; - let root = self.stack.top(); + let is_page_1 = root.get().id == 1; + let offset = if is_page_1 { DatabaseHeader::SIZE } else { 0 }; let root_contents = root.get_contents(); // FIXME: handle page cache is full // FIXME: remove sync IO hack @@ -4075,7 +4147,7 @@ impl BTreeCursor { root_contents.overflow_cells.clear(); self.root_page = root.get().id; self.stack.clear(); - self.stack.push(root.clone()); + self.stack.push(root); self.stack.set_cell_index(0); // leave parent pointing at the rightmost pointer (in this case 0, as there are no cells), since we will be balancing the rightmost child page. self.stack.push(child.clone()); Ok(IOResult::Done(())) @@ -4100,7 +4172,7 @@ impl BTreeCursor { } } SeekEndState::ProcessPage => { - let mem_page = self.stack.top(); + let mem_page = self.stack.top_ref(); let contents = mem_page.get_contents(); if contents.is_leaf() { // set cursor just past the last cell to append @@ -4262,7 +4334,7 @@ impl BTreeCursor { return Ok(IOResult::Done(None)); } if self.has_record.get() { - let page = self.stack.top(); + let page = self.stack.top_ref(); let contents = page.get_contents(); let page_type = contents.page_type(); if page_type.is_table() { @@ -4335,7 +4407,7 @@ impl BTreeCursor { return Ok(IOResult::Done(Some(record_ref))); } - let page = self.stack.top(); + let page = self.stack.top_ref(); let contents = page.get_contents(); let cell_idx = self.stack.current_cell_index(); let cell = contents.cell_get(cell_idx as usize, self.usable_space())?; @@ -4440,8 +4512,8 @@ impl BTreeCursor { match delete_state { DeleteState::Start => { - let page = self.stack.top(); - self.pager.add_dirty(&page); + let page = self.stack.top_ref(); + self.pager.add_dirty(page); if matches!( page.get_contents().page_type(), PageType::TableLeaf | PageType::TableInterior @@ -4462,7 +4534,7 @@ impl BTreeCursor { // FIXME: skip this work if we determine deletion wont result in balancing // Right now we calculate the key every time for simplicity/debugging // since it won't affect correctness which is more important - let page = self.stack.top(); + let page = self.stack.top_ref(); let target_key = if page.is_index() { let record = match return_if_io!(self.record()) { Some(record) => record.clone(), @@ -4498,7 +4570,7 @@ impl BTreeCursor { DeleteState::FindCell { post_balancing_seek_key, } => { - let page = self.stack.top(); + let page = self.stack.top_ref(); let cell_idx = self.stack.current_cell_index() as usize; let contents = page.get_contents(); if cell_idx >= contents.cell_count() { @@ -4545,7 +4617,7 @@ impl BTreeCursor { unreachable!("expected clear overflow pages state"); }; - let page = self.stack.top(); + let page = self.stack.top_ref(); let contents = page.get_contents(); if !contents.is_leaf() { @@ -4598,7 +4670,7 @@ impl BTreeCursor { .expect("parent page should be on the stack") .cell_idx = cell_idx as i32; let (cell_payload, leaf_cell_idx) = { - let leaf_page = self.stack.top(); + let leaf_page = self.stack.top_ref(); let leaf_contents = leaf_page.get_contents(); assert!(leaf_contents.is_leaf()); assert!(leaf_contents.cell_count() > 0); @@ -4635,10 +4707,10 @@ impl BTreeCursor { (cell_payload, leaf_cell_idx) }; - let leaf_page = self.stack.top(); + let leaf_page = self.stack.top_ref(); self.pager.add_dirty(page); - self.pager.add_dirty(&leaf_page); + self.pager.add_dirty(leaf_page); // Step 2: Replace the cell in the parent (interior) page. { @@ -4673,7 +4745,7 @@ impl BTreeCursor { } DeleteState::CheckNeedsBalancing { btree_depth, .. } => { - let page = self.stack.top(); + let page = self.stack.top_ref(); // Check if either the leaf page we took the replacement cell from underflows, or if the interior page we inserted it into overflows OR underflows. // If the latter is true, we must always balance that level regardless of whether the leaf page (or any ancestor pages in between) need balancing. @@ -4910,7 +4982,7 @@ impl BTreeCursor { destroy_info.state = DestroyState::LoadPage; } DestroyState::LoadPage => { - let _page = self.stack.top(); + let _page = self.stack.top_ref(); let destroy_info = self .state @@ -4919,9 +4991,8 @@ impl BTreeCursor { destroy_info.state = DestroyState::ProcessPage; } DestroyState::ProcessPage => { - let page = self.stack.top(); - assert!(page.is_loaded()); // page should be loaded at this time self.stack.advance(); + let page = self.stack.top_ref(); let contents = page.get_contents(); let cell_idx = self.stack.current_cell_index(); @@ -5070,7 +5141,7 @@ impl BTreeCursor { pub fn overwrite_cell( &mut self, - page: PageRef, + page: &PageRef, cell_idx: usize, record: &ImmutableRecord, state: &mut OverwriteCellState, @@ -5096,7 +5167,7 @@ impl BTreeCursor { } => { { return_if_io!(fill_cell_payload( - page.clone(), + page, *rowid, new_payload, cell_idx, @@ -5201,8 +5272,8 @@ impl BTreeCursor { } } CountState::Loop => { - mem_page = self.stack.top(); - turso_assert!(mem_page.is_loaded(), "page should be loaded"); + self.stack.advance(); + mem_page = self.stack.top_ref(); contents = mem_page.get_contents(); /* If this is a leaf page or the tree is not an int-key tree, then @@ -5213,7 +5284,6 @@ impl BTreeCursor { self.count += contents.cell_count(); } - self.stack.advance(); let cell_idx = self.stack.current_cell_index() as usize; // Second condition is necessary in case we return if the page is locked in the loop below @@ -5232,7 +5302,7 @@ impl BTreeCursor { // Move to parent self.stack.pop(); - mem_page = self.stack.top(); + mem_page = self.stack.top_ref(); turso_assert!(mem_page.is_loaded(), "page should be loaded"); contents = mem_page.get_contents(); @@ -6064,14 +6134,19 @@ impl PageStack { } /// Get a page at a specific level in the stack (0 = root, 1 = first child, etc.) - fn get_page_at_level(&self, level: usize) -> Option { + fn get_page_at_level(&self, level: usize) -> Option<&PageRef> { if level < self.stack.len() { - self.stack[level].clone() + self.stack[level].as_ref() } else { None } } + fn get_page_contents_at_level(&self, level: usize) -> Option<&mut PageContent> { + self.get_page_at_level(level) + .map(|page| page.get_contents()) + } + fn unpin_all_if_pinned(&mut self) { self.stack.iter_mut().flatten().for_each(|page| { let _ = page.try_unpin(); @@ -7133,7 +7208,7 @@ pub enum CopyDataState { /// may require I/O. #[allow(clippy::too_many_arguments)] fn fill_cell_payload( - page: PageRef, + page: &PageRef, int_key: Option, cell_payload: &mut Vec, cell_idx: usize, @@ -7445,7 +7520,7 @@ mod tests { run_until_done( || { fill_cell_payload( - page.clone(), + &page, Some(id as i64), &mut payload, pos, @@ -9023,7 +9098,7 @@ mod tests { run_until_done( || { fill_cell_payload( - page.clone(), + &page, Some(i as i64), &mut payload, cell_idx, @@ -9105,7 +9180,7 @@ mod tests { run_until_done( || { fill_cell_payload( - page.clone(), + &page, Some(i), &mut payload, cell_idx, @@ -9478,7 +9553,7 @@ mod tests { run_until_done( || { fill_cell_payload( - page.clone(), + &page, Some(0), &mut payload, 0, @@ -9564,7 +9639,7 @@ mod tests { run_until_done( || { fill_cell_payload( - page.clone(), + &page, Some(0), &mut payload, 0, @@ -9973,7 +10048,7 @@ mod tests { run_until_done( || { fill_cell_payload( - page.clone(), + &page, Some(cell_idx as i64), &mut payload, cell_idx as usize,