From 15ed7642c9ff2629549145ecdd5a35002c9ebc98 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 4 Apr 2025 15:51:52 +0200 Subject: [PATCH 1/9] check all keys are present on every insert with fuzz test Let's make sure every insert does still contain all keys. Previously we did this at the end but it made it hard to debug issues that `validate_btree` might not encounter. --- core/storage/btree.rs | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 789f9cb39..3328421f4 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2,7 +2,8 @@ use tracing::debug; use crate::storage::pager::Pager; use crate::storage::sqlite3_ondisk::{ - read_u32, read_varint, BTreeCell, PageContent, PageType, TableInteriorCell, TableLeafCell, + read_btree_cell, read_u32, read_varint, BTreeCell, PageContent, PageType, TableInteriorCell, + TableLeafCell, }; use crate::MvCursor; @@ -607,7 +608,7 @@ impl BTreeCursor { } None => { if has_parent { - debug!("moving simple upwards"); + tracing::trace!("moving simple upwards"); self.going_upwards = true; self.stack.pop(); continue; @@ -955,7 +956,7 @@ impl BTreeCursor { pub fn move_to(&mut self, key: SeekKey<'_>, cmp: SeekOp) -> Result> { assert!(self.mv_cursor.is_none()); - tracing::debug!("move_to(key={:?} cmp={:?})", key, cmp); + tracing::trace!("move_to(key={:?} cmp={:?})", key, cmp); // For a table with N rows, we can find any row by row id in O(log(N)) time by starting at the root page and following the B-tree pointers. // B-trees consist of interior pages and leaf pages. Interior pages contain pointers to other pages, while leaf pages contain the actual row data. // @@ -4288,11 +4289,24 @@ mod tests { let value = ImmutableRecord::from_registers(&[Register::OwnedValue( OwnedValue::Blob(vec![0; size]), )]); + let btree_before = format_btree(pager.clone(), root_page, 0); run_until_done( || cursor.insert(&BTreeKey::new_table_rowid(key as u64, Some(&value)), true), pager.deref(), ) .unwrap(); + keys.sort(); + cursor.move_to_root(); + for key in keys.iter() { + tracing::trace!("seeking key: {}", key); + run_until_done(|| cursor.next(), pager.deref()).unwrap(); + let cursor_rowid = cursor.rowid().unwrap().unwrap(); + assert_eq!( + *key as u64, cursor_rowid, + "key {} is not found, got {}", + key, cursor_rowid + ); + } if matches!(validate_btree(pager.clone(), root_page), (_, false)) { panic!("invalid btree"); } @@ -4304,13 +4318,18 @@ mod tests { if matches!(validate_btree(pager.clone(), root_page), (_, false)) { panic!("invalid btree"); } + keys.sort(); + cursor.move_to_root(); for key in keys.iter() { let seek_key = SeekKey::TableRowId(*key as u64); - tracing::debug!("seeking key: {}", key); - let found = - run_until_done(|| cursor.seek(seek_key.clone(), SeekOp::EQ), pager.deref()) - .unwrap(); - assert!(found, "key {} is not found", key); + tracing::trace!("seeking key: {}", key); + run_until_done(|| cursor.next(), pager.deref()).unwrap(); + let cursor_rowid = cursor.rowid().unwrap().unwrap(); + assert_eq!( + *key as u64, cursor_rowid, + "key {} is not found, got {}", + key, cursor_rowid + ); } } } From f4920cb96b844c3b60b271f179efcbc71b604696 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 4 Apr 2025 15:52:17 +0200 Subject: [PATCH 2/9] assert new divider cell points to the correct place --- core/storage/btree.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 3328421f4..c48f26e53 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1833,6 +1833,15 @@ impl BTreeCursor { // Leaf index new_divider_cell.extend_from_slice(divider_cell); } + + let left_pointer = read_u32(&new_divider_cell[..4], 0); + assert_eq!(left_pointer, page.get().id as u32); + // FIXME: remove this lock + assert!( + left_pointer <= self.pager.db_header.lock().database_size, + "invalid page number divider left pointer {} > database number of pages", + left_pointer, + ); // FIXME: defragment shouldn't be needed defragment_page(parent_contents, self.usable_space() as u16); insert_into_cell( From 9eb9e7021e57ca6412c465db197e132c61bf7ec3 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 4 Apr 2025 15:52:35 +0200 Subject: [PATCH 3/9] Fix index table new divider cell pointer --- core/storage/btree.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index c48f26e53..80e1c522e 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1817,8 +1817,15 @@ impl BTreeCursor { page.get_contents() .write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, previous_pointer_divider); // divider cell now points to this page - divider_cell[0..4].copy_from_slice(&(page.get().id as u32).to_be_bytes()); - new_divider_cell.extend_from_slice(divider_cell); + new_divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes()); + // now copy the rest of the divider cell: + // Table Interior page: + // * varint rowid + // Index Interior page: + // * varint payload size + // * payload + // * first overflow page (u32 optional) + new_divider_cell.extend_from_slice(÷r_cell[4..]); } else if leaf_data { // Leaf table // FIXME: not needed conversion From ff8ec5455cf2191b4bf26446aeeddc4c5b0604db Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sat, 5 Apr 2025 23:45:18 +0200 Subject: [PATCH 4/9] fix divider cell selection --- core/storage/btree.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 80e1c522e..bc6889082 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1479,12 +1479,14 @@ impl BTreeCursor { self.pager.add_dirty(sibling_page.get().id); max_cells += sibling_contents.cell_count(); max_cells += sibling_contents.overflow_cells.len(); - if i == 0 { - // we don't have left sibling from this one so we break - break; + + // Right pointer is not dropped, we simply update it at the end. This could be a divider cell that points + // to the last page in the list of pages to balance or this could be the rightmost pointer that points to a page. + if i == balance_info.sibling_count - 1 { + continue; } // Since we know we have a left sibling, take the divider that points to left sibling of this page - let cell_idx = balance_info.first_divider_cell + i - 1; + let cell_idx = balance_info.first_divider_cell + i; let (cell_start, cell_len) = parent_contents.cell_get_raw_region( cell_idx, payload_overflow_threshold_max( @@ -1781,6 +1783,9 @@ impl BTreeCursor { } // Write right pointer in parent page to point to new rightmost page + // Write right pointer in parent page to point to new rightmost page. keep in mind + // we update rightmost pointer first because inserting cells could defragment parent page, + // therfore invalidating the pointer. let right_page_id = pages_to_balance_new.last().unwrap().get().id as u32; let rightmost_pointer = balance_info.rightmost_pointer; let rightmost_pointer = @@ -1850,7 +1855,7 @@ impl BTreeCursor { left_pointer, ); // FIXME: defragment shouldn't be needed - defragment_page(parent_contents, self.usable_space() as u16); + // defragment_page(parent_contents, self.usable_space() as u16); insert_into_cell( parent_contents, &new_divider_cell, @@ -1875,6 +1880,7 @@ impl BTreeCursor { (0, 0, cell_array.cell_count(0)) } else { let this_was_old_page = page_idx < balance_info.sibling_count; + // We add !leaf_data because we want to skip 1 in case of divider cell which is encountared between pages assigned let start_old_cells = if this_was_old_page { count_cells_in_old_pages[page_idx - 1] as usize + (!leaf_data) as usize From 0541da46df97f7c75832926e8a710bb6c6e4dbeb Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sat, 5 Apr 2025 23:45:35 +0200 Subject: [PATCH 5/9] add strict btree validation after non root balancing in debug mode --- core/storage/btree.rs | 405 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 386 insertions(+), 19 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index bc6889082..2148b66b4 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -14,6 +14,7 @@ use crate::{return_corrupt, LimboError, Result}; use std::cell::{Cell, Ref, RefCell}; use std::cmp::Ordering; +use std::collections::HashSet; use std::pin::Pin; use std::rc::Rc; @@ -1503,6 +1504,13 @@ impl BTreeCursor { let cell_buf = &buf[cell_start..cell_start + cell_len]; max_cells += 1; + tracing::debug!( + "balance_non_root(drop_divider_cell, first_divider_cell={}, divider_cell={}, left_pointer={})", + balance_info.first_divider_cell, + i, + read_u32(cell_buf, 0) + ); + // TODO(pere): make this reference and not copy balance_info.divider_cells.push(cell_buf.to_vec()); tracing::trace!( @@ -1589,6 +1597,17 @@ impl BTreeCursor { cells_capacity_start, "calculation of max cells was wrong" ); + + // Let's copy all cells for later checks + #[cfg(debug_assertions)] + let mut cells_debug = Vec::new(); + #[cfg(debug_assertions)] + { + for cell in &cell_array.cells { + cells_debug.push(cell.to_vec()); + } + } + #[cfg(debug_assertions)] { for cell in &cell_array.cells { @@ -1602,7 +1621,7 @@ impl BTreeCursor { // calculate how many pages to allocate let mut new_page_sizes = Vec::new(); let leaf_correction = if leaf { 4 } else { 0 }; - // number of bytes beyond header, different from global usableSapce which inccludes + // number of bytes beyond header, different from global usableSapce which includes // header let usable_space = self.usable_space() - 12 + leaf_correction; for i in 0..balance_info.sibling_count { @@ -1782,7 +1801,19 @@ impl BTreeCursor { } } - // Write right pointer in parent page to point to new rightmost page + #[cfg(debug_assertions)] + { + tracing::debug!("balance_non_root(parent page_id={})", parent_page.get().id); + for page in &pages_to_balance_new { + tracing::debug!("balance_non_root(new_sibling page_id={})", page.get().id); + } + } + + // pages_pointed_to helps us debug we did in fact create divider cells to all the new pages and the rightmost pointer, + // also points to the last page. + #[cfg(debug_assertions)] + let mut pages_pointed_to = HashSet::new(); + // Write right pointer in parent page to point to new rightmost page. keep in mind // we update rightmost pointer first because inserting cells could defragment parent page, // therfore invalidating the pointer. @@ -1792,6 +1823,13 @@ impl BTreeCursor { unsafe { std::slice::from_raw_parts_mut(rightmost_pointer, 4) }; rightmost_pointer[0..4].copy_from_slice(&right_page_id.to_be_bytes()); + #[cfg(debug_assertions)] + pages_pointed_to.insert(right_page_id); + tracing::debug!( + "balance_non_root(rightmost_pointer_update, rightmost_pointer={})", + right_page_id + ); + // Ensure right-child pointer of the right-most new sibling pge points to the page // that was originally on that place. let is_leaf_page = matches!(page_type, PageType::TableLeaf | PageType::IndexLeaf); @@ -1847,6 +1885,14 @@ impl BTreeCursor { } let left_pointer = read_u32(&new_divider_cell[..4], 0); + #[cfg(debug_assertions)] + pages_pointed_to.insert(left_pointer); + tracing::debug!( + "balance_non_root(insert_divider_cell, first_divider_cell={}, divider_cell={}, left_pointer={})", + balance_info.first_divider_cell, + i, + left_pointer + ); assert_eq!(left_pointer, page.get().id as u32); // FIXME: remove this lock assert!( @@ -1863,6 +1909,57 @@ impl BTreeCursor { self.usable_space() as u16, ) .unwrap(); + #[cfg(debug_assertions)] + { + let left_pointer = if parent_contents.overflow_cells.len() == 0 { + let (cell_start, cell_len) = parent_contents.cell_get_raw_region( + balance_info.first_divider_cell + i, + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + ); + tracing::debug!( + "balance_non_root(cell_start={}, cell_len={})", + cell_start, + cell_len + ); + + let left_pointer = read_u32( + &parent_contents.as_ptr()[cell_start..cell_start + cell_len], + 0, + ); + left_pointer + } else { + let cell = &parent_contents.overflow_cells[0]; + assert_eq!(cell.index, balance_info.first_divider_cell + i); + read_u32(&cell.payload, 0) + }; + assert_eq!(left_pointer, page.get().id as u32, "the cell we just inserted doesn't point to the correct page. points to {}, should point to {}", + left_pointer, + page.get().id as u32 + ); + } + } + tracing::debug!( + "balance_non_root(parent_overflow={})", + parent_contents.overflow_cells.len() + ); + + #[cfg(debug_assertions)] + { + for page in &pages_to_balance_new { + assert!( + pages_pointed_to.contains(&(page.get().id as u32)), + "page {} not pointed to by divider cell or rightmost pointer", + page.get().id + ); + } } // TODO: update pages let mut done = vec![false; sibling_count_new]; @@ -1917,6 +2014,18 @@ impl BTreeCursor { done[page_idx] = true; } } + + #[cfg(debug_assertions)] + self.post_balance_non_root_validation( + balance_info, + parent_contents, + pages_to_balance_new, + page_type, + leaf_data, + cells_debug, + sibling_count_new, + rightmost_pointer, + ); // TODO: balance root // TODO: free pages (WriteState::BalanceStart, Ok(CursorResult::Ok(()))) @@ -1932,6 +2041,240 @@ impl BTreeCursor { result } + fn post_balance_non_root_validation( + &self, + balance_info: &mut BalanceInfo, + parent_contents: &mut PageContent, + pages_to_balance_new: Vec>, + page_type: PageType, + leaf_data: bool, + mut cells_debug: Vec>, + sibling_count_new: usize, + rightmost_pointer: &mut [u8], + ) { + let mut valid = true; + let mut current_index_cell = 0; + // Let's now make a in depth check that we in fact added all possible cells somewhere and they are not lost + for (page_idx, page) in pages_to_balance_new.iter().enumerate() { + let contents = page.get_contents(); + // Cells are distributed in order + for cell_idx in 0..contents.cell_count() { + let (cell_start, cell_len) = contents.cell_get_raw_region( + cell_idx, + payload_overflow_threshold_max( + contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + ); + let buf = contents.as_ptr(); + let cell_buf = &buf[cell_start..cell_start + cell_len]; + let cell_buf_in_array = &cells_debug[current_index_cell]; + if cell_buf != cell_buf_in_array { + tracing::error!("balance_non_root(cell_not_found_debug, page_id={}, cell_in_cell_array_idx={})", + page.get().id, + current_index_cell, + ); + valid = false; + } + current_index_cell += 1; + } + // Now check divider cells and their pointers. + let parent_buf = parent_contents.as_ptr(); + let cell_divider_idx = balance_info.first_divider_cell + page_idx; + if page_idx == sibling_count_new - 1 { + // We will only validate rightmost pointer of parent page, we will not validate rightmost if it's a cell and not the last pointer because, + // insert cell could've defragmented the page and invalidated the pointer. + // right pointer, we just check right pointer points to this page. + if cell_divider_idx == parent_contents.cell_count() { + let rightmost = read_u32(rightmost_pointer, 0); + if rightmost != page.get().id as u32 { + tracing::error!("balance_non_root(cell_divider_right_pointer, should point to {}, but points to {})", + page.get().id, + rightmost + ); + valid = false; + } + } + } else { + // divider cell might be an overflow cell + let mut was_overflow = false; + for overflow_cell in &parent_contents.overflow_cells { + if overflow_cell.index == cell_divider_idx { + let left_pointer = read_u32(&overflow_cell.payload, 0); + if left_pointer != page.get().id as u32 { + tracing::error!("balance_non_root(cell_divider_left_pointer_overflow, should point to page_id={}, but points to {}, divider_cell={}, overflow_cells_parent={})", + page.get().id, + left_pointer, + page_idx, + parent_contents.overflow_cells.len() + ); + valid = false; + } + was_overflow = true; + break; + } + } + if was_overflow { + continue; + } + // check if overflow + // check if right pointer, this is the last page. Do we update rightmost pointer and defragment moves it? + let (cell_start, cell_len) = parent_contents.cell_get_raw_region( + cell_divider_idx, + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + ); + let cell_left_pointer = read_u32(&parent_buf[cell_start..cell_start + cell_len], 0); + if cell_left_pointer != page.get().id as u32 { + tracing::error!("balance_non_root(cell_divider_left_pointer, should point to page_id={}, but points to {}, divider_cell={}, overflow_cells_parent={})", + page.get().id, + cell_left_pointer, + page_idx, + parent_contents.overflow_cells.len() + ); + valid = false; + } + if leaf_data { + // If we are in a table leaf page, we just need to check that this cell that should be a divider cell is in the parent + // This means we already check cell in leaf pages but not on parent so we don't advance current_index_cell + if page_idx >= balance_info.sibling_count - 1 { + // This means we are in the last page and we don't need to check anything + continue; + } + let cell_buf: &'static mut [u8] = + to_static_buf(&mut cells_debug[current_index_cell - 1]); + let cell = read_btree_cell( + cell_buf, + &page_type, + 0, + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + ) + .unwrap(); + let parent_cell = parent_contents + .cell_get( + cell_divider_idx, + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + ) + .unwrap(); + let rowid = match cell { + BTreeCell::TableLeafCell(table_leaf_cell) => table_leaf_cell._rowid, + _ => unreachable!(), + }; + let rowid_parent = match parent_cell { + BTreeCell::TableInteriorCell(table_interior_cell) => { + table_interior_cell._rowid + } + _ => unreachable!(), + }; + if rowid_parent != rowid { + tracing::error!("balance_non_root(cell_divider_rowid, page_id={}, cell_divider_idx={}, rowid_parent={}, rowid={})", + page.get().id, + cell_divider_idx, + rowid_parent, + rowid + ); + valid = false; + } + } else { + // In any other case, we need to check that this cell was moved to parent as divider cell + let mut was_overflow = false; + for overflow_cell in &parent_contents.overflow_cells { + if overflow_cell.index == cell_divider_idx { + let left_pointer = read_u32(&overflow_cell.payload, 0); + if left_pointer != page.get().id as u32 { + tracing::error!("balance_non_root(cell_divider_divider_cell_overflow should point to page_id={}, but points to {}, divider_cell={}, overflow_cells_parent={})", + page.get().id, + left_pointer, + page_idx, + parent_contents.overflow_cells.len() + ); + valid = false; + } + was_overflow = true; + break; + } + } + if was_overflow { + continue; + } + let (parent_cell_start, parent_cell_len) = parent_contents.cell_get_raw_region( + cell_divider_idx, + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + ); + let cell_buf_in_array = &cells_debug[current_index_cell]; + let left_pointer = read_u32( + &parent_buf[parent_cell_start..parent_cell_start + parent_cell_len], + 0, + ); + if left_pointer != page.get().id as u32 { + tracing::error!("balance_non_root(divider_cell_left_pointer_interior should point to page_id={}, but points to {}, divider_cell={}, overflow_cells_parent={})", + page.get().id, + left_pointer, + page_idx, + parent_contents.overflow_cells.len() + ); + valid = false; + } + match page_type { + PageType::TableInterior | PageType::IndexInterior => { + let parent_cell_buf = + &parent_buf[parent_cell_start..parent_cell_start + parent_cell_len]; + if parent_cell_buf[4..] != cell_buf_in_array[4..] { + tracing::error!("balance_non_root(cell_divider_cell, page_id={}, cell_divider_idx={})", + page.get().id, + cell_divider_idx, + ); + valid = false; + } + } + PageType::IndexLeaf => todo!(), + _ => { + unreachable!() + } + } + current_index_cell += 1; + } + } + } + assert!(valid, "corrupted database, cells were to balanced properly"); + } + /// Balance the root page. /// This is done when the root page overflows, and we need to create a new root page. /// See e.g. https://en.wikipedia.org/wiki/B-tree @@ -3156,7 +3499,7 @@ fn edit_page( cell_array: &CellArray, usable_space: u16, ) -> Result<()> { - tracing::trace!( + tracing::debug!( "edit_page start_old_cells={} start_new_cells={} number_new_cells={} cell_array={}", start_old_cells, start_new_cells, @@ -3466,7 +3809,6 @@ fn defragment_page(page: &PageContent, usable_space: u16) { /// Only enabled in debug mode, where we ensure that all cells are valid. fn debug_validate_cells_core(page: &PageContent, usable_space: u16) { for i in 0..page.cell_count() { - // println!("Debug function: i={}", i); let (offset, size) = page.cell_get_raw_region( i, payload_overflow_threshold_max(page.page_type(), usable_space), @@ -3510,7 +3852,7 @@ fn insert_into_cell( } let new_cell_data_pointer = allocate_cell_space(page, payload.len() as u16, usable_space)?; - tracing::trace!( + tracing::debug!( "insert_into_cell(idx={}, pc={})", cell_idx, new_cell_data_pointer @@ -3838,6 +4180,8 @@ fn shift_pointers_left(page: &mut PageContent, cell_idx: usize) { #[cfg(test)] mod tests { + use rand::thread_rng; + use rand::Rng; use rand_chacha::rand_core::RngCore; use rand_chacha::rand_core::SeedableRng; use rand_chacha::ChaCha8Rng; @@ -4032,6 +4376,14 @@ mod tests { _left_child_page, .. }) => { child_pages.push(pager.read_page(_left_child_page as usize).unwrap()); + if _left_child_page == page.id as u32 { + valid = false; + tracing::error!( + "left child page is the same as parent {}", + _left_child_page + ); + continue; + } let (child_depth, child_valid) = validate_btree(pager.clone(), _left_child_page as usize); valid &= child_valid; @@ -4039,6 +4391,10 @@ mod tests { } _ => panic!("unsupported btree cell: {:?}", cell), }; + if current_depth >= 100 { + tracing::error!("depth is too big"); + return (100, false); + } depth = Some(depth.unwrap_or(current_depth + 1)); if depth != Some(current_depth + 1) { tracing::error!("depth is different for child of page {}", page_idx); @@ -4275,6 +4631,7 @@ mod tests { let mut cursor = BTreeCursor::new(None, pager.clone(), root_page); let mut keys = Vec::new(); let seed = rng.next_u64(); + let seed = 1743883058; tracing::info!("seed: {}", seed); let mut rng = ChaCha8Rng::seed_from_u64(seed); for insert_id in 0..inserts { @@ -4319,17 +4676,22 @@ mod tests { .unwrap(); keys.sort(); cursor.move_to_root(); + let mut valid = true; for key in keys.iter() { tracing::trace!("seeking key: {}", key); run_until_done(|| cursor.next(), pager.deref()).unwrap(); let cursor_rowid = cursor.rowid().unwrap().unwrap(); - assert_eq!( - *key as u64, cursor_rowid, - "key {} is not found, got {}", - key, cursor_rowid - ); + if *key as u64 != cursor_rowid { + valid = false; + println!("key {} is not found, got {}", key, cursor_rowid); + break; + } } - if matches!(validate_btree(pager.clone(), root_page), (_, false)) { + // let's validate btree too so that we undertsand where the btree failed + if matches!(validate_btree(pager.clone(), root_page), (_, false)) || !valid { + let btree_after = format_btree(pager.clone(), root_page, 0); + println!("btree before:\n{}", btree_before); + println!("btree after:\n{}", btree_after); panic!("invalid btree"); } } @@ -4417,7 +4779,7 @@ mod tests { #[test] #[ignore] pub fn btree_insert_fuzz_run_small() { - btree_insert_fuzz_run(1, 1024, |rng| (rng.next_u32() % 128) as usize); + btree_insert_fuzz_run(1, 100_000, |rng| (rng.next_u32() % 128) as usize); } #[test] @@ -4816,15 +5178,13 @@ mod tests { let mut total_size = 0; let mut cells = Vec::new(); let usable_space = 4096; - let mut i = 1000; - // let seed = thread_rng().gen(); - // let seed = 15292777653676891381; - let seed = 9261043168681395159; + let mut i = 100000; + let seed = thread_rng().gen(); tracing::info!("seed {}", seed); let mut rng = ChaCha8Rng::seed_from_u64(seed); while i > 0 { i -= 1; - match rng.next_u64() % 3 { + match rng.next_u64() % 4 { 0 => { // allow appends with extra place to insert let cell_idx = rng.next_u64() as usize % (page.cell_count() + 1); @@ -4841,14 +5201,14 @@ mod tests { 4096, conn.pager.clone(), ); - if (free as usize) < payload.len() - 2 { + if (free as usize) < payload.len() + 2 { // do not try to insert overflow pages because they require balancing continue; } insert_into_cell(page, &payload, cell_idx, 4096).unwrap(); assert!(page.overflow_cells.is_empty()); total_size += payload.len() as u16 + 2; - cells.push(Cell { pos: i, payload }); + cells.insert(cell_idx, Cell { pos: i, payload }); } 1 => { if page.cell_count() == 0 { @@ -4868,6 +5228,13 @@ mod tests { 2 => { defragment_page(page, usable_space); } + 3 => { + // check cells + for (i, cell) in cells.iter().enumerate() { + ensure_cell(page, i, &cell.payload); + } + assert_eq!(page.cell_count(), cells.len()); + } _ => unreachable!(), } let free = compute_free_space(page, usable_space); From 6ac2368ae279f05bf89db068a941e6c68a65ed43 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 7 Apr 2025 17:53:06 +0200 Subject: [PATCH 6/9] update divider cell that is being balanced --- core/storage/btree.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 2148b66b4..29a4b9a4e 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1587,6 +1587,11 @@ impl BTreeCursor { // TODO(pere): in case of old pages are leaf pages, so index leaf page, we need to strip page pointers // from divider cells in index interior pages (parent) because those should not be included. cells_inserted += 1; + if !leaf { + // This divider cell needs to be updated with new left pointer, + let right_pointer = old_page_contents.rightmost_pointer().unwrap(); + divider_cell[..4].copy_from_slice(&right_pointer.to_be_bytes()); + } cell_array.cells.push(to_static_buf(divider_cell.as_mut())); } total_cells_inserted += cells_inserted; @@ -1885,6 +1890,7 @@ impl BTreeCursor { } let left_pointer = read_u32(&new_divider_cell[..4], 0); + assert!(left_pointer != parent_page.get().id as u32); #[cfg(debug_assertions)] pages_pointed_to.insert(left_pointer); tracing::debug!( From f137ddfdf8ec233d2214d07d5fea11c1c3312994 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 7 Apr 2025 17:55:50 +0200 Subject: [PATCH 7/9] add loop left pointer validation --- core/storage/btree.rs | 108 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 103 insertions(+), 5 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 29a4b9a4e..8bde08311 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2,8 +2,7 @@ use tracing::debug; use crate::storage::pager::Pager; use crate::storage::sqlite3_ondisk::{ - read_btree_cell, read_u32, read_varint, BTreeCell, PageContent, PageType, TableInteriorCell, - TableLeafCell, + read_u32, read_varint, BTreeCell, PageContent, PageType, TableInteriorCell, TableLeafCell, }; use crate::MvCursor; @@ -14,6 +13,7 @@ use crate::{return_corrupt, LimboError, Result}; use std::cell::{Cell, Ref, RefCell}; use std::cmp::Ordering; +#[cfg(debug_assertions)] use std::collections::HashSet; use std::pin::Pin; use std::rc::Rc; @@ -2023,6 +2023,7 @@ impl BTreeCursor { #[cfg(debug_assertions)] self.post_balance_non_root_validation( + &parent_page, balance_info, parent_contents, pages_to_balance_new, @@ -2047,8 +2048,10 @@ impl BTreeCursor { result } + #[cfg(debug_assertions)] fn post_balance_non_root_validation( &self, + parent_page: &PageRef, balance_info: &mut BalanceInfo, parent_contents: &mut PageContent, pages_to_balance_new: Vec>, @@ -2060,6 +2063,45 @@ impl BTreeCursor { ) { let mut valid = true; let mut current_index_cell = 0; + for cell_idx in 0..parent_contents.cell_count() { + let cell = parent_contents + .cell_get( + cell_idx, + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + ) + .unwrap(); + match cell { + BTreeCell::TableInteriorCell(table_interior_cell) => { + let left_child_page = table_interior_cell._left_child_page; + if left_child_page == parent_page.get().id as u32 { + tracing::error!("balance_non_root(parent_divider_points_to_same_page, page_id={}, cell_left_child_page={})", + parent_page.get().id, + left_child_page, + ); + valid = false; + } + } + BTreeCell::IndexInteriorCell(index_interior_cell) => { + let left_child_page = index_interior_cell.left_child_page; + if left_child_page == parent_page.get().id as u32 { + tracing::error!("balance_non_root(parent_divider_points_to_same_page, page_id={}, cell_left_child_page={})", + parent_page.get().id, + left_child_page, + ); + valid = false; + } + } + _ => {} + } + } // Let's now make a in depth check that we in fact added all possible cells somewhere and they are not lost for (page_idx, page) in pages_to_balance_new.iter().enumerate() { let contents = page.get_contents(); @@ -2078,7 +2120,7 @@ impl BTreeCursor { self.usable_space(), ); let buf = contents.as_ptr(); - let cell_buf = &buf[cell_start..cell_start + cell_len]; + let cell_buf = to_static_buf(&mut buf[cell_start..cell_start + cell_len]); let cell_buf_in_array = &cells_debug[current_index_cell]; if cell_buf != cell_buf_in_array { tracing::error!("balance_non_root(cell_not_found_debug, page_id={}, cell_in_cell_array_idx={})", @@ -2087,6 +2129,63 @@ impl BTreeCursor { ); valid = false; } + + let cell = crate::storage::sqlite3_ondisk::read_btree_cell( + cell_buf, + &page_type, + 0, + payload_overflow_threshold_max( + parent_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + parent_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + ) + .unwrap(); + match &cell { + BTreeCell::TableInteriorCell(table_interior_cell) => { + let left_child_page = table_interior_cell._left_child_page; + if left_child_page == page.get().id as u32 { + tracing::error!("balance_non_root(child_page_points_same_page, page_id={}, cell_left_child_page={}, page_idx={})", + page.get().id, + left_child_page, + page_idx + ); + valid = false; + } + if left_child_page == parent_page.get().id as u32 { + tracing::error!("balance_non_root(child_page_points_parent_of_child, page_id={}, cell_left_child_page={}, page_idx={})", + page.get().id, + left_child_page, + page_idx + ); + valid = false; + } + } + BTreeCell::IndexInteriorCell(index_interior_cell) => { + let left_child_page = index_interior_cell.left_child_page; + if left_child_page == page.get().id as u32 { + tracing::error!("balance_non_root(child_page_points_same_page, page_id={}, cell_left_child_page={}, page_idx={})", + page.get().id, + left_child_page, + page_idx + ); + valid = false; + } + if left_child_page == parent_page.get().id as u32 { + tracing::error!("balance_non_root(child_page_points_parent_of_child, page_id={}, cell_left_child_page={}, page_idx={})", + page.get().id, + left_child_page, + page_idx + ); + valid = false; + } + } + _ => {} + } current_index_cell += 1; } // Now check divider cells and their pointers. @@ -4637,7 +4736,6 @@ mod tests { let mut cursor = BTreeCursor::new(None, pager.clone(), root_page); let mut keys = Vec::new(); let seed = rng.next_u64(); - let seed = 1743883058; tracing::info!("seed: {}", seed); let mut rng = ChaCha8Rng::seed_from_u64(seed); for insert_id in 0..inserts { @@ -4680,6 +4778,7 @@ mod tests { pager.deref(), ) .unwrap(); + // FIXME: add sorted vector instead, should be okay for small amounts of keys for now :P, too lazy to fix right now keys.sort(); cursor.move_to_root(); let mut valid = true; @@ -4711,7 +4810,6 @@ mod tests { keys.sort(); cursor.move_to_root(); for key in keys.iter() { - let seek_key = SeekKey::TableRowId(*key as u64); tracing::trace!("seeking key: {}", key); run_until_done(|| cursor.next(), pager.deref()).unwrap(); let cursor_rowid = cursor.rowid().unwrap().unwrap(); From 83f13596a4975e4ed3a89a3cd7bc5680f78a31aa Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 7 Apr 2025 17:59:01 +0200 Subject: [PATCH 8/9] decrease fuzz test steps again --- core/storage/btree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 8bde08311..d5897061e 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4883,7 +4883,7 @@ mod tests { #[test] #[ignore] pub fn btree_insert_fuzz_run_small() { - btree_insert_fuzz_run(1, 100_000, |rng| (rng.next_u32() % 128) as usize); + btree_insert_fuzz_run(1, 100, |rng| (rng.next_u32() % 128) as usize); } #[test] From 608628461393645d738444c9aba48be67d7bad87 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 7 Apr 2025 18:06:52 +0200 Subject: [PATCH 9/9] fix debug imports --- core/storage/btree.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index d5897061e..c505a3fee 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2054,7 +2054,7 @@ impl BTreeCursor { parent_page: &PageRef, balance_info: &mut BalanceInfo, parent_contents: &mut PageContent, - pages_to_balance_new: Vec>, + pages_to_balance_new: Vec>, page_type: PageType, leaf_data: bool, mut cells_debug: Vec>, @@ -2260,7 +2260,7 @@ impl BTreeCursor { } let cell_buf: &'static mut [u8] = to_static_buf(&mut cells_debug[current_index_cell - 1]); - let cell = read_btree_cell( + let cell = crate::storage::sqlite3_ondisk::read_btree_cell( cell_buf, &page_type, 0,