From 55dd108878c67ee76ceed9522dd0b988889b9114 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 8 Feb 2025 12:38:32 +0400 Subject: [PATCH 01/23] setup simple insertion fuzz test --- Cargo.lock | 1 + core/Cargo.toml | 3 ++- core/storage/btree.rs | 57 +++++++++++++++++++++++++++++++++++++++++++ core/types.rs | 1 + 4 files changed, 61 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 819311184..f25ad611d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1641,6 +1641,7 @@ dependencies = [ "quickcheck", "quickcheck_macros", "rand 0.8.5", + "rand_chacha 0.9.0", "regex", "regex-syntax", "rstest", diff --git a/core/Cargo.toml b/core/Cargo.toml index 687f4ff19..58a29c475 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -94,7 +94,8 @@ rusqlite = "0.29.0" tempfile = "3.8.0" quickcheck = { version = "1.0", default-features = false } quickcheck_macros = { version = "1.0", default-features = false } -rand = "0.8" # Required for quickcheck +rand = "0.8.5" # Required for quickcheck +rand_chacha = "0.9.0" [[bench]] name = "benchmark" diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 4a963581b..280f46eac 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2362,6 +2362,10 @@ fn to_static_buf(buf: &[u8]) -> &'static [u8] { #[cfg(test)] mod tests { + use rand_chacha::rand_core::RngCore; + use rand_chacha::rand_core::SeedableRng; + use rand_chacha::ChaCha8Rng; + use super::*; use crate::io::{Buffer, Completion, MemoryIO, OpenFlags, IO}; use crate::storage::database::FileStorage; @@ -2371,6 +2375,59 @@ mod tests { use std::cell::RefCell; use std::sync::Arc; + fn empty_btree() -> (Rc, usize) { + let db_header = DatabaseHeader::default(); + let page_size = db_header.page_size as usize; + + let io: Arc = Arc::new(MemoryIO::new().unwrap()); + let io_file = io.open_file("test.db", OpenFlags::Create, false).unwrap(); + let page_io = Rc::new(FileStorage::new(io_file)); + + let buffer_pool = Rc::new(BufferPool::new(db_header.page_size 
as usize)); + let wal_shared = WalFileShared::open_shared(&io, "test.wal", db_header.page_size).unwrap(); + let wal_file = WalFile::new(io.clone(), page_size, wal_shared, buffer_pool.clone()); + let wal = Rc::new(RefCell::new(wal_file)); + + let page_cache = Arc::new(parking_lot::RwLock::new(DumbLruPageCache::new(10))); + let pager = { + let db_header = Rc::new(RefCell::new(db_header.clone())); + Pager::finish_open(db_header, page_io, wal, io, page_cache, buffer_pool).unwrap() + }; + let pager = Rc::new(pager); + let page1 = pager.allocate_page().unwrap(); + btree_init_page(&page1, PageType::TableLeaf, &db_header, 0); + (pager, page1.get().id) + } + + #[test] + pub fn btree_insert_fuzz() { + let (pager, root_page) = empty_btree(); + let mut cursor = BTreeCursor::new(pager, root_page); + let mut keys = Vec::new(); + let mut rng = ChaCha8Rng::seed_from_u64(0); + for _ in 0..16 { + let size = (rng.next_u64() % 4096) as usize; + let key = (rng.next_u64() % (1 << 30)) as i64; + keys.push(key); + println!("INSERT INTO t VALUES ({}, randomblob({}));", key, size); + let key = OwnedValue::Integer(key); + let value = Record::new(vec![OwnedValue::Blob(Rc::new(vec![0; size]))]); + cursor.insert(&key, &value, false).unwrap(); + } + + for key in keys { + let seek_key = SeekKey::TableRowId(key as u64); + assert!( + matches!( + cursor.seek(seek_key, SeekOp::EQ).unwrap(), + CursorResult::Ok(true) + ), + "key {} is not found", + key + ); + } + } + #[allow(clippy::arc_with_non_send_sync)] fn setup_test_env(database_size: u32) -> (Rc, Rc>) { let page_size = 512; diff --git a/core/types.rs b/core/types.rs index ae31314c1..d72ddc284 100644 --- a/core/types.rs +++ b/core/types.rs @@ -727,6 +727,7 @@ impl Cursor { } } +#[derive(Debug)] pub enum CursorResult { Ok(T), IO, From 75e2f01ec4db7d4cee5051315ba57ad2579c9ea1 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 8 Feb 2025 13:03:38 +0400 Subject: [PATCH 02/23] print btree for debugging --- Cargo.lock | 5 ++ core/Cargo.toml | 1 + 
core/storage/btree.rs | 90 +++++++++++++++++++++++++++++----- core/storage/sqlite3_ondisk.rs | 2 +- 4 files changed, 84 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f25ad611d..e64dc3c1d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -736,6 +736,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" dependencies = [ "log", + "regex", ] [[package]] @@ -767,7 +768,10 @@ version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0" dependencies = [ + "anstream", + "anstyle", "env_filter", + "humantime", "log", ] @@ -1612,6 +1616,7 @@ dependencies = [ "chrono", "criterion", "crossbeam-skiplist", + "env_logger 0.11.6", "fallible-iterator 0.3.0", "getrandom 0.2.15", "hex", diff --git a/core/Cargo.toml b/core/Cargo.toml index 58a29c475..6799fe29b 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -96,6 +96,7 @@ quickcheck = { version = "1.0", default-features = false } quickcheck_macros = { version = "1.0", default-features = false } rand = "0.8.5" # Required for quickcheck rand_chacha = "0.9.0" +env_logger = "0.11.6" [[bench]] name = "benchmark" diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 280f46eac..efdcb47ae 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2375,6 +2375,65 @@ mod tests { use std::cell::RefCell; use std::sync::Arc; + fn format_btree(pager: Rc, page_idx: usize, depth: usize) -> String { + let cursor = BTreeCursor::new(pager.clone(), page_idx); + let page = pager.read_page(page_idx).unwrap(); + let page = page.get(); + let contents = page.contents.as_ref().unwrap(); + let page_type = contents.page_type(); + let mut current = Vec::new(); + let mut child = Vec::new(); + for cell_idx in 0..contents.cell_count() { + let cell = contents + .cell_get( + cell_idx, + pager.clone(), + 
cursor.payload_overflow_threshold_max(page_type), + cursor.payload_overflow_threshold_min(page_type), + cursor.usable_space(), + ) + .unwrap(); + match cell { + BTreeCell::TableInteriorCell(cell) => { + current.push(format!( + "node[rowid:{}, ptr(<=):{}]", + cell._rowid, cell._left_child_page + )); + child.push(format_btree( + pager.clone(), + cell._left_child_page as usize, + depth + 2, + )); + } + BTreeCell::TableLeafCell(cell) => { + current.push(format!( + "leaf[rowid:{}, len(payload):{}, overflow:{}]", + cell._rowid, + cell._payload.len(), + cell.first_overflow_page.is_some() + )); + } + _ => panic!("unsupported btree cell: {:?}", cell), + } + } + if let Some(rightmost) = contents.rightmost_pointer() { + child.push(format_btree(pager.clone(), rightmost as usize, depth + 2)); + } + let current = format!( + "{}-page:{}, ptr(right):{}\n{}+cells:{}", + " ".repeat(depth), + page_idx, + contents.rightmost_pointer().unwrap_or(0), + " ".repeat(depth), + current.join(", ") + ); + if child.is_empty() { + current + } else { + current + "\n" + &child.join("\n") + } + } + fn empty_btree() -> (Rc, usize) { let db_header = DatabaseHeader::default(); let page_size = db_header.page_size as usize; @@ -2401,30 +2460,35 @@ mod tests { #[test] pub fn btree_insert_fuzz() { + let _ = env_logger::init(); let (pager, root_page) = empty_btree(); - let mut cursor = BTreeCursor::new(pager, root_page); + let mut cursor = BTreeCursor::new(pager.clone(), root_page); let mut keys = Vec::new(); let mut rng = ChaCha8Rng::seed_from_u64(0); for _ in 0..16 { let size = (rng.next_u64() % 4096) as usize; let key = (rng.next_u64() % (1 << 30)) as i64; keys.push(key); - println!("INSERT INTO t VALUES ({}, randomblob({}));", key, size); + log::info!("INSERT INTO t VALUES ({}, randomblob({}));", key, size); let key = OwnedValue::Integer(key); let value = Record::new(vec![OwnedValue::Blob(Rc::new(vec![0; size]))]); cursor.insert(&key, &value, false).unwrap(); - } - - for key in keys { - let seek_key = 
SeekKey::TableRowId(key as u64); - assert!( - matches!( - cursor.seek(seek_key, SeekOp::EQ).unwrap(), - CursorResult::Ok(true) - ), - "key {} is not found", - key + log::info!( + "=========== btree ===========\n{}\n\n", + format_btree(pager.clone(), root_page, 0) ); + + for key in keys.iter() { + let seek_key = SeekKey::TableRowId(*key as u64); + assert!( + matches!( + cursor.seek(seek_key, SeekOp::EQ).unwrap(), + CursorResult::Ok(true) + ), + "key {} is not found", + key + ); + } } } diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index 4963fa669..fbdac7dca 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -362,7 +362,7 @@ pub fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) { } #[repr(u8)] -#[derive(Debug, PartialEq, Clone)] +#[derive(Debug, PartialEq, Clone, Copy)] pub enum PageType { IndexInterior = 2, TableInterior = 5, From 3557c8aada109a8ac51c15b73f7d4c7ee895d1f8 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 8 Feb 2025 21:25:21 +0400 Subject: [PATCH 03/23] adjust fuzz test --- core/storage/btree.rs | 97 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 19 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index efdcb47ae..ce14c9ecf 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2459,26 +2459,47 @@ mod tests { } #[test] - pub fn btree_insert_fuzz() { + pub fn btree_insert_fuzz_ex() { let _ = env_logger::init(); - let (pager, root_page) = empty_btree(); - let mut cursor = BTreeCursor::new(pager.clone(), root_page); - let mut keys = Vec::new(); - let mut rng = ChaCha8Rng::seed_from_u64(0); - for _ in 0..16 { - let size = (rng.next_u64() % 4096) as usize; - let key = (rng.next_u64() % (1 << 30)) as i64; - keys.push(key); - log::info!("INSERT INTO t VALUES ({}, randomblob({}));", key, size); - let key = OwnedValue::Integer(key); - let value = Record::new(vec![OwnedValue::Blob(Rc::new(vec![0; size]))]); - 
cursor.insert(&key, &value, false).unwrap(); - log::info!( - "=========== btree ===========\n{}\n\n", - format_btree(pager.clone(), root_page, 0) - ); - - for key in keys.iter() { + for sequence in [ + &[ + (293471650, 2452), + (163608869, 627), + (544576229, 464), + (705823748, 3441), + ] + .as_slice(), + &[ + (987283511, 2924), + (261851260, 1766), + (343847101, 1657), + (315844794, 572), + ] + .as_slice(), + &[ + (987283511, 2924), + (261851260, 1766), + (343847101, 1657), + (315844794, 572), + (649272840, 1632), + (723398505, 3140), + (334416967, 3874), + ] + .as_slice(), + ] { + let (pager, root_page) = empty_btree(); + let mut cursor = BTreeCursor::new(pager.clone(), root_page); + for (key, size) in sequence.iter() { + let key = OwnedValue::Integer(*key); + let value = Record::new(vec![OwnedValue::Blob(Rc::new(vec![0; *size]))]); + log::info!("insert key:{}", key); + cursor.insert(&key, &value, false).unwrap(); + log::info!( + "=========== btree ===========\n{}\n\n", + format_btree(pager.clone(), root_page, 0) + ); + } + for (key, _) in sequence.iter() { let seek_key = SeekKey::TableRowId(*key as u64); assert!( matches!( @@ -2492,6 +2513,44 @@ mod tests { } } + #[test] + pub fn btree_insert_fuzz_run() { + let _ = env_logger::init(); + let mut rng = ChaCha8Rng::seed_from_u64(0); + for _ in 0..128 { + let (pager, root_page) = empty_btree(); + let mut cursor = BTreeCursor::new(pager.clone(), root_page); + let mut keys = Vec::new(); + let seed = rng.next_u64(); + log::info!("seed: {}", seed); + let mut rng = ChaCha8Rng::seed_from_u64(seed); + for _ in 0..16 { + let size = (rng.next_u64() % 4096) as usize; + let key = (rng.next_u64() % (1 << 30)) as i64; + keys.push(key); + log::info!("INSERT INTO t VALUES ({}, randomblob({}));", key, size); + let key = OwnedValue::Integer(key); + let value = Record::new(vec![OwnedValue::Blob(Rc::new(vec![0; size]))]); + cursor.insert(&key, &value, false).unwrap(); + log::info!( + "=========== btree ===========\n{}\n\n", + 
format_btree(pager.clone(), root_page, 0) + ); + for key in keys.iter() { + let seek_key = SeekKey::TableRowId(*key as u64); + assert!( + matches!( + cursor.seek(seek_key, SeekOp::EQ).unwrap(), + CursorResult::Ok(true) + ), + "key {} is not found", + key + ); + } + } + } + } + #[allow(clippy::arc_with_non_send_sync)] fn setup_test_env(database_size: u32) -> (Rc, Rc>) { let page_size = 512; From 62b4787d3d4de9157875ef124a50cbd691d28f79 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 8 Feb 2025 18:35:34 +0400 Subject: [PATCH 04/23] simplify write_varint_to_vec function --- core/storage/sqlite3_ondisk.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index fbdac7dca..3cfdfca6f 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -1129,11 +1129,9 @@ pub fn write_varint(buf: &mut [u8], value: u64) -> usize { } pub fn write_varint_to_vec(value: u64, payload: &mut Vec) { - let mut varint: Vec = vec![0; 9]; - let n = write_varint(&mut varint.as_mut_slice()[0..9], value); - write_varint(&mut varint, value); - varint.truncate(n); - payload.extend_from_slice(&varint); + let mut varint = [0u8; 9]; + let n = write_varint(&mut varint, value); + payload.extend_from_slice(&varint[0..n]); } pub fn begin_read_wal_header(io: &Rc) -> Result>> { From 5ce3d12f7513f2132298a843bc9fd4698c69a450 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 8 Feb 2025 21:25:01 +0400 Subject: [PATCH 05/23] fix typo --- core/storage/btree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index ce14c9ecf..51b3d32c3 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1115,7 +1115,7 @@ impl BTreeCursor { debug!("balance_non_root(page={})", current_page.get().id); // Copy of page used to reference cell bytes. 
- // This needs to be saved somewhere safe so taht references still point to here, + // This needs to be saved somewhere safe so that references still point to here, // this will be store in write_info below let page_copy = current_page.get().contents.as_ref().unwrap().clone(); From fc502b86c773f4163e965318a08f64d4691b5341 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 8 Feb 2025 21:25:08 +0400 Subject: [PATCH 06/23] fix defragmentation code a bit --- core/storage/btree.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 51b3d32c3..d3a958d8e 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1607,6 +1607,8 @@ impl BTreeCursor { page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cbrk as u16); // set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); + // set fragmented bytes counter to zero + page.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); // set unused space to 0 let first_cell = cloned_page.cell_content_area() as u64; assert!(first_cell <= cbrk); From eec0493c60c2393677446eedd2f47cfd1eed2a18 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 8 Feb 2025 21:29:09 +0400 Subject: [PATCH 07/23] remove misleading comment --- core/storage/btree.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index d3a958d8e..5169d0687 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1057,11 +1057,6 @@ impl BTreeCursor { .clone(); match state { WriteState::BalanceStart => { - // drop divider cells and find right pointer - // NOTE: since we are doing a simple split we only finding the pointer we want to update (right pointer). - // Right pointer means cell that points to the last page, as we don't really want to drop this one. This one - // can be a "rightmost pointer" or a "cell". 
- // we always asumme there is a parent let current_page = self.stack.top(); { // check if we don't need to balance From 6aa10701a429cb6e36867b3c892f6bc55127bef4 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 8 Feb 2025 21:33:23 +0400 Subject: [PATCH 08/23] fix comment --- core/storage/btree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5169d0687..2d2c3291b 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1206,7 +1206,7 @@ impl BTreeCursor { BTreeCell::TableInteriorCell(interior) => { interior._left_child_page as usize == current_idx } - _ => unreachable!("Parent should always be a "), + _ => unreachable!("Parent should always be an interior page"), }; if found { let (start, _len) = parent_contents.cell_get_raw_region( From ea61f31843884cf8b204159505ba0544cdb23e1b Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 8 Feb 2025 21:52:04 +0400 Subject: [PATCH 09/23] clear overflow_cells --- core/storage/btree.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 2d2c3291b..d21dedeb2 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1109,10 +1109,14 @@ impl BTreeCursor { let current_page = self.stack.top(); debug!("balance_non_root(page={})", current_page.get().id); + let current_page_inner = current_page.get(); + let current_page_contents = &mut current_page_inner.contents; + let current_page_contents = current_page_contents.as_mut().unwrap(); // Copy of page used to reference cell bytes. // This needs to be saved somewhere safe so that references still point to here, // this will be store in write_info below - let page_copy = current_page.get().contents.as_ref().unwrap().clone(); + let page_copy = current_page_contents.clone(); + current_page_contents.overflow_cells.clear(); // In memory in order copy of all cells in pages we want to balance. 
For now let's do a 2 page split. // Right pointer in interior cells should be converted to regular cells if more than 2 pages are used for balancing. From d4bbad161b25328cca11474ad737b720484cb22b Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 17:08:00 +0400 Subject: [PATCH 10/23] handle case when we can't balance all cells between the current page and one newly allocated page - if a page is tightly packed with relatively big cells, we will be unable to balance its content if we insert a very big (~page size) cell in the middle (because nothing can be merged with the new cell - so we will need to split 1 page into 3) --- core/storage/btree.rs | 131 ++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 62 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index d21dedeb2..7af0945fe 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -76,7 +76,7 @@ macro_rules! return_if_locked { /// State machine of a write operation. /// May involve balancing due to overflow. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Copy)] enum WriteState { Start, BalanceStart, @@ -89,8 +89,10 @@ enum WriteState { struct WriteInfo { /// State of the write operation state machine. state: WriteState, - /// Pages allocated during the write operation due to balancing. - new_pages: RefCell>, + /// Pages involved in the split of the page due to balancing (splits_pages[0] is the balancing page, while other - fresh allocated pages) + split_pages: RefCell>, + /// Amount of cells from balancing page for every split page + split_pages_cells_count: RefCell>, /// Scratch space used during balancing. scratch_cells: RefCell>, /// Bookkeeping of the rightmost pointer so the PAGE_HEADER_OFFSET_RIGHTMOST_PTR can be updated. 
@@ -103,7 +105,8 @@ impl WriteInfo { fn new() -> WriteInfo { WriteInfo { state: WriteState::Start, - new_pages: RefCell::new(Vec::with_capacity(4)), + split_pages: RefCell::new(Vec::with_capacity(4)), + split_pages_cells_count: RefCell::new(Vec::with_capacity(4)), scratch_cells: RefCell::new(Vec::new()), rightmost_pointer: RefCell::new(None), page_copy: RefCell::new(None), @@ -1091,12 +1094,7 @@ impl BTreeCursor { matches!(self.state, CursorState::Write(_)), "Cursor must be in balancing state" ); - let state = self - .state - .write_info() - .expect("must be balancing") - .state - .clone(); + let state = self.state.write_info().expect("must be balancing").state; let (next_write_state, result) = match state { WriteState::Start => todo!(), WriteState::BalanceStart => todo!(), @@ -1124,47 +1122,69 @@ impl BTreeCursor { let mut scratch_cells = write_info.scratch_cells.borrow_mut(); scratch_cells.clear(); + let usable_space = self.usable_space(); for cell_idx in 0..page_copy.cell_count() { let (start, len) = page_copy.cell_get_raw_region( cell_idx, self.payload_overflow_threshold_max(page_copy.page_type()), self.payload_overflow_threshold_min(page_copy.page_type()), - self.usable_space(), + usable_space, ); - let buf = page_copy.as_ptr(); - scratch_cells.push(to_static_buf(&buf[start..start + len])); + let cell_buffer = to_static_buf(&page_copy.as_ptr()[start..start + len]); + scratch_cells.push(cell_buffer); } - for overflow_cell in &page_copy.overflow_cells { - scratch_cells - .insert(overflow_cell.index, to_static_buf(&overflow_cell.payload)); + // overflow_cells are stored in order - so we need to insert them in reverse order + for cell in page_copy.overflow_cells.iter().rev() { + scratch_cells.insert(cell.index, to_static_buf(&cell.payload)); } + // amount of cells for pages involved in split (distributed with naive greedy approach) + // if we have single overflow cell in a table leaf node - we still can have 3 split pages + // + // for example, if current page has 
4 entries with size ~1/4 page size, and new cell has size ~page size + // then we will need 3 pages to distribute cells between them + let split_pages_cells_count = &mut write_info.split_pages_cells_count.borrow_mut(); + split_pages_cells_count.clear(); + let mut last_page_cells_count = 0; + let mut last_page_cells_size = 0; + for scratch_cell in scratch_cells.iter() { + let cell_size = scratch_cell.len() + 2; // + cell pointer size (u16) + if last_page_cells_size + cell_size > usable_space { + split_pages_cells_count.push(last_page_cells_count); + last_page_cells_count = 0; + last_page_cells_size = 0; + } + last_page_cells_count += 1; + last_page_cells_size += cell_size; + assert!(last_page_cells_size <= usable_space); + } + split_pages_cells_count.push(last_page_cells_count); + let new_pages_count = split_pages_cells_count.len(); + + debug!( + "splitting left={} new_pages={}, cells_count={:?}", + current_page.get().id, + new_pages_count - 1, + split_pages_cells_count + ); + *write_info.rightmost_pointer.borrow_mut() = page_copy.rightmost_pointer(); write_info.page_copy.replace(Some(page_copy)); - // allocate new pages and move cells to those new pages - // split procedure let page = current_page.get().contents.as_mut().unwrap(); + let page_type = page.page_type(); assert!( - matches!( - page.page_type(), - PageType::TableLeaf | PageType::TableInterior - ), - "indexes still not supported " + matches!(page_type, PageType::TableLeaf | PageType::TableInterior), + "indexes still not supported" ); - let right_page = self.allocate_page(page.page_type(), 0); - let right_page_id = right_page.get().id; - - write_info.new_pages.borrow_mut().clear(); - write_info.new_pages.borrow_mut().push(current_page.clone()); - write_info.new_pages.borrow_mut().push(right_page.clone()); - - debug!( - "splitting left={} right={}", - current_page.get().id, - right_page_id - ); + write_info.split_pages.borrow_mut().clear(); + write_info.split_pages.borrow_mut().push(current_page); + // 
allocate new pages + for _ in 1..new_pages_count { + let new_page = self.allocate_page(page_type, 0); + write_info.split_pages.borrow_mut().push(new_page); + } (WriteState::BalanceGetParentPage, Ok(CursorResult::Ok(()))) } @@ -1225,23 +1245,21 @@ impl BTreeCursor { } let write_info = self.state.write_info().unwrap(); - let mut new_pages = write_info.new_pages.borrow_mut(); + let mut split_pages = write_info.split_pages.borrow_mut(); + let split_pages_len = split_pages.len(); let scratch_cells = write_info.scratch_cells.borrow(); // reset pages - for page in new_pages.iter() { + for page in split_pages.iter() { assert!(page.is_dirty()); let contents = page.get().contents.as_mut().unwrap(); contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); - let db_header = RefCell::borrow(&self.pager.db_header); - let cell_content_area_start = - db_header.page_size - db_header.reserved_space as u16; contents.write_u16( PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, - cell_content_area_start, + self.usable_space() as u16, ); contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); @@ -1250,29 +1268,17 @@ impl BTreeCursor { } } - // distribute cells - let new_pages_len = new_pages.len(); - let cells_per_page = scratch_cells.len() / new_pages.len(); let mut current_cell_index = 0_usize; - let mut divider_cells_index = Vec::new(); /* index to scratch cells that will be used as dividers in order */ + /* index to scratch cells that will be used as dividers in order */ + let mut divider_cells_index = Vec::with_capacity(split_pages.len()); - debug!( - "balance_leaf::distribute(cells={}, cells_per_page={})", - scratch_cells.len(), - cells_per_page - ); + debug!("balance_leaf::distribute(cells={})", scratch_cells.len()); - for (i, page) in new_pages.iter_mut().enumerate() { + for (i, page) in split_pages.iter_mut().enumerate() { let page_id = page.get().id; let contents = page.get().contents.as_mut().unwrap(); - let last_page = i 
== new_pages_len - 1; - let cells_to_copy = if last_page { - // last cells is remaining pages if division was odd - scratch_cells.len() - current_cell_index - } else { - cells_per_page - }; + let cells_to_copy = write_info.split_pages_cells_count.borrow()[i]; debug!( "balance_leaf::distribute(page={}, cells_to_copy={})", page_id, cells_to_copy @@ -1288,6 +1294,7 @@ impl BTreeCursor { divider_cells_index.push(current_cell_index + cells_to_copy - 1); current_cell_index += cells_to_copy; } + let is_leaf = { let page = self.stack.top(); let page = page.get().contents.as_ref().unwrap(); @@ -1296,7 +1303,7 @@ impl BTreeCursor { // update rightmost pointer for each page if we are in interior page if !is_leaf { - for page in new_pages.iter_mut().take(new_pages_len - 1) { + for page in split_pages.iter_mut().take(split_pages_len - 1) { let contents = page.get().contents.as_mut().unwrap(); assert_eq!(contents.cell_count(), 1); @@ -1315,7 +1322,7 @@ impl BTreeCursor { contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, last_cell_pointer); } // last page right most pointer points to previous right most pointer before splitting - let last_page = new_pages.last().unwrap(); + let last_page = split_pages.last().unwrap(); let last_page_contents = last_page.get().contents.as_mut().unwrap(); last_page_contents.write_u32( PAGE_HEADER_OFFSET_RIGHTMOST_PTR, @@ -1326,7 +1333,7 @@ impl BTreeCursor { // insert dividers in parent // we can consider dividers the first cell of each page starting from the second page for (page_id_index, page) in - new_pages.iter_mut().take(new_pages_len - 1).enumerate() + split_pages.iter_mut().take(split_pages_len - 1).enumerate() { let contents = page.get().contents.as_mut().unwrap(); let divider_cell_index = divider_cells_index[page_id_index]; @@ -1372,7 +1379,7 @@ impl BTreeCursor { { // copy last page id to right pointer - let last_pointer = new_pages.last().unwrap().get().id as u32; + let last_pointer = split_pages.last().unwrap().get().id as u32; 
parent_contents.write_u32(right_pointer, last_pointer); } self.stack.pop(); From 8659dbba8e74c6bc7ef81de1d3a49b9bdbb6f29b Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 17:22:21 +0400 Subject: [PATCH 11/23] fix pointer structure in case of root split --- core/storage/btree.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 7af0945fe..02bfda01c 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1411,7 +1411,6 @@ impl BTreeCursor { let current_root = self.stack.top(); let current_root_contents = current_root.get().contents.as_ref().unwrap(); - let new_root_page_id = new_root_page.get().id; let new_root_page_contents = new_root_page.get().contents.as_mut().unwrap(); if is_page_1 { // Copy header @@ -1421,8 +1420,10 @@ impl BTreeCursor { .copy_from_slice(&current_root_buf[0..DATABASE_HEADER_SIZE]); } // point new root right child to previous root - new_root_page_contents - .write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, new_root_page_id as u32); + new_root_page_contents.write_u32( + PAGE_HEADER_OFFSET_RIGHTMOST_PTR, + current_root.get().id as u32, + ); new_root_page_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); } From e8a585f87ae9ebfb15b8de0362b4c631c8e39965 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 18:09:43 +0400 Subject: [PATCH 12/23] adjust logging --- core/storage/btree.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 02bfda01c..d6663540b 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -745,13 +745,14 @@ impl BTreeCursor { // insert let overflow = { let contents = page.get().contents.as_mut().unwrap(); - debug!( - "insert_into_page(overflow, cell_count={})", - contents.cell_count() - ); - self.insert_into_cell(contents, cell_payload.as_slice(), cell_idx); - contents.overflow_cells.len() + let overflow_cells = 
contents.overflow_cells.len(); + debug!( + "insert_into_page(overflow, cell_count={}, overflow_cells={})", + contents.cell_count(), + overflow_cells + ); + overflow_cells }; let write_info = self .state From d2251b1dd1eba56e1b617a93ab82ee5fe9b32fa0 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 18:09:53 +0400 Subject: [PATCH 13/23] fix --- core/storage/btree.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index d6663540b..138a04371 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1148,16 +1148,17 @@ impl BTreeCursor { split_pages_cells_count.clear(); let mut last_page_cells_count = 0; let mut last_page_cells_size = 0; + let content_usable_space = usable_space - page_copy.header_size(); for scratch_cell in scratch_cells.iter() { let cell_size = scratch_cell.len() + 2; // + cell pointer size (u16) - if last_page_cells_size + cell_size > usable_space { + if last_page_cells_size + cell_size > content_usable_space { split_pages_cells_count.push(last_page_cells_count); last_page_cells_count = 0; last_page_cells_size = 0; } last_page_cells_count += 1; last_page_cells_size += cell_size; - assert!(last_page_cells_size <= usable_space); + assert!(last_page_cells_size <= content_usable_space); } split_pages_cells_count.push(last_page_cells_count); let new_pages_count = split_pages_cells_count.len(); From a62265eef4c97f8da30ae39768c25be6dd2ff5bd Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 18:10:19 +0400 Subject: [PATCH 14/23] handle balancing cases when more than 1 level is affected --- core/storage/btree.rs | 56 +++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 138a04371..4656ab28e 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1053,40 +1053,38 @@ impl BTreeCursor { matches!(self.state, CursorState::Write(_)), 
"Cursor must be in balancing state" ); - let state = self - .state - .write_info() - .expect("must be balancing") - .state - .clone(); - match state { - WriteState::BalanceStart => { - let current_page = self.stack.top(); - { - // check if we don't need to balance - // don't continue if there are no overflow cells - let page = current_page.get().contents.as_mut().unwrap(); - if page.overflow_cells.is_empty() { - let write_info = self.state.mut_write_info().unwrap(); - write_info.state = WriteState::Finish; + loop { + let state = self.state.write_info().expect("must be balancing").state; + match state { + WriteState::BalanceStart => { + let current_page = self.stack.top(); + { + // check if we don't need to balance + // don't continue if there are no overflow cells + let page = current_page.get().contents.as_mut().unwrap(); + if page.overflow_cells.is_empty() { + let write_info = self.state.mut_write_info().unwrap(); + write_info.state = WriteState::Finish; + return Ok(CursorResult::Ok(())); + } + } + + if !self.stack.has_parent() { + self.balance_root(); return Ok(CursorResult::Ok(())); } + + let write_info = self.state.mut_write_info().unwrap(); + write_info.state = WriteState::BalanceNonRoot; + } + WriteState::BalanceNonRoot + | WriteState::BalanceGetParentPage + | WriteState::BalanceMoveUp => { + return_if_io!(self.balance_non_root()); } - if !self.stack.has_parent() { - self.balance_root(); - return Ok(CursorResult::Ok(())); - } - - let write_info = self.state.mut_write_info().unwrap(); - write_info.state = WriteState::BalanceNonRoot; - self.balance_non_root() + _ => unreachable!("invalid balance leaf state {:?}", state), } - WriteState::BalanceNonRoot - | WriteState::BalanceGetParentPage - | WriteState::BalanceMoveUp => self.balance_non_root(), - - _ => unreachable!("invalid balance leaf state {:?}", state), } } From 8e6569434a5d18bd088babb0cd87ad1e9ca73624 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 18:10:59 +0400 Subject: [PATCH 
15/23] add fuzz --- core/storage/btree.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 4656ab28e..a0202c37a 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2471,6 +2471,24 @@ mod tests { pub fn btree_insert_fuzz_ex() { let _ = env_logger::init(); for sequence in [ + &[ + (777548915, 3364), + (639157228, 3796), + (709175417, 1214), + (390824637, 210), + (906124785, 1481), + (197677875, 1305), + (457946262, 3734), + (956825466, 592), + (835875722, 1334), + (649214013, 1250), + (531143011, 1788), + (765057993, 2351), + (510007766, 1349), + (884516059, 822), + (81604840, 2545), + ] + .as_slice(), &[ (293471650, 2452), (163608869, 627), From 9e4afd1d13af22e75ab712708bc6847a869a7107 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 18:36:26 +0400 Subject: [PATCH 16/23] relax assertion --- core/storage/btree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index a0202c37a..c3dc054ba 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1306,7 +1306,7 @@ impl BTreeCursor { for page in split_pages.iter_mut().take(split_pages_len - 1) { let contents = page.get().contents.as_mut().unwrap(); - assert_eq!(contents.cell_count(), 1); + assert!(contents.cell_count() >= 1); let last_cell = contents.cell_get( contents.cell_count() - 1, self.pager.clone(), From 9049c91863b346708141cf141f44fc31b4de521f Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 18:36:37 +0400 Subject: [PATCH 17/23] find cell in parent node --- core/storage/btree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index c3dc054ba..1ea2901db 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1371,7 +1371,7 @@ impl BTreeCursor { BTreeCell::TableInteriorCell(interior) => interior._rowid, _ => unreachable!(), }; - let 
parent_cell_idx = self.find_cell(contents, key); + let parent_cell_idx = self.find_cell(&parent_contents, key); self.insert_into_cell(parent_contents, cell_payload, parent_cell_idx); // self.drop_cell(*page, 0); } From 6c40f52fc8b6370217418b79033a6cbdb2851cf2 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 18:37:04 +0400 Subject: [PATCH 18/23] separate fuzz tests in categories --- core/storage/btree.rs | 83 +++++++++++++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 22 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 1ea2901db..9db523a59 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2469,7 +2469,6 @@ mod tests { #[test] pub fn btree_insert_fuzz_ex() { - let _ = env_logger::init(); for sequence in [ &[ (777548915, 3364), @@ -2540,44 +2539,84 @@ mod tests { } } - #[test] - pub fn btree_insert_fuzz_run() { - let _ = env_logger::init(); - let mut rng = ChaCha8Rng::seed_from_u64(0); - for _ in 0..128 { + fn rng_from_time() -> (ChaCha8Rng, u64) { + let seed = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(); + let rng = ChaCha8Rng::seed_from_u64(seed); + (rng, seed) + } + + fn btree_insert_fuzz_run( + attempts: usize, + inserts: usize, + size: impl Fn(&mut ChaCha8Rng) -> usize, + ) { + let (mut rng, seed) = rng_from_time(); + log::info!("super seed: {}", seed); + for _ in 0..attempts { let (pager, root_page) = empty_btree(); let mut cursor = BTreeCursor::new(pager.clone(), root_page); let mut keys = Vec::new(); let seed = rng.next_u64(); log::info!("seed: {}", seed); let mut rng = ChaCha8Rng::seed_from_u64(seed); - for _ in 0..16 { - let size = (rng.next_u64() % 4096) as usize; + for _ in 0..inserts { + let size = size(&mut rng); let key = (rng.next_u64() % (1 << 30)) as i64; keys.push(key); log::info!("INSERT INTO t VALUES ({}, randomblob({}));", key, size); let key = OwnedValue::Integer(key); let value = 
Record::new(vec![OwnedValue::Blob(Rc::new(vec![0; size]))]); cursor.insert(&key, &value, false).unwrap(); - log::info!( - "=========== btree ===========\n{}\n\n", - format_btree(pager.clone(), root_page, 0) + } + log::info!( + "=========== btree ===========\n{}\n\n", + format_btree(pager.clone(), root_page, 0) + ); + for key in keys.iter() { + let seek_key = SeekKey::TableRowId(*key as u64); + assert!( + matches!( + cursor.seek(seek_key, SeekOp::EQ).unwrap(), + CursorResult::Ok(true) + ), + "key {} is not found", + key ); - for key in keys.iter() { - let seek_key = SeekKey::TableRowId(*key as u64); - assert!( - matches!( - cursor.seek(seek_key, SeekOp::EQ).unwrap(), - CursorResult::Ok(true) - ), - "key {} is not found", - key - ); - } } } } + #[test] + pub fn btree_insert_fuzz_run_equal_size() { + for size in 1..8 { + log::info!("======= size:{} =======", size); + btree_insert_fuzz_run(2, 1024, |_| size); + } + } + + #[test] + pub fn btree_insert_fuzz_run_random() { + btree_insert_fuzz_run(128, 16, |rng| (rng.next_u32() % 4096) as usize); + } + + #[test] + pub fn btree_insert_fuzz_run_small() { + btree_insert_fuzz_run(1, 1024, |rng| (rng.next_u32() % 128) as usize); + } + + #[test] + pub fn btree_insert_fuzz_run_big() { + btree_insert_fuzz_run(64, 32, |rng| 3 * 1024 + (rng.next_u32() % 1024) as usize); + } + + #[test] + pub fn btree_insert_fuzz_run_overflow() { + btree_insert_fuzz_run(64, 32, |rng| (rng.next_u32() % 32 * 1024) as usize); + } + #[allow(clippy::arc_with_non_send_sync)] fn setup_test_env(database_size: u32) -> (Rc<Pager>, Rc<RefCell<DatabaseHeader>>) { let page_size = 512; From a59589844d0c0e0ea332cbef0c34bf2600a89239 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 19:19:36 +0400 Subject: [PATCH 19/23] fix insertion to the parent --- core/storage/btree.rs | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 
9db523a59..5e5862699 100644 --- 
a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2,7 +2,7 @@ use log::debug; use crate::storage::pager::Pager; use crate::storage::sqlite3_ondisk::{ - read_btree_cell, read_varint, write_varint, BTreeCell, DatabaseHeader, PageContent, PageType, + read_btree_cell, read_varint, BTreeCell, DatabaseHeader, PageContent, PageType, TableInteriorCell, TableLeafCell, }; @@ -1348,33 +1348,18 @@ impl BTreeCursor { self.usable_space(), )?; - if is_leaf { - // create a new divider cell and push - let key = match cell { - BTreeCell::TableLeafCell(leaf) => leaf._rowid, - _ => unreachable!(), - }; - let mut divider_cell = Vec::new(); - divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes()); - divider_cell.extend(std::iter::repeat(0).take(9)); - let n = write_varint(&mut divider_cell.as_mut_slice()[4..], key); - divider_cell.truncate(4 + n); - let parent_cell_idx = self.find_cell(parent_contents, key); - self.insert_into_cell( - parent_contents, - divider_cell.as_slice(), - parent_cell_idx, - ); - } else { - // move cell - let key = match cell { - BTreeCell::TableInteriorCell(interior) => interior._rowid, - _ => unreachable!(), - }; - let parent_cell_idx = self.find_cell(&parent_contents, key); - self.insert_into_cell(parent_contents, cell_payload, parent_cell_idx); - // self.drop_cell(*page, 0); - } + let key = match cell { + BTreeCell::TableLeafCell(TableLeafCell { _rowid, .. }) + | BTreeCell::TableInteriorCell(TableInteriorCell { _rowid, .. 
}) => _rowid, + _ => unreachable!(), + }; + + let mut divider_cell = Vec::with_capacity(4 + 9); // 4 - page id, 9 - max length of varint + divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes()); + write_varint_to_vec(key, &mut divider_cell); + + let parent_cell_idx = self.find_cell(parent_contents, key); + self.insert_into_cell(parent_contents, &divider_cell, parent_cell_idx); } { From e23ea35993667b0843dc6e9887324b8d6a8df9bf Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 19:20:37 +0400 Subject: [PATCH 20/23] add simple B-tree validation func --- core/storage/btree.rs | 67 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5e5862699..8c2244794 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2369,6 +2369,70 @@ mod tests { use std::cell::RefCell; use std::sync::Arc; + fn validate_btree(pager: Rc<Pager>, page_idx: usize) -> (usize, bool) { + let cursor = BTreeCursor::new(pager.clone(), page_idx); + let page = pager.read_page(page_idx).unwrap(); + let page = page.get(); + let contents = page.contents.as_ref().unwrap(); + let page_type = contents.page_type(); + let mut previous_key = None; + let mut valid = true; + let mut depth = None; + for cell_idx in 0..contents.cell_count() { + let cell = contents + .cell_get( + cell_idx, + pager.clone(), + cursor.payload_overflow_threshold_max(page_type), + cursor.payload_overflow_threshold_min(page_type), + cursor.usable_space(), + ) + .unwrap(); + let current_depth = match cell { + BTreeCell::TableLeafCell(..) => 1, + BTreeCell::TableInteriorCell(TableInteriorCell { + _left_child_page, .. 
+ }) => { + let (child_depth, child_valid) = + validate_btree(pager.clone(), _left_child_page as usize); + valid &= child_valid; + child_depth + } + _ => panic!("unsupported btree cell: {:?}", cell), + }; + depth = Some(depth.unwrap_or(current_depth + 1)); + if depth != Some(current_depth + 1) { + log::error!("depth is different for child of page {}", page_idx); + valid = false; + } + match cell { + BTreeCell::TableInteriorCell(TableInteriorCell { _rowid, .. }) + | BTreeCell::TableLeafCell(TableLeafCell { _rowid, .. }) => { + if previous_key.is_some() && previous_key.unwrap() >= _rowid { + log::error!( + "keys are in bad order: prev={:?}, current={}", + previous_key, + _rowid + ); + valid = false; + } + previous_key = Some(_rowid); + } + _ => panic!("unsupported btree cell: {:?}", cell), + } + } + if let Some(right) = contents.rightmost_pointer() { + let (right_depth, right_valid) = validate_btree(pager.clone(), right as usize); + valid &= right_valid; + depth = Some(depth.unwrap_or(right_depth + 1)); + if depth != Some(right_depth + 1) { + log::error!("depth is different for child of page {}", page_idx); + valid = false; + } + } + (depth.unwrap(), valid) + } + fn format_btree(pager: Rc<Pager>, page_idx: usize, depth: usize) -> String { let cursor = BTreeCursor::new(pager.clone(), page_idx); let page = pager.read_page(page_idx).unwrap(); @@ -2560,6 +2624,9 @@ mod tests { "=========== btree ===========\n{}\n\n", format_btree(pager.clone(), root_page, 0) ); + if matches!(validate_btree(pager.clone(), root_page), (_, false)) { + panic!("invalid btree"); + } for key in keys.iter() { let seek_key = SeekKey::TableRowId(*key as u64); assert!( From bc289d314a59b0eea6d3c56cca2d64238655b711 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 19:20:48 +0400 Subject: [PATCH 21/23] adjust test a bit --- core/storage/btree.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 
8c2244794..5ded4ee76 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2611,11 +2611,16 @@ mod tests { let seed = rng.next_u64(); log::info!("seed: {}", seed); let mut rng = ChaCha8Rng::seed_from_u64(seed); - for _ in 0..inserts { + for insert_id in 0..inserts { let size = size(&mut rng); let key = (rng.next_u64() % (1 << 30)) as i64; keys.push(key); - log::info!("INSERT INTO t VALUES ({}, randomblob({}));", key, size); + log::info!( + "INSERT INTO t VALUES ({}, randomblob({})); -- {}", + key, + size, + insert_id + ); let key = OwnedValue::Integer(key); let value = Record::new(vec![OwnedValue::Blob(Rc::new(vec![0; size]))]); cursor.insert(&key, &value, false).unwrap(); From 32b5b0d0197f1a75e866ccfd634ebf981bde5cc2 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 19:26:03 +0400 Subject: [PATCH 22/23] introduce additional condition for cells distribution in order to avoid almost empty pages --- core/storage/btree.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5ded4ee76..1a537767d 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1137,7 +1137,12 @@ impl BTreeCursor { scratch_cells.insert(cell.index, to_static_buf(&cell.payload)); } - // amount of cells for pages involved in split (distributed with naive greedy approach) + // amount of cells for pages involved in split + // the algorithm accumulates cells in a greedy manner with 2 conditions for split: + // 1. new cell would overflow the current page (accumulated + new > usable_space - header_size) + // 2. 
accumulated size has already reached >50% of content_usable_space + // the second condition is necessary, otherwise in case of small cells we will create a lot of almost empty pages + // // if we have single overflow cell in a table leaf node - we still can have 3 split pages // // for example, if current page has 4 entries with size ~1/4 page size, and new cell has size ~page size @@ -1149,7 +1154,9 @@ impl BTreeCursor { let content_usable_space = usable_space - page_copy.header_size(); for scratch_cell in scratch_cells.iter() { let cell_size = scratch_cell.len() + 2; // + cell pointer size (u16) - if last_page_cells_size + cell_size > content_usable_space { + if last_page_cells_size + cell_size > content_usable_space + || 2 * last_page_cells_size > content_usable_space + { split_pages_cells_count.push(last_page_cells_count); last_page_cells_count = 0; last_page_cells_size = 0; From 1b9772e9ad04f4c3f0b811e73a251deba7c4ae72 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 9 Feb 2025 19:36:14 +0400 Subject: [PATCH 23/23] fix clippy --- core/storage/btree.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 1a537767d..bafc89dd3 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2503,6 +2503,7 @@ mod tests { let db_header = DatabaseHeader::default(); let page_size = db_header.page_size as usize; + #[allow(clippy::arc_with_non_send_sync)] let io: Arc<dyn IO> = Arc::new(MemoryIO::new().unwrap()); let io_file = io.open_file("test.db", OpenFlags::Create, false).unwrap(); let page_io = Rc::new(FileStorage::new(io_file));