From bc1fd39a8dc6f8c7c99b8524c1be85b7b92b0071 Mon Sep 17 00:00:00 2001 From: krishvishal Date: Thu, 20 Mar 2025 10:47:41 +0530 Subject: [PATCH 01/12] Add `self.stack.retreat()` when balancing is not needed. Fixes: https://github.com/tursodatabase/limbo/issues/1019 --- core/storage/btree.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index f9c37b14e..c5107fb63 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1885,7 +1885,22 @@ impl BTreeCursor { drop_cell(contents, cell_idx, self.usable_space() as u16)?; } - // TODO(Krishna): Implement balance after delete. I will implement after balance_nonroot is extended. + let page = self.stack.top(); + return_if_locked!(page); + + if !page.is_loaded() { + self.pager.load_page(page.clone())?; + return Ok(CursorResult::IO); + } + + let contents = page.get().contents.as_ref().unwrap(); + let free_space = compute_free_space(contents, self.usable_space() as u16); + if free_space as usize * 3 > self.usable_space() * 2 { + //need balancing + } else { + self.stack.retreat(); + return Ok(CursorResult::Ok(())); + } Ok(CursorResult::Ok(())) } From 7cbd816b2a53cb1a04ad9d335ab2c65b28adb30b Mon Sep 17 00:00:00 2001 From: krishvishal Date: Thu, 20 Mar 2025 12:44:08 +0530 Subject: [PATCH 02/12] Add balancing support for delete operation. --- core/storage/btree.rs | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index c5107fb63..0d76ca9ab 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1895,13 +1895,46 @@ impl BTreeCursor { let contents = page.get().contents.as_ref().unwrap(); let free_space = compute_free_space(contents, self.usable_space() as u16); - if free_space as usize * 3 > self.usable_space() * 2 { - //need balancing + let usable_space = self.usable_space(); + let needs_balancing = free_space as usize * 3 > usable_space * 2; + + if needs_balancing { + let write_info = WriteInfo { + state: WriteState::BalanceStart, + balance_info: RefCell::new(Some(BalanceInfo { + pages_to_balance: Vec::new(), + rightmost_pointer: std::ptr::null_mut(), + divider_cells: Vec::new(), + sibling_count: 0, + first_divider_cell: 0, + })), + }; + self.state = CursorState::Write(write_info); + + loop { + let write_state = self.state.write_info().unwrap().state; + match write_state { + WriteState::Start => { + let write_info = self.state.mut_write_info().unwrap(); + write_info.state = WriteState::BalanceStart; + } + WriteState::BalanceStart + | WriteState::BalanceNonRoot + | WriteState::BalanceNonRootWaitLoadPages => { + return_if_io!(self.balance()); + } + WriteState::Finish => { + self.state = CursorState::None; + return_if_io!(self.move_to(SeekKey::TableRowId(target_rowid), SeekOp::EQ)); + return Ok(CursorResult::Ok(())); + } + } + } } else { + // No balancing needed self.stack.retreat(); - return Ok(CursorResult::Ok(())); + Ok(CursorResult::Ok(())) } - Ok(CursorResult::Ok(())) } pub fn set_null_flag(&mut self, flag: bool) { From e0d54f3691f3cd90f1d2675c073f7c35f2aef007 Mon Sep 17 00:00:00 2001 From: krishvishal Date: Thu, 20 Mar 2025 12:49:17 +0530 Subject: [PATCH 03/12] Initialize `CursorState` properly --- core/storage/btree.rs | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 0d76ca9ab..314d89f2e 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1,3 +1,4 @@ +use io_uring::opcode::Write; use tracing::debug; use crate::storage::pager::Pager; @@ -1899,17 +1900,9 @@ impl BTreeCursor { let needs_balancing = free_space as usize * 3 > usable_space * 2; if needs_balancing { - let write_info = WriteInfo { - state: WriteState::BalanceStart, - balance_info: RefCell::new(Some(BalanceInfo { - pages_to_balance: Vec::new(), - rightmost_pointer: std::ptr::null_mut(), - divider_cells: Vec::new(), - sibling_count: 0, - first_divider_cell: 0, - })), - }; - self.state = CursorState::Write(write_info); + if let CursorState::None = &self.state { + self.state = CursorState::Write(WriteInfo::new()); + } loop { let write_state = self.state.write_info().unwrap().state; From 8c164530fa0f3f498f61f470cbf826ece7e7c0b0 Mon Sep 17 00:00:00 2001 From: krishvishal Date: Mon, 24 Mar 2025 16:37:10 +0530 Subject: [PATCH 04/12] Remove linear search to obtain cell. current_cell_idx - 1 seems to work reliably. --- core/storage/btree.rs | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 314d89f2e..0275b44bf 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1769,33 +1769,8 @@ impl BTreeCursor { None => return Ok(CursorResult::Ok(())), }; - let contents = page.get().contents.as_ref().unwrap(); - - // TODO(Krishna): We are doing this linear search here because seek() is returning the index of previous cell. - // And the fix is currently not very clear to me. - // This finds the cell with matching rowid with in a page. - let mut cell_idx = None; - for idx in 0..contents.cell_count() { - let cell = contents.cell_get( - idx, - self.pager.clone(), - payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), - payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), - self.usable_space(), - )?; - - if let BTreeCell::TableLeafCell(leaf_cell) = cell { - if leaf_cell._rowid == target_rowid { - cell_idx = Some(idx); - break; - } - } - } - - let cell_idx = match cell_idx { - Some(idx) => idx, - None => return Ok(CursorResult::Ok(())), - }; + let mut cell_idx = self.stack.current_cell_index() as usize; + cell_idx -= 1; let contents = page.get().contents.as_ref().unwrap(); let cell = contents.cell_get( From 6a8d1deee6910bd11a785cd848f9717590288971 Mon Sep 17 00:00:00 2001 From: krishvishal Date: Mon, 24 Mar 2025 20:52:17 +0530 Subject: [PATCH 05/12] Convert implementation into event loop style. --- core/storage/btree.rs | 167 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 166 insertions(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 0275b44bf..da0deddfd 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1754,7 +1754,7 @@ impl BTreeCursor { Ok(CursorResult::Ok(())) } - pub fn delete(&mut self) -> Result> { + pub fn delete_old(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); let page = self.stack.top(); return_if_locked!(page); @@ -1905,6 +1905,171 @@ impl BTreeCursor { } } + pub fn delete(&mut self) -> Result> { + assert!(self.mv_cursor.is_none()); + + if let CursorState::None = &self.state { + self.state = CursorState::Write(WriteInfo::new()); + } + + let mut needs_balancing = false; + let mut target_rowid = 0; + + loop { + let write_state = { + let write_info = self.state.mut_write_info().expect("error"); + write_info.state + }; + match write_state { + WriteState::Start => { + let page = self.stack.top(); + return_if_locked!(page); + + if !page.is_loaded() { + self.pager.load_page(page.clone())?; + return Ok(CursorResult::IO); + } + + target_rowid = match self.rowid.get() { + Some(rowid) => rowid, + None => return Ok(CursorResult::Ok(())), + }; + + let mut cell_idx = self.stack.current_cell_index() as usize; + cell_idx -= 1; + let contents = page.get().contents.as_ref().unwrap(); + let cell = contents.cell_get( + cell_idx, + self.pager.clone(), + payload_overflow_threshold_max( + contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + )?; + + if cell_idx >= contents.cell_count() { + return_corrupt!(format!( + "Corrupted page: cell index {} is out of bounds for page with {} cells", + cell_idx, + contents.cell_count() + )); + } + + let original_child_pointer = match &cell { + BTreeCell::TableInteriorCell(interior) => Some(interior._left_child_page), + _ => None, + }; + + return_if_io!(self.clear_overflow_pages(&cell)); + + let page = self.stack.top(); + return_if_locked!(page); + if !page.is_loaded() { + self.pager.load_page(page.clone())?; + return Ok(CursorResult::IO); + } + + page.set_dirty(); + self.pager.add_dirty(page.get().id); + + let contents = page.get().contents.as_mut().unwrap(); + + // If this is an interior node, we need to handle deletion differently + // For interior nodes: + // 1. Move cursor to largest entry in left subtree + // 2. Copy that entry to replace the one being deleted + // 3. Delete the leaf entry + if !contents.is_leaf() { + // 1. Move cursor to largest entry in left subtree + return_if_io!(self.prev()); + + let leaf_page = self.stack.top(); + + // 2. Copy that entry to replace the one being deleted + let leaf_contents = leaf_page.get().contents.as_ref().unwrap(); + let leaf_cell_idx = self.stack.current_cell_index() as usize - 1; + let predecessor_cell = leaf_contents.cell_get( + leaf_cell_idx, + self.pager.clone(), + payload_overflow_threshold_max( + leaf_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + leaf_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + )?; + + // 3. Create an interior cell from the leaf cell + let mut cell_payload: Vec = Vec::new(); + match predecessor_cell { + BTreeCell::TableLeafCell(leaf_cell) => { + // Format: [left child page (4 bytes)][rowid varint] + if let Some(child_pointer) = original_child_pointer { + cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); + write_varint_to_vec(leaf_cell._rowid, &mut cell_payload); + } + } + _ => unreachable!("Expected table leaf cell"), + } + insert_into_cell( + contents, + &cell_payload, + cell_idx, + self.usable_space() as u16, + ) + .unwrap(); + drop_cell(contents, cell_idx, self.usable_space() as u16)?; + } else { + // For leaf nodes, simply remove the cell + drop_cell(contents, cell_idx, self.usable_space() as u16)?; + } + + let page = self.stack.top(); + return_if_locked!(page); + + if !page.is_loaded() { + self.pager.load_page(page.clone())?; + return Ok(CursorResult::IO); + } + + let contents = page.get().contents.as_ref().unwrap(); + let free_space = compute_free_space(contents, self.usable_space() as u16); + let usable_space = self.usable_space(); + needs_balancing = free_space as usize * 3 > usable_space * 2; + let write_info = self.state.mut_write_info().expect("error"); + + if needs_balancing { + write_info.state = WriteState::BalanceStart; + } else { + write_info.state = WriteState::Finish; + } + } + WriteState::BalanceStart + | WriteState::BalanceNonRoot + | WriteState::BalanceNonRootWaitLoadPages => { + return_if_io!(self.balance()); + } + WriteState::Finish => { + if needs_balancing { + self.state = CursorState::None; + return_if_io!(self.move_to(SeekKey::TableRowId(target_rowid), SeekOp::EQ)); + } else { + self.stack.retreat(); + } + return Ok(CursorResult::Ok(())); + } + } + } + } + pub fn set_null_flag(&mut self, flag: bool) { self.null_flag = flag; } From bf1718925e090f2b7fac1092f0f54f1ad937bf7a Mon Sep 17 00:00:00 2001 From: krishvishal Date: Tue, 25 Mar 2025 08:38:18 +0530 Subject: [PATCH 06/12] Delete with balancing now works!! --- core/storage/btree.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index da0deddfd..49a6b10c7 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1,4 +1,3 @@ -use io_uring::opcode::Write; use tracing::debug; use crate::storage::pager::Pager; @@ -2056,14 +2055,16 @@ impl BTreeCursor { | WriteState::BalanceNonRoot | WriteState::BalanceNonRootWaitLoadPages => { return_if_io!(self.balance()); + // TODO(Krishna): Add second balance in the case where deletion causes cursor to end up + // a level deeper. } WriteState::Finish => { if needs_balancing { - self.state = CursorState::None; return_if_io!(self.move_to(SeekKey::TableRowId(target_rowid), SeekOp::EQ)); } else { self.stack.retreat(); } + self.state = CursorState::None; return Ok(CursorResult::Ok(())); } } From ac29563efbfadf0518e51be875f4310e518d7d18 Mon Sep 17 00:00:00 2001 From: krishvishal Date: Tue, 25 Mar 2025 08:55:13 +0530 Subject: [PATCH 07/12] Add test to verify balancing after maintains the following: 1. Delete is working properly. 2. Balancing after delete still maintains the validity of btree. --- core/storage/btree.rs | 57 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 49a6b10c7..d7ca62910 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4557,6 +4557,63 @@ mod tests { dbg!(free); } + #[test] + pub fn test_delete_balancing() { + let (pager, root_page) = empty_btree(); + + for i in 1..=10000 { + let mut cursor = BTreeCursor::new(None, pager.clone(), root_page); + let key = OwnedValue::Integer(i); + let value = Record::new(vec![OwnedValue::Text(Text::new("hello world"))]); + + run_until_done( + || { + let key = SeekKey::TableRowId(i as u64); + cursor.move_to(key, SeekOp::EQ) + }, + pager.deref(), + ) + .unwrap(); + + run_until_done(|| cursor.insert(&key, &value, true), pager.deref()).unwrap(); + } + + match validate_btree(pager.clone(), root_page) { + (_, false) => panic!("Invalid B-tree after insertion"), + _ => {} + } + + for i in 500..=3500 { + let mut cursor = BTreeCursor::new(None, pager.clone(), root_page); + let seek_key = SeekKey::TableRowId(i as u64); + + let found = run_until_done(|| cursor.seek(seek_key.clone(), SeekOp::EQ), pager.deref()) + .unwrap(); + + if found { + run_until_done(|| cursor.delete(), pager.deref()).unwrap(); + } + } + + for i in 1..=10000 { + if i >= 500 && i <= 3500 { + continue; + } + + let mut cursor = BTreeCursor::new(None, pager.clone(), root_page); + let key = OwnedValue::Integer(i); + let exists = run_until_done(|| cursor.exists(&key), pager.deref()).unwrap(); + assert!(exists, "Key {} should exist but doesn't", i); + } + + for i in 500..=3500 { + let mut cursor = BTreeCursor::new(None, pager.clone(), root_page); + let key = OwnedValue::Integer(i); + let exists = run_until_done(|| cursor.exists(&key), pager.deref()).unwrap(); + assert!(!exists, "Deleted key {} still exists", i); + } + } + fn run_until_done( mut action: impl FnMut() -> Result>, pager: &Pager, From 6fe7e419e23cf0243ed478854a245927d973244a Mon Sep 17 00:00:00 2001 From: krishvishal Date: Tue, 25 Mar 2025 08:58:18 +0530 Subject: [PATCH 08/12] cleanup --- core/storage/btree.rs | 151 ------------------------------------------ 1 file changed, 151 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index d7ca62910..dad680f6d 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1753,157 +1753,6 @@ impl BTreeCursor { Ok(CursorResult::Ok(())) } - pub fn delete_old(&mut self) -> Result> { - assert!(self.mv_cursor.is_none()); - let page = self.stack.top(); - return_if_locked!(page); - - if !page.is_loaded() { - self.pager.load_page(page.clone())?; - return Ok(CursorResult::IO); - } - - let target_rowid = match self.rowid.get() { - Some(rowid) => rowid, - None => return Ok(CursorResult::Ok(())), - }; - - let mut cell_idx = self.stack.current_cell_index() as usize; - cell_idx -= 1; - - let contents = page.get().contents.as_ref().unwrap(); - let cell = contents.cell_get( - cell_idx, - self.pager.clone(), - payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), - payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), - self.usable_space(), - )?; - - if cell_idx >= contents.cell_count() { - return_corrupt!(format!( - "Corrupted page: cell index {} is out of bounds for page with {} cells", - cell_idx, - contents.cell_count() - )); - } - - let original_child_pointer = match &cell { - BTreeCell::TableInteriorCell(interior) => Some(interior._left_child_page), - _ => None, - }; - - return_if_io!(self.clear_overflow_pages(&cell)); - - let page = self.stack.top(); - return_if_locked!(page); - if !page.is_loaded() { - self.pager.load_page(page.clone())?; - return Ok(CursorResult::IO); - } - - page.set_dirty(); - self.pager.add_dirty(page.get().id); - - let contents = page.get().contents.as_mut().unwrap(); - - // If this is an interior node, we need to handle deletion differently - // For interior nodes: - // 1. Move cursor to largest entry in left subtree - // 2. Copy that entry to replace the one being deleted - // 3. Delete the leaf entry - if !contents.is_leaf() { - // 1. Move cursor to largest entry in left subtree - return_if_io!(self.prev()); - - let leaf_page = self.stack.top(); - - // 2. Copy that entry to replace the one being deleted - let leaf_contents = leaf_page.get().contents.as_ref().unwrap(); - let leaf_cell_idx = self.stack.current_cell_index() as usize - 1; - let predecessor_cell = leaf_contents.cell_get( - leaf_cell_idx, - self.pager.clone(), - payload_overflow_threshold_max( - leaf_contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - leaf_contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - - // 3. Create an interior cell from the leaf cell - let mut cell_payload: Vec = Vec::new(); - match predecessor_cell { - BTreeCell::TableLeafCell(leaf_cell) => { - // Format: [left child page (4 bytes)][rowid varint] - if let Some(child_pointer) = original_child_pointer { - cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); - write_varint_to_vec(leaf_cell._rowid, &mut cell_payload); - } - } - _ => unreachable!("Expected table leaf cell"), - } - insert_into_cell( - contents, - &cell_payload, - cell_idx, - self.usable_space() as u16, - ) - .unwrap(); - drop_cell(contents, cell_idx, self.usable_space() as u16)?; - } else { - // For leaf nodes, simply remove the cell - drop_cell(contents, cell_idx, self.usable_space() as u16)?; - } - - let page = self.stack.top(); - return_if_locked!(page); - - if !page.is_loaded() { - self.pager.load_page(page.clone())?; - return Ok(CursorResult::IO); - } - - let contents = page.get().contents.as_ref().unwrap(); - let free_space = compute_free_space(contents, self.usable_space() as u16); - let usable_space = self.usable_space(); - let needs_balancing = free_space as usize * 3 > usable_space * 2; - - if needs_balancing { - if let CursorState::None = &self.state { - self.state = CursorState::Write(WriteInfo::new()); - } - - loop { - let write_state = self.state.write_info().unwrap().state; - match write_state { - WriteState::Start => { - let write_info = self.state.mut_write_info().unwrap(); - write_info.state = WriteState::BalanceStart; - } - WriteState::BalanceStart - | WriteState::BalanceNonRoot - | WriteState::BalanceNonRootWaitLoadPages => { - return_if_io!(self.balance()); - } - WriteState::Finish => { - self.state = CursorState::None; - return_if_io!(self.move_to(SeekKey::TableRowId(target_rowid), SeekOp::EQ)); - return Ok(CursorResult::Ok(())); - } - } - } - } else { - // No balancing needed - self.stack.retreat(); - Ok(CursorResult::Ok(())) - } - } - pub fn delete(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); From 16d77acac64779846b9cce60b97fb6ad7788c38b Mon Sep 17 00:00:00 2001 From: krishvishal Date: Wed, 26 Mar 2025 12:29:19 +0530 Subject: [PATCH 09/12] Add comments describing the `test_delete_balancing` --- core/storage/btree.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index dad680f6d..d0a59c537 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4408,8 +4408,17 @@ mod tests { #[test] pub fn test_delete_balancing() { + // What does this test do: + // 1. Insert 10,000 rows of ~15 byte payload each. This creates + // nearly 40 pages (10,000 * 15 / 4096) and 240 rows per page. + // 2. Delete enough rows to create empty/ nearly empty pages to trigger balancing + // (verified this in SQLite). + // 3. Verify validity/integrity of btree after deleting and also verify that these + // values are actually deleted. + let (pager, root_page) = empty_btree(); + // Insert 10,000 records in to the BTree. for i in 1..=10000 { let mut cursor = BTreeCursor::new(None, pager.clone(), root_page); let key = OwnedValue::Integer(i); @@ -4432,6 +4441,7 @@ mod tests { _ => {} } + // Delete records with 500 <= key <= 3500 for i in 500..=3500 { let mut cursor = BTreeCursor::new(None, pager.clone(), root_page); let seek_key = SeekKey::TableRowId(i as u64); @@ -4444,6 +4454,7 @@ mod tests { } } + // Verify that records with key < 500 and key > 3500 still exist in the BTree. for i in 1..=10000 { if i >= 500 && i <= 3500 { continue; @@ -4455,6 +4466,7 @@ mod tests { assert!(exists, "Key {} should exist but doesn't", i); } + // Verify the deleted records don't exist. for i in 500..=3500 { let mut cursor = BTreeCursor::new(None, pager.clone(), root_page); let key = OwnedValue::Integer(i); From 3e3a0f56a1d56fad9fa5034692b8e3bb3eb29071 Mon Sep 17 00:00:00 2001 From: krishvishal Date: Wed, 26 Mar 2025 22:44:49 +0530 Subject: [PATCH 10/12] Make delete re-entrant. Setup DeleteState enum --- core/storage/btree.rs | 297 +++++++++++++++++++++++++++++------------- 1 file changed, 203 insertions(+), 94 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index d0a59c537..67742d3c8 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -89,6 +89,43 @@ struct DestroyInfo { state: DestroyState, } +#[derive(Debug, Clone)] +enum DeleteState { + Start, + LoadPage, + FindCell, + ClearOverflowPages { + cell_idx: usize, + cell: BTreeCell, + original_child_pointer: Option, + }, + InteriorNodeReplacement { + cell_idx: usize, + original_child_pointer: Option, + }, + DropCell { + cell_idx: usize, + }, + CheckNeedsBalancing, + StartBalancing { + target_rowid: u64, + }, + WaitForBalancingToComplete { + target_rowid: u64, + }, + SeekAfterBalancing { + target_rowid: u64, + }, + StackRetreat, + Finish, +} + +#[derive(Clone)] +struct DeleteInfo { + state: DeleteState, + balance_write_info: Option, +} + /// State machine of a write operation. /// May involve balancing due to overflow. #[derive(Debug, Clone, Copy)] @@ -100,6 +137,7 @@ enum WriteState { Finish, } +#[derive(Clone)] struct BalanceInfo { /// Old pages being balanced. pages_to_balance: Vec, @@ -113,6 +151,7 @@ struct BalanceInfo { first_divider_cell: usize, } +#[derive(Clone)] struct WriteInfo { /// State of the write operation state machine. state: WriteState, @@ -134,6 +173,7 @@ enum CursorState { None, Write(WriteInfo), Destroy(DestroyInfo), + Delete(DeleteInfo), } impl CursorState { @@ -162,6 +202,20 @@ impl CursorState { _ => None, } } + + fn delete_info(&self) -> Option<&DeleteInfo> { + match self { + CursorState::Delete(x) => Some(x), + _ => None, + } + } + + fn mut_delete_info(&mut self) -> Option<&mut DeleteInfo> { + match self { + CursorState::Delete(x) => Some(x), + _ => None, + } + } } enum OverflowState { @@ -1757,19 +1811,33 @@ impl BTreeCursor { assert!(self.mv_cursor.is_none()); if let CursorState::None = &self.state { - self.state = CursorState::Write(WriteInfo::new()); + self.state = CursorState::Delete(DeleteInfo { + state: DeleteState::Start, + balance_write_info: None, + }) } - let mut needs_balancing = false; - let mut target_rowid = 0; - loop { - let write_state = { - let write_info = self.state.mut_write_info().expect("error"); - write_info.state + let delete_state = { + let delete_info = self.state.delete_info().expect("cannot get delete info"); + delete_info.state.clone() }; - match write_state { - WriteState::Start => { + + match delete_state { + DeleteState::Start => { + let _target_rowid = match self.rowid.get() { + Some(rowid) => rowid, + None => { + self.state = CursorState::None; + return Ok(CursorResult::Ok(())); + } + }; + + let delete_info = self.state.mut_delete_info().unwrap(); + delete_info.state = DeleteState::LoadPage; + } + + DeleteState::LoadPage => { let page = self.stack.top(); return_if_locked!(page); @@ -1778,14 +1846,24 @@ impl BTreeCursor { return Ok(CursorResult::IO); } - target_rowid = match self.rowid.get() { - Some(rowid) => rowid, - None => return Ok(CursorResult::Ok(())), - }; + let delete_info = self.state.mut_delete_info().unwrap(); + delete_info.state = DeleteState::FindCell; + } + DeleteState::FindCell => { + let page = self.stack.top(); let mut cell_idx = self.stack.current_cell_index() as usize; cell_idx -= 1; + let contents = page.get().contents.as_ref().unwrap(); + if cell_idx >= contents.cell_count() { + return_corrupt!(format!( + "Corrupted page: cell index {} is out of bounds for page with {} cells", + cell_idx, + contents.cell_count() + )); + } + let cell = contents.cell_get( cell_idx, self.pager.clone(), @@ -1800,23 +1878,116 @@ impl BTreeCursor { self.usable_space(), )?; - if cell_idx >= contents.cell_count() { - return_corrupt!(format!( - "Corrupted page: cell index {} is out of bounds for page with {} cells", - cell_idx, - contents.cell_count() - )); - } - let original_child_pointer = match &cell { BTreeCell::TableInteriorCell(interior) => Some(interior._left_child_page), _ => None, }; + let delete_info = self.state.mut_delete_info().unwrap(); + delete_info.state = DeleteState::ClearOverflowPages { + cell_idx, + cell, + original_child_pointer, + }; + } + + DeleteState::ClearOverflowPages { + cell_idx, + cell, + original_child_pointer, + } => { return_if_io!(self.clear_overflow_pages(&cell)); + let page = self.stack.top(); + let contents = page.get().contents.as_ref().unwrap(); + + let delete_info = self.state.mut_delete_info().unwrap(); + if !contents.is_leaf() { + delete_info.state = DeleteState::InteriorNodeReplacement { + cell_idx, + original_child_pointer, + }; + } else { + delete_info.state = DeleteState::DropCell { cell_idx }; + } + } + + DeleteState::InteriorNodeReplacement { + cell_idx, + original_child_pointer, + } => { + // Move to the largest key in the left subtree + return_if_io!(self.prev()); + + let leaf_page = self.stack.top(); + return_if_locked!(leaf_page); + + if !leaf_page.is_loaded() { + self.pager.load_page(leaf_page.clone())?; + return Ok(CursorResult::IO); + } + + let parent_page = { + self.stack.pop(); + let parent = self.stack.top(); + self.stack.push(leaf_page.clone()); + parent + }; + + if !parent_page.is_loaded() { + self.pager.load_page(parent_page.clone())?; + return Ok(CursorResult::IO); + } + + let leaf_contents = leaf_page.get().contents.as_ref().unwrap(); + let leaf_cell_idx = self.stack.current_cell_index() as usize - 1; + let predecessor_cell = leaf_contents.cell_get( + leaf_cell_idx, + self.pager.clone(), + payload_overflow_threshold_max( + leaf_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + leaf_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + )?; + + parent_page.set_dirty(); + self.pager.add_dirty(parent_page.get().id); + + let parent_contents = parent_page.get().contents.as_mut().unwrap(); + + // Create an interior cell from the leaf cell + let mut cell_payload: Vec = Vec::new(); + match predecessor_cell { + BTreeCell::TableLeafCell(leaf_cell) => { + if let Some(child_pointer) = original_child_pointer { + cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); + write_varint_to_vec(leaf_cell._rowid, &mut cell_payload); + } + } + _ => unreachable!("Expected table leaf cell"), + } + + insert_into_cell( + parent_contents, + &cell_payload, + cell_idx, + self.usable_space() as u16, + )?; + drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?; + + let delete_info = self.state.mut_delete_info().unwrap(); + delete_info.state = DeleteState::CheckNeedsBalancing; + } + + DeleteState::DropCell { cell_idx } => { let page = self.stack.top(); return_if_locked!(page); + if !page.is_loaded() { self.pager.load_page(page.clone())?; return Ok(CursorResult::IO); @@ -1826,60 +1997,13 @@ impl BTreeCursor { self.pager.add_dirty(page.get().id); let contents = page.get().contents.as_mut().unwrap(); + drop_cell(contents, cell_idx, self.usable_space() as u16)?; - // If this is an interior node, we need to handle deletion differently - // For interior nodes: - // 1. Move cursor to largest entry in left subtree - // 2. Copy that entry to replace the one being deleted - // 3. Delete the leaf entry - if !contents.is_leaf() { - // 1. Move cursor to largest entry in left subtree - return_if_io!(self.prev()); - - let leaf_page = self.stack.top(); - - // 2. Copy that entry to replace the one being deleted - let leaf_contents = leaf_page.get().contents.as_ref().unwrap(); - let leaf_cell_idx = self.stack.current_cell_index() as usize - 1; - let predecessor_cell = leaf_contents.cell_get( - leaf_cell_idx, - self.pager.clone(), - payload_overflow_threshold_max( - leaf_contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - leaf_contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - - // 3. Create an interior cell from the leaf cell - let mut cell_payload: Vec = Vec::new(); - match predecessor_cell { - BTreeCell::TableLeafCell(leaf_cell) => { - // Format: [left child page (4 bytes)][rowid varint] - if let Some(child_pointer) = original_child_pointer { - cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); - write_varint_to_vec(leaf_cell._rowid, &mut cell_payload); - } - } - _ => unreachable!("Expected table leaf cell"), - } - insert_into_cell( - contents, - &cell_payload, - cell_idx, - self.usable_space() as u16, - ) - .unwrap(); - drop_cell(contents, cell_idx, self.usable_space() as u16)?; - } else { - // For leaf nodes, simply remove the cell - drop_cell(contents, cell_idx, self.usable_space() as u16)?; - } + let delete_info = self.state.mut_delete_info().unwrap(); + delete_info.state = DeleteState::CheckNeedsBalancing; + } + DeleteState::CheckNeedsBalancing => { let page = self.stack.top(); return_if_locked!(page); @@ -1890,32 +2014,17 @@ impl BTreeCursor { let contents = page.get().contents.as_ref().unwrap(); let free_space = compute_free_space(contents, self.usable_space() as u16); - let usable_space = self.usable_space(); - needs_balancing = free_space as usize * 3 > usable_space * 2; - let write_info = self.state.mut_write_info().expect("error"); + let needs_balancing = free_space as usize * 3 > self.usable_space() * 2; + let target_rowid = self.rowid.get().unwrap(); + + let delete_info = self.state.mut_delete_info().unwrap(); if needs_balancing { - write_info.state = WriteState::BalanceStart; + delete_info.state = DeleteState::StartBalancing { target_rowid }; } else { - write_info.state = WriteState::Finish; + delete_info.state = DeleteState::StackRetreat; } } - WriteState::BalanceStart - | WriteState::BalanceNonRoot - | WriteState::BalanceNonRootWaitLoadPages => { - return_if_io!(self.balance()); - // TODO(Krishna): Add second balance in the case where deletion causes cursor to end up - // a level deeper. - } - WriteState::Finish => { - if needs_balancing { - return_if_io!(self.move_to(SeekKey::TableRowId(target_rowid), SeekOp::EQ)); - } else { - self.stack.retreat(); - } - self.state = CursorState::None; - return Ok(CursorResult::Ok(())); - } } } } From fe7ff6f53d1f3c7b836d81eee96d2cfeba89f1b3 Mon Sep 17 00:00:00 2001 From: krishvishal Date: Thu, 27 Mar 2025 00:27:03 +0530 Subject: [PATCH 11/12] Add balancing states. Balancing test works. --- core/storage/btree.rs | 69 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 67742d3c8..c872a70a8 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2025,6 +2025,75 @@ impl BTreeCursor { delete_info.state = DeleteState::StackRetreat; } } + + DeleteState::StartBalancing { target_rowid } => { + let delete_info = self.state.mut_delete_info().unwrap(); + + if delete_info.balance_write_info.is_none() { + let mut write_info = WriteInfo::new(); + write_info.state = WriteState::BalanceStart; + delete_info.balance_write_info = Some(write_info); + } + + delete_info.state = DeleteState::WaitForBalancingToComplete { target_rowid } + } + + DeleteState::WaitForBalancingToComplete { target_rowid } => { + let delete_info = self.state.mut_delete_info().unwrap(); + + // Switch the CursorState to Write state for balancing + let write_info = delete_info.balance_write_info.take().unwrap(); + self.state = CursorState::Write(write_info); + + match self.balance()? { + CursorResult::Ok(()) => { + let write_info = match &self.state { + CursorState::Write(wi) => wi.clone(), + _ => unreachable!("Balance operation changed cursor state"), + }; + + // Move to seek state + self.state = CursorState::Delete(DeleteInfo { + state: DeleteState::SeekAfterBalancing { target_rowid }, + balance_write_info: Some(write_info), + }); + } + + CursorResult::IO => { + // Save balance progress and return IO + let write_info = match &self.state { + CursorState::Write(wi) => wi.clone(), + _ => unreachable!("Balance operation changed cursor state"), + }; + + self.state = CursorState::Delete(DeleteInfo { + state: DeleteState::WaitForBalancingToComplete { target_rowid }, + balance_write_info: Some(write_info), + }); + return Ok(CursorResult::IO); + } + } + } + + DeleteState::SeekAfterBalancing { target_rowid } => { + return_if_io!(self.move_to(SeekKey::TableRowId(target_rowid), SeekOp::EQ)); + + let delete_info = self.state.mut_delete_info().unwrap(); + delete_info.state = DeleteState::Finish; + delete_info.balance_write_info = None; + } + + DeleteState::StackRetreat => { + self.stack.retreat(); + let delete_info = self.state.mut_delete_info().unwrap(); + delete_info.state = DeleteState::Finish; + delete_info.balance_write_info = None; + } + + DeleteState::Finish => { + self.state = CursorState::None; + return Ok(CursorResult::Ok(())); + } } } } From 1b77b76d583d59b34f01a1eca41e54b9a3109f2a Mon Sep 17 00:00:00 2001 From: krishvishal Date: Thu, 27 Mar 2025 01:02:59 +0530 Subject: [PATCH 12/12] Add comments detailing Delete state machine flow and few delete steps. --- core/storage/btree.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index c872a70a8..5d61bc34b 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1807,6 +1807,19 @@ impl BTreeCursor { Ok(CursorResult::Ok(())) } + /// Delete state machine flow: + /// 1. Start -> check if the rowid to be delete is present in the page or not. If not we early return + /// 2. LoadPage -> load the page. + /// 3. FindCell -> find the cell to be deleted in the page. + /// 4. ClearOverflowPages -> clear overflow pages associated with the cell. here if the cell is a leaf page go to DropCell state + /// or else go to InteriorNodeReplacement + /// 5. InteriorNodeReplacement -> we copy the left subtree leaf node into the deleted interior node's place. + /// 6. DropCell -> only for leaf nodes. drop the cell. + /// 7. CheckNeedsBalancing -> check if balancing is needed. If yes, move to StartBalancing else move to StackRetreat + /// 8. WaitForBalancingToComplete -> perform balancing + /// 9. SeekAfterBalancing -> adjust the cursor to a node that is closer to the deleted value. go to Finish + /// 10. StackRetreat -> perform stack retreat for cursor positioning. only when balancing is not needed. go to Finish + /// 11. Finish -> Delete operation is done. Return CursorResult(Ok()) pub fn delete(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); @@ -1916,6 +1929,12 @@ impl BTreeCursor { cell_idx, original_child_pointer, } => { + // This is an interior node, we need to handle deletion differently + // For interior nodes: + // 1. Move cursor to largest entry in left subtree + // 2. Copy that entry to replace the one being deleted + // 3. Delete the leaf entry + // Move to the largest key in the left subtree return_if_io!(self.prev()); @@ -2046,6 +2065,8 @@ impl BTreeCursor { self.state = CursorState::Write(write_info); match self.balance()? { + // TODO(Krishna): Add second balance in the case where deletion causes cursor to end up + // a level deeper. CursorResult::Ok(()) => { let write_info = match &self.state { CursorState::Write(wi) => wi.clone(),