From 0698249c5e32059a6f9dcb0f2bb9cac67a4d5eae Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 2 Jun 2025 15:01:51 +0200 Subject: [PATCH 01/42] test_delete_with_index --- .../query_processing/test_write_path.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/integration/query_processing/test_write_path.rs b/tests/integration/query_processing/test_write_path.rs index 624fcca55..edebd2821 100644 --- a/tests/integration/query_processing/test_write_path.rs +++ b/tests/integration/query_processing/test_write_path.rs @@ -428,6 +428,25 @@ fn test_update_with_index() -> anyhow::Result<()> { Ok(()) } +#[test] +fn test_delete_with_index() -> anyhow::Result<()> { + let _ = env_logger::try_init(); + + maybe_setup_tracing(); + + let tmp_db = TempDatabase::new_with_rusqlite("CREATE TABLE t(x UNIQUE)"); + let conn = tmp_db.connect_limbo(); + + run_query(&tmp_db, &conn, "INSERT INTO t VALUES (1), (2)")?; + run_query(&tmp_db, &conn, "DELETE FROM t WHERE x >= 1")?; + + run_query_on_row(&tmp_db, &conn, "SELECT * FROM t", |_| { + panic!("Delete should've deleted every row!"); + })?; + + Ok(()) +} + fn run_query(tmp_db: &TempDatabase, conn: &Rc, query: &str) -> anyhow::Result<()> { run_query_core(tmp_db, conn, query, None::) } From 105de7e1d861c048c01cef845de99f38352d19c8 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 3 Jun 2025 14:08:47 +0200 Subject: [PATCH 02/42] seek, next and prev return bool --- core/storage/btree.rs | 470 +++++------------------------------------- 1 file changed, 52 insertions(+), 418 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 75a9a7e36..11517c9d8 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -602,7 +602,7 @@ pub struct BTreeCursor { /// Page id of the root page used to go back up fast. root_page: usize, /// Rowid and record are stored before being consumed. - has_record: Cell, + has_record: Cell, null_flag: bool, /// Index internal pages are consumed on the way up, so we store going upwards flag in case /// we just moved to a parent page and the parent page is an internal index page which requires @@ -735,13 +735,9 @@ impl BTreeCursor { /// Move the cursor to the previous record and return it. /// Used in backwards iteration. - fn get_prev_record( - &mut self, - predicate: Option<(SeekKey<'_>, SeekOp)>, - ) -> Result> { + fn get_prev_record(&mut self) -> Result> { loop { let page = self.stack.top(); - let cell_idx = self.stack.current_cell_index(); // moved to beginning of current page // todo: find a better way to flag moved to end or begin of page @@ -755,19 +751,20 @@ impl BTreeCursor { self.stack.pop(); } else { // moved to begin of btree - return Ok(CursorResult::Ok(CursorHasRecord::No)); + return Ok(CursorResult::Ok(false)); } } // continue to next loop to get record from the new page continue; } - let cell_idx = cell_idx as usize; return_if_locked_maybe_load!(self.pager, page); + self.stack.retreat(); let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); + let cell_idx = self.stack.current_cell_index() as usize; // If we are at the end of the page and we haven't just come back from the right child, // we now need to move to the rightmost child. @@ -780,12 +777,10 @@ impl BTreeCursor { } } - let cell_idx = if cell_idx >= cell_count { + if cell_idx >= cell_count { self.stack.set_cell_index(cell_count as i32 - 1); - cell_count - 1 - } else { - cell_idx - }; + } + let cell_idx = self.stack.current_cell_index() as usize; let cell = contents.cell_get( cell_idx, @@ -800,34 +795,14 @@ impl BTreeCursor { _rowid, }) => { let mem_page = self.read_page(_left_child_page as usize)?; - self.stack.retreat(); self.stack.push_backwards(mem_page); continue; } - BTreeCell::TableLeafCell(TableLeafCell { - _rowid, - _payload, - first_overflow_page, - payload_size, - }) => { - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read(_payload, next_page, payload_size)) - } else { - crate::storage::sqlite3_ondisk::read_record( - _payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; - self.stack.retreat(); - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: Some(_rowid), - })); + BTreeCell::TableLeafCell(TableLeafCell { .. }) => { + return Ok(CursorResult::Ok(true)); } BTreeCell::IndexInteriorCell(IndexInteriorCell { - payload, - left_child_page, - first_overflow_page, - payload_size, + left_child_page, .. }) => { if !self.going_upwards { // In backwards iteration, if we haven't just moved to this interior node from the @@ -837,116 +812,19 @@ impl BTreeCursor { // this parent: key 666 // left child has: key 663, key 664, key 665 // we need to move to the previous parent (with e.g. key 662) when iterating backwards. - self.stack.retreat(); let mem_page = self.read_page(left_child_page as usize)?; - self.stack.push(mem_page); - // use cell_index = i32::MAX to tell next loop to go to the end of the current page - self.stack.set_cell_index(i32::MAX); + self.stack.push_backwards(mem_page); continue; } - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read(payload, next_page, payload_size)) - } else { - crate::storage::sqlite3_ondisk::read_record( - payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; // Going upwards = we just moved to an interior cell from the right child. // On the first pass we must take the record from the interior cell (since unlike table btrees, index interior cells have payloads) // We then mark going_upwards=false so that we go back down the tree on the next invocation. self.going_upwards = false; - if predicate.is_none() { - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - })); - } - - let (key, op) = predicate.as_ref().unwrap(); - let SeekKey::IndexKey(index_key) = key else { - unreachable!("index seek key should be a record"); - }; - let order = { - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_values = record.get_values(); - let record_slice_same_num_cols = - &record_values[..index_key.get_values().len()]; - let order = compare_immutable( - record_slice_same_num_cols, - index_key.get_values(), - self.key_sort_order(), - &self.collations, - ); - order - }; - - let found = match op { - SeekOp::EQ => order.is_eq(), - SeekOp::LE => order.is_le(), - SeekOp::LT => order.is_lt(), - _ => unreachable!("Seek GT/GE should not happen in get_prev_record() because we are iterating backwards"), - }; - if found { - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - })); - } else { - continue; - } + return Ok(CursorResult::Ok(true)); } - BTreeCell::IndexLeafCell(IndexLeafCell { - payload, - first_overflow_page, - payload_size, - }) => { - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read(payload, next_page, payload_size)) - } else { - crate::storage::sqlite3_ondisk::read_record( - payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; - - self.stack.retreat(); - if predicate.is_none() { - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - })); - } - let (key, op) = predicate.as_ref().unwrap(); - let SeekKey::IndexKey(index_key) = key else { - unreachable!("index seek key should be a record"); - }; - let order = { - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_values = record.get_values(); - let record_slice_same_num_cols = - &record_values[..index_key.get_values().len()]; - let order = compare_immutable( - record_slice_same_num_cols, - index_key.get_values(), - self.key_sort_order(), - &self.collations, - ); - order - }; - let found = match op { - SeekOp::EQ => order.is_eq(), - SeekOp::LE => order.is_le(), - SeekOp::LT => order.is_lt(), - _ => unreachable!("Seek GT/GE should not happen in get_prev_record() because we are iterating backwards"), - }; - if found { - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - })); - } else { - continue; - } + BTreeCell::IndexLeafCell(IndexLeafCell { .. }) => { + return Ok(CursorResult::Ok(true)); } } } @@ -1362,26 +1240,16 @@ impl BTreeCursor { /// Move the cursor to the next record and return it. /// Used in forwards iteration, which is the default. - fn get_next_record( - &mut self, - predicate: Option<(SeekKey<'_>, SeekOp)>, - ) -> Result> { + fn get_next_record(&mut self) -> Result> { if let Some(mv_cursor) = &self.mv_cursor { let mut mv_cursor = mv_cursor.borrow_mut(); let rowid = mv_cursor.current_row_id(); match rowid { - Some(rowid) => { - let record = mv_cursor.current_row().unwrap().unwrap(); - crate::storage::sqlite3_ondisk::read_record( - &record.data, - self.get_immutable_record_or_create().as_mut().unwrap(), - )?; + Some(_rowid) => { mv_cursor.forward(); - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: Some(rowid.row_id), - })); + return Ok(CursorResult::Ok(true)); } - None => return Ok(CursorResult::Ok(CursorHasRecord::No)), + None => return Ok(CursorResult::Ok(false)), } } loop { @@ -1394,6 +1262,8 @@ impl BTreeCursor { cell_idx ); return_if_locked_maybe_load!(self.pager, mem_page_rc); + // Important to advance only after loading the page in order to not advance > 1 times + self.stack.advance(); let mem_page = mem_page_rc.get(); let contents = mem_page.get().contents.as_ref().unwrap(); @@ -1416,7 +1286,7 @@ impl BTreeCursor { self.stack.pop(); continue; } else { - return Ok(CursorResult::Ok(CursorHasRecord::No)); + return Ok(CursorResult::Ok(false)); } } } @@ -1431,7 +1301,7 @@ impl BTreeCursor { self.stack.pop(); continue; } else { - return Ok(CursorResult::Ok(CursorHasRecord::No)); + return Ok(CursorResult::Ok(false)); } } assert!(cell_idx < contents.cell_count()); @@ -1447,156 +1317,27 @@ impl BTreeCursor { _left_child_page, _rowid, }) => { - assert!(predicate.is_none()); - self.stack.advance(); let mem_page = self.read_page(*_left_child_page as usize)?; self.stack.push(mem_page); continue; } - BTreeCell::TableLeafCell(TableLeafCell { - _rowid, - _payload, - payload_size, - first_overflow_page, - }) => { - assert!(predicate.is_none()); - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read( - _payload, - *next_page, - *payload_size - )) - } else { - crate::storage::sqlite3_ondisk::read_record( - _payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; - self.stack.advance(); - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: Some(*_rowid), - })); + BTreeCell::TableLeafCell(TableLeafCell { .. }) => { + return Ok(CursorResult::Ok(true)); } BTreeCell::IndexInteriorCell(IndexInteriorCell { - payload, - left_child_page, - first_overflow_page, - payload_size, + left_child_page, .. }) => { if !self.going_upwards { let mem_page = self.read_page(*left_child_page as usize)?; self.stack.push(mem_page); continue; } - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read( - payload, - *next_page, - *payload_size - )) - } else { - crate::storage::sqlite3_ondisk::read_record( - payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; self.going_upwards = false; - self.stack.advance(); - if predicate.is_none() { - let cursor_has_record = CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - }; - return Ok(CursorResult::Ok(cursor_has_record)); - } - - let (key, op) = predicate.as_ref().unwrap(); - let SeekKey::IndexKey(index_key) = key else { - unreachable!("index seek key should be a record"); - }; - let order = { - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_slice_same_num_cols = - &record.get_values()[..index_key.get_values().len()]; - let order = compare_immutable( - record_slice_same_num_cols, - index_key.get_values(), - self.key_sort_order(), - &self.collations, - ); - order - }; - let found = match op { - SeekOp::GT => order.is_gt(), - SeekOp::GE => order.is_ge(), - SeekOp::EQ => order.is_eq(), - _ => unreachable!("Seek LE/LT should not happen in get_next_record() because we are iterating forwards"), - }; - if found { - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - })); - } else { - continue; - } + return Ok(CursorResult::Ok(true)); } - BTreeCell::IndexLeafCell(IndexLeafCell { - payload, - first_overflow_page, - payload_size, - }) => { - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read( - payload, - *next_page, - *payload_size - )) - } else { - crate::storage::sqlite3_ondisk::read_record( - payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; - - self.stack.advance(); - if predicate.is_none() { - let cursor_has_record = CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - }; - return Ok(CursorResult::Ok(cursor_has_record)); - } - let (key, op) = predicate.as_ref().unwrap(); - let SeekKey::IndexKey(index_key) = key else { - unreachable!("index seek key should be a record"); - }; - let order = { - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_slice_same_num_cols = - &record.get_values()[..index_key.get_values().len()]; - let order = compare_immutable( - record_slice_same_num_cols, - index_key.get_values(), - self.key_sort_order(), - &self.collations, - ); - order - }; - let found = match op { - SeekOp::GT => order.is_lt(), - SeekOp::GE => order.is_le(), - SeekOp::EQ => order.is_le(), - _ => todo!("not implemented: {:?}", op), - }; - if found { - let cursor_has_record = CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - }; - return Ok(CursorResult::Ok(cursor_has_record)); - } else { - continue; - } + BTreeCell::IndexLeafCell(IndexLeafCell { .. }) => { + return Ok(CursorResult::Ok(true)); } } } @@ -1606,7 +1347,7 @@ impl BTreeCursor { /// This may be used to seek to a specific record in a point query (e.g. SELECT * FROM table WHERE col = 10) /// or e.g. find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10). /// We don't include the rowid in the comparison and that's why the last value from the record is not included. - fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { + fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { match key { SeekKey::TableRowId(rowid) => { return self.tablebtree_seek(rowid, op); @@ -1955,11 +1696,7 @@ impl BTreeCursor { /// Specialized version of do_seek() for table btrees that uses binary search instead /// of iterating cells in order. - fn tablebtree_seek( - &mut self, - rowid: i64, - seek_op: SeekOp, - ) -> Result> { + fn tablebtree_seek(&mut self, rowid: i64, seek_op: SeekOp) -> Result> { assert!(self.mv_cursor.is_none()); let iter_dir = seek_op.iteration_direction(); @@ -2000,47 +1737,12 @@ impl BTreeCursor { let min = self.seek_state.get_min(); let max = self.seek_state.get_max(); if min > max { - let Some(nearest_matching_cell) = self.seek_state.get_nearest_matching_cell() - else { - return Ok(CursorResult::Ok(CursorHasRecord::No)); - }; - let matching_cell = contents.cell_get( - nearest_matching_cell, - payload_overflow_threshold_max( - contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - let BTreeCell::TableLeafCell(TableLeafCell { - _rowid: cell_rowid, - _payload, - first_overflow_page, - payload_size, - .. - }) = matching_cell - else { - unreachable!("unexpected cell type: {:?}", matching_cell); - }; - - return_if_io!(self.read_record_w_possible_overflow( - _payload, - first_overflow_page, - payload_size - )); - let cell_idx = if iter_dir == IterationDirection::Forwards { - nearest_matching_cell as i32 + 1 + if let Some(nearest_matching_cell) = self.seek_state.get_nearest_matching_cell() { + self.stack.set_cell_index(nearest_matching_cell as i32); + return Ok(CursorResult::Ok(true)); } else { - nearest_matching_cell as i32 - 1 + return Ok(CursorResult::Ok(false)); }; - self.stack.set_cell_index(cell_idx as i32); - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: Some(cell_rowid), - })); } let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here. @@ -2058,42 +1760,8 @@ impl BTreeCursor { // rowids are unique, so we can return the rowid immediately if found && SeekOp::EQ == seek_op { - let cur_cell = contents.cell_get( - cur_cell_idx as usize, - payload_overflow_threshold_max( - contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - let BTreeCell::TableLeafCell(TableLeafCell { - _rowid: _, - _payload, - first_overflow_page, - payload_size, - .. - }) = cur_cell - else { - unreachable!("unexpected cell type: {:?}", cur_cell); - }; - return_if_io!(self.read_record_w_possible_overflow( - _payload, - first_overflow_page, - payload_size - )); - let cell_idx = if iter_dir == IterationDirection::Forwards { - cur_cell_idx + 1 - } else { - cur_cell_idx - 1 - }; - self.stack.set_cell_index(cell_idx as i32); - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: Some(cell_rowid), - })); + self.stack.set_cell_index(cur_cell_idx as i32); + return Ok(CursorResult::Ok(true)); } if found { @@ -2130,7 +1798,7 @@ impl BTreeCursor { &mut self, key: &ImmutableRecord, seek_op: SeekOp, - ) -> Result> { + ) -> Result> { if matches!(self.seek_state, CursorSeekState::Start) { // No need for another move_to_root. Move_to already moves to root return_if_io!(self.move_to(SeekKey::IndexKey(key), seek_op)); @@ -2168,8 +1836,10 @@ impl BTreeCursor { let min = self.seek_state.get_min(); let max = self.seek_state.get_max(); if min > max { - let Some(nearest_matching_cell) = self.seek_state.get_nearest_matching_cell() - else { + if let Some(nearest_matching_cell) = self.seek_state.get_nearest_matching_cell() { + self.stack.set_cell_index(nearest_matching_cell as i32); + return Ok(CursorResult::Ok(true)); + } else { // We have now iterated over all cells in the leaf page and found no match. // Unlike tables, indexes store payloads in interior cells as well. self.move_to() always moves to a leaf page, so there are cases where we need to // move back up to the parent interior cell and get the next record from there to perform a correct seek. @@ -2191,53 +1861,17 @@ impl BTreeCursor { self.seek_state.set_not_found_leaf(true); self.stack.set_cell_index(cell_count as i32); } - return self.get_next_record(Some((SeekKey::IndexKey(key), seek_op))); + return self.get_next_record(); } IterationDirection::Backwards => { if !self.seek_state.get_not_found_leaf() { self.seek_state.set_not_found_leaf(true); self.stack.set_cell_index(-1); } - return self.get_prev_record(Some((SeekKey::IndexKey(key), seek_op))); + return self.get_prev_record(); } } }; - let cell = contents.cell_get( - nearest_matching_cell as usize, - payload_overflow_threshold_max( - contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - - let BTreeCell::IndexLeafCell(IndexLeafCell { - payload, - first_overflow_page, - payload_size, - }) = &cell - else { - unreachable!("unexpected cell type: {:?}", cell); - }; - - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read(payload, *next_page, *payload_size)) - } else { - crate::storage::sqlite3_ondisk::read_record( - payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - } - let cursor_has_record = CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - }; - self.stack.set_cell_index(nearest_matching_cell as i32); - self.stack.next_cell_in_direction(iter_dir); - return Ok(CursorResult::Ok(cursor_has_record)); } let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here. @@ -2448,7 +2082,7 @@ impl BTreeCursor { ); if cmp == Ordering::Equal { tracing::debug!("insert_into_page: found exact match with cell_idx={cell_idx}, overwriting"); - self.has_record.set(CursorHasRecord::Yes { rowid: self.get_index_rowid_from_record() }); + self.has_record.set(true); self.overwrite_cell(page.clone(), cell_idx, record)?; self.state .mut_write_info() @@ -4191,8 +3825,8 @@ impl BTreeCursor { pub fn seek_to_last(&mut self) -> Result> { return_if_io!(self.move_to_rightmost()); - let cursor_has_record = return_if_io!(self.get_next_record(None)); - if cursor_has_record == CursorHasRecord::No { + let cursor_has_record = return_if_io!(self.get_next_record()); + if !cursor_has_record { let is_empty = return_if_io!(self.is_empty_table()); assert!(is_empty); return Ok(CursorResult::Ok(())); @@ -4202,7 +3836,7 @@ impl BTreeCursor { } pub fn is_empty(&self) -> bool { - CursorHasRecord::No == self.has_record.get() + !self.has_record.get() } pub fn root_page(&self) -> usize { @@ -4211,12 +3845,12 @@ impl BTreeCursor { pub fn rewind(&mut self) -> Result> { if self.mv_cursor.is_some() { - let cursor_has_record = return_if_io!(self.get_next_record(None)); + let cursor_has_record = return_if_io!(self.get_next_record()); self.has_record.replace(cursor_has_record); } else { self.move_to_root(); - let cursor_has_record = return_if_io!(self.get_next_record(None)); + let cursor_has_record = return_if_io!(self.get_next_record()); self.has_record.replace(cursor_has_record); } Ok(CursorResult::Ok(())) @@ -4232,7 +3866,7 @@ impl BTreeCursor { pub fn next(&mut self) -> Result> { let _ = return_if_io!(self.restore_context()); - let cursor_has_record = return_if_io!(self.get_next_record(None)); + let cursor_has_record = return_if_io!(self.get_next_record()); self.has_record.replace(cursor_has_record); Ok(CursorResult::Ok(())) } @@ -4240,7 +3874,7 @@ impl BTreeCursor { pub fn prev(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); let _ = return_if_io!(self.restore_context()); - match self.get_prev_record(None)? { + match self.get_prev_record()? { CursorResult::Ok(cursor_has_record) => { self.has_record.replace(cursor_has_record); Ok(CursorResult::Ok(())) From b0c64cb4d29d39d5d107b78b43db0c7829e619ea Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 3 Jun 2025 14:28:19 +0200 Subject: [PATCH 03/42] parse record lazily --- core/storage/btree.rs | 68 ++++++++++++++++++++++++++++++++++++------- core/types.rs | 10 +++++++ 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 11517c9d8..e26778348 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -10,7 +10,7 @@ use crate::{ }, }, translate::{collate::CollationSeq, plan::IterationDirection}, - types::{IndexKeyInfo, IndexKeySortOrder}, + types::{IndexKeyInfo, IndexKeySortOrder, ParseRecordState}, MvCursor, }; @@ -618,6 +618,8 @@ pub struct BTreeCursor { stack: PageStack, /// Reusable immutable record, used to allow better allocation strategy. reusable_immutable_record: RefCell>, + /// Reusable immutable record, used to allow better allocation strategy. + parse_record_state: RefCell, pub index_key_info: Option, /// Maintain count of the number of records in the btree. Used for the `Count` opcode count: usize, @@ -649,7 +651,7 @@ impl BTreeCursor { mv_cursor, pager, root_page, - has_record: Cell::new(CursorHasRecord::No), + has_record: Cell::new(false), null_flag: false, going_upwards: false, state: CursorState::None, @@ -3883,10 +3885,12 @@ impl BTreeCursor { } } - pub fn rowid(&self) -> Result> { + pub fn rowid(&self) -> Result>> { if let Some(mv_cursor) = &self.mv_cursor { let mv_cursor = mv_cursor.borrow(); - return Ok(mv_cursor.current_row_id().map(|rowid| rowid.row_id)); + return Ok(CursorResult::Ok( + mv_cursor.current_row_id().map(|rowid| rowid.row_id), + )); } Ok(match self.has_record.get() { CursorHasRecord::Yes { rowid: Some(rowid) } => Some(rowid), @@ -3904,14 +3908,58 @@ impl BTreeCursor { // Reset seek state self.seek_state = CursorSeekState::Start; self.has_record.replace(cursor_has_record); - Ok(CursorResult::Ok(matches!( - cursor_has_record, - CursorHasRecord::Yes { .. } - ))) + Ok(CursorResult::Ok(cursor_has_record)) } - pub fn record(&self) -> Ref> { - self.reusable_immutable_record.borrow() + /// Return a reference to the record the cursor is currently pointing to. + /// If record was not parsed yet, then we have to parse it and in case of I/O we yield control + /// back. + pub fn record(&mut self) -> Result>>> { + let invalidated = self + .reusable_immutable_record + .borrow() + .as_ref() + .map_or(true, |record| record.is_invalidated()); + if !invalidated { + *self.parse_record_state.borrow_mut() = ParseRecordState::Init; + return Ok(CursorResult::Ok(self.reusable_immutable_record.borrow())); + } + if *self.parse_record_state.borrow() == ParseRecordState::Init { + *self.parse_record_state.borrow_mut() = ParseRecordState::Parsing { + payload: Vec::new(), + }; + } + let page = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); + let contents = page.get_contents(); + let cell_idx = self.stack.current_cell_index(); + let cell = contents.cell_get( + cell_idx as usize, + payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), + self.usable_space(), + )?; + let BTreeCell::IndexInteriorCell(IndexInteriorCell { + payload, + payload_size, + first_overflow_page, + .. + }) = &cell + else { + unreachable!("unexpected cell type: {:?}", cell); + }; + if let Some(next_page) = first_overflow_page { + return_if_io!(self.process_overflow_read(payload, *next_page, *payload_size)) + } else { + crate::storage::sqlite3_ondisk::read_record( + payload, + self.get_immutable_record_or_create().as_mut().unwrap(), + )? + }; + + *self.parse_record_state.borrow_mut() = ParseRecordState::Init; + Ok(CursorResult::Ok(self.reusable_immutable_record.borrow())) } #[instrument(skip_all, level = Level::TRACE)] diff --git a/core/types.rs b/core/types.rs index 68363d5dd..0a5770d4f 100644 --- a/core/types.rs +++ b/core/types.rs @@ -701,6 +701,12 @@ pub struct ImmutableRecord { recreating: bool, } +#[derive(PartialEq)] +pub enum ParseRecordState { + Init, + Parsing { payload: Vec }, +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct Record { values: Vec, @@ -934,6 +940,10 @@ impl ImmutableRecord { self.values.clear(); } + pub fn is_invalidated(&self) -> bool { + self.payload.is_empty() + } + pub fn get_payload(&self) -> &[u8] { &self.payload } From 77b6896eae707b22405be66bb7bbe14d76c941fa Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 3 Jun 2025 15:59:20 +0200 Subject: [PATCH 04/42] implement lazy record and rowid in cursor This also comments save_context for now --- core/storage/btree.rs | 174 +++++++++++++++++++++------------ core/storage/sqlite3_ondisk.rs | 9 ++ core/vdbe/execute.rs | 26 ++--- core/vdbe/mod.rs | 10 +- 4 files changed, 138 insertions(+), 81 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index e26778348..316e90d47 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -635,7 +635,7 @@ pub struct BTreeCursor { move_to_state: CursorMoveToState, /// Separate state to read a record with overflow pages. This separation from `state` is necessary as /// we can be in a function that relies on `state`, but also needs to process overflow pages - read_overflow_state: Option, + read_overflow_state: RefCell>, /// Contains the current cell_idx for `find_cell` find_cell_state: FindCellState, } @@ -669,8 +669,9 @@ impl BTreeCursor { collations, seek_state: CursorSeekState::Start, move_to_state: CursorMoveToState::Start, - read_overflow_state: None, + read_overflow_state: RefCell::new(None), find_cell_state: FindCellState(None), + parse_record_state: RefCell::new(ParseRecordState::Init), } } @@ -835,16 +836,16 @@ impl BTreeCursor { /// Reads the record of a cell that has overflow pages. This is a state machine that requires to be called until completion so everything /// that calls this function should be reentrant. fn process_overflow_read( - &mut self, + &self, payload: &'static [u8], start_next_page: u32, payload_size: u64, ) -> Result> { - let res = match &mut self.read_overflow_state { + let res = match &mut *self.read_overflow_state.borrow_mut() { None => { tracing::debug!("start reading overflow page payload_size={}", payload_size); let page = self.read_page(start_next_page as usize)?; - self.read_overflow_state = Some(ReadPayloadOverflow { + *self.read_overflow_state.borrow_mut() = Some(ReadPayloadOverflow { payload: payload.to_vec(), next_page: start_next_page, remaining_to_read: payload_size as usize - payload.len(), @@ -900,7 +901,7 @@ impl BTreeCursor { reuse_immutable.as_mut().unwrap(), )?; } - self.read_overflow_state = None; + *self.read_overflow_state.borrow_mut() = None; Ok(CursorResult::Ok(())) } CursorResult::IO => Ok(CursorResult::IO), @@ -1256,16 +1257,21 @@ impl BTreeCursor { } loop { let mem_page_rc = self.stack.top(); - let cell_idx = self.stack.current_cell_index() as usize; + tracing::trace!( + "current_before_advance id={} cell={}", + mem_page_rc.get().get().id, + self.stack.current_cell_index() + ); + return_if_locked_maybe_load!(self.pager, mem_page_rc); + // Important to advance only after loading the page in order to not advance > 1 times + self.stack.advance(); + let cell_idx = self.stack.current_cell_index() as usize; tracing::trace!( "current id={} cell={}", mem_page_rc.get().get().id, cell_idx ); - return_if_locked_maybe_load!(self.pager, mem_page_rc); - // Important to advance only after loading the page in order to not advance > 1 times - self.stack.advance(); let mem_page = mem_page_rc.get(); let contents = mem_page.get().contents.as_ref().unwrap(); @@ -3892,10 +3898,34 @@ impl BTreeCursor { mv_cursor.current_row_id().map(|rowid| rowid.row_id), )); } - Ok(match self.has_record.get() { - CursorHasRecord::Yes { rowid: Some(rowid) } => Some(rowid), - _ => None, - }) + if self.has_record.get() { + let page = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page); + let page_type = page.get().get_contents().page_type(); + let page = page.get(); + let contents = page.get_contents(); + let cell_idx = self.stack.current_cell_index(); + let cell = contents.cell_get( + cell_idx as usize, + payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), + self.usable_space(), + )?; + if page_type.is_table() { + let BTreeCell::TableLeafCell(TableLeafCell { + _rowid, + _payload, + .. + }) = cell else { + unreachable!("unexpected page_type"); + }; + return Ok(CursorResult::Ok(Some(_rowid))); + } else { + return Ok(CursorResult::Ok(self.get_index_rowid_from_record())); + } + } else { + return Ok(CursorResult::Ok(None)); + } } pub fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { @@ -3914,7 +3944,7 @@ impl BTreeCursor { /// Return a reference to the record the cursor is currently pointing to. /// If record was not parsed yet, then we have to parse it and in case of I/O we yield control /// back. - pub fn record(&mut self) -> Result>>> { + pub fn record(&self) -> Result>>> { let invalidated = self .reusable_immutable_record .borrow() @@ -3940,17 +3970,28 @@ impl BTreeCursor { payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), self.usable_space(), )?; - let BTreeCell::IndexInteriorCell(IndexInteriorCell { - payload, - payload_size, - first_overflow_page, - .. - }) = &cell - else { - unreachable!("unexpected cell type: {:?}", cell); - }; + let (payload, payload_size, first_overflow_page) = match cell { + BTreeCell::TableLeafCell(TableLeafCell { + _rowid, + _payload, + payload_size, + first_overflow_page, + }) => (_payload, payload_size, first_overflow_page), + BTreeCell::IndexInteriorCell(IndexInteriorCell { + left_child_page: _, + payload, + payload_size, + first_overflow_page, + }) => (payload, payload_size, first_overflow_page), + BTreeCell::IndexLeafCell(IndexLeafCell { + payload, + first_overflow_page, + payload_size, + }) => (payload, payload_size, first_overflow_page), + _ => unreachable!("unexpected page_type"), + }; if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read(payload, *next_page, *payload_size)) + return_if_io!(self.process_overflow_read(payload, next_page, payload_size)) } else { crate::storage::sqlite3_ondisk::read_record( payload, @@ -3994,10 +4035,7 @@ impl BTreeCursor { } return_if_io!(self.insert_into_page(key)); if key.maybe_rowid().is_some() { - let int_key = key.to_rowid(); - self.has_record.replace(CursorHasRecord::Yes { - rowid: Some(int_key), - }); + self.has_record.replace(true); } } }; @@ -4040,8 +4078,8 @@ impl BTreeCursor { page.get().get_contents().page_type(), PageType::TableLeaf | PageType::TableInterior ) { - let _target_rowid = match self.has_record.get() { - CursorHasRecord::Yes { rowid: Some(rowid) } => rowid, + let _target_rowid = match return_if_io!(self.rowid()) { + Some(rowid) => rowid, _ => { self.state = CursorState::None; return Ok(CursorResult::Ok(())); @@ -4222,9 +4260,13 @@ impl BTreeCursor { let needs_balancing = free_space as usize * 3 > self.usable_space() * 2; let target_key = if page.is_index() { - DeleteSavepoint::Payload(self.record().as_ref().unwrap().clone()) + let record = match &*return_if_io!(self.record()) { + Some(record) => record.clone(), + None => unreachable!("there should've been a record"), + }; + DeleteSavepoint::Payload(record) } else { - let CursorHasRecord::Yes { rowid: Some(rowid) } = self.has_record.get() + let Some(rowid) = return_if_io!(self.rowid()) else { panic!("cursor should be pointing to a record with a rowid"); }; @@ -4245,7 +4287,7 @@ impl BTreeCursor { return Ok(CursorResult::Ok(())); } // Only reaches this function call if state = DeleteState::WaitForBalancingToComplete - self.save_context(); + // self.save_context(); } DeleteState::WaitForBalancingToComplete { target_key } => { @@ -4321,7 +4363,7 @@ impl BTreeCursor { pub fn key_exists_in_index(&mut self, key: &ImmutableRecord) -> Result> { return_if_io!(self.seek(SeekKey::IndexKey(key), SeekOp::GE)); - let record_opt = self.record(); + let record_opt = return_if_io!(self.record()); match record_opt.as_ref() { Some(record) => { // Existing record found — compare prefix @@ -4657,7 +4699,7 @@ impl BTreeCursor { // build the new payload let page_type = page_ref.get().get().contents.as_ref().unwrap().page_type(); let mut new_payload = Vec::with_capacity(record.len()); - let CursorHasRecord::Yes { rowid } = self.has_record.get() else { + let rowid = return_if_io!(self.rowid()) else { panic!("cursor should be pointing to a record"); }; fill_cell_payload( @@ -4842,27 +4884,27 @@ impl BTreeCursor { } /// Save cursor context, to be restored later - pub fn save_context(&mut self) { - if let CursorHasRecord::Yes { rowid } = self.has_record.get() { - self.valid_state = CursorValidState::RequireSeek; - match self.stack.top().get().get_contents().page_type() { - PageType::TableInterior | PageType::TableLeaf => { - self.context = Some(CursorContext::TableRowId(rowid.expect( - "table cells should have a rowid since we don't support WITHOUT ROWID tables", - ))); - } - PageType::IndexInterior | PageType::IndexLeaf => { - self.context = Some(CursorContext::IndexKeyRowId( - self.reusable_immutable_record - .borrow() - .as_ref() - .unwrap() - .clone(), - )); - } - } - } - } + // pub fn save_context(&mut self) { + // if let CursorHasRecord::Yes { rowid } = self.has_record.get() { + // self.valid_state = CursorValidState::RequireSeek; + // match self.stack.top().get().get_contents().page_type() { + // PageType::TableInterior | PageType::TableLeaf => { + // self.context = Some(CursorContext::TableRowId(rowid.expect( + // "table cells should have a rowid since we don't support WITHOUT ROWID tables", + // ))); + // } + // PageType::IndexInterior | PageType::IndexLeaf => { + // self.context = Some(CursorContext::IndexKeyRowId( + // self.reusable_immutable_record + // .borrow() + // .as_ref() + // .unwrap() + // .clone(), + // )); + // } + // } + // } + // } /// If context is defined, restore it and set it None on success fn restore_context(&mut self) -> Result> { @@ -4962,7 +5004,7 @@ impl PageStack { } fn push(&self, page: BTreePage) { - self._push(page, 0); + self._push(page, -1); } fn push_backwards(&self, page: BTreePage) { @@ -6501,7 +6543,9 @@ mod tests { for key in keys.iter() { tracing::trace!("seeking key: {}", key); run_until_done(|| cursor.next(), pager.deref()).unwrap(); - let cursor_rowid = cursor.rowid().unwrap().unwrap(); + let cursor_rowid = run_until_done(|| cursor.rowid(), pager.deref()) + .unwrap() + .unwrap(); if *key != cursor_rowid { valid = false; println!("key {} is not found, got {}", key, cursor_rowid); @@ -6532,7 +6576,9 @@ mod tests { for key in keys.iter() { tracing::trace!("seeking key: {}", key); run_until_done(|| cursor.next(), pager.deref()).unwrap(); - let cursor_rowid = cursor.rowid().unwrap().unwrap(); + let cursor_rowid = run_until_done(|| cursor.rowid(), pager.deref()) + .unwrap() + .unwrap(); assert_eq!( *key, cursor_rowid, "key {} is not found, got {}", @@ -7815,11 +7861,13 @@ mod tests { let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page); cursor.move_to_root(); for i in 0..iterations { - let CursorHasRecord::Yes { rowid: Some(rowid) } = - run_until_done(|| cursor.get_next_record(None), pager.deref()).unwrap() - else { + let has_next = run_until_done(|| cursor.get_next_record(), pager.deref()).unwrap(); + if !has_next { panic!("expected Some(rowid) but got {:?}", cursor.has_record.get()); }; + let rowid = run_until_done(|| cursor.rowid(), pager.deref()) + .unwrap() + .unwrap(); assert_eq!(rowid, i as i64, "got!=expected"); } } diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index bc39cdfdf..917a91da5 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -377,6 +377,15 @@ pub enum PageType { TableLeaf = 13, } +impl PageType { + pub fn is_table(&self) -> bool { + match self { + PageType::IndexInterior | PageType::IndexLeaf => false, + PageType::TableInterior | PageType::TableLeaf => true, + } + } +} + impl TryFrom for PageType { type Error = LimboError; diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index d0ed2dab6..bdf77b83d 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -1348,7 +1348,7 @@ pub fn op_column( let rowid = { let mut index_cursor = state.get_cursor(index_cursor_id); let index_cursor = index_cursor.as_btree_mut(); - index_cursor.rowid()? + return_if_io!(index_cursor.rowid()) }; let mut table_cursor = state.get_cursor(table_cursor_id); let table_cursor = table_cursor.as_btree_mut(); @@ -1369,7 +1369,7 @@ pub fn op_column( let mut cursor = must_be_btree_cursor!(*cursor_id, program.cursor_ref, state, "Column"); let cursor = cursor.as_btree_mut(); - let record = cursor.record(); + let record = return_if_io!(cursor.record()); let value = if let Some(record) = record.as_ref() { if cursor.get_null_flag() { RefValue::Null @@ -1904,7 +1904,7 @@ pub fn op_row_id( let rowid = { let mut index_cursor = state.get_cursor(index_cursor_id); let index_cursor = index_cursor.as_btree_mut(); - let record = index_cursor.record(); + let record = return_if_io!(index_cursor.record()); let record = record.as_ref().unwrap(); let rowid = record.get_values().last().unwrap(); match rowid { @@ -1926,7 +1926,7 @@ pub fn op_row_id( } let mut cursors = state.cursors.borrow_mut(); if let Some(Cursor::BTree(btree_cursor)) = cursors.get_mut(*cursor_id).unwrap() { - if let Some(ref rowid) = btree_cursor.rowid()? { + if let Some(ref rowid) = return_if_io!(btree_cursor.rowid()) { state.registers[*dest] = Register::Value(Value::Integer(*rowid as i64)); } else { state.registers[*dest] = Register::Value(Value::Null); @@ -1960,7 +1960,7 @@ pub fn op_idx_row_id( let mut cursors = state.cursors.borrow_mut(); let cursor = cursors.get_mut(*cursor_id).unwrap().as_mut().unwrap(); let cursor = cursor.as_btree_mut(); - let rowid = cursor.rowid()?; + let rowid = return_if_io!(cursor.rowid()); state.registers[*dest] = match rowid { Some(rowid) => Register::Value(Value::Integer(rowid as i64)), None => Register::Value(Value::Null), @@ -2160,7 +2160,7 @@ pub fn op_idx_ge( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *cursor.record() { + let pc = if let Some(ref idx_record) = *return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -2224,7 +2224,7 @@ pub fn op_idx_le( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *cursor.record() { + let pc = if let Some(ref idx_record) = *return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -2270,7 +2270,7 @@ pub fn op_idx_gt( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *cursor.record() { + let pc = if let Some(ref idx_record) = *return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -2316,7 +2316,7 @@ pub fn op_idx_lt( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *cursor.record() { + let pc = if let Some(ref idx_record) = *return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -3842,7 +3842,7 @@ pub fn op_insert( return_if_io!(cursor.insert(&BTreeKey::new_table_rowid(key, Some(record)), true)); // Only update last_insert_rowid for regular table inserts, not schema modifications if cursor.root_page() != 1 { - if let Some(rowid) = cursor.rowid()? { + if let Some(rowid) = return_if_io!(cursor.rowid()) { if let Some(conn) = program.connection.upgrade() { conn.update_last_rowid(rowid); } @@ -3932,10 +3932,10 @@ pub fn op_idx_delete( tracing::debug!( "op_idx_delete(seek={}, record={} rowid={:?})", &record, - cursor.record().as_ref().unwrap(), - cursor.rowid() + return_if_io!(cursor.record()).as_ref().unwrap(), + return_if_io!(cursor.rowid()) ); - if cursor.rowid()?.is_none() { + if return_if_io!(cursor.rowid()).is_none() { // If P5 is not zero, then raise an SQLITE_CORRUPT_INDEX error if no matching // index entry is found. This happens when running an UPDATE or DELETE statement and the // index entry to be updated or deleted is not found. For some uses of IdxDelete diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index f7f59597c..16e5a1117 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -499,11 +499,11 @@ fn get_new_rowid(cursor: &mut BTreeCursor, mut rng: R) -> Result {} CursorResult::IO => return Ok(CursorResult::IO), } - let mut rowid = cursor - .rowid()? - .unwrap_or(0) // if BTree is empty - use 0 as initial value for rowid - .checked_add(1) // add 1 but be careful with overflows - .unwrap_or(i64::MAX); // in case of overflow - use i64::MAX + let mut rowid = match cursor.rowid()? { + CursorResult::Ok(Some(rowid)) => rowid.checked_add(1).unwrap_or(i64::MAX), // add 1 but be careful with overflows, in case of overflow - use i64::MAX + CursorResult::Ok(None) => 0, + CursorResult::IO => return Ok(CursorResult::IO), + }; if rowid > i64::MAX.try_into().unwrap() { let distribution = Uniform::from(1..=i64::MAX); let max_attempts = 100; From 96324834a56bda976549db226147ad55b4df8fc8 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 3 Jun 2025 17:11:25 +0200 Subject: [PATCH 05/42] invalidate records on movement --- core/storage/btree.rs | 83 +++++++++++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 23 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 316e90d47..fcff49e4b 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -816,6 +816,7 @@ impl BTreeCursor { // left child has: key 663, key 664, key 665 // we need to move to the previous parent (with e.g. key 662) when iterating backwards. let mem_page = self.read_page(left_child_page as usize)?; + self.stack.advance(); self.stack.push_backwards(mem_page); continue; } @@ -1335,14 +1336,16 @@ impl BTreeCursor { BTreeCell::IndexInteriorCell(IndexInteriorCell { left_child_page, .. }) => { - if !self.going_upwards { + if self.going_upwards { + self.going_upwards = false; + return Ok(CursorResult::Ok(true)); + } else { let mem_page = self.read_page(*left_child_page as usize)?; + // we still need to process this page later when going up, so let's go back + self.stack.retreat(); self.stack.push(mem_page); continue; } - - self.going_upwards = false; - return Ok(CursorResult::Ok(true)); } BTreeCell::IndexLeafCell(IndexLeafCell { .. }) => { return Ok(CursorResult::Ok(true)); @@ -1721,6 +1724,10 @@ impl BTreeCursor { ); let cell_count = contents.cell_count(); + if cell_count == 0 { + self.stack.set_cell_index(0); + return Ok(CursorResult::Ok(false)); + } let min: isize = 0; let max: isize = cell_count as isize - 1; @@ -1816,6 +1823,17 @@ impl BTreeCursor { let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); + if cell_count == 0 { + self.stack.set_cell_index(0); + match seek_op.iteration_direction() { + IterationDirection::Forwards => { + return self.next(); + } + IterationDirection::Backwards => { + return self.prev(); + } + } + } let min: isize = 0; let max: isize = cell_count as isize - 1; @@ -1869,14 +1887,14 @@ impl BTreeCursor { self.seek_state.set_not_found_leaf(true); self.stack.set_cell_index(cell_count as i32); } - return self.get_next_record(); + return self.next(); } IterationDirection::Backwards => { if !self.seek_state.get_not_found_leaf() { self.seek_state.set_not_found_leaf(true); self.stack.set_cell_index(-1); } - return self.get_prev_record(); + return self.prev(); } } }; @@ -2130,6 +2148,7 @@ impl BTreeCursor { )?; contents.overflow_cells.len() }; + self.stack.set_cell_index(cell_idx as i32); let write_info = self .state .mut_write_info() @@ -3834,6 +3853,7 @@ impl BTreeCursor { pub fn seek_to_last(&mut self) -> Result> { return_if_io!(self.move_to_rightmost()); let cursor_has_record = return_if_io!(self.get_next_record()); + self.invalidate_record(); if !cursor_has_record { let is_empty = return_if_io!(self.is_empty_table()); assert!(is_empty); @@ -3854,11 +3874,13 @@ impl BTreeCursor { pub fn rewind(&mut self) -> Result> { if self.mv_cursor.is_some() { let cursor_has_record = return_if_io!(self.get_next_record()); + self.invalidate_record(); self.has_record.replace(cursor_has_record); } else { self.move_to_root(); let cursor_has_record = return_if_io!(self.get_next_record()); + self.invalidate_record(); self.has_record.replace(cursor_has_record); } Ok(CursorResult::Ok(())) @@ -3867,25 +3889,37 @@ impl BTreeCursor { pub fn last(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); match self.move_to_rightmost()? { - CursorResult::Ok(_) => self.prev(), + CursorResult::Ok(_) => { + let _ = self.prev(); + Ok(CursorResult::Ok(())) + } CursorResult::IO => Ok(CursorResult::IO), } } - pub fn next(&mut self) -> Result> { - let _ = return_if_io!(self.restore_context()); + pub fn next(&mut self) -> Result> { + return_if_io!(self.restore_context()); let cursor_has_record = return_if_io!(self.get_next_record()); self.has_record.replace(cursor_has_record); - Ok(CursorResult::Ok(())) + self.invalidate_record(); + Ok(CursorResult::Ok(cursor_has_record)) } - pub fn prev(&mut self) -> Result> { + fn invalidate_record(&mut self) { + self.get_immutable_record_or_create() + .as_mut() + .unwrap() + .invalidate(); + } + + pub fn prev(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); - let _ = return_if_io!(self.restore_context()); + return_if_io!(self.restore_context()); match self.get_prev_record()? { CursorResult::Ok(cursor_has_record) => { self.has_record.replace(cursor_has_record); - Ok(CursorResult::Ok(())) + self.invalidate_record(); + Ok(CursorResult::Ok(cursor_has_record)) } CursorResult::IO => Ok(CursorResult::IO), } @@ -3919,12 +3953,12 @@ impl BTreeCursor { }) = cell else { unreachable!("unexpected page_type"); }; - return Ok(CursorResult::Ok(Some(_rowid))); + Ok(CursorResult::Ok(Some(_rowid))) } else { - return Ok(CursorResult::Ok(self.get_index_rowid_from_record())); + Ok(CursorResult::Ok(self.get_index_rowid_from_record())) } } else { - return Ok(CursorResult::Ok(None)); + Ok(CursorResult::Ok(None)) } } @@ -3933,8 +3967,10 @@ impl BTreeCursor { // We need to clear the null flag for the table cursor before seeking, // because it might have been set to false by an unmatched left-join row during the previous iteration // on the outer loop. + tracing::trace!("seek(key={:?}, op={:?})", key, op); self.set_null_flag(false); let cursor_has_record = return_if_io!(self.do_seek(key, op)); + self.invalidate_record(); // Reset seek state self.seek_state = CursorSeekState::Start; self.has_record.replace(cursor_has_record); @@ -4025,13 +4061,14 @@ impl BTreeCursor { if !moved_before && !matches!(self.state, CursorState::Write(..)) { match key { BTreeKey::IndexKey(_) => { - return_if_io!(self - .move_to(SeekKey::IndexKey(key.get_record().unwrap()), SeekOp::GE)) + return_if_io!( + self.seek(SeekKey::IndexKey(key.get_record().unwrap()), SeekOp::GE) + ) } - BTreeKey::TableRowId(_) => return_if_io!( - self.move_to(SeekKey::TableRowId(key.to_rowid()), SeekOp::EQ) - ), - } + BTreeKey::TableRowId(_) => { + return_if_io!(self.seek(SeekKey::TableRowId(key.to_rowid()), SeekOp::EQ)) + } + }; } return_if_io!(self.insert_into_page(key)); if key.maybe_rowid().is_some() { @@ -7861,7 +7898,7 @@ mod tests { let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page); cursor.move_to_root(); for i in 0..iterations { - let has_next = run_until_done(|| cursor.get_next_record(), pager.deref()).unwrap(); + let has_next = run_until_done(|| cursor.next(), pager.deref()).unwrap(); if !has_next { panic!("expected Some(rowid) but got {:?}", cursor.has_record.get()); }; From c52d9a52d9c504210b0abdbfd5e405f16aa8d1cd Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 3 Jun 2025 17:29:17 +0200 Subject: [PATCH 06/42] fix count --- core/storage/btree.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index fcff49e4b..6c201349d 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4844,6 +4844,7 @@ impl BTreeCursor { self.count += contents.cell_count(); } + self.stack.advance(); let cell_idx = self.stack.current_cell_index() as usize; // Second condition is necessary in case we return if the page is locked in the loop below @@ -4857,7 +4858,6 @@ impl BTreeCursor { } // Move to parent - self.going_upwards = true; self.stack.pop(); mem_page_rc = self.stack.top(); @@ -4884,7 +4884,6 @@ impl BTreeCursor { let right_most_pointer = contents.rightmost_pointer().unwrap(); self.stack.advance(); let mem_page = self.read_page(right_most_pointer as usize)?; - self.going_upwards = false; self.stack.push(mem_page); } else { // Move to child left page @@ -4911,7 +4910,6 @@ impl BTreeCursor { }) => { self.stack.advance(); let mem_page = self.read_page(left_child_page as usize)?; - self.going_upwards = false; self.stack.push(mem_page); } _ => unreachable!(), From b46852a3661334e946f93f07f9a93f4b83a90fc2 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 3 Jun 2025 18:01:27 +0200 Subject: [PATCH 07/42] move setting target key in delete if balance is needed --- core/storage/btree.rs | 50 +++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 6c201349d..9affb2ed1 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -309,15 +309,6 @@ impl WriteInfo { } } -/// Whether the cursor is currently pointing to a record. -#[derive(Debug, Clone, Copy, PartialEq)] -enum CursorHasRecord { - Yes { - rowid: Option, // not all indexes and btrees have rowids, so this is optional. - }, - No, -} - /// Holds the state machine for the operation that was in flight when the cursor /// was suspended due to IO. enum CursorState { @@ -3935,6 +3926,8 @@ impl BTreeCursor { if self.has_record.get() { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); + // load record + let _ = return_if_io!(self.record()); let page_type = page.get().get_contents().page_type(); let page = page.get(); let contents = page.get_contents(); @@ -4143,10 +4136,7 @@ impl BTreeCursor { DeleteState::FindCell => { let page = self.stack.top(); - let mut cell_idx = self.stack.current_cell_index() as usize; - if cell_idx > 0 { - cell_idx -= 1; - } + let cell_idx = self.stack.current_cell_index() as usize; let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); @@ -4294,24 +4284,26 @@ impl BTreeCursor { let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let free_space = compute_free_space(contents, self.usable_space() as u16); - let needs_balancing = free_space as usize * 3 > self.usable_space() * 2; + let needs_balancing = self.stack.has_parent() + && free_space as usize * 3 > self.usable_space() * 2; - let target_key = if page.is_index() { - let record = match &*return_if_io!(self.record()) { - Some(record) => record.clone(), - None => unreachable!("there should've been a record"), - }; - DeleteSavepoint::Payload(record) - } else { - let Some(rowid) = return_if_io!(self.rowid()) + if needs_balancing { + // FIXME(pere): cell index must be updated before calling `rowid` or + // `record` + let target_key = if page.is_index() { + let record = match &*return_if_io!(self.record()) { + Some(record) => record.clone(), + None => unreachable!("there should've been a record"), + }; + DeleteSavepoint::Payload(record) + } else { + let Some(rowid) = return_if_io!(self.rowid()) else { panic!("cursor should be pointing to a record with a rowid"); }; - DeleteSavepoint::Rowid(rowid) - }; - - let delete_info = self.state.mut_delete_info().unwrap(); - if needs_balancing { + DeleteSavepoint::Rowid(rowid) + }; + let delete_info = self.state.mut_delete_info().unwrap(); if delete_info.balance_write_info.is_none() { let mut write_info = WriteInfo::new(); write_info.state = WriteState::BalanceStart; @@ -4736,9 +4728,7 @@ impl BTreeCursor { // build the new payload let page_type = page_ref.get().get().contents.as_ref().unwrap().page_type(); let mut new_payload = Vec::with_capacity(record.len()); - let rowid = return_if_io!(self.rowid()) else { - panic!("cursor should be pointing to a record"); - }; + let rowid = return_if_io!(self.rowid()); fill_cell_payload( page_type, rowid, From 681df9b1eb574d8b2fca6a3c7e66d2cc1f16e494 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 3 Jun 2025 20:00:19 +0200 Subject: [PATCH 08/42] fix get record --- core/storage/btree.rs | 18 +++++++++++++----- core/vdbe/execute.rs | 8 ++++---- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 9affb2ed1..10349d56d 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -3973,7 +3973,10 @@ impl BTreeCursor { /// Return a reference to the record the cursor is currently pointing to. /// If record was not parsed yet, then we have to parse it and in case of I/O we yield control /// back. - pub fn record(&self) -> Result>>> { + pub fn record(&self) -> Result>>> { + if !self.has_record.get() { + return Ok(CursorResult::Ok(None)); + } let invalidated = self .reusable_immutable_record .borrow() @@ -3981,7 +3984,10 @@ impl BTreeCursor { .map_or(true, |record| record.is_invalidated()); if !invalidated { *self.parse_record_state.borrow_mut() = ParseRecordState::Init; - return Ok(CursorResult::Ok(self.reusable_immutable_record.borrow())); + let record_ref = + Ref::filter_map(self.reusable_immutable_record.borrow(), |opt| opt.as_ref()) + .unwrap(); + return Ok(CursorResult::Ok(Some(record_ref))); } if *self.parse_record_state.borrow() == ParseRecordState::Init { *self.parse_record_state.borrow_mut() = ParseRecordState::Parsing { @@ -4029,7 +4035,9 @@ impl BTreeCursor { }; *self.parse_record_state.borrow_mut() = ParseRecordState::Init; - Ok(CursorResult::Ok(self.reusable_immutable_record.borrow())) + let record_ref = + Ref::filter_map(self.reusable_immutable_record.borrow(), |opt| opt.as_ref()).unwrap(); + Ok(CursorResult::Ok(Some(record_ref))) } #[instrument(skip_all, level = Level::TRACE)] @@ -4291,7 +4299,7 @@ impl BTreeCursor { // FIXME(pere): cell index must be updated before calling `rowid` or // `record` let target_key = if page.is_index() { - let record = match &*return_if_io!(self.record()) { + let record = match return_if_io!(self.record()) { Some(record) => record.clone(), None => unreachable!("there should've been a record"), }; @@ -6689,7 +6697,7 @@ mod tests { for key in keys.iter() { tracing::trace!("seeking key: {:?}", key); run_until_done(|| cursor.next(), pager.deref()).unwrap(); - let record = cursor.record(); + let record = run_until_done(|| cursor.record(), &pager).unwrap(); let record = record.as_ref().unwrap(); let cursor_key = record.get_values(); assert_eq!( diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index bdf77b83d..c2180806b 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -2160,7 +2160,7 @@ pub fn op_idx_ge( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *return_if_io!(cursor.record()) { + let pc = if let Some(idx_record) = return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -2224,7 +2224,7 @@ pub fn op_idx_le( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *return_if_io!(cursor.record()) { + let pc = if let Some(ref idx_record) = return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -2270,7 +2270,7 @@ pub fn op_idx_gt( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *return_if_io!(cursor.record()) { + let pc = if let Some(ref idx_record) = return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -2316,7 +2316,7 @@ pub fn op_idx_lt( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *return_if_io!(cursor.record()) { + let pc = if let Some(ref idx_record) = return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; From f261d5b6af53455ff2125d9dfaec40a780642bc7 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 4 Jun 2025 14:18:24 +0200 Subject: [PATCH 09/42] compare on next/prev after seek --- core/storage/btree.rs | 99 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 21 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 10349d56d..df7b75483 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -411,6 +411,9 @@ enum CursorSeekState { /// This value is only used for indexbtree not_found_leaf: bool, }, + /// In case value was not found in a leaf page, then we need to find in upper layer. This case + /// requires I/O for moving and for loading payload if it overflows so let's store it. + SeekingIndexMoveUp, } // These functions below exist to avoid problems with the borrow checker @@ -1839,6 +1842,38 @@ impl BTreeCursor { nearest_matching_cell, not_found_leaf: false, }; + } else if let CursorSeekState::SeekingIndexMoveUp = self.seek_state { + let page = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); + let contents = page.get().contents.as_ref().unwrap(); + let cur_cell_idx = self.stack.current_cell_index() as usize; + let cell = contents.cell_get( + cur_cell_idx, + payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), + self.usable_space(), + )?; + let BTreeCell::IndexInteriorCell(IndexInteriorCell { + payload, + first_overflow_page, + payload_size, + .. + }) = &cell + else { + unreachable!("unexpected cell type: {:?}", cell); + }; + + if let Some(next_page) = first_overflow_page { + return_if_io!(self.process_overflow_read(payload, *next_page, *payload_size)) + } else { + crate::storage::sqlite3_ondisk::read_record( + payload, + self.get_immutable_record_or_create().as_mut().unwrap(), + )? + }; + let (_, found) = self.compare_with_current_record(key, seek_op); + return Ok(CursorResult::Ok(found)); } let page = self.stack.top(); @@ -1878,14 +1913,26 @@ impl BTreeCursor { self.seek_state.set_not_found_leaf(true); self.stack.set_cell_index(cell_count as i32); } - return self.next(); + let next_res = return_if_io!(self.next()); + if !next_res { + return Ok(CursorResult::Ok(false)); + } + // FIXME: optimize this in case record can be read directly + self.seek_state = CursorSeekState::SeekingIndexMoveUp; + return Ok(CursorResult::IO); } IterationDirection::Backwards => { if !self.seek_state.get_not_found_leaf() { self.seek_state.set_not_found_leaf(true); self.stack.set_cell_index(-1); } - return self.prev(); + let prev_res = return_if_io!(self.prev()); + if !prev_res { + return Ok(CursorResult::Ok(false)); + } + // FIXME: optimize this in case record can be read directly + self.seek_state = CursorSeekState::SeekingIndexMoveUp; + return Ok(CursorResult::IO); } } }; @@ -1917,25 +1964,7 @@ impl BTreeCursor { self.get_immutable_record_or_create().as_mut().unwrap(), )? }; - let cmp = { - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_slice_equal_number_of_cols = - &record.get_values().as_slice()[..key.get_values().len()]; - compare_immutable( - record_slice_equal_number_of_cols, - key.get_values(), - self.key_sort_order(), - &self.collations, - ) - }; - let found = match seek_op { - SeekOp::GT => cmp.is_gt(), - SeekOp::GE => cmp.is_ge(), - SeekOp::EQ => cmp.is_eq(), - SeekOp::LE => cmp.is_le(), - SeekOp::LT => cmp.is_lt(), - }; + let (cmp, found) = self.compare_with_current_record(key, seek_op); if found { self.seek_state .set_nearest_matching_cell(Some(cur_cell_idx as usize)); @@ -1966,6 +1995,34 @@ impl BTreeCursor { } } + fn compare_with_current_record( + &mut self, + key: &ImmutableRecord, + seek_op: SeekOp, + ) -> (Ordering, bool) { + let cmp = { + let record = self.get_immutable_record(); + let record = record.as_ref().unwrap(); + tracing::info!("record={}", record); + let record_slice_equal_number_of_cols = + &record.get_values().as_slice()[..key.get_values().len()]; + compare_immutable( + record_slice_equal_number_of_cols, + key.get_values(), + self.key_sort_order(), + &self.collations, + ) + }; + let found = match seek_op { + SeekOp::GT => cmp.is_gt(), + SeekOp::GE => cmp.is_ge(), + SeekOp::EQ => cmp.is_eq(), + SeekOp::LE => cmp.is_le(), + SeekOp::LT => cmp.is_lt(), + }; + (cmp, found) + } + fn read_record_w_possible_overflow( &mut self, payload: &'static [u8], From f83d837131a86563505c3a05f6e14106a500a4e2 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 4 Jun 2025 14:19:07 +0200 Subject: [PATCH 10/42] fix next to not advance if not necessary --- core/storage/btree.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index df7b75483..a96f32afb 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1259,6 +1259,21 @@ impl BTreeCursor { ); return_if_locked_maybe_load!(self.pager, mem_page_rc); + let mem_page = mem_page_rc.get(); + + let contents = mem_page.get().contents.as_ref().unwrap(); + let cell_count = contents.cell_count(); + + let is_index = mem_page_rc.get().is_index(); + let should_skip_advance = is_index + && self.going_upwards // we are going upwards, this means we still need to visit divider cell in an index + && self.stack.current_cell_index() < cell_count as i32; // if we weren't on a + // valid cell then it means we will have to move upwards again or move to right page, + // anyways, we won't visit this invalid cell index + if should_skip_advance { + self.going_upwards = false; + return Ok(CursorResult::Ok(true)); + } // Important to advance only after loading the page in order to not advance > 1 times self.stack.advance(); let cell_idx = self.stack.current_cell_index() as usize; @@ -1267,10 +1282,6 @@ impl BTreeCursor { mem_page_rc.get().get().id, cell_idx ); - let mem_page = mem_page_rc.get(); - - let contents = mem_page.get().contents.as_ref().unwrap(); - let cell_count = contents.cell_count(); if cell_count == 0 || cell_idx == cell_count { // do rightmost @@ -1335,8 +1346,6 @@ impl BTreeCursor { return Ok(CursorResult::Ok(true)); } else { let mem_page = self.read_page(*left_child_page as usize)?; - // we still need to process this page later when going up, so let's go back - self.stack.retreat(); self.stack.push(mem_page); continue; } From 0f79b0dd50fe18b2490f1ce3eeb2bd0c648a071e Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 4 Jun 2025 17:00:57 +0200 Subject: [PATCH 11/42] fix prev? --- core/storage/btree.rs | 66 +++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index a96f32afb..f1659a964 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -736,6 +736,46 @@ impl BTreeCursor { loop { let page = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); + let contents = page.get().contents.as_ref().unwrap(); + + let cell_count = contents.cell_count(); + let cell_idx = self.stack.current_cell_index() as usize; + // dbg!( + // page.get().id, + // self.stack.current_cell_index(), + // self.going_upwards, + // page.get_contents().cell_count() + // ); + + // If we are at the end of the page and we haven't just come back from the right child, + // we now need to move to the rightmost child. + if self.stack.current_cell_index() == i32::MAX && !self.going_upwards { + let rightmost_pointer = contents.rightmost_pointer(); + if let Some(rightmost_pointer) = rightmost_pointer { + self.stack + .push_backwards(self.read_page(rightmost_pointer as usize)?); + continue; + } + } + if cell_idx >= cell_count { + self.stack.set_cell_index(cell_count as i32 - 1); + } else { + let is_index = page.is_index(); + // skip retreat in case we still haven't visited this cell in index + let should_visit_internal_node = is_index && self.going_upwards; // we are going upwards, this means we still need to visit divider cell in an index + let page_type = contents.page_type(); + if should_visit_internal_node { + self.going_upwards = false; + return Ok(CursorResult::Ok(true)); + } else if matches!( + page_type, + PageType::IndexLeaf | PageType::TableLeaf | PageType::TableInterior + ) { + self.stack.retreat(); + } + } // moved to beginning of current page // todo: find a better way to flag moved to end or begin of page if self.stack.current_cell_index_less_than_min() { @@ -748,35 +788,13 @@ impl BTreeCursor { self.stack.pop(); } else { // moved to begin of btree + // dbg!(false); return Ok(CursorResult::Ok(false)); } } // continue to next loop to get record from the new page continue; } - - return_if_locked_maybe_load!(self.pager, page); - self.stack.retreat(); - let page = page.get(); - let contents = page.get().contents.as_ref().unwrap(); - - let cell_count = contents.cell_count(); - let cell_idx = self.stack.current_cell_index() as usize; - - // If we are at the end of the page and we haven't just come back from the right child, - // we now need to move to the rightmost child. - if cell_idx as i32 == i32::MAX && !self.going_upwards { - let rightmost_pointer = contents.rightmost_pointer(); - if let Some(rightmost_pointer) = rightmost_pointer { - self.stack - .push_backwards(self.read_page(rightmost_pointer as usize)?); - continue; - } - } - - if cell_idx >= cell_count { - self.stack.set_cell_index(cell_count as i32 - 1); - } let cell_idx = self.stack.current_cell_index() as usize; let cell = contents.cell_get( @@ -810,7 +828,7 @@ impl BTreeCursor { // left child has: key 663, key 664, key 665 // we need to move to the previous parent (with e.g. key 662) when iterating backwards. let mem_page = self.read_page(left_child_page as usize)?; - self.stack.advance(); + self.stack.retreat(); self.stack.push_backwards(mem_page); continue; } From 6e5f05a2578f2f51704254f86aa734ef4c435b70 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 08:46:44 +0300 Subject: [PATCH 12/42] Remove unnecessary cell_idx move from tablebtree_move_to() --- core/storage/btree.rs | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index f1659a964..93cd8bd5c 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1470,18 +1470,7 @@ impl BTreeCursor { let left_child_page = contents.cell_table_interior_read_left_child_page( leftmost_matching_cell as usize, )?; - // If we found our target rowid in the left subtree, - // we need to move the parent cell pointer forwards or backwards depending on the iteration direction. - // For example: since the internal node contains the max rowid of the left subtree, we need to move the - // parent pointer backwards in backwards iteration so that we don't come back to the parent again. - // E.g. - // this parent: rowid 666 - // left child has: 664,665,666 - // we need to move to the previous parent (with e.g. rowid 663) when iterating backwards. - let index_change = - -1 + (iter_dir == IterationDirection::Forwards) as i32 * 2; - self.stack - .set_cell_index(leftmost_matching_cell as i32 + index_change); + self.stack.set_cell_index(leftmost_matching_cell as i32); let mem_page = self.read_page(left_child_page as usize)?; self.stack.push(mem_page); self.move_to_state = CursorMoveToState::ContinueLoop; From 8941c4a53713280193a9de5f261cdd967f3a320c Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 08:50:48 +0300 Subject: [PATCH 13/42] fmt --- core/storage/btree.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 93cd8bd5c..bda627ca0 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4013,11 +4013,10 @@ impl BTreeCursor { )?; if page_type.is_table() { let BTreeCell::TableLeafCell(TableLeafCell { - _rowid, - _payload, - .. - }) = cell else { - unreachable!("unexpected page_type"); + _rowid, _payload, .. + }) = cell + else { + unreachable!("unexpected page_type"); }; Ok(CursorResult::Ok(Some(_rowid))) } else { @@ -4378,10 +4377,9 @@ impl BTreeCursor { }; DeleteSavepoint::Payload(record) } else { - let Some(rowid) = return_if_io!(self.rowid()) - else { - panic!("cursor should be pointing to a record with a rowid"); - }; + let Some(rowid) = return_if_io!(self.rowid()) else { + panic!("cursor should be pointing to a record with a rowid"); + }; DeleteSavepoint::Rowid(rowid) }; let delete_info = self.state.mut_delete_info().unwrap(); From 58e1a2f5bc1f1869f200a1d1c059ebd1a7e6ca28 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 08:51:20 +0300 Subject: [PATCH 14/42] Remove unnecessary self.prev() from last() --- core/storage/btree.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index bda627ca0..65025a7d2 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1400,7 +1400,7 @@ impl BTreeCursor { } /// Move the cursor to the rightmost record in the btree. - fn move_to_rightmost(&mut self) -> Result> { + fn move_to_rightmost(&mut self) -> Result> { self.move_to_root(); loop { @@ -1413,8 +1413,9 @@ impl BTreeCursor { if contents.is_leaf() { if contents.cell_count() > 0 { self.stack.set_cell_index(contents.cell_count() as i32 - 1); + return Ok(CursorResult::Ok(true)); } - return Ok(CursorResult::Ok(())); + return Ok(CursorResult::Ok(false)); } match contents.rightmost_pointer() { @@ -3952,13 +3953,10 @@ impl BTreeCursor { pub fn last(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); - match self.move_to_rightmost()? { - CursorResult::Ok(_) => { - let _ = self.prev(); - Ok(CursorResult::Ok(())) - } - CursorResult::IO => Ok(CursorResult::IO), - } + let cursor_has_record = return_if_io!(self.move_to_rightmost()); + self.has_record.replace(cursor_has_record); + self.invalidate_record(); + Ok(CursorResult::Ok(())) } pub fn next(&mut self) -> Result> { From a3ffc6f4e26e9b1ac6d8deb3158b89bdab04e20d Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 08:51:35 +0300 Subject: [PATCH 15/42] Align prev() implementation with next() --- core/storage/btree.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 65025a7d2..232955ef8 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -3977,14 +3977,10 @@ impl BTreeCursor { pub fn prev(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); return_if_io!(self.restore_context()); - match self.get_prev_record()? { - CursorResult::Ok(cursor_has_record) => { - self.has_record.replace(cursor_has_record); - self.invalidate_record(); - Ok(CursorResult::Ok(cursor_has_record)) - } - CursorResult::IO => Ok(CursorResult::IO), - } + let cursor_has_record = return_if_io!(self.get_prev_record()); + self.has_record.replace(cursor_has_record); + self.invalidate_record(); + Ok(CursorResult::Ok(cursor_has_record)) } pub fn rowid(&self) -> Result>> { From 8ad6aadbbd8d9b7754d7c895d69b65f75abd6c8b Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 09:35:47 +0300 Subject: [PATCH 16/42] remove unnecessary SeekingIndexMoveUp state --- core/storage/btree.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 232955ef8..d4add2388 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -411,9 +411,6 @@ enum CursorSeekState { /// This value is only used for indexbtree not_found_leaf: bool, }, - /// In case value was not found in a leaf page, then we need to find in upper layer. This case - /// requires I/O for moving and for loading payload if it overflows so let's store it. - SeekingIndexMoveUp, } // These functions below exist to avoid problems with the borrow checker @@ -1859,7 +1856,7 @@ impl BTreeCursor { nearest_matching_cell, not_found_leaf: false, }; - } else if let CursorSeekState::SeekingIndexMoveUp = self.seek_state { + } else if self.seek_state.get_not_found_leaf() { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let page = page.get(); @@ -1890,6 +1887,7 @@ impl BTreeCursor { )? }; let (_, found) = self.compare_with_current_record(key, seek_op); + self.seek_state.set_not_found_leaf(false); return Ok(CursorResult::Ok(found)); } @@ -1935,7 +1933,6 @@ impl BTreeCursor { return Ok(CursorResult::Ok(false)); } // FIXME: optimize this in case record can be read directly - self.seek_state = CursorSeekState::SeekingIndexMoveUp; return Ok(CursorResult::IO); } IterationDirection::Backwards => { @@ -1948,7 +1945,6 @@ impl BTreeCursor { return Ok(CursorResult::Ok(false)); } // FIXME: optimize this in case record can be read directly - self.seek_state = CursorSeekState::SeekingIndexMoveUp; return Ok(CursorResult::IO); } } From cba84b7ce94fc3f9070ace679a12585c564a51bb Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 09:36:29 +0300 Subject: [PATCH 17/42] Remove premature cast to usize (cell_idx can be negative) --- core/storage/btree.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index d4add2388..71c1dc541 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -738,13 +738,7 @@ impl BTreeCursor { let contents = page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); - let cell_idx = self.stack.current_cell_index() as usize; - // dbg!( - // page.get().id, - // self.stack.current_cell_index(), - // self.going_upwards, - // page.get_contents().cell_count() - // ); + let cell_idx = self.stack.current_cell_index(); // If we are at the end of the page and we haven't just come back from the right child, // we now need to move to the rightmost child. @@ -756,7 +750,7 @@ impl BTreeCursor { continue; } } - if cell_idx >= cell_count { + if cell_idx >= cell_count as i32 { self.stack.set_cell_index(cell_count as i32 - 1); } else { let is_index = page.is_index(); From ae6a943e43b6101119fef61a831a9803e2ee2f78 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 10:31:45 +0300 Subject: [PATCH 18/42] Leave parent pointing at rightmost pointer at the end of balance_root() --- core/storage/btree.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 71c1dc541..efc136dac 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -3803,6 +3803,7 @@ impl BTreeCursor { self.root_page = root.get().id; self.stack.clear(); self.stack.push(root_btree.clone()); + self.stack.set_cell_index(0); // leave parent pointing at the rightmost pointer (in this case 0, as there are no cells), since we will be balancing the rightmost child page. self.stack.push(child_btree.clone()); } From 2bac140d73193b9fc34df39a9ece83889442c7cf Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 10:56:47 +0300 Subject: [PATCH 19/42] Remove SeekOp::EQ and encode eq_only in LE&GE - needed for iteration direction aware equality seeks --- core/storage/btree.rs | 71 +++++++++++++++++++-------------- core/translate/main_loop.rs | 18 ++++----- core/translate/optimizer/mod.rs | 67 +++++++++++++++++++++++-------- core/types.rs | 29 ++++++++++---- core/vdbe/execute.rs | 37 +++++++++++------ core/vdbe/explain.rs | 2 + core/vdbe/insn.rs | 2 + core/vdbe/mod.rs | 2 +- 8 files changed, 150 insertions(+), 78 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index efc136dac..c7db87114 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1508,10 +1508,9 @@ impl BTreeCursor { // EQ | < | go right | Last = key is in right subtree let is_on_left = match seek_op { SeekOp::GT => cell_rowid > rowid, - SeekOp::GE => cell_rowid >= rowid, - SeekOp::LE => cell_rowid >= rowid, + SeekOp::GE { .. } => cell_rowid >= rowid, + SeekOp::LE { .. } => cell_rowid >= rowid, SeekOp::LT => cell_rowid + 1 >= rowid, - SeekOp::EQ => cell_rowid >= rowid, }; if is_on_left { self.move_to_state @@ -1689,9 +1688,8 @@ impl BTreeCursor { match cmp { SeekOp::GT => interior_cell_vs_index_key.is_gt(), - SeekOp::GE => interior_cell_vs_index_key.is_ge(), - SeekOp::EQ => interior_cell_vs_index_key.is_ge(), - SeekOp::LE => interior_cell_vs_index_key.is_gt(), + SeekOp::GE { .. } => interior_cell_vs_index_key.is_ge(), + SeekOp::LE { .. } => interior_cell_vs_index_key.is_gt(), SeekOp::LT => interior_cell_vs_index_key.is_ge(), } }; @@ -1769,14 +1767,15 @@ impl BTreeCursor { let found = match seek_op { SeekOp::GT => cmp.is_gt(), - SeekOp::GE => cmp.is_ge(), - SeekOp::EQ => cmp.is_eq(), - SeekOp::LE => cmp.is_le(), + SeekOp::GE { eq_only: true } => cmp.is_eq(), + SeekOp::GE { eq_only: false } => cmp.is_ge(), + SeekOp::LE { eq_only: true } => cmp.is_eq(), + SeekOp::LE { eq_only: false } => cmp.is_le(), SeekOp::LT => cmp.is_lt(), }; // rowids are unique, so we can return the rowid immediately - if found && SeekOp::EQ == seek_op { + if found && seek_op.eq_only() { self.stack.set_cell_index(cur_cell_idx as i32); return Ok(CursorResult::Ok(true)); } @@ -2022,9 +2021,10 @@ impl BTreeCursor { }; let found = match seek_op { SeekOp::GT => cmp.is_gt(), - SeekOp::GE => cmp.is_ge(), - SeekOp::EQ => cmp.is_eq(), - SeekOp::LE => cmp.is_le(), + SeekOp::GE { eq_only: true } => cmp.is_eq(), + SeekOp::GE { eq_only: false } => cmp.is_ge(), + SeekOp::LE { eq_only: true } => cmp.is_eq(), + SeekOp::LE { eq_only: false } => cmp.is_le(), SeekOp::LT => cmp.is_lt(), }; (cmp, found) @@ -4119,12 +4119,16 @@ impl BTreeCursor { if !moved_before && !matches!(self.state, CursorState::Write(..)) { match key { BTreeKey::IndexKey(_) => { - return_if_io!( - self.seek(SeekKey::IndexKey(key.get_record().unwrap()), SeekOp::GE) - ) + return_if_io!(self.seek( + SeekKey::IndexKey(key.get_record().unwrap()), + SeekOp::GE { eq_only: true } + )) } BTreeKey::TableRowId(_) => { - return_if_io!(self.seek(SeekKey::TableRowId(key.to_rowid()), SeekOp::EQ)) + return_if_io!(self.seek( + SeekKey::TableRowId(key.to_rowid()), + SeekOp::GE { eq_only: true } + )) } }; } @@ -4430,7 +4434,7 @@ impl BTreeCursor { SeekKey::IndexKey(immutable_record) } }; - return_if_io!(self.seek(key, SeekOp::EQ)); + return_if_io!(self.seek(key, SeekOp::GE { eq_only: true })); self.state = CursorState::None; return Ok(CursorResult::Ok(())); @@ -4454,7 +4458,7 @@ impl BTreeCursor { /// Search for a key in an Index Btree. Looking up indexes that need to be unique, we cannot compare the rowid pub fn key_exists_in_index(&mut self, key: &ImmutableRecord) -> Result> { - return_if_io!(self.seek(SeekKey::IndexKey(key), SeekOp::GE)); + return_if_io!(self.seek(SeekKey::IndexKey(key), SeekOp::GE { eq_only: true })); let record_opt = return_if_io!(self.record()); match record_opt.as_ref() { @@ -4488,7 +4492,9 @@ impl BTreeCursor { Value::Integer(i) => i, _ => unreachable!("btree tables are indexed by integers!"), }; - let _ = return_if_io!(self.move_to(SeekKey::TableRowId(*int_key), SeekOp::EQ)); + let _ = return_if_io!( + self.move_to(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true }) + ); let page = self.stack.top(); // TODO(pere): request load return_if_locked_maybe_load!(self.pager, page); @@ -5005,7 +5011,7 @@ impl BTreeCursor { CursorContext::TableRowId(rowid) => SeekKey::TableRowId(rowid), CursorContext::IndexKeyRowId(ref record) => SeekKey::IndexKey(record), }; - let res = self.seek(seek_key, SeekOp::EQ)?; + let res = self.seek(seek_key, SeekOp::GE { eq_only: true })?; match res { CursorResult::Ok(_) => { self.valid_state = CursorValidState::Valid; @@ -6504,7 +6510,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(*key); - cursor.move_to(key, SeekOp::EQ) + cursor.move_to(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -6528,7 +6534,7 @@ mod tests { let seek_key = SeekKey::TableRowId(*key); assert!( matches!( - cursor.seek(seek_key, SeekOp::EQ).unwrap(), + cursor.seek(seek_key, SeekOp::GE { eq_only: true }).unwrap(), CursorResult::Ok(true) ), "key {} is not found", @@ -6598,7 +6604,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(key); - cursor.move_to(key, SeekOp::EQ) + cursor.move_to(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -7774,7 +7780,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(i); - cursor.move_to(key, SeekOp::EQ) + cursor.move_to(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -7851,7 +7857,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(i); - cursor.move_to(key, SeekOp::EQ) + cursor.move_to(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -7874,8 +7880,11 @@ mod tests { let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page); let seek_key = SeekKey::TableRowId(i); - let found = run_until_done(|| cursor.seek(seek_key.clone(), SeekOp::EQ), pager.deref()) - .unwrap(); + let found = run_until_done( + || cursor.seek(seek_key.clone(), SeekOp::GE { eq_only: true }), + pager.deref(), + ) + .unwrap(); if found { run_until_done(|| cursor.delete(), pager.deref()).unwrap(); @@ -7932,7 +7941,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(i as i64); - cursor.move_to(key, SeekOp::EQ) + cursor.move_to(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -7974,7 +7983,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(1); - cursor.move_to(key, SeekOp::EQ) + cursor.move_to(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -8050,7 +8059,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(1); - cursor.move_to(key, SeekOp::EQ) + cursor.move_to(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index c7f789ca5..8a35bb50b 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -1182,12 +1182,13 @@ fn emit_seek( seek.len }; match seek.op { - SeekOp::GE => program.emit_insn(Insn::SeekGE { + SeekOp::GE { eq_only } => program.emit_insn(Insn::SeekGE { is_index, cursor_id: seek_cursor_id, start_reg, num_regs, target_pc: loop_end, + eq_only, }), SeekOp::GT => program.emit_insn(Insn::SeekGT { is_index, @@ -1196,12 +1197,13 @@ fn emit_seek( num_regs, target_pc: loop_end, }), - SeekOp::LE => program.emit_insn(Insn::SeekLE { + SeekOp::LE { eq_only } => program.emit_insn(Insn::SeekLE { is_index, cursor_id: seek_cursor_id, start_reg, num_regs, target_pc: loop_end, + eq_only, }), SeekOp::LT => program.emit_insn(Insn::SeekLT { is_index, @@ -1210,7 +1212,6 @@ fn emit_seek( num_regs, target_pc: loop_end, }), - SeekOp::EQ => panic!("An index seek is never EQ"), }; Ok(()) @@ -1293,7 +1294,7 @@ fn emit_seek_termination( } match (is_index, termination.op) { - (true, SeekOp::GE) => program.emit_insn(Insn::IdxGE { + (true, SeekOp::GE { .. }) => program.emit_insn(Insn::IdxGE { cursor_id: seek_cursor_id, start_reg, num_regs, @@ -1305,7 +1306,7 @@ fn emit_seek_termination( num_regs, target_pc: loop_end, }), - (true, SeekOp::LE) => program.emit_insn(Insn::IdxLE { + (true, SeekOp::LE { .. }) => program.emit_insn(Insn::IdxLE { cursor_id: seek_cursor_id, start_reg, num_regs, @@ -1317,7 +1318,7 @@ fn emit_seek_termination( num_regs, target_pc: loop_end, }), - (false, SeekOp::GE) => program.emit_insn(Insn::Ge { + (false, SeekOp::GE { .. }) => program.emit_insn(Insn::Ge { lhs: rowid_reg.unwrap(), rhs: start_reg, target_pc: loop_end, @@ -1331,7 +1332,7 @@ fn emit_seek_termination( flags: CmpInsFlags::default(), collation: program.curr_collation(), }), - (false, SeekOp::LE) => program.emit_insn(Insn::Le { + (false, SeekOp::LE { .. }) => program.emit_insn(Insn::Le { lhs: rowid_reg.unwrap(), rhs: start_reg, target_pc: loop_end, @@ -1345,9 +1346,6 @@ fn emit_seek_termination( flags: CmpInsFlags::default(), collation: program.curr_collation(), }), - (_, SeekOp::EQ) => { - panic!("An index termination condition is never EQ") - } }; Ok(()) diff --git a/core/translate/optimizer/mod.rs b/core/translate/optimizer/mod.rs index a054bac3e..38f1d3d73 100644 --- a/core/translate/optimizer/mod.rs +++ b/core/translate/optimizer/mod.rs @@ -881,7 +881,7 @@ fn build_seek_def( seek: Some(SeekKey { len: key_len, null_pad: false, - op: SeekOp::GE, + op: SeekOp::GE { eq_only: true }, }), termination: Some(TerminationKey { len: key_len, @@ -905,8 +905,8 @@ fn build_seek_def( ( key_len - 1, key_len, - SeekOp::LE.reverse(), - SeekOp::LE.reverse(), + SeekOp::LE { eq_only: false }.reverse(), + SeekOp::LE { eq_only: false }.reverse(), ) }; SeekDef { @@ -943,12 +943,17 @@ fn build_seek_def( (IterationDirection::Forwards, ast::Operator::GreaterEquals) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len, key_len - 1, SeekOp::GE, SeekOp::GT) + ( + key_len, + key_len - 1, + SeekOp::GE { eq_only: false }, + SeekOp::GT, + ) } else { ( key_len - 1, key_len, - SeekOp::LE.reverse(), + SeekOp::LE { eq_only: false }.reverse(), SeekOp::LT.reverse(), ) }; @@ -986,9 +991,19 @@ fn build_seek_def( (IterationDirection::Forwards, ast::Operator::Less) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len - 1, key_len, SeekOp::GT, SeekOp::GE) + ( + key_len - 1, + key_len, + SeekOp::GT, + SeekOp::GE { eq_only: false }, + ) } else { - (key_len, key_len - 1, SeekOp::GT, SeekOp::GE) + ( + key_len, + key_len - 1, + SeekOp::GT, + SeekOp::GE { eq_only: false }, + ) }; SeekDef { key, @@ -1029,8 +1044,8 @@ fn build_seek_def( ( key_len, key_len - 1, - SeekOp::LE.reverse(), - SeekOp::LE.reverse(), + SeekOp::LE { eq_only: false }.reverse(), + SeekOp::LE { eq_only: false }.reverse(), ) }; SeekDef { @@ -1066,7 +1081,7 @@ fn build_seek_def( iter_dir, seek: Some(SeekKey { len: key_len, - op: SeekOp::LE, + op: SeekOp::LE { eq_only: true }, null_pad: false, }), termination: Some(TerminationKey { @@ -1086,13 +1101,18 @@ fn build_seek_def( (IterationDirection::Backwards, ast::Operator::Less) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len, key_len - 1, SeekOp::LT, SeekOp::LE) + ( + key_len, + key_len - 1, + SeekOp::LT, + SeekOp::LE { eq_only: false }, + ) } else { ( key_len - 1, key_len, SeekOp::GT.reverse(), - SeekOp::GE.reverse(), + SeekOp::GE { eq_only: false }.reverse(), ) }; SeekDef { @@ -1129,7 +1149,12 @@ fn build_seek_def( (IterationDirection::Backwards, ast::Operator::LessEquals) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len, key_len - 1, SeekOp::LE, SeekOp::LE) + ( + key_len, + key_len - 1, + SeekOp::LE { eq_only: false }, + SeekOp::LE { eq_only: false }, + ) } else { ( key_len - 1, @@ -1172,7 +1197,12 @@ fn build_seek_def( (IterationDirection::Backwards, ast::Operator::Greater) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len - 1, key_len, SeekOp::LE, SeekOp::LE) + ( + key_len - 1, + key_len, + SeekOp::LE { eq_only: false }, + SeekOp::LE { eq_only: false }, + ) } else { ( key_len, @@ -1215,12 +1245,17 @@ fn build_seek_def( (IterationDirection::Backwards, ast::Operator::GreaterEquals) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len - 1, key_len, SeekOp::LE, SeekOp::LT) + ( + key_len - 1, + key_len, + SeekOp::LE { eq_only: false }, + SeekOp::LT, + ) } else { ( key_len, key_len - 1, - SeekOp::GE.reverse(), + SeekOp::GE { eq_only: false }.reverse(), SeekOp::GT.reverse(), ) }; diff --git a/core/types.rs b/core/types.rs index 0a5770d4f..8498c6db1 100644 --- a/core/types.rs +++ b/core/types.rs @@ -1492,10 +1492,17 @@ pub enum CursorResult { #[derive(Clone, Copy, PartialEq, Eq, Debug)] /// The match condition of a table/index seek. pub enum SeekOp { - EQ, - GE, + /// If eq_only is true, this means in practice: + /// We are iterating forwards, but we are really looking for an exact match on the seek key. + GE { + eq_only: bool, + }, GT, - LE, + /// If eq_only is true, this means in practice: + /// We are iterating backwards, but we are really looking for an exact match on the seek key. + LE { + eq_only: bool, + }, LT, } @@ -1512,17 +1519,23 @@ impl SeekOp { #[inline(always)] pub fn iteration_direction(&self) -> IterationDirection { match self { - SeekOp::EQ | SeekOp::GE | SeekOp::GT => IterationDirection::Forwards, - SeekOp::LE | SeekOp::LT => IterationDirection::Backwards, + SeekOp::GE { .. } | SeekOp::GT => IterationDirection::Forwards, + SeekOp::LE { .. } | SeekOp::LT => IterationDirection::Backwards, + } + } + + pub fn eq_only(&self) -> bool { + match self { + SeekOp::GE { eq_only } | SeekOp::LE { eq_only } => *eq_only, + _ => false, } } pub fn reverse(&self) -> Self { match self { - SeekOp::EQ => SeekOp::EQ, - SeekOp::GE => SeekOp::LE, + SeekOp::GE { eq_only } => SeekOp::LE { eq_only: *eq_only }, SeekOp::GT => SeekOp::LT, - SeekOp::LE => SeekOp::GE, + SeekOp::LE { eq_only } => SeekOp::GE { eq_only: *eq_only }, SeekOp::LT => SeekOp::GT, } } diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index c2180806b..34cdff7be 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -1352,7 +1352,10 @@ pub fn op_column( }; let mut table_cursor = state.get_cursor(table_cursor_id); let table_cursor = table_cursor.as_btree_mut(); - match table_cursor.seek(SeekKey::TableRowId(rowid.unwrap()), SeekOp::EQ)? { + match table_cursor.seek( + SeekKey::TableRowId(rowid.unwrap()), + SeekOp::GE { eq_only: true }, + )? { CursorResult::Ok(_) => None, CursorResult::IO => Some((index_cursor_id, table_cursor_id)), } @@ -1914,7 +1917,7 @@ pub fn op_row_id( }; let mut table_cursor = state.get_cursor(table_cursor_id); let table_cursor = table_cursor.as_btree_mut(); - match table_cursor.seek(SeekKey::TableRowId(rowid), SeekOp::EQ)? { + match table_cursor.seek(SeekKey::TableRowId(rowid), SeekOp::GE { eq_only: true })? { CursorResult::Ok(_) => None, CursorResult::IO => Some((index_cursor_id, table_cursor_id)), } @@ -2000,7 +2003,9 @@ pub fn op_seek_rowid( }; match rowid { Some(rowid) => { - let found = return_if_io!(cursor.seek(SeekKey::TableRowId(rowid), SeekOp::EQ)); + let found = return_if_io!( + cursor.seek(SeekKey::TableRowId(rowid), SeekOp::GE { eq_only: true }) + ); if !found { target_pc.to_offset_int() } else { @@ -2046,6 +2051,7 @@ pub fn op_seek( num_regs, target_pc, is_index, + .. } | Insn::SeekGT { cursor_id, @@ -2060,6 +2066,7 @@ pub fn op_seek( num_regs, target_pc, is_index, + .. } | Insn::SeekLT { cursor_id, @@ -2076,19 +2083,22 @@ pub fn op_seek( "target_pc should be an offset, is: {:?}", target_pc ); + let eq_only = match insn { + Insn::SeekGE { eq_only, .. } | Insn::SeekLE { eq_only, .. } => *eq_only, + _ => false, + }; let op = match insn { - Insn::SeekGE { .. } => SeekOp::GE, + Insn::SeekGE { eq_only, .. } => SeekOp::GE { eq_only: *eq_only }, Insn::SeekGT { .. } => SeekOp::GT, - Insn::SeekLE { .. } => SeekOp::LE, + Insn::SeekLE { eq_only, .. } => SeekOp::LE { eq_only: *eq_only }, Insn::SeekLT { .. } => SeekOp::LT, _ => unreachable!("unexpected Insn {:?}", insn), }; let op_name = match op { - SeekOp::GE => "SeekGE", + SeekOp::GE { .. } => "SeekGE", SeekOp::GT => "SeekGT", - SeekOp::LE => "SeekLE", + SeekOp::LE { .. } => "SeekLE", SeekOp::LT => "SeekLT", - _ => unreachable!("unexpected SeekOp {:?}", op), }; if *is_index { let found = { @@ -3928,7 +3938,9 @@ pub fn op_idx_delete( { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); - return_if_io!(cursor.seek(SeekKey::IndexKey(&record), SeekOp::EQ)); + return_if_io!( + cursor.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true }) + ); tracing::debug!( "op_idx_delete(seek={}, record={} rowid={:?})", &record, @@ -4151,7 +4163,8 @@ pub fn op_no_conflict( return Ok(InsnFunctionStepResult::Step); } - let conflict = return_if_io!(cursor.seek(SeekKey::IndexKey(record), SeekOp::EQ)); + let conflict = + return_if_io!(cursor.seek(SeekKey::IndexKey(record), SeekOp::GE { eq_only: true })); drop(cursor_ref); if !conflict { state.pc = target_pc.to_offset_int(); @@ -4925,10 +4938,10 @@ pub fn op_found( } }; - return_if_io!(cursor.seek(SeekKey::IndexKey(&record), SeekOp::EQ)) + return_if_io!(cursor.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true })) } else { let record = make_record(&state.registers, record_reg, num_regs); - return_if_io!(cursor.seek(SeekKey::IndexKey(&record), SeekOp::EQ)) + return_if_io!(cursor.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true })) } }; diff --git a/core/vdbe/explain.rs b/core/vdbe/explain.rs index 9aa6588c6..fa6a2a70e 100644 --- a/core/vdbe/explain.rs +++ b/core/vdbe/explain.rs @@ -809,6 +809,7 @@ pub fn insn_to_str( start_reg, num_regs, target_pc, + .. } | Insn::SeekLE { is_index: _, @@ -816,6 +817,7 @@ pub fn insn_to_str( start_reg, num_regs, target_pc, + .. } | Insn::SeekLT { is_index: _, diff --git a/core/vdbe/insn.rs b/core/vdbe/insn.rs index 6eea43318..23b9da480 100644 --- a/core/vdbe/insn.rs +++ b/core/vdbe/insn.rs @@ -502,6 +502,7 @@ pub enum Insn { start_reg: usize, num_regs: usize, target_pc: BranchOffset, + eq_only: bool, }, /// If cursor_id refers to an SQL table (B-Tree that uses integer keys), use the value in start_reg as the key. @@ -538,6 +539,7 @@ pub enum Insn { start_reg: usize, num_regs: usize, target_pc: BranchOffset, + eq_only: bool, }, // If cursor_id refers to an SQL table (B-Tree that uses integer keys), use the value in start_reg as the key. diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 16e5a1117..a53b9c755 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -509,7 +509,7 @@ fn get_new_rowid(cursor: &mut BTreeCursor, mut rng: R) -> Result break, // Found a non-existing rowid CursorResult::Ok(true) => { if count == max_attempts - 1 { From 0b7f5a2a1396949711daed551a52904c8955b793 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 13:01:01 +0300 Subject: [PATCH 20/42] Merge MoveTo&Seek states, remove unnecessary seekstate methods, add eq_seen flag to prevent unnecessary next()/prev() --- core/storage/btree.rs | 481 +++++++++++++++++++----------------------- 1 file changed, 215 insertions(+), 266 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index c7db87114..6af7c2d79 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -399,172 +399,36 @@ enum CursorValidState { #[derive(Debug)] /// State used for seeking -enum CursorSeekState { +pub enum CursorSeekState { Start, - Seeking { - /// Min cell_idx - min: isize, - /// Max cell_idx - max: isize, - nearest_matching_cell: Option, + MovingBetweenPages { + eq_seen: Cell, + }, + InteriorPageBinarySearch { + min_cell_idx: Cell, + max_cell_idx: Cell, + nearest_matching_cell: Cell>, + eq_seen: Cell, + }, + FoundLeaf { + eq_seen: Cell, + }, + LeafPageBinarySearch { + min_cell_idx: Cell, + max_cell_idx: Cell, + nearest_matching_cell: Cell>, + /// Indicates if we have seen an exact match during the downwards traversal of the btree. + /// This is only needed in index seeks, in cases where we need to determine whether we call + /// an additional next()/prev() to fetch a matching record from an interior node. We will not + /// do that if we haven't seen an EQ during the traversal, because that would leave the cursor + /// pointing to the wrong record. + eq_seen: Cell, /// Indicates when we have not not found a value in leaf and now will look in the next/prev record. /// This value is only used for indexbtree - not_found_leaf: bool, + moving_up_to_parent: Cell, }, } -// These functions below exist to avoid problems with the borrow checker -impl CursorSeekState { - /// method can only be called on CursorSeekState::Seeking - fn set_nearest_matching_cell(&mut self, matching_cell: Option) { - let CursorSeekState::Seeking { - nearest_matching_cell, - .. - } = self - else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *nearest_matching_cell = matching_cell; - } - - /// method can only be called on CursorSeekState::Seeking - fn get_nearest_matching_cell(&mut self) -> Option { - let CursorSeekState::Seeking { - nearest_matching_cell, - .. - } = self - else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - nearest_matching_cell.clone() - } - - /// method can only be called on CursorSeekState::Seeking - fn set_max(&mut self, max: isize) { - let CursorSeekState::Seeking { max: max_state, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *max_state = max; - } - - /// method can only be called on CursorSeekState::Seeking - fn get_max(&mut self) -> isize { - let CursorSeekState::Seeking { max, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *max - } - - /// method can only be called on CursorSeekState::Seeking - fn set_min(&mut self, min: isize) { - let CursorSeekState::Seeking { min: min_state, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *min_state = min; - } - - /// method can only be called on CursorSeekState::Seeking - fn get_min(&mut self) -> isize { - let CursorSeekState::Seeking { min, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *min - } - - /// method can only be called on CursorSeekState::Seeking - fn set_not_found_leaf(&mut self, not_found: bool) { - let CursorSeekState::Seeking { not_found_leaf, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *not_found_leaf = not_found; - } - - /// method can only be called on CursorSeekState::Seeking - fn get_not_found_leaf(&mut self) -> bool { - let CursorSeekState::Seeking { not_found_leaf, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *not_found_leaf - } -} - -#[derive(Debug)] -/// State used for seeking -enum CursorMoveToState { - Start, - ContinueLoop, - Seeking { - /// Min cell_idx - min: isize, - /// Max cell_idx - max: isize, - nearest_matching_cell: Option, - }, -} - -// These functions below exist to avoid problems with the borrow checker -impl CursorMoveToState { - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn set_nearest_matching_cell(&mut self, matching_cell: Option) { - match self { - CursorMoveToState::Seeking { - nearest_matching_cell, - .. - } => *nearest_matching_cell = matching_cell, - _ => unreachable!("method can only be called on CursorMoveToState::Seeking"), - }; - } - - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn get_nearest_matching_cell(&mut self) -> Option { - match self { - CursorMoveToState::Seeking { - nearest_matching_cell, - .. - } => nearest_matching_cell.clone(), - _ => unreachable!("method can only be called on CursorMoveToState::Seeking"), - } - } - - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn set_max(&mut self, max: isize) { - match self { - CursorMoveToState::Seeking { max: max_state, .. } => *max_state = max, - _ => unreachable!("method can only be called on CursorMoveToState::Seeking"), - } - } - - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn get_max(&mut self) -> isize { - match self { - CursorMoveToState::Seeking { max: max_state, .. } => *max_state, - _ => unreachable!("method can only be called on CursorMoveToState::Seeking"), - } - } - - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn set_min(&mut self, min: isize) { - match self { - CursorMoveToState::Seeking { min: min_state, .. } => *min_state = min, - _ => unreachable!("method can only be called on CursorMoveToState::Seeking "), - } - } - - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn get_min(&mut self) -> isize { - match self { - CursorMoveToState::Seeking { min: min_state, .. } => *min_state, - _ => unreachable!("method can only be called on CursorMoveToState::Seeking"), - } - } -} - #[derive(Debug)] struct FindCellState(Option); @@ -623,7 +487,6 @@ pub struct BTreeCursor { /// This Vec should be empty for Table Btree collations: Vec, seek_state: CursorSeekState, - move_to_state: CursorMoveToState, /// Separate state to read a record with overflow pages. This separation from `state` is necessary as /// we can be in a function that relies on `state`, but also needs to process overflow pages read_overflow_state: RefCell>, @@ -659,7 +522,6 @@ impl BTreeCursor { valid_state: CursorValidState::Valid, collations, seek_state: CursorSeekState::Start, - move_to_state: CursorMoveToState::Start, read_overflow_state: RefCell::new(None), find_cell_state: FindCellState(None), parse_record_state: RefCell::new(ParseRecordState::Init), @@ -1427,45 +1289,64 @@ impl BTreeCursor { /// Specialized version of move_to() for table btrees. #[instrument(skip(self), level = Level::TRACE)] fn tablebtree_move_to(&mut self, rowid: i64, seek_op: SeekOp) -> Result> { - let iter_dir = seek_op.iteration_direction(); 'outer: loop { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { + self.seek_state = CursorSeekState::FoundLeaf { + eq_seen: Cell::new(false), + }; return Ok(CursorResult::Ok(())); } let cell_count = contents.cell_count(); if matches!( - self.move_to_state, - CursorMoveToState::Start | CursorMoveToState::ContinueLoop { .. } + self.seek_state, + CursorSeekState::Start | CursorSeekState::MovingBetweenPages { .. } ) { - let min: isize = 0; - let max: isize = cell_count as isize - 1; - let leftmost_matching_cell = None; + let eq_seen = match &self.seek_state { + CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(), + _ => false, + }; + let min_cell_idx = Cell::new(0); + let max_cell_idx = Cell::new(cell_count as isize - 1); + let nearest_matching_cell = Cell::new(None); - self.move_to_state = CursorMoveToState::Seeking { - min, - max, - nearest_matching_cell: leftmost_matching_cell, + self.seek_state = CursorSeekState::InteriorPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + eq_seen: Cell::new(eq_seen), }; } + + let CursorSeekState::InteriorPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + eq_seen, + .. + } = &self.seek_state + else { + unreachable!("we must be in an interior binary search state"); + }; + loop { - let min = self.move_to_state.get_min(); - let max = self.move_to_state.get_max(); + let min = min_cell_idx.get(); + let max = max_cell_idx.get(); if min > max { - if let Some(leftmost_matching_cell) = - self.move_to_state.get_nearest_matching_cell() - { + if let Some(nearest_matching_cell) = nearest_matching_cell.get() { let left_child_page = contents.cell_table_interior_read_left_child_page( - leftmost_matching_cell as usize, + nearest_matching_cell as usize, )?; - self.stack.set_cell_index(leftmost_matching_cell as i32); + self.stack.set_cell_index(nearest_matching_cell as i32); let mem_page = self.read_page(left_child_page as usize)?; self.stack.push(mem_page); - self.move_to_state = CursorMoveToState::ContinueLoop; + self.seek_state = CursorSeekState::MovingBetweenPages { + eq_seen: Cell::new(eq_seen.get()), + }; continue 'outer; } self.stack.set_cell_index(cell_count as i32 + 1); @@ -1473,7 +1354,9 @@ impl BTreeCursor { Some(right_most_pointer) => { let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); - self.move_to_state = CursorMoveToState::ContinueLoop; + self.seek_state = CursorSeekState::MovingBetweenPages { + eq_seen: Cell::new(eq_seen.get()), + }; continue 'outer; } None => { @@ -1513,11 +1396,10 @@ impl BTreeCursor { SeekOp::LT => cell_rowid + 1 >= rowid, }; if is_on_left { - self.move_to_state - .set_nearest_matching_cell(Some(cur_cell_idx as usize)); - self.move_to_state.set_max(cur_cell_idx - 1); + nearest_matching_cell.set(Some(cur_cell_idx as usize)); + max_cell_idx.set(cur_cell_idx - 1); } else { - self.move_to_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } } } @@ -1537,38 +1419,60 @@ impl BTreeCursor { let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { + let eq_seen = match &self.seek_state { + CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(), + _ => false, + }; + self.seek_state = CursorSeekState::FoundLeaf { + eq_seen: Cell::new(eq_seen), + }; return Ok(CursorResult::Ok(())); } if matches!( - self.move_to_state, - CursorMoveToState::Start | CursorMoveToState::ContinueLoop + self.seek_state, + CursorSeekState::Start | CursorSeekState::MovingBetweenPages { .. } ) { + let eq_seen = match &self.seek_state { + CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(), + _ => false, + }; let cell_count = contents.cell_count(); - let min: isize = 0; - let max: isize = cell_count as isize - 1; - let leftmost_matching_cell = None; + let min_cell_idx = Cell::new(0); + let max_cell_idx = Cell::new(cell_count as isize - 1); + let nearest_matching_cell = Cell::new(None); - self.move_to_state = CursorMoveToState::Seeking { - min, - max, - nearest_matching_cell: leftmost_matching_cell, + self.seek_state = CursorSeekState::InteriorPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + eq_seen: Cell::new(eq_seen), }; } + let CursorSeekState::InteriorPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + eq_seen, + } = &self.seek_state + else { + unreachable!("we must be in an interior binary search state"); + }; + loop { - let min = self.move_to_state.get_min(); - let max = self.move_to_state.get_max(); + let min = min_cell_idx.get(); + let max = max_cell_idx.get(); if min > max { - let Some(leftmost_matching_cell) = - self.move_to_state.get_nearest_matching_cell() - else { + let Some(leftmost_matching_cell) = nearest_matching_cell.get() else { self.stack.set_cell_index(contents.cell_count() as i32 + 1); match contents.rightmost_pointer() { Some(right_most_pointer) => { let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); - self.move_to_state = CursorMoveToState::ContinueLoop; + self.seek_state = CursorSeekState::MovingBetweenPages { + eq_seen: Cell::new(eq_seen.get()), + }; continue 'outer; } None => { @@ -1609,7 +1513,9 @@ impl BTreeCursor { let mem_page = self.read_page(*left_child_page as usize)?; self.stack.push(mem_page); - self.move_to_state = CursorMoveToState::ContinueLoop; + self.seek_state = CursorSeekState::MovingBetweenPages { + eq_seen: Cell::new(eq_seen.get()), + }; continue 'outer; } @@ -1645,7 +1551,7 @@ impl BTreeCursor { self.get_immutable_record_or_create().as_mut().unwrap(), )? }; - let target_leaf_page_is_in_left_subtree = { + let (target_leaf_page_is_in_left_subtree, is_eq) = { let record = self.get_immutable_record(); let record = record.as_ref().unwrap(); let record_slice_equal_number_of_cols = @@ -1686,20 +1592,26 @@ impl BTreeCursor { // EQ | = | go left | First = key could be exactly this one, or in left subtree // EQ | < | go right | First = key must be in right subtree - match cmp { - SeekOp::GT => interior_cell_vs_index_key.is_gt(), - SeekOp::GE { .. } => interior_cell_vs_index_key.is_ge(), - SeekOp::LE { .. } => interior_cell_vs_index_key.is_gt(), - SeekOp::LT => interior_cell_vs_index_key.is_ge(), - } + ( + match cmp { + SeekOp::GT => interior_cell_vs_index_key.is_gt(), + SeekOp::GE { .. } => interior_cell_vs_index_key.is_ge(), + SeekOp::LE { .. } => interior_cell_vs_index_key.is_gt(), + SeekOp::LT => interior_cell_vs_index_key.is_ge(), + }, + interior_cell_vs_index_key.is_eq(), + ) }; + if is_eq { + eq_seen.set(true); + } + if target_leaf_page_is_in_left_subtree { - self.move_to_state - .set_nearest_matching_cell(Some(cur_cell_idx as usize)); - self.move_to_state.set_max(cur_cell_idx - 1); + nearest_matching_cell.set(Some(cur_cell_idx as usize)); + max_cell_idx.set(cur_cell_idx - 1); } else { - self.move_to_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } } } @@ -1711,7 +1623,7 @@ impl BTreeCursor { assert!(self.mv_cursor.is_none()); let iter_dir = seek_op.iteration_direction(); - if matches!(self.seek_state, CursorSeekState::Start) { + if matches!(self.seek_state, CursorSeekState::Start { .. }) { // No need for another move_to_root. Move_to already moves to root return_if_io!(self.move_to(SeekKey::TableRowId(rowid), seek_op)); let page = self.stack.top(); @@ -1728,31 +1640,42 @@ impl BTreeCursor { self.stack.set_cell_index(0); return Ok(CursorResult::Ok(false)); } - let min: isize = 0; - let max: isize = cell_count as isize - 1; + let min_cell_idx = Cell::new(0); + let max_cell_idx = Cell::new(cell_count as isize - 1); // If iter dir is forwards, we want the first cell that matches; // If iter dir is backwards, we want the last cell that matches. - let nearest_matching_cell = None; + let nearest_matching_cell = Cell::new(None); - self.seek_state = CursorSeekState::Seeking { - min, - max, + self.seek_state = CursorSeekState::LeafPageBinarySearch { + min_cell_idx, + max_cell_idx, nearest_matching_cell, - not_found_leaf: false, + moving_up_to_parent: Cell::new(false), + eq_seen: Cell::new(false), // not relevant for table btrees }; } + let CursorSeekState::LeafPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + .. + } = &self.seek_state + else { + unreachable!("we must be in a leaf binary search state"); + }; + let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); loop { - let min = self.seek_state.get_min(); - let max = self.seek_state.get_max(); + let min = min_cell_idx.get(); + let max = max_cell_idx.get(); if min > max { - if let Some(nearest_matching_cell) = self.seek_state.get_nearest_matching_cell() { + if let Some(nearest_matching_cell) = nearest_matching_cell.get() { self.stack.set_cell_index(nearest_matching_cell as i32); return Ok(CursorResult::Ok(true)); } else { @@ -1781,28 +1704,27 @@ impl BTreeCursor { } if found { - self.seek_state - .set_nearest_matching_cell(Some(cur_cell_idx as usize)); + nearest_matching_cell.set(Some(cur_cell_idx as usize)); match iter_dir { IterationDirection::Forwards => { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } IterationDirection::Backwards => { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } } } else { if cmp.is_gt() { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } else if cmp.is_lt() { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } else { match iter_dir { IterationDirection::Forwards => { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } IterationDirection::Backwards => { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } } } @@ -1815,9 +1737,16 @@ impl BTreeCursor { key: &ImmutableRecord, seek_op: SeekOp, ) -> Result> { - if matches!(self.seek_state, CursorSeekState::Start) { + if matches!(self.seek_state, CursorSeekState::Start { .. }) { // No need for another move_to_root. Move_to already moves to root return_if_io!(self.move_to(SeekKey::IndexKey(key), seek_op)); + let CursorSeekState::FoundLeaf { eq_seen } = &self.seek_state else { + unreachable!( + "We must still be in FoundLeaf state after move_to, got: {:?}", + self.seek_state + ); + }; + let eq_seen = eq_seen.get(); let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); @@ -1836,20 +1765,34 @@ impl BTreeCursor { } } - let min: isize = 0; - let max: isize = cell_count as isize - 1; + let min = Cell::new(0); + let max = Cell::new(cell_count as isize - 1); // If iter dir is forwards, we want the first cell that matches; // If iter dir is backwards, we want the last cell that matches. - let nearest_matching_cell = None; + let nearest_matching_cell = Cell::new(None); - self.seek_state = CursorSeekState::Seeking { - min, - max, + self.seek_state = CursorSeekState::LeafPageBinarySearch { + min_cell_idx: min, + max_cell_idx: max, nearest_matching_cell, - not_found_leaf: false, + moving_up_to_parent: Cell::new(false), + eq_seen: Cell::new(eq_seen), }; - } else if self.seek_state.get_not_found_leaf() { + } + + let CursorSeekState::LeafPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + eq_seen, + moving_up_to_parent, + } = &self.seek_state + else { + unreachable!("we must be in a leaf binary search state"); + }; + + if moving_up_to_parent.get() { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let page = page.get(); @@ -1880,7 +1823,7 @@ impl BTreeCursor { )? }; let (_, found) = self.compare_with_current_record(key, seek_op); - self.seek_state.set_not_found_leaf(false); + moving_up_to_parent.set(false); return Ok(CursorResult::Ok(found)); } @@ -1893,10 +1836,10 @@ impl BTreeCursor { let iter_dir = seek_op.iteration_direction(); loop { - let min = self.seek_state.get_min(); - let max = self.seek_state.get_max(); + let min = min_cell_idx.get(); + let max = max_cell_idx.get(); if min > max { - if let Some(nearest_matching_cell) = self.seek_state.get_nearest_matching_cell() { + if let Some(nearest_matching_cell) = nearest_matching_cell.get() { self.stack.set_cell_index(nearest_matching_cell as i32); return Ok(CursorResult::Ok(true)); } else { @@ -1915,10 +1858,17 @@ impl BTreeCursor { // if we were to return Ok(CursorResult::Ok((None, None))), self.record would be None, which is incorrect, because we already know // that there is a record with a key greater than K (K' = K+2) in the parent interior cell. Hence, we need to move back up the tree // and get the next matching record from there. + // + // However we only do this if either of the following is true: + // - We have seen an EQ match up in the tree in an interior node + // - Or, we are not looking for an exact match. + if seek_op.eq_only() && !eq_seen.get() { + return Ok(CursorResult::Ok(false)); + } match iter_dir { IterationDirection::Forwards => { - if !self.seek_state.get_not_found_leaf() { - self.seek_state.set_not_found_leaf(true); + if !moving_up_to_parent.get() { + moving_up_to_parent.set(true); self.stack.set_cell_index(cell_count as i32); } let next_res = return_if_io!(self.next()); @@ -1929,8 +1879,8 @@ impl BTreeCursor { return Ok(CursorResult::IO); } IterationDirection::Backwards => { - if !self.seek_state.get_not_found_leaf() { - self.seek_state.set_not_found_leaf(true); + if !moving_up_to_parent.get() { + moving_up_to_parent.set(true); self.stack.set_cell_index(-1); } let prev_res = return_if_io!(self.prev()); @@ -1972,28 +1922,27 @@ impl BTreeCursor { }; let (cmp, found) = self.compare_with_current_record(key, seek_op); if found { - self.seek_state - .set_nearest_matching_cell(Some(cur_cell_idx as usize)); + nearest_matching_cell.set(Some(cur_cell_idx as usize)); match iter_dir { IterationDirection::Forwards => { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } IterationDirection::Backwards => { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } } } else { if cmp.is_gt() { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } else if cmp.is_lt() { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } else { match iter_dir { IterationDirection::Forwards => { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } IterationDirection::Backwards => { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } } } @@ -2002,7 +1951,7 @@ impl BTreeCursor { } fn compare_with_current_record( - &mut self, + &self, key: &ImmutableRecord, seek_op: SeekOp, ) -> (Ordering, bool) { @@ -2074,7 +2023,7 @@ impl BTreeCursor { // 5. We scan the leaf cells in the leaf page until we find the cell whose rowid is equal to the rowid we are looking for. // This cell contains the actual data we are looking for. // 6. If we find the cell, we return the record. Otherwise, we return an empty result. - if matches!(self.move_to_state, CursorMoveToState::Start) { + if matches!(self.seek_state, CursorSeekState::Start { .. }) { self.move_to_root(); } @@ -2083,7 +2032,6 @@ impl BTreeCursor { SeekKey::IndexKey(index_key) => self.indexbtree_move_to(index_key, cmp), }; return_if_io!(ret); - self.move_to_state = CursorMoveToState::Start; Ok(CursorResult::Ok(())) } @@ -4495,6 +4443,7 @@ impl BTreeCursor { let _ = return_if_io!( self.move_to(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true }) ); + self.seek_state = CursorSeekState::Start; let page = self.stack.top(); // TODO(pere): request load return_if_locked_maybe_load!(self.pager, page); @@ -6510,7 +6459,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(*key); - cursor.move_to(key, SeekOp::GE { eq_only: true }) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -6604,7 +6553,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(key); - cursor.move_to(key, SeekOp::GE { eq_only: true }) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -7780,7 +7729,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(i); - cursor.move_to(key, SeekOp::GE { eq_only: true }) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -7857,7 +7806,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(i); - cursor.move_to(key, SeekOp::GE { eq_only: true }) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -7941,7 +7890,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(i as i64); - cursor.move_to(key, SeekOp::GE { eq_only: true }) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -7983,7 +7932,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(1); - cursor.move_to(key, SeekOp::GE { eq_only: true }) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -8059,7 +8008,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(1); - cursor.move_to(key, SeekOp::GE { eq_only: true }) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) From 0ce18a9146f4ab6d7285b53a1528dd802c28b3f5 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 13:04:03 +0300 Subject: [PATCH 21/42] Fix comment --- core/storage/btree.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 6af7c2d79..4a0222496 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -420,8 +420,9 @@ pub enum CursorSeekState { /// Indicates if we have seen an exact match during the downwards traversal of the btree. /// This is only needed in index seeks, in cases where we need to determine whether we call /// an additional next()/prev() to fetch a matching record from an interior node. We will not - /// do that if we haven't seen an EQ during the traversal, because that would leave the cursor - /// pointing to the wrong record. + /// do that if both are true: + /// 1. We have not seen an EQ during the traversal + /// 2. We are looking for an exact match ([SeekOp::GE] or [SeekOp::LE] with eq_only: true) eq_seen: Cell, /// Indicates when we have not not found a value in leaf and now will look in the next/prev record. /// This value is only used for indexbtree From 5f60cce3c70c4bf28c08a3a35f86299b554a8791 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 13:08:57 +0300 Subject: [PATCH 22/42] fix seek_to_last() --- core/storage/btree.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 4a0222496..cf9a3dbdb 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -3856,15 +3856,14 @@ impl BTreeCursor { } pub fn seek_to_last(&mut self) -> Result> { - return_if_io!(self.move_to_rightmost()); - let cursor_has_record = return_if_io!(self.get_next_record()); + let has_record = return_if_io!(self.move_to_rightmost()); self.invalidate_record(); - if !cursor_has_record { + self.has_record.replace(has_record); + if !has_record { let is_empty = return_if_io!(self.is_empty_table()); assert!(is_empty); return Ok(CursorResult::Ok(())); } - self.has_record.replace(cursor_has_record); Ok(CursorResult::Ok(())) } From da2437408e855a3125230de10c093378717622e1 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 13:26:41 +0300 Subject: [PATCH 23/42] get_new_rowid(): fix off by one - rowids start at 1 --- core/vdbe/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index a53b9c755..3e0cffa3c 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -501,7 +501,7 @@ fn get_new_rowid(cursor: &mut BTreeCursor, mut rng: R) -> Result rowid.checked_add(1).unwrap_or(i64::MAX), // add 1 but be careful with overflows, in case of overflow - use i64::MAX - CursorResult::Ok(None) => 0, + CursorResult::Ok(None) => 1, CursorResult::IO => return Ok(CursorResult::IO), }; if rowid > i64::MAX.try_into().unwrap() { From e897052650a936281e63eac00cc5a2d1249be523 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 13:40:32 +0300 Subject: [PATCH 24/42] flatten process_overflow_read() to get rid of borrowmuterror possibility --- core/storage/btree.rs | 120 ++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 64 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index cf9a3dbdb..abfcc25f6 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -708,71 +708,63 @@ impl BTreeCursor { start_next_page: u32, payload_size: u64, ) -> Result> { - let res = match &mut *self.read_overflow_state.borrow_mut() { - None => { - tracing::debug!("start reading overflow page payload_size={}", payload_size); - let page = self.read_page(start_next_page as usize)?; - *self.read_overflow_state.borrow_mut() = Some(ReadPayloadOverflow { - payload: payload.to_vec(), - next_page: start_next_page, - remaining_to_read: payload_size as usize - payload.len(), - page, - }); - CursorResult::IO - } - Some(ReadPayloadOverflow { - payload, - next_page, - remaining_to_read, - page: page_btree, - }) => { - if page_btree.get().is_locked() { - return Ok(CursorResult::IO); - } - tracing::debug!("reading overflow page {} {}", next_page, remaining_to_read); - let page = page_btree.get(); - let contents = page.get_contents(); - // The first four bytes of each overflow page are a big-endian integer which is the page number of the next page in the chain, or zero for the final page in the chain. - let next = contents.read_u32_no_offset(0); - let buf = contents.as_ptr(); - let usable_space = self.pager.usable_space(); - let to_read = (*remaining_to_read).min(usable_space - 4); - payload.extend_from_slice(&buf[4..4 + to_read]); - *remaining_to_read -= to_read; - if *remaining_to_read == 0 || next == 0 { - assert!( - *remaining_to_read == 0 && next == 0, - "we can't have more pages to read while also have read everything" - ); - let mut payload_swap = Vec::new(); - std::mem::swap(payload, &mut payload_swap); - CursorResult::Ok(payload_swap) - } else { - let new_page = self.pager.read_page(next as usize).map(|page| { - Arc::new(BTreePageInner { - page: RefCell::new(page), - }) - })?; - *page_btree = new_page; - *next_page = next; - CursorResult::IO - } - } - }; - match res { - CursorResult::Ok(payload) => { - { - let mut reuse_immutable = self.get_immutable_record_or_create(); - crate::storage::sqlite3_ondisk::read_record( - &payload, - reuse_immutable.as_mut().unwrap(), - )?; - } - *self.read_overflow_state.borrow_mut() = None; - Ok(CursorResult::Ok(())) - } - CursorResult::IO => Ok(CursorResult::IO), + if self.read_overflow_state.borrow().is_none() { + let page = self.read_page(start_next_page as usize)?; + *self.read_overflow_state.borrow_mut() = Some(ReadPayloadOverflow { + payload: payload.to_vec(), + next_page: start_next_page, + remaining_to_read: payload_size as usize - payload.len(), + page, + }); + return Ok(CursorResult::IO); } + let mut read_overflow_state = self.read_overflow_state.borrow_mut(); + let ReadPayloadOverflow { + payload, + next_page, + remaining_to_read, + page: page_btree, + } = read_overflow_state.as_mut().unwrap(); + + if page_btree.get().is_locked() { + return Ok(CursorResult::IO); + } + tracing::debug!("reading overflow page {} {}", next_page, remaining_to_read); + let page = page_btree.get(); + let contents = page.get_contents(); + // The first four bytes of each overflow page are a big-endian integer which is the page number of the next page in the chain, or zero for the final page in the chain. + let next = contents.read_u32_no_offset(0); + let buf = contents.as_ptr(); + let usable_space = self.pager.usable_space(); + let to_read = (*remaining_to_read).min(usable_space - 4); + payload.extend_from_slice(&buf[4..4 + to_read]); + *remaining_to_read -= to_read; + + if *remaining_to_read != 0 && next != 0 { + let new_page = self.pager.read_page(next as usize).map(|page| { + Arc::new(BTreePageInner { + page: RefCell::new(page), + }) + })?; + *page_btree = new_page; + *next_page = next; + return Ok(CursorResult::IO); + } + assert!( + *remaining_to_read == 0 && next == 0, + "we can't have more pages to read while also have read everything" + ); + let mut payload_swap = Vec::new(); + std::mem::swap(payload, &mut payload_swap); + + let mut reuse_immutable = self.get_immutable_record_or_create(); + crate::storage::sqlite3_ondisk::read_record( + &payload_swap, + reuse_immutable.as_mut().unwrap(), + )?; + + let _ = read_overflow_state.take(); + Ok(CursorResult::Ok(())) } /// Calculates how much of a cell's payload should be stored locally vs in overflow pages From 5c08d259bfc8dc81e2247bf0ac98a6d06904244e Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 13:53:21 +0300 Subject: [PATCH 25/42] Fix drop table: initialize loaded pages to cell idx 0 --- core/storage/btree.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index abfcc25f6..da5267331 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4564,6 +4564,7 @@ impl BTreeCursor { DestroyState::LoadPage => { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); + self.stack.set_cell_index(0); // initialize to first cell of page when loaded. let destroy_info = self .state From 1b4bef9c7c9b13a11dd74133f7aa0c1658e2ee2b Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 15:15:59 +0300 Subject: [PATCH 26/42] Fix op_idx_delete infinite seeking loop --- core/vdbe/execute.rs | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 34cdff7be..6916b53e7 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -3932,6 +3932,13 @@ pub fn op_idx_delete( else { unreachable!("unexpected Insn {:?}", insn) }; + tracing::debug!( + "op_idx_delete(cursor_id={}, start_reg={}, num_regs={}, state={:?})", + cursor_id, + start_reg, + num_regs, + state.op_idx_delete_state + ); loop { match &state.op_idx_delete_state { Some(OpIdxDeleteState::Seeking(record)) => { @@ -3941,12 +3948,13 @@ pub fn op_idx_delete( return_if_io!( cursor.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true }) ); - tracing::debug!( - "op_idx_delete(seek={}, record={} rowid={:?})", - &record, - return_if_io!(cursor.record()).as_ref().unwrap(), - return_if_io!(cursor.rowid()) - ); + } + state.op_idx_delete_state = Some(OpIdxDeleteState::Deleting); + } + Some(OpIdxDeleteState::Deleting) => { + { + let mut cursor = state.get_cursor(*cursor_id); + let cursor = cursor.as_btree_mut(); if return_if_io!(cursor.rowid()).is_none() { // If P5 is not zero, then raise an SQLITE_CORRUPT_INDEX error if no matching // index entry is found. This happens when running an UPDATE or DELETE statement and the @@ -3955,16 +3963,9 @@ pub fn op_idx_delete( // For those cases, P5 is zero. Also, do not raise this (self-correcting and non-critical) error if in writable_schema mode. return Err(LimboError::Corrupt(format!( "IdxDelete: no matching index entry found for record {:?}", - record + make_record(&state.registers, start_reg, num_regs) ))); } - } - state.op_idx_delete_state = Some(OpIdxDeleteState::Deleting); - } - Some(OpIdxDeleteState::Deleting) => { - { - let mut cursor = state.get_cursor(*cursor_id); - let cursor = cursor.as_btree_mut(); return_if_io!(cursor.delete()); } let n_change = program.n_change.get(); From 499296d396cf364f3fa42e2cb24f132f95e55bc5 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 5 Jun 2025 15:39:26 +0300 Subject: [PATCH 27/42] fix drop table again: only stack.advance() in a single place --- core/storage/btree.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index da5267331..75fa02b5d 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4564,7 +4564,7 @@ impl BTreeCursor { DestroyState::LoadPage => { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); - self.stack.set_cell_index(0); // initialize to first cell of page when loaded. + self.stack.advance(); let destroy_info = self .state @@ -4595,7 +4595,6 @@ impl BTreeCursor { (false, n) if n == contents.cell_count() as i32 => { if let Some(rightmost) = contents.rightmost_pointer() { let rightmost_page = self.read_page(rightmost as usize)?; - self.stack.advance(); self.stack.push(rightmost_page); let destroy_info = self.state.mut_destroy_info().expect( "unable to get a mut reference to destroy state in cursor", @@ -4664,7 +4663,6 @@ impl BTreeCursor { _ => panic!("expected interior cell"), }; let child_page = self.read_page(child_page_id as usize)?; - self.stack.advance(); self.stack.push(child_page); let destroy_info = self.state.mut_destroy_info().expect( "unable to get a mut reference to destroy state in cursor", @@ -4682,7 +4680,6 @@ impl BTreeCursor { BTreeCell::IndexInteriorCell(index_int_cell) => { let child_page = self.read_page(index_int_cell.left_child_page as usize)?; - self.stack.advance(); self.stack.push(child_page); let destroy_info = self.state.mut_destroy_info().expect( "unable to get a mut reference to destroy state in cursor", @@ -4692,7 +4689,6 @@ impl BTreeCursor { } // For any leaf cell, advance the index now that overflow pages have been cleared BTreeCell::TableLeafCell(_) | BTreeCell::IndexLeafCell(_) => { - self.stack.advance(); let destroy_info = self.state.mut_destroy_info().expect( "unable to get a mut reference to destroy state in cursor", ); From d81f5f67bd0c80193adb052d7a5224cf833669fd Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Fri, 6 Jun 2025 11:04:04 +0300 Subject: [PATCH 28/42] insert spaghetti fixes --- core/storage/btree.rs | 115 +++++++++++++++++---------------- core/storage/sqlite3_ondisk.rs | 7 +- core/vdbe/execute.rs | 19 ++++-- 3 files changed, 81 insertions(+), 60 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 75fa02b5d..65f5588e6 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -381,7 +381,7 @@ enum OverflowState { /// Holds a Record or RowId, so that these can be transformed into a SeekKey to restore /// cursor position to its previous location. -enum CursorContext { +pub enum CursorContext { TableRowId(i64), /// If we are in an index tree we can then reuse this field to save @@ -390,7 +390,8 @@ enum CursorContext { } /// In the future, we may expand these general validity states -enum CursorValidState { +#[derive(Debug, PartialEq, Eq)] +pub enum CursorValidState { /// Cursor is pointing a to an existing location/cell in the Btree Valid, /// Cursor may be pointing to a non-existent location/cell. This can happen after balancing operations @@ -431,16 +432,16 @@ pub enum CursorSeekState { } #[derive(Debug)] -struct FindCellState(Option); +struct FindCellState(Option); impl FindCellState { #[inline] - fn set(&mut self, cell_idx: usize) { + fn set(&mut self, cell_idx: isize) { self.0 = Some(cell_idx) } #[inline] - fn get_cell_idx(&mut self) -> usize { + fn get_cell_idx(&mut self) -> isize { self.0.expect("get can only be called after a set") } @@ -482,7 +483,7 @@ pub struct BTreeCursor { /// Stores the cursor context before rebalancing so that a seek can be done later context: Option, /// Store whether the Cursor is in a valid state. Meaning if it is pointing to a valid cell index or not - valid_state: CursorValidState, + pub valid_state: CursorValidState, /// Colations for Index Btree constraint checks /// Contains the Collation Seq for the whole Index /// This Vec should be empty for Table Btree @@ -1226,14 +1227,16 @@ impl BTreeCursor { /// or e.g. find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10). /// We don't include the rowid in the comparison and that's why the last value from the record is not included. fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { - match key { + let ret = return_if_io!(match key { SeekKey::TableRowId(rowid) => { - return self.tablebtree_seek(rowid, op); + self.tablebtree_seek(rowid, op) } SeekKey::IndexKey(index_key) => { - return self.indexbtree_seek(index_key, op); + self.indexbtree_seek(index_key, op) } - } + }); + self.valid_state = CursorValidState::Valid; + Ok(CursorResult::Ok(ret)) } /// Move the cursor to the root page of the btree. @@ -1951,7 +1954,7 @@ impl BTreeCursor { let cmp = { let record = self.get_immutable_record(); let record = record.as_ref().unwrap(); - tracing::info!("record={}", record); + tracing::debug!("record={}", record); let record_slice_equal_number_of_cols = &record.get_values().as_slice()[..key.get_values().len()]; compare_immutable( @@ -2068,6 +2071,7 @@ impl BTreeCursor { // find cell (return_if_io!(self.find_cell(page, bkey)), page.page_type()) }; + self.stack.set_cell_index(cell_idx as i32); tracing::debug!("insert_into_page(cell_idx={})", cell_idx); // if the cell index is less than the total cells, check: if its an existing @@ -2145,13 +2149,24 @@ impl BTreeCursor { contents.overflow_cells.len() }; self.stack.set_cell_index(cell_idx as i32); - let write_info = self - .state - .mut_write_info() - .expect("can't count while inserting"); if overflow > 0 { + // A balance will happen so save the key we were inserting + self.save_context(match bkey { + BTreeKey::TableRowId(rowid) => CursorContext::TableRowId(rowid.0), + BTreeKey::IndexKey(record) => { + CursorContext::IndexKeyRowId((*record).clone()) + } + }); + let write_info = self + .state + .mut_write_info() + .expect("can't count while inserting"); write_info.state = WriteState::BalanceStart; } else { + let write_info = self + .state + .mut_write_info() + .expect("can't count while inserting"); write_info.state = WriteState::Finish; } } @@ -3758,10 +3773,12 @@ impl BTreeCursor { self.find_cell_state.set(0); } let cell_count = page.cell_count(); - while self.find_cell_state.get_cell_idx() < cell_count { + while self.find_cell_state.get_cell_idx() < cell_count as isize { + assert!(self.find_cell_state.get_cell_idx() >= 0); + let cell_idx = self.find_cell_state.get_cell_idx() as usize; match page .cell_get( - self.find_cell_state.get_cell_idx(), + cell_idx, payload_overflow_threshold_max(page.page_type(), self.usable_space() as u16), payload_overflow_threshold_min(page.page_type(), self.usable_space() as u16), self.usable_space(), @@ -3814,6 +3831,8 @@ impl BTreeCursor { self.find_cell_state.set(cell_idx + 1); } let cell_idx = self.find_cell_state.get_cell_idx(); + assert!(cell_idx >= 0); + let cell_idx = cell_idx as usize; assert!(cell_idx <= cell_count); self.find_cell_state.reset(); Ok(CursorResult::Ok(cell_idx)) @@ -3914,7 +3933,7 @@ impl BTreeCursor { Ok(CursorResult::Ok(cursor_has_record)) } - pub fn rowid(&self) -> Result>> { + pub fn rowid(&mut self) -> Result>> { if let Some(mv_cursor) = &self.mv_cursor { let mv_cursor = mv_cursor.borrow(); return Ok(CursorResult::Ok( @@ -3963,6 +3982,7 @@ impl BTreeCursor { self.invalidate_record(); // Reset seek state self.seek_state = CursorSeekState::Start; + self.valid_state = CursorValidState::Valid; self.has_record.replace(cursor_has_record); Ok(CursorResult::Ok(cursor_has_record)) } @@ -4057,22 +4077,38 @@ impl BTreeCursor { None => { tracing::trace!("moved {}", moved_before); if !moved_before && !matches!(self.state, CursorState::Write(..)) { + // Use move_to() so that we always end up on a leaf page. seek() might go back to an interior cell in index seeks, + // which we never want. The reason we can use move_to() is that + // find_cell() iterates the leaf page from left to right to find the insertion point anyway, so we don't care + // which cell we are in as long as we are on the right page. + // FIXME: find_cell() should not use linear search because it's slow. + self.seek_state = CursorSeekState::Start; match key { BTreeKey::IndexKey(_) => { - return_if_io!(self.seek( + return_if_io!(self.move_to( SeekKey::IndexKey(key.get_record().unwrap()), SeekOp::GE { eq_only: true } )) } BTreeKey::TableRowId(_) => { - return_if_io!(self.seek( + return_if_io!(self.move_to( SeekKey::TableRowId(key.to_rowid()), SeekOp::GE { eq_only: true } )) } }; + self.context.take(); // we know where we wanted to move so if there was any saved context, discard it. + self.valid_state = CursorValidState::Valid; + self.seek_state = CursorSeekState::Start; + tracing::info!( + "seeked to the right place, page is now {:?}", + self.stack.top().get().get().id + ); } return_if_io!(self.insert_into_page(key)); + // if we did a tree rebalance, eagerly seek to the right place again. + // might not be the right thing to do this eagerly, but we just want to make this work for starters... + return_if_io!(self.restore_context()); if key.maybe_rowid().is_some() { self.has_record.replace(true); } @@ -4916,28 +4952,11 @@ impl BTreeCursor { } } - /// Save cursor context, to be restored later - // pub fn save_context(&mut self) { - // if let CursorHasRecord::Yes { rowid } = self.has_record.get() { - // self.valid_state = CursorValidState::RequireSeek; - // match self.stack.top().get().get_contents().page_type() { - // PageType::TableInterior | PageType::TableLeaf => { - // self.context = Some(CursorContext::TableRowId(rowid.expect( - // "table cells should have a rowid since we don't support WITHOUT ROWID tables", - // ))); - // } - // PageType::IndexInterior | PageType::IndexLeaf => { - // self.context = Some(CursorContext::IndexKeyRowId( - // self.reusable_immutable_record - // .borrow() - // .as_ref() - // .unwrap() - // .clone(), - // )); - // } - // } - // } - // } + // Save cursor context, to be restored later + pub fn save_context(&mut self, cursor_context: CursorContext) { + self.valid_state = CursorValidState::RequireSeek; + self.context = Some(cursor_context); + } /// If context is defined, restore it and set it None on success fn restore_context(&mut self) -> Result> { @@ -5112,18 +5131,6 @@ impl PageStack { self.cell_indices.borrow_mut()[current] -= 1; } - /// Move the cursor to the next cell in the current page according to the iteration direction. - fn next_cell_in_direction(&self, iteration_direction: IterationDirection) { - match iteration_direction { - IterationDirection::Forwards => { - self.advance(); - } - IterationDirection::Backwards => { - self.retreat(); - } - } - } - fn set_cell_index(&self, idx: i32) { let current = self.current(); self.cell_indices.borrow_mut()[current] = idx; diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index 917a91da5..31ddc1cc0 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -594,7 +594,12 @@ impl PageContent { // the page header is 12 bytes for interior pages, 8 bytes for leaf pages // this is because the 4 last bytes in the interior page's header are used for the rightmost pointer. let cell_pointer_array_start = self.header_size(); - assert!(idx < ncells, "cell_get: idx out of bounds"); + assert!( + idx < ncells, + "cell_get: idx out of bounds: idx={}, ncells={}", + idx, + ncells + ); let cell_pointer = cell_pointer_array_start + (idx * 2); let cell_pointer = self.read_u16(cell_pointer) as usize; diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 6916b53e7..5b1cce3bd 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -1,6 +1,7 @@ #![allow(unused_variables)] use crate::numeric::{NullableInteger, Numeric}; use crate::schema::Schema; +use crate::storage::btree::CursorValidState; use crate::storage::database::FileMemoryStorage; use crate::storage::page_cache::DumbLruPageCache; use crate::storage::pager::CreateBTreeFlags; @@ -3849,7 +3850,10 @@ pub fn op_insert( // NOTE(pere): Sending moved_before == true is okay because we moved before but // if we were to set to false after starting a balance procedure, it might // leave undefined state. - return_if_io!(cursor.insert(&BTreeKey::new_table_rowid(key, Some(record)), true)); + return_if_io!(cursor.insert( + &BTreeKey::new_table_rowid(key, Some(record)), + cursor.valid_state == CursorValidState::Valid + )); // Only update last_insert_rowid for regular table inserts, not schema modifications if cursor.root_page() != 1 { if let Some(rowid) = return_if_io!(cursor.rowid()) { @@ -3900,9 +3904,9 @@ pub fn op_delete( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); tracing::debug!( - "op_delete(record={:?}, rowid={:?})", - cursor.record(), - cursor.rowid()? + "op_delete(rowid ={:?}, record={:?})", + cursor.rowid()?, + cursor.record()?, ); return_if_io!(cursor.delete()); } @@ -4014,7 +4018,7 @@ pub fn op_idx_insert( }; // To make this reentrant in case of `moved_before` = false, we need to check if the previous cursor.insert started // a write/balancing operation. If it did, it means we already moved to the place we wanted. - let moved_before = if cursor.is_write_in_progress() { + let mut moved_before = if cursor.is_write_in_progress() { true } else { if index_meta.unique { @@ -4034,6 +4038,11 @@ pub fn op_idx_insert( } }; + if cursor.valid_state != CursorValidState::Valid { + // A balance happened so we need to move. + moved_before = false; + } + // Start insertion of row. This might trigger a balance procedure which will take care of moving to different pages, // therefore, we don't want to seek again if that happens, meaning we don't want to return on io without moving to the following opcode // because it could trigger a movement to child page after a balance root which will leave the current page as the root page. From 844461d20b82800334cf073d9eb9a22936e7f4af Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Fri, 6 Jun 2025 13:08:21 +0300 Subject: [PATCH 29/42] update and delete fixes --- .github/workflows/rust.yml | 1 + core/storage/btree.rs | 72 ++++++++++++++++++++++++++------- core/translate/emitter.rs | 15 ++++++- core/translate/optimizer/mod.rs | 21 ++++++---- core/vdbe/execute.rs | 23 ++++++----- 5 files changed, 98 insertions(+), 34 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index fd70ebbeb..ae5ad13a7 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -67,6 +67,7 @@ jobs: test-limbo: runs-on: blacksmith-4vcpu-ubuntu-2404 + timeout-minutes: 20 steps: - name: Install cargo-c env: diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 65f5588e6..65529793d 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -176,7 +176,9 @@ enum DeleteState { cell_idx: usize, original_child_pointer: Option, }, - CheckNeedsBalancing, + CheckNeedsBalancing { + rightmost_cell_was_dropped: bool, + }, WaitForBalancingToComplete { target_key: DeleteSavepoint, }, @@ -1117,8 +1119,8 @@ impl BTreeCursor { } loop { let mem_page_rc = self.stack.top(); - tracing::trace!( - "current_before_advance id={} cell={}", + tracing::debug!( + "next: current_before_advance id={} cell={}", mem_page_rc.get().get().id, self.stack.current_cell_index() ); @@ -1142,8 +1144,8 @@ impl BTreeCursor { // Important to advance only after loading the page in order to not advance > 1 times self.stack.advance(); let cell_idx = self.stack.current_cell_index() as usize; - tracing::trace!( - "current id={} cell={}", + tracing::debug!( + "next:current id={} cell={}", mem_page_rc.get().get().id, cell_idx ); @@ -1619,7 +1621,12 @@ impl BTreeCursor { assert!(self.mv_cursor.is_none()); let iter_dir = seek_op.iteration_direction(); - if matches!(self.seek_state, CursorSeekState::Start { .. }) { + if matches!( + self.seek_state, + CursorSeekState::Start { .. } + | CursorSeekState::MovingBetweenPages { .. } + | CursorSeekState::InteriorPageBinarySearch { .. } + ) { // No need for another move_to_root. Move_to already moves to root return_if_io!(self.move_to(SeekKey::TableRowId(rowid), seek_op)); let page = self.stack.top(); @@ -1733,7 +1740,12 @@ impl BTreeCursor { key: &ImmutableRecord, seek_op: SeekOp, ) -> Result> { - if matches!(self.seek_state, CursorSeekState::Start { .. }) { + if matches!( + self.seek_state, + CursorSeekState::Start { .. } + | CursorSeekState::MovingBetweenPages { .. } + | CursorSeekState::InteriorPageBinarySearch { .. } + ) { // No need for another move_to_root. Move_to already moves to root return_if_io!(self.move_to(SeekKey::IndexKey(key), seek_op)); let CursorSeekState::FoundLeaf { eq_seen } = &self.seek_state else { @@ -1785,7 +1797,10 @@ impl BTreeCursor { moving_up_to_parent, } = &self.seek_state else { - unreachable!("we must be in a leaf binary search state"); + unreachable!( + "we must be in a leaf binary search state, got: {:?}", + self.seek_state + ); }; if moving_up_to_parent.get() { @@ -2019,7 +2034,15 @@ impl BTreeCursor { // 5. We scan the leaf cells in the leaf page until we find the cell whose rowid is equal to the rowid we are looking for. // This cell contains the actual data we are looking for. // 6. If we find the cell, we return the record. Otherwise, we return an empty result. - if matches!(self.seek_state, CursorSeekState::Start { .. }) { + + // If we are at the beginning/end of seek state, start a new move from the root. + if matches!( + self.seek_state, + CursorSeekState::LeafPageBinarySearch { .. } + ) { + self.seek_state = CursorSeekState::Start; + } + if matches!(self.seek_state, CursorSeekState::Start) { self.move_to_root(); } @@ -4082,7 +4105,6 @@ impl BTreeCursor { // find_cell() iterates the leaf page from left to right to find the insertion point anyway, so we don't care // which cell we are in as long as we are on the right page. // FIXME: find_cell() should not use linear search because it's slow. - self.seek_state = CursorSeekState::Start; match key { BTreeKey::IndexKey(_) => { return_if_io!(self.move_to( @@ -4193,6 +4215,12 @@ impl BTreeCursor { )); } + tracing::debug!( + "DeleteState::FindCell: page_id: {}, cell_idx: {}", + page.get().id, + cell_idx + ); + let cell = contents.cell_get( cell_idx, payload_overflow_threshold_max( @@ -4231,6 +4259,8 @@ impl BTreeCursor { let page = page.get(); let contents = page.get_contents(); + let is_last_cell = cell_idx == contents.cell_count().saturating_sub(1); + let delete_info = self.state.mut_delete_info().unwrap(); if !contents.is_leaf() { delete_info.state = DeleteState::InteriorNodeReplacement { @@ -4242,7 +4272,9 @@ impl BTreeCursor { drop_cell(contents, cell_idx, self.usable_space() as u16)?; let delete_info = self.state.mut_delete_info().unwrap(); - delete_info.state = DeleteState::CheckNeedsBalancing; + delete_info.state = DeleteState::CheckNeedsBalancing { + rightmost_cell_was_dropped: is_last_cell, + }; } } @@ -4319,10 +4351,14 @@ impl BTreeCursor { drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?; let delete_info = self.state.mut_delete_info().unwrap(); - delete_info.state = DeleteState::CheckNeedsBalancing; + delete_info.state = DeleteState::CheckNeedsBalancing { + rightmost_cell_was_dropped: false, // TODO: when is this true? + }; } - DeleteState::CheckNeedsBalancing => { + DeleteState::CheckNeedsBalancing { + rightmost_cell_was_dropped, + } => { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); @@ -4332,9 +4368,15 @@ impl BTreeCursor { let needs_balancing = self.stack.has_parent() && free_space as usize * 3 > self.usable_space() * 2; + if rightmost_cell_was_dropped { + // If we drop a cell in the middle, e.g. our current index is 2 and we drop 'c' from [a,b,c,d,e], then we don't need to retreat index, + // because index 2 is still going to be the right place [a,b,D,e] + // but: + // If we drop the rightmost cell, e.g. our current index is 4 and we drop 'e' from [a,b,c,d,e], then we need to retreat index, + // because index 4 is now pointing beyond the last cell [a,b,c,d] _ <-- index 4 + self.stack.retreat(); + } if needs_balancing { - // FIXME(pere): cell index must be updated before calling `rowid` or - // `record` let target_key = if page.is_index() { let record = match return_if_io!(self.record()) { Some(record) => record.clone(), diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index 06ae3ea8d..f8be9e87d 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -870,9 +870,20 @@ fn emit_program_for_update( )?; // Open indexes for update. let mut index_cursors = Vec::with_capacity(plan.indexes_to_update.len()); - // TODO: do not reopen if there is table reference using it. - for index in &plan.indexes_to_update { + if let Some(index_cursor) = program.resolve_cursor_id_safe(&CursorKey::index( + plan.table_references + .joined_tables() + .first() + .unwrap() + .internal_id, + index.clone(), + )) { + // Don't reopen index if it was already opened as the iteration cursor for this update plan. + let record_reg = program.alloc_register(); + index_cursors.push((index_cursor, record_reg)); + continue; + } let index_cursor = program.alloc_cursor_id(CursorType::BTreeIndex(index.clone())); program.emit_insn(Insn::OpenWrite { cursor_id: index_cursor, diff --git a/core/translate/optimizer/mod.rs b/core/translate/optimizer/mod.rs index 38f1d3d73..910c93e2c 100644 --- a/core/translate/optimizer/mod.rs +++ b/core/translate/optimizer/mod.rs @@ -98,7 +98,7 @@ fn optimize_delete_plan(plan: &mut DeletePlan, schema: &Schema) -> Result<()> { Ok(()) } -fn optimize_update_plan(plan: &mut UpdatePlan, schema: &Schema) -> Result<()> { +fn optimize_update_plan(plan: &mut UpdatePlan, _schema: &Schema) -> Result<()> { rewrite_exprs_update(plan)?; if let ConstantConditionEliminationResult::ImpossibleCondition = eliminate_constant_conditions(&mut plan.where_clause)? @@ -106,13 +106,18 @@ fn optimize_update_plan(plan: &mut UpdatePlan, schema: &Schema) -> Result<()> { plan.contains_constant_false_condition = true; return Ok(()); } - let _ = optimize_table_access( - &mut plan.table_references, - &schema.indexes, - &mut plan.where_clause, - &mut plan.order_by, - &mut None, - )?; + // FIXME: don't use indexes for update right now because it's not safe to traverse an index + // while also updating the same table, things go wrong. + // e.g. in 'explain update t set x=x+5 where x > 10;' where x is an indexed column, + // sqlite first creates an ephemeral index to store the current values so the tree traversal + // doesn't get messed up while updating. + // let _ = optimize_table_access( + // &mut plan.table_references, + // &schema.indexes, + // &mut plan.where_clause, + // &mut plan.order_by, + // &mut None, + // )?; Ok(()) } diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 5b1cce3bd..356025279 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -1345,11 +1345,16 @@ pub fn op_column( unreachable!("unexpected Insn {:?}", insn) }; if let Some((index_cursor_id, table_cursor_id)) = state.deferred_seeks[*cursor_id].take() { - let deferred_seek = { + let deferred_seek = 'd: { let rowid = { let mut index_cursor = state.get_cursor(index_cursor_id); let index_cursor = index_cursor.as_btree_mut(); - return_if_io!(index_cursor.rowid()) + match index_cursor.rowid()? { + CursorResult::IO => { + break 'd Some((index_cursor_id, table_cursor_id)); + } + CursorResult::Ok(rowid) => rowid, + } }; let mut table_cursor = state.get_cursor(table_cursor_id); let table_cursor = table_cursor.as_btree_mut(); @@ -1904,11 +1909,16 @@ pub fn op_row_id( unreachable!("unexpected Insn {:?}", insn) }; if let Some((index_cursor_id, table_cursor_id)) = state.deferred_seeks[*cursor_id].take() { - let deferred_seek = { + let deferred_seek = 'd: { let rowid = { let mut index_cursor = state.get_cursor(index_cursor_id); let index_cursor = index_cursor.as_btree_mut(); - let record = return_if_io!(index_cursor.record()); + let record = match index_cursor.record()? { + CursorResult::IO => { + break 'd Some((index_cursor_id, table_cursor_id)); + } + CursorResult::Ok(record) => record, + }; let record = record.as_ref().unwrap(); let rowid = record.get_values().last().unwrap(); match rowid { @@ -3903,11 +3913,6 @@ pub fn op_delete( { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); - tracing::debug!( - "op_delete(rowid ={:?}, record={:?})", - cursor.rowid()?, - cursor.record()?, - ); return_if_io!(cursor.delete()); } let prev_changes = program.n_change.get(); From 843eb18dafb8e7884d17586bf2a38d5eb07bfb33 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Fri, 6 Jun 2025 13:29:09 +0300 Subject: [PATCH 30/42] simplify cursor.exists() by using seek() --- core/storage/btree.rs | 37 +++++-------------------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 65529793d..778b4af62 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4510,38 +4510,11 @@ impl BTreeCursor { Value::Integer(i) => i, _ => unreachable!("btree tables are indexed by integers!"), }; - let _ = return_if_io!( - self.move_to(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true }) - ); - self.seek_state = CursorSeekState::Start; - let page = self.stack.top(); - // TODO(pere): request load - return_if_locked_maybe_load!(self.pager, page); - - let page = page.get(); - let contents = page.get().contents.as_ref().unwrap(); - - // find cell - let int_key = match key { - Value::Integer(i) => *i, - _ => unreachable!("btree tables are indexed by integers!"), - }; - let cell_idx = - return_if_io!(self.find_cell(contents, &BTreeKey::new_table_rowid(int_key, None))); - if cell_idx >= contents.cell_count() { - Ok(CursorResult::Ok(false)) - } else { - let equals = match &contents.cell_get( - cell_idx, - payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), - payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), - self.usable_space(), - )? { - BTreeCell::TableLeafCell(l) => l._rowid == int_key, - _ => unreachable!(), - }; - Ok(CursorResult::Ok(equals)) - } + let has_record = + return_if_io!(self.seek(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true })); + self.has_record.set(has_record); + self.invalidate_record(); + Ok(CursorResult::Ok(has_record)) } /// Clear the overflow pages linked to a specific page provided by the leaf cell From 3265e0a789da6a31a60bdc25f7632333e30964bb Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Fri, 6 Jun 2025 13:47:22 +0300 Subject: [PATCH 31/42] adjust timeouts in ci --- .github/workflows/rust.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ae5ad13a7..d38006d4c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -40,7 +40,7 @@ jobs: env: RUST_LOG: ${{ runner.debug && 'limbo_core::storage=trace' || '' }} run: cargo test --verbose - timeout-minutes: 10 + timeout-minutes: 20 clippy: @@ -92,7 +92,7 @@ jobs: - uses: "./.github/shared/install_sqlite" - name: Test run: make test - timeout-minutes: 10 + timeout-minutes: 20 test-sqlite: runs-on: blacksmith-4vcpu-ubuntu-2404 From 04e89c0c4a1cb6afac307742aced537ee4b5823a Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Fri, 6 Jun 2025 18:14:22 +0300 Subject: [PATCH 32/42] actually fix drop table --- core/storage/btree.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 778b4af62..8f88793a3 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4615,7 +4615,6 @@ impl BTreeCursor { DestroyState::LoadPage => { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); - self.stack.advance(); let destroy_info = self .state @@ -4625,8 +4624,8 @@ impl BTreeCursor { } DestroyState::ProcessPage => { let page = self.stack.top(); + self.stack.advance(); assert!(page.get().is_loaded()); // page should be loaded at this time - let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_idx = self.stack.current_cell_index(); From a5aeff9a3d77d89c8a9ee649b8cfca27ea2a7ec3 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Fri, 6 Jun 2025 18:41:40 +0300 Subject: [PATCH 33/42] Fix index insert accidentally double-inserting after balance --- core/storage/btree.rs | 134 +++++++++++++++++++++++++++++++----------- core/storage/pager.rs | 1 + core/vdbe/execute.rs | 16 +---- 3 files changed, 104 insertions(+), 47 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 8f88793a3..a2ff9a854 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1119,17 +1119,17 @@ impl BTreeCursor { } loop { let mem_page_rc = self.stack.top(); - tracing::debug!( - "next: current_before_advance id={} cell={}", - mem_page_rc.get().get().id, - self.stack.current_cell_index() - ); - return_if_locked_maybe_load!(self.pager, mem_page_rc); let mem_page = mem_page_rc.get(); let contents = mem_page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); + tracing::debug!( + "next: current_before_advance id={} cell={}, cell_count={}", + mem_page_rc.get().get().id, + self.stack.current_cell_index(), + cell_count + ); let is_index = mem_page_rc.get().is_index(); let should_skip_advance = is_index @@ -1138,6 +1138,12 @@ impl BTreeCursor { // valid cell then it means we will have to move upwards again or move to right page, // anyways, we won't visit this invalid cell index if should_skip_advance { + tracing::debug!( + "next: skipping advance, going_upwards: {}, page: {}, cell_idx: {}", + self.going_upwards, + mem_page_rc.get().get().id, + self.stack.current_cell_index() + ); self.going_upwards = false; return Ok(CursorResult::Ok(true)); } @@ -1244,6 +1250,8 @@ impl BTreeCursor { /// Move the cursor to the root page of the btree. #[instrument(skip_all, level = Level::TRACE)] fn move_to_root(&mut self) { + self.seek_state = CursorSeekState::Start; + self.going_upwards = false; tracing::trace!("move_to_root({})", self.root_page); let mem_page = self.read_page(self.root_page).unwrap(); self.stack.clear(); @@ -1455,7 +1463,10 @@ impl BTreeCursor { eq_seen, } = &self.seek_state else { - unreachable!("we must be in an interior binary search state"); + unreachable!( + "we must be in an interior binary search state, got {:?}", + self.seek_state + ); }; loop { @@ -2038,7 +2049,8 @@ impl BTreeCursor { // If we are at the beginning/end of seek state, start a new move from the root. if matches!( self.seek_state, - CursorSeekState::LeafPageBinarySearch { .. } + // these are stages that happen at the leaf page, so we can consider that the previous seek finished and we can start a new one. + CursorSeekState::LeafPageBinarySearch { .. } | CursorSeekState::FoundLeaf { .. } ) { self.seek_state = CursorSeekState::Start; } @@ -2174,6 +2186,7 @@ impl BTreeCursor { self.stack.set_cell_index(cell_idx as i32); if overflow > 0 { // A balance will happen so save the key we were inserting + tracing::debug!("insert_into_page: balance triggered on key {:?}, page={:?}, cell_idx={}", bkey, page.get().get().id, cell_idx); self.save_context(match bkey { BTreeKey::TableRowId(rowid) => CursorContext::TableRowId(rowid.0), BTreeKey::IndexKey(record) => { @@ -2203,6 +2216,12 @@ impl BTreeCursor { } }; }; + if matches!(self.state.write_info().unwrap().state, WriteState::Finish) { + // if there was a balance triggered, the cursor position is invalid. + // it's probably not the greatest idea in the world to do this eagerly here, + // but at least it works. + return_if_io!(self.restore_context()); + } self.state = CursorState::None; ret } @@ -3836,9 +3855,13 @@ impl BTreeCursor { first_overflow_page, payload_size, )); + let key_values = key.to_index_key_values(); + let record = self.get_immutable_record(); + let record = record.as_ref().unwrap(); + let record_same_number_cols = &record.get_values()[..key_values.len()]; let order = compare_immutable( - key.to_index_key_values(), - self.get_immutable_record().as_ref().unwrap().get_values(), + key_values, + record_same_number_cols, self.key_sort_order(), &self.collations, ); @@ -4084,9 +4107,8 @@ impl BTreeCursor { pub fn insert( &mut self, key: &BTreeKey, - moved_before: bool, /* Indicate whether it's necessary to traverse to find the leaf page */ + mut moved_before: bool, /* Indicate whether it's necessary to traverse to find the leaf page */ ) -> Result> { - tracing::trace!("insert"); match &self.mv_cursor { Some(mv_cursor) => match key.maybe_rowid() { Some(rowid) => { @@ -4098,8 +4120,12 @@ impl BTreeCursor { None => todo!("Support mvcc inserts with index btrees"), }, None => { - tracing::trace!("moved {}", moved_before); - if !moved_before && !matches!(self.state, CursorState::Write(..)) { + tracing::debug!("insert(moved={}, key={:?}), valid_state={:?}, cursor_state={:?}, is_write_in_progress={}", moved_before, key, self.valid_state, self.state, self.is_write_in_progress()); + if self.valid_state != CursorValidState::Valid && !self.is_write_in_progress() { + // A balance happened so we need to move. + moved_before = false; + } + if !moved_before { // Use move_to() so that we always end up on a leaf page. seek() might go back to an interior cell in index seeks, // which we never want. The reason we can use move_to() is that // find_cell() iterates the leaf page from left to right to find the insertion point anyway, so we don't care @@ -4122,15 +4148,12 @@ impl BTreeCursor { self.context.take(); // we know where we wanted to move so if there was any saved context, discard it. self.valid_state = CursorValidState::Valid; self.seek_state = CursorSeekState::Start; - tracing::info!( + tracing::debug!( "seeked to the right place, page is now {:?}", self.stack.top().get().get().id ); } return_if_io!(self.insert_into_page(key)); - // if we did a tree rebalance, eagerly seek to the right place again. - // might not be the right thing to do this eagerly, but we just want to make this work for starters... - return_if_io!(self.restore_context()); if key.maybe_rowid().is_some() { self.has_record.replace(true); } @@ -4348,7 +4371,12 @@ impl BTreeCursor { cell_idx, self.usable_space() as u16, )?; + let is_last_parent_cell = + cell_idx == parent_contents.cell_count().saturating_sub(1); drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?; + if is_last_parent_cell { + tracing::debug!("is_last_parent_cell: {:?}", parent_contents.cell_count()); + } let delete_info = self.state.mut_delete_info().unwrap(); delete_info.state = DeleteState::CheckNeedsBalancing { @@ -6508,14 +6536,14 @@ mod tests { std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() - .as_secs(), + .as_millis(), |v| { v.parse() .expect("Failed to parse SEED environment variable as u64") }, ); - let rng = ChaCha8Rng::seed_from_u64(seed); - (rng, seed) + let rng = ChaCha8Rng::seed_from_u64(seed as u64); + (rng, seed as u64) } fn btree_insert_fuzz_run( @@ -6668,7 +6696,6 @@ mod tests { let mut keys = SortedVec::new(); tracing::info!("seed: {}", seed); for i in 0..inserts { - tracing::info!("insert {}", i); pager.begin_read_tx().unwrap(); pager.begin_write_tx().unwrap(); let key = { @@ -6687,6 +6714,7 @@ mod tests { } result }; + tracing::info!("insert {}/{}: {:?}", i + 1, inserts, key); keys.push(key.clone()); let value = ImmutableRecord::from_registers( &key.iter() @@ -6713,22 +6741,62 @@ mod tests { } } } + + // Check that all keys can be found by seeking pager.begin_read_tx().unwrap(); cursor.move_to_root(); - for key in keys.iter() { - tracing::trace!("seeking key: {:?}", key); + for (i, key) in keys.iter().enumerate() { + tracing::info!("seeking key {}/{}: {:?}", i + 1, keys.len(), key); + let exists = run_until_done( + || { + cursor.seek( + SeekKey::IndexKey(&ImmutableRecord::from_registers( + &key.iter() + .map(|col| Register::Value(Value::Integer(*col))) + .collect::>(), + )), + SeekOp::GE { eq_only: true }, + ) + }, + pager.deref(), + ) + .unwrap(); + assert!(exists, "key {:?} is not found", key); + } + // Check that key count is right + cursor.move_to_root(); + let mut count = 0; + while run_until_done(|| cursor.next(), pager.deref()).unwrap() { + count += 1; + } + assert_eq!( + count, + keys.len(), + "key count is not right, got {}, expected {}", + count, + keys.len() + ); + // Check that all keys can be found in-order, by iterating the btree + cursor.move_to_root(); + let mut prev = None; + for (i, key) in keys.iter().enumerate() { + tracing::info!("iterating key {}/{}: {:?}", i + 1, keys.len(), key); run_until_done(|| cursor.next(), pager.deref()).unwrap(); let record = run_until_done(|| cursor.record(), &pager).unwrap(); let record = record.as_ref().unwrap(); - let cursor_key = record.get_values(); - assert_eq!( - cursor_key, - &key.iter() - .map(|col| RefValue::Integer(*col)) - .collect::>(), - "key {:?} is not found", - key - ); + let cur = record.get_values().clone(); + if let Some(prev) = prev { + if prev >= cur { + println!("Seed: {}", seed); + } + assert!( + prev < cur, + "keys are not in ascending order: {:?} < {:?}", + prev, + cur + ); + } + prev = Some(cur); } pager.end_read_tx().unwrap(); } diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 9a5fd30b1..255a9123e 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -850,6 +850,7 @@ impl Pager { // Providing a page is optional, if provided it will be used to avoid reading the page from disk. // This is implemented in accordance with sqlite freepage2() function. pub fn free_page(&self, page: Option, page_id: usize) -> Result<()> { + tracing::info!("free_page(page_id={})", page_id); const TRUNK_PAGE_HEADER_SIZE: usize = 8; const LEAF_ENTRY_SIZE: usize = 4; const RESERVED_SLOTS: usize = 2; diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 356025279..cf85acec5 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -1,7 +1,6 @@ #![allow(unused_variables)] use crate::numeric::{NullableInteger, Numeric}; use crate::schema::Schema; -use crate::storage::btree::CursorValidState; use crate::storage::database::FileMemoryStorage; use crate::storage::page_cache::DumbLruPageCache; use crate::storage::pager::CreateBTreeFlags; @@ -3857,13 +3856,7 @@ pub fn op_insert( Value::Integer(i) => *i, _ => unreachable!("expected integer key"), }; - // NOTE(pere): Sending moved_before == true is okay because we moved before but - // if we were to set to false after starting a balance procedure, it might - // leave undefined state. - return_if_io!(cursor.insert( - &BTreeKey::new_table_rowid(key, Some(record)), - cursor.valid_state == CursorValidState::Valid - )); + return_if_io!(cursor.insert(&BTreeKey::new_table_rowid(key, Some(record)), true)); // Only update last_insert_rowid for regular table inserts, not schema modifications if cursor.root_page() != 1 { if let Some(rowid) = return_if_io!(cursor.rowid()) { @@ -4023,7 +4016,7 @@ pub fn op_idx_insert( }; // To make this reentrant in case of `moved_before` = false, we need to check if the previous cursor.insert started // a write/balancing operation. If it did, it means we already moved to the place we wanted. - let mut moved_before = if cursor.is_write_in_progress() { + let moved_before = if cursor.is_write_in_progress() { true } else { if index_meta.unique { @@ -4043,11 +4036,6 @@ pub fn op_idx_insert( } }; - if cursor.valid_state != CursorValidState::Valid { - // A balance happened so we need to move. - moved_before = false; - } - // Start insertion of row. This might trigger a balance procedure which will take care of moving to different pages, // therefore, we don't want to seek again if that happens, meaning we don't want to return on io without moving to the following opcode // because it could trigger a movement to child page after a balance root which will leave the current page as the root page. From e1bc268a6536e2f01d5a0ec4bf177444be32cd86 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Fri, 6 Jun 2025 20:11:14 +0300 Subject: [PATCH 34/42] fix CREATE TABLE hang --- core/storage/btree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index a2ff9a854..9d287729b 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1156,7 +1156,7 @@ impl BTreeCursor { cell_idx ); - if cell_count == 0 || cell_idx == cell_count { + if cell_idx == cell_count { // do rightmost let has_parent = self.stack.has_parent(); match contents.rightmost_pointer() { From 58172641fdc1d79919832509511efc5157cc3f97 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Mon, 9 Jun 2025 10:16:57 +0300 Subject: [PATCH 35/42] Use SeekOP:LT after post-deletebalancing to end up pointing to the left of the deleted row --- core/storage/btree.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 9d287729b..58e432446 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4404,6 +4404,7 @@ impl BTreeCursor { // because index 4 is now pointing beyond the last cell [a,b,c,d] _ <-- index 4 self.stack.retreat(); } + if needs_balancing { let target_key = if page.is_index() { let record = match return_if_io!(self.record()) { @@ -4480,7 +4481,9 @@ impl BTreeCursor { SeekKey::IndexKey(immutable_record) } }; - return_if_io!(self.seek(key, SeekOp::GE { eq_only: true })); + // We want to end up pointing at the row to the left of the position of the row we deleted, so + // that after we call next() in the loop,the next row we delete will again be the same position as this one. + return_if_io!(self.seek(key, SeekOp::LT)); self.state = CursorState::None; return Ok(CursorResult::Ok(())); From 976c2f72ef5a343a1aea335190fc16592b922420 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Mon, 9 Jun 2025 23:35:25 +0300 Subject: [PATCH 36/42] OpIdxDeleteState needs another state to be re-entrant --- core/vdbe/execute.rs | 57 ++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index cf85acec5..845f20b18 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -3917,6 +3917,7 @@ pub fn op_delete( #[derive(Debug)] pub enum OpIdxDeleteState { Seeking(ImmutableRecord), // First seek row to delete + Verifying, Deleting, } pub fn op_idx_delete( @@ -3934,22 +3935,49 @@ pub fn op_idx_delete( else { unreachable!("unexpected Insn {:?}", insn) }; - tracing::debug!( - "op_idx_delete(cursor_id={}, start_reg={}, num_regs={}, state={:?})", - cursor_id, - start_reg, - num_regs, - state.op_idx_delete_state - ); loop { + tracing::debug!( + "op_idx_delete(cursor_id={}, start_reg={}, num_regs={}, rootpage={}, state={:?})", + cursor_id, + start_reg, + num_regs, + state.get_cursor(*cursor_id).as_btree_mut().root_page(), + state.op_idx_delete_state + ); match &state.op_idx_delete_state { Some(OpIdxDeleteState::Seeking(record)) => { { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); - return_if_io!( - cursor.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true }) + let found = return_if_io!( + cursor.seek(SeekKey::IndexKey(&record), SeekOp::LE { eq_only: true }) ); + tracing::debug!( + "op_idx_delete: found={:?}, rootpage={}, key={:?}", + found, + cursor.root_page(), + record + ); + } + state.op_idx_delete_state = Some(OpIdxDeleteState::Verifying); + } + Some(OpIdxDeleteState::Verifying) => { + let rowid = { + let mut cursor = state.get_cursor(*cursor_id); + let cursor = cursor.as_btree_mut(); + return_if_io!(cursor.rowid()) + }; + + if rowid.is_none() { + // If P5 is not zero, then raise an SQLITE_CORRUPT_INDEX error if no matching + // index entry is found. This happens when running an UPDATE or DELETE statement and the + // index entry to be updated or deleted is not found. For some uses of IdxDelete + // (example: the EXCEPT operator) it does not matter that no matching entry is found. + // For those cases, P5 is zero. Also, do not raise this (self-correcting and non-critical) error if in writable_schema mode. + return Err(LimboError::Corrupt(format!( + "IdxDelete: no matching index entry found for record {:?}", + make_record(&state.registers, start_reg, num_regs) + ))); } state.op_idx_delete_state = Some(OpIdxDeleteState::Deleting); } @@ -3957,17 +3985,6 @@ pub fn op_idx_delete( { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); - if return_if_io!(cursor.rowid()).is_none() { - // If P5 is not zero, then raise an SQLITE_CORRUPT_INDEX error if no matching - // index entry is found. This happens when running an UPDATE or DELETE statement and the - // index entry to be updated or deleted is not found. For some uses of IdxDelete - // (example: the EXCEPT operator) it does not matter that no matching entry is found. - // For those cases, P5 is zero. Also, do not raise this (self-correcting and non-critical) error if in writable_schema mode. - return Err(LimboError::Corrupt(format!( - "IdxDelete: no matching index entry found for record {:?}", - make_record(&state.registers, start_reg, num_regs) - ))); - } return_if_io!(cursor.delete()); } let n_change = program.n_change.get(); From d827eeade005d0bd760e0d108be69f65ce496e39 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 10 Jun 2025 11:26:39 +0300 Subject: [PATCH 37/42] For now always calculate post-balance seek key --- core/storage/btree.rs | 89 ++++++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 26 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 58e432446..2b08a21ac 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -165,19 +165,27 @@ enum DeleteSavepoint { #[derive(Debug, Clone)] enum DeleteState { Start, - LoadPage, - FindCell, + DeterminePostBalancingSeekKey, + LoadPage { + post_balancing_seek_key: Option, + }, + FindCell { + post_balancing_seek_key: Option, + }, ClearOverflowPages { cell_idx: usize, cell: BTreeCell, original_child_pointer: Option, + post_balancing_seek_key: Option, }, InteriorNodeReplacement { cell_idx: usize, original_child_pointer: Option, + post_balancing_seek_key: Option, }, CheckNeedsBalancing { rightmost_cell_was_dropped: bool, + post_balancing_seek_key: Option, }, WaitForBalancingToComplete { target_key: DeleteSavepoint, @@ -4164,14 +4172,15 @@ impl BTreeCursor { /// Delete state machine flow: /// 1. Start -> check if the rowid to be delete is present in the page or not. If not we early return - /// 2. LoadPage -> load the page. - /// 3. FindCell -> find the cell to be deleted in the page. - /// 4. ClearOverflowPages -> Clear the overflow pages if there are any before dropping the cell, then if we are in a leaf page we just drop the cell in place. + /// 2. DeterminePostBalancingSeekKey -> determine the key to seek to after balancing. + /// 3. LoadPage -> load the page. + /// 4. FindCell -> find the cell to be deleted in the page. + /// 5. ClearOverflowPages -> Clear the overflow pages if there are any before dropping the cell, then if we are in a leaf page we just drop the cell in place. /// if we are in interior page, we need to rotate keys in order to replace current cell (InteriorNodeReplacement). - /// 5. InteriorNodeReplacement -> we copy the left subtree leaf node into the deleted interior node's place. - /// 6. WaitForBalancingToComplete -> perform balancing - /// 7. SeekAfterBalancing -> adjust the cursor to a node that is closer to the deleted value. go to Finish - /// 8. Finish -> Delete operation is done. Return CursorResult(Ok()) + /// 6. InteriorNodeReplacement -> we copy the left subtree leaf node into the deleted interior node's place. + /// 7. WaitForBalancingToComplete -> perform balancing + /// 8. SeekAfterBalancing -> adjust the cursor to a node that is closer to the deleted value. go to Finish + /// 9. Finish -> Delete operation is done. Return CursorResult(Ok()) pub fn delete(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); @@ -4213,18 +4222,49 @@ impl BTreeCursor { } let delete_info = self.state.mut_delete_info().unwrap(); - delete_info.state = DeleteState::LoadPage; + delete_info.state = DeleteState::DeterminePostBalancingSeekKey; } - DeleteState::LoadPage => { + DeleteState::DeterminePostBalancingSeekKey => { + // FIXME: skip this work if we determine deletion wont result in balancing + // Right now we calculate the key every time for simplicity/debugging + // since it won't affect correctness which is more important + let page = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page); + let target_key = if page.get().is_index() { + let record = match return_if_io!(self.record()) { + Some(record) => record.clone(), + None => unreachable!("there should've been a record"), + }; + DeleteSavepoint::Payload(record) + } else { + let Some(rowid) = return_if_io!(self.rowid()) else { + panic!("cursor should be pointing to a record with a rowid"); + }; + DeleteSavepoint::Rowid(rowid) + }; + + let delete_info = self.state.mut_delete_info().unwrap(); + delete_info.state = DeleteState::LoadPage { + post_balancing_seek_key: Some(target_key), + }; + } + + DeleteState::LoadPage { + post_balancing_seek_key, + } => { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let delete_info = self.state.mut_delete_info().unwrap(); - delete_info.state = DeleteState::FindCell; + delete_info.state = DeleteState::FindCell { + post_balancing_seek_key, + }; } - DeleteState::FindCell => { + DeleteState::FindCell { + post_balancing_seek_key, + } => { let page = self.stack.top(); let cell_idx = self.stack.current_cell_index() as usize; @@ -4268,6 +4308,7 @@ impl BTreeCursor { cell_idx, cell, original_child_pointer, + post_balancing_seek_key, }; } @@ -4275,6 +4316,7 @@ impl BTreeCursor { cell_idx, cell, original_child_pointer, + post_balancing_seek_key, } => { return_if_io!(self.clear_overflow_pages(&cell)); @@ -4289,6 +4331,7 @@ impl BTreeCursor { delete_info.state = DeleteState::InteriorNodeReplacement { cell_idx, original_child_pointer, + post_balancing_seek_key, }; } else { let contents = page.get().contents.as_mut().unwrap(); @@ -4297,6 +4340,7 @@ impl BTreeCursor { let delete_info = self.state.mut_delete_info().unwrap(); delete_info.state = DeleteState::CheckNeedsBalancing { rightmost_cell_was_dropped: is_last_cell, + post_balancing_seek_key, }; } } @@ -4304,6 +4348,7 @@ impl BTreeCursor { DeleteState::InteriorNodeReplacement { cell_idx, original_child_pointer, + post_balancing_seek_key, } => { // This is an interior node, we need to handle deletion differently // For interior nodes: @@ -4381,11 +4426,13 @@ impl BTreeCursor { let delete_info = self.state.mut_delete_info().unwrap(); delete_info.state = DeleteState::CheckNeedsBalancing { rightmost_cell_was_dropped: false, // TODO: when is this true? + post_balancing_seek_key, }; } DeleteState::CheckNeedsBalancing { rightmost_cell_was_dropped, + post_balancing_seek_key, } => { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); @@ -4406,25 +4453,15 @@ impl BTreeCursor { } if needs_balancing { - let target_key = if page.is_index() { - let record = match return_if_io!(self.record()) { - Some(record) => record.clone(), - None => unreachable!("there should've been a record"), - }; - DeleteSavepoint::Payload(record) - } else { - let Some(rowid) = return_if_io!(self.rowid()) else { - panic!("cursor should be pointing to a record with a rowid"); - }; - DeleteSavepoint::Rowid(rowid) - }; let delete_info = self.state.mut_delete_info().unwrap(); if delete_info.balance_write_info.is_none() { let mut write_info = WriteInfo::new(); write_info.state = WriteState::BalanceStart; delete_info.balance_write_info = Some(write_info); } - delete_info.state = DeleteState::WaitForBalancingToComplete { target_key } + delete_info.state = DeleteState::WaitForBalancingToComplete { + target_key: post_balancing_seek_key.unwrap(), + } } else { self.stack.retreat(); self.state = CursorState::None; From 10caca25c92d95ca52889207c926f38ef5fdc154 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 10 Jun 2025 11:29:02 +0300 Subject: [PATCH 38/42] advance in balance_non_root() if -1 idx --- core/storage/btree.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 2b08a21ac..a9543b2f6 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2322,6 +2322,10 @@ impl BTreeCursor { == parent_page.get_contents().cell_count() + 1 { self.stack.retreat(); + } else if self.stack.current_cell_index() == -1 { + // We might've retreated in CheckRequiresBalancing, so advance to the next cell + // to prevent panic in the asserts below due to -1 index + self.stack.advance(); } parent_page.set_dirty(); self.pager.add_dirty(parent_page.get().id); From 9caa8334be129f12710f0e4fb74ebe3d6c83baf2 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 10 Jun 2025 13:44:15 +0300 Subject: [PATCH 39/42] add FIXME about balance after interior node replacement --- core/storage/btree.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index a9543b2f6..5388a9d48 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4467,6 +4467,8 @@ impl BTreeCursor { target_key: post_balancing_seek_key.unwrap(), } } else { + // FIXME: if we deleted something from an interior page, this is now the leaf page from where a replacement cell + // was taken in InteriorNodeReplacement. We must also check if the parent needs balancing!!! self.stack.retreat(); self.state = CursorState::None; return Ok(CursorResult::Ok(())); From 07c947b47de2e7396b08b3d19c89af9009e78231 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 10 Jun 2025 14:13:07 +0300 Subject: [PATCH 40/42] use GE for idx delete seek, although it doesnt really matter --- core/vdbe/execute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 845f20b18..1e56778b7 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -3950,7 +3950,7 @@ pub fn op_idx_delete( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let found = return_if_io!( - cursor.seek(SeekKey::IndexKey(&record), SeekOp::LE { eq_only: true }) + cursor.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true }) ); tracing::debug!( "op_idx_delete: found={:?}, rootpage={}, key={:?}", From 6d2ca582356a5d8ffab11bec6401d2d8f582fe28 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 10 Jun 2025 14:14:52 +0300 Subject: [PATCH 41/42] get_prev_record() small fixes --- core/storage/btree.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5388a9d48..602585e59 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -626,7 +626,7 @@ impl BTreeCursor { } if cell_idx >= cell_count as i32 { self.stack.set_cell_index(cell_count as i32 - 1); - } else { + } else if !self.stack.current_cell_index_less_than_min() { let is_index = page.is_index(); // skip retreat in case we still haven't visited this cell in index let should_visit_internal_node = is_index && self.going_upwards; // we are going upwards, this means we still need to visit divider cell in an index @@ -1142,9 +1142,9 @@ impl BTreeCursor { let is_index = mem_page_rc.get().is_index(); let should_skip_advance = is_index && self.going_upwards // we are going upwards, this means we still need to visit divider cell in an index - && self.stack.current_cell_index() < cell_count as i32; // if we weren't on a - // valid cell then it means we will have to move upwards again or move to right page, - // anyways, we won't visit this invalid cell index + && self.stack.current_cell_index() >= 0 && self.stack.current_cell_index() < cell_count as i32; // if we weren't on a + // valid cell then it means we will have to move upwards again or move to right page, + // anyways, we won't visit this invalid cell index if should_skip_advance { tracing::debug!( "next: skipping advance, going_upwards: {}, page: {}, cell_idx: {}", From f8df870fb76de8ddb00d86882358db5096cfb4d6 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 10 Jun 2025 14:15:44 +0300 Subject: [PATCH 42/42] Fix implementation of InteriorNodeReplacement(interior index cell being deleted) --- core/storage/btree.rs | 134 +++++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 60 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 602585e59..4456024ce 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -4354,82 +4354,96 @@ impl BTreeCursor { original_child_pointer, post_balancing_seek_key, } => { - // This is an interior node, we need to handle deletion differently - // For interior nodes: - // 1. Move cursor to largest entry in left subtree - // 2. Copy that entry to replace the one being deleted - // 3. Delete the leaf entry + // This is an interior node, we need to handle deletion differently. + // 1. Move cursor to the largest key in the left subtree. + // 2. Replace the cell in the interior (parent) node with that key. + // 3. Delete that key from the child page. - // Move to the largest key in the left subtree + // Step 1: Move cursor to the largest key in the left subtree. return_if_io!(self.prev()); + let (cell_payload, leaf_cell_idx) = { + let leaf_page_ref = self.stack.top(); + let leaf_page = leaf_page_ref.get(); + let leaf_contents = leaf_page.get().contents.as_ref().unwrap(); + assert!(leaf_contents.is_leaf()); + assert!(leaf_contents.cell_count() > 0); + let leaf_cell_idx = leaf_contents.cell_count() - 1; + let last_cell_on_child_page = leaf_contents.cell_get( + leaf_cell_idx, + payload_overflow_threshold_max( + leaf_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + leaf_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + )?; - let leaf_page = self.stack.top(); - return_if_locked_maybe_load!(self.pager, leaf_page); - assert!( - matches!( - leaf_page.get().get_contents().page_type(), - PageType::TableLeaf | PageType::IndexLeaf - ), - "self.prev should have returned a leaf page" - ); + let mut cell_payload: Vec = Vec::new(); + let child_pointer = + original_child_pointer.expect("there should be a pointer"); + // Rewrite the old leaf cell as an interior cell depending on type. + match last_cell_on_child_page { + BTreeCell::TableLeafCell(leaf_cell) => { + // Table interior cells contain the left child pointer and the rowid as varint. + cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); + write_varint_to_vec(leaf_cell._rowid as u64, &mut cell_payload); + } + BTreeCell::IndexLeafCell(leaf_cell) => { + // Index interior cells contain: + // 1. The left child pointer + // 2. The payload size as varint + // 3. The payload + // 4. The first overflow page as varint, omitted if no overflow. + cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); + write_varint_to_vec(leaf_cell.payload_size, &mut cell_payload); + cell_payload.extend_from_slice(leaf_cell.payload); + if let Some(first_overflow_page) = leaf_cell.first_overflow_page { + cell_payload + .extend_from_slice(&first_overflow_page.to_be_bytes()); + } + } + _ => unreachable!("Expected table leaf cell"), + } + (cell_payload, leaf_cell_idx) + }; let parent_page = self.stack.parent_page().unwrap(); - assert!(parent_page.get().is_loaded(), "parent page"); - - let leaf_page = leaf_page.get(); - let leaf_contents = leaf_page.get().contents.as_ref().unwrap(); - // The index of the cell to removed must be the last one. - let leaf_cell_idx = leaf_contents.cell_count() - 1; - let predecessor_cell = leaf_contents.cell_get( - leaf_cell_idx, - payload_overflow_threshold_max( - leaf_contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - leaf_contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; + let leaf_page = self.stack.top(); parent_page.get().set_dirty(); self.pager.add_dirty(parent_page.get().get().id); + leaf_page.get().set_dirty(); + self.pager.add_dirty(leaf_page.get().get().id); - let parent_page = parent_page.get(); - let parent_contents = parent_page.get().contents.as_mut().unwrap(); + // Step 2: Replace the cell in the parent (interior) page. + { + let parent_page_ref = parent_page.get(); + let parent_contents = parent_page_ref.get().contents.as_mut().unwrap(); - // Create an interior cell from a predecessor - let mut cell_payload: Vec = Vec::new(); - let child_pointer = original_child_pointer.expect("there should be a pointer"); - match predecessor_cell { - BTreeCell::TableLeafCell(leaf_cell) => { - cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); - write_varint_to_vec(leaf_cell._rowid as u64, &mut cell_payload); - } - BTreeCell::IndexLeafCell(leaf_cell) => { - cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); - cell_payload.extend_from_slice(leaf_cell.payload); - } - _ => unreachable!("Expected table leaf cell"), + // First, drop the old cell that is being replaced. + drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?; + // Then, insert the new cell (the predecessor) in its place. + insert_into_cell( + parent_contents, + &cell_payload, + cell_idx, + self.usable_space() as u16, + )?; } - insert_into_cell( - parent_contents, - &cell_payload, - cell_idx, - self.usable_space() as u16, - )?; - let is_last_parent_cell = - cell_idx == parent_contents.cell_count().saturating_sub(1); - drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?; - if is_last_parent_cell { - tracing::debug!("is_last_parent_cell: {:?}", parent_contents.cell_count()); + // Step 3: Delete the predecessor cell from the leaf page. + { + let leaf_page_ref = leaf_page.get(); + let leaf_contents = leaf_page_ref.get().contents.as_mut().unwrap(); + drop_cell(leaf_contents, leaf_cell_idx, self.usable_space() as u16)?; } let delete_info = self.state.mut_delete_info().unwrap(); delete_info.state = DeleteState::CheckNeedsBalancing { - rightmost_cell_was_dropped: false, // TODO: when is this true? + rightmost_cell_was_dropped: false, post_balancing_seek_key, }; }