diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index fd70ebbeb..d38006d4c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -40,7 +40,7 @@ jobs: env: RUST_LOG: ${{ runner.debug && 'limbo_core::storage=trace' || '' }} run: cargo test --verbose - timeout-minutes: 10 + timeout-minutes: 20 clippy: @@ -67,6 +67,7 @@ jobs: test-limbo: runs-on: blacksmith-4vcpu-ubuntu-2404 + timeout-minutes: 20 steps: - name: Install cargo-c env: @@ -91,7 +92,7 @@ jobs: - uses: "./.github/shared/install_sqlite" - name: Test run: make test - timeout-minutes: 10 + timeout-minutes: 20 test-sqlite: runs-on: blacksmith-4vcpu-ubuntu-2404 diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 75a9a7e36..4456024ce 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -10,7 +10,7 @@ use crate::{ }, }, translate::{collate::CollationSeq, plan::IterationDirection}, - types::{IndexKeyInfo, IndexKeySortOrder}, + types::{IndexKeyInfo, IndexKeySortOrder, ParseRecordState}, MvCursor, }; @@ -165,18 +165,28 @@ enum DeleteSavepoint { #[derive(Debug, Clone)] enum DeleteState { Start, - LoadPage, - FindCell, + DeterminePostBalancingSeekKey, + LoadPage { + post_balancing_seek_key: Option, + }, + FindCell { + post_balancing_seek_key: Option, + }, ClearOverflowPages { cell_idx: usize, cell: BTreeCell, original_child_pointer: Option, + post_balancing_seek_key: Option, }, InteriorNodeReplacement { cell_idx: usize, original_child_pointer: Option, + post_balancing_seek_key: Option, + }, + CheckNeedsBalancing { + rightmost_cell_was_dropped: bool, + post_balancing_seek_key: Option, }, - CheckNeedsBalancing, WaitForBalancingToComplete { target_key: DeleteSavepoint, }, @@ -309,15 +319,6 @@ impl WriteInfo { } } -/// Whether the cursor is currently pointing to a record. -#[derive(Debug, Clone, Copy, PartialEq)] -enum CursorHasRecord { - Yes { - rowid: Option, // not all indexes and btrees have rowids, so this is optional. - }, - No, -} - /// Holds the state machine for the operation that was in flight when the cursor /// was suspended due to IO. enum CursorState { @@ -390,7 +391,7 @@ enum OverflowState { /// Holds a Record or RowId, so that these can be transformed into a SeekKey to restore /// cursor position to its previous location. -enum CursorContext { +pub enum CursorContext { TableRowId(i64), /// If we are in an index tree we can then reuse this field to save @@ -399,7 +400,8 @@ enum CursorContext { } /// In the future, we may expand these general validity states -enum CursorValidState { +#[derive(Debug, PartialEq, Eq)] +pub enum CursorValidState { /// Cursor is pointing a to an existing location/cell in the Btree Valid, /// Cursor may be pointing to a non-existent location/cell. This can happen after balancing operations @@ -408,183 +410,48 @@ enum CursorValidState { #[derive(Debug)] /// State used for seeking -enum CursorSeekState { +pub enum CursorSeekState { Start, - Seeking { - /// Min cell_idx - min: isize, - /// Max cell_idx - max: isize, - nearest_matching_cell: Option, + MovingBetweenPages { + eq_seen: Cell, + }, + InteriorPageBinarySearch { + min_cell_idx: Cell, + max_cell_idx: Cell, + nearest_matching_cell: Cell>, + eq_seen: Cell, + }, + FoundLeaf { + eq_seen: Cell, + }, + LeafPageBinarySearch { + min_cell_idx: Cell, + max_cell_idx: Cell, + nearest_matching_cell: Cell>, + /// Indicates if we have seen an exact match during the downwards traversal of the btree. + /// This is only needed in index seeks, in cases where we need to determine whether we call + /// an additional next()/prev() to fetch a matching record from an interior node. We will not + /// do that if both are true: + /// 1. We have not seen an EQ during the traversal + /// 2. We are looking for an exact match ([SeekOp::GE] or [SeekOp::LE] with eq_only: true) + eq_seen: Cell, /// Indicates when we have not not found a value in leaf and now will look in the next/prev record. /// This value is only used for indexbtree - not_found_leaf: bool, + moving_up_to_parent: Cell, }, } -// These functions below exist to avoid problems with the borrow checker -impl CursorSeekState { - /// method can only be called on CursorSeekState::Seeking - fn set_nearest_matching_cell(&mut self, matching_cell: Option) { - let CursorSeekState::Seeking { - nearest_matching_cell, - .. - } = self - else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *nearest_matching_cell = matching_cell; - } - - /// method can only be called on CursorSeekState::Seeking - fn get_nearest_matching_cell(&mut self) -> Option { - let CursorSeekState::Seeking { - nearest_matching_cell, - .. - } = self - else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - nearest_matching_cell.clone() - } - - /// method can only be called on CursorSeekState::Seeking - fn set_max(&mut self, max: isize) { - let CursorSeekState::Seeking { max: max_state, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *max_state = max; - } - - /// method can only be called on CursorSeekState::Seeking - fn get_max(&mut self) -> isize { - let CursorSeekState::Seeking { max, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *max - } - - /// method can only be called on CursorSeekState::Seeking - fn set_min(&mut self, min: isize) { - let CursorSeekState::Seeking { min: min_state, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *min_state = min; - } - - /// method can only be called on CursorSeekState::Seeking - fn get_min(&mut self) -> isize { - let CursorSeekState::Seeking { min, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *min - } - - /// method can only be called on CursorSeekState::Seeking - fn set_not_found_leaf(&mut self, not_found: bool) { - let CursorSeekState::Seeking { not_found_leaf, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *not_found_leaf = not_found; - } - - /// method can only be called on CursorSeekState::Seeking - fn get_not_found_leaf(&mut self) -> bool { - let CursorSeekState::Seeking { not_found_leaf, .. } = self else { - unreachable!("method can only be called on CursorSeekState::Seeking") - }; - *not_found_leaf - } -} - #[derive(Debug)] -/// State used for seeking -enum CursorMoveToState { - Start, - ContinueLoop, - Seeking { - /// Min cell_idx - min: isize, - /// Max cell_idx - max: isize, - nearest_matching_cell: Option, - }, -} - -// These functions below exist to avoid problems with the borrow checker -impl CursorMoveToState { - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn set_nearest_matching_cell(&mut self, matching_cell: Option) { - match self { - CursorMoveToState::Seeking { - nearest_matching_cell, - .. - } => *nearest_matching_cell = matching_cell, - _ => unreachable!("method can only be called on CursorMoveToState::Seeking"), - }; - } - - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn get_nearest_matching_cell(&mut self) -> Option { - match self { - CursorMoveToState::Seeking { - nearest_matching_cell, - .. - } => nearest_matching_cell.clone(), - _ => unreachable!("method can only be called on CursorMoveToState::Seeking"), - } - } - - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn set_max(&mut self, max: isize) { - match self { - CursorMoveToState::Seeking { max: max_state, .. } => *max_state = max, - _ => unreachable!("method can only be called on CursorMoveToState::Seeking"), - } - } - - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn get_max(&mut self) -> isize { - match self { - CursorMoveToState::Seeking { max: max_state, .. } => *max_state, - _ => unreachable!("method can only be called on CursorMoveToState::Seeking"), - } - } - - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn set_min(&mut self, min: isize) { - match self { - CursorMoveToState::Seeking { min: min_state, .. } => *min_state = min, - _ => unreachable!("method can only be called on CursorMoveToState::Seeking "), - } - } - - /// # Safety - /// method can only be called on CursorMoveToState::Seeking - fn get_min(&mut self) -> isize { - match self { - CursorMoveToState::Seeking { min: min_state, .. } => *min_state, - _ => unreachable!("method can only be called on CursorMoveToState::Seeking"), - } - } -} - -#[derive(Debug)] -struct FindCellState(Option); +struct FindCellState(Option); impl FindCellState { #[inline] - fn set(&mut self, cell_idx: usize) { + fn set(&mut self, cell_idx: isize) { self.0 = Some(cell_idx) } #[inline] - fn get_cell_idx(&mut self) -> usize { + fn get_cell_idx(&mut self) -> isize { self.0.expect("get can only be called after a set") } @@ -602,7 +469,7 @@ pub struct BTreeCursor { /// Page id of the root page used to go back up fast. root_page: usize, /// Rowid and record are stored before being consumed. - has_record: Cell, + has_record: Cell, null_flag: bool, /// Index internal pages are consumed on the way up, so we store going upwards flag in case /// we just moved to a parent page and the parent page is an internal index page which requires @@ -618,22 +485,23 @@ pub struct BTreeCursor { stack: PageStack, /// Reusable immutable record, used to allow better allocation strategy. reusable_immutable_record: RefCell>, + /// Reusable immutable record, used to allow better allocation strategy. + parse_record_state: RefCell, pub index_key_info: Option, /// Maintain count of the number of records in the btree. Used for the `Count` opcode count: usize, /// Stores the cursor context before rebalancing so that a seek can be done later context: Option, /// Store whether the Cursor is in a valid state. Meaning if it is pointing to a valid cell index or not - valid_state: CursorValidState, + pub valid_state: CursorValidState, /// Colations for Index Btree constraint checks /// Contains the Collation Seq for the whole Index /// This Vec should be empty for Table Btree collations: Vec, seek_state: CursorSeekState, - move_to_state: CursorMoveToState, /// Separate state to read a record with overflow pages. This separation from `state` is necessary as /// we can be in a function that relies on `state`, but also needs to process overflow pages - read_overflow_state: Option, + read_overflow_state: RefCell>, /// Contains the current cell_idx for `find_cell` find_cell_state: FindCellState, } @@ -649,7 +517,7 @@ impl BTreeCursor { mv_cursor, pager, root_page, - has_record: Cell::new(CursorHasRecord::No), + has_record: Cell::new(false), null_flag: false, going_upwards: false, state: CursorState::None, @@ -666,9 +534,9 @@ impl BTreeCursor { valid_state: CursorValidState::Valid, collations, seek_state: CursorSeekState::Start, - move_to_state: CursorMoveToState::Start, - read_overflow_state: None, + read_overflow_state: RefCell::new(None), find_cell_state: FindCellState(None), + parse_record_state: RefCell::new(ParseRecordState::Init), } } @@ -735,14 +603,44 @@ impl BTreeCursor { /// Move the cursor to the previous record and return it. /// Used in backwards iteration. - fn get_prev_record( - &mut self, - predicate: Option<(SeekKey<'_>, SeekOp)>, - ) -> Result> { + fn get_prev_record(&mut self) -> Result> { loop { let page = self.stack.top(); + + return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); + let contents = page.get().contents.as_ref().unwrap(); + + let cell_count = contents.cell_count(); let cell_idx = self.stack.current_cell_index(); + // If we are at the end of the page and we haven't just come back from the right child, + // we now need to move to the rightmost child. + if self.stack.current_cell_index() == i32::MAX && !self.going_upwards { + let rightmost_pointer = contents.rightmost_pointer(); + if let Some(rightmost_pointer) = rightmost_pointer { + self.stack + .push_backwards(self.read_page(rightmost_pointer as usize)?); + continue; + } + } + if cell_idx >= cell_count as i32 { + self.stack.set_cell_index(cell_count as i32 - 1); + } else if !self.stack.current_cell_index_less_than_min() { + let is_index = page.is_index(); + // skip retreat in case we still haven't visited this cell in index + let should_visit_internal_node = is_index && self.going_upwards; // we are going upwards, this means we still need to visit divider cell in an index + let page_type = contents.page_type(); + if should_visit_internal_node { + self.going_upwards = false; + return Ok(CursorResult::Ok(true)); + } else if matches!( + page_type, + PageType::IndexLeaf | PageType::TableLeaf | PageType::TableInterior + ) { + self.stack.retreat(); + } + } // moved to beginning of current page // todo: find a better way to flag moved to end or begin of page if self.stack.current_cell_index_less_than_min() { @@ -755,37 +653,14 @@ impl BTreeCursor { self.stack.pop(); } else { // moved to begin of btree - return Ok(CursorResult::Ok(CursorHasRecord::No)); + // dbg!(false); + return Ok(CursorResult::Ok(false)); } } // continue to next loop to get record from the new page continue; } - - let cell_idx = cell_idx as usize; - return_if_locked_maybe_load!(self.pager, page); - let page = page.get(); - let contents = page.get().contents.as_ref().unwrap(); - - let cell_count = contents.cell_count(); - - // If we are at the end of the page and we haven't just come back from the right child, - // we now need to move to the rightmost child. - if cell_idx as i32 == i32::MAX && !self.going_upwards { - let rightmost_pointer = contents.rightmost_pointer(); - if let Some(rightmost_pointer) = rightmost_pointer { - self.stack - .push_backwards(self.read_page(rightmost_pointer as usize)?); - continue; - } - } - - let cell_idx = if cell_idx >= cell_count { - self.stack.set_cell_index(cell_count as i32 - 1); - cell_count - 1 - } else { - cell_idx - }; + let cell_idx = self.stack.current_cell_index() as usize; let cell = contents.cell_get( cell_idx, @@ -800,34 +675,14 @@ impl BTreeCursor { _rowid, }) => { let mem_page = self.read_page(_left_child_page as usize)?; - self.stack.retreat(); self.stack.push_backwards(mem_page); continue; } - BTreeCell::TableLeafCell(TableLeafCell { - _rowid, - _payload, - first_overflow_page, - payload_size, - }) => { - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read(_payload, next_page, payload_size)) - } else { - crate::storage::sqlite3_ondisk::read_record( - _payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; - self.stack.retreat(); - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: Some(_rowid), - })); + BTreeCell::TableLeafCell(TableLeafCell { .. }) => { + return Ok(CursorResult::Ok(true)); } BTreeCell::IndexInteriorCell(IndexInteriorCell { - payload, - left_child_page, - first_overflow_page, - payload_size, + left_child_page, .. }) => { if !self.going_upwards { // In backwards iteration, if we haven't just moved to this interior node from the @@ -837,116 +692,20 @@ impl BTreeCursor { // this parent: key 666 // left child has: key 663, key 664, key 665 // we need to move to the previous parent (with e.g. key 662) when iterating backwards. - self.stack.retreat(); let mem_page = self.read_page(left_child_page as usize)?; - self.stack.push(mem_page); - // use cell_index = i32::MAX to tell next loop to go to the end of the current page - self.stack.set_cell_index(i32::MAX); + self.stack.retreat(); + self.stack.push_backwards(mem_page); continue; } - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read(payload, next_page, payload_size)) - } else { - crate::storage::sqlite3_ondisk::read_record( - payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; // Going upwards = we just moved to an interior cell from the right child. // On the first pass we must take the record from the interior cell (since unlike table btrees, index interior cells have payloads) // We then mark going_upwards=false so that we go back down the tree on the next invocation. self.going_upwards = false; - if predicate.is_none() { - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - })); - } - - let (key, op) = predicate.as_ref().unwrap(); - let SeekKey::IndexKey(index_key) = key else { - unreachable!("index seek key should be a record"); - }; - let order = { - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_values = record.get_values(); - let record_slice_same_num_cols = - &record_values[..index_key.get_values().len()]; - let order = compare_immutable( - record_slice_same_num_cols, - index_key.get_values(), - self.key_sort_order(), - &self.collations, - ); - order - }; - - let found = match op { - SeekOp::EQ => order.is_eq(), - SeekOp::LE => order.is_le(), - SeekOp::LT => order.is_lt(), - _ => unreachable!("Seek GT/GE should not happen in get_prev_record() because we are iterating backwards"), - }; - if found { - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - })); - } else { - continue; - } + return Ok(CursorResult::Ok(true)); } - BTreeCell::IndexLeafCell(IndexLeafCell { - payload, - first_overflow_page, - payload_size, - }) => { - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read(payload, next_page, payload_size)) - } else { - crate::storage::sqlite3_ondisk::read_record( - payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; - - self.stack.retreat(); - if predicate.is_none() { - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - })); - } - let (key, op) = predicate.as_ref().unwrap(); - let SeekKey::IndexKey(index_key) = key else { - unreachable!("index seek key should be a record"); - }; - let order = { - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_values = record.get_values(); - let record_slice_same_num_cols = - &record_values[..index_key.get_values().len()]; - let order = compare_immutable( - record_slice_same_num_cols, - index_key.get_values(), - self.key_sort_order(), - &self.collations, - ); - order - }; - let found = match op { - SeekOp::EQ => order.is_eq(), - SeekOp::LE => order.is_le(), - SeekOp::LT => order.is_lt(), - _ => unreachable!("Seek GT/GE should not happen in get_prev_record() because we are iterating backwards"), - }; - if found { - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - })); - } else { - continue; - } + BTreeCell::IndexLeafCell(IndexLeafCell { .. }) => { + return Ok(CursorResult::Ok(true)); } } } @@ -955,76 +714,68 @@ impl BTreeCursor { /// Reads the record of a cell that has overflow pages. This is a state machine that requires to be called until completion so everything /// that calls this function should be reentrant. fn process_overflow_read( - &mut self, + &self, payload: &'static [u8], start_next_page: u32, payload_size: u64, ) -> Result> { - let res = match &mut self.read_overflow_state { - None => { - tracing::debug!("start reading overflow page payload_size={}", payload_size); - let page = self.read_page(start_next_page as usize)?; - self.read_overflow_state = Some(ReadPayloadOverflow { - payload: payload.to_vec(), - next_page: start_next_page, - remaining_to_read: payload_size as usize - payload.len(), - page, - }); - CursorResult::IO - } - Some(ReadPayloadOverflow { - payload, - next_page, - remaining_to_read, - page: page_btree, - }) => { - if page_btree.get().is_locked() { - return Ok(CursorResult::IO); - } - tracing::debug!("reading overflow page {} {}", next_page, remaining_to_read); - let page = page_btree.get(); - let contents = page.get_contents(); - // The first four bytes of each overflow page are a big-endian integer which is the page number of the next page in the chain, or zero for the final page in the chain. - let next = contents.read_u32_no_offset(0); - let buf = contents.as_ptr(); - let usable_space = self.pager.usable_space(); - let to_read = (*remaining_to_read).min(usable_space - 4); - payload.extend_from_slice(&buf[4..4 + to_read]); - *remaining_to_read -= to_read; - if *remaining_to_read == 0 || next == 0 { - assert!( - *remaining_to_read == 0 && next == 0, - "we can't have more pages to read while also have read everything" - ); - let mut payload_swap = Vec::new(); - std::mem::swap(payload, &mut payload_swap); - CursorResult::Ok(payload_swap) - } else { - let new_page = self.pager.read_page(next as usize).map(|page| { - Arc::new(BTreePageInner { - page: RefCell::new(page), - }) - })?; - *page_btree = new_page; - *next_page = next; - CursorResult::IO - } - } - }; - match res { - CursorResult::Ok(payload) => { - { - let mut reuse_immutable = self.get_immutable_record_or_create(); - crate::storage::sqlite3_ondisk::read_record( - &payload, - reuse_immutable.as_mut().unwrap(), - )?; - } - self.read_overflow_state = None; - Ok(CursorResult::Ok(())) - } - CursorResult::IO => Ok(CursorResult::IO), + if self.read_overflow_state.borrow().is_none() { + let page = self.read_page(start_next_page as usize)?; + *self.read_overflow_state.borrow_mut() = Some(ReadPayloadOverflow { + payload: payload.to_vec(), + next_page: start_next_page, + remaining_to_read: payload_size as usize - payload.len(), + page, + }); + return Ok(CursorResult::IO); } + let mut read_overflow_state = self.read_overflow_state.borrow_mut(); + let ReadPayloadOverflow { + payload, + next_page, + remaining_to_read, + page: page_btree, + } = read_overflow_state.as_mut().unwrap(); + + if page_btree.get().is_locked() { + return Ok(CursorResult::IO); + } + tracing::debug!("reading overflow page {} {}", next_page, remaining_to_read); + let page = page_btree.get(); + let contents = page.get_contents(); + // The first four bytes of each overflow page are a big-endian integer which is the page number of the next page in the chain, or zero for the final page in the chain. + let next = contents.read_u32_no_offset(0); + let buf = contents.as_ptr(); + let usable_space = self.pager.usable_space(); + let to_read = (*remaining_to_read).min(usable_space - 4); + payload.extend_from_slice(&buf[4..4 + to_read]); + *remaining_to_read -= to_read; + + if *remaining_to_read != 0 && next != 0 { + let new_page = self.pager.read_page(next as usize).map(|page| { + Arc::new(BTreePageInner { + page: RefCell::new(page), + }) + })?; + *page_btree = new_page; + *next_page = next; + return Ok(CursorResult::IO); + } + assert!( + *remaining_to_read == 0 && next == 0, + "we can't have more pages to read while also have read everything" + ); + let mut payload_swap = Vec::new(); + std::mem::swap(payload, &mut payload_swap); + + let mut reuse_immutable = self.get_immutable_record_or_create(); + crate::storage::sqlite3_ondisk::read_record( + &payload_swap, + reuse_immutable.as_mut().unwrap(), + )?; + + let _ = read_overflow_state.take(); + Ok(CursorResult::Ok(())) } /// Calculates how much of a cell's payload should be stored locally vs in overflow pages @@ -1362,44 +1113,58 @@ impl BTreeCursor { /// Move the cursor to the next record and return it. /// Used in forwards iteration, which is the default. - fn get_next_record( - &mut self, - predicate: Option<(SeekKey<'_>, SeekOp)>, - ) -> Result> { + fn get_next_record(&mut self) -> Result> { if let Some(mv_cursor) = &self.mv_cursor { let mut mv_cursor = mv_cursor.borrow_mut(); let rowid = mv_cursor.current_row_id(); match rowid { - Some(rowid) => { - let record = mv_cursor.current_row().unwrap().unwrap(); - crate::storage::sqlite3_ondisk::read_record( - &record.data, - self.get_immutable_record_or_create().as_mut().unwrap(), - )?; + Some(_rowid) => { mv_cursor.forward(); - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: Some(rowid.row_id), - })); + return Ok(CursorResult::Ok(true)); } - None => return Ok(CursorResult::Ok(CursorHasRecord::No)), + None => return Ok(CursorResult::Ok(false)), } } loop { let mem_page_rc = self.stack.top(); - let cell_idx = self.stack.current_cell_index() as usize; - - tracing::trace!( - "current id={} cell={}", - mem_page_rc.get().get().id, - cell_idx - ); return_if_locked_maybe_load!(self.pager, mem_page_rc); let mem_page = mem_page_rc.get(); let contents = mem_page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); + tracing::debug!( + "next: current_before_advance id={} cell={}, cell_count={}", + mem_page_rc.get().get().id, + self.stack.current_cell_index(), + cell_count + ); - if cell_count == 0 || cell_idx == cell_count { + let is_index = mem_page_rc.get().is_index(); + let should_skip_advance = is_index + && self.going_upwards // we are going upwards, this means we still need to visit divider cell in an index + && self.stack.current_cell_index() >= 0 && self.stack.current_cell_index() < cell_count as i32; // if we weren't on a + // valid cell then it means we will have to move upwards again or move to right page, + // anyways, we won't visit this invalid cell index + if should_skip_advance { + tracing::debug!( + "next: skipping advance, going_upwards: {}, page: {}, cell_idx: {}", + self.going_upwards, + mem_page_rc.get().get().id, + self.stack.current_cell_index() + ); + self.going_upwards = false; + return Ok(CursorResult::Ok(true)); + } + // Important to advance only after loading the page in order to not advance > 1 times + self.stack.advance(); + let cell_idx = self.stack.current_cell_index() as usize; + tracing::debug!( + "next:current id={} cell={}", + mem_page_rc.get().get().id, + cell_idx + ); + + if cell_idx == cell_count { // do rightmost let has_parent = self.stack.has_parent(); match contents.rightmost_pointer() { @@ -1416,7 +1181,7 @@ impl BTreeCursor { self.stack.pop(); continue; } else { - return Ok(CursorResult::Ok(CursorHasRecord::No)); + return Ok(CursorResult::Ok(false)); } } } @@ -1431,7 +1196,7 @@ impl BTreeCursor { self.stack.pop(); continue; } else { - return Ok(CursorResult::Ok(CursorHasRecord::No)); + return Ok(CursorResult::Ok(false)); } } assert!(cell_idx < contents.cell_count()); @@ -1447,156 +1212,27 @@ impl BTreeCursor { _left_child_page, _rowid, }) => { - assert!(predicate.is_none()); - self.stack.advance(); let mem_page = self.read_page(*_left_child_page as usize)?; self.stack.push(mem_page); continue; } - BTreeCell::TableLeafCell(TableLeafCell { - _rowid, - _payload, - payload_size, - first_overflow_page, - }) => { - assert!(predicate.is_none()); - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read( - _payload, - *next_page, - *payload_size - )) - } else { - crate::storage::sqlite3_ondisk::read_record( - _payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; - self.stack.advance(); - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: Some(*_rowid), - })); + BTreeCell::TableLeafCell(TableLeafCell { .. }) => { + return Ok(CursorResult::Ok(true)); } BTreeCell::IndexInteriorCell(IndexInteriorCell { - payload, - left_child_page, - first_overflow_page, - payload_size, + left_child_page, .. }) => { - if !self.going_upwards { + if self.going_upwards { + self.going_upwards = false; + return Ok(CursorResult::Ok(true)); + } else { let mem_page = self.read_page(*left_child_page as usize)?; self.stack.push(mem_page); continue; } - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read( - payload, - *next_page, - *payload_size - )) - } else { - crate::storage::sqlite3_ondisk::read_record( - payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; - - self.going_upwards = false; - self.stack.advance(); - if predicate.is_none() { - let cursor_has_record = CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - }; - return Ok(CursorResult::Ok(cursor_has_record)); - } - - let (key, op) = predicate.as_ref().unwrap(); - let SeekKey::IndexKey(index_key) = key else { - unreachable!("index seek key should be a record"); - }; - let order = { - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_slice_same_num_cols = - &record.get_values()[..index_key.get_values().len()]; - let order = compare_immutable( - record_slice_same_num_cols, - index_key.get_values(), - self.key_sort_order(), - &self.collations, - ); - order - }; - let found = match op { - SeekOp::GT => order.is_gt(), - SeekOp::GE => order.is_ge(), - SeekOp::EQ => order.is_eq(), - _ => unreachable!("Seek LE/LT should not happen in get_next_record() because we are iterating forwards"), - }; - if found { - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - })); - } else { - continue; - } } - BTreeCell::IndexLeafCell(IndexLeafCell { - payload, - first_overflow_page, - payload_size, - }) => { - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read( - payload, - *next_page, - *payload_size - )) - } else { - crate::storage::sqlite3_ondisk::read_record( - payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - }; - - self.stack.advance(); - if predicate.is_none() { - let cursor_has_record = CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - }; - return Ok(CursorResult::Ok(cursor_has_record)); - } - let (key, op) = predicate.as_ref().unwrap(); - let SeekKey::IndexKey(index_key) = key else { - unreachable!("index seek key should be a record"); - }; - let order = { - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_slice_same_num_cols = - &record.get_values()[..index_key.get_values().len()]; - let order = compare_immutable( - record_slice_same_num_cols, - index_key.get_values(), - self.key_sort_order(), - &self.collations, - ); - order - }; - let found = match op { - SeekOp::GT => order.is_lt(), - SeekOp::GE => order.is_le(), - SeekOp::EQ => order.is_le(), - _ => todo!("not implemented: {:?}", op), - }; - if found { - let cursor_has_record = CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - }; - return Ok(CursorResult::Ok(cursor_has_record)); - } else { - continue; - } + BTreeCell::IndexLeafCell(IndexLeafCell { .. }) => { + return Ok(CursorResult::Ok(true)); } } } @@ -1606,20 +1242,24 @@ impl BTreeCursor { /// This may be used to seek to a specific record in a point query (e.g. SELECT * FROM table WHERE col = 10) /// or e.g. find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10). /// We don't include the rowid in the comparison and that's why the last value from the record is not included. - fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { - match key { + fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { + let ret = return_if_io!(match key { SeekKey::TableRowId(rowid) => { - return self.tablebtree_seek(rowid, op); + self.tablebtree_seek(rowid, op) } SeekKey::IndexKey(index_key) => { - return self.indexbtree_seek(index_key, op); + self.indexbtree_seek(index_key, op) } - } + }); + self.valid_state = CursorValidState::Valid; + Ok(CursorResult::Ok(ret)) } /// Move the cursor to the root page of the btree. #[instrument(skip_all, level = Level::TRACE)] fn move_to_root(&mut self) { + self.seek_state = CursorSeekState::Start; + self.going_upwards = false; tracing::trace!("move_to_root({})", self.root_page); let mem_page = self.read_page(self.root_page).unwrap(); self.stack.clear(); @@ -1627,7 +1267,7 @@ impl BTreeCursor { } /// Move the cursor to the rightmost record in the btree. - fn move_to_rightmost(&mut self) -> Result> { + fn move_to_rightmost(&mut self) -> Result> { self.move_to_root(); loop { @@ -1640,8 +1280,9 @@ impl BTreeCursor { if contents.is_leaf() { if contents.cell_count() > 0 { self.stack.set_cell_index(contents.cell_count() as i32 - 1); + return Ok(CursorResult::Ok(true)); } - return Ok(CursorResult::Ok(())); + return Ok(CursorResult::Ok(false)); } match contents.rightmost_pointer() { @@ -1662,56 +1303,64 @@ impl BTreeCursor { /// Specialized version of move_to() for table btrees. #[instrument(skip(self), level = Level::TRACE)] fn tablebtree_move_to(&mut self, rowid: i64, seek_op: SeekOp) -> Result> { - let iter_dir = seek_op.iteration_direction(); 'outer: loop { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { + self.seek_state = CursorSeekState::FoundLeaf { + eq_seen: Cell::new(false), + }; return Ok(CursorResult::Ok(())); } let cell_count = contents.cell_count(); if matches!( - self.move_to_state, - CursorMoveToState::Start | CursorMoveToState::ContinueLoop { .. } + self.seek_state, + CursorSeekState::Start | CursorSeekState::MovingBetweenPages { .. } ) { - let min: isize = 0; - let max: isize = cell_count as isize - 1; - let leftmost_matching_cell = None; + let eq_seen = match &self.seek_state { + CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(), + _ => false, + }; + let min_cell_idx = Cell::new(0); + let max_cell_idx = Cell::new(cell_count as isize - 1); + let nearest_matching_cell = Cell::new(None); - self.move_to_state = CursorMoveToState::Seeking { - min, - max, - nearest_matching_cell: leftmost_matching_cell, + self.seek_state = CursorSeekState::InteriorPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + eq_seen: Cell::new(eq_seen), }; } + + let CursorSeekState::InteriorPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + eq_seen, + .. + } = &self.seek_state + else { + unreachable!("we must be in an interior binary search state"); + }; + loop { - let min = self.move_to_state.get_min(); - let max = self.move_to_state.get_max(); + let min = min_cell_idx.get(); + let max = max_cell_idx.get(); if min > max { - if let Some(leftmost_matching_cell) = - self.move_to_state.get_nearest_matching_cell() - { + if let Some(nearest_matching_cell) = nearest_matching_cell.get() { let left_child_page = contents.cell_table_interior_read_left_child_page( - leftmost_matching_cell as usize, + nearest_matching_cell as usize, )?; - // If we found our target rowid in the left subtree, - // we need to move the parent cell pointer forwards or backwards depending on the iteration direction. - // For example: since the internal node contains the max rowid of the left subtree, we need to move the - // parent pointer backwards in backwards iteration so that we don't come back to the parent again. - // E.g. - // this parent: rowid 666 - // left child has: 664,665,666 - // we need to move to the previous parent (with e.g. rowid 663) when iterating backwards. - let index_change = - -1 + (iter_dir == IterationDirection::Forwards) as i32 * 2; - self.stack - .set_cell_index(leftmost_matching_cell as i32 + index_change); + self.stack.set_cell_index(nearest_matching_cell as i32); let mem_page = self.read_page(left_child_page as usize)?; self.stack.push(mem_page); - self.move_to_state = CursorMoveToState::ContinueLoop; + self.seek_state = CursorSeekState::MovingBetweenPages { + eq_seen: Cell::new(eq_seen.get()), + }; continue 'outer; } self.stack.set_cell_index(cell_count as i32 + 1); @@ -1719,7 +1368,9 @@ impl BTreeCursor { Some(right_most_pointer) => { let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); - self.move_to_state = CursorMoveToState::ContinueLoop; + self.seek_state = CursorSeekState::MovingBetweenPages { + eq_seen: Cell::new(eq_seen.get()), + }; continue 'outer; } None => { @@ -1754,17 +1405,15 @@ impl BTreeCursor { // EQ | < | go right | Last = key is in right subtree let is_on_left = match seek_op { SeekOp::GT => cell_rowid > rowid, - SeekOp::GE => cell_rowid >= rowid, - SeekOp::LE => cell_rowid >= rowid, + SeekOp::GE { .. } => cell_rowid >= rowid, + SeekOp::LE { .. } => cell_rowid >= rowid, SeekOp::LT => cell_rowid + 1 >= rowid, - SeekOp::EQ => cell_rowid >= rowid, }; if is_on_left { - self.move_to_state - .set_nearest_matching_cell(Some(cur_cell_idx as usize)); - self.move_to_state.set_max(cur_cell_idx - 1); + nearest_matching_cell.set(Some(cur_cell_idx as usize)); + max_cell_idx.set(cur_cell_idx - 1); } else { - self.move_to_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } } } @@ -1784,38 +1433,63 @@ impl BTreeCursor { let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { + let eq_seen = match &self.seek_state { + CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(), + _ => false, + }; + self.seek_state = CursorSeekState::FoundLeaf { + eq_seen: Cell::new(eq_seen), + }; return Ok(CursorResult::Ok(())); } if matches!( - self.move_to_state, - CursorMoveToState::Start | CursorMoveToState::ContinueLoop + self.seek_state, + CursorSeekState::Start | CursorSeekState::MovingBetweenPages { .. } ) { + let eq_seen = match &self.seek_state { + CursorSeekState::MovingBetweenPages { eq_seen } => eq_seen.get(), + _ => false, + }; let cell_count = contents.cell_count(); - let min: isize = 0; - let max: isize = cell_count as isize - 1; - let leftmost_matching_cell = None; + let min_cell_idx = Cell::new(0); + let max_cell_idx = Cell::new(cell_count as isize - 1); + let nearest_matching_cell = Cell::new(None); - self.move_to_state = CursorMoveToState::Seeking { - min, - max, - nearest_matching_cell: leftmost_matching_cell, + self.seek_state = CursorSeekState::InteriorPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + eq_seen: Cell::new(eq_seen), }; } + let CursorSeekState::InteriorPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + eq_seen, + } = &self.seek_state + else { + unreachable!( + "we must be in an interior binary search state, got {:?}", + self.seek_state + ); + }; + loop { - let min = self.move_to_state.get_min(); - let max = self.move_to_state.get_max(); + let min = min_cell_idx.get(); + let max = max_cell_idx.get(); if min > max { - let Some(leftmost_matching_cell) = - self.move_to_state.get_nearest_matching_cell() - else { + let Some(leftmost_matching_cell) = nearest_matching_cell.get() else { self.stack.set_cell_index(contents.cell_count() as i32 + 1); match contents.rightmost_pointer() { Some(right_most_pointer) => { let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); - self.move_to_state = CursorMoveToState::ContinueLoop; + self.seek_state = CursorSeekState::MovingBetweenPages { + eq_seen: Cell::new(eq_seen.get()), + }; continue 'outer; } None => { @@ -1856,7 +1530,9 @@ impl BTreeCursor { let mem_page = self.read_page(*left_child_page as usize)?; self.stack.push(mem_page); - self.move_to_state = CursorMoveToState::ContinueLoop; + self.seek_state = CursorSeekState::MovingBetweenPages { + eq_seen: Cell::new(eq_seen.get()), + }; continue 'outer; } @@ -1892,7 +1568,7 @@ impl BTreeCursor { self.get_immutable_record_or_create().as_mut().unwrap(), )? }; - let target_leaf_page_is_in_left_subtree = { + let (target_leaf_page_is_in_left_subtree, is_eq) = { let record = self.get_immutable_record(); let record = record.as_ref().unwrap(); let record_slice_equal_number_of_cols = @@ -1933,21 +1609,26 @@ impl BTreeCursor { // EQ | = | go left | First = key could be exactly this one, or in left subtree // EQ | < | go right | First = key must be in right subtree - match cmp { - SeekOp::GT => interior_cell_vs_index_key.is_gt(), - SeekOp::GE => interior_cell_vs_index_key.is_ge(), - SeekOp::EQ => interior_cell_vs_index_key.is_ge(), - SeekOp::LE => interior_cell_vs_index_key.is_gt(), - SeekOp::LT => interior_cell_vs_index_key.is_ge(), - } + ( + match cmp { + SeekOp::GT => interior_cell_vs_index_key.is_gt(), + SeekOp::GE { .. } => interior_cell_vs_index_key.is_ge(), + SeekOp::LE { .. } => interior_cell_vs_index_key.is_gt(), + SeekOp::LT => interior_cell_vs_index_key.is_ge(), + }, + interior_cell_vs_index_key.is_eq(), + ) }; + if is_eq { + eq_seen.set(true); + } + if target_leaf_page_is_in_left_subtree { - self.move_to_state - .set_nearest_matching_cell(Some(cur_cell_idx as usize)); - self.move_to_state.set_max(cur_cell_idx - 1); + nearest_matching_cell.set(Some(cur_cell_idx as usize)); + max_cell_idx.set(cur_cell_idx - 1); } else { - self.move_to_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } } } @@ -1955,15 +1636,16 @@ impl BTreeCursor { /// Specialized version of do_seek() for table btrees that uses binary search instead /// of iterating cells in order. - fn tablebtree_seek( - &mut self, - rowid: i64, - seek_op: SeekOp, - ) -> Result> { + fn tablebtree_seek(&mut self, rowid: i64, seek_op: SeekOp) -> Result> { assert!(self.mv_cursor.is_none()); let iter_dir = seek_op.iteration_direction(); - if matches!(self.seek_state, CursorSeekState::Start) { + if matches!( + self.seek_state, + CursorSeekState::Start { .. } + | CursorSeekState::MovingBetweenPages { .. } + | CursorSeekState::InteriorPageBinarySearch { .. } + ) { // No need for another move_to_root. Move_to already moves to root return_if_io!(self.move_to(SeekKey::TableRowId(rowid), seek_op)); let page = self.stack.top(); @@ -1976,71 +1658,51 @@ impl BTreeCursor { ); let cell_count = contents.cell_count(); - let min: isize = 0; - let max: isize = cell_count as isize - 1; + if cell_count == 0 { + self.stack.set_cell_index(0); + return Ok(CursorResult::Ok(false)); + } + let min_cell_idx = Cell::new(0); + let max_cell_idx = Cell::new(cell_count as isize - 1); // If iter dir is forwards, we want the first cell that matches; // If iter dir is backwards, we want the last cell that matches. - let nearest_matching_cell = None; + let nearest_matching_cell = Cell::new(None); - self.seek_state = CursorSeekState::Seeking { - min, - max, + self.seek_state = CursorSeekState::LeafPageBinarySearch { + min_cell_idx, + max_cell_idx, nearest_matching_cell, - not_found_leaf: false, + moving_up_to_parent: Cell::new(false), + eq_seen: Cell::new(false), // not relevant for table btrees }; } + let CursorSeekState::LeafPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + .. + } = &self.seek_state + else { + unreachable!("we must be in a leaf binary search state"); + }; + let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); loop { - let min = self.seek_state.get_min(); - let max = self.seek_state.get_max(); + let min = min_cell_idx.get(); + let max = max_cell_idx.get(); if min > max { - let Some(nearest_matching_cell) = self.seek_state.get_nearest_matching_cell() - else { - return Ok(CursorResult::Ok(CursorHasRecord::No)); - }; - let matching_cell = contents.cell_get( - nearest_matching_cell, - payload_overflow_threshold_max( - contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - let BTreeCell::TableLeafCell(TableLeafCell { - _rowid: cell_rowid, - _payload, - first_overflow_page, - payload_size, - .. - }) = matching_cell - else { - unreachable!("unexpected cell type: {:?}", matching_cell); - }; - - return_if_io!(self.read_record_w_possible_overflow( - _payload, - first_overflow_page, - payload_size - )); - let cell_idx = if iter_dir == IterationDirection::Forwards { - nearest_matching_cell as i32 + 1 + if let Some(nearest_matching_cell) = nearest_matching_cell.get() { + self.stack.set_cell_index(nearest_matching_cell as i32); + return Ok(CursorResult::Ok(true)); } else { - nearest_matching_cell as i32 - 1 + return Ok(CursorResult::Ok(false)); }; - self.stack.set_cell_index(cell_idx as i32); - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: Some(cell_rowid), - })); } let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here. @@ -2050,75 +1712,41 @@ impl BTreeCursor { let found = match seek_op { SeekOp::GT => cmp.is_gt(), - SeekOp::GE => cmp.is_ge(), - SeekOp::EQ => cmp.is_eq(), - SeekOp::LE => cmp.is_le(), + SeekOp::GE { eq_only: true } => cmp.is_eq(), + SeekOp::GE { eq_only: false } => cmp.is_ge(), + SeekOp::LE { eq_only: true } => cmp.is_eq(), + SeekOp::LE { eq_only: false } => cmp.is_le(), SeekOp::LT => cmp.is_lt(), }; // rowids are unique, so we can return the rowid immediately - if found && SeekOp::EQ == seek_op { - let cur_cell = contents.cell_get( - cur_cell_idx as usize, - payload_overflow_threshold_max( - contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - let BTreeCell::TableLeafCell(TableLeafCell { - _rowid: _, - _payload, - first_overflow_page, - payload_size, - .. - }) = cur_cell - else { - unreachable!("unexpected cell type: {:?}", cur_cell); - }; - return_if_io!(self.read_record_w_possible_overflow( - _payload, - first_overflow_page, - payload_size - )); - let cell_idx = if iter_dir == IterationDirection::Forwards { - cur_cell_idx + 1 - } else { - cur_cell_idx - 1 - }; - self.stack.set_cell_index(cell_idx as i32); - return Ok(CursorResult::Ok(CursorHasRecord::Yes { - rowid: Some(cell_rowid), - })); + if found && seek_op.eq_only() { + self.stack.set_cell_index(cur_cell_idx as i32); + return Ok(CursorResult::Ok(true)); } if found { - self.seek_state - .set_nearest_matching_cell(Some(cur_cell_idx as usize)); + nearest_matching_cell.set(Some(cur_cell_idx as usize)); match iter_dir { IterationDirection::Forwards => { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } IterationDirection::Backwards => { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } } } else { if cmp.is_gt() { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } else if cmp.is_lt() { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } else { match iter_dir { IterationDirection::Forwards => { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } IterationDirection::Backwards => { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } } } @@ -2130,32 +1758,105 @@ impl BTreeCursor { &mut self, key: &ImmutableRecord, seek_op: SeekOp, - ) -> Result> { - if matches!(self.seek_state, CursorSeekState::Start) { + ) -> Result> { + if matches!( + self.seek_state, + CursorSeekState::Start { .. } + | CursorSeekState::MovingBetweenPages { .. } + | CursorSeekState::InteriorPageBinarySearch { .. } + ) { // No need for another move_to_root. Move_to already moves to root return_if_io!(self.move_to(SeekKey::IndexKey(key), seek_op)); + let CursorSeekState::FoundLeaf { eq_seen } = &self.seek_state else { + unreachable!( + "We must still be in FoundLeaf state after move_to, got: {:?}", + self.seek_state + ); + }; + let eq_seen = eq_seen.get(); let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); + if cell_count == 0 { + self.stack.set_cell_index(0); + match seek_op.iteration_direction() { + IterationDirection::Forwards => { + return self.next(); + } + IterationDirection::Backwards => { + return self.prev(); + } + } + } - let min: isize = 0; - let max: isize = cell_count as isize - 1; + let min = Cell::new(0); + let max = Cell::new(cell_count as isize - 1); // If iter dir is forwards, we want the first cell that matches; // If iter dir is backwards, we want the last cell that matches. - let nearest_matching_cell = None; + let nearest_matching_cell = Cell::new(None); - self.seek_state = CursorSeekState::Seeking { - min, - max, + self.seek_state = CursorSeekState::LeafPageBinarySearch { + min_cell_idx: min, + max_cell_idx: max, nearest_matching_cell, - not_found_leaf: false, + moving_up_to_parent: Cell::new(false), + eq_seen: Cell::new(eq_seen), }; } + let CursorSeekState::LeafPageBinarySearch { + min_cell_idx, + max_cell_idx, + nearest_matching_cell, + eq_seen, + moving_up_to_parent, + } = &self.seek_state + else { + unreachable!( + "we must be in a leaf binary search state, got: {:?}", + self.seek_state + ); + }; + + if moving_up_to_parent.get() { + let page = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); + let contents = page.get().contents.as_ref().unwrap(); + let cur_cell_idx = self.stack.current_cell_index() as usize; + let cell = contents.cell_get( + cur_cell_idx, + payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), + self.usable_space(), + )?; + let BTreeCell::IndexInteriorCell(IndexInteriorCell { + payload, + first_overflow_page, + payload_size, + .. + }) = &cell + else { + unreachable!("unexpected cell type: {:?}", cell); + }; + + if let Some(next_page) = first_overflow_page { + return_if_io!(self.process_overflow_read(payload, *next_page, *payload_size)) + } else { + crate::storage::sqlite3_ondisk::read_record( + payload, + self.get_immutable_record_or_create().as_mut().unwrap(), + )? + }; + let (_, found) = self.compare_with_current_record(key, seek_op); + moving_up_to_parent.set(false); + return Ok(CursorResult::Ok(found)); + } + let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let page = page.get(); @@ -2165,11 +1866,13 @@ impl BTreeCursor { let iter_dir = seek_op.iteration_direction(); loop { - let min = self.seek_state.get_min(); - let max = self.seek_state.get_max(); + let min = min_cell_idx.get(); + let max = max_cell_idx.get(); if min > max { - let Some(nearest_matching_cell) = self.seek_state.get_nearest_matching_cell() - else { + if let Some(nearest_matching_cell) = nearest_matching_cell.get() { + self.stack.set_cell_index(nearest_matching_cell as i32); + return Ok(CursorResult::Ok(true)); + } else { // We have now iterated over all cells in the leaf page and found no match. // Unlike tables, indexes store payloads in interior cells as well. self.move_to() always moves to a leaf page, so there are cases where we need to // move back up to the parent interior cell and get the next record from there to perform a correct seek. @@ -2185,59 +1888,40 @@ impl BTreeCursor { // if we were to return Ok(CursorResult::Ok((None, None))), self.record would be None, which is incorrect, because we already know // that there is a record with a key greater than K (K' = K+2) in the parent interior cell. Hence, we need to move back up the tree // and get the next matching record from there. + // + // However we only do this if either of the following is true: + // - We have seen an EQ match up in the tree in an interior node + // - Or, we are not looking for an exact match. + if seek_op.eq_only() && !eq_seen.get() { + return Ok(CursorResult::Ok(false)); + } match iter_dir { IterationDirection::Forwards => { - if !self.seek_state.get_not_found_leaf() { - self.seek_state.set_not_found_leaf(true); + if !moving_up_to_parent.get() { + moving_up_to_parent.set(true); self.stack.set_cell_index(cell_count as i32); } - return self.get_next_record(Some((SeekKey::IndexKey(key), seek_op))); + let next_res = return_if_io!(self.next()); + if !next_res { + return Ok(CursorResult::Ok(false)); + } + // FIXME: optimize this in case record can be read directly + return Ok(CursorResult::IO); } IterationDirection::Backwards => { - if !self.seek_state.get_not_found_leaf() { - self.seek_state.set_not_found_leaf(true); + if !moving_up_to_parent.get() { + moving_up_to_parent.set(true); self.stack.set_cell_index(-1); } - return self.get_prev_record(Some((SeekKey::IndexKey(key), seek_op))); + let prev_res = return_if_io!(self.prev()); + if !prev_res { + return Ok(CursorResult::Ok(false)); + } + // FIXME: optimize this in case record can be read directly + return Ok(CursorResult::IO); } } }; - let cell = contents.cell_get( - nearest_matching_cell as usize, - payload_overflow_threshold_max( - contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; - - let BTreeCell::IndexLeafCell(IndexLeafCell { - payload, - first_overflow_page, - payload_size, - }) = &cell - else { - unreachable!("unexpected cell type: {:?}", cell); - }; - - if let Some(next_page) = first_overflow_page { - return_if_io!(self.process_overflow_read(payload, *next_page, *payload_size)) - } else { - crate::storage::sqlite3_ondisk::read_record( - payload, - self.get_immutable_record_or_create().as_mut().unwrap(), - )? - } - let cursor_has_record = CursorHasRecord::Yes { - rowid: self.get_index_rowid_from_record(), - }; - self.stack.set_cell_index(nearest_matching_cell as i32); - self.stack.next_cell_in_direction(iter_dir); - return Ok(CursorResult::Ok(cursor_has_record)); } let cur_cell_idx = (min + max) >> 1; // rustc generates extra insns for (min+max)/2 due to them being isize. we know min&max are >=0 here. @@ -2266,48 +1950,29 @@ impl BTreeCursor { self.get_immutable_record_or_create().as_mut().unwrap(), )? }; - let cmp = { - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_slice_equal_number_of_cols = - &record.get_values().as_slice()[..key.get_values().len()]; - compare_immutable( - record_slice_equal_number_of_cols, - key.get_values(), - self.key_sort_order(), - &self.collations, - ) - }; - let found = match seek_op { - SeekOp::GT => cmp.is_gt(), - SeekOp::GE => cmp.is_ge(), - SeekOp::EQ => cmp.is_eq(), - SeekOp::LE => cmp.is_le(), - SeekOp::LT => cmp.is_lt(), - }; + let (cmp, found) = self.compare_with_current_record(key, seek_op); if found { - self.seek_state - .set_nearest_matching_cell(Some(cur_cell_idx as usize)); + nearest_matching_cell.set(Some(cur_cell_idx as usize)); match iter_dir { IterationDirection::Forwards => { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } IterationDirection::Backwards => { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } } } else { if cmp.is_gt() { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } else if cmp.is_lt() { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } else { match iter_dir { IterationDirection::Forwards => { - self.seek_state.set_min(cur_cell_idx + 1); + min_cell_idx.set(cur_cell_idx + 1); } IterationDirection::Backwards => { - self.seek_state.set_max(cur_cell_idx - 1); + max_cell_idx.set(cur_cell_idx - 1); } } } @@ -2315,6 +1980,35 @@ impl BTreeCursor { } } + fn compare_with_current_record( + &self, + key: &ImmutableRecord, + seek_op: SeekOp, + ) -> (Ordering, bool) { + let cmp = { + let record = self.get_immutable_record(); + let record = record.as_ref().unwrap(); + tracing::debug!("record={}", record); + let record_slice_equal_number_of_cols = + &record.get_values().as_slice()[..key.get_values().len()]; + compare_immutable( + record_slice_equal_number_of_cols, + key.get_values(), + self.key_sort_order(), + &self.collations, + ) + }; + let found = match seek_op { + SeekOp::GT => cmp.is_gt(), + SeekOp::GE { eq_only: true } => cmp.is_eq(), + SeekOp::GE { eq_only: false } => cmp.is_ge(), + SeekOp::LE { eq_only: true } => cmp.is_eq(), + SeekOp::LE { eq_only: false } => cmp.is_le(), + SeekOp::LT => cmp.is_lt(), + }; + (cmp, found) + } + fn read_record_w_possible_overflow( &mut self, payload: &'static [u8], @@ -2359,7 +2053,16 @@ impl BTreeCursor { // 5. We scan the leaf cells in the leaf page until we find the cell whose rowid is equal to the rowid we are looking for. // This cell contains the actual data we are looking for. // 6. If we find the cell, we return the record. Otherwise, we return an empty result. - if matches!(self.move_to_state, CursorMoveToState::Start) { + + // If we are at the beginning/end of seek state, start a new move from the root. + if matches!( + self.seek_state, + // these are stages that happen at the leaf page, so we can consider that the previous seek finished and we can start a new one. + CursorSeekState::LeafPageBinarySearch { .. } | CursorSeekState::FoundLeaf { .. } + ) { + self.seek_state = CursorSeekState::Start; + } + if matches!(self.seek_state, CursorSeekState::Start) { self.move_to_root(); } @@ -2368,7 +2071,6 @@ impl BTreeCursor { SeekKey::IndexKey(index_key) => self.indexbtree_move_to(index_key, cmp), }; return_if_io!(ret); - self.move_to_state = CursorMoveToState::Start; Ok(CursorResult::Ok(())) } @@ -2412,6 +2114,7 @@ impl BTreeCursor { // find cell (return_if_io!(self.find_cell(page, bkey)), page.page_type()) }; + self.stack.set_cell_index(cell_idx as i32); tracing::debug!("insert_into_page(cell_idx={})", cell_idx); // if the cell index is less than the total cells, check: if its an existing @@ -2448,7 +2151,7 @@ impl BTreeCursor { ); if cmp == Ordering::Equal { tracing::debug!("insert_into_page: found exact match with cell_idx={cell_idx}, overwriting"); - self.has_record.set(CursorHasRecord::Yes { rowid: self.get_index_rowid_from_record() }); + self.has_record.set(true); self.overwrite_cell(page.clone(), cell_idx, record)?; self.state .mut_write_info() @@ -2488,13 +2191,26 @@ impl BTreeCursor { )?; contents.overflow_cells.len() }; - let write_info = self - .state - .mut_write_info() - .expect("can't count while inserting"); + self.stack.set_cell_index(cell_idx as i32); if overflow > 0 { + // A balance will happen so save the key we were inserting + tracing::debug!("insert_into_page: balance triggered on key {:?}, page={:?}, cell_idx={}", bkey, page.get().get().id, cell_idx); + self.save_context(match bkey { + BTreeKey::TableRowId(rowid) => CursorContext::TableRowId(rowid.0), + BTreeKey::IndexKey(record) => { + CursorContext::IndexKeyRowId((*record).clone()) + } + }); + let write_info = self + .state + .mut_write_info() + .expect("can't count while inserting"); write_info.state = WriteState::BalanceStart; } else { + let write_info = self + .state + .mut_write_info() + .expect("can't count while inserting"); write_info.state = WriteState::Finish; } } @@ -2508,6 +2224,12 @@ impl BTreeCursor { } }; }; + if matches!(self.state.write_info().unwrap().state, WriteState::Finish) { + // if there was a balance triggered, the cursor position is invalid. + // it's probably not the greatest idea in the world to do this eagerly here, + // but at least it works. + return_if_io!(self.restore_context()); + } self.state = CursorState::None; ret } @@ -2600,6 +2322,10 @@ impl BTreeCursor { == parent_page.get_contents().cell_count() + 1 { self.stack.retreat(); + } else if self.stack.current_cell_index() == -1 { + // We might've retreated in CheckRequiresBalancing, so advance to the next cell + // to prevent panic in the asserts below due to -1 index + self.stack.advance(); } parent_page.set_dirty(); self.pager.add_dirty(parent_page.get().id); @@ -4087,6 +3813,7 @@ impl BTreeCursor { self.root_page = root.get().id; self.stack.clear(); self.stack.push(root_btree.clone()); + self.stack.set_cell_index(0); // leave parent pointing at the rightmost pointer (in this case 0, as there are no cells), since we will be balancing the rightmost child page. self.stack.push(child_btree.clone()); } @@ -4100,10 +3827,12 @@ impl BTreeCursor { self.find_cell_state.set(0); } let cell_count = page.cell_count(); - while self.find_cell_state.get_cell_idx() < cell_count { + while self.find_cell_state.get_cell_idx() < cell_count as isize { + assert!(self.find_cell_state.get_cell_idx() >= 0); + let cell_idx = self.find_cell_state.get_cell_idx() as usize; match page .cell_get( - self.find_cell_state.get_cell_idx(), + cell_idx, payload_overflow_threshold_max(page.page_type(), self.usable_space() as u16), payload_overflow_threshold_min(page.page_type(), self.usable_space() as u16), self.usable_space(), @@ -4138,9 +3867,13 @@ impl BTreeCursor { first_overflow_page, payload_size, )); + let key_values = key.to_index_key_values(); + let record = self.get_immutable_record(); + let record = record.as_ref().unwrap(); + let record_same_number_cols = &record.get_values()[..key_values.len()]; let order = compare_immutable( - key.to_index_key_values(), - self.get_immutable_record().as_ref().unwrap().get_values(), + key_values, + record_same_number_cols, self.key_sort_order(), &self.collations, ); @@ -4156,6 +3889,8 @@ impl BTreeCursor { self.find_cell_state.set(cell_idx + 1); } let cell_idx = self.find_cell_state.get_cell_idx(); + assert!(cell_idx >= 0); + let cell_idx = cell_idx as usize; assert!(cell_idx <= cell_count); self.find_cell_state.reset(); Ok(CursorResult::Ok(cell_idx)) @@ -4190,19 +3925,19 @@ impl BTreeCursor { } pub fn seek_to_last(&mut self) -> Result> { - return_if_io!(self.move_to_rightmost()); - let cursor_has_record = return_if_io!(self.get_next_record(None)); - if cursor_has_record == CursorHasRecord::No { + let has_record = return_if_io!(self.move_to_rightmost()); + self.invalidate_record(); + self.has_record.replace(has_record); + if !has_record { let is_empty = return_if_io!(self.is_empty_table()); assert!(is_empty); return Ok(CursorResult::Ok(())); } - self.has_record.replace(cursor_has_record); Ok(CursorResult::Ok(())) } pub fn is_empty(&self) -> bool { - CursorHasRecord::No == self.has_record.get() + !self.has_record.get() } pub fn root_page(&self) -> usize { @@ -4211,12 +3946,14 @@ impl BTreeCursor { pub fn rewind(&mut self) -> Result> { if self.mv_cursor.is_some() { - let cursor_has_record = return_if_io!(self.get_next_record(None)); + let cursor_has_record = return_if_io!(self.get_next_record()); + self.invalidate_record(); self.has_record.replace(cursor_has_record); } else { self.move_to_root(); - let cursor_has_record = return_if_io!(self.get_next_record(None)); + let cursor_has_record = return_if_io!(self.get_next_record()); + self.invalidate_record(); self.has_record.replace(cursor_has_record); } Ok(CursorResult::Ok(())) @@ -4224,40 +3961,72 @@ impl BTreeCursor { pub fn last(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); - match self.move_to_rightmost()? { - CursorResult::Ok(_) => self.prev(), - CursorResult::IO => Ok(CursorResult::IO), - } - } - - pub fn next(&mut self) -> Result> { - let _ = return_if_io!(self.restore_context()); - let cursor_has_record = return_if_io!(self.get_next_record(None)); + let cursor_has_record = return_if_io!(self.move_to_rightmost()); self.has_record.replace(cursor_has_record); + self.invalidate_record(); Ok(CursorResult::Ok(())) } - pub fn prev(&mut self) -> Result> { - assert!(self.mv_cursor.is_none()); - let _ = return_if_io!(self.restore_context()); - match self.get_prev_record(None)? { - CursorResult::Ok(cursor_has_record) => { - self.has_record.replace(cursor_has_record); - Ok(CursorResult::Ok(())) - } - CursorResult::IO => Ok(CursorResult::IO), - } + pub fn next(&mut self) -> Result> { + return_if_io!(self.restore_context()); + let cursor_has_record = return_if_io!(self.get_next_record()); + self.has_record.replace(cursor_has_record); + self.invalidate_record(); + Ok(CursorResult::Ok(cursor_has_record)) } - pub fn rowid(&self) -> Result> { + fn invalidate_record(&mut self) { + self.get_immutable_record_or_create() + .as_mut() + .unwrap() + .invalidate(); + } + + pub fn prev(&mut self) -> Result> { + assert!(self.mv_cursor.is_none()); + return_if_io!(self.restore_context()); + let cursor_has_record = return_if_io!(self.get_prev_record()); + self.has_record.replace(cursor_has_record); + self.invalidate_record(); + Ok(CursorResult::Ok(cursor_has_record)) + } + + pub fn rowid(&mut self) -> Result>> { if let Some(mv_cursor) = &self.mv_cursor { let mv_cursor = mv_cursor.borrow(); - return Ok(mv_cursor.current_row_id().map(|rowid| rowid.row_id)); + return Ok(CursorResult::Ok( + mv_cursor.current_row_id().map(|rowid| rowid.row_id), + )); + } + if self.has_record.get() { + let page = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page); + // load record + let _ = return_if_io!(self.record()); + let page_type = page.get().get_contents().page_type(); + let page = page.get(); + let contents = page.get_contents(); + let cell_idx = self.stack.current_cell_index(); + let cell = contents.cell_get( + cell_idx as usize, + payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), + self.usable_space(), + )?; + if page_type.is_table() { + let BTreeCell::TableLeafCell(TableLeafCell { + _rowid, _payload, .. + }) = cell + else { + unreachable!("unexpected page_type"); + }; + Ok(CursorResult::Ok(Some(_rowid))) + } else { + Ok(CursorResult::Ok(self.get_index_rowid_from_record())) + } + } else { + Ok(CursorResult::Ok(None)) } - Ok(match self.has_record.get() { - CursorHasRecord::Yes { rowid: Some(rowid) } => Some(rowid), - _ => None, - }) } pub fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { @@ -4265,28 +4034,93 @@ impl BTreeCursor { // We need to clear the null flag for the table cursor before seeking, // because it might have been set to false by an unmatched left-join row during the previous iteration // on the outer loop. + tracing::trace!("seek(key={:?}, op={:?})", key, op); self.set_null_flag(false); let cursor_has_record = return_if_io!(self.do_seek(key, op)); + self.invalidate_record(); // Reset seek state self.seek_state = CursorSeekState::Start; + self.valid_state = CursorValidState::Valid; self.has_record.replace(cursor_has_record); - Ok(CursorResult::Ok(matches!( - cursor_has_record, - CursorHasRecord::Yes { .. } - ))) + Ok(CursorResult::Ok(cursor_has_record)) } - pub fn record(&self) -> Ref> { - self.reusable_immutable_record.borrow() + /// Return a reference to the record the cursor is currently pointing to. + /// If record was not parsed yet, then we have to parse it and in case of I/O we yield control + /// back. + pub fn record(&self) -> Result>>> { + if !self.has_record.get() { + return Ok(CursorResult::Ok(None)); + } + let invalidated = self + .reusable_immutable_record + .borrow() + .as_ref() + .map_or(true, |record| record.is_invalidated()); + if !invalidated { + *self.parse_record_state.borrow_mut() = ParseRecordState::Init; + let record_ref = + Ref::filter_map(self.reusable_immutable_record.borrow(), |opt| opt.as_ref()) + .unwrap(); + return Ok(CursorResult::Ok(Some(record_ref))); + } + if *self.parse_record_state.borrow() == ParseRecordState::Init { + *self.parse_record_state.borrow_mut() = ParseRecordState::Parsing { + payload: Vec::new(), + }; + } + let page = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); + let contents = page.get_contents(); + let cell_idx = self.stack.current_cell_index(); + let cell = contents.cell_get( + cell_idx as usize, + payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), + payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), + self.usable_space(), + )?; + let (payload, payload_size, first_overflow_page) = match cell { + BTreeCell::TableLeafCell(TableLeafCell { + _rowid, + _payload, + payload_size, + first_overflow_page, + }) => (_payload, payload_size, first_overflow_page), + BTreeCell::IndexInteriorCell(IndexInteriorCell { + left_child_page: _, + payload, + payload_size, + first_overflow_page, + }) => (payload, payload_size, first_overflow_page), + BTreeCell::IndexLeafCell(IndexLeafCell { + payload, + first_overflow_page, + payload_size, + }) => (payload, payload_size, first_overflow_page), + _ => unreachable!("unexpected page_type"), + }; + if let Some(next_page) = first_overflow_page { + return_if_io!(self.process_overflow_read(payload, next_page, payload_size)) + } else { + crate::storage::sqlite3_ondisk::read_record( + payload, + self.get_immutable_record_or_create().as_mut().unwrap(), + )? + }; + + *self.parse_record_state.borrow_mut() = ParseRecordState::Init; + let record_ref = + Ref::filter_map(self.reusable_immutable_record.borrow(), |opt| opt.as_ref()).unwrap(); + Ok(CursorResult::Ok(Some(record_ref))) } #[instrument(skip_all, level = Level::TRACE)] pub fn insert( &mut self, key: &BTreeKey, - moved_before: bool, /* Indicate whether it's necessary to traverse to find the leaf page */ + mut moved_before: bool, /* Indicate whether it's necessary to traverse to find the leaf page */ ) -> Result> { - tracing::trace!("insert"); match &self.mv_cursor { Some(mv_cursor) => match key.maybe_rowid() { Some(rowid) => { @@ -4298,24 +4132,42 @@ impl BTreeCursor { None => todo!("Support mvcc inserts with index btrees"), }, None => { - tracing::trace!("moved {}", moved_before); - if !moved_before && !matches!(self.state, CursorState::Write(..)) { + tracing::debug!("insert(moved={}, key={:?}), valid_state={:?}, cursor_state={:?}, is_write_in_progress={}", moved_before, key, self.valid_state, self.state, self.is_write_in_progress()); + if self.valid_state != CursorValidState::Valid && !self.is_write_in_progress() { + // A balance happened so we need to move. + moved_before = false; + } + if !moved_before { + // Use move_to() so that we always end up on a leaf page. seek() might go back to an interior cell in index seeks, + // which we never want. The reason we can use move_to() is that + // find_cell() iterates the leaf page from left to right to find the insertion point anyway, so we don't care + // which cell we are in as long as we are on the right page. + // FIXME: find_cell() should not use linear search because it's slow. match key { BTreeKey::IndexKey(_) => { - return_if_io!(self - .move_to(SeekKey::IndexKey(key.get_record().unwrap()), SeekOp::GE)) + return_if_io!(self.move_to( + SeekKey::IndexKey(key.get_record().unwrap()), + SeekOp::GE { eq_only: true } + )) } - BTreeKey::TableRowId(_) => return_if_io!( - self.move_to(SeekKey::TableRowId(key.to_rowid()), SeekOp::EQ) - ), - } + BTreeKey::TableRowId(_) => { + return_if_io!(self.move_to( + SeekKey::TableRowId(key.to_rowid()), + SeekOp::GE { eq_only: true } + )) + } + }; + self.context.take(); // we know where we wanted to move so if there was any saved context, discard it. + self.valid_state = CursorValidState::Valid; + self.seek_state = CursorSeekState::Start; + tracing::debug!( + "seeked to the right place, page is now {:?}", + self.stack.top().get().get().id + ); } return_if_io!(self.insert_into_page(key)); if key.maybe_rowid().is_some() { - let int_key = key.to_rowid(); - self.has_record.replace(CursorHasRecord::Yes { - rowid: Some(int_key), - }); + self.has_record.replace(true); } } }; @@ -4324,14 +4176,15 @@ impl BTreeCursor { /// Delete state machine flow: /// 1. Start -> check if the rowid to be delete is present in the page or not. If not we early return - /// 2. LoadPage -> load the page. - /// 3. FindCell -> find the cell to be deleted in the page. - /// 4. ClearOverflowPages -> Clear the overflow pages if there are any before dropping the cell, then if we are in a leaf page we just drop the cell in place. + /// 2. DeterminePostBalancingSeekKey -> determine the key to seek to after balancing. + /// 3. LoadPage -> load the page. + /// 4. FindCell -> find the cell to be deleted in the page. + /// 5. ClearOverflowPages -> Clear the overflow pages if there are any before dropping the cell, then if we are in a leaf page we just drop the cell in place. /// if we are in interior page, we need to rotate keys in order to replace current cell (InteriorNodeReplacement). - /// 5. InteriorNodeReplacement -> we copy the left subtree leaf node into the deleted interior node's place. - /// 6. WaitForBalancingToComplete -> perform balancing - /// 7. SeekAfterBalancing -> adjust the cursor to a node that is closer to the deleted value. go to Finish - /// 8. Finish -> Delete operation is done. Return CursorResult(Ok()) + /// 6. InteriorNodeReplacement -> we copy the left subtree leaf node into the deleted interior node's place. + /// 7. WaitForBalancingToComplete -> perform balancing + /// 8. SeekAfterBalancing -> adjust the cursor to a node that is closer to the deleted value. go to Finish + /// 9. Finish -> Delete operation is done. Return CursorResult(Ok()) pub fn delete(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); @@ -4358,8 +4211,8 @@ impl BTreeCursor { page.get().get_contents().page_type(), PageType::TableLeaf | PageType::TableInterior ) { - let _target_rowid = match self.has_record.get() { - CursorHasRecord::Yes { rowid: Some(rowid) } => rowid, + let _target_rowid = match return_if_io!(self.rowid()) { + Some(rowid) => rowid, _ => { self.state = CursorState::None; return Ok(CursorResult::Ok(())); @@ -4373,23 +4226,51 @@ impl BTreeCursor { } let delete_info = self.state.mut_delete_info().unwrap(); - delete_info.state = DeleteState::LoadPage; + delete_info.state = DeleteState::DeterminePostBalancingSeekKey; } - DeleteState::LoadPage => { + DeleteState::DeterminePostBalancingSeekKey => { + // FIXME: skip this work if we determine deletion wont result in balancing + // Right now we calculate the key every time for simplicity/debugging + // since it won't affect correctness which is more important + let page = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page); + let target_key = if page.get().is_index() { + let record = match return_if_io!(self.record()) { + Some(record) => record.clone(), + None => unreachable!("there should've been a record"), + }; + DeleteSavepoint::Payload(record) + } else { + let Some(rowid) = return_if_io!(self.rowid()) else { + panic!("cursor should be pointing to a record with a rowid"); + }; + DeleteSavepoint::Rowid(rowid) + }; + + let delete_info = self.state.mut_delete_info().unwrap(); + delete_info.state = DeleteState::LoadPage { + post_balancing_seek_key: Some(target_key), + }; + } + + DeleteState::LoadPage { + post_balancing_seek_key, + } => { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let delete_info = self.state.mut_delete_info().unwrap(); - delete_info.state = DeleteState::FindCell; + delete_info.state = DeleteState::FindCell { + post_balancing_seek_key, + }; } - DeleteState::FindCell => { + DeleteState::FindCell { + post_balancing_seek_key, + } => { let page = self.stack.top(); - let mut cell_idx = self.stack.current_cell_index() as usize; - if cell_idx > 0 { - cell_idx -= 1; - } + let cell_idx = self.stack.current_cell_index() as usize; let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); @@ -4401,6 +4282,12 @@ impl BTreeCursor { )); } + tracing::debug!( + "DeleteState::FindCell: page_id: {}, cell_idx: {}", + page.get().id, + cell_idx + ); + let cell = contents.cell_get( cell_idx, payload_overflow_threshold_max( @@ -4425,6 +4312,7 @@ impl BTreeCursor { cell_idx, cell, original_child_pointer, + post_balancing_seek_key, }; } @@ -4432,6 +4320,7 @@ impl BTreeCursor { cell_idx, cell, original_child_pointer, + post_balancing_seek_key, } => { return_if_io!(self.clear_overflow_pages(&cell)); @@ -4439,131 +4328,167 @@ impl BTreeCursor { let page = page.get(); let contents = page.get_contents(); + let is_last_cell = cell_idx == contents.cell_count().saturating_sub(1); + let delete_info = self.state.mut_delete_info().unwrap(); if !contents.is_leaf() { delete_info.state = DeleteState::InteriorNodeReplacement { cell_idx, original_child_pointer, + post_balancing_seek_key, }; } else { let contents = page.get().contents.as_mut().unwrap(); drop_cell(contents, cell_idx, self.usable_space() as u16)?; let delete_info = self.state.mut_delete_info().unwrap(); - delete_info.state = DeleteState::CheckNeedsBalancing; + delete_info.state = DeleteState::CheckNeedsBalancing { + rightmost_cell_was_dropped: is_last_cell, + post_balancing_seek_key, + }; } } DeleteState::InteriorNodeReplacement { cell_idx, original_child_pointer, + post_balancing_seek_key, } => { - // This is an interior node, we need to handle deletion differently - // For interior nodes: - // 1. Move cursor to largest entry in left subtree - // 2. Copy that entry to replace the one being deleted - // 3. Delete the leaf entry + // This is an interior node, we need to handle deletion differently. + // 1. Move cursor to the largest key in the left subtree. + // 2. Replace the cell in the interior (parent) node with that key. + // 3. Delete that key from the child page. - // Move to the largest key in the left subtree + // Step 1: Move cursor to the largest key in the left subtree. return_if_io!(self.prev()); + let (cell_payload, leaf_cell_idx) = { + let leaf_page_ref = self.stack.top(); + let leaf_page = leaf_page_ref.get(); + let leaf_contents = leaf_page.get().contents.as_ref().unwrap(); + assert!(leaf_contents.is_leaf()); + assert!(leaf_contents.cell_count() > 0); + let leaf_cell_idx = leaf_contents.cell_count() - 1; + let last_cell_on_child_page = leaf_contents.cell_get( + leaf_cell_idx, + payload_overflow_threshold_max( + leaf_contents.page_type(), + self.usable_space() as u16, + ), + payload_overflow_threshold_min( + leaf_contents.page_type(), + self.usable_space() as u16, + ), + self.usable_space(), + )?; - let leaf_page = self.stack.top(); - return_if_locked_maybe_load!(self.pager, leaf_page); - assert!( - matches!( - leaf_page.get().get_contents().page_type(), - PageType::TableLeaf | PageType::IndexLeaf - ), - "self.prev should have returned a leaf page" - ); + let mut cell_payload: Vec = Vec::new(); + let child_pointer = + original_child_pointer.expect("there should be a pointer"); + // Rewrite the old leaf cell as an interior cell depending on type. + match last_cell_on_child_page { + BTreeCell::TableLeafCell(leaf_cell) => { + // Table interior cells contain the left child pointer and the rowid as varint. + cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); + write_varint_to_vec(leaf_cell._rowid as u64, &mut cell_payload); + } + BTreeCell::IndexLeafCell(leaf_cell) => { + // Index interior cells contain: + // 1. The left child pointer + // 2. The payload size as varint + // 3. The payload + // 4. The first overflow page as varint, omitted if no overflow. + cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); + write_varint_to_vec(leaf_cell.payload_size, &mut cell_payload); + cell_payload.extend_from_slice(leaf_cell.payload); + if let Some(first_overflow_page) = leaf_cell.first_overflow_page { + cell_payload + .extend_from_slice(&first_overflow_page.to_be_bytes()); + } + } + _ => unreachable!("Expected table leaf cell"), + } + (cell_payload, leaf_cell_idx) + }; let parent_page = self.stack.parent_page().unwrap(); - assert!(parent_page.get().is_loaded(), "parent page"); - - let leaf_page = leaf_page.get(); - let leaf_contents = leaf_page.get().contents.as_ref().unwrap(); - // The index of the cell to removed must be the last one. - let leaf_cell_idx = leaf_contents.cell_count() - 1; - let predecessor_cell = leaf_contents.cell_get( - leaf_cell_idx, - payload_overflow_threshold_max( - leaf_contents.page_type(), - self.usable_space() as u16, - ), - payload_overflow_threshold_min( - leaf_contents.page_type(), - self.usable_space() as u16, - ), - self.usable_space(), - )?; + let leaf_page = self.stack.top(); parent_page.get().set_dirty(); self.pager.add_dirty(parent_page.get().get().id); + leaf_page.get().set_dirty(); + self.pager.add_dirty(leaf_page.get().get().id); - let parent_page = parent_page.get(); - let parent_contents = parent_page.get().contents.as_mut().unwrap(); + // Step 2: Replace the cell in the parent (interior) page. + { + let parent_page_ref = parent_page.get(); + let parent_contents = parent_page_ref.get().contents.as_mut().unwrap(); - // Create an interior cell from a predecessor - let mut cell_payload: Vec = Vec::new(); - let child_pointer = original_child_pointer.expect("there should be a pointer"); - match predecessor_cell { - BTreeCell::TableLeafCell(leaf_cell) => { - cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); - write_varint_to_vec(leaf_cell._rowid as u64, &mut cell_payload); - } - BTreeCell::IndexLeafCell(leaf_cell) => { - cell_payload.extend_from_slice(&child_pointer.to_be_bytes()); - cell_payload.extend_from_slice(leaf_cell.payload); - } - _ => unreachable!("Expected table leaf cell"), + // First, drop the old cell that is being replaced. + drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?; + // Then, insert the new cell (the predecessor) in its place. + insert_into_cell( + parent_contents, + &cell_payload, + cell_idx, + self.usable_space() as u16, + )?; } - insert_into_cell( - parent_contents, - &cell_payload, - cell_idx, - self.usable_space() as u16, - )?; - drop_cell(parent_contents, cell_idx, self.usable_space() as u16)?; + // Step 3: Delete the predecessor cell from the leaf page. + { + let leaf_page_ref = leaf_page.get(); + let leaf_contents = leaf_page_ref.get().contents.as_mut().unwrap(); + drop_cell(leaf_contents, leaf_cell_idx, self.usable_space() as u16)?; + } let delete_info = self.state.mut_delete_info().unwrap(); - delete_info.state = DeleteState::CheckNeedsBalancing; + delete_info.state = DeleteState::CheckNeedsBalancing { + rightmost_cell_was_dropped: false, + post_balancing_seek_key, + }; } - DeleteState::CheckNeedsBalancing => { + DeleteState::CheckNeedsBalancing { + rightmost_cell_was_dropped, + post_balancing_seek_key, + } => { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let free_space = compute_free_space(contents, self.usable_space() as u16); - let needs_balancing = free_space as usize * 3 > self.usable_space() * 2; + let needs_balancing = self.stack.has_parent() + && free_space as usize * 3 > self.usable_space() * 2; - let target_key = if page.is_index() { - DeleteSavepoint::Payload(self.record().as_ref().unwrap().clone()) - } else { - let CursorHasRecord::Yes { rowid: Some(rowid) } = self.has_record.get() - else { - panic!("cursor should be pointing to a record with a rowid"); - }; - DeleteSavepoint::Rowid(rowid) - }; + if rightmost_cell_was_dropped { + // If we drop a cell in the middle, e.g. our current index is 2 and we drop 'c' from [a,b,c,d,e], then we don't need to retreat index, + // because index 2 is still going to be the right place [a,b,D,e] + // but: + // If we drop the rightmost cell, e.g. our current index is 4 and we drop 'e' from [a,b,c,d,e], then we need to retreat index, + // because index 4 is now pointing beyond the last cell [a,b,c,d] _ <-- index 4 + self.stack.retreat(); + } - let delete_info = self.state.mut_delete_info().unwrap(); if needs_balancing { + let delete_info = self.state.mut_delete_info().unwrap(); if delete_info.balance_write_info.is_none() { let mut write_info = WriteInfo::new(); write_info.state = WriteState::BalanceStart; delete_info.balance_write_info = Some(write_info); } - delete_info.state = DeleteState::WaitForBalancingToComplete { target_key } + delete_info.state = DeleteState::WaitForBalancingToComplete { + target_key: post_balancing_seek_key.unwrap(), + } } else { + // FIXME: if we deleted something from an interior page, this is now the leaf page from where a replacement cell + // was taken in InteriorNodeReplacement. We must also check if the parent needs balancing!!! self.stack.retreat(); self.state = CursorState::None; return Ok(CursorResult::Ok(())); } // Only reaches this function call if state = DeleteState::WaitForBalancingToComplete - self.save_context(); + // self.save_context(); } DeleteState::WaitForBalancingToComplete { target_key } => { @@ -4613,7 +4538,9 @@ impl BTreeCursor { SeekKey::IndexKey(immutable_record) } }; - return_if_io!(self.seek(key, SeekOp::EQ)); + // We want to end up pointing at the row to the left of the position of the row we deleted, so + // that after we call next() in the loop,the next row we delete will again be the same position as this one. + return_if_io!(self.seek(key, SeekOp::LT)); self.state = CursorState::None; return Ok(CursorResult::Ok(())); @@ -4637,9 +4564,9 @@ impl BTreeCursor { /// Search for a key in an Index Btree. Looking up indexes that need to be unique, we cannot compare the rowid pub fn key_exists_in_index(&mut self, key: &ImmutableRecord) -> Result> { - return_if_io!(self.seek(SeekKey::IndexKey(key), SeekOp::GE)); + return_if_io!(self.seek(SeekKey::IndexKey(key), SeekOp::GE { eq_only: true })); - let record_opt = self.record(); + let record_opt = return_if_io!(self.record()); match record_opt.as_ref() { Some(record) => { // Existing record found — compare prefix @@ -4671,35 +4598,11 @@ impl BTreeCursor { Value::Integer(i) => i, _ => unreachable!("btree tables are indexed by integers!"), }; - let _ = return_if_io!(self.move_to(SeekKey::TableRowId(*int_key), SeekOp::EQ)); - let page = self.stack.top(); - // TODO(pere): request load - return_if_locked_maybe_load!(self.pager, page); - - let page = page.get(); - let contents = page.get().contents.as_ref().unwrap(); - - // find cell - let int_key = match key { - Value::Integer(i) => *i, - _ => unreachable!("btree tables are indexed by integers!"), - }; - let cell_idx = - return_if_io!(self.find_cell(contents, &BTreeKey::new_table_rowid(int_key, None))); - if cell_idx >= contents.cell_count() { - Ok(CursorResult::Ok(false)) - } else { - let equals = match &contents.cell_get( - cell_idx, - payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16), - payload_overflow_threshold_min(contents.page_type(), self.usable_space() as u16), - self.usable_space(), - )? { - BTreeCell::TableLeafCell(l) => l._rowid == int_key, - _ => unreachable!(), - }; - Ok(CursorResult::Ok(equals)) - } + let has_record = + return_if_io!(self.seek(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true })); + self.has_record.set(has_record); + self.invalidate_record(); + Ok(CursorResult::Ok(has_record)) } /// Clear the overflow pages linked to a specific page provided by the leaf cell @@ -4809,8 +4712,8 @@ impl BTreeCursor { } DestroyState::ProcessPage => { let page = self.stack.top(); + self.stack.advance(); assert!(page.get().is_loaded()); // page should be loaded at this time - let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_idx = self.stack.current_cell_index(); @@ -4830,7 +4733,6 @@ impl BTreeCursor { (false, n) if n == contents.cell_count() as i32 => { if let Some(rightmost) = contents.rightmost_pointer() { let rightmost_page = self.read_page(rightmost as usize)?; - self.stack.advance(); self.stack.push(rightmost_page); let destroy_info = self.state.mut_destroy_info().expect( "unable to get a mut reference to destroy state in cursor", @@ -4899,7 +4801,6 @@ impl BTreeCursor { _ => panic!("expected interior cell"), }; let child_page = self.read_page(child_page_id as usize)?; - self.stack.advance(); self.stack.push(child_page); let destroy_info = self.state.mut_destroy_info().expect( "unable to get a mut reference to destroy state in cursor", @@ -4917,7 +4818,6 @@ impl BTreeCursor { BTreeCell::IndexInteriorCell(index_int_cell) => { let child_page = self.read_page(index_int_cell.left_child_page as usize)?; - self.stack.advance(); self.stack.push(child_page); let destroy_info = self.state.mut_destroy_info().expect( "unable to get a mut reference to destroy state in cursor", @@ -4927,7 +4827,6 @@ impl BTreeCursor { } // For any leaf cell, advance the index now that overflow pages have been cleared BTreeCell::TableLeafCell(_) | BTreeCell::IndexLeafCell(_) => { - self.stack.advance(); let destroy_info = self.state.mut_destroy_info().expect( "unable to get a mut reference to destroy state in cursor", ); @@ -4975,9 +4874,7 @@ impl BTreeCursor { // build the new payload let page_type = page_ref.get().get().contents.as_ref().unwrap().page_type(); let mut new_payload = Vec::with_capacity(record.len()); - let CursorHasRecord::Yes { rowid } = self.has_record.get() else { - panic!("cursor should be pointing to a record"); - }; + let rowid = return_if_io!(self.rowid()); fill_cell_payload( page_type, rowid, @@ -5083,6 +4980,7 @@ impl BTreeCursor { self.count += contents.cell_count(); } + self.stack.advance(); let cell_idx = self.stack.current_cell_index() as usize; // Second condition is necessary in case we return if the page is locked in the loop below @@ -5096,7 +4994,6 @@ impl BTreeCursor { } // Move to parent - self.going_upwards = true; self.stack.pop(); mem_page_rc = self.stack.top(); @@ -5123,7 +5020,6 @@ impl BTreeCursor { let right_most_pointer = contents.rightmost_pointer().unwrap(); self.stack.advance(); let mem_page = self.read_page(right_most_pointer as usize)?; - self.going_upwards = false; self.stack.push(mem_page); } else { // Move to child left page @@ -5150,7 +5046,6 @@ impl BTreeCursor { }) => { self.stack.advance(); let mem_page = self.read_page(left_child_page as usize)?; - self.going_upwards = false; self.stack.push(mem_page); } _ => unreachable!(), @@ -5159,27 +5054,10 @@ impl BTreeCursor { } } - /// Save cursor context, to be restored later - pub fn save_context(&mut self) { - if let CursorHasRecord::Yes { rowid } = self.has_record.get() { - self.valid_state = CursorValidState::RequireSeek; - match self.stack.top().get().get_contents().page_type() { - PageType::TableInterior | PageType::TableLeaf => { - self.context = Some(CursorContext::TableRowId(rowid.expect( - "table cells should have a rowid since we don't support WITHOUT ROWID tables", - ))); - } - PageType::IndexInterior | PageType::IndexLeaf => { - self.context = Some(CursorContext::IndexKeyRowId( - self.reusable_immutable_record - .borrow() - .as_ref() - .unwrap() - .clone(), - )); - } - } - } + // Save cursor context, to be restored later + pub fn save_context(&mut self, cursor_context: CursorContext) { + self.valid_state = CursorValidState::RequireSeek; + self.context = Some(cursor_context); } /// If context is defined, restore it and set it None on success @@ -5192,7 +5070,7 @@ impl BTreeCursor { CursorContext::TableRowId(rowid) => SeekKey::TableRowId(rowid), CursorContext::IndexKeyRowId(ref record) => SeekKey::IndexKey(record), }; - let res = self.seek(seek_key, SeekOp::EQ)?; + let res = self.seek(seek_key, SeekOp::GE { eq_only: true })?; match res { CursorResult::Ok(_) => { self.valid_state = CursorValidState::Valid; @@ -5280,7 +5158,7 @@ impl PageStack { } fn push(&self, page: BTreePage) { - self._push(page, 0); + self._push(page, -1); } fn push_backwards(&self, page: BTreePage) { @@ -5355,18 +5233,6 @@ impl PageStack { self.cell_indices.borrow_mut()[current] -= 1; } - /// Move the cursor to the next cell in the current page according to the iteration direction. - fn next_cell_in_direction(&self, iteration_direction: IterationDirection) { - match iteration_direction { - IterationDirection::Forwards => { - self.advance(); - } - IterationDirection::Backwards => { - self.retreat(); - } - } - } - fn set_cell_index(&self, idx: i32) { let current = self.current(); self.cell_indices.borrow_mut()[current] = idx; @@ -6691,7 +6557,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(*key); - cursor.move_to(key, SeekOp::EQ) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -6715,7 +6581,7 @@ mod tests { let seek_key = SeekKey::TableRowId(*key); assert!( matches!( - cursor.seek(seek_key, SeekOp::EQ).unwrap(), + cursor.seek(seek_key, SeekOp::GE { eq_only: true }).unwrap(), CursorResult::Ok(true) ), "key {} is not found", @@ -6730,14 +6596,14 @@ mod tests { std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() - .as_secs(), + .as_millis(), |v| { v.parse() .expect("Failed to parse SEED environment variable as u64") }, ); - let rng = ChaCha8Rng::seed_from_u64(seed); - (rng, seed) + let rng = ChaCha8Rng::seed_from_u64(seed as u64); + (rng, seed as u64) } fn btree_insert_fuzz_run( @@ -6785,7 +6651,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(key); - cursor.move_to(key, SeekOp::EQ) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -6819,7 +6685,9 @@ mod tests { for key in keys.iter() { tracing::trace!("seeking key: {}", key); run_until_done(|| cursor.next(), pager.deref()).unwrap(); - let cursor_rowid = cursor.rowid().unwrap().unwrap(); + let cursor_rowid = run_until_done(|| cursor.rowid(), pager.deref()) + .unwrap() + .unwrap(); if *key != cursor_rowid { valid = false; println!("key {} is not found, got {}", key, cursor_rowid); @@ -6850,7 +6718,9 @@ mod tests { for key in keys.iter() { tracing::trace!("seeking key: {}", key); run_until_done(|| cursor.next(), pager.deref()).unwrap(); - let cursor_rowid = cursor.rowid().unwrap().unwrap(); + let cursor_rowid = run_until_done(|| cursor.rowid(), pager.deref()) + .unwrap() + .unwrap(); assert_eq!( *key, cursor_rowid, "key {} is not found, got {}", @@ -6886,7 +6756,6 @@ mod tests { let mut keys = SortedVec::new(); tracing::info!("seed: {}", seed); for i in 0..inserts { - tracing::info!("insert {}", i); pager.begin_read_tx().unwrap(); pager.begin_write_tx().unwrap(); let key = { @@ -6905,6 +6774,7 @@ mod tests { } result }; + tracing::info!("insert {}/{}: {:?}", i + 1, inserts, key); keys.push(key.clone()); let value = ImmutableRecord::from_registers( &key.iter() @@ -6931,22 +6801,62 @@ mod tests { } } } + + // Check that all keys can be found by seeking pager.begin_read_tx().unwrap(); cursor.move_to_root(); - for key in keys.iter() { - tracing::trace!("seeking key: {:?}", key); + for (i, key) in keys.iter().enumerate() { + tracing::info!("seeking key {}/{}: {:?}", i + 1, keys.len(), key); + let exists = run_until_done( + || { + cursor.seek( + SeekKey::IndexKey(&ImmutableRecord::from_registers( + &key.iter() + .map(|col| Register::Value(Value::Integer(*col))) + .collect::>(), + )), + SeekOp::GE { eq_only: true }, + ) + }, + pager.deref(), + ) + .unwrap(); + assert!(exists, "key {:?} is not found", key); + } + // Check that key count is right + cursor.move_to_root(); + let mut count = 0; + while run_until_done(|| cursor.next(), pager.deref()).unwrap() { + count += 1; + } + assert_eq!( + count, + keys.len(), + "key count is not right, got {}, expected {}", + count, + keys.len() + ); + // Check that all keys can be found in-order, by iterating the btree + cursor.move_to_root(); + let mut prev = None; + for (i, key) in keys.iter().enumerate() { + tracing::info!("iterating key {}/{}: {:?}", i + 1, keys.len(), key); run_until_done(|| cursor.next(), pager.deref()).unwrap(); - let record = cursor.record(); + let record = run_until_done(|| cursor.record(), &pager).unwrap(); let record = record.as_ref().unwrap(); - let cursor_key = record.get_values(); - assert_eq!( - cursor_key, - &key.iter() - .map(|col| RefValue::Integer(*col)) - .collect::>(), - "key {:?} is not found", - key - ); + let cur = record.get_values().clone(); + if let Some(prev) = prev { + if prev >= cur { + println!("Seed: {}", seed); + } + assert!( + prev < cur, + "keys are not in ascending order: {:?} < {:?}", + prev, + cur + ); + } + prev = Some(cur); } pager.end_read_tx().unwrap(); } @@ -7957,7 +7867,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(i); - cursor.move_to(key, SeekOp::EQ) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -8034,7 +7944,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(i); - cursor.move_to(key, SeekOp::EQ) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -8057,8 +7967,11 @@ mod tests { let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page); let seek_key = SeekKey::TableRowId(i); - let found = run_until_done(|| cursor.seek(seek_key.clone(), SeekOp::EQ), pager.deref()) - .unwrap(); + let found = run_until_done( + || cursor.seek(seek_key.clone(), SeekOp::GE { eq_only: true }), + pager.deref(), + ) + .unwrap(); if found { run_until_done(|| cursor.delete(), pager.deref()).unwrap(); @@ -8115,7 +8028,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(i as i64); - cursor.move_to(key, SeekOp::EQ) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -8133,11 +8046,13 @@ mod tests { let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page); cursor.move_to_root(); for i in 0..iterations { - let CursorHasRecord::Yes { rowid: Some(rowid) } = - run_until_done(|| cursor.get_next_record(None), pager.deref()).unwrap() - else { + let has_next = run_until_done(|| cursor.next(), pager.deref()).unwrap(); + if !has_next { panic!("expected Some(rowid) but got {:?}", cursor.has_record.get()); }; + let rowid = run_until_done(|| cursor.rowid(), pager.deref()) + .unwrap() + .unwrap(); assert_eq!(rowid, i as i64, "got!=expected"); } } @@ -8155,7 +8070,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(1); - cursor.move_to(key, SeekOp::EQ) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) @@ -8231,7 +8146,7 @@ mod tests { run_until_done( || { let key = SeekKey::TableRowId(1); - cursor.move_to(key, SeekOp::EQ) + cursor.seek(key, SeekOp::GE { eq_only: true }) }, pager.deref(), ) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 9a5fd30b1..255a9123e 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -850,6 +850,7 @@ impl Pager { // Providing a page is optional, if provided it will be used to avoid reading the page from disk. // This is implemented in accordance with sqlite freepage2() function. pub fn free_page(&self, page: Option, page_id: usize) -> Result<()> { + tracing::info!("free_page(page_id={})", page_id); const TRUNK_PAGE_HEADER_SIZE: usize = 8; const LEAF_ENTRY_SIZE: usize = 4; const RESERVED_SLOTS: usize = 2; diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index bc39cdfdf..31ddc1cc0 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -377,6 +377,15 @@ pub enum PageType { TableLeaf = 13, } +impl PageType { + pub fn is_table(&self) -> bool { + match self { + PageType::IndexInterior | PageType::IndexLeaf => false, + PageType::TableInterior | PageType::TableLeaf => true, + } + } +} + impl TryFrom for PageType { type Error = LimboError; @@ -585,7 +594,12 @@ impl PageContent { // the page header is 12 bytes for interior pages, 8 bytes for leaf pages // this is because the 4 last bytes in the interior page's header are used for the rightmost pointer. let cell_pointer_array_start = self.header_size(); - assert!(idx < ncells, "cell_get: idx out of bounds"); + assert!( + idx < ncells, + "cell_get: idx out of bounds: idx={}, ncells={}", + idx, + ncells + ); let cell_pointer = cell_pointer_array_start + (idx * 2); let cell_pointer = self.read_u16(cell_pointer) as usize; diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index 06ae3ea8d..f8be9e87d 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -870,9 +870,20 @@ fn emit_program_for_update( )?; // Open indexes for update. let mut index_cursors = Vec::with_capacity(plan.indexes_to_update.len()); - // TODO: do not reopen if there is table reference using it. - for index in &plan.indexes_to_update { + if let Some(index_cursor) = program.resolve_cursor_id_safe(&CursorKey::index( + plan.table_references + .joined_tables() + .first() + .unwrap() + .internal_id, + index.clone(), + )) { + // Don't reopen index if it was already opened as the iteration cursor for this update plan. + let record_reg = program.alloc_register(); + index_cursors.push((index_cursor, record_reg)); + continue; + } let index_cursor = program.alloc_cursor_id(CursorType::BTreeIndex(index.clone())); program.emit_insn(Insn::OpenWrite { cursor_id: index_cursor, diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index c7f789ca5..8a35bb50b 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -1182,12 +1182,13 @@ fn emit_seek( seek.len }; match seek.op { - SeekOp::GE => program.emit_insn(Insn::SeekGE { + SeekOp::GE { eq_only } => program.emit_insn(Insn::SeekGE { is_index, cursor_id: seek_cursor_id, start_reg, num_regs, target_pc: loop_end, + eq_only, }), SeekOp::GT => program.emit_insn(Insn::SeekGT { is_index, @@ -1196,12 +1197,13 @@ fn emit_seek( num_regs, target_pc: loop_end, }), - SeekOp::LE => program.emit_insn(Insn::SeekLE { + SeekOp::LE { eq_only } => program.emit_insn(Insn::SeekLE { is_index, cursor_id: seek_cursor_id, start_reg, num_regs, target_pc: loop_end, + eq_only, }), SeekOp::LT => program.emit_insn(Insn::SeekLT { is_index, @@ -1210,7 +1212,6 @@ fn emit_seek( num_regs, target_pc: loop_end, }), - SeekOp::EQ => panic!("An index seek is never EQ"), }; Ok(()) @@ -1293,7 +1294,7 @@ fn emit_seek_termination( } match (is_index, termination.op) { - (true, SeekOp::GE) => program.emit_insn(Insn::IdxGE { + (true, SeekOp::GE { .. }) => program.emit_insn(Insn::IdxGE { cursor_id: seek_cursor_id, start_reg, num_regs, @@ -1305,7 +1306,7 @@ fn emit_seek_termination( num_regs, target_pc: loop_end, }), - (true, SeekOp::LE) => program.emit_insn(Insn::IdxLE { + (true, SeekOp::LE { .. }) => program.emit_insn(Insn::IdxLE { cursor_id: seek_cursor_id, start_reg, num_regs, @@ -1317,7 +1318,7 @@ fn emit_seek_termination( num_regs, target_pc: loop_end, }), - (false, SeekOp::GE) => program.emit_insn(Insn::Ge { + (false, SeekOp::GE { .. }) => program.emit_insn(Insn::Ge { lhs: rowid_reg.unwrap(), rhs: start_reg, target_pc: loop_end, @@ -1331,7 +1332,7 @@ fn emit_seek_termination( flags: CmpInsFlags::default(), collation: program.curr_collation(), }), - (false, SeekOp::LE) => program.emit_insn(Insn::Le { + (false, SeekOp::LE { .. }) => program.emit_insn(Insn::Le { lhs: rowid_reg.unwrap(), rhs: start_reg, target_pc: loop_end, @@ -1345,9 +1346,6 @@ fn emit_seek_termination( flags: CmpInsFlags::default(), collation: program.curr_collation(), }), - (_, SeekOp::EQ) => { - panic!("An index termination condition is never EQ") - } }; Ok(()) diff --git a/core/translate/optimizer/mod.rs b/core/translate/optimizer/mod.rs index a054bac3e..910c93e2c 100644 --- a/core/translate/optimizer/mod.rs +++ b/core/translate/optimizer/mod.rs @@ -98,7 +98,7 @@ fn optimize_delete_plan(plan: &mut DeletePlan, schema: &Schema) -> Result<()> { Ok(()) } -fn optimize_update_plan(plan: &mut UpdatePlan, schema: &Schema) -> Result<()> { +fn optimize_update_plan(plan: &mut UpdatePlan, _schema: &Schema) -> Result<()> { rewrite_exprs_update(plan)?; if let ConstantConditionEliminationResult::ImpossibleCondition = eliminate_constant_conditions(&mut plan.where_clause)? @@ -106,13 +106,18 @@ fn optimize_update_plan(plan: &mut UpdatePlan, schema: &Schema) -> Result<()> { plan.contains_constant_false_condition = true; return Ok(()); } - let _ = optimize_table_access( - &mut plan.table_references, - &schema.indexes, - &mut plan.where_clause, - &mut plan.order_by, - &mut None, - )?; + // FIXME: don't use indexes for update right now because it's not safe to traverse an index + // while also updating the same table, things go wrong. + // e.g. in 'explain update t set x=x+5 where x > 10;' where x is an indexed column, + // sqlite first creates an ephemeral index to store the current values so the tree traversal + // doesn't get messed up while updating. + // let _ = optimize_table_access( + // &mut plan.table_references, + // &schema.indexes, + // &mut plan.where_clause, + // &mut plan.order_by, + // &mut None, + // )?; Ok(()) } @@ -881,7 +886,7 @@ fn build_seek_def( seek: Some(SeekKey { len: key_len, null_pad: false, - op: SeekOp::GE, + op: SeekOp::GE { eq_only: true }, }), termination: Some(TerminationKey { len: key_len, @@ -905,8 +910,8 @@ fn build_seek_def( ( key_len - 1, key_len, - SeekOp::LE.reverse(), - SeekOp::LE.reverse(), + SeekOp::LE { eq_only: false }.reverse(), + SeekOp::LE { eq_only: false }.reverse(), ) }; SeekDef { @@ -943,12 +948,17 @@ fn build_seek_def( (IterationDirection::Forwards, ast::Operator::GreaterEquals) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len, key_len - 1, SeekOp::GE, SeekOp::GT) + ( + key_len, + key_len - 1, + SeekOp::GE { eq_only: false }, + SeekOp::GT, + ) } else { ( key_len - 1, key_len, - SeekOp::LE.reverse(), + SeekOp::LE { eq_only: false }.reverse(), SeekOp::LT.reverse(), ) }; @@ -986,9 +996,19 @@ fn build_seek_def( (IterationDirection::Forwards, ast::Operator::Less) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len - 1, key_len, SeekOp::GT, SeekOp::GE) + ( + key_len - 1, + key_len, + SeekOp::GT, + SeekOp::GE { eq_only: false }, + ) } else { - (key_len, key_len - 1, SeekOp::GT, SeekOp::GE) + ( + key_len, + key_len - 1, + SeekOp::GT, + SeekOp::GE { eq_only: false }, + ) }; SeekDef { key, @@ -1029,8 +1049,8 @@ fn build_seek_def( ( key_len, key_len - 1, - SeekOp::LE.reverse(), - SeekOp::LE.reverse(), + SeekOp::LE { eq_only: false }.reverse(), + SeekOp::LE { eq_only: false }.reverse(), ) }; SeekDef { @@ -1066,7 +1086,7 @@ fn build_seek_def( iter_dir, seek: Some(SeekKey { len: key_len, - op: SeekOp::LE, + op: SeekOp::LE { eq_only: true }, null_pad: false, }), termination: Some(TerminationKey { @@ -1086,13 +1106,18 @@ fn build_seek_def( (IterationDirection::Backwards, ast::Operator::Less) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len, key_len - 1, SeekOp::LT, SeekOp::LE) + ( + key_len, + key_len - 1, + SeekOp::LT, + SeekOp::LE { eq_only: false }, + ) } else { ( key_len - 1, key_len, SeekOp::GT.reverse(), - SeekOp::GE.reverse(), + SeekOp::GE { eq_only: false }.reverse(), ) }; SeekDef { @@ -1129,7 +1154,12 @@ fn build_seek_def( (IterationDirection::Backwards, ast::Operator::LessEquals) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len, key_len - 1, SeekOp::LE, SeekOp::LE) + ( + key_len, + key_len - 1, + SeekOp::LE { eq_only: false }, + SeekOp::LE { eq_only: false }, + ) } else { ( key_len - 1, @@ -1172,7 +1202,12 @@ fn build_seek_def( (IterationDirection::Backwards, ast::Operator::Greater) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len - 1, key_len, SeekOp::LE, SeekOp::LE) + ( + key_len - 1, + key_len, + SeekOp::LE { eq_only: false }, + SeekOp::LE { eq_only: false }, + ) } else { ( key_len, @@ -1215,12 +1250,17 @@ fn build_seek_def( (IterationDirection::Backwards, ast::Operator::GreaterEquals) => { let (seek_key_len, termination_key_len, seek_op, termination_op) = if sort_order_of_last_key == SortOrder::Asc { - (key_len - 1, key_len, SeekOp::LE, SeekOp::LT) + ( + key_len - 1, + key_len, + SeekOp::LE { eq_only: false }, + SeekOp::LT, + ) } else { ( key_len, key_len - 1, - SeekOp::GE.reverse(), + SeekOp::GE { eq_only: false }.reverse(), SeekOp::GT.reverse(), ) }; diff --git a/core/types.rs b/core/types.rs index 68363d5dd..8498c6db1 100644 --- a/core/types.rs +++ b/core/types.rs @@ -701,6 +701,12 @@ pub struct ImmutableRecord { recreating: bool, } +#[derive(PartialEq)] +pub enum ParseRecordState { + Init, + Parsing { payload: Vec }, +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct Record { values: Vec, @@ -934,6 +940,10 @@ impl ImmutableRecord { self.values.clear(); } + pub fn is_invalidated(&self) -> bool { + self.payload.is_empty() + } + pub fn get_payload(&self) -> &[u8] { &self.payload } @@ -1482,10 +1492,17 @@ pub enum CursorResult { #[derive(Clone, Copy, PartialEq, Eq, Debug)] /// The match condition of a table/index seek. pub enum SeekOp { - EQ, - GE, + /// If eq_only is true, this means in practice: + /// We are iterating forwards, but we are really looking for an exact match on the seek key. + GE { + eq_only: bool, + }, GT, - LE, + /// If eq_only is true, this means in practice: + /// We are iterating backwards, but we are really looking for an exact match on the seek key. + LE { + eq_only: bool, + }, LT, } @@ -1502,17 +1519,23 @@ impl SeekOp { #[inline(always)] pub fn iteration_direction(&self) -> IterationDirection { match self { - SeekOp::EQ | SeekOp::GE | SeekOp::GT => IterationDirection::Forwards, - SeekOp::LE | SeekOp::LT => IterationDirection::Backwards, + SeekOp::GE { .. } | SeekOp::GT => IterationDirection::Forwards, + SeekOp::LE { .. } | SeekOp::LT => IterationDirection::Backwards, + } + } + + pub fn eq_only(&self) -> bool { + match self { + SeekOp::GE { eq_only } | SeekOp::LE { eq_only } => *eq_only, + _ => false, } } pub fn reverse(&self) -> Self { match self { - SeekOp::EQ => SeekOp::EQ, - SeekOp::GE => SeekOp::LE, + SeekOp::GE { eq_only } => SeekOp::LE { eq_only: *eq_only }, SeekOp::GT => SeekOp::LT, - SeekOp::LE => SeekOp::GE, + SeekOp::LE { eq_only } => SeekOp::GE { eq_only: *eq_only }, SeekOp::LT => SeekOp::GT, } } diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index d0ed2dab6..1e56778b7 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -1344,15 +1344,23 @@ pub fn op_column( unreachable!("unexpected Insn {:?}", insn) }; if let Some((index_cursor_id, table_cursor_id)) = state.deferred_seeks[*cursor_id].take() { - let deferred_seek = { + let deferred_seek = 'd: { let rowid = { let mut index_cursor = state.get_cursor(index_cursor_id); let index_cursor = index_cursor.as_btree_mut(); - index_cursor.rowid()? + match index_cursor.rowid()? { + CursorResult::IO => { + break 'd Some((index_cursor_id, table_cursor_id)); + } + CursorResult::Ok(rowid) => rowid, + } }; let mut table_cursor = state.get_cursor(table_cursor_id); let table_cursor = table_cursor.as_btree_mut(); - match table_cursor.seek(SeekKey::TableRowId(rowid.unwrap()), SeekOp::EQ)? { + match table_cursor.seek( + SeekKey::TableRowId(rowid.unwrap()), + SeekOp::GE { eq_only: true }, + )? { CursorResult::Ok(_) => None, CursorResult::IO => Some((index_cursor_id, table_cursor_id)), } @@ -1369,7 +1377,7 @@ pub fn op_column( let mut cursor = must_be_btree_cursor!(*cursor_id, program.cursor_ref, state, "Column"); let cursor = cursor.as_btree_mut(); - let record = cursor.record(); + let record = return_if_io!(cursor.record()); let value = if let Some(record) = record.as_ref() { if cursor.get_null_flag() { RefValue::Null @@ -1900,11 +1908,16 @@ pub fn op_row_id( unreachable!("unexpected Insn {:?}", insn) }; if let Some((index_cursor_id, table_cursor_id)) = state.deferred_seeks[*cursor_id].take() { - let deferred_seek = { + let deferred_seek = 'd: { let rowid = { let mut index_cursor = state.get_cursor(index_cursor_id); let index_cursor = index_cursor.as_btree_mut(); - let record = index_cursor.record(); + let record = match index_cursor.record()? { + CursorResult::IO => { + break 'd Some((index_cursor_id, table_cursor_id)); + } + CursorResult::Ok(record) => record, + }; let record = record.as_ref().unwrap(); let rowid = record.get_values().last().unwrap(); match rowid { @@ -1914,7 +1927,7 @@ pub fn op_row_id( }; let mut table_cursor = state.get_cursor(table_cursor_id); let table_cursor = table_cursor.as_btree_mut(); - match table_cursor.seek(SeekKey::TableRowId(rowid), SeekOp::EQ)? { + match table_cursor.seek(SeekKey::TableRowId(rowid), SeekOp::GE { eq_only: true })? { CursorResult::Ok(_) => None, CursorResult::IO => Some((index_cursor_id, table_cursor_id)), } @@ -1926,7 +1939,7 @@ pub fn op_row_id( } let mut cursors = state.cursors.borrow_mut(); if let Some(Cursor::BTree(btree_cursor)) = cursors.get_mut(*cursor_id).unwrap() { - if let Some(ref rowid) = btree_cursor.rowid()? { + if let Some(ref rowid) = return_if_io!(btree_cursor.rowid()) { state.registers[*dest] = Register::Value(Value::Integer(*rowid as i64)); } else { state.registers[*dest] = Register::Value(Value::Null); @@ -1960,7 +1973,7 @@ pub fn op_idx_row_id( let mut cursors = state.cursors.borrow_mut(); let cursor = cursors.get_mut(*cursor_id).unwrap().as_mut().unwrap(); let cursor = cursor.as_btree_mut(); - let rowid = cursor.rowid()?; + let rowid = return_if_io!(cursor.rowid()); state.registers[*dest] = match rowid { Some(rowid) => Register::Value(Value::Integer(rowid as i64)), None => Register::Value(Value::Null), @@ -2000,7 +2013,9 @@ pub fn op_seek_rowid( }; match rowid { Some(rowid) => { - let found = return_if_io!(cursor.seek(SeekKey::TableRowId(rowid), SeekOp::EQ)); + let found = return_if_io!( + cursor.seek(SeekKey::TableRowId(rowid), SeekOp::GE { eq_only: true }) + ); if !found { target_pc.to_offset_int() } else { @@ -2046,6 +2061,7 @@ pub fn op_seek( num_regs, target_pc, is_index, + .. } | Insn::SeekGT { cursor_id, @@ -2060,6 +2076,7 @@ pub fn op_seek( num_regs, target_pc, is_index, + .. } | Insn::SeekLT { cursor_id, @@ -2076,19 +2093,22 @@ pub fn op_seek( "target_pc should be an offset, is: {:?}", target_pc ); + let eq_only = match insn { + Insn::SeekGE { eq_only, .. } | Insn::SeekLE { eq_only, .. } => *eq_only, + _ => false, + }; let op = match insn { - Insn::SeekGE { .. } => SeekOp::GE, + Insn::SeekGE { eq_only, .. } => SeekOp::GE { eq_only: *eq_only }, Insn::SeekGT { .. } => SeekOp::GT, - Insn::SeekLE { .. } => SeekOp::LE, + Insn::SeekLE { eq_only, .. } => SeekOp::LE { eq_only: *eq_only }, Insn::SeekLT { .. } => SeekOp::LT, _ => unreachable!("unexpected Insn {:?}", insn), }; let op_name = match op { - SeekOp::GE => "SeekGE", + SeekOp::GE { .. } => "SeekGE", SeekOp::GT => "SeekGT", - SeekOp::LE => "SeekLE", + SeekOp::LE { .. } => "SeekLE", SeekOp::LT => "SeekLT", - _ => unreachable!("unexpected SeekOp {:?}", op), }; if *is_index { let found = { @@ -2160,7 +2180,7 @@ pub fn op_idx_ge( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *cursor.record() { + let pc = if let Some(idx_record) = return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -2224,7 +2244,7 @@ pub fn op_idx_le( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *cursor.record() { + let pc = if let Some(ref idx_record) = return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -2270,7 +2290,7 @@ pub fn op_idx_gt( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *cursor.record() { + let pc = if let Some(ref idx_record) = return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -2316,7 +2336,7 @@ pub fn op_idx_lt( let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); - let pc = if let Some(ref idx_record) = *cursor.record() { + let pc = if let Some(ref idx_record) = return_if_io!(cursor.record()) { // Compare against the same number of values let idx_values = idx_record.get_values(); let idx_values = &idx_values[..record_from_regs.len()]; @@ -3836,13 +3856,10 @@ pub fn op_insert( Value::Integer(i) => *i, _ => unreachable!("expected integer key"), }; - // NOTE(pere): Sending moved_before == true is okay because we moved before but - // if we were to set to false after starting a balance procedure, it might - // leave undefined state. return_if_io!(cursor.insert(&BTreeKey::new_table_rowid(key, Some(record)), true)); // Only update last_insert_rowid for regular table inserts, not schema modifications if cursor.root_page() != 1 { - if let Some(rowid) = cursor.rowid()? { + if let Some(rowid) = return_if_io!(cursor.rowid()) { if let Some(conn) = program.connection.upgrade() { conn.update_last_rowid(rowid); } @@ -3889,11 +3906,6 @@ pub fn op_delete( { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); - tracing::debug!( - "op_delete(record={:?}, rowid={:?})", - cursor.record(), - cursor.rowid()? - ); return_if_io!(cursor.delete()); } let prev_changes = program.n_change.get(); @@ -3905,6 +3917,7 @@ pub fn op_delete( #[derive(Debug)] pub enum OpIdxDeleteState { Seeking(ImmutableRecord), // First seek row to delete + Verifying, Deleting, } pub fn op_idx_delete( @@ -3923,29 +3936,48 @@ pub fn op_idx_delete( unreachable!("unexpected Insn {:?}", insn) }; loop { + tracing::debug!( + "op_idx_delete(cursor_id={}, start_reg={}, num_regs={}, rootpage={}, state={:?})", + cursor_id, + start_reg, + num_regs, + state.get_cursor(*cursor_id).as_btree_mut().root_page(), + state.op_idx_delete_state + ); match &state.op_idx_delete_state { Some(OpIdxDeleteState::Seeking(record)) => { { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); - return_if_io!(cursor.seek(SeekKey::IndexKey(&record), SeekOp::EQ)); - tracing::debug!( - "op_idx_delete(seek={}, record={} rowid={:?})", - &record, - cursor.record().as_ref().unwrap(), - cursor.rowid() + let found = return_if_io!( + cursor.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true }) ); - if cursor.rowid()?.is_none() { - // If P5 is not zero, then raise an SQLITE_CORRUPT_INDEX error if no matching - // index entry is found. This happens when running an UPDATE or DELETE statement and the - // index entry to be updated or deleted is not found. For some uses of IdxDelete - // (example: the EXCEPT operator) it does not matter that no matching entry is found. - // For those cases, P5 is zero. Also, do not raise this (self-correcting and non-critical) error if in writable_schema mode. - return Err(LimboError::Corrupt(format!( - "IdxDelete: no matching index entry found for record {:?}", - record - ))); - } + tracing::debug!( + "op_idx_delete: found={:?}, rootpage={}, key={:?}", + found, + cursor.root_page(), + record + ); + } + state.op_idx_delete_state = Some(OpIdxDeleteState::Verifying); + } + Some(OpIdxDeleteState::Verifying) => { + let rowid = { + let mut cursor = state.get_cursor(*cursor_id); + let cursor = cursor.as_btree_mut(); + return_if_io!(cursor.rowid()) + }; + + if rowid.is_none() { + // If P5 is not zero, then raise an SQLITE_CORRUPT_INDEX error if no matching + // index entry is found. This happens when running an UPDATE or DELETE statement and the + // index entry to be updated or deleted is not found. For some uses of IdxDelete + // (example: the EXCEPT operator) it does not matter that no matching entry is found. + // For those cases, P5 is zero. Also, do not raise this (self-correcting and non-critical) error if in writable_schema mode. + return Err(LimboError::Corrupt(format!( + "IdxDelete: no matching index entry found for record {:?}", + make_record(&state.registers, start_reg, num_regs) + ))); } state.op_idx_delete_state = Some(OpIdxDeleteState::Deleting); } @@ -4151,7 +4183,8 @@ pub fn op_no_conflict( return Ok(InsnFunctionStepResult::Step); } - let conflict = return_if_io!(cursor.seek(SeekKey::IndexKey(record), SeekOp::EQ)); + let conflict = + return_if_io!(cursor.seek(SeekKey::IndexKey(record), SeekOp::GE { eq_only: true })); drop(cursor_ref); if !conflict { state.pc = target_pc.to_offset_int(); @@ -4925,10 +4958,10 @@ pub fn op_found( } }; - return_if_io!(cursor.seek(SeekKey::IndexKey(&record), SeekOp::EQ)) + return_if_io!(cursor.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true })) } else { let record = make_record(&state.registers, record_reg, num_regs); - return_if_io!(cursor.seek(SeekKey::IndexKey(&record), SeekOp::EQ)) + return_if_io!(cursor.seek(SeekKey::IndexKey(&record), SeekOp::GE { eq_only: true })) } }; diff --git a/core/vdbe/explain.rs b/core/vdbe/explain.rs index 9aa6588c6..fa6a2a70e 100644 --- a/core/vdbe/explain.rs +++ b/core/vdbe/explain.rs @@ -809,6 +809,7 @@ pub fn insn_to_str( start_reg, num_regs, target_pc, + .. } | Insn::SeekLE { is_index: _, @@ -816,6 +817,7 @@ pub fn insn_to_str( start_reg, num_regs, target_pc, + .. } | Insn::SeekLT { is_index: _, diff --git a/core/vdbe/insn.rs b/core/vdbe/insn.rs index 6eea43318..23b9da480 100644 --- a/core/vdbe/insn.rs +++ b/core/vdbe/insn.rs @@ -502,6 +502,7 @@ pub enum Insn { start_reg: usize, num_regs: usize, target_pc: BranchOffset, + eq_only: bool, }, /// If cursor_id refers to an SQL table (B-Tree that uses integer keys), use the value in start_reg as the key. @@ -538,6 +539,7 @@ pub enum Insn { start_reg: usize, num_regs: usize, target_pc: BranchOffset, + eq_only: bool, }, // If cursor_id refers to an SQL table (B-Tree that uses integer keys), use the value in start_reg as the key. diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index f7f59597c..3e0cffa3c 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -499,17 +499,17 @@ fn get_new_rowid(cursor: &mut BTreeCursor, mut rng: R) -> Result {} CursorResult::IO => return Ok(CursorResult::IO), } - let mut rowid = cursor - .rowid()? - .unwrap_or(0) // if BTree is empty - use 0 as initial value for rowid - .checked_add(1) // add 1 but be careful with overflows - .unwrap_or(i64::MAX); // in case of overflow - use i64::MAX + let mut rowid = match cursor.rowid()? { + CursorResult::Ok(Some(rowid)) => rowid.checked_add(1).unwrap_or(i64::MAX), // add 1 but be careful with overflows, in case of overflow - use i64::MAX + CursorResult::Ok(None) => 1, + CursorResult::IO => return Ok(CursorResult::IO), + }; if rowid > i64::MAX.try_into().unwrap() { let distribution = Uniform::from(1..=i64::MAX); let max_attempts = 100; for count in 0..max_attempts { rowid = distribution.sample(&mut rng).try_into().unwrap(); - match cursor.seek(SeekKey::TableRowId(rowid), SeekOp::EQ)? { + match cursor.seek(SeekKey::TableRowId(rowid), SeekOp::GE { eq_only: true })? { CursorResult::Ok(false) => break, // Found a non-existing rowid CursorResult::Ok(true) => { if count == max_attempts - 1 { diff --git a/tests/integration/query_processing/test_write_path.rs b/tests/integration/query_processing/test_write_path.rs index 624fcca55..edebd2821 100644 --- a/tests/integration/query_processing/test_write_path.rs +++ b/tests/integration/query_processing/test_write_path.rs @@ -428,6 +428,25 @@ fn test_update_with_index() -> anyhow::Result<()> { Ok(()) } +#[test] +fn test_delete_with_index() -> anyhow::Result<()> { + let _ = env_logger::try_init(); + + maybe_setup_tracing(); + + let tmp_db = TempDatabase::new_with_rusqlite("CREATE TABLE t(x UNIQUE)"); + let conn = tmp_db.connect_limbo(); + + run_query(&tmp_db, &conn, "INSERT INTO t VALUES (1), (2)")?; + run_query(&tmp_db, &conn, "DELETE FROM t WHERE x >= 1")?; + + run_query_on_row(&tmp_db, &conn, "SELECT * FROM t", |_| { + panic!("Delete should've deleted every row!"); + })?; + + Ok(()) +} + fn run_query(tmp_db: &TempDatabase, conn: &Rc, query: &str) -> anyhow::Result<()> { run_query_core(tmp_db, conn, query, None::) }