From 9201030e67f814182c57d014183612a79a10b12c Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Mon, 7 Jul 2025 14:55:51 +0300 Subject: [PATCH 01/10] Ephemeral UNION indexes don't need to be UNIQUE --- core/translate/compound_select.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/translate/compound_select.rs b/core/translate/compound_select.rs index 2938c2169..426d396ab 100644 --- a/core/translate/compound_select.rs +++ b/core/translate/compound_select.rs @@ -333,7 +333,7 @@ fn create_dedupe_index( root_page: 0, ephemeral: true, table_name: String::new(), - unique: true, + unique: false, has_rowid: false, }); let cursor_id = program.alloc_cursor_id(CursorType::BTreeIndex(dedupe_index.clone())); From e33ff667dc7a69978f655056cfadafb6eb006ee8 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 8 Jul 2025 18:44:42 +0300 Subject: [PATCH 02/10] btree: use seek() when inserting -- replaces find_cell() --- core/storage/btree.rs | 158 ++---------------------------------------- 1 file changed, 6 insertions(+), 152 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 373f58a22..ea6b7fa0b 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -306,14 +306,6 @@ impl BTreeKey<'_> { BTreeKey::IndexKey(_) => panic!("BTreeKey::to_rowid called on IndexKey"), } } - - /// Assert that the key is an index key and return it. - fn to_index_key_values(&self) -> Vec { - match self { - BTreeKey::TableRowId(_) => panic!("BTreeKey::to_index_key called on TableRowId"), - BTreeKey::IndexKey(key) => key.get_values(), - } - } } #[derive(Clone)] @@ -465,26 +457,6 @@ pub enum CursorSeekState { }, } -#[derive(Debug)] -struct FindCellState(Option<(usize, usize)>); // low, high - -impl FindCellState { - #[inline] - fn set(&mut self, lowhigh: (usize, usize)) { - self.0 = Some(lowhigh); - } - - #[inline] - fn get_state(&mut self) -> (usize, usize) { - self.0.expect("get can only be called after a set") - } - - #[inline] - fn reset(&mut self) { - self.0 = None; - } -} - pub struct BTreeCursor { /// The multi-version cursor that is used to read and write to the database file. mv_cursor: Option>>, @@ -523,8 +495,6 @@ pub struct BTreeCursor { /// Separate state to read a record with overflow pages. This separation from `state` is necessary as /// we can be in a function that relies on `state`, but also needs to process overflow pages read_overflow_state: RefCell>, - /// Contains the current cell_idx for `find_cell` - find_cell_state: FindCellState, /// `RecordCursor` is used to parse SQLite record format data retrieved from B-tree /// leaf pages. It provides incremental parsing, only deserializing the columns that are /// actually accessed, which is crucial for performance when dealing with wide tables @@ -592,7 +562,6 @@ impl BTreeCursor { valid_state: CursorValidState::Valid, seek_state: CursorSeekState::Start, read_overflow_state: RefCell::new(None), - find_cell_state: FindCellState(None), parse_record_state: RefCell::new(ParseRecordState::Init), record_cursor: RefCell::new(RecordCursor::with_capacity(num_columns)), } @@ -2032,30 +2001,7 @@ impl BTreeCursor { (cmp, found) } - fn read_record_w_possible_overflow( - &mut self, - payload: &'static [u8], - next_page: Option, - payload_size: u64, - ) -> Result> { - if let Some(next_page) = next_page { - self.process_overflow_read(payload, next_page, payload_size) - } else { - self.get_immutable_record_or_create() - .as_mut() - .unwrap() - .invalidate(); - self.get_immutable_record_or_create() - .as_mut() - .unwrap() - .start_serialization(payload); - - self.record_cursor.borrow_mut().invalidate(); - Ok(IOResult::Done(())) - } - } - - #[instrument(skip_all, level = Level::DEBUG)] + #[instrument(skip_all, level = Level::INFO)] pub fn move_to(&mut self, key: SeekKey<'_>, cmp: SeekOp) -> Result> { turso_assert!( self.mv_cursor.is_none(), @@ -2139,15 +2085,8 @@ impl BTreeCursor { self.pager.add_dirty(page.get().id); let page = page.get().contents.as_mut().unwrap(); - turso_assert!( - matches!(page.page_type(), PageType::TableLeaf | PageType::IndexLeaf), - "expected table or index leaf page" - ); - - // find cell - (return_if_io!(self.find_cell(page, bkey)), page.page_type()) + (self.stack.current_cell_index() as usize, page.page_type()) }; - self.stack.set_cell_index(cell_idx as i32); tracing::debug!(cell_idx); // if the cell index is less than the total cells, check: if its an existing @@ -2178,8 +2117,8 @@ impl BTreeCursor { continue; } } - BTreeCell::IndexLeafCell(..) => { - // Not necessary to read record again here, as find_cell already does that for us + BTreeCell::IndexLeafCell(..) | BTreeCell::IndexInteriorCell(..) => { + // Not necessary to read record again here, as the prior seek() call already does that for us let cmp = compare_immutable( record_values.as_slice(), self.get_immutable_record() @@ -3910,86 +3849,6 @@ impl BTreeCursor { self.pager.usable_space() } - /// Find the index of the cell in the page that contains the given rowid. - #[instrument( skip_all, level = Level::DEBUG)] - fn find_cell(&mut self, page: &PageContent, key: &BTreeKey) -> Result> { - let cell_count = page.cell_count(); - let mut low = 0; - let mut high = if cell_count > 0 { cell_count - 1 } else { 0 }; - let mut result_index = cell_count; - if self.find_cell_state.0.is_some() { - (low, high) = self.find_cell_state.get_state(); - } - - while low <= high && cell_count > 0 { - let mid = low + (high - low) / 2; - self.find_cell_state.set((low, high)); - let cell = page.cell_get(mid, self.usable_space())?; - - let comparison_result = match cell { - BTreeCell::TableLeafCell(cell) => key.to_rowid().cmp(&cell.rowid), - BTreeCell::TableInteriorCell(cell) => key.to_rowid().cmp(&cell.rowid), - BTreeCell::IndexInteriorCell(IndexInteriorCell { - payload, - first_overflow_page, - payload_size, - .. - }) - | BTreeCell::IndexLeafCell(IndexLeafCell { - payload, - first_overflow_page, - payload_size, - .. - }) => { - // TODO: implement efficient comparison of records - // e.g. https://github.com/sqlite/sqlite/blob/master/src/vdbeaux.c#L4719 - return_if_io!(self.read_record_w_possible_overflow( - payload, - first_overflow_page, - payload_size - )); - - let key_values = key.to_index_key_values(); - let record = self.get_immutable_record(); - let record = record.as_ref().unwrap(); - let record_same_number_cols = &record.get_values()[..key_values.len()]; - compare_immutable( - key_values.as_slice(), - record_same_number_cols, - self.index_info - .as_ref() - .expect("indexbtree_find_cell without index_info") - .key_info - .as_slice(), - ) - } - }; - - match comparison_result { - Ordering::Equal => { - result_index = mid; - break; - } - Ordering::Greater => { - low = mid + 1; - } - Ordering::Less => { - result_index = mid; - if mid == 0 { - break; - } - high = mid - 1; - } - } - } - - self.find_cell_state.reset(); - assert!(result_index <= cell_count); - - Ok(IOResult::Done(result_index)) - } - - #[instrument(skip_all, level = Level::DEBUG)] pub fn seek_end(&mut self) -> Result> { assert!(self.mv_cursor.is_none()); // unsure about this -_- self.move_to_root()?; @@ -4265,20 +4124,15 @@ impl BTreeCursor { moved_before = false; } if !moved_before { - // Use move_to() so that we always end up on a leaf page. seek() might go back to an interior cell in index seeks, - // which we never want. The reason we can use move_to() is that - // find_cell() iterates the leaf page from left to right to find the insertion point anyway, so we don't care - // which cell we are in as long as we are on the right page. - // FIXME: find_cell() should not use linear search because it's slow. match key { BTreeKey::IndexKey(_) => { - return_if_io!(self.move_to( + return_if_io!(self.seek( SeekKey::IndexKey(key.get_record().unwrap()), SeekOp::GE { eq_only: true } )) } BTreeKey::TableRowId(_) => { - return_if_io!(self.move_to( + return_if_io!(self.seek( SeekKey::TableRowId(key.to_rowid()), SeekOp::GE { eq_only: true } )) From 2b23495943844f861ea39a1cc8a4b106a9c6b9a9 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Mon, 7 Jul 2025 15:56:36 +0300 Subject: [PATCH 03/10] btree: allow overwriting index interior cell --- core/storage/btree.rs | 79 +++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 30 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index ea6b7fa0b..a4ac9f5f2 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2077,15 +2077,14 @@ impl BTreeCursor { return_if_locked_maybe_load!(self.pager, page); // get page and find cell - let (cell_idx, page_type) = { + let cell_idx = { return_if_locked!(page.get()); let page = page.get(); page.set_dirty(); self.pager.add_dirty(page.get().id); - let page = page.get().contents.as_mut().unwrap(); - (self.stack.current_cell_index() as usize, page.page_type()) + self.stack.current_cell_index() as usize }; tracing::debug!(cell_idx); @@ -2118,7 +2117,7 @@ impl BTreeCursor { } } BTreeCell::IndexLeafCell(..) | BTreeCell::IndexInteriorCell(..) => { - // Not necessary to read record again here, as the prior seek() call already does that for us + return_if_io!(self.record()); let cmp = compare_immutable( record_values.as_slice(), self.get_immutable_record() @@ -2145,18 +2144,25 @@ impl BTreeCursor { self.save_context(CursorContext::IndexKeyRowId((*record).clone())); } continue; + } else { + turso_assert!( + !matches!(cell, BTreeCell::IndexInteriorCell(..)), + "we should not be inserting a new index interior cell. the only valid operation on an index interior cell is an overwrite!" + ); } } other => panic!("unexpected cell type, expected TableLeaf or IndexLeaf, found: {other:?}"), } } + // insert cell let mut cell_payload: Vec = Vec::with_capacity(record_values.len() + 4); fill_cell_payload( - page_type, + page.get().get().contents.as_ref().unwrap(), bkey.maybe_rowid(), &mut cell_payload, + cell_idx, record, self.usable_space() as u16, self.pager.clone(), @@ -4819,14 +4825,16 @@ impl BTreeCursor { record: &ImmutableRecord, ) -> Result> { // build the new payload - let page_type = page_ref.get().get().contents.as_ref().unwrap().page_type(); + let page = page_ref.get(); + let page_contents = page.get().contents.as_ref().unwrap(); let serial_types_len = self.record_cursor.borrow_mut().len(record); let mut new_payload = Vec::with_capacity(serial_types_len); let rowid = return_if_io!(self.rowid()); fill_cell_payload( - page_type, + page_contents, rowid, &mut new_payload, + cell_idx, record, self.usable_space() as u16, self.pager.clone(), @@ -6127,9 +6135,10 @@ fn _insert_into_cell( ) -> Result<()> { assert!( cell_idx <= page.cell_count() + page.overflow_cells.len(), - "attempting to add cell to an incorrect place cell_idx={} cell_count={}", + "attempting to add cell to an incorrect place cell_idx={} cell_count={} page_type={:?}", cell_idx, - page.cell_count() + page.cell_count(), + page.page_type() ); let already_has_overflow = !page.overflow_cells.is_empty(); let enough_space = if already_has_overflow && !allow_regular_insert_despite_overflow { @@ -6317,21 +6326,25 @@ fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) - /// Fill in the cell payload with the record. /// If the record is too large to fit in the cell, it will spill onto overflow pages. fn fill_cell_payload( - page_type: PageType, + page_contents: &PageContent, int_key: Option, cell_payload: &mut Vec, + cell_idx: usize, record: &ImmutableRecord, usable_space: u16, pager: Rc, ) { - assert!(matches!( - page_type, - PageType::TableLeaf | PageType::IndexLeaf - )); // TODO: make record raw from start, having to serialize is not good let record_buf = record.get_payload().to_vec(); + let page_type = page_contents.page_type(); // fill in header + if matches!(page_type, PageType::IndexInterior) { + // if a write happened on an index interior page, it is always an overwrite. + // we must copy the left child pointer of the replaced cell to the new cell. + let left_child_page = page_contents.cell_interior_read_left_child_page(cell_idx); + cell_payload.extend_from_slice(&left_child_page.to_be_bytes()); + } if matches!(page_type, PageType::TableLeaf) { let int_key = int_key.unwrap(); write_varint_to_vec(record_buf.len() as u64, cell_payload); @@ -6565,9 +6578,10 @@ mod tests { ) -> Vec { let mut payload: Vec = Vec::new(); fill_cell_payload( - page.page_type(), + page, Some(id as i64), &mut payload, + pos, &record, 4096, conn.pager.borrow().clone(), @@ -7711,9 +7725,10 @@ mod tests { let record = ImmutableRecord::from_registers(regs, regs.len()); let mut payload: Vec = Vec::new(); fill_cell_payload( - page.page_type(), + page, Some(i as i64), &mut payload, + cell_idx, &record, 4096, conn.pager.borrow().clone(), @@ -7784,9 +7799,10 @@ mod tests { let record = ImmutableRecord::from_registers(regs, regs.len()); let mut payload: Vec = Vec::new(); fill_cell_payload( - page.page_type(), + page, Some(i), &mut payload, + cell_idx, &record, 4096, conn.pager.borrow().clone(), @@ -8148,9 +8164,10 @@ mod tests { let record = ImmutableRecord::from_registers(regs, regs.len()); let mut payload: Vec = Vec::new(); fill_cell_payload( - page.get().get_contents().page_type(), + page.get().get_contents(), Some(0), &mut payload, + 0, &record, 4096, conn.pager.borrow().clone(), @@ -8225,9 +8242,10 @@ mod tests { let record = ImmutableRecord::from_registers(regs, regs.len()); let mut payload: Vec = Vec::new(); fill_cell_payload( - page.get().get_contents().page_type(), + page.get().get_contents(), Some(0), &mut payload, + 0, &record, 4096, conn.pager.borrow().clone(), @@ -8580,7 +8598,7 @@ mod tests { // add a bunch of cells while compute_free_space(page.get_contents(), pager.usable_space() as u16) >= size + 10 { - insert_cell(i, size, page.get_contents(), pager.clone(), page_type); + insert_cell(i, size, page.get_contents(), pager.clone()); i += 1; size = (rng.next_u64() % 1024) as u16; } @@ -8637,24 +8655,25 @@ mod tests { } } - fn insert_cell( - i: u64, - size: u16, - contents: &mut PageContent, - pager: Rc, - page_type: PageType, - ) { + fn insert_cell(cell_idx: u64, size: u16, contents: &mut PageContent, pager: Rc) { let mut payload = Vec::new(); let regs = &[Register::Value(Value::Blob(vec![0; size as usize]))]; let record = ImmutableRecord::from_registers(regs, regs.len()); fill_cell_payload( - page_type, - Some(i as i64), + contents, + Some(cell_idx as i64), &mut payload, + cell_idx as usize, &record, pager.usable_space() as u16, pager.clone(), ); - insert_into_cell(contents, &payload, i as usize, pager.usable_space() as u16).unwrap(); + insert_into_cell( + contents, + &payload, + cell_idx as usize, + pager.usable_space() as u16, + ) + .unwrap(); } } From 4f0ef663e2da61884303abc9ef64aac3de7564be Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Mon, 7 Jul 2025 17:31:16 +0300 Subject: [PATCH 04/10] btree: add target cell tracking for EQ seeks --- core/storage/btree.rs | 82 +++++++++++++++++++++++++++++++------------ core/vdbe/execute.rs | 5 ++- 2 files changed, 64 insertions(+), 23 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index a4ac9f5f2..10a003cb8 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -454,6 +454,18 @@ pub enum CursorSeekState { /// 1. We have not seen an EQ during the traversal /// 2. We are looking for an exact match ([SeekOp::GE] or [SeekOp::LE] with eq_only: true) eq_seen: Cell, + /// In multiple places, we do a seek that checks for an exact match (SeekOp::EQ) in the tree. + /// In those cases, we need to know where to land if we don't find an exact match in the leaf page. + /// For non-eq-only conditions (GT, LT, GE, LE), this is pretty simple: + /// - If we are looking for GT/GE and don't find a match, we should end up beyond the end of the page (idx=cell count). + /// - If we are looking for LT/LE and don't find a match, we should end up before the beginning of the page (idx=-1). + /// + /// For eq-only conditions (GE { eq_only: true } or LE { eq_only: true }), we need to know where to land if we don't find an exact match. + /// For GE, we want to land at the first cell that is greater than the seek key. + /// For LE, we want to land at the last cell that is less than the seek key. + /// This is because e.g. when we attempt to insert rowid 666, we first check if it exists. + /// If it doesn't, we want to land in the place where rowid 666 WOULD be inserted. + target_cell_when_not_found: Cell, }, } @@ -1712,6 +1724,10 @@ impl BTreeCursor { max_cell_idx, nearest_matching_cell, eq_seen: Cell::new(false), // not relevant for table btrees + target_cell_when_not_found: Cell::new(match seek_op.iteration_direction() { + IterationDirection::Forwards => cell_count as i32, + IterationDirection::Backwards => -1, + }), }; } @@ -1719,6 +1735,7 @@ impl BTreeCursor { min_cell_idx, max_cell_idx, nearest_matching_cell, + target_cell_when_not_found, .. } = &self.seek_state else { @@ -1736,6 +1753,7 @@ impl BTreeCursor { if min > max { if let Some(nearest_matching_cell) = nearest_matching_cell.get() { self.stack.set_cell_index(nearest_matching_cell as i32); + self.has_record.set(true); return Ok(IOResult::Done(SeekResult::Found)); } else { // if !eq_only - matching entry can exist in neighbour leaf page @@ -1743,19 +1761,14 @@ impl BTreeCursor { // in such case BTree can navigate to the leaf which no longer has matching key for seek_op // in this case, caller must advance cursor if necessary return Ok(IOResult::Done(if seek_op.eq_only() { + let has_record = target_cell_when_not_found.get() >= 0 + && target_cell_when_not_found.get() < contents.cell_count() as i32; + self.has_record.set(has_record); + self.stack.set_cell_index(target_cell_when_not_found.get()); SeekResult::NotFound } else { - let contents = page.get().contents.as_ref().unwrap(); - turso_assert!( - contents.is_leaf(), - "tablebtree_seek() called on non-leaf page" - ); - let cell_count = contents.cell_count(); // set cursor to the position where which would hold the op-boundary if it were present - self.stack.set_cell_index(match &seek_op { - SeekOp::GT | SeekOp::GE { .. } => cell_count as i32, - SeekOp::LT | SeekOp::LE { .. } => 0, - }); + self.stack.set_cell_index(target_cell_when_not_found.get()); SeekResult::TryAdvance })); }; @@ -1778,6 +1791,7 @@ impl BTreeCursor { // rowids are unique, so we can return the rowid immediately if found && seek_op.eq_only() { self.stack.set_cell_index(cur_cell_idx as i32); + self.has_record.set(true); return Ok(IOResult::Done(SeekResult::Found)); } @@ -1792,8 +1806,16 @@ impl BTreeCursor { } } } else if cmp.is_gt() { + if matches!(seek_op, SeekOp::GE { eq_only: true }) { + target_cell_when_not_found + .set(target_cell_when_not_found.get().min(cur_cell_idx as i32)); + } max_cell_idx.set(cur_cell_idx - 1); } else if cmp.is_lt() { + if matches!(seek_op, SeekOp::LE { eq_only: true }) { + target_cell_when_not_found + .set(target_cell_when_not_found.get().max(cur_cell_idx as i32)); + } min_cell_idx.set(cur_cell_idx + 1); } else { match iter_dir { @@ -1865,6 +1887,10 @@ impl BTreeCursor { max_cell_idx: max, nearest_matching_cell, eq_seen: Cell::new(eq_seen), + target_cell_when_not_found: Cell::new(match seek_op.iteration_direction() { + IterationDirection::Forwards => cell_count as i32, + IterationDirection::Backwards => -1, + }), }; } @@ -1873,6 +1899,7 @@ impl BTreeCursor { max_cell_idx, nearest_matching_cell, eq_seen, + target_cell_when_not_found, } = &self.seek_state else { unreachable!( @@ -1885,7 +1912,6 @@ impl BTreeCursor { return_if_locked_maybe_load!(self.pager, page); let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); - let cell_count = contents.cell_count(); let iter_dir = seek_op.iteration_direction(); @@ -1895,19 +1921,21 @@ impl BTreeCursor { if min > max { if let Some(nearest_matching_cell) = nearest_matching_cell.get() { self.stack.set_cell_index(nearest_matching_cell as i32); + self.has_record.set(true); return Ok(IOResult::Done(SeekResult::Found)); } else { + // set cursor to the position where which would hold the op-boundary if it were present + let target_cell = target_cell_when_not_found.get(); + self.stack.set_cell_index(target_cell); + let has_record = target_cell >= 0 && target_cell < contents.cell_count() as i32; + self.has_record.set(has_record); + // Similar logic as in tablebtree_seek(), but for indexes. // The difference is that since index keys are not necessarily unique, we need to TryAdvance // even when eq_only=true and we have seen an EQ match up in the tree in an interior node. if seek_op.eq_only() && !eq_seen.get() { return Ok(IOResult::Done(SeekResult::NotFound)); } - // set cursor to the position where which would hold the op-boundary if it were present - self.stack.set_cell_index(match &seek_op { - SeekOp::GT | SeekOp::GE { .. } => cell_count as i32, - SeekOp::LT | SeekOp::LE { .. } => 0, - }); return Ok(IOResult::Done(SeekResult::TryAdvance)); }; } @@ -1958,8 +1986,16 @@ impl BTreeCursor { } } } else if cmp.is_gt() { + if matches!(seek_op, SeekOp::GE { eq_only: true }) { + target_cell_when_not_found + .set(target_cell_when_not_found.get().min(cur_cell_idx as i32)); + } max_cell_idx.set(cur_cell_idx - 1); } else if cmp.is_lt() { + if matches!(seek_op, SeekOp::LE { eq_only: true }) { + target_cell_when_not_found + .set(target_cell_when_not_found.get().max(cur_cell_idx as i32)); + } min_cell_idx.set(cur_cell_idx + 1); } else { match iter_dir { @@ -2084,8 +2120,13 @@ impl BTreeCursor { page.set_dirty(); self.pager.add_dirty(page.get().id); - self.stack.current_cell_index() as usize + self.stack.current_cell_index() }; + if cell_idx == -1 { + // This might be a brand new table and the cursor hasn't moved yet. Let's advance it to the first slot. + self.stack.set_cell_index(0); + } + let cell_idx = self.stack.current_cell_index() as usize; tracing::debug!(cell_idx); // if the cell index is less than the total cells, check: if its an existing @@ -4013,8 +4054,6 @@ impl BTreeCursor { // Reset seek state self.seek_state = CursorSeekState::Start; self.valid_state = CursorValidState::Valid; - self.has_record - .replace(matches!(seek_result, SeekResult::Found)); Ok(IOResult::Done(seek_result)) } @@ -4560,10 +4599,9 @@ impl BTreeCursor { }; let seek_result = return_if_io!(self.seek(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true })); - let has_record = matches!(seek_result, SeekResult::Found); - self.has_record.set(has_record); + let exists = matches!(seek_result, SeekResult::Found); self.invalidate_record(); - Ok(IOResult::Done(has_record)) + Ok(IOResult::Done(exists)) } /// Clear the overflow pages linked to a specific page provided by the leaf cell diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index f5de1a998..0aa905e54 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -4884,7 +4884,10 @@ pub fn op_insert( Register::Aggregate(..) => unreachable!("Cannot insert an aggregate value."), }; - // query planner must emit NewRowId/NotExists/etc op-codes which will properly reposition cursor + // In a table insert, we can always assume we have moved to the correct place, because + // 1. if the rowid is autogenerated, op_new_rowid() always gets called and we seek to the end of the table. + // 2. if the rowid is not autogenerated, op_not_exists() always gets called to ensure the rowid being inserted does not exist. + // Both of these code paths perform a seek, which lands us at the correct insertion place. return_if_io!(cursor.insert(&BTreeKey::new_table_rowid(key, Some(record.as_ref())), true)); } From 9ee6988fc5995699b2b9e5c9a1fa101058a61b2f Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Mon, 7 Jul 2025 17:52:10 +0300 Subject: [PATCH 05/10] VDBE: NewRowid needs to call next() in case op_not_exists() is not called afterwards --- core/vdbe/execute.rs | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 0aa905e54..04079a72b 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -5195,8 +5195,17 @@ pub enum OpNewRowidState { Start, SeekingToLast, ReadingMaxRowid, - GeneratingRandom { attempts: u32 }, - VerifyingCandidate { attempts: u32, candidate: i64 }, + GeneratingRandom { + attempts: u32, + }, + VerifyingCandidate { + attempts: u32, + candidate: i64, + }, + /// In case a rowid was generated and not provided by the user, we need to call next() on the cursor + /// after generating the rowid. This is because the rowid was generated by seeking to the last row in the + /// table, and we need to insert _after_ that row. + GoNext, } pub fn op_new_rowid( @@ -5253,9 +5262,8 @@ pub fn op_new_rowid( Some(rowid) if rowid < MAX_ROWID => { // Can use sequential state.registers[*rowid_reg] = Register::Value(Value::Integer(rowid + 1)); - state.op_new_rowid_state = OpNewRowidState::Start; - state.pc += 1; - return Ok(InsnFunctionStepResult::Step); + state.op_new_rowid_state = OpNewRowidState::GoNext; + continue; } Some(_) => { // Must use random (rowid == MAX_ROWID) @@ -5265,9 +5273,8 @@ pub fn op_new_rowid( None => { // Empty table state.registers[*rowid_reg] = Register::Value(Value::Integer(1)); - state.op_new_rowid_state = OpNewRowidState::Start; - state.pc += 1; - return Ok(InsnFunctionStepResult::Step); + state.op_new_rowid_state = OpNewRowidState::GoNext; + continue; } } } @@ -5318,6 +5325,16 @@ pub fn op_new_rowid( }; } } + OpNewRowidState::GoNext => { + { + let mut cursor = state.get_cursor(*cursor); + let cursor = cursor.as_btree_mut(); + return_if_io!(cursor.next()); + } + state.op_new_rowid_state = OpNewRowidState::Start; + state.pc += 1; + return Ok(InsnFunctionStepResult::Step); + } } } } From 55151a80618de824ca339f604d59379f99dac3a6 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Mon, 7 Jul 2025 18:42:18 +0300 Subject: [PATCH 06/10] Fix cases where Insn::Insert needs to seek to ensure correct insertion --- core/translate/emitter.rs | 8 +++++++- core/translate/insert.rs | 3 ++- core/translate/result_row.rs | 3 ++- core/vdbe/execute.rs | 15 ++++++++++----- core/vdbe/insn.rs | 9 +++------ 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index 36915f720..140ec8c3f 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -1153,7 +1153,13 @@ fn emit_update_insns( cursor: cursor_id, key_reg: rowid_set_clause_reg.unwrap_or(beg), record_reg, - flag: InsertFlags::new().update(true), + flag: if has_user_provided_rowid { + // The previous Insn::NotExists and Insn::Delete seek to the old rowid, + // so to insert a new user-provided rowid, we need to seek to the correct place. + InsertFlags::new().require_seek() + } else { + InsertFlags::new() + }, table_name: table_ref.identifier.clone(), }); diff --git a/core/translate/insert.rs b/core/translate/insert.rs index 868aacf50..578682684 100644 --- a/core/translate/insert.rs +++ b/core/translate/insert.rs @@ -231,7 +231,8 @@ pub fn translate_insert( cursor: temp_cursor_id, key_reg: rowid_reg, record_reg, - flag: InsertFlags::new(), + // since we are not doing an Insn::NewRowid or an Insn::NotExists here, we need to seek to ensure the insertion happens in the correct place. + flag: InsertFlags::new().require_seek(), table_name: "".to_string(), }); diff --git a/core/translate/result_row.rs b/core/translate/result_row.rs index 62c6bc96a..4a7b78890 100644 --- a/core/translate/result_row.rs +++ b/core/translate/result_row.rs @@ -128,7 +128,8 @@ pub fn emit_result_row_and_limit( cursor: *table_cursor_id, key_reg: result_columns_start_reg + (plan.result_columns.len() - 1), // Rowid reg is the last register record_reg, - flag: InsertFlags(0), + // since we are not doing an Insn::NewRowid or an Insn::NotExists here, we need to seek to ensure the insertion happens in the correct place. + flag: InsertFlags::new().require_seek(), table_name: table.name.clone(), }); } diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 04079a72b..1ced21636 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -13,6 +13,7 @@ use crate::types::{ compare_immutable, compare_records_generic, ImmutableRecord, SeekResult, Text, TextSubtype, }; use crate::util::normalize_ident; +use crate::vdbe::insn::InsertFlags; use crate::vdbe::registers_to_ref_values; use crate::{ error::{ @@ -4884,11 +4885,15 @@ pub fn op_insert( Register::Aggregate(..) => unreachable!("Cannot insert an aggregate value."), }; - // In a table insert, we can always assume we have moved to the correct place, because - // 1. if the rowid is autogenerated, op_new_rowid() always gets called and we seek to the end of the table. - // 2. if the rowid is not autogenerated, op_not_exists() always gets called to ensure the rowid being inserted does not exist. - // Both of these code paths perform a seek, which lands us at the correct insertion place. - return_if_io!(cursor.insert(&BTreeKey::new_table_rowid(key, Some(record.as_ref())), true)); + // In a table insert, if the caller does not pass InsertFlags::REQUIRE_SEEK, they must ensure that a seek has already happened to the correct location. + // This typically happens by invoking either Insn::NewRowid or Insn::NotExists, because: + // 1. op_new_rowid() seeks to the end of the table, which is the correct insertion position. + // 2. op_not_exists() seeks to the position in the table where the target rowid would be inserted. + let moved_before = !flag.has(InsertFlags::REQUIRE_SEEK); + return_if_io!(cursor.insert( + &BTreeKey::new_table_rowid(key, Some(record.as_ref())), + moved_before + )); } // Only update last_insert_rowid for regular table inserts, not schema modifications diff --git a/core/vdbe/insn.rs b/core/vdbe/insn.rs index 37acbc5e3..90a3fcec1 100644 --- a/core/vdbe/insn.rs +++ b/core/vdbe/insn.rs @@ -109,6 +109,7 @@ pub struct InsertFlags(pub u8); impl InsertFlags { pub const UPDATE: u8 = 0x01; // Flag indicating this is part of an UPDATE statement + pub const REQUIRE_SEEK: u8 = 0x02; // Flag indicating that a seek is required to insert the row pub fn new() -> Self { InsertFlags(0) @@ -118,12 +119,8 @@ impl InsertFlags { (self.0 & flag) != 0 } - pub fn update(mut self, is_update: bool) -> Self { - if is_update { - self.0 |= InsertFlags::UPDATE; - } else { - self.0 &= !InsertFlags::UPDATE; - } + pub fn require_seek(mut self) -> Self { + self.0 |= InsertFlags::REQUIRE_SEEK; self } } From fdeb15bb9d01167a20626d0c87b05a0c263e51e3 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 8 Jul 2025 15:13:49 +0300 Subject: [PATCH 07/10] btree/delete: rightmost_cell_was_dropped logic is not needed since a) if we balance, we seek anyway, and b) if we dont balance, we retreat anyway --- core/storage/btree.rs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 10a003cb8..0e140cccb 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -196,7 +196,6 @@ enum DeleteState { post_balancing_seek_key: Option, }, CheckNeedsBalancing { - rightmost_cell_was_dropped: bool, post_balancing_seek_key: Option, }, WaitForBalancingToComplete { @@ -4348,12 +4347,10 @@ impl BTreeCursor { post_balancing_seek_key, }; } else { - let is_last_cell = cell_idx == contents.cell_count().saturating_sub(1); drop_cell(contents, cell_idx, self.usable_space() as u16)?; let delete_info = self.state.mut_delete_info().unwrap(); delete_info.state = DeleteState::CheckNeedsBalancing { - rightmost_cell_was_dropped: is_last_cell, post_balancing_seek_key, }; } @@ -4454,13 +4451,11 @@ impl BTreeCursor { let delete_info = self.state.mut_delete_info().unwrap(); delete_info.state = DeleteState::CheckNeedsBalancing { - rightmost_cell_was_dropped: false, post_balancing_seek_key, }; } DeleteState::CheckNeedsBalancing { - rightmost_cell_was_dropped, post_balancing_seek_key, } => { let page = self.stack.top(); @@ -4472,15 +4467,6 @@ impl BTreeCursor { let needs_balancing = self.stack.has_parent() && free_space as usize * 3 > self.usable_space() * 2; - if rightmost_cell_was_dropped { - // If we drop a cell in the middle, e.g. our current index is 2 and we drop 'c' from [a,b,c,d,e], then we don't need to retreat index, - // because index 2 is still going to be the right place [a,b,D,e] - // but: - // If we drop the rightmost cell, e.g. our current index is 4 and we drop 'e' from [a,b,c,d,e], then we need to retreat index, - // because index 4 is now pointing beyond the last cell [a,b,c,d] _ <-- index 4 - self.stack.retreat(); - } - if needs_balancing { let delete_info = self.state.mut_delete_info().unwrap(); if delete_info.balance_write_info.is_none() { From 28c050dd27e9d405be55b1b1d9c6095510e1d7df Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 8 Jul 2025 22:04:46 +0300 Subject: [PATCH 08/10] seek before insert to ensure correct location in fuzz test --- core/storage/btree.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 0e140cccb..fcfbbc209 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -7161,6 +7161,15 @@ mod tests { .map(|col| Register::Value(Value::Integer(*col))) .collect::>(); let value = ImmutableRecord::from_registers(®s, regs.len()); + run_until_done( + || { + let record = ImmutableRecord::from_registers(®s, regs.len()); + let key = SeekKey::IndexKey(&record); + cursor.seek(key, SeekOp::GE { eq_only: true }) + }, + pager.deref(), + ) + .unwrap(); run_until_done( || { cursor.insert( From 2a2ab16c5214244579de2a8b87ac7ebd2dbf6cce Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Thu, 17 Jul 2025 09:35:12 +0300 Subject: [PATCH 09/10] fix moved_before handling in cursor.insert --- core/storage/btree.rs | 42 ++++++++++++++++++++++++++++++++++++------ core/types.rs | 2 +- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index fcfbbc209..5bbca79f7 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -424,6 +424,8 @@ pub enum CursorValidState { Valid, /// Cursor may be pointing to a non-existent location/cell. This can happen after balancing operations RequireSeek, + /// Cursor requires an advance after a seek + RequireAdvance(IterationDirection), } #[derive(Debug)] @@ -4149,7 +4151,10 @@ impl BTreeCursor { pub fn insert( &mut self, key: &BTreeKey, - mut moved_before: bool, /* Indicate whether it's necessary to traverse to find the leaf page */ + // Indicate whether it's necessary to traverse to find the leaf page + // FIXME: refactor this out into a state machine, these ad-hoc state + // variables are very hard to reason about + mut moved_before: bool, ) -> Result> { tracing::debug!(valid_state = ?self.valid_state, cursor_state = ?self.state, is_write_in_progress = self.is_write_in_progress()); match &self.mv_cursor { @@ -4163,12 +4168,32 @@ impl BTreeCursor { None => todo!("Support mvcc inserts with index btrees"), }, None => { - if self.valid_state != CursorValidState::Valid && !self.is_write_in_progress() { - // A balance happened so we need to move. - moved_before = false; - } + match (&self.valid_state, self.is_write_in_progress()) { + (CursorValidState::Valid, _) => { + // consider the current position valid unless the caller explicitly asks us to seek. + } + (CursorValidState::RequireSeek, false) => { + // we must seek. + moved_before = false; + } + (CursorValidState::RequireSeek, true) => { + // illegal to seek during a write no matter what CursorValidState or caller says -- we might e.g. move to the wrong page during balancing + moved_before = true; + } + (CursorValidState::RequireAdvance(direction), _) => { + // FIXME: this is a hack to support the case where we need to advance the cursor after a seek. + // We should have a proper state machine for this. + return_if_io!(match direction { + IterationDirection::Forwards => self.next(), + IterationDirection::Backwards => self.prev(), + }); + self.valid_state = CursorValidState::Valid; + self.seek_state = CursorSeekState::Start; + moved_before = true; + } + }; if !moved_before { - match key { + let seek_result = match key { BTreeKey::IndexKey(_) => { return_if_io!(self.seek( SeekKey::IndexKey(key.get_record().unwrap()), @@ -4182,6 +4207,11 @@ impl BTreeCursor { )) } }; + if SeekResult::TryAdvance == seek_result { + self.valid_state = + CursorValidState::RequireAdvance(IterationDirection::Forwards); + return_if_io!(self.next()); + } self.context.take(); // we know where we wanted to move so if there was any saved context, discard it. self.valid_state = CursorValidState::Valid; self.seek_state = CursorSeekState::Start; diff --git a/core/types.rs b/core/types.rs index 2bc88acd6..0beb024b8 100644 --- a/core/types.rs +++ b/core/types.rs @@ -2235,7 +2235,7 @@ macro_rules! return_if_io { }; } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum SeekResult { /// Record matching the [SeekOp] found in the B-tree and cursor was positioned to point onto that record Found, From 40df1725c57da86c11b98f6c327fe1a0fa87c6ae Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Fri, 18 Jul 2025 12:54:24 +0300 Subject: [PATCH 10/10] Fix restore_context() not advancing when required --- core/storage/btree.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5bbca79f7..1e83592a2 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -5054,7 +5054,19 @@ impl BTreeCursor { /// If context is defined, restore it and set it None on success #[instrument(skip_all, level = Level::DEBUG)] fn restore_context(&mut self) -> Result> { - if self.context.is_none() || !matches!(self.valid_state, CursorValidState::RequireSeek) { + if self.context.is_none() || matches!(self.valid_state, CursorValidState::Valid) { + return Ok(IOResult::Done(())); + } + if let CursorValidState::RequireAdvance(direction) = self.valid_state { + let has_record = return_if_io!(match direction { + // Avoid calling next()/prev() directly because they immediately call restore_context() + IterationDirection::Forwards => self.get_next_record(), + IterationDirection::Backwards => self.get_prev_record(), + }); + self.has_record.set(has_record); + self.invalidate_record(); + self.context = None; + self.valid_state = CursorValidState::Valid; return Ok(IOResult::Done(())); } let ctx = self.context.take().unwrap(); @@ -5064,7 +5076,13 @@ impl BTreeCursor { }; let res = self.seek(seek_key, SeekOp::GE { eq_only: true })?; match res { - IOResult::Done(_) => { + IOResult::Done(res) => { + if let SeekResult::TryAdvance = res { + self.valid_state = + CursorValidState::RequireAdvance(IterationDirection::Forwards); + self.context = Some(ctx); + return Ok(IOResult::IO); + } self.valid_state = CursorValidState::Valid; Ok(IOResult::Done(())) }