Merge 'Remove find_cell, allow overwrite of index interior cell' from Jussi Saurio

Currently we are using `find_cell` which does a binary search on every insert, even if the cursor is already in the correct place. 1. Remove `find_cell` and use `seek` when inserting instead of `move_to`. 2. After removing `find_cell`, we now need to be more mindful of when we do actually require a seek before insertion. This is signified by the addition of `InsertFlags::REQUIRE_SEEK`. Previously some inserts worked by accident because we always happened to be on the correct page -- `find_cell()´ only binary searches a single page. 3. After removing `find_cell` we also need to call `next()` after `Insn::NewRowid`, because `NewRowid` positions us at the old maximum rowid, and we need to be positioned after it. 4. After removing `find_cell` we also need to leave the cursor positioned correctly during seeks where we don't find a match. 5. Allow overwrite of an index interior cell, which Closes #1975. Reviewed-by: Pere Diaz Bou <pere-altea@homail.com> Closes #1988
2026-02-23 00:45:37 +01:00 · 2025-07-18 14:09:39 +03:00
parent f7068ce23a 40df1725c5
commit bca1e56f49
8 changed files with 228 additions and 244 deletions
--- a/core/storage/btree.rs
+++ b/core/storage/btree.rs
@@ -196,7 +196,6 @@ enum DeleteState {
        post_balancing_seek_key: Option<DeleteSavepoint>,
    },
    CheckNeedsBalancing {
-        rightmost_cell_was_dropped: bool,
        post_balancing_seek_key: Option<DeleteSavepoint>,
    },
    WaitForBalancingToComplete {
@@ -306,14 +305,6 @@ impl BTreeKey<'_> {
            BTreeKey::IndexKey(_) => panic!("BTreeKey::to_rowid called on IndexKey"),
        }
    }
-
-    /// Assert that the key is an index key and return it.
-    fn to_index_key_values(&self) -> Vec<RefValue> {
-        match self {
-            BTreeKey::TableRowId(_) => panic!("BTreeKey::to_index_key called on TableRowId"),
-            BTreeKey::IndexKey(key) => key.get_values(),
-        }
-    }
 }

 #[derive(Clone)]
@@ -433,6 +424,8 @@ pub enum CursorValidState {
    Valid,
    /// Cursor may be pointing to a non-existent location/cell. This can happen after balancing operations
    RequireSeek,
+    /// Cursor requires an advance after a seek
+    RequireAdvance(IterationDirection),
 }

 #[derive(Debug)]
@@ -462,29 +455,21 @@ pub enum CursorSeekState {
        /// 1. We have not seen an EQ during the traversal
        /// 2. We are looking for an exact match ([SeekOp::GE] or [SeekOp::LE] with eq_only: true)
        eq_seen: Cell<bool>,
+        /// In multiple places, we do a seek that checks for an exact match (SeekOp::EQ) in the tree.
+        /// In those cases, we need to know where to land if we don't find an exact match in the leaf page.
+        /// For non-eq-only conditions (GT, LT, GE, LE), this is pretty simple:
+        /// - If we are looking for GT/GE and don't find a match, we should end up beyond the end of the page (idx=cell count).
+        /// - If we are looking for LT/LE and don't find a match, we should end up before the beginning of the page (idx=-1).
+        ///
+        /// For eq-only conditions (GE { eq_only: true } or LE { eq_only: true }), we need to know where to land if we don't find an exact match.
+        /// For GE, we want to land at the first cell that is greater than the seek key.
+        /// For LE, we want to land at the last cell that is less than the seek key.
+        /// This is because e.g. when we attempt to insert rowid 666, we first check if it exists.
+        /// If it doesn't, we want to land in the place where rowid 666 WOULD be inserted.
+        target_cell_when_not_found: Cell<i32>,
    },
 }

-#[derive(Debug)]
-struct FindCellState(Option<(usize, usize)>); // low, high
-
-impl FindCellState {
-    #[inline]
-    fn set(&mut self, lowhigh: (usize, usize)) {
-        self.0 = Some(lowhigh);
-    }
-
-    #[inline]
-    fn get_state(&mut self) -> (usize, usize) {
-        self.0.expect("get can only be called after a set")
-    }
-
-    #[inline]
-    fn reset(&mut self) {
-        self.0 = None;
-    }
-}
-
 pub struct BTreeCursor {
    /// The multi-version cursor that is used to read and write to the database file.
    mv_cursor: Option<Rc<RefCell<MvCursor>>>,
@@ -523,8 +508,6 @@ pub struct BTreeCursor {
    /// Separate state to read a record with overflow pages. This separation from `state` is necessary as
    /// we can be in a function that relies on `state`, but also needs to process overflow pages
    read_overflow_state: RefCell<Option<ReadPayloadOverflow>>,
-    /// Contains the current cell_idx for `find_cell`
-    find_cell_state: FindCellState,
    /// `RecordCursor` is used to parse SQLite record format data retrieved from B-tree
    /// leaf pages. It provides incremental parsing, only deserializing the columns that are
    /// actually accessed, which is crucial for performance when dealing with wide tables
@@ -592,7 +575,6 @@ impl BTreeCursor {
            valid_state: CursorValidState::Valid,
            seek_state: CursorSeekState::Start,
            read_overflow_state: RefCell::new(None),
-            find_cell_state: FindCellState(None),
            parse_record_state: RefCell::new(ParseRecordState::Init),
            record_cursor: RefCell::new(RecordCursor::with_capacity(num_columns)),
        }
@@ -1743,6 +1725,10 @@ impl BTreeCursor {
                max_cell_idx,
                nearest_matching_cell,
                eq_seen: Cell::new(false), // not relevant for table btrees
+                target_cell_when_not_found: Cell::new(match seek_op.iteration_direction() {
+                    IterationDirection::Forwards => cell_count as i32,
+                    IterationDirection::Backwards => -1,
+                }),
            };
        }

@@ -1750,6 +1736,7 @@ impl BTreeCursor {
            min_cell_idx,
            max_cell_idx,
            nearest_matching_cell,
+            target_cell_when_not_found,
            ..
        } = &self.seek_state
        else {
@@ -1767,6 +1754,7 @@ impl BTreeCursor {
            if min > max {
                if let Some(nearest_matching_cell) = nearest_matching_cell.get() {
                    self.stack.set_cell_index(nearest_matching_cell as i32);
+                    self.has_record.set(true);
                    return Ok(IOResult::Done(SeekResult::Found));
                } else {
                    // if !eq_only - matching entry can exist in neighbour leaf page
@@ -1774,19 +1762,14 @@ impl BTreeCursor {
                    // in such case BTree can navigate to the leaf which no longer has matching key for seek_op
                    // in this case, caller must advance cursor if necessary
                    return Ok(IOResult::Done(if seek_op.eq_only() {
+                        let has_record = target_cell_when_not_found.get() >= 0
+                            && target_cell_when_not_found.get() < contents.cell_count() as i32;
+                        self.has_record.set(has_record);
+                        self.stack.set_cell_index(target_cell_when_not_found.get());
                        SeekResult::NotFound
                    } else {
-                        let contents = page.get().contents.as_ref().unwrap();
-                        turso_assert!(
-                            contents.is_leaf(),
-                            "tablebtree_seek() called on non-leaf page"
-                        );
-                        let cell_count = contents.cell_count();
                        // set cursor to the position where which would hold the op-boundary if it were present
-                        self.stack.set_cell_index(match &seek_op {
-                            SeekOp::GT | SeekOp::GE { .. } => cell_count as i32,
-                            SeekOp::LT | SeekOp::LE { .. } => 0,
-                        });
+                        self.stack.set_cell_index(target_cell_when_not_found.get());
                        SeekResult::TryAdvance
                    }));
                };
@@ -1809,6 +1792,7 @@ impl BTreeCursor {
            // rowids are unique, so we can return the rowid immediately
            if found && seek_op.eq_only() {
                self.stack.set_cell_index(cur_cell_idx as i32);
+                self.has_record.set(true);
                return Ok(IOResult::Done(SeekResult::Found));
            }

@@ -1823,8 +1807,16 @@ impl BTreeCursor {
                    }
                }
            } else if cmp.is_gt() {
+                if matches!(seek_op, SeekOp::GE { eq_only: true }) {
+                    target_cell_when_not_found
+                        .set(target_cell_when_not_found.get().min(cur_cell_idx as i32));
+                }
                max_cell_idx.set(cur_cell_idx - 1);
            } else if cmp.is_lt() {
+                if matches!(seek_op, SeekOp::LE { eq_only: true }) {
+                    target_cell_when_not_found
+                        .set(target_cell_when_not_found.get().max(cur_cell_idx as i32));
+                }
                min_cell_idx.set(cur_cell_idx + 1);
            } else {
                match iter_dir {
@@ -1896,6 +1888,10 @@ impl BTreeCursor {
                max_cell_idx: max,
                nearest_matching_cell,
                eq_seen: Cell::new(eq_seen),
+                target_cell_when_not_found: Cell::new(match seek_op.iteration_direction() {
+                    IterationDirection::Forwards => cell_count as i32,
+                    IterationDirection::Backwards => -1,
+                }),
            };
        }

@@ -1904,6 +1900,7 @@ impl BTreeCursor {
            max_cell_idx,
            nearest_matching_cell,
            eq_seen,
+            target_cell_when_not_found,
        } = &self.seek_state
        else {
            unreachable!(
@@ -1916,7 +1913,6 @@ impl BTreeCursor {
        return_if_locked_maybe_load!(self.pager, page);
        let page = page.get();
        let contents = page.get().contents.as_ref().unwrap();
-        let cell_count = contents.cell_count();

        let iter_dir = seek_op.iteration_direction();

@@ -1926,19 +1922,21 @@ impl BTreeCursor {
            if min > max {
                if let Some(nearest_matching_cell) = nearest_matching_cell.get() {
                    self.stack.set_cell_index(nearest_matching_cell as i32);
+                    self.has_record.set(true);
                    return Ok(IOResult::Done(SeekResult::Found));
                } else {
+                    // set cursor to the position where which would hold the op-boundary if it were present
+                    let target_cell = target_cell_when_not_found.get();
+                    self.stack.set_cell_index(target_cell);
+                    let has_record = target_cell >= 0 && target_cell < contents.cell_count() as i32;
+                    self.has_record.set(has_record);
+
                    // Similar logic as in tablebtree_seek(), but for indexes.
                    // The difference is that since index keys are not necessarily unique, we need to TryAdvance
                    // even when eq_only=true and we have seen an EQ match up in the tree in an interior node.
                    if seek_op.eq_only() && !eq_seen.get() {
                        return Ok(IOResult::Done(SeekResult::NotFound));
                    }
-                    // set cursor to the position where which would hold the op-boundary if it were present
-                    self.stack.set_cell_index(match &seek_op {
-                        SeekOp::GT | SeekOp::GE { .. } => cell_count as i32,
-                        SeekOp::LT | SeekOp::LE { .. } => 0,
-                    });
                    return Ok(IOResult::Done(SeekResult::TryAdvance));
                };
            }
@@ -1989,8 +1987,16 @@ impl BTreeCursor {
                    }
                }
            } else if cmp.is_gt() {
+                if matches!(seek_op, SeekOp::GE { eq_only: true }) {
+                    target_cell_when_not_found
+                        .set(target_cell_when_not_found.get().min(cur_cell_idx as i32));
+                }
                max_cell_idx.set(cur_cell_idx - 1);
            } else if cmp.is_lt() {
+                if matches!(seek_op, SeekOp::LE { eq_only: true }) {
+                    target_cell_when_not_found
+                        .set(target_cell_when_not_found.get().max(cur_cell_idx as i32));
+                }
                min_cell_idx.set(cur_cell_idx + 1);
            } else {
                match iter_dir {
@@ -2032,30 +2038,7 @@ impl BTreeCursor {
        (cmp, found)
    }

-    fn read_record_w_possible_overflow(
-        &mut self,
-        payload: &'static [u8],
-        next_page: Option<u32>,
-        payload_size: u64,
-    ) -> Result<IOResult<()>> {
-        if let Some(next_page) = next_page {
-            self.process_overflow_read(payload, next_page, payload_size)
-        } else {
-            self.get_immutable_record_or_create()
-                .as_mut()
-                .unwrap()
-                .invalidate();
-            self.get_immutable_record_or_create()
-                .as_mut()
-                .unwrap()
-                .start_serialization(payload);
-
-            self.record_cursor.borrow_mut().invalidate();
-            Ok(IOResult::Done(()))
-        }
-    }
-
-    #[instrument(skip_all, level = Level::DEBUG)]
+    #[instrument(skip_all, level = Level::INFO)]
    pub fn move_to(&mut self, key: SeekKey<'_>, cmp: SeekOp) -> Result<IOResult<()>> {
        turso_assert!(
            self.mv_cursor.is_none(),
@@ -2131,23 +2114,20 @@ impl BTreeCursor {
                    return_if_locked_maybe_load!(self.pager, page);

                    // get page and find cell
-                    let (cell_idx, page_type) = {
+                    let cell_idx = {
                        return_if_locked!(page.get());
                        let page = page.get();

                        page.set_dirty();
                        self.pager.add_dirty(page.get().id);

-                        let page = page.get().contents.as_mut().unwrap();
-                        turso_assert!(
-                            matches!(page.page_type(), PageType::TableLeaf | PageType::IndexLeaf),
-                            "expected table or index leaf page"
-                        );
-
-                        // find cell
-                        (return_if_io!(self.find_cell(page, bkey)), page.page_type())
+                        self.stack.current_cell_index()
                    };
-                    self.stack.set_cell_index(cell_idx as i32);
+                    if cell_idx == -1 {
+                        // This might be a brand new table and the cursor hasn't moved yet. Let's advance it to the first slot.
+                        self.stack.set_cell_index(0);
+                    }
+                    let cell_idx = self.stack.current_cell_index() as usize;
                    tracing::debug!(cell_idx);

                    // if the cell index is less than the total cells, check: if its an existing
@@ -2178,8 +2158,8 @@ impl BTreeCursor {
                                    continue;
                                }
                            }
-                            BTreeCell::IndexLeafCell(..) => {
-                                // Not necessary to read record again here, as find_cell already does that for us
+                            BTreeCell::IndexLeafCell(..) | BTreeCell::IndexInteriorCell(..) => {
+                                return_if_io!(self.record());
                                let cmp = compare_immutable(
                                    record_values.as_slice(),
                                    self.get_immutable_record()
@@ -2206,18 +2186,25 @@ impl BTreeCursor {
                                        self.save_context(CursorContext::IndexKeyRowId((*record).clone()));
                                    }
                                    continue;
+                                } else {
+                                    turso_assert!(
+                                        !matches!(cell, BTreeCell::IndexInteriorCell(..)),
+                                         "we should not be inserting a new index interior cell. the only valid operation on an index interior cell is an overwrite!"
+                                    );
                                }
                            }
                            other => panic!("unexpected cell type, expected TableLeaf or IndexLeaf, found: {other:?}"),
                        }
                    }
+
                    // insert cell

                    let mut cell_payload: Vec<u8> = Vec::with_capacity(record_values.len() + 4);
                    fill_cell_payload(
-                        page_type,
+                        page.get().get().contents.as_ref().unwrap(),
                        bkey.maybe_rowid(),
                        &mut cell_payload,
+                        cell_idx,
                        record,
                        self.usable_space() as u16,
                        self.pager.clone(),
@@ -3910,86 +3897,6 @@ impl BTreeCursor {
        self.pager.usable_space()
    }

-    /// Find the index of the cell in the page that contains the given rowid.
-    #[instrument( skip_all, level = Level::DEBUG)]
-    fn find_cell(&mut self, page: &PageContent, key: &BTreeKey) -> Result<IOResult<usize>> {
-        let cell_count = page.cell_count();
-        let mut low = 0;
-        let mut high = if cell_count > 0 { cell_count - 1 } else { 0 };
-        let mut result_index = cell_count;
-        if self.find_cell_state.0.is_some() {
-            (low, high) = self.find_cell_state.get_state();
-        }
-
-        while low <= high && cell_count > 0 {
-            let mid = low + (high - low) / 2;
-            self.find_cell_state.set((low, high));
-            let cell = page.cell_get(mid, self.usable_space())?;
-
-            let comparison_result = match cell {
-                BTreeCell::TableLeafCell(cell) => key.to_rowid().cmp(&cell.rowid),
-                BTreeCell::TableInteriorCell(cell) => key.to_rowid().cmp(&cell.rowid),
-                BTreeCell::IndexInteriorCell(IndexInteriorCell {
-                    payload,
-                    first_overflow_page,
-                    payload_size,
-                    ..
-                })
-                | BTreeCell::IndexLeafCell(IndexLeafCell {
-                    payload,
-                    first_overflow_page,
-                    payload_size,
-                    ..
-                }) => {
-                    // TODO: implement efficient comparison of records
-                    // e.g. https://github.com/sqlite/sqlite/blob/master/src/vdbeaux.c#L4719
-                    return_if_io!(self.read_record_w_possible_overflow(
-                        payload,
-                        first_overflow_page,
-                        payload_size
-                    ));
-
-                    let key_values = key.to_index_key_values();
-                    let record = self.get_immutable_record();
-                    let record = record.as_ref().unwrap();
-                    let record_same_number_cols = &record.get_values()[..key_values.len()];
-                    compare_immutable(
-                        key_values.as_slice(),
-                        record_same_number_cols,
-                        self.index_info
-                            .as_ref()
-                            .expect("indexbtree_find_cell without index_info")
-                            .key_info
-                            .as_slice(),
-                    )
-                }
-            };
-
-            match comparison_result {
-                Ordering::Equal => {
-                    result_index = mid;
-                    break;
-                }
-                Ordering::Greater => {
-                    low = mid + 1;
-                }
-                Ordering::Less => {
-                    result_index = mid;
-                    if mid == 0 {
-                        break;
-                    }
-                    high = mid - 1;
-                }
-            }
-        }
-
-        self.find_cell_state.reset();
-        assert!(result_index <= cell_count);
-
-        Ok(IOResult::Done(result_index))
-    }
-
-    #[instrument(skip_all, level = Level::DEBUG)]
    pub fn seek_end(&mut self) -> Result<IOResult<()>> {
        assert!(self.mv_cursor.is_none()); // unsure about this -_-
        self.move_to_root()?;
@@ -4148,8 +4055,6 @@ impl BTreeCursor {
        // Reset seek state
        self.seek_state = CursorSeekState::Start;
        self.valid_state = CursorValidState::Valid;
-        self.has_record
-            .replace(matches!(seek_result, SeekResult::Found));
        Ok(IOResult::Done(seek_result))
    }

@@ -4246,7 +4151,10 @@ impl BTreeCursor {
    pub fn insert(
        &mut self,
        key: &BTreeKey,
-        mut moved_before: bool, /* Indicate whether it's necessary to traverse to find the leaf page */
+        // Indicate whether it's necessary to traverse to find the leaf page
+        // FIXME: refactor this out into a state machine, these ad-hoc state
+        // variables are very hard to reason about
+        mut moved_before: bool,
    ) -> Result<IOResult<()>> {
        tracing::debug!(valid_state = ?self.valid_state, cursor_state = ?self.state, is_write_in_progress = self.is_write_in_progress());
        match &self.mv_cursor {
@@ -4260,30 +4168,50 @@ impl BTreeCursor {
                None => todo!("Support mvcc inserts with index btrees"),
            },
            None => {
-                if self.valid_state != CursorValidState::Valid && !self.is_write_in_progress() {
-                    // A balance happened so we need to move.
-                    moved_before = false;
-                }
+                match (&self.valid_state, self.is_write_in_progress()) {
+                    (CursorValidState::Valid, _) => {
+                        // consider the current position valid unless the caller explicitly asks us to seek.
+                    }
+                    (CursorValidState::RequireSeek, false) => {
+                        // we must seek.
+                        moved_before = false;
+                    }
+                    (CursorValidState::RequireSeek, true) => {
+                        // illegal to seek during a write no matter what CursorValidState or caller says -- we might e.g. move to the wrong page during balancing
+                        moved_before = true;
+                    }
+                    (CursorValidState::RequireAdvance(direction), _) => {
+                        // FIXME: this is a hack to support the case where we need to advance the cursor after a seek.
+                        // We should have a proper state machine for this.
+                        return_if_io!(match direction {
+                            IterationDirection::Forwards => self.next(),
+                            IterationDirection::Backwards => self.prev(),
+                        });
+                        self.valid_state = CursorValidState::Valid;
+                        self.seek_state = CursorSeekState::Start;
+                        moved_before = true;
+                    }
+                };
                if !moved_before {
-                    // Use move_to() so that we always end up on a leaf page. seek() might go back to an interior cell in index seeks,
-                    // which we never want. The reason we can use move_to() is that
-                    // find_cell() iterates the leaf page from left to right to find the insertion point anyway, so we don't care
-                    // which cell we are in as long as we are on the right page.
-                    // FIXME: find_cell() should not use linear search because it's slow.
-                    match key {
+                    let seek_result = match key {
                        BTreeKey::IndexKey(_) => {
-                            return_if_io!(self.move_to(
+                            return_if_io!(self.seek(
                                SeekKey::IndexKey(key.get_record().unwrap()),
                                SeekOp::GE { eq_only: true }
                            ))
                        }
                        BTreeKey::TableRowId(_) => {
-                            return_if_io!(self.move_to(
+                            return_if_io!(self.seek(
                                SeekKey::TableRowId(key.to_rowid()),
                                SeekOp::GE { eq_only: true }
                            ))
                        }
                    };
+                    if SeekResult::TryAdvance == seek_result {
+                        self.valid_state =
+                            CursorValidState::RequireAdvance(IterationDirection::Forwards);
+                        return_if_io!(self.next());
+                    }
                    self.context.take(); // we know where we wanted to move so if there was any saved context, discard it.
                    self.valid_state = CursorValidState::Valid;
                    self.seek_state = CursorSeekState::Start;
@@ -4449,12 +4377,10 @@ impl BTreeCursor {
                            post_balancing_seek_key,
                        };
                    } else {
-                        let is_last_cell = cell_idx == contents.cell_count().saturating_sub(1);
                        drop_cell(contents, cell_idx, self.usable_space() as u16)?;

                        let delete_info = self.state.mut_delete_info().unwrap();
                        delete_info.state = DeleteState::CheckNeedsBalancing {
-                            rightmost_cell_was_dropped: is_last_cell,
                            post_balancing_seek_key,
                        };
                    }
@@ -4555,13 +4481,11 @@ impl BTreeCursor {

                    let delete_info = self.state.mut_delete_info().unwrap();
                    delete_info.state = DeleteState::CheckNeedsBalancing {
-                        rightmost_cell_was_dropped: false,
                        post_balancing_seek_key,
                    };
                }

                DeleteState::CheckNeedsBalancing {
-                    rightmost_cell_was_dropped,
                    post_balancing_seek_key,
                } => {
                    let page = self.stack.top();
@@ -4573,15 +4497,6 @@ impl BTreeCursor {
                    let needs_balancing = self.stack.has_parent()
                        && free_space as usize * 3 > self.usable_space() * 2;

-                    if rightmost_cell_was_dropped {
-                        // If we drop a cell in the middle, e.g. our current index is 2 and we drop 'c' from [a,b,c,d,e], then we don't need to retreat index,
-                        // because index 2 is still going to be the right place [a,b,D,e]
-                        // but:
-                        // If we drop the rightmost cell, e.g. our current index is 4 and we drop 'e' from [a,b,c,d,e], then we need to retreat index,
-                        // because index 4 is now pointing beyond the last cell [a,b,c,d] _ <-- index 4
-                        self.stack.retreat();
-                    }
-
                    if needs_balancing {
                        let delete_info = self.state.mut_delete_info().unwrap();
                        if delete_info.balance_write_info.is_none() {
@@ -4700,10 +4615,9 @@ impl BTreeCursor {
        };
        let seek_result =
            return_if_io!(self.seek(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true }));
-        let has_record = matches!(seek_result, SeekResult::Found);
-        self.has_record.set(has_record);
+        let exists = matches!(seek_result, SeekResult::Found);
        self.invalidate_record();
-        Ok(IOResult::Done(has_record))
+        Ok(IOResult::Done(exists))
    }

    /// Clear the overflow pages linked to a specific page provided by the leaf cell
@@ -4965,14 +4879,16 @@ impl BTreeCursor {
        record: &ImmutableRecord,
    ) -> Result<IOResult<()>> {
        // build the new payload
-        let page_type = page_ref.get().get().contents.as_ref().unwrap().page_type();
+        let page = page_ref.get();
+        let page_contents = page.get().contents.as_ref().unwrap();
        let serial_types_len = self.record_cursor.borrow_mut().len(record);
        let mut new_payload = Vec::with_capacity(serial_types_len);
        let rowid = return_if_io!(self.rowid());
        fill_cell_payload(
-            page_type,
+            page_contents,
            rowid,
            &mut new_payload,
+            cell_idx,
            record,
            self.usable_space() as u16,
            self.pager.clone(),
@@ -5138,7 +5054,19 @@ impl BTreeCursor {
    /// If context is defined, restore it and set it None on success
    #[instrument(skip_all, level = Level::DEBUG)]
    fn restore_context(&mut self) -> Result<IOResult<()>> {
-        if self.context.is_none() || !matches!(self.valid_state, CursorValidState::RequireSeek) {
+        if self.context.is_none() || matches!(self.valid_state, CursorValidState::Valid) {
+            return Ok(IOResult::Done(()));
+        }
+        if let CursorValidState::RequireAdvance(direction) = self.valid_state {
+            let has_record = return_if_io!(match direction {
+                // Avoid calling next()/prev() directly because they immediately call restore_context()
+                IterationDirection::Forwards => self.get_next_record(),
+                IterationDirection::Backwards => self.get_prev_record(),
+            });
+            self.has_record.set(has_record);
+            self.invalidate_record();
+            self.context = None;
+            self.valid_state = CursorValidState::Valid;
            return Ok(IOResult::Done(()));
        }
        let ctx = self.context.take().unwrap();
@@ -5148,7 +5076,13 @@ impl BTreeCursor {
        };
        let res = self.seek(seek_key, SeekOp::GE { eq_only: true })?;
        match res {
-            IOResult::Done(_) => {
+            IOResult::Done(res) => {
+                if let SeekResult::TryAdvance = res {
+                    self.valid_state =
+                        CursorValidState::RequireAdvance(IterationDirection::Forwards);
+                    self.context = Some(ctx);
+                    return Ok(IOResult::IO);
+                }
                self.valid_state = CursorValidState::Valid;
                Ok(IOResult::Done(()))
            }
@@ -6273,9 +6207,10 @@ fn _insert_into_cell(
 ) -> Result<()> {
    assert!(
        cell_idx <= page.cell_count() + page.overflow_cells.len(),
-        "attempting to add cell to an incorrect place cell_idx={} cell_count={}",
+        "attempting to add cell to an incorrect place cell_idx={} cell_count={} page_type={:?}",
        cell_idx,
-        page.cell_count()
+        page.cell_count(),
+        page.page_type()
    );
    let already_has_overflow = !page.overflow_cells.is_empty();
    let enough_space = if already_has_overflow && !allow_regular_insert_despite_overflow {
@@ -6463,21 +6398,25 @@ fn allocate_cell_space(page_ref: &PageContent, amount: u16, usable_space: u16) -
 /// Fill in the cell payload with the record.
 /// If the record is too large to fit in the cell, it will spill onto overflow pages.
 fn fill_cell_payload(
-    page_type: PageType,
+    page_contents: &PageContent,
    int_key: Option<i64>,
    cell_payload: &mut Vec<u8>,
+    cell_idx: usize,
    record: &ImmutableRecord,
    usable_space: u16,
    pager: Rc<Pager>,
 ) {
-    assert!(matches!(
-        page_type,
-        PageType::TableLeaf | PageType::IndexLeaf
-    ));
    // TODO: make record raw from start, having to serialize is not good
    let record_buf = record.get_payload().to_vec();

+    let page_type = page_contents.page_type();
    // fill in header
+    if matches!(page_type, PageType::IndexInterior) {
+        // if a write happened on an index interior page, it is always an overwrite.
+        // we must copy the left child pointer of the replaced cell to the new cell.
+        let left_child_page = page_contents.cell_interior_read_left_child_page(cell_idx);
+        cell_payload.extend_from_slice(&left_child_page.to_be_bytes());
+    }
    if matches!(page_type, PageType::TableLeaf) {
        let int_key = int_key.unwrap();
        write_varint_to_vec(record_buf.len() as u64, cell_payload);
@@ -6711,9 +6650,10 @@ mod tests {
    ) -> Vec<u8> {
        let mut payload: Vec<u8> = Vec::new();
        fill_cell_payload(
-            page.page_type(),
+            page,
            Some(id as i64),
            &mut payload,
+            pos,
            &record,
            4096,
            conn.pager.borrow().clone(),
@@ -7269,6 +7209,15 @@ mod tests {
                    .map(|col| Register::Value(Value::Integer(*col)))
                    .collect::<Vec<_>>();
                let value = ImmutableRecord::from_registers(&regs, regs.len());
+                run_until_done(
+                    || {
+                        let record = ImmutableRecord::from_registers(&regs, regs.len());
+                        let key = SeekKey::IndexKey(&record);
+                        cursor.seek(key, SeekOp::GE { eq_only: true })
+                    },
+                    pager.deref(),
+                )
+                .unwrap();
                run_until_done(
                    || {
                        cursor.insert(
@@ -7857,9 +7806,10 @@ mod tests {
                    let record = ImmutableRecord::from_registers(regs, regs.len());
                    let mut payload: Vec<u8> = Vec::new();
                    fill_cell_payload(
-                        page.page_type(),
+                        page,
                        Some(i as i64),
                        &mut payload,
+                        cell_idx,
                        &record,
                        4096,
                        conn.pager.borrow().clone(),
@@ -7930,9 +7880,10 @@ mod tests {
                        let record = ImmutableRecord::from_registers(regs, regs.len());
                        let mut payload: Vec<u8> = Vec::new();
                        fill_cell_payload(
-                            page.page_type(),
+                            page,
                            Some(i),
                            &mut payload,
+                            cell_idx,
                            &record,
                            4096,
                            conn.pager.borrow().clone(),
@@ -8294,9 +8245,10 @@ mod tests {
        let record = ImmutableRecord::from_registers(regs, regs.len());
        let mut payload: Vec<u8> = Vec::new();
        fill_cell_payload(
-            page.get().get_contents().page_type(),
+            page.get().get_contents(),
            Some(0),
            &mut payload,
+            0,
            &record,
            4096,
            conn.pager.borrow().clone(),
@@ -8371,9 +8323,10 @@ mod tests {
        let record = ImmutableRecord::from_registers(regs, regs.len());
        let mut payload: Vec<u8> = Vec::new();
        fill_cell_payload(
-            page.get().get_contents().page_type(),
+            page.get().get_contents(),
            Some(0),
            &mut payload,
+            0,
            &record,
            4096,
            conn.pager.borrow().clone(),
@@ -8726,7 +8679,7 @@ mod tests {
            // add a bunch of cells
            while compute_free_space(page.get_contents(), pager.usable_space() as u16) >= size + 10
            {
-                insert_cell(i, size, page.get_contents(), pager.clone(), page_type);
+                insert_cell(i, size, page.get_contents(), pager.clone());
                i += 1;
                size = (rng.next_u64() % 1024) as u16;
            }
@@ -8783,24 +8736,25 @@ mod tests {
        }
    }

-    fn insert_cell(
-        i: u64,
-        size: u16,
-        contents: &mut PageContent,
-        pager: Rc<Pager>,
-        page_type: PageType,
-    ) {
+    fn insert_cell(cell_idx: u64, size: u16, contents: &mut PageContent, pager: Rc<Pager>) {
        let mut payload = Vec::new();
        let regs = &[Register::Value(Value::Blob(vec![0; size as usize]))];
        let record = ImmutableRecord::from_registers(regs, regs.len());
        fill_cell_payload(
-            page_type,
-            Some(i as i64),
+            contents,
+            Some(cell_idx as i64),
            &mut payload,
+            cell_idx as usize,
            &record,
            pager.usable_space() as u16,
            pager.clone(),
        );
-        insert_into_cell(contents, &payload, i as usize, pager.usable_space() as u16).unwrap();
+        insert_into_cell(
+            contents,
+            &payload,
+            cell_idx as usize,
+            pager.usable_space() as u16,
+        )
+        .unwrap();
    }
 }
--- a/core/translate/compound_select.rs
+++ b/core/translate/compound_select.rs
@@ -333,7 +333,7 @@ fn create_dedupe_index(
        root_page: 0,
        ephemeral: true,
        table_name: String::new(),
-        unique: true,
+        unique: false,
        has_rowid: false,
    });
    let cursor_id = program.alloc_cursor_id(CursorType::BTreeIndex(dedupe_index.clone()));
--- a/core/translate/emitter.rs
+++ b/core/translate/emitter.rs
@@ -1153,7 +1153,13 @@ fn emit_update_insns(
            cursor: cursor_id,
            key_reg: rowid_set_clause_reg.unwrap_or(beg),
            record_reg,
-            flag: InsertFlags::new().update(true),
+            flag: if has_user_provided_rowid {
+                // The previous Insn::NotExists and Insn::Delete seek to the old rowid,
+                // so to insert a new user-provided rowid, we need to seek to the correct place.
+                InsertFlags::new().require_seek()
+            } else {
+                InsertFlags::new()
+            },
            table_name: table_ref.identifier.clone(),
        });

--- a/core/translate/insert.rs
+++ b/core/translate/insert.rs
@@ -231,7 +231,8 @@ pub fn translate_insert(
                        cursor: temp_cursor_id,
                        key_reg: rowid_reg,
                        record_reg,
-                        flag: InsertFlags::new(),
+                        // since we are not doing an Insn::NewRowid or an Insn::NotExists here, we need to seek to ensure the insertion happens in the correct place.
+                        flag: InsertFlags::new().require_seek(),
                        table_name: "".to_string(),
                    });

--- a/core/translate/result_row.rs
+++ b/core/translate/result_row.rs
@@ -128,7 +128,8 @@ pub fn emit_result_row_and_limit(
                cursor: *table_cursor_id,
                key_reg: result_columns_start_reg + (plan.result_columns.len() - 1), // Rowid reg is the last register
                record_reg,
-                flag: InsertFlags(0),
+                // since we are not doing an Insn::NewRowid or an Insn::NotExists here, we need to seek to ensure the insertion happens in the correct place.
+                flag: InsertFlags::new().require_seek(),
                table_name: table.name.clone(),
            });
        }
--- a/core/types.rs
+++ b/core/types.rs
@@ -2235,7 +2235,7 @@ macro_rules! return_if_io {
    };
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum SeekResult {
    /// Record matching the [SeekOp] found in the B-tree and cursor was positioned to point onto that record
    Found,
--- a/core/vdbe/execute.rs
+++ b/core/vdbe/execute.rs
@@ -13,6 +13,7 @@ use crate::types::{
    compare_immutable, compare_records_generic, ImmutableRecord, SeekResult, Text, TextSubtype,
 };
 use crate::util::normalize_ident;
+use crate::vdbe::insn::InsertFlags;
 use crate::vdbe::registers_to_ref_values;
 use crate::{
    error::{
@@ -4884,8 +4885,15 @@ pub fn op_insert(
            Register::Aggregate(..) => unreachable!("Cannot insert an aggregate value."),
        };

-        // query planner must emit NewRowId/NotExists/etc op-codes which will properly reposition cursor
-        return_if_io!(cursor.insert(&BTreeKey::new_table_rowid(key, Some(record.as_ref())), true));
+        // In a table insert, if the caller does not pass InsertFlags::REQUIRE_SEEK, they must ensure that a seek has already happened to the correct location.
+        // This typically happens by invoking either Insn::NewRowid or Insn::NotExists, because:
+        // 1. op_new_rowid() seeks to the end of the table, which is the correct insertion position.
+        // 2. op_not_exists() seeks to the position in the table where the target rowid would be inserted.
+        let moved_before = !flag.has(InsertFlags::REQUIRE_SEEK);
+        return_if_io!(cursor.insert(
+            &BTreeKey::new_table_rowid(key, Some(record.as_ref())),
+            moved_before
+        ));
    }

    // Only update last_insert_rowid for regular table inserts, not schema modifications
@@ -5192,8 +5200,17 @@ pub enum OpNewRowidState {
    Start,
    SeekingToLast,
    ReadingMaxRowid,
-    GeneratingRandom { attempts: u32 },
-    VerifyingCandidate { attempts: u32, candidate: i64 },
+    GeneratingRandom {
+        attempts: u32,
+    },
+    VerifyingCandidate {
+        attempts: u32,
+        candidate: i64,
+    },
+    /// In case a rowid was generated and not provided by the user, we need to call next() on the cursor
+    /// after generating the rowid. This is because the rowid was generated by seeking to the last row in the
+    /// table, and we need to insert _after_ that row.
+    GoNext,
 }

 pub fn op_new_rowid(
@@ -5250,9 +5267,8 @@ pub fn op_new_rowid(
                    Some(rowid) if rowid < MAX_ROWID => {
                        // Can use sequential
                        state.registers[*rowid_reg] = Register::Value(Value::Integer(rowid + 1));
-                        state.op_new_rowid_state = OpNewRowidState::Start;
-                        state.pc += 1;
-                        return Ok(InsnFunctionStepResult::Step);
+                        state.op_new_rowid_state = OpNewRowidState::GoNext;
+                        continue;
                    }
                    Some(_) => {
                        // Must use random (rowid == MAX_ROWID)
@@ -5262,9 +5278,8 @@ pub fn op_new_rowid(
                    None => {
                        // Empty table
                        state.registers[*rowid_reg] = Register::Value(Value::Integer(1));
-                        state.op_new_rowid_state = OpNewRowidState::Start;
-                        state.pc += 1;
-                        return Ok(InsnFunctionStepResult::Step);
+                        state.op_new_rowid_state = OpNewRowidState::GoNext;
+                        continue;
                    }
                }
            }
@@ -5315,6 +5330,16 @@ pub fn op_new_rowid(
                    };
                }
            }
+            OpNewRowidState::GoNext => {
+                {
+                    let mut cursor = state.get_cursor(*cursor);
+                    let cursor = cursor.as_btree_mut();
+                    return_if_io!(cursor.next());
+                }
+                state.op_new_rowid_state = OpNewRowidState::Start;
+                state.pc += 1;
+                return Ok(InsnFunctionStepResult::Step);
+            }
        }
    }
 }
--- a/core/vdbe/insn.rs
+++ b/core/vdbe/insn.rs
@@ -109,6 +109,7 @@ pub struct InsertFlags(pub u8);

 impl InsertFlags {
    pub const UPDATE: u8 = 0x01; // Flag indicating this is part of an UPDATE statement
+    pub const REQUIRE_SEEK: u8 = 0x02; // Flag indicating that a seek is required to insert the row

    pub fn new() -> Self {
        InsertFlags(0)
@@ -118,12 +119,8 @@ impl InsertFlags {
        (self.0 & flag) != 0
    }

-    pub fn update(mut self, is_update: bool) -> Self {
-        if is_update {
-            self.0 |= InsertFlags::UPDATE;
-        } else {
-            self.0 &= !InsertFlags::UPDATE;
-        }
+    pub fn require_seek(mut self) -> Self {
+        self.0 |= InsertFlags::REQUIRE_SEEK;
        self
    }
 }