Merge 'b-tree: fix bug in case when no matching rows was found in seek in the leaf page' from Nikita Sivukhin

Current table B-Tree seek code rely on the invariant that if key `K` is
present in interior page then it also must be present in the leaf page.
This is generally not true if data was ever deleted from the table
because leaf row which key was used as a divider in the interior pages
can be deleted. Also, SQLite spec says nothing about such invariant - so
`turso-db` implementation of B-Tree should not rely on it.
This PR introduce 3 options for B-Tree `seek` result: `Found` /
`NotFound` and `TryAdvance` which is generated when leaf page have no
match for `seek_op` but DB don't know if neighbor page can have matching
data.
There is an alternative approach where we can move cursor in the `seek`
itself to the neighbor page - but I was afraid to introduce such changes
because analogue `seek` function from SQLite works exactly like current
version of the code and I think some query planner internals (for
insertion) can rely on the fact that repositioning will leave cursor at
the position of insertion:
> ** If an exact match is not found, then the cursor is always
** left pointing at a leaf page which would hold the entry if it
** were present.  The cursor might point to an entry that comes
** before or after the key.
Also, this PR introduces new B-tree fuzz tests which generate table
B-tree from scratch and execute opreations over it. This can help to
reach some non trivial states and also generate huge DBs faster (that's
how this bug was discovered)

Reviewed-by: Jussi Saurio <jussi.saurio@gmail.com>
Reviewed-by: Pere Diaz Bou <pere-altea@homail.com>

Closes #2065
This commit is contained in:
Pekka Enberg
2025-07-14 12:57:09 +03:00
10 changed files with 841 additions and 139 deletions

15
Cargo.lock generated
View File

@@ -586,6 +586,7 @@ dependencies = [
"tracing",
"tracing-subscriber",
"turso_core",
"zerocopy 0.8.26",
]
[[package]]
@@ -2590,7 +2591,7 @@ version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
dependencies = [
"zerocopy 0.8.24",
"zerocopy 0.8.26",
]
[[package]]
@@ -2808,7 +2809,7 @@ checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.3",
"zerocopy 0.8.24",
"zerocopy 0.8.26",
]
[[package]]
@@ -4538,11 +4539,11 @@ dependencies = [
[[package]]
name = "zerocopy"
version = "0.8.24"
version = "0.8.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879"
checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f"
dependencies = [
"zerocopy-derive 0.8.24",
"zerocopy-derive 0.8.26",
]
[[package]]
@@ -4558,9 +4559,9 @@ dependencies = [
[[package]]
name = "zerocopy-derive"
version = "0.8.24"
version = "0.8.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be"
checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181"
dependencies = [
"proc-macro2",
"quote",

View File

@@ -15,7 +15,7 @@ use crate::{
turso_assert,
types::{
find_compare, get_tie_breaker_from_seek_op, IndexKeyInfo, IndexKeySortOrder,
ParseRecordState, RecordCompare, RecordCursor,
ParseRecordState, RecordCompare, RecordCursor, SeekResult,
},
MvCursor,
};
@@ -1285,13 +1285,21 @@ impl BTreeCursor {
/// This may be used to seek to a specific record in a point query (e.g. SELECT * FROM table WHERE col = 10)
/// or e.g. find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10).
/// We don't include the rowid in the comparison and that's why the last value from the record is not included.
fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result<CursorResult<bool>> {
fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result<CursorResult<SeekResult>> {
let ret = return_if_io!(match key {
SeekKey::TableRowId(rowid) => {
self.tablebtree_seek(rowid, op)
}
SeekKey::IndexKey(index_key) => {
self.indexbtree_seek(index_key, op)
match self.indexbtree_seek(index_key, op) {
Ok(CursorResult::Ok(found)) => Ok(CursorResult::Ok(if found {
SeekResult::Found
} else {
SeekResult::NotFound
})),
Ok(CursorResult::IO) => Ok(CursorResult::IO),
Err(err) => Err(err),
}
}
});
self.valid_state = CursorValidState::Valid;
@@ -1677,7 +1685,7 @@ impl BTreeCursor {
/// Specialized version of do_seek() for table btrees that uses binary search instead
/// of iterating cells in order.
#[instrument(skip_all, level = Level::INFO)]
fn tablebtree_seek(&mut self, rowid: i64, seek_op: SeekOp) -> Result<CursorResult<bool>> {
fn tablebtree_seek(&mut self, rowid: i64, seek_op: SeekOp) -> Result<CursorResult<SeekResult>> {
turso_assert!(
self.mv_cursor.is_none(),
"attempting to seek with MV cursor"
@@ -1704,7 +1712,7 @@ impl BTreeCursor {
let cell_count = contents.cell_count();
if cell_count == 0 {
self.stack.set_cell_index(0);
return Ok(CursorResult::Ok(false));
return Ok(CursorResult::Ok(SeekResult::NotFound));
}
let min_cell_idx = Cell::new(0);
let max_cell_idx = Cell::new(cell_count as isize - 1);
@@ -1743,9 +1751,28 @@ impl BTreeCursor {
if min > max {
if let Some(nearest_matching_cell) = nearest_matching_cell.get() {
self.stack.set_cell_index(nearest_matching_cell as i32);
return Ok(CursorResult::Ok(true));
return Ok(CursorResult::Ok(SeekResult::Found));
} else {
return Ok(CursorResult::Ok(false));
// if !eq_only - matching entry can exist in neighbour leaf page
// this can happen if key in the interiour page was deleted - but divider kept untouched
// in such case BTree can navigate to the leaf which no longer has matching key for seek_op
// in this case, caller must advance cursor if necessary
return Ok(CursorResult::Ok(if seek_op.eq_only() {
SeekResult::NotFound
} else {
let contents = page.get().contents.as_ref().unwrap();
turso_assert!(
contents.is_leaf(),
"tablebtree_seek() called on non-leaf page"
);
let cell_count = contents.cell_count();
// set cursor to the position where which would hold the op-boundary if it were present
self.stack.set_cell_index(match &seek_op {
SeekOp::GT | SeekOp::GE { .. } => cell_count as i32,
SeekOp::LT | SeekOp::LE { .. } => 0,
});
SeekResult::TryAdvance
}));
};
}
@@ -1766,7 +1793,7 @@ impl BTreeCursor {
// rowids are unique, so we can return the rowid immediately
if found && seek_op.eq_only() {
self.stack.set_cell_index(cur_cell_idx as i32);
return Ok(CursorResult::Ok(true));
return Ok(CursorResult::Ok(SeekResult::Found));
}
if found {
@@ -4134,7 +4161,7 @@ impl BTreeCursor {
}
#[instrument(skip(self), level = Level::INFO)]
pub fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result<CursorResult<bool>> {
pub fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result<CursorResult<SeekResult>> {
assert!(self.mv_cursor.is_none());
// Empty trace to capture the span information
tracing::trace!("");
@@ -4142,13 +4169,14 @@ impl BTreeCursor {
// because it might have been set to false by an unmatched left-join row during the previous iteration
// on the outer loop.
self.set_null_flag(false);
let cursor_has_record = return_if_io!(self.do_seek(key, op));
let seek_result = return_if_io!(self.do_seek(key, op));
self.invalidate_record();
// Reset seek state
self.seek_state = CursorSeekState::Start;
self.valid_state = CursorValidState::Valid;
self.has_record.replace(cursor_has_record);
Ok(CursorResult::Ok(cursor_has_record))
self.has_record
.replace(matches!(seek_result, SeekResult::Found));
Ok(CursorResult::Ok(seek_result))
}
/// Return a reference to the record the cursor is currently pointing to.
@@ -4649,8 +4677,9 @@ impl BTreeCursor {
Value::Integer(i) => i,
_ => unreachable!("btree tables are indexed by integers!"),
};
let has_record =
let seek_result =
return_if_io!(self.seek(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true }));
let has_record = matches!(seek_result, SeekResult::Found);
self.has_record.set(has_record);
self.invalidate_record();
Ok(CursorResult::Ok(has_record))
@@ -6892,7 +6921,7 @@ mod tests {
assert!(
matches!(
cursor.seek(seek_key, SeekOp::GE { eq_only: true }).unwrap(),
CursorResult::Ok(true)
CursorResult::Ok(SeekResult::Found)
),
"key {key} is not found"
);
@@ -7135,7 +7164,10 @@ mod tests {
pager.deref(),
)
.unwrap();
assert!(exists, "key {key:?} is not found");
assert!(
matches!(exists, SeekResult::Found),
"key {key:?} is not found"
);
}
// Check that key count is right
cursor.move_to_root().unwrap();
@@ -7342,7 +7374,6 @@ mod tests {
}
#[test]
#[ignore]
pub fn test_clear_overflow_pages() -> Result<()> {
let pager = setup_test_env(5);
let num_columns = 5;
@@ -8257,13 +8288,13 @@ mod tests {
let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns);
let seek_key = SeekKey::TableRowId(i);
let found = run_until_done(
let seek_result = run_until_done(
|| cursor.seek(seek_key.clone(), SeekOp::GE { eq_only: true }),
pager.deref(),
)
.unwrap();
if found {
if matches!(seek_result, SeekResult::Found) {
run_until_done(|| cursor.delete(), pager.deref()).unwrap();
}
}

View File

@@ -2291,6 +2291,23 @@ pub enum CursorResult<T> {
IO,
}
#[derive(Debug)]
pub enum SeekResult {
/// Record matching the [SeekOp] found in the B-tree and cursor was positioned to point onto that record
Found,
/// Record matching the [SeekOp] doesn't exists in the B-tree
NotFound,
/// This result can happen only if eq_only for [SeekOp] is false
/// In this case Seek can position cursor to the leaf page boundaries (before the start, after the end)
/// (e.g. if leaf page holds rows with keys from range [1..10], key 10 is absent and [SeekOp] is >= 10)
///
/// turso-db has this extra [SeekResult] in order to make [BTreeCursor::seek] method to position cursor at
/// the leaf of potential insertion, but also communicate to caller the fact that current cursor position
/// doesn't hold a matching entry
/// (necessary for Seek{XX} VM op-codes, so these op-codes will try to advance cursor in order to move it to matching entry)
TryAdvance,
}
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
/// The match condition of a table/index seek.
pub enum SeekOp {

View File

@@ -10,7 +10,7 @@ use crate::storage::wal::DummyWAL;
use crate::storage::{self, header_accessor};
use crate::translate::collate::CollationSeq;
use crate::types::{
compare_immutable, compare_records_generic, ImmutableRecord, Text, TextSubtype,
compare_immutable, compare_records_generic, ImmutableRecord, SeekResult, Text, TextSubtype,
};
use crate::util::normalize_ident;
use crate::vdbe::registers_to_ref_values;
@@ -2357,10 +2357,10 @@ pub fn op_seek_rowid(
match rowid {
Some(rowid) => {
let found = return_if_io!(
let seek_result = return_if_io!(
cursor.seek(SeekKey::TableRowId(rowid), SeekOp::GE { eq_only: true })
);
if !found {
if !matches!(seek_result, SeekResult::Found) {
target_pc.as_offset_int()
} else {
state.pc + 1
@@ -2392,12 +2392,38 @@ pub fn op_deferred_seek(
Ok(InsnFunctionStepResult::Step)
}
pub enum OpSeekState {
/// Initial state
Start,
/// Position cursor with seek operation with (rowid, op) search parameters
Seek { rowid: i64, op: SeekOp },
/// Advance cursor (with [BTreeCursor::next]/[BTreeCursor::prev] methods) which was
/// positioned after [OpSeekState::Seek] state if [BTreeCursor::seek] returned [SeekResult::TryAdvance]
Advance { op: SeekOp },
/// Move cursor to the last BTree row if DB knows that comparison result will be fixed (due to type ordering, e.g. NUMBER always <= TEXT)
MoveLast,
}
pub fn op_seek(
program: &Program,
state: &mut ProgramState,
insn: &Insn,
pager: &Rc<Pager>,
mv_store: Option<&Rc<MvStore>>,
) -> Result<InsnFunctionStepResult> {
let result = op_seek_internal(program, state, insn, pager, mv_store);
if let Err(_) | Ok(InsnFunctionStepResult::Step) = &result {
state.op_seek_state = OpSeekState::Start;
}
result
}
pub fn op_seek_internal(
program: &Program,
state: &mut ProgramState,
insn: &Insn,
pager: &Rc<Pager>,
mv_store: Option<&Rc<MvStore>>,
) -> Result<InsnFunctionStepResult> {
let (Insn::SeekGE {
cursor_id,
@@ -2447,120 +2473,169 @@ pub fn op_seek(
Insn::SeekLT { .. } => SeekOp::LT,
_ => unreachable!("unexpected Insn {:?}", insn),
};
let op_name = match op {
SeekOp::GE { .. } => "SeekGE",
SeekOp::GT => "SeekGT",
SeekOp::LE { .. } => "SeekLE",
SeekOp::LT => "SeekLT",
};
if *is_index {
let found = {
let seek_result = {
let mut cursor = state.get_cursor(*cursor_id);
let cursor = cursor.as_btree_mut();
let record_from_regs = make_record(&state.registers, start_reg, num_regs);
return_if_io!(cursor.seek(SeekKey::IndexKey(&record_from_regs), op))
};
if !found {
if !matches!(seek_result, SeekResult::Found) {
state.pc = target_pc.as_offset_int();
} else {
state.pc += 1;
}
} else {
let pc = {
let original_value = state.registers[*start_reg].get_owned_value().clone();
let mut temp_value = original_value.clone();
let conversion_successful = if matches!(temp_value, Value::Text(_)) {
let mut temp_reg = Register::Value(temp_value);
let converted = apply_numeric_affinity(&mut temp_reg, false);
temp_value = temp_reg.get_owned_value().clone();
converted
} else {
true // Non-text values don't need conversion
};
let int_key = extract_int_value(&temp_value);
let lost_precision = !conversion_successful || !matches!(temp_value, Value::Integer(_));
let actual_op = if lost_precision {
match &temp_value {
Value::Float(f) => {
let int_key_as_float = int_key as f64;
let c = if int_key_as_float > *f {
1
} else if int_key_as_float < *f {
-1
} else {
0
};
match c.cmp(&0) {
std::cmp::Ordering::Less => match op {
SeekOp::LT => SeekOp::LE { eq_only: false }, // (x < 5.1) -> (x <= 5)
SeekOp::GE { .. } => SeekOp::GT, // (x >= 5.1) -> (x > 5)
other => other,
},
std::cmp::Ordering::Greater => match op {
SeekOp::GT => SeekOp::GE { eq_only: false }, // (x > 4.9) -> (x >= 5)
SeekOp::LE { .. } => SeekOp::LT, // (x <= 4.9) -> (x < 5)
other => other,
},
std::cmp::Ordering::Equal => op,
}
}
Value::Text(_) | Value::Blob(_) => {
match op {
SeekOp::GT | SeekOp::GE { .. } => {
// No integers are > or >= non-numeric text, jump to target (empty result)
state.pc = target_pc.as_offset_int();
return Ok(InsnFunctionStepResult::Step);
}
SeekOp::LT | SeekOp::LE { .. } => {
// All integers are < or <= non-numeric text
// Move to last position and then use the normal seek logic
{
let mut cursor = state.get_cursor(*cursor_id);
let cursor = cursor.as_btree_mut();
return_if_io!(cursor.last());
}
state.pc += 1;
return Ok(InsnFunctionStepResult::Step);
}
}
}
_ => op,
}
} else {
op
};
let rowid = if matches!(original_value, Value::Null) {
match actual_op {
SeekOp::GE { .. } | SeekOp::GT => {
state.pc = target_pc.as_offset_int();
return Ok(InsnFunctionStepResult::Step);
}
SeekOp::LE { .. } | SeekOp::LT => {
// No integers are < NULL, so jump to target
state.pc = target_pc.as_offset_int();
return Ok(InsnFunctionStepResult::Step);
}
}
} else {
int_key
};
let mut cursor = state.get_cursor(*cursor_id);
let cursor = cursor.as_btree_mut();
let found = return_if_io!(cursor.seek(SeekKey::TableRowId(rowid), actual_op));
if !found {
target_pc.as_offset_int()
} else {
state.pc + 1
}
};
state.pc = pc;
return Ok(InsnFunctionStepResult::Step);
}
loop {
match &state.op_seek_state {
OpSeekState::Start => {
let original_value = state.registers[*start_reg].get_owned_value().clone();
let mut temp_value = original_value.clone();
let conversion_successful = if matches!(temp_value, Value::Text(_)) {
let mut temp_reg = Register::Value(temp_value);
let converted = apply_numeric_affinity(&mut temp_reg, false);
temp_value = temp_reg.get_owned_value().clone();
converted
} else {
true // Non-text values don't need conversion
};
let int_key = extract_int_value(&temp_value);
let lost_precision =
!conversion_successful || !matches!(temp_value, Value::Integer(_));
let actual_op = if lost_precision {
match &temp_value {
Value::Float(f) => {
let int_key_as_float = int_key as f64;
let c = if int_key_as_float > *f {
1
} else if int_key_as_float < *f {
-1
} else {
0
};
match c.cmp(&0) {
std::cmp::Ordering::Less => match op {
SeekOp::LT => SeekOp::LE { eq_only: false }, // (x < 5.1) -> (x <= 5)
SeekOp::GE { .. } => SeekOp::GT, // (x >= 5.1) -> (x > 5)
other => other,
},
std::cmp::Ordering::Greater => match op {
SeekOp::GT => SeekOp::GE { eq_only: false }, // (x > 4.9) -> (x >= 5)
SeekOp::LE { .. } => SeekOp::LT, // (x <= 4.9) -> (x < 5)
other => other,
},
std::cmp::Ordering::Equal => op,
}
}
Value::Text(_) | Value::Blob(_) => {
match op {
SeekOp::GT | SeekOp::GE { .. } => {
// No integers are > or >= non-numeric text, jump to target (empty result)
state.pc = target_pc.as_offset_int();
return Ok(InsnFunctionStepResult::Step);
}
SeekOp::LT | SeekOp::LE { .. } => {
// All integers are < or <= non-numeric text
// Move to last position and then use the normal seek logic
state.op_seek_state = OpSeekState::MoveLast;
continue;
}
}
}
_ => op,
}
} else {
op
};
let rowid = if matches!(original_value, Value::Null) {
match actual_op {
SeekOp::GE { .. } | SeekOp::GT => {
state.pc = target_pc.as_offset_int();
return Ok(InsnFunctionStepResult::Step);
}
SeekOp::LE { .. } | SeekOp::LT => {
// No integers are < NULL, so jump to target
state.pc = target_pc.as_offset_int();
return Ok(InsnFunctionStepResult::Step);
}
}
} else {
int_key
};
state.op_seek_state = OpSeekState::Seek {
rowid,
op: actual_op,
};
continue;
}
OpSeekState::Seek { rowid, op } => {
let seek_result = {
let mut cursor = state.get_cursor(*cursor_id);
let cursor = cursor.as_btree_mut();
return_if_io!(cursor.seek(SeekKey::TableRowId(*rowid), *op))
};
let found = match seek_result {
SeekResult::Found => true,
SeekResult::NotFound => false,
SeekResult::TryAdvance => {
state.op_seek_state = OpSeekState::Advance { op: *op };
continue;
}
};
if !found {
state.pc = target_pc.as_offset_int()
} else {
state.pc += 1
}
return Ok(InsnFunctionStepResult::Step);
}
OpSeekState::Advance { op } => {
let found = {
let mut cursor = state.get_cursor(*cursor_id);
let cursor = cursor.as_btree_mut();
// Seek operation has anchor number which equals to the closed boundary of the range
// (e.g. for >= x - anchor is x, for > x - anchor is x + 1)
//
// Before Advance state, cursor was positioned to the leaf page which should hold the anchor.
// Sometimes this leaf page can have no matching rows, and in this case
// we need to move cursor in the direction of Seek to find record which matches the seek filter
//
// Consider following scenario: Seek [> 666]
// interior page dividers: I1: [ .. 667 .. ]
// / \
// leaf pages: P1[661,665] P2[anything here is GT 666]
// After the initial Seek, cursor will be positioned after the end of leaf page P1 [661, 665]
// because this is potential position for insertion of value 666.
// But as P1 has no row matching Seek criteria - we need to move it to the right
// (and as we at the page boundary, we will move cursor to the next neighbor leaf, which guaranteed to have
// row keys greater than divider, which is greater or equal than anchor)
match op {
SeekOp::GT | SeekOp::GE { eq_only: false } => return_if_io!(cursor.next()),
SeekOp::LT | SeekOp::LE { eq_only: false } => return_if_io!(cursor.prev()),
_ => unreachable!("eq_only: true state must be unreachable"),
}
};
if !found {
state.pc = target_pc.as_offset_int()
} else {
state.pc += 1
}
return Ok(InsnFunctionStepResult::Step);
}
OpSeekState::MoveLast => {
{
let mut cursor = state.get_cursor(*cursor_id);
let cursor = cursor.as_btree_mut();
return_if_io!(cursor.last());
}
state.pc += 1;
return Ok(InsnFunctionStepResult::Step);
}
}
}
Ok(InsnFunctionStepResult::Step)
}
/// Returns the tie-breaker ordering for SQLite index comparison opcodes.
@@ -4700,6 +4775,7 @@ pub fn op_insert(
Register::Aggregate(..) => unreachable!("Cannot insert an aggregate value."),
};
// query planner must emit NewRowId/NotExists/etc op-codes which will properly reposition cursor
return_if_io!(cursor.insert(&BTreeKey::new_table_rowid(key, Some(record.as_ref())), true));
}
@@ -4795,16 +4871,16 @@ pub fn op_idx_delete(
let found = {
let mut cursor = state.get_cursor(*cursor_id);
let cursor = cursor.as_btree_mut();
let found = return_if_io!(
let seek_result = return_if_io!(
cursor.seek(SeekKey::IndexKey(record), SeekOp::GE { eq_only: true })
);
tracing::debug!(
"op_idx_delete: found={:?}, rootpage={}, key={:?}",
found,
seek_result,
cursor.root_page(),
record
);
found
matches!(seek_result, SeekResult::Found)
};
if !found {
@@ -5089,10 +5165,11 @@ pub fn op_new_rowid(
let exists = {
let mut cursor = state.get_cursor(*cursor);
let cursor = cursor.as_btree_mut();
return_if_io!(cursor.seek(
let seek_result = return_if_io!(cursor.seek(
SeekKey::TableRowId(*candidate),
SeekOp::GE { eq_only: true }
))
));
matches!(seek_result, SeekResult::Found)
};
if !exists {
@@ -5205,10 +5282,10 @@ pub fn op_no_conflict(
return Ok(InsnFunctionStepResult::Step);
}
let conflict =
let seek_result =
return_if_io!(cursor.seek(SeekKey::IndexKey(record), SeekOp::GE { eq_only: true }));
drop(cursor_ref);
if !conflict {
if !matches!(seek_result, SeekResult::Found) {
state.pc = target_pc.as_offset_int();
} else {
state.pc += 1;
@@ -6022,7 +6099,7 @@ pub fn op_found(
let not = matches!(insn, Insn::NotFound { .. });
let found = {
let seek_result = {
let mut cursor = state.get_cursor(*cursor_id);
let cursor = cursor.as_btree_mut();
@@ -6043,6 +6120,7 @@ pub fn op_found(
}
};
let found = matches!(seek_result, SeekResult::Found);
let do_jump = (!found && not) || (found && !not);
if do_jump {
state.pc = target_pc.as_offset_int();

View File

@@ -33,6 +33,7 @@ use crate::{
vdbe::execute::OpIdxInsertState,
vdbe::execute::OpInsertState,
vdbe::execute::OpNewRowidState,
vdbe::execute::OpSeekState,
RefValue,
};
@@ -257,6 +258,7 @@ pub struct ProgramState {
op_new_rowid_state: OpNewRowidState,
op_idx_insert_state: OpIdxInsertState,
op_insert_state: OpInsertState,
op_seek_state: OpSeekState,
}
impl ProgramState {
@@ -286,6 +288,7 @@ impl ProgramState {
op_new_rowid_state: OpNewRowidState::Start,
op_idx_insert_state: OpIdxInsertState::SeekIfUnique,
op_insert_state: OpInsertState::Insert,
op_seek_state: OpSeekState::Start,
}
}

View File

@@ -572,3 +572,48 @@ if {[info exists ::env(SQLITE_EXEC)] && ($::env(SQLITE_EXEC) eq "scripts/limbo-s
select * from t INTERSECT select * from u EXCEPT select * from v;
} {}
}
do_execsql_test_on_specific_db {:memory:} select-no-match-in-leaf-page {
CREATE TABLE t(a INTEGER PRIMARY KEY, b);
insert into t values (1, randomblob(1024));
insert into t values (2, randomblob(1024));
insert into t values (3, randomblob(1024));
insert into t values (4, randomblob(1024));
insert into t values (5, randomblob(1024));
insert into t values (6, randomblob(1024));
insert into t values (7, randomblob(1024));
insert into t values (8, randomblob(1024));
insert into t values (9, randomblob(1024));
insert into t values (10, randomblob(1024));
insert into t values (11, randomblob(1024));
insert into t values (12, randomblob(1024));
insert into t values (13, randomblob(1024));
insert into t values (14, randomblob(1024));
insert into t values (15, randomblob(1024));
insert into t values (16, randomblob(1024));
delete from t where a in (3, 6, 9, 12);
select count(*) from t where a >= 2;
select count(*) from t where a >= 3;
select count(*) from t where a >= 4;
select count(*) from t where a > 1;
select count(*) from t where a > 2;
select count(*) from t where a > 3;
select count(*) from t where a <= 3 ORDER BY a DESC;
select count(*) from t where a <= 4 ORDER BY a DESC;
select count(*) from t where a <= 5 ORDER BY a DESC;
select count(*) from t where a < 2 ORDER BY a DESC;
select count(*) from t where a < 3 ORDER BY a DESC;
select count(*) from t where a < 4 ORDER BY a DESC;
} {11
10
10
11
10
10
2
3
4
1
2
2}

View File

@@ -24,6 +24,7 @@ log = "0.4.22"
assert_cmd = "^2"
rand_chacha = "0.9.0"
rand = "0.9.0"
zerocopy = "0.8.26"
[dev-dependencies]
test-log = { version = "0.2.17", features = ["trace"] }

View File

@@ -1,2 +1,5 @@
mod test_btree;
mod test_read_path;
mod test_write_path;
mod test_multi_thread;

View File

@@ -0,0 +1,496 @@
use std::{
cell::RefCell,
collections::{HashMap, HashSet},
path::Path,
pin::Pin,
rc::Rc,
sync::Arc,
};
use rand::{seq::SliceRandom, RngCore, SeedableRng};
use rand_chacha::ChaCha8Rng;
use turso_core::{
Buffer, Completion, CompletionType, File, OpenFlags, PlatformIO, WriteCompletion, IO,
};
use zerocopy::big_endian::{U16, U32, U64};
use crate::common::{limbo_exec_rows, sqlite_exec_rows, TempDatabase};
#[derive(Debug, Eq, PartialEq)]
pub enum BTreePageType {
Interior,
Leaf,
}
#[derive(Debug)]
struct BTreeFreeBlock {
offset: u16,
size: u16,
}
#[derive(Debug)]
pub struct BTreeLeafCell {
size: usize,
rowid: u64,
on_page_data: Vec<u8>,
overflow_page: Option<Rc<BTreePageData>>,
}
#[derive(Debug)]
pub struct BTreeInteriorCell {
left_child_pointer: Rc<BTreePageData>,
rowid: u64,
}
#[derive(Debug)]
pub enum BTreeCell {
Interior(BTreeInteriorCell),
Leaf(BTreeLeafCell),
}
impl BTreeCell {
fn size(&self) -> u16 {
match self {
BTreeCell::Interior(cell) => 4 + length_varint(cell.rowid) as u16,
BTreeCell::Leaf(cell) => {
(length_varint(cell.size as u64)
+ length_varint(cell.rowid)
+ cell.on_page_data.len()
+ cell.overflow_page.as_ref().map(|_| 4).unwrap_or(0)) as u16
}
}
}
}
#[derive(Debug)]
pub struct BTreeOverflowPageData {
next: Option<Rc<BTreePageData>>,
payload: Vec<u8>,
}
#[derive(Debug)]
pub struct BTreeTablePageData {
page_type: BTreePageType,
cell_content_area: u16,
cell_right_pointer: Option<Rc<BTreePageData>>,
fragmented_free_bytes: u8,
cells: Vec<(u16, BTreeCell)>,
free_blocks: Vec<BTreeFreeBlock>,
}
#[derive(Debug)]
pub enum BTreePageData {
Table(BTreeTablePageData),
#[allow(dead_code)]
Overflow(BTreeOverflowPageData),
}
pub fn list_pages(root: &Rc<BTreePageData>, pages: &mut Vec<Rc<BTreePageData>>) {
pages.push(root.clone());
match root.as_ref() {
BTreePageData::Table(root) => {
for (_, cell) in &root.cells {
match cell {
BTreeCell::Interior(cell) => list_pages(&cell.left_child_pointer, pages),
BTreeCell::Leaf(cell) => {
let Some(overflow_page) = &cell.overflow_page else {
continue;
};
list_pages(overflow_page, pages);
}
}
}
if let Some(right) = &root.cell_right_pointer {
list_pages(right, pages);
}
}
BTreePageData::Overflow(root) => {
if let Some(next) = &root.next {
list_pages(next, pages);
}
}
}
}
pub fn write_varint(buf: &mut [u8], value: u64) -> usize {
if value <= 0x7f {
buf[0] = (value & 0x7f) as u8;
return 1;
}
if value <= 0x3fff {
buf[0] = (((value >> 7) & 0x7f) | 0x80) as u8;
buf[1] = (value & 0x7f) as u8;
return 2;
}
let mut value = value;
if (value & ((0xff000000_u64) << 32)) > 0 {
buf[8] = value as u8;
value >>= 8;
for i in (0..8).rev() {
buf[i] = ((value & 0x7f) | 0x80) as u8;
value >>= 7;
}
return 9;
}
let mut encoded: [u8; 10] = [0; 10];
let mut bytes = value;
let mut n = 0;
while bytes != 0 {
let v = 0x80 | (bytes & 0x7f);
encoded[n] = v as u8;
bytes >>= 7;
n += 1;
}
encoded[0] &= 0x7f;
for i in 0..n {
buf[i] = encoded[n - 1 - i];
}
n
}
pub fn length_varint(value: u64) -> usize {
let mut buf = [0u8; 10];
write_varint(&mut buf, value)
}
fn write_u64_column(header: &mut Vec<u8>, data: &mut Vec<u8>, value: u64) {
let mut buf = [0u8; 10];
let buf_len = write_varint(&mut buf, 6u64);
header.extend_from_slice(&buf[0..buf_len]);
data.extend_from_slice(&U64::new(value).to_bytes());
}
fn write_blob_column(header: &mut Vec<u8>, data: &mut Vec<u8>, value: &[u8]) {
let mut buf = [0u8; 10];
let buf_len = write_varint(&mut buf, (value.len() * 2 + 12) as u64);
header.extend_from_slice(&buf[0..buf_len]);
data.extend_from_slice(value);
}
fn create_simple_record(value: u64, payload: &[u8]) -> Vec<u8> {
let mut header = Vec::new();
let mut data = Vec::new();
write_u64_column(&mut header, &mut data, value);
write_blob_column(&mut header, &mut data, payload);
let header_len = header.len() + 1;
assert!(header_len <= 127);
let mut buf = [0u8; 10];
let buf_len = write_varint(&mut buf, header_len as u64);
let mut result = buf[0..buf_len].to_vec();
result.extend_from_slice(&header);
result.extend_from_slice(&data);
result
}
struct BTreeGenerator<'a> {
rng: &'a mut ChaCha8Rng,
max_interior_keys: usize,
max_leaf_keys: usize,
max_payload_size: usize,
}
impl BTreeGenerator<'_> {
pub fn create_page(
&self,
page: &BTreePageData,
page_numbers: &HashMap<*const BTreePageData, u32>,
) -> Vec<u8> {
match page {
BTreePageData::Table(page) => self.create_btree_page(page, page_numbers),
BTreePageData::Overflow(page) => self.create_overflow_page(page, page_numbers),
}
}
pub fn create_overflow_page(
&self,
page: &BTreeOverflowPageData,
page_numbers: &HashMap<*const BTreePageData, u32>,
) -> Vec<u8> {
let mut data = [255u8; 4096];
let first_4bytes = if let Some(next) = &page.next {
*page_numbers.get(&Rc::as_ptr(next)).unwrap()
} else {
0
};
data[0..4].copy_from_slice(&U32::new(first_4bytes).to_bytes());
data[4..4 + page.payload.len()].copy_from_slice(&page.payload);
data.to_vec()
}
pub fn create_btree_page(
&self,
page: &BTreeTablePageData,
page_numbers: &HashMap<*const BTreePageData, u32>,
) -> Vec<u8> {
let mut data = [255u8; 4096];
data[0] = match page.page_type {
BTreePageType::Interior => 0x05,
BTreePageType::Leaf => 0x0d,
};
data[1..3].copy_from_slice(
&U16::new(page.free_blocks.first().map(|x| x.offset).unwrap_or(0)).to_bytes(),
);
data[3..5].copy_from_slice(&U16::new(page.cells.len() as u16).to_bytes());
data[5..7].copy_from_slice(&U16::new(page.cell_content_area).to_bytes());
data[7] = page.fragmented_free_bytes;
let mut offset = 8;
if page.page_type == BTreePageType::Interior {
let cell_right_pointer = page.cell_right_pointer.as_ref().unwrap();
let cell_right_pointer = Rc::as_ptr(cell_right_pointer);
let cell_right_pointer = page_numbers.get(&cell_right_pointer).unwrap();
data[8..12].copy_from_slice(&U32::new(*cell_right_pointer).to_bytes());
offset = 12;
}
for (i, (pointer, _)) in page.cells.iter().enumerate() {
data[offset + 2 * i..offset + 2 * (i + 1)]
.copy_from_slice(&U16::new(*pointer).to_bytes());
}
for i in 0..page.free_blocks.len() {
let offset = page.free_blocks[i].offset as usize;
data[offset..offset + 2].copy_from_slice(
&U16::new(page.free_blocks.get(i + 1).map(|x| x.offset).unwrap_or(0)).to_bytes(),
);
data[offset + 2..offset + 4]
.copy_from_slice(&U16::new(page.free_blocks[i].size).to_bytes());
}
for (pointer, cell) in page.cells.iter() {
let mut p = *pointer as usize;
match cell {
BTreeCell::Interior(cell) => {
let left_child_pointer = Rc::as_ptr(&cell.left_child_pointer);
let left_child_pointer = page_numbers.get(&left_child_pointer).unwrap();
data[p..p + 4].copy_from_slice(&U32::new(*left_child_pointer).to_bytes());
p += 4;
_ = write_varint(&mut data[p..], cell.rowid);
}
BTreeCell::Leaf(cell) => {
p += write_varint(&mut data[p..], cell.size as u64);
p += write_varint(&mut data[p..], cell.rowid);
data[p..p + cell.on_page_data.len()].copy_from_slice(&cell.on_page_data);
p += cell.on_page_data.len();
if let Some(overflow_page) = &cell.overflow_page {
let overflow_page = Rc::as_ptr(overflow_page);
let overflow_page = page_numbers.get(&overflow_page).unwrap();
data[p..p + 4].copy_from_slice(&U32::new(*overflow_page).to_bytes());
}
}
}
}
data.into()
}
fn generate_btree(&mut self, depth: usize, mut l: u64, r: u64) -> Rc<BTreePageData> {
let mut cells = vec![];
let cells_max_limit = if depth == 0 {
self.max_leaf_keys
} else {
self.max_interior_keys
};
let cells_limit = self.rng.next_u32() as usize % cells_max_limit + 1;
let mut rowids = HashSet::new();
for _ in 0..cells_limit {
let rowid = l + self.rng.next_u64() % (r - l + 1);
if rowids.contains(&rowid) {
continue;
}
rowids.insert(rowid);
}
let mut rowids = rowids.into_iter().collect::<Vec<_>>();
rowids.sort();
let header_offset = if depth == 0 { 8 } else { 12 };
let mut it = 0;
let mut cells_size = header_offset;
while cells.len() < cells_limit && it < rowids.len() {
let rowid = rowids[it];
it += 1;
let cell = if depth == 0 {
let length = self.rng.next_u32() as usize % self.max_payload_size;
let record = create_simple_record(rowid, &vec![1u8; length]);
BTreeCell::Leaf(BTreeLeafCell {
size: record.len(),
rowid,
on_page_data: record,
overflow_page: None,
})
} else {
BTreeCell::Interior(BTreeInteriorCell {
left_child_pointer: self.generate_btree(depth - 1, l, rowid),
rowid,
})
};
if cells_size + 2 + cell.size() > 4096 {
break;
}
cells_size += 2 + cell.size();
cells.push((rowid, cell));
if depth > 0 {
l = rowid + 1;
}
}
cells.shuffle(&mut self.rng);
let mut cells_with_offset = Vec::new();
let mut fragmentation_budget = 4096 - cells_size;
let mut pointer_offset = header_offset;
let mut content_offset = 4096;
let mut fragmented_free_bytes = 0;
let mut free_blocks = vec![];
for (rowid, cell) in cells {
let mut fragmentation = ((self.rng.next_u32() % 4) as u16).min(fragmentation_budget);
if fragmented_free_bytes + fragmentation > 60 {
fragmentation = 0;
}
let mut free_block_size = 0;
if fragmentation == 0 && fragmentation_budget >= 4 {
free_block_size = 4 + self.rng.next_u32() as u16 % (fragmentation_budget - 3);
}
let cell_size = cell.size() + fragmentation.max(free_block_size);
assert!(pointer_offset + 2 + cell_size <= content_offset);
pointer_offset += 2;
content_offset -= cell_size;
fragmented_free_bytes += fragmentation;
fragmentation_budget -= fragmentation.max(free_block_size);
if free_block_size > 0 {
free_blocks.push(BTreeFreeBlock {
offset: content_offset + cell.size(),
size: free_block_size,
});
}
cells_with_offset.push((rowid, content_offset, cell));
}
cells_with_offset.sort_by_key(|(rowid, ..)| *rowid);
let cells = cells_with_offset
.into_iter()
.map(|(_, offset, cell)| (offset, cell))
.collect::<Vec<_>>();
free_blocks.sort_by_key(|x| x.offset);
if depth == 0 {
Rc::new(BTreePageData::Table(BTreeTablePageData {
page_type: BTreePageType::Leaf,
cell_content_area: content_offset,
cell_right_pointer: None,
fragmented_free_bytes: fragmented_free_bytes as u8,
cells,
free_blocks,
}))
} else {
Rc::new(BTreePageData::Table(BTreeTablePageData {
page_type: BTreePageType::Interior,
cell_content_area: content_offset,
cell_right_pointer: if l <= r {
Some(self.generate_btree(depth - 1, l, r))
} else {
None
},
fragmented_free_bytes: fragmented_free_bytes as u8,
cells,
free_blocks,
}))
}
}
fn write_btree(&mut self, path: &Path, root: &Rc<BTreePageData>, start_page: u32) {
let mut pages = Vec::new();
list_pages(root, &mut pages);
pages[1..].shuffle(&mut self.rng);
let mut page_numbers = HashMap::new();
for (page, page_no) in pages.iter().zip(start_page..) {
page_numbers.insert(Rc::as_ptr(page), page_no);
}
let io = PlatformIO::new().unwrap();
let file = io
.open_file(path.to_str().unwrap(), OpenFlags::None, true)
.unwrap();
assert_eq!(file.size().unwrap(), 4096 * 2);
for (i, page) in pages.iter().enumerate() {
let page = self.create_page(page, &page_numbers);
write_at(&io, file.clone(), 4096 * (i + 1), &page);
}
let size = 1 + pages.len();
let size_bytes = U32::new(size as u32).to_bytes();
write_at(&io, file, 28, &size_bytes);
}
}
fn write_at(io: &impl IO, file: Arc<dyn File>, offset: usize, data: &[u8]) {
let completion = Completion::new(CompletionType::Write(WriteCompletion::new(Box::new(
|_| {},
))));
let drop_fn = Rc::new(move |_| {});
#[allow(clippy::arc_with_non_send_sync)]
let buffer = Arc::new(RefCell::new(Buffer::new(Pin::new(data.to_vec()), drop_fn)));
let result = file.pwrite(offset, buffer, completion).unwrap();
while !result.is_completed() {
io.run_once().unwrap();
}
}
#[test]
fn test_btree() {
let _ = env_logger::try_init();
let mut rng = ChaCha8Rng::seed_from_u64(0);
for depth in 0..4 {
for attempt in 0..16 {
let db = TempDatabase::new_with_rusqlite(
"create table test (k INTEGER PRIMARY KEY, b BLOB);",
false,
);
log::info!(
"depth: {}, attempt: {}, path: {:?}",
depth,
attempt,
db.path
);
let mut generator = BTreeGenerator {
rng: &mut rng,
max_interior_keys: 3,
max_leaf_keys: 4096,
max_payload_size: 100,
};
let root = generator.generate_btree(depth, 0, i64::MAX as u64);
generator.write_btree(&db.path, &root, 2);
for _ in 0..16 {
let mut l = rng.next_u64() % (i64::MAX as u64);
let mut r = rng.next_u64() % (i64::MAX as u64);
if l > r {
(l, r) = (r, l);
}
let query = format!("SELECT SUM(LENGTH(b)) FROM test WHERE k >= {l} AND k <= {r}");
let sqlite_sum = {
let conn = rusqlite::Connection::open(&db.path).unwrap();
sqlite_exec_rows(&conn, &query)
};
let limbo_sum = {
let conn = db.connect_limbo();
limbo_exec_rows(&db, &conn, &query)
};
assert_eq!(
limbo_sum, sqlite_sum,
"query={}, limbo={:?}, sqlite={:?}",
query, limbo_sum, sqlite_sum
);
}
}
}
}

View File

@@ -0,0 +1,27 @@
use crate::common::TempDatabase;
#[test]
fn test_schema_change() {
let tmp_db = TempDatabase::new_empty(false);
let conn1 = tmp_db.connect_limbo();
conn1.execute("CREATE TABLE t(x, y, z)").unwrap();
conn1
.execute("INSERT INTO t VALUES (1, 2, 3), (10, 20, 30)")
.unwrap();
let conn2 = tmp_db.connect_limbo();
let mut stmt = conn2.prepare("SELECT x, z FROM t").unwrap();
conn1.execute("ALTER TABLE t DROP COLUMN x").unwrap();
let row = loop {
match stmt.step() {
Ok(turso_core::StepResult::Row) => {
let row = stmt.row().unwrap();
break row;
}
Ok(turso_core::StepResult::IO) => {
stmt.run_once().unwrap();
}
_ => panic!("unexpected step result"),
}
};
println!("{:?} {:?}", row.get_value(0), row.get_value(1));
}