From 77bf6c287dec1b265850aaa748a41543f56d80f7 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 12 Jul 2025 19:36:17 +0400 Subject: [PATCH 01/12] introduce proper state machine for seek op code --- core/vdbe/execute.rs | 231 ++++++++++++++++++++++++------------------- core/vdbe/mod.rs | 3 + 2 files changed, 133 insertions(+), 101 deletions(-) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index ddd15bd55..30a0fa26c 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -2392,12 +2392,32 @@ pub fn op_deferred_seek( Ok(InsnFunctionStepResult::Step) } +pub enum OpSeekState { + Start, + Seek { rowid: i64, op: SeekOp }, + MoveLast, +} + pub fn op_seek( program: &Program, state: &mut ProgramState, insn: &Insn, pager: &Rc, mv_store: Option<&Rc>, +) -> Result { + let result = op_seek_internal(program, state, insn, pager, mv_store); + if let Err(_) | Ok(InsnFunctionStepResult::Step) = &result { + state.op_seek_state = OpSeekState::Start; + } + result +} + +pub fn op_seek_internal( + program: &Program, + state: &mut ProgramState, + insn: &Insn, + pager: &Rc, + mv_store: Option<&Rc>, ) -> Result { let (Insn::SeekGE { cursor_id, @@ -2447,12 +2467,7 @@ pub fn op_seek( Insn::SeekLT { .. } => SeekOp::LT, _ => unreachable!("unexpected Insn {:?}", insn), }; - let op_name = match op { - SeekOp::GE { .. } => "SeekGE", - SeekOp::GT => "SeekGT", - SeekOp::LE { .. } => "SeekLE", - SeekOp::LT => "SeekLT", - }; + // todo (sivukhin): fix index too if *is_index { let found = { let mut cursor = state.get_cursor(*cursor_id); @@ -2465,102 +2480,116 @@ pub fn op_seek( } else { state.pc += 1; } - } else { - let pc = { - let original_value = state.registers[*start_reg].get_owned_value().clone(); - let mut temp_value = original_value.clone(); - - let conversion_successful = if matches!(temp_value, Value::Text(_)) { - let mut temp_reg = Register::Value(temp_value); - let converted = apply_numeric_affinity(&mut temp_reg, false); - temp_value = temp_reg.get_owned_value().clone(); - converted - } else { - true // Non-text values don't need conversion - }; - - let int_key = extract_int_value(&temp_value); - let lost_precision = !conversion_successful || !matches!(temp_value, Value::Integer(_)); - let actual_op = if lost_precision { - match &temp_value { - Value::Float(f) => { - let int_key_as_float = int_key as f64; - let c = if int_key_as_float > *f { - 1 - } else if int_key_as_float < *f { - -1 - } else { - 0 - }; - - match c.cmp(&0) { - std::cmp::Ordering::Less => match op { - SeekOp::LT => SeekOp::LE { eq_only: false }, // (x < 5.1) -> (x <= 5) - SeekOp::GE { .. } => SeekOp::GT, // (x >= 5.1) -> (x > 5) - other => other, - }, - std::cmp::Ordering::Greater => match op { - SeekOp::GT => SeekOp::GE { eq_only: false }, // (x > 4.9) -> (x >= 5) - SeekOp::LE { .. } => SeekOp::LT, // (x <= 4.9) -> (x < 5) - other => other, - }, - std::cmp::Ordering::Equal => op, - } - } - Value::Text(_) | Value::Blob(_) => { - match op { - SeekOp::GT | SeekOp::GE { .. } => { - // No integers are > or >= non-numeric text, jump to target (empty result) - state.pc = target_pc.as_offset_int(); - return Ok(InsnFunctionStepResult::Step); - } - SeekOp::LT | SeekOp::LE { .. } => { - // All integers are < or <= non-numeric text - // Move to last position and then use the normal seek logic - { - let mut cursor = state.get_cursor(*cursor_id); - let cursor = cursor.as_btree_mut(); - return_if_io!(cursor.last()); - } - state.pc += 1; - return Ok(InsnFunctionStepResult::Step); - } - } - } - _ => op, - } - } else { - op - }; - - let rowid = if matches!(original_value, Value::Null) { - match actual_op { - SeekOp::GE { .. } | SeekOp::GT => { - state.pc = target_pc.as_offset_int(); - return Ok(InsnFunctionStepResult::Step); - } - SeekOp::LE { .. } | SeekOp::LT => { - // No integers are < NULL, so jump to target - state.pc = target_pc.as_offset_int(); - return Ok(InsnFunctionStepResult::Step); - } - } - } else { - int_key - }; - let mut cursor = state.get_cursor(*cursor_id); - let cursor = cursor.as_btree_mut(); - let found = return_if_io!(cursor.seek(SeekKey::TableRowId(rowid), actual_op)); - - if !found { - target_pc.as_offset_int() - } else { - state.pc + 1 - } - }; - state.pc = pc; + return Ok(InsnFunctionStepResult::Step); + } + loop { + match &state.op_seek_state { + OpSeekState::Start => { + let original_value = state.registers[*start_reg].get_owned_value().clone(); + let mut temp_value = original_value.clone(); + + let conversion_successful = if matches!(temp_value, Value::Text(_)) { + let mut temp_reg = Register::Value(temp_value); + let converted = apply_numeric_affinity(&mut temp_reg, false); + temp_value = temp_reg.get_owned_value().clone(); + converted + } else { + true // Non-text values don't need conversion + }; + let int_key = extract_int_value(&temp_value); + let lost_precision = + !conversion_successful || !matches!(temp_value, Value::Integer(_)); + let actual_op = if lost_precision { + match &temp_value { + Value::Float(f) => { + let int_key_as_float = int_key as f64; + let c = if int_key_as_float > *f { + 1 + } else if int_key_as_float < *f { + -1 + } else { + 0 + }; + + match c.cmp(&0) { + std::cmp::Ordering::Less => match op { + SeekOp::LT => SeekOp::LE { eq_only: false }, // (x < 5.1) -> (x <= 5) + SeekOp::GE { .. } => SeekOp::GT, // (x >= 5.1) -> (x > 5) + other => other, + }, + std::cmp::Ordering::Greater => match op { + SeekOp::GT => SeekOp::GE { eq_only: false }, // (x > 4.9) -> (x >= 5) + SeekOp::LE { .. } => SeekOp::LT, // (x <= 4.9) -> (x < 5) + other => other, + }, + std::cmp::Ordering::Equal => op, + } + } + Value::Text(_) | Value::Blob(_) => { + match op { + SeekOp::GT | SeekOp::GE { .. } => { + // No integers are > or >= non-numeric text, jump to target (empty result) + state.pc = target_pc.as_offset_int(); + return Ok(InsnFunctionStepResult::Step); + } + SeekOp::LT | SeekOp::LE { .. } => { + // All integers are < or <= non-numeric text + // Move to last position and then use the normal seek logic + state.op_seek_state = OpSeekState::MoveLast; + continue; + } + } + } + _ => op, + } + } else { + op + }; + let rowid = if matches!(original_value, Value::Null) { + match actual_op { + SeekOp::GE { .. } | SeekOp::GT => { + state.pc = target_pc.as_offset_int(); + return Ok(InsnFunctionStepResult::Step); + } + SeekOp::LE { .. } | SeekOp::LT => { + // No integers are < NULL, so jump to target + state.pc = target_pc.as_offset_int(); + return Ok(InsnFunctionStepResult::Step); + } + } + } else { + int_key + }; + state.op_seek_state = OpSeekState::Seek { + rowid, + op: actual_op, + }; + continue; + } + OpSeekState::Seek { rowid, op } => { + let found = { + let mut cursor = state.get_cursor(*cursor_id); + let cursor = cursor.as_btree_mut(); + return_if_io!(cursor.seek(SeekKey::TableRowId(*rowid), *op)) + }; + if !found { + state.pc = target_pc.as_offset_int() + } else { + state.pc += 1 + } + return Ok(InsnFunctionStepResult::Step); + } + OpSeekState::MoveLast => { + { + let mut cursor = state.get_cursor(*cursor_id); + let cursor = cursor.as_btree_mut(); + return_if_io!(cursor.last()); + } + state.pc += 1; + return Ok(InsnFunctionStepResult::Step); + } + } } - Ok(InsnFunctionStepResult::Step) } /// Returns the tie-breaker ordering for SQLite index comparison opcodes. diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 338f97646..d79e88845 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -33,6 +33,7 @@ use crate::{ vdbe::execute::OpIdxInsertState, vdbe::execute::OpInsertState, vdbe::execute::OpNewRowidState, + vdbe::execute::OpSeekState, RefValue, }; @@ -257,6 +258,7 @@ pub struct ProgramState { op_new_rowid_state: OpNewRowidState, op_idx_insert_state: OpIdxInsertState, op_insert_state: OpInsertState, + op_seek_state: OpSeekState, } impl ProgramState { @@ -286,6 +288,7 @@ impl ProgramState { op_new_rowid_state: OpNewRowidState::Start, op_idx_insert_state: OpIdxInsertState::SeekIfUnique, op_insert_state: OpInsertState::Insert, + op_seek_state: OpSeekState::Start, } } From 03b2725cc7f1b9c4ee02611119439c2ef6da18d7 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 12 Jul 2025 20:09:19 +0400 Subject: [PATCH 02/12] return SeekResult from seek operation - Apart from regular states Found/NotFound seek result has TryAdvance value which tells caller to advance the cursor in necessary direction because the leaf page which would hold the entry if it was present actually has no matching entry (but neighbouring page can have match) --- core/storage/btree.rs | 59 ++++++++++++++++++++++++++++++------------- core/types.rs | 7 +++++ core/vdbe/execute.rs | 25 +++++++++--------- 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index ef49158ea..a58893de4 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -15,7 +15,7 @@ use crate::{ turso_assert, types::{ find_compare, get_tie_breaker_from_seek_op, IndexKeyInfo, IndexKeySortOrder, - ParseRecordState, RecordCompare, RecordCursor, + ParseRecordState, RecordCompare, RecordCursor, SeekResult, }, MvCursor, }; @@ -1285,13 +1285,21 @@ impl BTreeCursor { /// This may be used to seek to a specific record in a point query (e.g. SELECT * FROM table WHERE col = 10) /// or e.g. find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10). /// We don't include the rowid in the comparison and that's why the last value from the record is not included. - fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { + fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { let ret = return_if_io!(match key { SeekKey::TableRowId(rowid) => { self.tablebtree_seek(rowid, op) } SeekKey::IndexKey(index_key) => { - self.indexbtree_seek(index_key, op) + match self.indexbtree_seek(index_key, op) { + Ok(CursorResult::Ok(found)) => Ok(CursorResult::Ok(if found { + SeekResult::Found + } else { + SeekResult::NotFound + })), + Ok(CursorResult::IO) => Ok(CursorResult::IO), + Err(err) => Err(err), + } } }); self.valid_state = CursorValidState::Valid; @@ -1677,7 +1685,7 @@ impl BTreeCursor { /// Specialized version of do_seek() for table btrees that uses binary search instead /// of iterating cells in order. #[instrument(skip_all, level = Level::INFO)] - fn tablebtree_seek(&mut self, rowid: i64, seek_op: SeekOp) -> Result> { + fn tablebtree_seek(&mut self, rowid: i64, seek_op: SeekOp) -> Result> { turso_assert!( self.mv_cursor.is_none(), "attempting to seek with MV cursor" @@ -1704,7 +1712,7 @@ impl BTreeCursor { let cell_count = contents.cell_count(); if cell_count == 0 { self.stack.set_cell_index(0); - return Ok(CursorResult::Ok(false)); + return Ok(CursorResult::Ok(SeekResult::NotFound)); } let min_cell_idx = Cell::new(0); let max_cell_idx = Cell::new(cell_count as isize - 1); @@ -1743,9 +1751,21 @@ impl BTreeCursor { if min > max { if let Some(nearest_matching_cell) = nearest_matching_cell.get() { self.stack.set_cell_index(nearest_matching_cell as i32); - return Ok(CursorResult::Ok(true)); + return Ok(CursorResult::Ok(SeekResult::Found)); } else { - return Ok(CursorResult::Ok(false)); + let eq_only = match &seek_op { + SeekOp::GE { eq_only } | SeekOp::LE { eq_only } => *eq_only, + SeekOp::LT | SeekOp::GT => false, + }; + // if !eq_only - matching entry can exist in neighbour leaf page + // this can happen if key in the interiour page was deleted - but divider kept untouched + // in such case BTree can navigate to the leaf which no longer has matching key for seek_op + // in this case, caller must advance cursor if necessary + return Ok(CursorResult::Ok(if eq_only { + SeekResult::NotFound + } else { + SeekResult::TryAdvance + })); }; } @@ -1766,7 +1786,7 @@ impl BTreeCursor { // rowids are unique, so we can return the rowid immediately if found && seek_op.eq_only() { self.stack.set_cell_index(cur_cell_idx as i32); - return Ok(CursorResult::Ok(true)); + return Ok(CursorResult::Ok(SeekResult::Found)); } if found { @@ -4134,7 +4154,7 @@ impl BTreeCursor { } #[instrument(skip(self), level = Level::INFO)] - pub fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { + pub fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { assert!(self.mv_cursor.is_none()); // Empty trace to capture the span information tracing::trace!(""); @@ -4142,13 +4162,14 @@ impl BTreeCursor { // because it might have been set to false by an unmatched left-join row during the previous iteration // on the outer loop. self.set_null_flag(false); - let cursor_has_record = return_if_io!(self.do_seek(key, op)); + let seek_result = return_if_io!(self.do_seek(key, op)); self.invalidate_record(); // Reset seek state self.seek_state = CursorSeekState::Start; self.valid_state = CursorValidState::Valid; - self.has_record.replace(cursor_has_record); - Ok(CursorResult::Ok(cursor_has_record)) + self.has_record + .replace(matches!(seek_result, SeekResult::Found)); + Ok(CursorResult::Ok(seek_result)) } /// Return a reference to the record the cursor is currently pointing to. @@ -4649,8 +4670,9 @@ impl BTreeCursor { Value::Integer(i) => i, _ => unreachable!("btree tables are indexed by integers!"), }; - let has_record = + let seek_result = return_if_io!(self.seek(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true })); + let has_record = matches!(seek_result, SeekResult::Found); self.has_record.set(has_record); self.invalidate_record(); Ok(CursorResult::Ok(has_record)) @@ -6892,7 +6914,7 @@ mod tests { assert!( matches!( cursor.seek(seek_key, SeekOp::GE { eq_only: true }).unwrap(), - CursorResult::Ok(true) + CursorResult::Ok(SeekResult::Found) ), "key {key} is not found" ); @@ -7135,7 +7157,10 @@ mod tests { pager.deref(), ) .unwrap(); - assert!(exists, "key {key:?} is not found"); + assert!( + matches!(exists, SeekResult::Found), + "key {key:?} is not found" + ); } // Check that key count is right cursor.move_to_root().unwrap(); @@ -8257,13 +8282,13 @@ mod tests { let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns); let seek_key = SeekKey::TableRowId(i); - let found = run_until_done( + let seek_result = run_until_done( || cursor.seek(seek_key.clone(), SeekOp::GE { eq_only: true }), pager.deref(), ) .unwrap(); - if found { + if matches!(seek_result, SeekResult::Found) { run_until_done(|| cursor.delete(), pager.deref()).unwrap(); } } diff --git a/core/types.rs b/core/types.rs index df477cfdd..b018852ff 100644 --- a/core/types.rs +++ b/core/types.rs @@ -2291,6 +2291,13 @@ pub enum CursorResult { IO, } +#[derive(Debug)] +pub enum SeekResult { + Found, + NotFound, + TryAdvance, +} + #[derive(Clone, Copy, PartialEq, Eq, Debug)] /// The match condition of a table/index seek. pub enum SeekOp { diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 30a0fa26c..f6cd37498 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -10,7 +10,7 @@ use crate::storage::wal::DummyWAL; use crate::storage::{self, header_accessor}; use crate::translate::collate::CollationSeq; use crate::types::{ - compare_immutable, compare_records_generic, ImmutableRecord, Text, TextSubtype, + compare_immutable, compare_records_generic, ImmutableRecord, SeekResult, Text, TextSubtype, }; use crate::util::normalize_ident; use crate::vdbe::registers_to_ref_values; @@ -2357,10 +2357,10 @@ pub fn op_seek_rowid( match rowid { Some(rowid) => { - let found = return_if_io!( + let seek_result = return_if_io!( cursor.seek(SeekKey::TableRowId(rowid), SeekOp::GE { eq_only: true }) ); - if !found { + if !matches!(seek_result, SeekResult::Found) { target_pc.as_offset_int() } else { state.pc + 1 @@ -2469,13 +2469,13 @@ pub fn op_seek_internal( }; // todo (sivukhin): fix index too if *is_index { - let found = { + let seek_result = { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); return_if_io!(cursor.seek(SeekKey::IndexKey(&record_from_regs), op)) }; - if !found { + if !matches!(seek_result, SeekResult::Found) { state.pc = target_pc.as_offset_int(); } else { state.pc += 1; @@ -2567,12 +2567,12 @@ pub fn op_seek_internal( continue; } OpSeekState::Seek { rowid, op } => { - let found = { + let seek_result = { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); return_if_io!(cursor.seek(SeekKey::TableRowId(*rowid), *op)) }; - if !found { + if !matches!(seek_result, SeekResult::Found) { state.pc = target_pc.as_offset_int() } else { state.pc += 1 @@ -4824,12 +4824,12 @@ pub fn op_idx_delete( let found = { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); - let found = return_if_io!( + let seek_result = return_if_io!( cursor.seek(SeekKey::IndexKey(record), SeekOp::GE { eq_only: true }) ); tracing::debug!( "op_idx_delete: found={:?}, rootpage={}, key={:?}", - found, + seek_result, cursor.root_page(), record ); @@ -5234,10 +5234,10 @@ pub fn op_no_conflict( return Ok(InsnFunctionStepResult::Step); } - let conflict = + let seek_result = return_if_io!(cursor.seek(SeekKey::IndexKey(record), SeekOp::GE { eq_only: true })); drop(cursor_ref); - if !conflict { + if !matches!(seek_result, SeekResult::Found) { state.pc = target_pc.as_offset_int(); } else { state.pc += 1; @@ -6051,7 +6051,7 @@ pub fn op_found( let not = matches!(insn, Insn::NotFound { .. }); - let found = { + let seek_result = { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); @@ -6072,6 +6072,7 @@ pub fn op_found( } }; + let found = matches!(seek_result, SeekResult::Found); let do_jump = (!found && not) || (found && !not); if do_jump { state.pc = target_pc.as_offset_int(); From fc400906d521f5406fc0c7b3f2ee8ef3cb8396e4 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 12 Jul 2025 21:06:42 +0400 Subject: [PATCH 03/12] handle case when target seek page has no matching entries --- core/storage/btree.rs | 11 +++++++++++ core/vdbe/execute.rs | 30 ++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index a58893de4..6cc0addf8 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1764,6 +1764,17 @@ impl BTreeCursor { return Ok(CursorResult::Ok(if eq_only { SeekResult::NotFound } else { + let contents = page.get().contents.as_ref().unwrap(); + turso_assert!( + contents.is_leaf(), + "tablebtree_seek() called on non-leaf page" + ); + let cell_count = contents.cell_count(); + // set cursor to the position where which would hold the op-boundary if it were present + self.stack.set_cell_index(match &seek_op { + SeekOp::GT | SeekOp::GE { .. } => cell_count as i32, + SeekOp::LT | SeekOp::LE { .. } => 0, + }); SeekResult::TryAdvance })); }; diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index f6cd37498..08a0095be 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -2395,6 +2395,7 @@ pub fn op_deferred_seek( pub enum OpSeekState { Start, Seek { rowid: i64, op: SeekOp }, + Advance { op: SeekOp }, MoveLast, } @@ -2572,7 +2573,32 @@ pub fn op_seek_internal( let cursor = cursor.as_btree_mut(); return_if_io!(cursor.seek(SeekKey::TableRowId(*rowid), *op)) }; - if !matches!(seek_result, SeekResult::Found) { + let found = match seek_result { + SeekResult::Found => true, + SeekResult::NotFound => false, + SeekResult::TryAdvance => { + state.op_seek_state = OpSeekState::Advance { op: *op }; + continue; + } + }; + if !found { + state.pc = target_pc.as_offset_int() + } else { + state.pc += 1 + } + return Ok(InsnFunctionStepResult::Step); + } + OpSeekState::Advance { op } => { + let found = { + let mut cursor = state.get_cursor(*cursor_id); + let cursor = cursor.as_btree_mut(); + match op { + SeekOp::GT | SeekOp::GE { eq_only: false } => return_if_io!(cursor.next()), + SeekOp::LT | SeekOp::LE { eq_only: false } => return_if_io!(cursor.prev()), + _ => unreachable!("eq_only: true state must be unreachable"), + } + }; + if !found { state.pc = target_pc.as_offset_int() } else { state.pc += 1 @@ -4833,7 +4859,7 @@ pub fn op_idx_delete( cursor.root_page(), record ); - found + matches!(seek_result, SeekResult::Found) }; if !found { From f9cd5fad4cfeaf9c0303aabad7e5e167a1b4871d Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 12 Jul 2025 21:06:54 +0400 Subject: [PATCH 04/12] add small comment --- core/vdbe/execute.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 08a0095be..a04c489be 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -4755,6 +4755,7 @@ pub fn op_insert( Register::Aggregate(..) => unreachable!("Cannot insert an aggregate value."), }; + // query planner must emit NewRowId/NotExists/etc op-codes which will properly reposition cursor return_if_io!(cursor.insert(&BTreeKey::new_table_rowid(key, Some(record.as_ref())), true)); } From 6e2ccdff206623bf40ae90516420733cf4d80b17 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sun, 6 Jul 2025 21:13:32 +0400 Subject: [PATCH 05/12] add btree fuzz tests which generate seed file from scratch --- Cargo.lock | 15 +- core/storage/btree.rs | 1 - tests/Cargo.toml | 1 + tests/integration/query_processing/mod.rs | 3 + .../query_processing/test_btree.rs | 479 ++++++++++++++++++ .../query_processing/test_multi_thread.rs | 27 + 6 files changed, 518 insertions(+), 8 deletions(-) create mode 100644 tests/integration/query_processing/test_btree.rs create mode 100644 tests/integration/query_processing/test_multi_thread.rs diff --git a/Cargo.lock b/Cargo.lock index d731ec210..dd099b787 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -586,6 +586,7 @@ dependencies = [ "tracing", "tracing-subscriber", "turso_core", + "zerocopy 0.8.26", ] [[package]] @@ -2590,7 +2591,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ - "zerocopy 0.8.24", + "zerocopy 0.8.26", ] [[package]] @@ -2808,7 +2809,7 @@ checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.3", - "zerocopy 0.8.24", + "zerocopy 0.8.26", ] [[package]] @@ -4538,11 +4539,11 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.24" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" +checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" dependencies = [ - "zerocopy-derive 0.8.24", + "zerocopy-derive 0.8.26", ] [[package]] @@ -4558,9 +4559,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.8.24" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" +checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" dependencies = [ "proc-macro2", "quote", diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 6cc0addf8..838d008a9 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -7378,7 +7378,6 @@ mod tests { } #[test] - #[ignore] pub fn test_clear_overflow_pages() -> Result<()> { let pager = setup_test_env(5); let num_columns = 5; diff --git a/tests/Cargo.toml b/tests/Cargo.toml index a891145f2..c50c69e40 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -24,6 +24,7 @@ log = "0.4.22" assert_cmd = "^2" rand_chacha = "0.9.0" rand = "0.9.0" +zerocopy = "0.8.26" [dev-dependencies] test-log = { version = "0.2.17", features = ["trace"] } diff --git a/tests/integration/query_processing/mod.rs b/tests/integration/query_processing/mod.rs index 71309fe5a..3c93c3bb0 100644 --- a/tests/integration/query_processing/mod.rs +++ b/tests/integration/query_processing/mod.rs @@ -1,2 +1,5 @@ +mod test_btree; mod test_read_path; mod test_write_path; + +mod test_multi_thread; diff --git a/tests/integration/query_processing/test_btree.rs b/tests/integration/query_processing/test_btree.rs new file mode 100644 index 000000000..22881d724 --- /dev/null +++ b/tests/integration/query_processing/test_btree.rs @@ -0,0 +1,479 @@ +use std::{ + collections::{HashMap, HashSet}, + i64, + os::unix::fs::FileExt, + path::Path, + rc::Rc, +}; + +use rand::{seq::SliceRandom, RngCore, SeedableRng}; +use rand_chacha::ChaCha8Rng; +use zerocopy::big_endian::{U16, U32, U64}; + +use crate::common::{limbo_exec_rows, sqlite_exec_rows, TempDatabase}; + +#[derive(Debug, Eq, PartialEq)] +pub enum BTreePageType { + Interior, + Leaf, +} + +#[derive(Debug)] +struct BTreeFreeBlock { + offset: u16, + size: u16, +} + +#[derive(Debug)] +pub struct BTreeLeafCell { + size: usize, + rowid: u64, + on_page_data: Vec, + overflow_page: Option>, +} +#[derive(Debug)] +pub struct BTreeInteriorCell { + left_child_pointer: Rc, + rowid: u64, +} + +#[derive(Debug)] +pub enum BTreeCell { + Interior(BTreeInteriorCell), + Leaf(BTreeLeafCell), +} + +impl BTreeCell { + fn size(&self) -> u16 { + match self { + BTreeCell::Interior(cell) => 4 + length_varint(cell.rowid) as u16, + BTreeCell::Leaf(cell) => { + (length_varint(cell.size as u64) + + length_varint(cell.rowid) + + cell.on_page_data.len() + + cell.overflow_page.as_ref().map(|_| 4).unwrap_or(0)) as u16 + } + } + } +} + +#[derive(Debug)] +pub struct BTreeOverflowPageData { + next: Option>, + payload: Vec, +} + +#[derive(Debug)] +pub struct BTreeTablePageData { + page_type: BTreePageType, + cell_content_area: u16, + cell_right_pointer: Option>, + fragmented_free_bytes: u8, + cells: Vec<(u16, BTreeCell)>, + free_blocks: Vec, +} + +#[derive(Debug)] +pub enum BTreePageData { + Table(BTreeTablePageData), + Overflow(BTreeOverflowPageData), +} + +pub fn list_pages(root: &Rc, pages: &mut Vec>) { + pages.push(root.clone()); + match root.as_ref() { + BTreePageData::Table(root) => { + for (_, cell) in &root.cells { + match cell { + BTreeCell::Interior(cell) => list_pages(&cell.left_child_pointer, pages), + BTreeCell::Leaf(cell) => { + let Some(overflow_page) = &cell.overflow_page else { + continue; + }; + list_pages(&overflow_page, pages); + } + } + } + if let Some(right) = &root.cell_right_pointer { + list_pages(right, pages); + } + } + BTreePageData::Overflow(root) => { + if let Some(next) = &root.next { + list_pages(next, pages); + } + } + } +} + +pub fn write_varint(buf: &mut [u8], value: u64) -> usize { + if value <= 0x7f { + buf[0] = (value & 0x7f) as u8; + return 1; + } + + if value <= 0x3fff { + buf[0] = (((value >> 7) & 0x7f) | 0x80) as u8; + buf[1] = (value & 0x7f) as u8; + return 2; + } + + let mut value = value; + if (value & ((0xff000000_u64) << 32)) > 0 { + buf[8] = value as u8; + value >>= 8; + for i in (0..8).rev() { + buf[i] = ((value & 0x7f) | 0x80) as u8; + value >>= 7; + } + return 9; + } + + let mut encoded: [u8; 10] = [0; 10]; + let mut bytes = value; + let mut n = 0; + while bytes != 0 { + let v = 0x80 | (bytes & 0x7f); + encoded[n] = v as u8; + bytes >>= 7; + n += 1; + } + encoded[0] &= 0x7f; + for i in 0..n { + buf[i] = encoded[n - 1 - i]; + } + n +} + +pub fn length_varint(value: u64) -> usize { + let mut buf = [0u8; 10]; + write_varint(&mut buf, value) +} + +fn write_u64_column(header: &mut Vec, data: &mut Vec, value: u64) { + let mut buf = [0u8; 10]; + let buf_len = write_varint(&mut buf, 6 as u64); + header.extend_from_slice(&buf[0..buf_len]); + data.extend_from_slice(&U64::new(value).to_bytes()); +} + +fn write_blob_column(header: &mut Vec, data: &mut Vec, value: &[u8]) { + let mut buf = [0u8; 10]; + let buf_len = write_varint(&mut buf, (value.len() * 2 + 12) as u64); + header.extend_from_slice(&buf[0..buf_len]); + data.extend_from_slice(value); +} + +fn create_simple_record(value: u64, payload: &[u8]) -> Vec { + let mut header = Vec::new(); + let mut data = Vec::new(); + write_u64_column(&mut header, &mut data, value); + write_blob_column(&mut header, &mut data, payload); + let header_len = header.len() + 1; + assert!(header_len <= 127); + let mut buf = [0u8; 10]; + let buf_len = write_varint(&mut buf, header_len as u64); + let mut result = buf[0..buf_len].to_vec(); + result.extend_from_slice(&header); + result.extend_from_slice(&data); + result +} + +struct BTreeGenerator<'a> { + rng: &'a mut ChaCha8Rng, + max_interior_keys: usize, + max_leaf_keys: usize, + max_payload_size: usize, +} + +impl<'a> BTreeGenerator<'a> { + pub fn create_page( + &self, + page: &BTreePageData, + page_numbers: &HashMap<*const BTreePageData, u32>, + ) -> Vec { + match page { + BTreePageData::Table(page) => self.create_btree_page(page, page_numbers), + BTreePageData::Overflow(page) => self.create_overflow_page(page, page_numbers), + } + } + pub fn create_overflow_page( + &self, + page: &BTreeOverflowPageData, + page_numbers: &HashMap<*const BTreePageData, u32>, + ) -> Vec { + let mut data = [255u8; 4096]; + let first_4bytes = if let Some(next) = &page.next { + *page_numbers.get(&Rc::as_ptr(&next)).unwrap() + } else { + 0 + }; + data[0..4].copy_from_slice(&U32::new(first_4bytes).to_bytes()); + data[4..4 + page.payload.len()].copy_from_slice(&page.payload); + data.to_vec() + } + pub fn create_btree_page( + &self, + page: &BTreeTablePageData, + page_numbers: &HashMap<*const BTreePageData, u32>, + ) -> Vec { + let mut data = [255u8; 4096]; + + data[0] = match page.page_type { + BTreePageType::Interior => 0x05, + BTreePageType::Leaf => 0x0d, + }; + data[1..3].copy_from_slice( + &U16::new(page.free_blocks.get(0).map(|x| x.offset).unwrap_or(0)).to_bytes(), + ); + data[3..5].copy_from_slice(&U16::new(page.cells.len() as u16).to_bytes()); + data[5..7].copy_from_slice(&U16::new(page.cell_content_area).to_bytes()); + data[7] = page.fragmented_free_bytes; + let mut offset = 8; + if page.page_type == BTreePageType::Interior { + let cell_right_pointer = page.cell_right_pointer.as_ref().unwrap(); + let cell_right_pointer = Rc::as_ptr(&cell_right_pointer); + let cell_right_pointer = page_numbers.get(&cell_right_pointer).unwrap(); + data[8..12].copy_from_slice(&U32::new(*cell_right_pointer).to_bytes()); + offset = 12; + } + + for (i, (pointer, _)) in page.cells.iter().enumerate() { + data[offset + 2 * i..offset + 2 * (i + 1)] + .copy_from_slice(&U16::new(*pointer).to_bytes()); + } + + for i in 0..page.free_blocks.len() { + let offset = page.free_blocks[i].offset as usize; + data[offset + 0..offset + 2].copy_from_slice( + &U16::new(page.free_blocks.get(i + 1).map(|x| x.offset).unwrap_or(0)).to_bytes(), + ); + data[offset + 2..offset + 4] + .copy_from_slice(&U16::new(page.free_blocks[i].size).to_bytes()); + } + + for (pointer, cell) in page.cells.iter() { + let mut p = *pointer as usize; + match cell { + BTreeCell::Interior(cell) => { + let left_child_pointer = Rc::as_ptr(&cell.left_child_pointer); + let left_child_pointer = page_numbers.get(&left_child_pointer).unwrap(); + data[p..p + 4].copy_from_slice(&U32::new(*left_child_pointer).to_bytes()); + p += 4; + _ = write_varint(&mut data[p..], cell.rowid); + } + BTreeCell::Leaf(cell) => { + p += write_varint(&mut data[p..], cell.size as u64); + p += write_varint(&mut data[p..], cell.rowid as u64); + data[p..p + cell.on_page_data.len()].copy_from_slice(&cell.on_page_data); + p += cell.on_page_data.len(); + if let Some(overflow_page) = &cell.overflow_page { + let overflow_page = Rc::as_ptr(&overflow_page); + let overflow_page = page_numbers.get(&overflow_page).unwrap(); + data[p..p + 4].copy_from_slice(&U32::new(*overflow_page).to_bytes()); + } + } + } + } + + data.into() + } + + fn generate_btree(&mut self, depth: usize, mut l: u64, r: u64) -> Rc { + let mut cells = vec![]; + let cells_max_limit = if depth == 0 { + self.max_leaf_keys + } else { + self.max_interior_keys + }; + let cells_limit = self.rng.next_u32() as usize % cells_max_limit + 1; + + let mut rowids = HashSet::new(); + for _ in 0..cells_limit { + let rowid = l + self.rng.next_u64() % (r - l + 1); + if rowids.contains(&rowid) { + continue; + } + rowids.insert(rowid); + } + + let mut rowids = rowids.into_iter().collect::>(); + rowids.sort(); + + let header_offset = if depth == 0 { 8 } else { 12 }; + let mut it = 0; + let mut cells_size = header_offset; + while cells.len() < cells_limit && it < rowids.len() { + let rowid = rowids[it]; + it += 1; + + let cell = if depth == 0 { + let length = self.rng.next_u32() as usize % self.max_payload_size; + let record = create_simple_record(rowid, &vec![1u8; length]); + BTreeCell::Leaf(BTreeLeafCell { + size: record.len(), + rowid, + on_page_data: record, + overflow_page: None, + }) + } else { + BTreeCell::Interior(BTreeInteriorCell { + left_child_pointer: self.generate_btree(depth - 1, l, rowid), + rowid, + }) + }; + if cells_size + 2 + cell.size() > 4096 { + break; + } + cells_size += 2 + cell.size(); + cells.push((rowid, cell)); + if depth > 0 { + l = rowid + 1; + } + } + + cells.shuffle(&mut self.rng); + + let mut cells_with_offset = Vec::new(); + let mut fragmentation_budget = 4096 - cells_size; + let mut pointer_offset = header_offset; + let mut content_offset = 4096; + let mut fragmented_free_bytes = 0; + let mut free_blocks = vec![]; + + for (rowid, cell) in cells { + let mut fragmentation = ((self.rng.next_u32() % 4) as u16).min(fragmentation_budget); + if fragmented_free_bytes + fragmentation > 60 { + fragmentation = 0; + } + let mut free_block_size = 0; + if fragmentation == 0 && fragmentation_budget >= 4 { + free_block_size = 4 + self.rng.next_u32() as u16 % (fragmentation_budget - 3); + } + + let cell_size = cell.size() + fragmentation.max(free_block_size); + assert!(pointer_offset + 2 + cell_size <= content_offset); + + pointer_offset += 2; + content_offset -= cell_size; + fragmented_free_bytes += fragmentation; + fragmentation_budget -= fragmentation.max(free_block_size); + if free_block_size > 0 { + free_blocks.push(BTreeFreeBlock { + offset: content_offset + cell.size(), + size: free_block_size, + }); + } + cells_with_offset.push((rowid, content_offset, cell)); + } + + cells_with_offset.sort_by_key(|(rowid, ..)| *rowid); + let cells = cells_with_offset + .into_iter() + .map(|(_, offset, cell)| (offset, cell)) + .collect::>(); + + free_blocks.sort_by_key(|x| x.offset); + + if depth == 0 { + Rc::new(BTreePageData::Table(BTreeTablePageData { + page_type: BTreePageType::Leaf, + cell_content_area: content_offset, + cell_right_pointer: None, + fragmented_free_bytes: fragmented_free_bytes as u8, + cells, + free_blocks, + })) + } else { + Rc::new(BTreePageData::Table(BTreeTablePageData { + page_type: BTreePageType::Interior, + cell_content_area: content_offset, + cell_right_pointer: if l <= r { + Some(self.generate_btree(depth - 1, l, r)) + } else { + None + }, + fragmented_free_bytes: fragmented_free_bytes as u8, + cells, + free_blocks, + })) + } + } + + fn write_btree(&mut self, path: &Path, root: &Rc, start_page: u32) { + let mut pages = Vec::new(); + list_pages(&root, &mut pages); + pages[1..].shuffle(&mut self.rng); + let mut page_numbers = HashMap::new(); + for (page, page_no) in pages.iter().zip(start_page..) { + page_numbers.insert(Rc::as_ptr(&page), page_no); + } + + let file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(path) + .unwrap(); + + assert_eq!(file.metadata().unwrap().len(), 4096 * 2); + for (i, page) in pages.iter().enumerate() { + let page = self.create_page(page, &page_numbers); + file.write_at(&page, 4096 * (i + 1) as u64).unwrap(); + } + let size = 1 + pages.len(); + let size_bytes = U32::new(size as u32).to_bytes(); + file.write_at(&size_bytes, 28).unwrap(); + } +} + +#[test] +fn test_btree() { + let _ = env_logger::try_init(); + let mut rng = ChaCha8Rng::seed_from_u64(0); + for depth in 0..4 { + for attempt in 0..16 { + let db = TempDatabase::new_with_rusqlite( + "create table test (k INTEGER PRIMARY KEY, b BLOB);", + false, + ); + log::info!( + "depth: {}, attempt: {}, path: {:?}", + depth, + attempt, + db.path + ); + + let mut generator = BTreeGenerator { + rng: &mut rng, + max_interior_keys: 3, + max_leaf_keys: 4096, + max_payload_size: 100, + }; + let root = generator.generate_btree(depth, 0, i64::MAX as u64); + generator.write_btree(&db.path, &root, 2); + + for _ in 0..16 { + let mut l = rng.next_u64() % (i64::MAX as u64); + let mut r = rng.next_u64() % (i64::MAX as u64); + if l > r { + (l, r) = (r, l); + } + + let query = format!("SELECT SUM(LENGTH(b)) FROM test WHERE k >= {l} AND k <= {r}"); + let sqlite_sum = { + let conn = rusqlite::Connection::open(&db.path).unwrap(); + sqlite_exec_rows(&conn, &query) + }; + let limbo_sum = { + let conn = db.connect_limbo(); + limbo_exec_rows(&db, &conn, &query) + }; + assert_eq!( + limbo_sum, sqlite_sum, + "query={}, limbo={:?}, sqlite={:?}", + query, limbo_sum, sqlite_sum + ); + } + } + } +} diff --git a/tests/integration/query_processing/test_multi_thread.rs b/tests/integration/query_processing/test_multi_thread.rs new file mode 100644 index 000000000..1d96e0422 --- /dev/null +++ b/tests/integration/query_processing/test_multi_thread.rs @@ -0,0 +1,27 @@ +use crate::common::TempDatabase; + +#[test] +fn test_schema_change() { + let tmp_db = TempDatabase::new_empty(false); + let conn1 = tmp_db.connect_limbo(); + conn1.execute("CREATE TABLE t(x, y, z)").unwrap(); + conn1 + .execute("INSERT INTO t VALUES (1, 2, 3), (10, 20, 30)") + .unwrap(); + let conn2 = tmp_db.connect_limbo(); + let mut stmt = conn2.prepare("SELECT x, z FROM t").unwrap(); + conn1.execute("ALTER TABLE t DROP COLUMN x").unwrap(); + let row = loop { + match stmt.step() { + Ok(turso_core::StepResult::Row) => { + let row = stmt.row().unwrap(); + break row; + } + Ok(turso_core::StepResult::IO) => { + stmt.run_once().unwrap(); + } + _ => panic!("unexpected step result"), + } + }; + println!("{:?} {:?}", row.get_value(0), row.get_value(1)); +} From aceaf182b12210764289749159ab470030b2351f Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 12 Jul 2025 21:12:27 +0400 Subject: [PATCH 06/12] remove comment --- core/vdbe/execute.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index a04c489be..b0e650a6d 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -2468,7 +2468,6 @@ pub fn op_seek_internal( Insn::SeekLT { .. } => SeekOp::LT, _ => unreachable!("unexpected Insn {:?}", insn), }; - // todo (sivukhin): fix index too if *is_index { let seek_result = { let mut cursor = state.get_cursor(*cursor_id); From c4841e18f3c51e09dafa0654b74d0d8954f5dca3 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 12 Jul 2025 21:19:33 +0400 Subject: [PATCH 07/12] fix clippy --- tests/integration/query_processing/test_btree.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/query_processing/test_btree.rs b/tests/integration/query_processing/test_btree.rs index 22881d724..0815e3e36 100644 --- a/tests/integration/query_processing/test_btree.rs +++ b/tests/integration/query_processing/test_btree.rs @@ -76,6 +76,7 @@ pub struct BTreeTablePageData { #[derive(Debug)] pub enum BTreePageData { Table(BTreeTablePageData), + #[allow(dead_code)] Overflow(BTreeOverflowPageData), } From 9a347d8852134cf6a6f082ddeaf76bcc58c8ca8e Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 12 Jul 2025 21:34:18 +0400 Subject: [PATCH 08/12] add simple tcl test --- testing/select.test | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/testing/select.test b/testing/select.test index 6471254e3..5cfe2c15d 100755 --- a/testing/select.test +++ b/testing/select.test @@ -572,3 +572,48 @@ if {[info exists ::env(SQLITE_EXEC)] && ($::env(SQLITE_EXEC) eq "scripts/limbo-s select * from t INTERSECT select * from u EXCEPT select * from v; } {} } + +do_execsql_test_on_specific_db {:memory:} select-no-match-in-leaf-page { + CREATE TABLE t(a INTEGER PRIMARY KEY, b); + insert into t values (1, randomblob(1024)); + insert into t values (2, randomblob(1024)); + insert into t values (3, randomblob(1024)); + insert into t values (4, randomblob(1024)); + insert into t values (5, randomblob(1024)); + insert into t values (6, randomblob(1024)); + insert into t values (7, randomblob(1024)); + insert into t values (8, randomblob(1024)); + insert into t values (9, randomblob(1024)); + insert into t values (10, randomblob(1024)); + insert into t values (11, randomblob(1024)); + insert into t values (12, randomblob(1024)); + insert into t values (13, randomblob(1024)); + insert into t values (14, randomblob(1024)); + insert into t values (15, randomblob(1024)); + insert into t values (16, randomblob(1024)); + delete from t where a in (3, 6, 9, 12); + select count(*) from t where a >= 2; + select count(*) from t where a >= 3; + select count(*) from t where a >= 4; + select count(*) from t where a > 1; + select count(*) from t where a > 2; + select count(*) from t where a > 3; + select count(*) from t where a <= 3 ORDER BY a DESC; + select count(*) from t where a <= 4 ORDER BY a DESC; + select count(*) from t where a <= 5 ORDER BY a DESC; + select count(*) from t where a < 2 ORDER BY a DESC; + select count(*) from t where a < 3 ORDER BY a DESC; + select count(*) from t where a < 4 ORDER BY a DESC; +} {11 +10 +10 +11 +10 +10 +2 +3 +4 +1 +2 +2} + From 47ab260f6c0f1886025edbd0b5e81c7e5f28d9d4 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 12 Jul 2025 22:20:22 +0400 Subject: [PATCH 09/12] use PlatformIO in the fuzz test code --- .../query_processing/test_btree.rs | 53 ++++++++++++------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/tests/integration/query_processing/test_btree.rs b/tests/integration/query_processing/test_btree.rs index 0815e3e36..b2b586748 100644 --- a/tests/integration/query_processing/test_btree.rs +++ b/tests/integration/query_processing/test_btree.rs @@ -1,13 +1,17 @@ use std::{ + cell::RefCell, collections::{HashMap, HashSet}, - i64, - os::unix::fs::FileExt, path::Path, + pin::Pin, rc::Rc, + sync::Arc, }; use rand::{seq::SliceRandom, RngCore, SeedableRng}; use rand_chacha::ChaCha8Rng; +use turso_core::{ + Buffer, Completion, CompletionType, File, OpenFlags, PlatformIO, WriteCompletion, IO, +}; use zerocopy::big_endian::{U16, U32, U64}; use crate::common::{limbo_exec_rows, sqlite_exec_rows, TempDatabase}; @@ -91,7 +95,7 @@ pub fn list_pages(root: &Rc, pages: &mut Vec>) let Some(overflow_page) = &cell.overflow_page else { continue; }; - list_pages(&overflow_page, pages); + list_pages(overflow_page, pages); } } } @@ -153,7 +157,7 @@ pub fn length_varint(value: u64) -> usize { fn write_u64_column(header: &mut Vec, data: &mut Vec, value: u64) { let mut buf = [0u8; 10]; - let buf_len = write_varint(&mut buf, 6 as u64); + let buf_len = write_varint(&mut buf, 6u64); header.extend_from_slice(&buf[0..buf_len]); data.extend_from_slice(&U64::new(value).to_bytes()); } @@ -187,7 +191,7 @@ struct BTreeGenerator<'a> { max_payload_size: usize, } -impl<'a> BTreeGenerator<'a> { +impl BTreeGenerator<'_> { pub fn create_page( &self, page: &BTreePageData, @@ -205,7 +209,7 @@ impl<'a> BTreeGenerator<'a> { ) -> Vec { let mut data = [255u8; 4096]; let first_4bytes = if let Some(next) = &page.next { - *page_numbers.get(&Rc::as_ptr(&next)).unwrap() + *page_numbers.get(&Rc::as_ptr(next)).unwrap() } else { 0 }; @@ -233,7 +237,7 @@ impl<'a> BTreeGenerator<'a> { let mut offset = 8; if page.page_type == BTreePageType::Interior { let cell_right_pointer = page.cell_right_pointer.as_ref().unwrap(); - let cell_right_pointer = Rc::as_ptr(&cell_right_pointer); + let cell_right_pointer = Rc::as_ptr(cell_right_pointer); let cell_right_pointer = page_numbers.get(&cell_right_pointer).unwrap(); data[8..12].copy_from_slice(&U32::new(*cell_right_pointer).to_bytes()); offset = 12; @@ -246,7 +250,7 @@ impl<'a> BTreeGenerator<'a> { for i in 0..page.free_blocks.len() { let offset = page.free_blocks[i].offset as usize; - data[offset + 0..offset + 2].copy_from_slice( + data[offset..offset + 2].copy_from_slice( &U16::new(page.free_blocks.get(i + 1).map(|x| x.offset).unwrap_or(0)).to_bytes(), ); data[offset + 2..offset + 4] @@ -265,11 +269,11 @@ impl<'a> BTreeGenerator<'a> { } BTreeCell::Leaf(cell) => { p += write_varint(&mut data[p..], cell.size as u64); - p += write_varint(&mut data[p..], cell.rowid as u64); + p += write_varint(&mut data[p..], cell.rowid); data[p..p + cell.on_page_data.len()].copy_from_slice(&cell.on_page_data); p += cell.on_page_data.len(); if let Some(overflow_page) = &cell.overflow_page { - let overflow_page = Rc::as_ptr(&overflow_page); + let overflow_page = Rc::as_ptr(overflow_page); let overflow_page = page_numbers.get(&overflow_page).unwrap(); data[p..p + 4].copy_from_slice(&U32::new(*overflow_page).to_bytes()); } @@ -403,27 +407,38 @@ impl<'a> BTreeGenerator<'a> { fn write_btree(&mut self, path: &Path, root: &Rc, start_page: u32) { let mut pages = Vec::new(); - list_pages(&root, &mut pages); + list_pages(root, &mut pages); pages[1..].shuffle(&mut self.rng); let mut page_numbers = HashMap::new(); for (page, page_no) in pages.iter().zip(start_page..) { - page_numbers.insert(Rc::as_ptr(&page), page_no); + page_numbers.insert(Rc::as_ptr(page), page_no); } - let file = std::fs::OpenOptions::new() - .read(true) - .write(true) - .open(path) + let io = PlatformIO::new().unwrap(); + let file = io + .open_file(path.to_str().unwrap(), OpenFlags::None, true) .unwrap(); - assert_eq!(file.metadata().unwrap().len(), 4096 * 2); + assert_eq!(file.size().unwrap(), 4096 * 2); for (i, page) in pages.iter().enumerate() { let page = self.create_page(page, &page_numbers); - file.write_at(&page, 4096 * (i + 1) as u64).unwrap(); + write_at(&io, file.clone(), 4096 * (i + 1), &page); } let size = 1 + pages.len(); let size_bytes = U32::new(size as u32).to_bytes(); - file.write_at(&size_bytes, 28).unwrap(); + write_at(&io, file, 28, &size_bytes); + } +} + +fn write_at(io: &impl IO, file: Arc, offset: usize, data: &[u8]) { + let completion = Completion::new(CompletionType::Write(WriteCompletion::new(Box::new( + |_| {}, + )))); + let drop_fn = Rc::new(move |_| {}); + let buffer = Arc::new(RefCell::new(Buffer::new(Pin::new(data.to_vec()), drop_fn))); + let result = file.pwrite(offset, buffer, completion).unwrap(); + while !result.is_completed() { + io.run_once().unwrap(); } } From 5bd3287826341725740ca0de3a30afc6832b1bd6 Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 12 Jul 2025 22:53:57 +0400 Subject: [PATCH 10/12] add comments --- core/storage/btree.rs | 6 +----- core/types.rs | 10 ++++++++++ core/vdbe/execute.rs | 21 +++++++++++++++++++++ 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 838d008a9..09abcc2dd 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1753,15 +1753,11 @@ impl BTreeCursor { self.stack.set_cell_index(nearest_matching_cell as i32); return Ok(CursorResult::Ok(SeekResult::Found)); } else { - let eq_only = match &seek_op { - SeekOp::GE { eq_only } | SeekOp::LE { eq_only } => *eq_only, - SeekOp::LT | SeekOp::GT => false, - }; // if !eq_only - matching entry can exist in neighbour leaf page // this can happen if key in the interiour page was deleted - but divider kept untouched // in such case BTree can navigate to the leaf which no longer has matching key for seek_op // in this case, caller must advance cursor if necessary - return Ok(CursorResult::Ok(if eq_only { + return Ok(CursorResult::Ok(if seek_op.eq_only() { SeekResult::NotFound } else { let contents = page.get().contents.as_ref().unwrap(); diff --git a/core/types.rs b/core/types.rs index b018852ff..cdfa8eec6 100644 --- a/core/types.rs +++ b/core/types.rs @@ -2293,8 +2293,18 @@ pub enum CursorResult { #[derive(Debug)] pub enum SeekResult { + /// Record matching the [SeekOp] found in the B-tree and cursor was positioned to point onto that record Found, + /// Record matching the [SeekOp] doesn't exists in the B-tree NotFound, + /// This result can happen only if eq_only for [SeekOp] is false + /// In this case Seek can position cursor to the leaf page boundaries (before the start, after the end) + /// (e.g. if leaf page holds rows with keys from range [1..10], key 10 is absent and [SeekOp] is >= 10) + /// + /// turso-db has this extra [SeekResult] in order to make [BTreeCursor::seek] method to position cursor at + /// the leaf of potential insertion, but also communicate to caller the fact that current cursor position + /// doesn't hold a matching entry + /// (necessary for Seek{XX} VM op-codes, so these op-codes will try to advance cursor in order to move it to matching entry) TryAdvance, } diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index b0e650a6d..f20a9af27 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -2393,9 +2393,14 @@ pub fn op_deferred_seek( } pub enum OpSeekState { + /// Initial state Start, + /// Position cursor with seek operation with (rowid, op) search parameters Seek { rowid: i64, op: SeekOp }, + /// Advance cursor (with [BTreeCursor::next]/[BTreeCursor::prev] methods) which was + /// positioned after [OpSeekState::Seek] state if [BTreeCursor::seek] returned [SeekResult::TryAdvance] Advance { op: SeekOp }, + /// Move cursor to the last BTree row if DB knows that comparison result will be fixed (due to type ordering, e.g. NUMBER always <= TEXT) MoveLast, } @@ -2591,6 +2596,22 @@ pub fn op_seek_internal( let found = { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); + // Seek operation has anchor number which equals to the closed boundary of the range + // (e.g. for >= x - anchor is x, for > x - anchor is x + 1) + // + // Before Advance state, cursor was positioned to the leaf page which should hold the anchor. + // Sometimes this leaf page can have no matching rows, and in this case + // we need to move cursor in the direction of Seek to find record which matches the seek filter + // + // Consider following scenario: Seek [> 666] + // interior page dividers: I1: [ .. 667 .. ] + // / \ + // leaf pages: P1[661,665] P2[anything here is GT 666] + // After the initial Seek, cursor will be positioned after the end of leaf page P1 [661, 665] + // because this is potential position for insertion of value 666. + // But as P1 has no row matching Seek criteria - we need to move it to the right + // (and as we at the page boundary, we will move cursor to the next neighbor leaf, which guaranteed to have + // row keys greater than divider, which is greater or equal than anchor) match op { SeekOp::GT | SeekOp::GE { eq_only: false } => return_if_io!(cursor.next()), SeekOp::LT | SeekOp::LE { eq_only: false } => return_if_io!(cursor.prev()), From 82773d65630f8f643d594787692a0f88b286cffa Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Sat, 12 Jul 2025 23:01:04 +0400 Subject: [PATCH 11/12] fix clippy --- tests/integration/query_processing/test_btree.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/query_processing/test_btree.rs b/tests/integration/query_processing/test_btree.rs index b2b586748..705557687 100644 --- a/tests/integration/query_processing/test_btree.rs +++ b/tests/integration/query_processing/test_btree.rs @@ -229,7 +229,7 @@ impl BTreeGenerator<'_> { BTreePageType::Leaf => 0x0d, }; data[1..3].copy_from_slice( - &U16::new(page.free_blocks.get(0).map(|x| x.offset).unwrap_or(0)).to_bytes(), + &U16::new(page.free_blocks.first().map(|x| x.offset).unwrap_or(0)).to_bytes(), ); data[3..5].copy_from_slice(&U16::new(page.cells.len() as u16).to_bytes()); data[5..7].copy_from_slice(&U16::new(page.cell_content_area).to_bytes()); @@ -435,6 +435,7 @@ fn write_at(io: &impl IO, file: Arc, offset: usize, data: &[u8]) { |_| {}, )))); let drop_fn = Rc::new(move |_| {}); + #[allow(clippy::arc_with_non_send_sync)] let buffer = Arc::new(RefCell::new(Buffer::new(Pin::new(data.to_vec()), drop_fn))); let result = file.pwrite(offset, buffer, completion).unwrap(); while !result.is_completed() { From 413d93f04152837a9d3b1c348e166032f958171d Mon Sep 17 00:00:00 2001 From: Nikita Sivukhin Date: Mon, 14 Jul 2025 13:05:20 +0400 Subject: [PATCH 12/12] fix after rebase --- core/vdbe/execute.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index f20a9af27..c66a318a8 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -5165,10 +5165,11 @@ pub fn op_new_rowid( let exists = { let mut cursor = state.get_cursor(*cursor); let cursor = cursor.as_btree_mut(); - return_if_io!(cursor.seek( + let seek_result = return_if_io!(cursor.seek( SeekKey::TableRowId(*candidate), SeekOp::GE { eq_only: true } - )) + )); + matches!(seek_result, SeekResult::Found) }; if !exists {