diff --git a/Cargo.lock b/Cargo.lock index d731ec210..dd099b787 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -586,6 +586,7 @@ dependencies = [ "tracing", "tracing-subscriber", "turso_core", + "zerocopy 0.8.26", ] [[package]] @@ -2590,7 +2591,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ - "zerocopy 0.8.24", + "zerocopy 0.8.26", ] [[package]] @@ -2808,7 +2809,7 @@ checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.3", - "zerocopy 0.8.24", + "zerocopy 0.8.26", ] [[package]] @@ -4538,11 +4539,11 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.24" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" +checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" dependencies = [ - "zerocopy-derive 0.8.24", + "zerocopy-derive 0.8.26", ] [[package]] @@ -4558,9 +4559,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.8.24" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" +checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" dependencies = [ "proc-macro2", "quote", diff --git a/core/storage/btree.rs b/core/storage/btree.rs index ef49158ea..09abcc2dd 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -15,7 +15,7 @@ use crate::{ turso_assert, types::{ find_compare, get_tie_breaker_from_seek_op, IndexKeyInfo, IndexKeySortOrder, - ParseRecordState, RecordCompare, RecordCursor, + ParseRecordState, RecordCompare, RecordCursor, SeekResult, }, MvCursor, }; @@ -1285,13 +1285,21 @@ impl BTreeCursor { /// This may be used to seek to a specific record in a point query (e.g. SELECT * FROM table WHERE col = 10) /// or e.g. find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10). /// We don't include the rowid in the comparison and that's why the last value from the record is not included. - fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { + fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { let ret = return_if_io!(match key { SeekKey::TableRowId(rowid) => { self.tablebtree_seek(rowid, op) } SeekKey::IndexKey(index_key) => { - self.indexbtree_seek(index_key, op) + match self.indexbtree_seek(index_key, op) { + Ok(CursorResult::Ok(found)) => Ok(CursorResult::Ok(if found { + SeekResult::Found + } else { + SeekResult::NotFound + })), + Ok(CursorResult::IO) => Ok(CursorResult::IO), + Err(err) => Err(err), + } } }); self.valid_state = CursorValidState::Valid; @@ -1677,7 +1685,7 @@ impl BTreeCursor { /// Specialized version of do_seek() for table btrees that uses binary search instead /// of iterating cells in order. #[instrument(skip_all, level = Level::INFO)] - fn tablebtree_seek(&mut self, rowid: i64, seek_op: SeekOp) -> Result> { + fn tablebtree_seek(&mut self, rowid: i64, seek_op: SeekOp) -> Result> { turso_assert!( self.mv_cursor.is_none(), "attempting to seek with MV cursor" @@ -1704,7 +1712,7 @@ impl BTreeCursor { let cell_count = contents.cell_count(); if cell_count == 0 { self.stack.set_cell_index(0); - return Ok(CursorResult::Ok(false)); + return Ok(CursorResult::Ok(SeekResult::NotFound)); } let min_cell_idx = Cell::new(0); let max_cell_idx = Cell::new(cell_count as isize - 1); @@ -1743,9 +1751,28 @@ impl BTreeCursor { if min > max { if let Some(nearest_matching_cell) = nearest_matching_cell.get() { self.stack.set_cell_index(nearest_matching_cell as i32); - return Ok(CursorResult::Ok(true)); + return Ok(CursorResult::Ok(SeekResult::Found)); } else { - return Ok(CursorResult::Ok(false)); + // if !eq_only - matching entry can exist in neighbour leaf page + // this can happen if key in the interiour page was deleted - but divider kept untouched + // in such case BTree can navigate to the leaf which no longer has matching key for seek_op + // in this case, caller must advance cursor if necessary + return Ok(CursorResult::Ok(if seek_op.eq_only() { + SeekResult::NotFound + } else { + let contents = page.get().contents.as_ref().unwrap(); + turso_assert!( + contents.is_leaf(), + "tablebtree_seek() called on non-leaf page" + ); + let cell_count = contents.cell_count(); + // set cursor to the position where which would hold the op-boundary if it were present + self.stack.set_cell_index(match &seek_op { + SeekOp::GT | SeekOp::GE { .. } => cell_count as i32, + SeekOp::LT | SeekOp::LE { .. } => 0, + }); + SeekResult::TryAdvance + })); }; } @@ -1766,7 +1793,7 @@ impl BTreeCursor { // rowids are unique, so we can return the rowid immediately if found && seek_op.eq_only() { self.stack.set_cell_index(cur_cell_idx as i32); - return Ok(CursorResult::Ok(true)); + return Ok(CursorResult::Ok(SeekResult::Found)); } if found { @@ -4134,7 +4161,7 @@ impl BTreeCursor { } #[instrument(skip(self), level = Level::INFO)] - pub fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { + pub fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result> { assert!(self.mv_cursor.is_none()); // Empty trace to capture the span information tracing::trace!(""); @@ -4142,13 +4169,14 @@ impl BTreeCursor { // because it might have been set to false by an unmatched left-join row during the previous iteration // on the outer loop. self.set_null_flag(false); - let cursor_has_record = return_if_io!(self.do_seek(key, op)); + let seek_result = return_if_io!(self.do_seek(key, op)); self.invalidate_record(); // Reset seek state self.seek_state = CursorSeekState::Start; self.valid_state = CursorValidState::Valid; - self.has_record.replace(cursor_has_record); - Ok(CursorResult::Ok(cursor_has_record)) + self.has_record + .replace(matches!(seek_result, SeekResult::Found)); + Ok(CursorResult::Ok(seek_result)) } /// Return a reference to the record the cursor is currently pointing to. @@ -4649,8 +4677,9 @@ impl BTreeCursor { Value::Integer(i) => i, _ => unreachable!("btree tables are indexed by integers!"), }; - let has_record = + let seek_result = return_if_io!(self.seek(SeekKey::TableRowId(*int_key), SeekOp::GE { eq_only: true })); + let has_record = matches!(seek_result, SeekResult::Found); self.has_record.set(has_record); self.invalidate_record(); Ok(CursorResult::Ok(has_record)) @@ -6892,7 +6921,7 @@ mod tests { assert!( matches!( cursor.seek(seek_key, SeekOp::GE { eq_only: true }).unwrap(), - CursorResult::Ok(true) + CursorResult::Ok(SeekResult::Found) ), "key {key} is not found" ); @@ -7135,7 +7164,10 @@ mod tests { pager.deref(), ) .unwrap(); - assert!(exists, "key {key:?} is not found"); + assert!( + matches!(exists, SeekResult::Found), + "key {key:?} is not found" + ); } // Check that key count is right cursor.move_to_root().unwrap(); @@ -7342,7 +7374,6 @@ mod tests { } #[test] - #[ignore] pub fn test_clear_overflow_pages() -> Result<()> { let pager = setup_test_env(5); let num_columns = 5; @@ -8257,13 +8288,13 @@ mod tests { let mut cursor = BTreeCursor::new_table(None, pager.clone(), root_page, num_columns); let seek_key = SeekKey::TableRowId(i); - let found = run_until_done( + let seek_result = run_until_done( || cursor.seek(seek_key.clone(), SeekOp::GE { eq_only: true }), pager.deref(), ) .unwrap(); - if found { + if matches!(seek_result, SeekResult::Found) { run_until_done(|| cursor.delete(), pager.deref()).unwrap(); } } diff --git a/core/types.rs b/core/types.rs index df477cfdd..cdfa8eec6 100644 --- a/core/types.rs +++ b/core/types.rs @@ -2291,6 +2291,23 @@ pub enum CursorResult { IO, } +#[derive(Debug)] +pub enum SeekResult { + /// Record matching the [SeekOp] found in the B-tree and cursor was positioned to point onto that record + Found, + /// Record matching the [SeekOp] doesn't exists in the B-tree + NotFound, + /// This result can happen only if eq_only for [SeekOp] is false + /// In this case Seek can position cursor to the leaf page boundaries (before the start, after the end) + /// (e.g. if leaf page holds rows with keys from range [1..10], key 10 is absent and [SeekOp] is >= 10) + /// + /// turso-db has this extra [SeekResult] in order to make [BTreeCursor::seek] method to position cursor at + /// the leaf of potential insertion, but also communicate to caller the fact that current cursor position + /// doesn't hold a matching entry + /// (necessary for Seek{XX} VM op-codes, so these op-codes will try to advance cursor in order to move it to matching entry) + TryAdvance, +} + #[derive(Clone, Copy, PartialEq, Eq, Debug)] /// The match condition of a table/index seek. pub enum SeekOp { diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index ddd15bd55..c66a318a8 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -10,7 +10,7 @@ use crate::storage::wal::DummyWAL; use crate::storage::{self, header_accessor}; use crate::translate::collate::CollationSeq; use crate::types::{ - compare_immutable, compare_records_generic, ImmutableRecord, Text, TextSubtype, + compare_immutable, compare_records_generic, ImmutableRecord, SeekResult, Text, TextSubtype, }; use crate::util::normalize_ident; use crate::vdbe::registers_to_ref_values; @@ -2357,10 +2357,10 @@ pub fn op_seek_rowid( match rowid { Some(rowid) => { - let found = return_if_io!( + let seek_result = return_if_io!( cursor.seek(SeekKey::TableRowId(rowid), SeekOp::GE { eq_only: true }) ); - if !found { + if !matches!(seek_result, SeekResult::Found) { target_pc.as_offset_int() } else { state.pc + 1 @@ -2392,12 +2392,38 @@ pub fn op_deferred_seek( Ok(InsnFunctionStepResult::Step) } +pub enum OpSeekState { + /// Initial state + Start, + /// Position cursor with seek operation with (rowid, op) search parameters + Seek { rowid: i64, op: SeekOp }, + /// Advance cursor (with [BTreeCursor::next]/[BTreeCursor::prev] methods) which was + /// positioned after [OpSeekState::Seek] state if [BTreeCursor::seek] returned [SeekResult::TryAdvance] + Advance { op: SeekOp }, + /// Move cursor to the last BTree row if DB knows that comparison result will be fixed (due to type ordering, e.g. NUMBER always <= TEXT) + MoveLast, +} + pub fn op_seek( program: &Program, state: &mut ProgramState, insn: &Insn, pager: &Rc, mv_store: Option<&Rc>, +) -> Result { + let result = op_seek_internal(program, state, insn, pager, mv_store); + if let Err(_) | Ok(InsnFunctionStepResult::Step) = &result { + state.op_seek_state = OpSeekState::Start; + } + result +} + +pub fn op_seek_internal( + program: &Program, + state: &mut ProgramState, + insn: &Insn, + pager: &Rc, + mv_store: Option<&Rc>, ) -> Result { let (Insn::SeekGE { cursor_id, @@ -2447,120 +2473,169 @@ pub fn op_seek( Insn::SeekLT { .. } => SeekOp::LT, _ => unreachable!("unexpected Insn {:?}", insn), }; - let op_name = match op { - SeekOp::GE { .. } => "SeekGE", - SeekOp::GT => "SeekGT", - SeekOp::LE { .. } => "SeekLE", - SeekOp::LT => "SeekLT", - }; if *is_index { - let found = { + let seek_result = { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); let record_from_regs = make_record(&state.registers, start_reg, num_regs); return_if_io!(cursor.seek(SeekKey::IndexKey(&record_from_regs), op)) }; - if !found { + if !matches!(seek_result, SeekResult::Found) { state.pc = target_pc.as_offset_int(); } else { state.pc += 1; } - } else { - let pc = { - let original_value = state.registers[*start_reg].get_owned_value().clone(); - let mut temp_value = original_value.clone(); - - let conversion_successful = if matches!(temp_value, Value::Text(_)) { - let mut temp_reg = Register::Value(temp_value); - let converted = apply_numeric_affinity(&mut temp_reg, false); - temp_value = temp_reg.get_owned_value().clone(); - converted - } else { - true // Non-text values don't need conversion - }; - - let int_key = extract_int_value(&temp_value); - let lost_precision = !conversion_successful || !matches!(temp_value, Value::Integer(_)); - let actual_op = if lost_precision { - match &temp_value { - Value::Float(f) => { - let int_key_as_float = int_key as f64; - let c = if int_key_as_float > *f { - 1 - } else if int_key_as_float < *f { - -1 - } else { - 0 - }; - - match c.cmp(&0) { - std::cmp::Ordering::Less => match op { - SeekOp::LT => SeekOp::LE { eq_only: false }, // (x < 5.1) -> (x <= 5) - SeekOp::GE { .. } => SeekOp::GT, // (x >= 5.1) -> (x > 5) - other => other, - }, - std::cmp::Ordering::Greater => match op { - SeekOp::GT => SeekOp::GE { eq_only: false }, // (x > 4.9) -> (x >= 5) - SeekOp::LE { .. } => SeekOp::LT, // (x <= 4.9) -> (x < 5) - other => other, - }, - std::cmp::Ordering::Equal => op, - } - } - Value::Text(_) | Value::Blob(_) => { - match op { - SeekOp::GT | SeekOp::GE { .. } => { - // No integers are > or >= non-numeric text, jump to target (empty result) - state.pc = target_pc.as_offset_int(); - return Ok(InsnFunctionStepResult::Step); - } - SeekOp::LT | SeekOp::LE { .. } => { - // All integers are < or <= non-numeric text - // Move to last position and then use the normal seek logic - { - let mut cursor = state.get_cursor(*cursor_id); - let cursor = cursor.as_btree_mut(); - return_if_io!(cursor.last()); - } - state.pc += 1; - return Ok(InsnFunctionStepResult::Step); - } - } - } - _ => op, - } - } else { - op - }; - - let rowid = if matches!(original_value, Value::Null) { - match actual_op { - SeekOp::GE { .. } | SeekOp::GT => { - state.pc = target_pc.as_offset_int(); - return Ok(InsnFunctionStepResult::Step); - } - SeekOp::LE { .. } | SeekOp::LT => { - // No integers are < NULL, so jump to target - state.pc = target_pc.as_offset_int(); - return Ok(InsnFunctionStepResult::Step); - } - } - } else { - int_key - }; - let mut cursor = state.get_cursor(*cursor_id); - let cursor = cursor.as_btree_mut(); - let found = return_if_io!(cursor.seek(SeekKey::TableRowId(rowid), actual_op)); - - if !found { - target_pc.as_offset_int() - } else { - state.pc + 1 - } - }; - state.pc = pc; + return Ok(InsnFunctionStepResult::Step); + } + loop { + match &state.op_seek_state { + OpSeekState::Start => { + let original_value = state.registers[*start_reg].get_owned_value().clone(); + let mut temp_value = original_value.clone(); + + let conversion_successful = if matches!(temp_value, Value::Text(_)) { + let mut temp_reg = Register::Value(temp_value); + let converted = apply_numeric_affinity(&mut temp_reg, false); + temp_value = temp_reg.get_owned_value().clone(); + converted + } else { + true // Non-text values don't need conversion + }; + let int_key = extract_int_value(&temp_value); + let lost_precision = + !conversion_successful || !matches!(temp_value, Value::Integer(_)); + let actual_op = if lost_precision { + match &temp_value { + Value::Float(f) => { + let int_key_as_float = int_key as f64; + let c = if int_key_as_float > *f { + 1 + } else if int_key_as_float < *f { + -1 + } else { + 0 + }; + + match c.cmp(&0) { + std::cmp::Ordering::Less => match op { + SeekOp::LT => SeekOp::LE { eq_only: false }, // (x < 5.1) -> (x <= 5) + SeekOp::GE { .. } => SeekOp::GT, // (x >= 5.1) -> (x > 5) + other => other, + }, + std::cmp::Ordering::Greater => match op { + SeekOp::GT => SeekOp::GE { eq_only: false }, // (x > 4.9) -> (x >= 5) + SeekOp::LE { .. } => SeekOp::LT, // (x <= 4.9) -> (x < 5) + other => other, + }, + std::cmp::Ordering::Equal => op, + } + } + Value::Text(_) | Value::Blob(_) => { + match op { + SeekOp::GT | SeekOp::GE { .. } => { + // No integers are > or >= non-numeric text, jump to target (empty result) + state.pc = target_pc.as_offset_int(); + return Ok(InsnFunctionStepResult::Step); + } + SeekOp::LT | SeekOp::LE { .. } => { + // All integers are < or <= non-numeric text + // Move to last position and then use the normal seek logic + state.op_seek_state = OpSeekState::MoveLast; + continue; + } + } + } + _ => op, + } + } else { + op + }; + let rowid = if matches!(original_value, Value::Null) { + match actual_op { + SeekOp::GE { .. } | SeekOp::GT => { + state.pc = target_pc.as_offset_int(); + return Ok(InsnFunctionStepResult::Step); + } + SeekOp::LE { .. } | SeekOp::LT => { + // No integers are < NULL, so jump to target + state.pc = target_pc.as_offset_int(); + return Ok(InsnFunctionStepResult::Step); + } + } + } else { + int_key + }; + state.op_seek_state = OpSeekState::Seek { + rowid, + op: actual_op, + }; + continue; + } + OpSeekState::Seek { rowid, op } => { + let seek_result = { + let mut cursor = state.get_cursor(*cursor_id); + let cursor = cursor.as_btree_mut(); + return_if_io!(cursor.seek(SeekKey::TableRowId(*rowid), *op)) + }; + let found = match seek_result { + SeekResult::Found => true, + SeekResult::NotFound => false, + SeekResult::TryAdvance => { + state.op_seek_state = OpSeekState::Advance { op: *op }; + continue; + } + }; + if !found { + state.pc = target_pc.as_offset_int() + } else { + state.pc += 1 + } + return Ok(InsnFunctionStepResult::Step); + } + OpSeekState::Advance { op } => { + let found = { + let mut cursor = state.get_cursor(*cursor_id); + let cursor = cursor.as_btree_mut(); + // Seek operation has anchor number which equals to the closed boundary of the range + // (e.g. for >= x - anchor is x, for > x - anchor is x + 1) + // + // Before Advance state, cursor was positioned to the leaf page which should hold the anchor. + // Sometimes this leaf page can have no matching rows, and in this case + // we need to move cursor in the direction of Seek to find record which matches the seek filter + // + // Consider following scenario: Seek [> 666] + // interior page dividers: I1: [ .. 667 .. ] + // / \ + // leaf pages: P1[661,665] P2[anything here is GT 666] + // After the initial Seek, cursor will be positioned after the end of leaf page P1 [661, 665] + // because this is potential position for insertion of value 666. + // But as P1 has no row matching Seek criteria - we need to move it to the right + // (and as we at the page boundary, we will move cursor to the next neighbor leaf, which guaranteed to have + // row keys greater than divider, which is greater or equal than anchor) + match op { + SeekOp::GT | SeekOp::GE { eq_only: false } => return_if_io!(cursor.next()), + SeekOp::LT | SeekOp::LE { eq_only: false } => return_if_io!(cursor.prev()), + _ => unreachable!("eq_only: true state must be unreachable"), + } + }; + if !found { + state.pc = target_pc.as_offset_int() + } else { + state.pc += 1 + } + return Ok(InsnFunctionStepResult::Step); + } + OpSeekState::MoveLast => { + { + let mut cursor = state.get_cursor(*cursor_id); + let cursor = cursor.as_btree_mut(); + return_if_io!(cursor.last()); + } + state.pc += 1; + return Ok(InsnFunctionStepResult::Step); + } + } } - Ok(InsnFunctionStepResult::Step) } /// Returns the tie-breaker ordering for SQLite index comparison opcodes. @@ -4700,6 +4775,7 @@ pub fn op_insert( Register::Aggregate(..) => unreachable!("Cannot insert an aggregate value."), }; + // query planner must emit NewRowId/NotExists/etc op-codes which will properly reposition cursor return_if_io!(cursor.insert(&BTreeKey::new_table_rowid(key, Some(record.as_ref())), true)); } @@ -4795,16 +4871,16 @@ pub fn op_idx_delete( let found = { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); - let found = return_if_io!( + let seek_result = return_if_io!( cursor.seek(SeekKey::IndexKey(record), SeekOp::GE { eq_only: true }) ); tracing::debug!( "op_idx_delete: found={:?}, rootpage={}, key={:?}", - found, + seek_result, cursor.root_page(), record ); - found + matches!(seek_result, SeekResult::Found) }; if !found { @@ -5089,10 +5165,11 @@ pub fn op_new_rowid( let exists = { let mut cursor = state.get_cursor(*cursor); let cursor = cursor.as_btree_mut(); - return_if_io!(cursor.seek( + let seek_result = return_if_io!(cursor.seek( SeekKey::TableRowId(*candidate), SeekOp::GE { eq_only: true } - )) + )); + matches!(seek_result, SeekResult::Found) }; if !exists { @@ -5205,10 +5282,10 @@ pub fn op_no_conflict( return Ok(InsnFunctionStepResult::Step); } - let conflict = + let seek_result = return_if_io!(cursor.seek(SeekKey::IndexKey(record), SeekOp::GE { eq_only: true })); drop(cursor_ref); - if !conflict { + if !matches!(seek_result, SeekResult::Found) { state.pc = target_pc.as_offset_int(); } else { state.pc += 1; @@ -6022,7 +6099,7 @@ pub fn op_found( let not = matches!(insn, Insn::NotFound { .. }); - let found = { + let seek_result = { let mut cursor = state.get_cursor(*cursor_id); let cursor = cursor.as_btree_mut(); @@ -6043,6 +6120,7 @@ pub fn op_found( } }; + let found = matches!(seek_result, SeekResult::Found); let do_jump = (!found && not) || (found && !not); if do_jump { state.pc = target_pc.as_offset_int(); diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 338f97646..d79e88845 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -33,6 +33,7 @@ use crate::{ vdbe::execute::OpIdxInsertState, vdbe::execute::OpInsertState, vdbe::execute::OpNewRowidState, + vdbe::execute::OpSeekState, RefValue, }; @@ -257,6 +258,7 @@ pub struct ProgramState { op_new_rowid_state: OpNewRowidState, op_idx_insert_state: OpIdxInsertState, op_insert_state: OpInsertState, + op_seek_state: OpSeekState, } impl ProgramState { @@ -286,6 +288,7 @@ impl ProgramState { op_new_rowid_state: OpNewRowidState::Start, op_idx_insert_state: OpIdxInsertState::SeekIfUnique, op_insert_state: OpInsertState::Insert, + op_seek_state: OpSeekState::Start, } } diff --git a/testing/select.test b/testing/select.test index 6471254e3..5cfe2c15d 100755 --- a/testing/select.test +++ b/testing/select.test @@ -572,3 +572,48 @@ if {[info exists ::env(SQLITE_EXEC)] && ($::env(SQLITE_EXEC) eq "scripts/limbo-s select * from t INTERSECT select * from u EXCEPT select * from v; } {} } + +do_execsql_test_on_specific_db {:memory:} select-no-match-in-leaf-page { + CREATE TABLE t(a INTEGER PRIMARY KEY, b); + insert into t values (1, randomblob(1024)); + insert into t values (2, randomblob(1024)); + insert into t values (3, randomblob(1024)); + insert into t values (4, randomblob(1024)); + insert into t values (5, randomblob(1024)); + insert into t values (6, randomblob(1024)); + insert into t values (7, randomblob(1024)); + insert into t values (8, randomblob(1024)); + insert into t values (9, randomblob(1024)); + insert into t values (10, randomblob(1024)); + insert into t values (11, randomblob(1024)); + insert into t values (12, randomblob(1024)); + insert into t values (13, randomblob(1024)); + insert into t values (14, randomblob(1024)); + insert into t values (15, randomblob(1024)); + insert into t values (16, randomblob(1024)); + delete from t where a in (3, 6, 9, 12); + select count(*) from t where a >= 2; + select count(*) from t where a >= 3; + select count(*) from t where a >= 4; + select count(*) from t where a > 1; + select count(*) from t where a > 2; + select count(*) from t where a > 3; + select count(*) from t where a <= 3 ORDER BY a DESC; + select count(*) from t where a <= 4 ORDER BY a DESC; + select count(*) from t where a <= 5 ORDER BY a DESC; + select count(*) from t where a < 2 ORDER BY a DESC; + select count(*) from t where a < 3 ORDER BY a DESC; + select count(*) from t where a < 4 ORDER BY a DESC; +} {11 +10 +10 +11 +10 +10 +2 +3 +4 +1 +2 +2} + diff --git a/tests/Cargo.toml b/tests/Cargo.toml index a891145f2..c50c69e40 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -24,6 +24,7 @@ log = "0.4.22" assert_cmd = "^2" rand_chacha = "0.9.0" rand = "0.9.0" +zerocopy = "0.8.26" [dev-dependencies] test-log = { version = "0.2.17", features = ["trace"] } diff --git a/tests/integration/query_processing/mod.rs b/tests/integration/query_processing/mod.rs index 71309fe5a..3c93c3bb0 100644 --- a/tests/integration/query_processing/mod.rs +++ b/tests/integration/query_processing/mod.rs @@ -1,2 +1,5 @@ +mod test_btree; mod test_read_path; mod test_write_path; + +mod test_multi_thread; diff --git a/tests/integration/query_processing/test_btree.rs b/tests/integration/query_processing/test_btree.rs new file mode 100644 index 000000000..705557687 --- /dev/null +++ b/tests/integration/query_processing/test_btree.rs @@ -0,0 +1,496 @@ +use std::{ + cell::RefCell, + collections::{HashMap, HashSet}, + path::Path, + pin::Pin, + rc::Rc, + sync::Arc, +}; + +use rand::{seq::SliceRandom, RngCore, SeedableRng}; +use rand_chacha::ChaCha8Rng; +use turso_core::{ + Buffer, Completion, CompletionType, File, OpenFlags, PlatformIO, WriteCompletion, IO, +}; +use zerocopy::big_endian::{U16, U32, U64}; + +use crate::common::{limbo_exec_rows, sqlite_exec_rows, TempDatabase}; + +#[derive(Debug, Eq, PartialEq)] +pub enum BTreePageType { + Interior, + Leaf, +} + +#[derive(Debug)] +struct BTreeFreeBlock { + offset: u16, + size: u16, +} + +#[derive(Debug)] +pub struct BTreeLeafCell { + size: usize, + rowid: u64, + on_page_data: Vec, + overflow_page: Option>, +} +#[derive(Debug)] +pub struct BTreeInteriorCell { + left_child_pointer: Rc, + rowid: u64, +} + +#[derive(Debug)] +pub enum BTreeCell { + Interior(BTreeInteriorCell), + Leaf(BTreeLeafCell), +} + +impl BTreeCell { + fn size(&self) -> u16 { + match self { + BTreeCell::Interior(cell) => 4 + length_varint(cell.rowid) as u16, + BTreeCell::Leaf(cell) => { + (length_varint(cell.size as u64) + + length_varint(cell.rowid) + + cell.on_page_data.len() + + cell.overflow_page.as_ref().map(|_| 4).unwrap_or(0)) as u16 + } + } + } +} + +#[derive(Debug)] +pub struct BTreeOverflowPageData { + next: Option>, + payload: Vec, +} + +#[derive(Debug)] +pub struct BTreeTablePageData { + page_type: BTreePageType, + cell_content_area: u16, + cell_right_pointer: Option>, + fragmented_free_bytes: u8, + cells: Vec<(u16, BTreeCell)>, + free_blocks: Vec, +} + +#[derive(Debug)] +pub enum BTreePageData { + Table(BTreeTablePageData), + #[allow(dead_code)] + Overflow(BTreeOverflowPageData), +} + +pub fn list_pages(root: &Rc, pages: &mut Vec>) { + pages.push(root.clone()); + match root.as_ref() { + BTreePageData::Table(root) => { + for (_, cell) in &root.cells { + match cell { + BTreeCell::Interior(cell) => list_pages(&cell.left_child_pointer, pages), + BTreeCell::Leaf(cell) => { + let Some(overflow_page) = &cell.overflow_page else { + continue; + }; + list_pages(overflow_page, pages); + } + } + } + if let Some(right) = &root.cell_right_pointer { + list_pages(right, pages); + } + } + BTreePageData::Overflow(root) => { + if let Some(next) = &root.next { + list_pages(next, pages); + } + } + } +} + +pub fn write_varint(buf: &mut [u8], value: u64) -> usize { + if value <= 0x7f { + buf[0] = (value & 0x7f) as u8; + return 1; + } + + if value <= 0x3fff { + buf[0] = (((value >> 7) & 0x7f) | 0x80) as u8; + buf[1] = (value & 0x7f) as u8; + return 2; + } + + let mut value = value; + if (value & ((0xff000000_u64) << 32)) > 0 { + buf[8] = value as u8; + value >>= 8; + for i in (0..8).rev() { + buf[i] = ((value & 0x7f) | 0x80) as u8; + value >>= 7; + } + return 9; + } + + let mut encoded: [u8; 10] = [0; 10]; + let mut bytes = value; + let mut n = 0; + while bytes != 0 { + let v = 0x80 | (bytes & 0x7f); + encoded[n] = v as u8; + bytes >>= 7; + n += 1; + } + encoded[0] &= 0x7f; + for i in 0..n { + buf[i] = encoded[n - 1 - i]; + } + n +} + +pub fn length_varint(value: u64) -> usize { + let mut buf = [0u8; 10]; + write_varint(&mut buf, value) +} + +fn write_u64_column(header: &mut Vec, data: &mut Vec, value: u64) { + let mut buf = [0u8; 10]; + let buf_len = write_varint(&mut buf, 6u64); + header.extend_from_slice(&buf[0..buf_len]); + data.extend_from_slice(&U64::new(value).to_bytes()); +} + +fn write_blob_column(header: &mut Vec, data: &mut Vec, value: &[u8]) { + let mut buf = [0u8; 10]; + let buf_len = write_varint(&mut buf, (value.len() * 2 + 12) as u64); + header.extend_from_slice(&buf[0..buf_len]); + data.extend_from_slice(value); +} + +fn create_simple_record(value: u64, payload: &[u8]) -> Vec { + let mut header = Vec::new(); + let mut data = Vec::new(); + write_u64_column(&mut header, &mut data, value); + write_blob_column(&mut header, &mut data, payload); + let header_len = header.len() + 1; + assert!(header_len <= 127); + let mut buf = [0u8; 10]; + let buf_len = write_varint(&mut buf, header_len as u64); + let mut result = buf[0..buf_len].to_vec(); + result.extend_from_slice(&header); + result.extend_from_slice(&data); + result +} + +struct BTreeGenerator<'a> { + rng: &'a mut ChaCha8Rng, + max_interior_keys: usize, + max_leaf_keys: usize, + max_payload_size: usize, +} + +impl BTreeGenerator<'_> { + pub fn create_page( + &self, + page: &BTreePageData, + page_numbers: &HashMap<*const BTreePageData, u32>, + ) -> Vec { + match page { + BTreePageData::Table(page) => self.create_btree_page(page, page_numbers), + BTreePageData::Overflow(page) => self.create_overflow_page(page, page_numbers), + } + } + pub fn create_overflow_page( + &self, + page: &BTreeOverflowPageData, + page_numbers: &HashMap<*const BTreePageData, u32>, + ) -> Vec { + let mut data = [255u8; 4096]; + let first_4bytes = if let Some(next) = &page.next { + *page_numbers.get(&Rc::as_ptr(next)).unwrap() + } else { + 0 + }; + data[0..4].copy_from_slice(&U32::new(first_4bytes).to_bytes()); + data[4..4 + page.payload.len()].copy_from_slice(&page.payload); + data.to_vec() + } + pub fn create_btree_page( + &self, + page: &BTreeTablePageData, + page_numbers: &HashMap<*const BTreePageData, u32>, + ) -> Vec { + let mut data = [255u8; 4096]; + + data[0] = match page.page_type { + BTreePageType::Interior => 0x05, + BTreePageType::Leaf => 0x0d, + }; + data[1..3].copy_from_slice( + &U16::new(page.free_blocks.first().map(|x| x.offset).unwrap_or(0)).to_bytes(), + ); + data[3..5].copy_from_slice(&U16::new(page.cells.len() as u16).to_bytes()); + data[5..7].copy_from_slice(&U16::new(page.cell_content_area).to_bytes()); + data[7] = page.fragmented_free_bytes; + let mut offset = 8; + if page.page_type == BTreePageType::Interior { + let cell_right_pointer = page.cell_right_pointer.as_ref().unwrap(); + let cell_right_pointer = Rc::as_ptr(cell_right_pointer); + let cell_right_pointer = page_numbers.get(&cell_right_pointer).unwrap(); + data[8..12].copy_from_slice(&U32::new(*cell_right_pointer).to_bytes()); + offset = 12; + } + + for (i, (pointer, _)) in page.cells.iter().enumerate() { + data[offset + 2 * i..offset + 2 * (i + 1)] + .copy_from_slice(&U16::new(*pointer).to_bytes()); + } + + for i in 0..page.free_blocks.len() { + let offset = page.free_blocks[i].offset as usize; + data[offset..offset + 2].copy_from_slice( + &U16::new(page.free_blocks.get(i + 1).map(|x| x.offset).unwrap_or(0)).to_bytes(), + ); + data[offset + 2..offset + 4] + .copy_from_slice(&U16::new(page.free_blocks[i].size).to_bytes()); + } + + for (pointer, cell) in page.cells.iter() { + let mut p = *pointer as usize; + match cell { + BTreeCell::Interior(cell) => { + let left_child_pointer = Rc::as_ptr(&cell.left_child_pointer); + let left_child_pointer = page_numbers.get(&left_child_pointer).unwrap(); + data[p..p + 4].copy_from_slice(&U32::new(*left_child_pointer).to_bytes()); + p += 4; + _ = write_varint(&mut data[p..], cell.rowid); + } + BTreeCell::Leaf(cell) => { + p += write_varint(&mut data[p..], cell.size as u64); + p += write_varint(&mut data[p..], cell.rowid); + data[p..p + cell.on_page_data.len()].copy_from_slice(&cell.on_page_data); + p += cell.on_page_data.len(); + if let Some(overflow_page) = &cell.overflow_page { + let overflow_page = Rc::as_ptr(overflow_page); + let overflow_page = page_numbers.get(&overflow_page).unwrap(); + data[p..p + 4].copy_from_slice(&U32::new(*overflow_page).to_bytes()); + } + } + } + } + + data.into() + } + + fn generate_btree(&mut self, depth: usize, mut l: u64, r: u64) -> Rc { + let mut cells = vec![]; + let cells_max_limit = if depth == 0 { + self.max_leaf_keys + } else { + self.max_interior_keys + }; + let cells_limit = self.rng.next_u32() as usize % cells_max_limit + 1; + + let mut rowids = HashSet::new(); + for _ in 0..cells_limit { + let rowid = l + self.rng.next_u64() % (r - l + 1); + if rowids.contains(&rowid) { + continue; + } + rowids.insert(rowid); + } + + let mut rowids = rowids.into_iter().collect::>(); + rowids.sort(); + + let header_offset = if depth == 0 { 8 } else { 12 }; + let mut it = 0; + let mut cells_size = header_offset; + while cells.len() < cells_limit && it < rowids.len() { + let rowid = rowids[it]; + it += 1; + + let cell = if depth == 0 { + let length = self.rng.next_u32() as usize % self.max_payload_size; + let record = create_simple_record(rowid, &vec![1u8; length]); + BTreeCell::Leaf(BTreeLeafCell { + size: record.len(), + rowid, + on_page_data: record, + overflow_page: None, + }) + } else { + BTreeCell::Interior(BTreeInteriorCell { + left_child_pointer: self.generate_btree(depth - 1, l, rowid), + rowid, + }) + }; + if cells_size + 2 + cell.size() > 4096 { + break; + } + cells_size += 2 + cell.size(); + cells.push((rowid, cell)); + if depth > 0 { + l = rowid + 1; + } + } + + cells.shuffle(&mut self.rng); + + let mut cells_with_offset = Vec::new(); + let mut fragmentation_budget = 4096 - cells_size; + let mut pointer_offset = header_offset; + let mut content_offset = 4096; + let mut fragmented_free_bytes = 0; + let mut free_blocks = vec![]; + + for (rowid, cell) in cells { + let mut fragmentation = ((self.rng.next_u32() % 4) as u16).min(fragmentation_budget); + if fragmented_free_bytes + fragmentation > 60 { + fragmentation = 0; + } + let mut free_block_size = 0; + if fragmentation == 0 && fragmentation_budget >= 4 { + free_block_size = 4 + self.rng.next_u32() as u16 % (fragmentation_budget - 3); + } + + let cell_size = cell.size() + fragmentation.max(free_block_size); + assert!(pointer_offset + 2 + cell_size <= content_offset); + + pointer_offset += 2; + content_offset -= cell_size; + fragmented_free_bytes += fragmentation; + fragmentation_budget -= fragmentation.max(free_block_size); + if free_block_size > 0 { + free_blocks.push(BTreeFreeBlock { + offset: content_offset + cell.size(), + size: free_block_size, + }); + } + cells_with_offset.push((rowid, content_offset, cell)); + } + + cells_with_offset.sort_by_key(|(rowid, ..)| *rowid); + let cells = cells_with_offset + .into_iter() + .map(|(_, offset, cell)| (offset, cell)) + .collect::>(); + + free_blocks.sort_by_key(|x| x.offset); + + if depth == 0 { + Rc::new(BTreePageData::Table(BTreeTablePageData { + page_type: BTreePageType::Leaf, + cell_content_area: content_offset, + cell_right_pointer: None, + fragmented_free_bytes: fragmented_free_bytes as u8, + cells, + free_blocks, + })) + } else { + Rc::new(BTreePageData::Table(BTreeTablePageData { + page_type: BTreePageType::Interior, + cell_content_area: content_offset, + cell_right_pointer: if l <= r { + Some(self.generate_btree(depth - 1, l, r)) + } else { + None + }, + fragmented_free_bytes: fragmented_free_bytes as u8, + cells, + free_blocks, + })) + } + } + + fn write_btree(&mut self, path: &Path, root: &Rc, start_page: u32) { + let mut pages = Vec::new(); + list_pages(root, &mut pages); + pages[1..].shuffle(&mut self.rng); + let mut page_numbers = HashMap::new(); + for (page, page_no) in pages.iter().zip(start_page..) { + page_numbers.insert(Rc::as_ptr(page), page_no); + } + + let io = PlatformIO::new().unwrap(); + let file = io + .open_file(path.to_str().unwrap(), OpenFlags::None, true) + .unwrap(); + + assert_eq!(file.size().unwrap(), 4096 * 2); + for (i, page) in pages.iter().enumerate() { + let page = self.create_page(page, &page_numbers); + write_at(&io, file.clone(), 4096 * (i + 1), &page); + } + let size = 1 + pages.len(); + let size_bytes = U32::new(size as u32).to_bytes(); + write_at(&io, file, 28, &size_bytes); + } +} + +fn write_at(io: &impl IO, file: Arc, offset: usize, data: &[u8]) { + let completion = Completion::new(CompletionType::Write(WriteCompletion::new(Box::new( + |_| {}, + )))); + let drop_fn = Rc::new(move |_| {}); + #[allow(clippy::arc_with_non_send_sync)] + let buffer = Arc::new(RefCell::new(Buffer::new(Pin::new(data.to_vec()), drop_fn))); + let result = file.pwrite(offset, buffer, completion).unwrap(); + while !result.is_completed() { + io.run_once().unwrap(); + } +} + +#[test] +fn test_btree() { + let _ = env_logger::try_init(); + let mut rng = ChaCha8Rng::seed_from_u64(0); + for depth in 0..4 { + for attempt in 0..16 { + let db = TempDatabase::new_with_rusqlite( + "create table test (k INTEGER PRIMARY KEY, b BLOB);", + false, + ); + log::info!( + "depth: {}, attempt: {}, path: {:?}", + depth, + attempt, + db.path + ); + + let mut generator = BTreeGenerator { + rng: &mut rng, + max_interior_keys: 3, + max_leaf_keys: 4096, + max_payload_size: 100, + }; + let root = generator.generate_btree(depth, 0, i64::MAX as u64); + generator.write_btree(&db.path, &root, 2); + + for _ in 0..16 { + let mut l = rng.next_u64() % (i64::MAX as u64); + let mut r = rng.next_u64() % (i64::MAX as u64); + if l > r { + (l, r) = (r, l); + } + + let query = format!("SELECT SUM(LENGTH(b)) FROM test WHERE k >= {l} AND k <= {r}"); + let sqlite_sum = { + let conn = rusqlite::Connection::open(&db.path).unwrap(); + sqlite_exec_rows(&conn, &query) + }; + let limbo_sum = { + let conn = db.connect_limbo(); + limbo_exec_rows(&db, &conn, &query) + }; + assert_eq!( + limbo_sum, sqlite_sum, + "query={}, limbo={:?}, sqlite={:?}", + query, limbo_sum, sqlite_sum + ); + } + } + } +} diff --git a/tests/integration/query_processing/test_multi_thread.rs b/tests/integration/query_processing/test_multi_thread.rs new file mode 100644 index 000000000..1d96e0422 --- /dev/null +++ b/tests/integration/query_processing/test_multi_thread.rs @@ -0,0 +1,27 @@ +use crate::common::TempDatabase; + +#[test] +fn test_schema_change() { + let tmp_db = TempDatabase::new_empty(false); + let conn1 = tmp_db.connect_limbo(); + conn1.execute("CREATE TABLE t(x, y, z)").unwrap(); + conn1 + .execute("INSERT INTO t VALUES (1, 2, 3), (10, 20, 30)") + .unwrap(); + let conn2 = tmp_db.connect_limbo(); + let mut stmt = conn2.prepare("SELECT x, z FROM t").unwrap(); + conn1.execute("ALTER TABLE t DROP COLUMN x").unwrap(); + let row = loop { + match stmt.step() { + Ok(turso_core::StepResult::Row) => { + let row = stmt.row().unwrap(); + break row; + } + Ok(turso_core::StepResult::IO) => { + stmt.run_once().unwrap(); + } + _ => panic!("unexpected step result"), + } + }; + println!("{:?} {:?}", row.get_value(0), row.get_value(1)); +}