//! The virtual database engine (VDBE). //! //! The VDBE is a register-based virtual machine that execute bytecode //! instructions that represent SQL statements. When an application prepares //! an SQL statement, the statement is compiled into a sequence of bytecode //! instructions that perform the needed operations, such as reading or //! writing to a b-tree, sorting, or aggregating data. //! //! The instruction set of the VDBE is similar to SQLite's instruction set, //! but with the exception that bytecodes that perform I/O operations are //! return execution back to the caller instead of blocking. This is because //! Turso is designed for applications that need high concurrency such as //! serverless runtimes. In addition, asynchronous I/O makes storage //! disaggregation easier. //! //! You can find a full list of SQLite opcodes at: //! //! https://www.sqlite.org/opcode.html pub mod affinity; pub mod builder; pub mod execute; pub mod explain; pub mod insn; pub mod likeop; pub mod metrics; pub mod rowset; pub mod sorter; pub mod value; use crate::{ error::LimboError, function::{AggFunc, FuncCtx}, mvcc::{database::CommitStateMachine, LocalClock}, return_if_io, schema::Trigger, state_machine::StateMachine, storage::{pager::PagerCommitResult, sqlite3_ondisk::SmallVec}, translate::{collate::CollationSeq, plan::TableReferences}, types::{IOCompletions, IOResult}, vdbe::{ execute::{ OpCheckpointState, OpColumnState, OpDeleteState, OpDeleteSubState, OpDestroyState, OpIdxInsertState, OpInsertState, OpInsertSubState, OpNewRowidState, OpNoConflictState, OpProgramState, OpRowIdState, OpSeekState, OpTransactionState, }, metrics::StatementMetrics, }, ValueRef, }; use crate::{ storage::pager::Pager, translate::plan::ResultSetColumn, types::{AggContext, Cursor, ImmutableRecord, Value}, vdbe::{builder::CursorType, insn::Insn}, }; #[cfg(feature = "json")] use crate::json::JsonCacheCell; use crate::{Connection, MvStore, Result, TransactionState}; use builder::{CursorKey, 
QueryMode}; use execute::{ InsnFunction, InsnFunctionStepResult, OpIdxDeleteState, OpIntegrityCheckState, OpOpenEphemeralState, }; use parking_lot::RwLock; use turso_parser::ast::ResolveType; use crate::vdbe::rowset::RowSet; use explain::{insn_to_row_with_comment, EXPLAIN_COLUMNS, EXPLAIN_QUERY_PLAN_COLUMNS}; use regex::Regex; use std::{ collections::HashMap, num::NonZero, sync::{ atomic::{AtomicI64, AtomicIsize, Ordering}, Arc, }, task::Waker, }; use tracing::{instrument, Level}; /// State machine for committing view deltas with I/O handling #[derive(Debug, Clone)] pub enum ViewDeltaCommitState { NotStarted, Processing { views: Vec, // view names (all materialized views have storage) current_index: usize, }, Done, } /// We use labels to indicate that we want to jump to whatever the instruction offset /// will be at runtime, because the offset cannot always be determined when the jump /// instruction is created. /// /// In some cases, we want to jump to EXACTLY a specific instruction. /// - Example: a condition is not met, so we want to jump to wherever Halt is. /// /// In other cases, we don't care what the exact instruction is, but we know that we /// want to jump to whatever comes AFTER a certain instruction. /// - Example: a Next instruction will want to jump to "whatever the start of the loop is", /// but it doesn't care what instruction that is. /// /// The reason this distinction is important is that we might reorder instructions that are /// constant at compile time, and when we do that, we need to change the offsets of any impacted /// jump instructions, so the instruction that comes immediately after "next Insn" might have changed during the reordering. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum JumpTarget { ExactlyThisInsn, AfterThisInsn, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] /// Represents a target for a jump instruction. /// Stores 32-bit ints to keep the enum word-sized. 
pub enum BranchOffset { /// A label is a named location in the program. /// If there are references to it, it must always be resolved to an Offset /// via program.resolve_label(). Label(u32), /// An offset is a direct index into the instruction list. Offset(InsnReference), /// A placeholder is a temporary value to satisfy the compiler. /// It must be set later. Placeholder, } impl BranchOffset { /// Returns true if the branch offset is a label. pub fn is_label(&self) -> bool { matches!(self, BranchOffset::Label(_)) } /// Returns true if the branch offset is an offset. pub fn is_offset(&self) -> bool { matches!(self, BranchOffset::Offset(_)) } /// Returns the offset value. Panics if the branch offset is a label or placeholder. pub fn as_offset_int(&self) -> InsnReference { match self { BranchOffset::Label(v) => unreachable!("Unresolved label: {}", v), BranchOffset::Offset(v) => *v, BranchOffset::Placeholder => unreachable!("Unresolved placeholder"), } } /// Returns the branch offset as a signed integer. /// Used in explain output, where we don't want to panic in case we have an unresolved /// label or placeholder. pub fn as_debug_int(&self) -> i32 { match self { BranchOffset::Label(v) => *v as i32, BranchOffset::Offset(v) => *v as i32, BranchOffset::Placeholder => i32::MAX, } } /// Adds an integer value to the branch offset. /// Returns a new branch offset. /// Panics if the branch offset is a label or placeholder. 
pub fn add>(self, n: N) -> BranchOffset { BranchOffset::Offset(self.as_offset_int() + n.into()) } pub fn sub>(self, n: N) -> BranchOffset { BranchOffset::Offset(self.as_offset_int() - n.into()) } } pub type CursorID = usize; pub type PageIdx = i64; // Index of insn in list of insns type InsnReference = u32; #[derive(Debug)] pub enum StepResult { Done, IO, Row, Interrupt, Busy, } struct RegexCache { like: HashMap, glob: HashMap, } impl RegexCache { fn new() -> Self { Self { like: HashMap::new(), glob: HashMap::new(), } } } struct Bitfield([u64; N]); impl Bitfield { fn new() -> Self { Self([0; N]) } fn set(&mut self, bit: usize) { assert!(bit < N * 64, "bit out of bounds"); self.0[bit / 64] |= 1 << (bit % 64); } fn unset(&mut self, bit: usize) { assert!(bit < N * 64, "bit out of bounds"); self.0[bit / 64] &= !(1 << (bit % 64)); } fn get(&self, bit: usize) -> bool { assert!(bit < N * 64, "bit out of bounds"); (self.0[bit / 64] & (1 << (bit % 64))) != 0 } } #[derive(Debug)] #[allow(clippy::large_enum_variant)] /// The commit state of the program. /// There are two states: /// - Ready: The program is ready to run the next instruction, or has shut down after /// the last instruction. /// - Committing: The program is committing a write transaction. It is waiting for the pager to finish flushing the cache to disk, /// primarily to the WAL, but also possibly checkpointing the WAL to the database file. enum CommitState { Ready, Committing, CommitingMvcc { state_machine: StateMachine>, }, } #[derive(Debug, Clone)] pub enum Register { Value(Value), Aggregate(AggContext), Record(ImmutableRecord), } impl Register { #[inline] pub fn is_null(&self) -> bool { matches!(self, Register::Value(Value::Null)) } } /// A row is a the list of registers that hold the values for a filtered row. This row is a pointer, therefore /// after stepping again, row will be invalidated to be sure it doesn't point to somewhere unexpected. 
#[derive(Debug)] pub struct Row { values: *const Register, count: usize, } // SAFETY: This needs to be audited for thread safety. // See: https://github.com/tursodatabase/turso/issues/1552 unsafe impl Send for Row {} unsafe impl Sync for Row {} #[derive(Debug, Clone, Copy, PartialEq)] pub enum TxnCleanup { None, RollbackTxn, } /// The program state describes the environment in which the program executes. pub struct ProgramState { pub io_completions: Option, pub pc: InsnReference, pub(crate) cursors: Vec>, cursor_seqs: Vec, registers: Vec, pub(crate) result_row: Option, last_compare: Option, deferred_seeks: Vec>, ended_coroutine: Bitfield<4>, // flag to indicate that a coroutine has ended (key is the yield register. currently we assume that the yield register is always between 0-255, YOLO) /// Indicate whether an [Insn::Once] instruction at a given program counter position has already been executed, well, once. once: SmallVec, regex_cache: RegexCache, interrupted: bool, pub parameters: HashMap, Value>, commit_state: CommitState, #[cfg(feature = "json")] json_cache: JsonCacheCell, op_delete_state: OpDeleteState, op_destroy_state: OpDestroyState, op_idx_delete_state: Option, op_integrity_check_state: OpIntegrityCheckState, /// Metrics collected during statement execution pub metrics: StatementMetrics, op_open_ephemeral_state: OpOpenEphemeralState, op_program_state: OpProgramState, op_new_rowid_state: OpNewRowidState, op_idx_insert_state: OpIdxInsertState, op_insert_state: OpInsertState, op_no_conflict_state: OpNoConflictState, seek_state: OpSeekState, /// Current collation sequence set by OP_CollSeq instruction current_collation: Option, op_column_state: OpColumnState, op_row_id_state: OpRowIdState, op_transaction_state: OpTransactionState, op_checkpoint_state: OpCheckpointState, /// State machine for committing view deltas with I/O handling view_delta_state: ViewDeltaCommitState, /// Marker which tells about auto transaction cleanup necessary for that connection in 
case of reset /// This is used when statement in auto-commit mode reseted after previous uncomplete execution - in which case we may need to rollback transaction started on previous attempt /// Note, that MVCC transactions are always explicit - so they do not update auto_txn_cleanup marker pub(crate) auto_txn_cleanup: TxnCleanup, /// Number of deferred foreign key violations when the statement started. /// When a statement subtransaction rolls back, the connection's deferred foreign key violations counter /// is reset to this value. fk_deferred_violations_when_stmt_started: AtomicIsize, /// Number of immediate foreign key violations that occurred during the active statement. If nonzero, /// the statement subtransactionwill roll back. fk_immediate_violations_during_stmt: AtomicIsize, /// RowSet objects stored by register index rowsets: HashMap, } impl std::fmt::Debug for Program { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Program").finish() } } // SAFETY: This needs to be audited for thread safety. 
// See: https://github.com/tursodatabase/turso/issues/1552 unsafe impl Send for ProgramState {} unsafe impl Sync for ProgramState {} impl ProgramState { pub fn new(max_registers: usize, max_cursors: usize) -> Self { let cursors: Vec> = (0..max_cursors).map(|_| None).collect(); let cursor_seqs = vec![0i64; max_cursors]; let registers = vec![Register::Value(Value::Null); max_registers]; Self { io_completions: None, pc: 0, cursors, cursor_seqs, registers, result_row: None, last_compare: None, deferred_seeks: vec![None; max_cursors], ended_coroutine: Bitfield::new(), once: SmallVec::::new(), regex_cache: RegexCache::new(), interrupted: false, parameters: HashMap::new(), commit_state: CommitState::Ready, #[cfg(feature = "json")] json_cache: JsonCacheCell::new(), op_delete_state: OpDeleteState { sub_state: OpDeleteSubState::MaybeCaptureRecord, deleted_record: None, }, op_destroy_state: OpDestroyState::CreateCursor, op_idx_delete_state: None, op_integrity_check_state: OpIntegrityCheckState::Start, metrics: StatementMetrics::new(), op_open_ephemeral_state: OpOpenEphemeralState::Start, op_program_state: OpProgramState::Start, op_new_rowid_state: OpNewRowidState::Start, op_idx_insert_state: OpIdxInsertState::MaybeSeek, op_insert_state: OpInsertState { sub_state: OpInsertSubState::MaybeCaptureRecord, old_record: None, }, op_no_conflict_state: OpNoConflictState::Start, seek_state: OpSeekState::Start, current_collation: None, op_column_state: OpColumnState::Start, op_row_id_state: OpRowIdState::Start, op_transaction_state: OpTransactionState::Start, op_checkpoint_state: OpCheckpointState::StartCheckpoint, view_delta_state: ViewDeltaCommitState::NotStarted, auto_txn_cleanup: TxnCleanup::None, fk_deferred_violations_when_stmt_started: AtomicIsize::new(0), fk_immediate_violations_during_stmt: AtomicIsize::new(0), rowsets: HashMap::new(), } } pub fn set_register(&mut self, idx: usize, value: Register) { self.registers[idx] = value; } pub fn get_register(&self, idx: usize) -> 
&Register { &self.registers[idx] } pub fn column_count(&self) -> usize { self.registers.len() } pub fn column(&self, i: usize) -> Option { Some(format!("{:?}", self.registers[i])) } pub fn interrupt(&mut self) { self.interrupted = true; } pub fn is_interrupted(&self) -> bool { self.interrupted } pub fn bind_at(&mut self, index: NonZero, value: Value) { self.parameters.insert(index, value); } pub fn clear_bindings(&mut self) { self.parameters.clear(); } pub fn get_parameter(&self, index: NonZero) -> Value { self.parameters.get(&index).cloned().unwrap_or(Value::Null) } pub fn reset(&mut self, max_registers: Option, max_cursors: Option) { self.pc = 0; if let Some(max_cursors) = max_cursors { self.cursors.resize_with(max_cursors, || None); self.cursor_seqs.resize(max_cursors, 0); } if let Some(max_registers) = max_registers { self.registers .resize_with(max_registers, || Register::Value(Value::Null)); } // reset cursors as they can have cached information which will be no longer relevant on next program execution self.cursors.iter_mut().for_each(|c| { let _ = c.take(); }); self.registers .iter_mut() .for_each(|r| *r = Register::Value(Value::Null)); self.last_compare = None; self.deferred_seeks.iter_mut().for_each(|s| *s = None); self.ended_coroutine.0 = [0; 4]; self.regex_cache.like.clear(); self.interrupted = false; self.current_collation = None; #[cfg(feature = "json")] self.json_cache.clear(); // Reset state machines self.op_delete_state = OpDeleteState { sub_state: OpDeleteSubState::MaybeCaptureRecord, deleted_record: None, }; self.op_idx_delete_state = None; self.op_integrity_check_state = OpIntegrityCheckState::Start; self.metrics = StatementMetrics::new(); self.op_open_ephemeral_state = OpOpenEphemeralState::Start; self.op_new_rowid_state = OpNewRowidState::Start; self.op_idx_insert_state = OpIdxInsertState::MaybeSeek; self.op_insert_state = OpInsertState { sub_state: OpInsertSubState::MaybeCaptureRecord, old_record: None, }; self.op_no_conflict_state = 
OpNoConflictState::Start; self.seek_state = OpSeekState::Start; self.current_collation = None; self.op_column_state = OpColumnState::Start; self.op_row_id_state = OpRowIdState::Start; self.view_delta_state = ViewDeltaCommitState::NotStarted; self.auto_txn_cleanup = TxnCleanup::None; self.fk_immediate_violations_during_stmt .store(0, Ordering::SeqCst); self.fk_deferred_violations_when_stmt_started .store(0, Ordering::SeqCst); self.rowsets.clear(); } pub fn get_cursor(&mut self, cursor_id: CursorID) -> &mut Cursor { self.cursors .get_mut(cursor_id) .unwrap_or_else(|| panic!("cursor id {cursor_id} out of bounds")) .as_mut() .unwrap_or_else(|| panic!("cursor id {cursor_id} is None")) } /// Begin a statement subtransaction. pub fn begin_statement( &mut self, connection: &Connection, pager: &Arc, write: bool, ) -> Result> { // Store the deferred foreign key violations counter at the start of the statement. // This is used to ensure that if an interactive transaction had deferred FK violations and a statement subtransaction rolls back, // the deferred FK violations are not lost. self.fk_deferred_violations_when_stmt_started.store( connection.fk_deferred_violations.load(Ordering::Acquire), Ordering::SeqCst, ); // Reset the immediate foreign key violations counter to 0. If this is nonzero when the statement completes, the statement subtransaction will roll back. self.fk_immediate_violations_during_stmt .store(0, Ordering::SeqCst); if write { let db_size = return_if_io!(pager.with_header(|header| header.database_size.get())); pager.begin_statement(db_size)?; } Ok(IOResult::Done(())) } /// End a statement subtransaction. 
pub fn end_statement( &mut self, connection: &Connection, pager: &Arc, end_statement: EndStatement, ) -> Result<()> { match end_statement { EndStatement::ReleaseSavepoint => pager.release_savepoint(), EndStatement::RollbackSavepoint => { let stmt_was_rolled_back = pager.rollback_to_newest_savepoint()?; if !stmt_was_rolled_back { // We sometimes call end_statement() on errors without explicitly knowing whether a stmt transaction // caused the error or not. If it didn't, don't reset any FK violation counters. return Ok(()); } // Reset the deferred foreign key violations counter to the value it had at the start of the statement. // This is used to ensure that if an interactive transaction had deferred FK violations, they are not lost. connection.fk_deferred_violations.store( self.fk_deferred_violations_when_stmt_started .load(Ordering::Acquire), Ordering::SeqCst, ); Ok(()) } } } } #[derive(Debug, Clone, Copy, PartialEq, Eq)] /// Action to take at the end of a statement subtransaction. pub enum EndStatement { /// Release (commit) the savepoint -- effectively removing the savepoint as it is no longer needed for undo purposes. ReleaseSavepoint, /// Rollback (abort) to the newest savepoint: read pages from the subjournal and restore them to the page cache. /// This is used to undo the changes made by the statement. RollbackSavepoint, } impl Register { pub fn get_value(&self) -> &Value { match self { Register::Value(v) => v, Register::Record(r) => { assert!(!r.is_invalidated()); r.as_blob_value() } _ => panic!("register holds unexpected value: {self:?}"), } } } #[macro_export] macro_rules! 
must_be_btree_cursor { ($cursor_id:expr, $cursor_ref:expr, $state:expr, $insn_name:expr) => {{ let (_, cursor_type) = $cursor_ref.get($cursor_id).unwrap(); if matches!( cursor_type, CursorType::BTreeTable(_) | CursorType::BTreeIndex(_) | CursorType::MaterializedView(_, _) ) { $crate::get_cursor!($state, $cursor_id) } else { panic!("{} on unexpected cursor", $insn_name) } }}; } /// Macro is necessary to help the borrow checker see we are only accessing state.cursor field /// and nothing else #[macro_export] macro_rules! get_cursor { ($state:expr, $cursor_id:expr) => { $state .cursors .get_mut($cursor_id) .unwrap_or_else(|| panic!("cursor id {} out of bounds", $cursor_id)) .as_mut() .unwrap_or_else(|| panic!("cursor id {} is None", $cursor_id)) }; } /// Tracks the state of explain mode execution, including which subprograms need to be processed. #[derive(Default)] pub struct ExplainState { /// Program counter positions in the parent program where `Insn::Program` instructions occur. parent_program_pcs: Vec, /// Index of the subprogram currently being processed, if any. current_subprogram_index: Option, /// PC value when we started processing the current subprogram, to detect if we need to reset. subprogram_start_pc: Option, } pub struct Program { pub max_registers: usize, // we store original indices because we don't want to create new vec from // ProgramBuilder pub insns: Vec<(Insn, usize)>, pub cursor_ref: Vec<(Option, CursorType)>, pub comments: Vec<(InsnReference, &'static str)>, pub parameters: crate::parameters::Parameters, pub connection: Arc, pub n_change: AtomicI64, pub change_cnt_on: bool, pub result_columns: Vec, pub table_references: TableReferences, pub sql: String, /// Whether the program accesses the database. /// Used to determine whether we need to check for schema changes when /// starting a transaction. 
pub accesses_db: bool, /// In SQLite, whether statement subtransactions will be used for executing a program (`usesStmtJournal`) /// is determined by the parser flags "mayAbort" and "isMultiWrite". Essentially this means that the individual /// statement may need to be aborted due to a constraint conflict, etc. instead of the entire transaction. pub needs_stmt_subtransactions: bool, pub trigger: Option>, pub resolve_type: ResolveType, pub explain_state: RwLock, } impl Program { fn get_pager_from_database_index(&self, idx: &usize) -> Arc { self.connection.get_pager_from_database_index(idx) } pub fn step( &self, state: &mut ProgramState, mv_store: Option<&Arc>, pager: Arc, query_mode: QueryMode, waker: Option<&Waker>, ) -> Result { match query_mode { QueryMode::Normal => self.normal_step(state, mv_store, pager, waker), QueryMode::Explain => self.explain_step(state, mv_store, pager), QueryMode::ExplainQueryPlan => self.explain_query_plan_step(state, mv_store, pager), } } fn explain_step( &self, state: &mut ProgramState, _mv_store: Option<&Arc>, pager: Arc, ) -> Result { debug_assert!(state.column_count() == EXPLAIN_COLUMNS.len()); if self.connection.is_closed() { // Connection is closed for whatever reason, rollback the transaction. let state = self.connection.get_tx_state(); if let TransactionState::Write { .. } = state { pager.rollback_tx(&self.connection); } return Err(LimboError::InternalError("Connection closed".to_string())); } if state.is_interrupted() { return Ok(StepResult::Interrupt); } // FIXME: do we need this? 
state.metrics.vm_steps = state.metrics.vm_steps.saturating_add(1); let mut explain_state = self.explain_state.write(); // Check if we're processing a subprogram if let Some(sub_idx) = explain_state.current_subprogram_index { if sub_idx >= explain_state.parent_program_pcs.len() { // All subprograms processed *explain_state = ExplainState::default(); return Ok(StepResult::Done); } let parent_pc = explain_state.parent_program_pcs[sub_idx]; let Insn::Program { program: p, .. } = &self.insns[parent_pc].0 else { panic!("Expected program insn at pc {parent_pc}"); }; let p = &mut p.write().program; let subprogram_insn_count = p.insns.len(); // Check if the subprogram has already finished (PC is out of bounds) // This can happen if the subprogram finished in a previous call but we're being called again if state.pc as usize >= subprogram_insn_count { // Subprogram is done, move to next one explain_state.subprogram_start_pc = None; if sub_idx + 1 < explain_state.parent_program_pcs.len() { explain_state.current_subprogram_index = Some(sub_idx + 1); state.pc = 0; drop(explain_state); return self.explain_step(state, _mv_store, pager); } else { *explain_state = ExplainState::default(); return Ok(StepResult::Done); } } // Reset PC to 0 only when starting a new subprogram (when subprogram_start_pc is None) // Once we've started, let the subprogram manage its own PC through its explain_step if explain_state.subprogram_start_pc.is_none() { state.pc = 0; explain_state.subprogram_start_pc = Some(0); } // Process the subprogram - it will handle its own explain_step internally // The subprogram's explain_step will process all its instructions (including any nested subprograms) // and return StepResult::Row for each instruction, then StepResult::Done when finished let result = p.step(state, None, pager.clone(), QueryMode::Explain, None)?; match result { StepResult::Done => { // This subprogram is done, move to next one explain_state.subprogram_start_pc = None; // Clear the start PC marker 
if sub_idx + 1 < explain_state.parent_program_pcs.len() { // Move to next subprogram explain_state.current_subprogram_index = Some(sub_idx + 1); // Reset PC to 0 for the next subprogram state.pc = 0; // Recursively call to process the next subprogram drop(explain_state); return self.explain_step(state, _mv_store, pager); } else { // All subprograms done *explain_state = ExplainState::default(); return Ok(StepResult::Done); } } StepResult::Row => { // Output a row from the subprogram // The subprogram's step already set up the registers with PC starting at 0 // Don't reset subprogram_start_pc - we're still processing this subprogram drop(explain_state); return Ok(StepResult::Row); } other => { drop(explain_state); return Ok(other); } } } // We're processing the parent program if state.pc as usize >= self.insns.len() { // Parent program is done, start processing subprograms if explain_state.parent_program_pcs.is_empty() { // No subprograms to process *explain_state = ExplainState::default(); return Ok(StepResult::Done); } // Start processing the first subprogram explain_state.current_subprogram_index = Some(0); explain_state.subprogram_start_pc = None; // Will be set when we actually start processing state.pc = 0; // Reset PC to 0 for the first subprogram drop(explain_state); return self.explain_step(state, _mv_store, pager); } let (current_insn, _) = &self.insns[state.pc as usize]; if matches!(current_insn, Insn::Program { .. 
}) { explain_state.parent_program_pcs.push(state.pc as usize); } let (opcode, p1, p2, p3, p4, p5, comment) = insn_to_row_with_comment( self, current_insn, self.comments .iter() .find(|(offset, _)| *offset == state.pc) .map(|(_, comment)| comment) .copied(), ); state.registers[0] = Register::Value(Value::Integer(state.pc as i64)); state.registers[1] = Register::Value(Value::from_text(opcode)); state.registers[2] = Register::Value(Value::Integer(p1 as i64)); state.registers[3] = Register::Value(Value::Integer(p2 as i64)); state.registers[4] = Register::Value(Value::Integer(p3 as i64)); state.registers[5] = Register::Value(p4); state.registers[6] = Register::Value(Value::Integer(p5 as i64)); state.registers[7] = Register::Value(Value::from_text(comment)); state.result_row = Some(Row { values: &state.registers[0] as *const Register, count: EXPLAIN_COLUMNS.len(), }); state.pc += 1; Ok(StepResult::Row) } fn explain_query_plan_step( &self, state: &mut ProgramState, _mv_store: Option<&Arc>, pager: Arc, ) -> Result { debug_assert!(state.column_count() == EXPLAIN_QUERY_PLAN_COLUMNS.len()); loop { if self.connection.is_closed() { // Connection is closed for whatever reason, rollback the transaction. let state = self.connection.get_tx_state(); if let TransactionState::Write { .. } = state { pager.rollback_tx(&self.connection); } return Err(LimboError::InternalError("Connection closed".to_string())); } if state.is_interrupted() { return Ok(StepResult::Interrupt); } // FIXME: do we need this? 
state.metrics.vm_steps = state.metrics.vm_steps.saturating_add(1);

            if state.pc as usize >= self.insns.len() {
                return Ok(StepResult::Done);
            }

            // Only `Insn::Explain` instructions produce a row; skip everything else.
            let Insn::Explain { p1, p2, detail } = &self.insns[state.pc as usize].0 else {
                state.pc += 1;
                continue;
            };

            state.registers[0] = Register::Value(Value::Integer(*p1 as i64));
            // A missing p2 is reported as 0.
            state.registers[1] =
                Register::Value(Value::Integer(p2.as_ref().copied().unwrap_or(0) as i64));
            state.registers[2] = Register::Value(Value::Integer(0));
            state.registers[3] = Register::Value(Value::from_text(detail.clone()));
            state.result_row = Some(Row {
                values: &state.registers[0] as *const Register,
                count: EXPLAIN_QUERY_PLAN_COLUMNS.len(),
            });
            state.pc += 1;
            return Ok(StepResult::Row);
        }
    }

    /// Runs instructions until one of them yields a row, needs I/O, reports
    /// busy, finishes the program or fails.
    ///
    /// On interrupt, on a failed I/O completion and on any error other than
    /// `LimboError::Busy`, the program is aborted via [`Program::abort`]
    /// before returning.
    #[instrument(skip_all, level = Level::DEBUG)]
    fn normal_step(
        &self,
        state: &mut ProgramState,
        mv_store: Option<&Arc<MvStore>>,
        pager: Arc<Pager>,
        waker: Option<&Waker>,
    ) -> Result<StepResult> {
        let enable_tracing = tracing::enabled!(tracing::Level::TRACE);
        loop {
            if self.connection.is_closed() {
                // Connection is closed for whatever reason, rollback the transaction.
                let tx_state = self.connection.get_tx_state();
                if let TransactionState::Write { .. } = tx_state {
                    pager.rollback_tx(&self.connection);
                }
                return Err(LimboError::InternalError("Connection closed".to_string()));
            }
            if state.is_interrupted() {
                self.abort(mv_store, &pager, None, state);
                return Ok(StepResult::Interrupt);
            }
            if let Some(io) = &state.io_completions {
                if !io.finished() {
                    // Still pending: re-register the waker and yield again.
                    io.set_waker(waker);
                    return Ok(StepResult::IO);
                }
                if let Some(err) = io.get_error() {
                    let err = err.into();
                    self.abort(mv_store, &pager, Some(&err), state);
                    return Err(err);
                }
                state.io_completions = None;
            }
            // invalidate row
            let _ = state.result_row.take();
            let (insn, _) = &self.insns[state.pc as usize];
            let insn_function = insn.to_function();
            if enable_tracing {
                trace_insn(self, state.pc as InsnReference, insn);
            }
            // Always increment VM steps for every loop iteration
            state.metrics.vm_steps = state.metrics.vm_steps.saturating_add(1);

            match insn_function(self, state, insn, &pager, mv_store) {
                Ok(InsnFunctionStepResult::Step) => {
                    // Instruction completed, moving to next
                    state.metrics.insn_executed = state.metrics.insn_executed.saturating_add(1);
                }
                Ok(InsnFunctionStepResult::Done) => {
                    // Instruction completed execution
                    state.metrics.insn_executed = state.metrics.insn_executed.saturating_add(1);
                    state.auto_txn_cleanup = TxnCleanup::None;
                    return Ok(StepResult::Done);
                }
                Ok(InsnFunctionStepResult::IO(io)) => {
                    // Instruction not complete - waiting for I/O, will resume at same PC
                    io.set_waker(waker);
                    state.io_completions = Some(io);
                    return Ok(StepResult::IO);
                }
                Ok(InsnFunctionStepResult::Row) => {
                    // Instruction completed (ResultRow already incremented PC)
                    state.metrics.insn_executed = state.metrics.insn_executed.saturating_add(1);
                    return Ok(StepResult::Row);
                }
                Err(LimboError::Busy) => {
                    // Instruction blocked - will retry at same PC
                    return Ok(StepResult::Busy);
                }
                Err(err) => {
                    self.abort(mv_store, &pager, Some(&err), state);
                    return Err(err);
                }
            }
        }
    }

    #[instrument(skip_all, level = Level::DEBUG)]
    fn apply_view_deltas(
        &self,
        state: &mut ProgramState,
        rollback: bool,
        pager: &Arc<Pager>,
    ) ->
Result> { use crate::types::IOResult; loop { match &state.view_delta_state { ViewDeltaCommitState::NotStarted => { if self.connection.view_transaction_states.is_empty() { return Ok(IOResult::Done(())); } if rollback { // On rollback, just clear and done self.connection.view_transaction_states.clear(); return Ok(IOResult::Done(())); } // Not a rollback - proceed with processing let schema = self.connection.schema.read(); // Collect materialized views - they should all have storage let mut views = Vec::new(); for view_name in self.connection.view_transaction_states.get_view_names() { if let Some(view_mutex) = schema.get_materialized_view(&view_name) { let view = view_mutex.lock(); let root_page = view.get_root_page(); // Materialized views should always have storage (root_page != 0) assert!( root_page != 0, "Materialized view '{view_name}' should have a root page" ); views.push(view_name); } } state.view_delta_state = ViewDeltaCommitState::Processing { views, current_index: 0, }; } ViewDeltaCommitState::Processing { views, current_index, } => { // At this point we know it's not a rollback if *current_index >= views.len() { // All done, clear the transaction states self.connection.view_transaction_states.clear(); state.view_delta_state = ViewDeltaCommitState::Done; return Ok(IOResult::Done(())); } let view_name = &views[*current_index]; let table_deltas = self .connection .view_transaction_states .get(view_name) .unwrap() .get_table_deltas(); let schema = self.connection.schema.read(); if let Some(view_mutex) = schema.get_materialized_view(view_name) { let mut view = view_mutex.lock(); // Create a DeltaSet from the per-table deltas let mut delta_set = crate::incremental::compiler::DeltaSet::new(); for (table_name, delta) in table_deltas { delta_set.insert(table_name, delta); } // Handle I/O from merge_delta - pass pager, circuit will create its own cursor match view.merge_delta(delta_set, pager.clone())? 
{
                            IOResult::Done(_) => {
                                // Move to next view
                                state.view_delta_state = ViewDeltaCommitState::Processing {
                                    views: views.clone(),
                                    current_index: current_index + 1,
                                };
                            }
                            IOResult::IO(io) => {
                                // Return I/O, will resume at same index
                                return Ok(IOResult::IO(io));
                            }
                        }
                    }
                }
                ViewDeltaCommitState::Done => {
                    return Ok(IOResult::Done(()));
                }
            }
        }
    }

    /// Ends the transaction work for this program, resuming across I/O yields.
    ///
    /// Steps, in order:
    /// 1. Apply pending view deltas via `apply_view_deltas`; if that yields I/O, return it.
    /// 2. Reset `program_state.view_delta_state` to `NotStarted`.
    /// 3. Return immediately when the connection has no open transaction or when this is
    ///    a nested statement (the parent is left to commit).
    /// 4. With an MVCC store: in auto-commit mode, create the commit state machine on the
    ///    first call (`CommitState::Ready`), then step it until it reports `Done`.
    /// 5. Without an MVCC store: continue an in-flight pager commit
    ///    (`CommitState::Committing`), or in auto-commit mode end the write/read
    ///    transaction; otherwise only publish the change counter if enabled.
    ///
    /// Returns `IOResult::IO` whenever a step has to wait for I/O. The resume point is
    /// kept in `program_state` (presumably the caller invokes this again once the I/O
    /// has completed).
    ///
    /// # Panics
    /// Panics if the MVCC branch finds `commit_state` in a non-MVCC state, if
    /// `mv_store.commit_tx` fails, or if the connection is in
    /// `TransactionState::PendingUpgrade` during auto-commit.
    pub fn commit_txn(
        &self,
        pager: Arc<Pager>,
        program_state: &mut ProgramState,
        mv_store: Option<&Arc<MvStore>>,
        rollback: bool,
    ) -> Result<IOResult<()>> {
        // Apply view deltas with I/O handling
        match self.apply_view_deltas(program_state, rollback, &pager)? {
            IOResult::IO(io) => return Ok(IOResult::IO(io)),
            IOResult::Done(_) => {}
        }

        // Reset state for next use
        program_state.view_delta_state = ViewDeltaCommitState::NotStarted;

        if self.connection.get_tx_state() == TransactionState::None {
            // No need to do any work here if not in tx. Current MVCC logic doesn't work with this assumption,
            // hence the mv_store.is_none() check.
            // NOTE(review): no `mv_store.is_none()` check is present in this function any
            // more; the sentence above may be stale.
            return Ok(IOResult::Done(()));
        }
        if self.connection.is_nested_stmt() {
            // We don't want to commit on nested statements. Let parent handle it.
            return Ok(IOResult::Done(()));
        }
        if let Some(mv_store) = mv_store {
            // NOTE(review): `rollback` is not consulted anywhere in this MVCC branch.
            let conn = self.connection.clone();
            let auto_commit = conn.auto_commit.load(Ordering::SeqCst);
            if auto_commit {
                // FIXME: we don't want to commit stuff from other programs.
                if matches!(program_state.commit_state, CommitState::Ready) {
                    let Some(tx_id) = conn.get_mv_tx_id() else {
                        return Ok(IOResult::Done(()));
                    };
                    // NOTE(review): a failing `commit_tx` panics here instead of being
                    // propagated with `?` — confirm this is intended.
                    let state_machine = mv_store.commit_tx(tx_id, &conn).unwrap();
                    program_state.commit_state = CommitState::CommitingMvcc { state_machine };
                }
                let CommitState::CommitingMvcc { state_machine } = &mut program_state.commit_state
                else {
                    panic!("invalid state for mvcc commit step")
                };
                match self.step_end_mvcc_txn(state_machine, mv_store)? {
                    IOResult::Done(_) => {
                        assert!(state_machine.is_finalized());
                        *conn.mv_tx.write() = None;
                        conn.set_tx_state(TransactionState::None);
                        program_state.commit_state = CommitState::Ready;
                        return Ok(IOResult::Done(()));
                    }
                    IOResult::IO(io) => {
                        return Ok(IOResult::IO(io));
                    }
                }
            }
            Ok(IOResult::Done(()))
        } else {
            let connection = self.connection.clone();
            let auto_commit = connection.auto_commit.load(Ordering::SeqCst);
            tracing::debug!(
                "Halt auto_commit {}, state={:?}",
                auto_commit,
                program_state.commit_state
            );
            if matches!(program_state.commit_state, CommitState::Committing) {
                // A previous call yielded on I/O in the middle of a pager commit: resume it.
                let TransactionState::Write { .. } = connection.get_tx_state() else {
                    unreachable!("invalid state for write commit step")
                };
                self.step_end_write_txn(
                    &pager,
                    &mut program_state.commit_state,
                    &connection,
                    rollback,
                )
            } else if auto_commit {
                let current_state = connection.get_tx_state();
                tracing::trace!("Auto-commit state: {:?}", current_state);
                match current_state {
                    TransactionState::Write { .. } => self.step_end_write_txn(
                        &pager,
                        &mut program_state.commit_state,
                        &connection,
                        rollback,
                    ),
                    TransactionState::Read => {
                        connection.set_tx_state(TransactionState::None);
                        pager.end_read_tx();
                        Ok(IOResult::Done(()))
                    }
                    TransactionState::None => Ok(IOResult::Done(())),
                    TransactionState::PendingUpgrade => {
                        panic!("Unexpected transaction state: {current_state:?} during auto-commit",)
                    }
                }
            } else {
                // Inside an explicit transaction: nothing to commit, only publish the
                // change counter when it is enabled.
                if self.change_cnt_on {
                    self.connection
                        .set_changes(self.n_change.load(Ordering::SeqCst));
                }
                Ok(IOResult::Done(()))
            }
        }
    }

    /// Commits the pager write transaction, or rolls it back when `rollback` is true.
    ///
    /// When the pager finishes, the change counter is published (if enabled), the
    /// connection's transaction state becomes `None` and `commit_state` returns to
    /// `Ready`. When the pager yields I/O, `commit_state` is set to `Committing` and the
    /// pending I/O is returned.
    #[instrument(skip(self, pager, connection), level = Level::DEBUG)]
    fn step_end_write_txn(
        &self,
        pager: &Arc<Pager>,
        commit_state: &mut CommitState,
        connection: &Connection,
        rollback: bool,
    ) -> Result<IOResult<()>> {
        let cacheflush_status = if !rollback {
            pager.commit_tx(connection)?
} else {
            pager.rollback_tx(connection);
            IOResult::Done(PagerCommitResult::Rollback)
        };
        match cacheflush_status {
            IOResult::Done(_) => {
                if self.change_cnt_on {
                    self.connection
                        .set_changes(self.n_change.load(Ordering::SeqCst));
                }
                connection.set_tx_state(TransactionState::None);
                *commit_state = CommitState::Ready;
            }
            IOResult::IO(io) => {
                tracing::trace!("Cacheflush IO");
                *commit_state = CommitState::Committing;
                return Ok(IOResult::IO(io));
            }
        }
        Ok(IOResult::Done(()))
    }

    /// Advances the MVCC commit state machine by one step against `mv_store`.
    ///
    /// Thin wrapper around `StateMachine::step`; its result is returned unchanged.
    #[instrument(skip(self, commit_state, mv_store), level = Level::DEBUG)]
    fn step_end_mvcc_txn(
        &self,
        commit_state: &mut StateMachine<CommitStateMachine<LocalClock>>,
        mv_store: &Arc<MvStore>,
    ) -> Result<IOResult<()>> {
        commit_state.step(mv_store)
    }

    /// Aborts the program due to various conditions (explicit error, interrupt or reset of unfinished statement) by rolling back the transaction
    /// This method is no-op if program was already finished (either aborted or executed to completion)
    ///
    /// * `mv_store` - MVCC store, if any; decides whether the rollback goes through the
    ///   store or straight through the pager.
    /// * `pager` - pager used for the statement and transaction rollback.
    /// * `err` - the error that triggered the abort, or `None`.
    /// * `state` - program state; its `auto_txn_cleanup` is always reset to
    ///   `TxnCleanup::None` before returning.
    pub fn abort(
        &self,
        mv_store: Option<&Arc<MvStore>>,
        pager: &Arc<Pager>,
        err: Option<&LimboError>,
        state: &mut ProgramState,
    ) {
        if self.is_trigger_subprogram() {
            self.connection.end_trigger_execution();
        }
        // Errors from nested statements are handled by the parent statement.
        if !self.connection.is_nested_stmt() && !self.is_trigger_subprogram() {
            if err.is_some() {
                // Any error apart from deferred FK violations causes the statement subtransaction to roll back.
                let res =
                    state.end_statement(&self.connection, pager, EndStatement::RollbackSavepoint);
                if let Err(e) = res {
                    // Best effort: a failed savepoint rollback is logged, not propagated.
                    tracing::error!("Error rolling back statement: {}", e);
                }
            }
            match err {
                // Transaction errors, e.g. trying to start a nested transaction, do not cause a rollback.
                Some(LimboError::TxError(_)) => {}
                // Table locked errors, e.g. trying to checkpoint in an interactive transaction, do not cause a rollback.
                Some(LimboError::TableLocked) => {}
                // Busy errors do not cause a rollback.
                Some(LimboError::Busy) => {}
                // Constraint errors do not cause a rollback of the transaction by default;
                // Instead individual statement subtransactions will roll back and these are handled in op_auto_commit
                // and op_halt.
                Some(LimboError::Constraint(_)) => {}
                _ => {
                    // Any other error, or no error with automatic transaction cleanup
                    // pending: roll the whole transaction back and return to auto-commit.
                    if state.auto_txn_cleanup != TxnCleanup::None || err.is_some() {
                        if let Some(mv_store) = mv_store {
                            if let Some(tx_id) = self.connection.get_mv_tx_id() {
                                self.connection.auto_commit.store(true, Ordering::SeqCst);
                                mv_store.rollback_tx(tx_id, pager.clone(), &self.connection);
                            }
                        } else {
                            pager.rollback_tx(&self.connection);
                            self.connection.auto_commit.store(true, Ordering::SeqCst);
                        }
                        self.connection.set_tx_state(TransactionState::None);
                    }
                }
            }
        }
        state.auto_txn_cleanup = TxnCleanup::None;
    }

    /// Returns `true` when this program has an associated trigger (`self.trigger` is set).
    pub fn is_trigger_subprogram(&self) -> bool {
        self.trigger.is_some()
    }
}

/// Builds an [`ImmutableRecord`] from the `count` registers starting at `start_reg`.
///
/// # Panics
/// Panics if `start_reg..start_reg + count` is out of bounds for `registers`.
fn make_record(registers: &[Register], start_reg: &usize, count: &usize) -> ImmutableRecord {
    let regs = &registers[*start_reg..*start_reg + *count];
    ImmutableRecord::from_registers(regs, regs.len())
}

/// Maps each register to a borrowed view of its value (`get_value().as_ref()`).
pub fn registers_to_ref_values<'a>(
    registers: &'a [Register],
) -> impl ExactSizeIterator<Item = ValueRef<'a>> {
    registers.iter().map(|reg| reg.get_value().as_ref())
}

/// Emits a trace-level log line with the rendered instruction at `addr`, including the
/// program comment attached to that offset, if there is one.
#[instrument(skip(program), level = Level::DEBUG)]
fn trace_insn(program: &Program, addr: InsnReference, insn: &Insn) {
    tracing::trace!(
        "\n{}",
        explain::insn_to_str(
            program,
            addr,
            insn,
            String::new(),
            program
                .comments
                .iter()
                .find(|(offset, _)| *offset == addr)
                .map(|(_, comment)| comment)
                .copied()
        )
    );
}

/// Conversion from a borrowed [`Value`] into a Rust type, used by `Row::get`.
pub trait FromValueRow<'a> {
    /// Converts `value`, returning an error when its variant does not match `Self`.
    fn from_value(value: &'a Value) -> Result<Self>
    where
        Self: Sized + 'a;
}

impl<'a> FromValueRow<'a> for i64 {
    /// Accepts only `Value::Integer`.
    fn from_value(value: &'a Value) -> Result<Self> {
        match value {
            Value::Integer(i) => Ok(*i),
            _ => Err(LimboError::ConversionError("Expected integer value".into())),
        }
    }
}

impl<'a> FromValueRow<'a> for f64 {
    /// Accepts only `Value::Float`.
    fn from_value(value: &'a Value) -> Result<Self> {
        match value {
            Value::Float(f) => Ok(*f),
            _ => Err(LimboError::ConversionError("Expected float value".into())),
        }
    }
}

impl<'a>
FromValueRow<'a> for String { fn from_value(value: &'a Value) -> Result { match value { Value::Text(s) => Ok(s.as_str().to_string()), _ => Err(LimboError::ConversionError("Expected text value".into())), } } } impl<'a> FromValueRow<'a> for &'a str { fn from_value(value: &'a Value) -> Result { match value { Value::Text(s) => Ok(s.as_str()), _ => Err(LimboError::ConversionError("Expected text value".into())), } } } impl<'a> FromValueRow<'a> for &'a Value { fn from_value(value: &'a Value) -> Result { Ok(value) } } impl Row { pub fn get<'a, T: FromValueRow<'a> + 'a>(&'a self, idx: usize) -> Result { let value = unsafe { self.values.add(idx).as_ref().unwrap() }; let value = match value { Register::Value(value) => value, _ => unreachable!("a row should be formed of values only"), }; T::from_value(value) } pub fn get_value(&self, idx: usize) -> &Value { let value = unsafe { self.values.add(idx).as_ref().unwrap() }; match value { Register::Value(value) => value, _ => unreachable!("a row should be formed of values only"), } } pub fn get_values(&self) -> impl Iterator { let values = unsafe { std::slice::from_raw_parts(self.values, self.count) }; // This should be ownedvalues // TODO: add check for this values.iter().map(|v| v.get_value()) } pub fn len(&self) -> usize { self.count } }