//! The virtual database engine (VDBE).
//!
//! The VDBE is a register-based virtual machine that executes bytecode
//! instructions that represent SQL statements. When an application prepares
//! an SQL statement, the statement is compiled into a sequence of bytecode
//! instructions that perform the needed operations, such as reading or
//! writing to a b-tree, sorting, or aggregating data.
//!
//! The instruction set of the VDBE is similar to SQLite's instruction set,
//! except that bytecodes that perform I/O operations return execution back
//! to the caller instead of blocking. This is because Turso is designed for
//! applications that need high concurrency, such as
//! serverless runtimes. In addition, asynchronous I/O makes storage
//! disaggregation easier.
//!
//! You can find a full list of SQLite opcodes at:
//!
//! https://www.sqlite.org/opcode.html
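//!
//! A rough sketch of how a caller drives the VDBE (illustrative only; the
//! surrounding setup and the way pending I/O is driven are assumptions, not
//! the exact embedding API):
//!
//! ```ignore
//! loop {
//!     match program.step(&mut state, None, pager.clone(), QueryMode::Normal, None)? {
//!         StepResult::Row => { /* consume state.result_row */ }
//!         StepResult::IO => { /* drive pending I/O, then step again */ }
//!         StepResult::Busy | StepResult::Interrupt => break,
//!         StepResult::Done => break,
//!     }
//! }
//! ```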
pub mod affinity;
pub mod builder;
pub mod execute;
pub mod explain;
pub mod insn;
pub mod likeop;
pub mod metrics;
pub mod rowset;
pub mod sorter;
pub mod value;
use crate::{
error::LimboError,
function::{AggFunc, FuncCtx},
mvcc::{database::CommitStateMachine, LocalClock},
return_if_io,
schema::Trigger,
state_machine::StateMachine,
storage::{pager::PagerCommitResult, sqlite3_ondisk::SmallVec},
translate::{collate::CollationSeq, plan::TableReferences},
types::{IOCompletions, IOResult},
vdbe::{
execute::{
OpCheckpointState, OpColumnState, OpDeleteState, OpDeleteSubState, OpDestroyState,
OpIdxInsertState, OpInsertState, OpInsertSubState, OpNewRowidState, OpNoConflictState,
OpProgramState, OpRowIdState, OpSeekState, OpTransactionState,
},
metrics::StatementMetrics,
},
ValueRef,
};
use crate::{
storage::pager::Pager,
translate::plan::ResultSetColumn,
types::{AggContext, Cursor, ImmutableRecord, Value},
vdbe::{builder::CursorType, insn::Insn},
};
#[cfg(feature = "json")]
use crate::json::JsonCacheCell;
use crate::{Connection, MvStore, Result, TransactionState};
use builder::{CursorKey, QueryMode};
use execute::{
InsnFunction, InsnFunctionStepResult, OpIdxDeleteState, OpIntegrityCheckState,
OpOpenEphemeralState,
};
use parking_lot::RwLock;
use turso_parser::ast::ResolveType;
use crate::vdbe::rowset::RowSet;
use explain::{insn_to_row_with_comment, EXPLAIN_COLUMNS, EXPLAIN_QUERY_PLAN_COLUMNS};
use regex::Regex;
use std::{
collections::HashMap,
num::NonZero,
sync::{
atomic::{AtomicI64, AtomicIsize, Ordering},
Arc,
},
task::Waker,
};
use tracing::{instrument, Level};
/// State machine for committing view deltas with I/O handling
#[derive(Debug, Clone)]
pub enum ViewDeltaCommitState {
NotStarted,
Processing {
views: Vec<String>, // view names (all materialized views have storage)
current_index: usize,
},
Done,
}
/// We use labels to indicate that we want to jump to whatever the instruction offset
/// will be at runtime, because the offset cannot always be determined when the jump
/// instruction is created.
///
/// In some cases, we want to jump to EXACTLY a specific instruction.
/// - Example: a condition is not met, so we want to jump to wherever Halt is.
///
/// In other cases, we don't care what the exact instruction is, but we know that we
/// want to jump to whatever comes AFTER a certain instruction.
/// - Example: a Next instruction will want to jump to "whatever the start of the loop is",
/// but it doesn't care what instruction that is.
///
/// The reason this distinction is important is that we might reorder instructions that are
/// constant at compile time, and when we do that, we need to change the offsets of any impacted
/// jump instructions, since the instruction that comes immediately after a given Insn might have changed during the reordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum JumpTarget {
ExactlyThisInsn,
AfterThisInsn,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
/// Represents a target for a jump instruction.
/// Stores 32-bit ints to keep the enum word-sized.
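/// A small illustrative example of the lifecycle (the numbers are arbitrary):
///
/// ```ignore
/// // A forward jump is first recorded as a label...
/// let target = BranchOffset::Label(3);
/// assert!(target.is_label());
/// // ...and later resolved to a concrete instruction index.
/// let resolved = BranchOffset::Offset(42);
/// assert_eq!(resolved.as_offset_int(), 42);
/// ```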
pub enum BranchOffset {
/// A label is a named location in the program.
/// If there are references to it, it must always be resolved to an Offset
/// via program.resolve_label().
Label(u32),
/// An offset is a direct index into the instruction list.
Offset(InsnReference),
/// A placeholder is a temporary value to satisfy the compiler.
/// It must be set later.
Placeholder,
}
impl BranchOffset {
/// Returns true if the branch offset is a label.
pub fn is_label(&self) -> bool {
matches!(self, BranchOffset::Label(_))
}
/// Returns true if the branch offset is an offset.
pub fn is_offset(&self) -> bool {
matches!(self, BranchOffset::Offset(_))
}
/// Returns the offset value. Panics if the branch offset is a label or placeholder.
pub fn as_offset_int(&self) -> InsnReference {
match self {
BranchOffset::Label(v) => unreachable!("Unresolved label: {}", v),
BranchOffset::Offset(v) => *v,
BranchOffset::Placeholder => unreachable!("Unresolved placeholder"),
}
}
/// Returns the branch offset as a signed integer.
/// Used in explain output, where we don't want to panic in case we have an unresolved
/// label or placeholder.
pub fn as_debug_int(&self) -> i32 {
match self {
BranchOffset::Label(v) => *v as i32,
BranchOffset::Offset(v) => *v as i32,
BranchOffset::Placeholder => i32::MAX,
}
}
/// Adds an integer value to the branch offset.
/// Returns a new branch offset.
/// Panics if the branch offset is a label or placeholder.
pub fn add<N: Into<u32>>(self, n: N) -> BranchOffset {
BranchOffset::Offset(self.as_offset_int() + n.into())
}
pub fn sub<N: Into<u32>>(self, n: N) -> BranchOffset {
BranchOffset::Offset(self.as_offset_int() - n.into())
}
}
pub type CursorID = usize;
pub type PageIdx = i64;
// Index of insn in list of insns
type InsnReference = u32;
#[derive(Debug)]
pub enum StepResult {
Done,
IO,
Row,
Interrupt,
Busy,
}
struct RegexCache {
like: HashMap<String, Regex>,
glob: HashMap<String, Regex>,
}
impl RegexCache {
fn new() -> Self {
Self {
like: HashMap::new(),
glob: HashMap::new(),
}
}
}
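/// A fixed-size bitset backed by `N` 64-bit words (`N * 64` addressable bits).
/// Used below to track which coroutines have ended, keyed by their yield register.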
struct Bitfield<const N: usize>([u64; N]);
impl<const N: usize> Bitfield<N> {
fn new() -> Self {
Self([0; N])
}
fn set(&mut self, bit: usize) {
assert!(bit < N * 64, "bit out of bounds");
self.0[bit / 64] |= 1 << (bit % 64);
}
fn unset(&mut self, bit: usize) {
assert!(bit < N * 64, "bit out of bounds");
self.0[bit / 64] &= !(1 << (bit % 64));
}
fn get(&self, bit: usize) -> bool {
assert!(bit < N * 64, "bit out of bounds");
(self.0[bit / 64] & (1 << (bit % 64))) != 0
}
}
#[derive(Debug)]
#[allow(clippy::large_enum_variant)]
/// The commit state of the program.
/// There are two states:
/// - Ready: The program is ready to run the next instruction, or has shut down after
/// the last instruction.
/// - Committing: The program is committing a write transaction. It is waiting for the pager to finish flushing the cache to disk,
/// primarily to the WAL, but also possibly checkpointing the WAL to the database file.
enum CommitState {
Ready,
Committing,
CommitingMvcc {
state_machine: StateMachine<CommitStateMachine<LocalClock>>,
},
}
#[derive(Debug, Clone)]
pub enum Register {
Value(Value),
Aggregate(AggContext),
Record(ImmutableRecord),
}
impl Register {
#[inline]
pub fn is_null(&self) -> bool {
matches!(self, Register::Value(Value::Null))
}
}
/// A row is the list of registers that hold the values for a filtered row. The row stores a raw pointer,
/// so it is invalidated after the next step to make sure it doesn't point somewhere unexpected.
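/// A hedged usage sketch (assuming a `Row` produced by a statement step; the
/// values must be read before stepping the program again):
///
/// ```ignore
/// let id: i64 = row.get::<i64>(0)?;
/// let name: &str = row.get::<&str>(1)?;
/// ```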
#[derive(Debug)]
pub struct Row {
values: *const Register,
count: usize,
}
// SAFETY: This needs to be audited for thread safety.
// See: https://github.com/tursodatabase/turso/issues/1552
unsafe impl Send for Row {}
unsafe impl Sync for Row {}
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TxnCleanup {
None,
RollbackTxn,
}
/// The program state describes the environment in which the program executes.
pub struct ProgramState {
pub io_completions: Option<IOCompletions>,
pub pc: InsnReference,
pub(crate) cursors: Vec<Option<Cursor>>,
cursor_seqs: Vec<i64>,
registers: Vec<Register>,
pub(crate) result_row: Option<Row>,
last_compare: Option<std::cmp::Ordering>,
deferred_seeks: Vec<Option<(CursorID, CursorID)>>,
ended_coroutine: Bitfield<4>, // flag to indicate that a coroutine has ended (key is the yield register. currently we assume that the yield register is always between 0-255, YOLO)
/// Indicates whether an [Insn::Once] instruction at a given program counter position has already been executed, well, once.
once: SmallVec<u32, 4>,
regex_cache: RegexCache,
interrupted: bool,
pub parameters: HashMap<NonZero<usize>, Value>,
commit_state: CommitState,
#[cfg(feature = "json")]
json_cache: JsonCacheCell,
op_delete_state: OpDeleteState,
op_destroy_state: OpDestroyState,
op_idx_delete_state: Option<OpIdxDeleteState>,
op_integrity_check_state: OpIntegrityCheckState,
/// Metrics collected during statement execution
pub metrics: StatementMetrics,
op_open_ephemeral_state: OpOpenEphemeralState,
op_program_state: OpProgramState,
op_new_rowid_state: OpNewRowidState,
op_idx_insert_state: OpIdxInsertState,
op_insert_state: OpInsertState,
op_no_conflict_state: OpNoConflictState,
seek_state: OpSeekState,
/// Current collation sequence set by OP_CollSeq instruction
current_collation: Option<CollationSeq>,
op_column_state: OpColumnState,
op_row_id_state: OpRowIdState,
op_transaction_state: OpTransactionState,
op_checkpoint_state: OpCheckpointState,
/// State machine for committing view deltas with I/O handling
view_delta_state: ViewDeltaCommitState,
/// Marker indicating whether automatic transaction cleanup is necessary for this connection in case of a reset.
/// This is used when a statement in auto-commit mode is reset after a previous incomplete execution, in which case we may need to roll back the transaction started on the previous attempt.
/// Note that MVCC transactions are always explicit, so they do not update the auto_txn_cleanup marker.
pub(crate) auto_txn_cleanup: TxnCleanup,
/// Number of deferred foreign key violations when the statement started.
/// When a statement subtransaction rolls back, the connection's deferred foreign key violations counter
/// is reset to this value.
fk_deferred_violations_when_stmt_started: AtomicIsize,
/// Number of immediate foreign key violations that occurred during the active statement. If nonzero,
/// the statement subtransaction will roll back.
fk_immediate_violations_during_stmt: AtomicIsize,
/// RowSet objects stored by register index
rowsets: HashMap<usize, RowSet>,
}
impl std::fmt::Debug for Program {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Program").finish()
}
}
// SAFETY: This needs to be audited for thread safety.
// See: https://github.com/tursodatabase/turso/issues/1552
unsafe impl Send for ProgramState {}
unsafe impl Sync for ProgramState {}
impl ProgramState {
pub fn new(max_registers: usize, max_cursors: usize) -> Self {
let cursors: Vec<Option<Cursor>> = (0..max_cursors).map(|_| None).collect();
let cursor_seqs = vec![0i64; max_cursors];
let registers = vec![Register::Value(Value::Null); max_registers];
Self {
io_completions: None,
pc: 0,
cursors,
cursor_seqs,
registers,
result_row: None,
last_compare: None,
deferred_seeks: vec![None; max_cursors],
ended_coroutine: Bitfield::new(),
once: SmallVec::<u32, 4>::new(),
regex_cache: RegexCache::new(),
interrupted: false,
parameters: HashMap::new(),
commit_state: CommitState::Ready,
#[cfg(feature = "json")]
json_cache: JsonCacheCell::new(),
op_delete_state: OpDeleteState {
sub_state: OpDeleteSubState::MaybeCaptureRecord,
deleted_record: None,
},
op_destroy_state: OpDestroyState::CreateCursor,
op_idx_delete_state: None,
op_integrity_check_state: OpIntegrityCheckState::Start,
metrics: StatementMetrics::new(),
op_open_ephemeral_state: OpOpenEphemeralState::Start,
op_program_state: OpProgramState::Start,
op_new_rowid_state: OpNewRowidState::Start,
op_idx_insert_state: OpIdxInsertState::MaybeSeek,
op_insert_state: OpInsertState {
sub_state: OpInsertSubState::MaybeCaptureRecord,
old_record: None,
},
op_no_conflict_state: OpNoConflictState::Start,
seek_state: OpSeekState::Start,
current_collation: None,
op_column_state: OpColumnState::Start,
op_row_id_state: OpRowIdState::Start,
op_transaction_state: OpTransactionState::Start,
op_checkpoint_state: OpCheckpointState::StartCheckpoint,
view_delta_state: ViewDeltaCommitState::NotStarted,
auto_txn_cleanup: TxnCleanup::None,
fk_deferred_violations_when_stmt_started: AtomicIsize::new(0),
fk_immediate_violations_during_stmt: AtomicIsize::new(0),
rowsets: HashMap::new(),
}
}
pub fn set_register(&mut self, idx: usize, value: Register) {
self.registers[idx] = value;
}
pub fn get_register(&self, idx: usize) -> &Register {
&self.registers[idx]
}
pub fn column_count(&self) -> usize {
self.registers.len()
}
pub fn column(&self, i: usize) -> Option<String> {
Some(format!("{:?}", self.registers[i]))
}
pub fn interrupt(&mut self) {
self.interrupted = true;
}
pub fn is_interrupted(&self) -> bool {
self.interrupted
}
pub fn bind_at(&mut self, index: NonZero<usize>, value: Value) {
self.parameters.insert(index, value);
}
pub fn clear_bindings(&mut self) {
self.parameters.clear();
}
pub fn get_parameter(&self, index: NonZero<usize>) -> Value {
self.parameters.get(&index).cloned().unwrap_or(Value::Null)
}
pub fn reset(&mut self, max_registers: Option<usize>, max_cursors: Option<usize>) {
self.pc = 0;
if let Some(max_cursors) = max_cursors {
self.cursors.resize_with(max_cursors, || None);
self.cursor_seqs.resize(max_cursors, 0);
}
if let Some(max_registers) = max_registers {
self.registers
.resize_with(max_registers, || Register::Value(Value::Null));
}
// Reset cursors, as they can hold cached information that will no longer be relevant on the next program execution
self.cursors.iter_mut().for_each(|c| {
let _ = c.take();
});
self.registers
.iter_mut()
.for_each(|r| *r = Register::Value(Value::Null));
self.last_compare = None;
self.deferred_seeks.iter_mut().for_each(|s| *s = None);
self.ended_coroutine.0 = [0; 4];
self.regex_cache.like.clear();
self.interrupted = false;
self.current_collation = None;
#[cfg(feature = "json")]
self.json_cache.clear();
// Reset state machines
self.op_delete_state = OpDeleteState {
sub_state: OpDeleteSubState::MaybeCaptureRecord,
deleted_record: None,
};
self.op_idx_delete_state = None;
self.op_integrity_check_state = OpIntegrityCheckState::Start;
self.metrics = StatementMetrics::new();
self.op_open_ephemeral_state = OpOpenEphemeralState::Start;
self.op_new_rowid_state = OpNewRowidState::Start;
self.op_idx_insert_state = OpIdxInsertState::MaybeSeek;
self.op_insert_state = OpInsertState {
sub_state: OpInsertSubState::MaybeCaptureRecord,
old_record: None,
};
self.op_no_conflict_state = OpNoConflictState::Start;
self.seek_state = OpSeekState::Start;
self.current_collation = None;
self.op_column_state = OpColumnState::Start;
self.op_row_id_state = OpRowIdState::Start;
self.view_delta_state = ViewDeltaCommitState::NotStarted;
self.auto_txn_cleanup = TxnCleanup::None;
self.fk_immediate_violations_during_stmt
.store(0, Ordering::SeqCst);
self.fk_deferred_violations_when_stmt_started
.store(0, Ordering::SeqCst);
self.rowsets.clear();
}
pub fn get_cursor(&mut self, cursor_id: CursorID) -> &mut Cursor {
self.cursors
.get_mut(cursor_id)
.unwrap_or_else(|| panic!("cursor id {cursor_id} out of bounds"))
.as_mut()
.unwrap_or_else(|| panic!("cursor id {cursor_id} is None"))
}
/// Begin a statement subtransaction.
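///
/// A hedged sketch of the intended pairing (error handling elided; whether the
/// savepoint is released or rolled back depends on how the statement finishes):
///
/// ```ignore
/// return_if_io!(state.begin_statement(&connection, &pager, /* write */ true));
/// // ... execute the statement's instructions ...
/// state.end_statement(&connection, &pager, EndStatement::ReleaseSavepoint)?;
/// ```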
pub fn begin_statement(
&mut self,
connection: &Connection,
pager: &Arc<Pager>,
write: bool,
) -> Result<IOResult<()>> {
// Store the deferred foreign key violations counter at the start of the statement.
// This is used to ensure that if an interactive transaction had deferred FK violations and a statement subtransaction rolls back,
// the deferred FK violations are not lost.
self.fk_deferred_violations_when_stmt_started.store(
connection.fk_deferred_violations.load(Ordering::Acquire),
Ordering::SeqCst,
);
// Reset the immediate foreign key violations counter to 0. If this is nonzero when the statement completes, the statement subtransaction will roll back.
self.fk_immediate_violations_during_stmt
.store(0, Ordering::SeqCst);
if write {
let db_size = return_if_io!(pager.with_header(|header| header.database_size.get()));
pager.begin_statement(db_size)?;
}
Ok(IOResult::Done(()))
}
/// End a statement subtransaction.
pub fn end_statement(
&mut self,
connection: &Connection,
pager: &Arc<Pager>,
end_statement: EndStatement,
) -> Result<()> {
match end_statement {
EndStatement::ReleaseSavepoint => pager.release_savepoint(),
EndStatement::RollbackSavepoint => {
let stmt_was_rolled_back = pager.rollback_to_newest_savepoint()?;
if !stmt_was_rolled_back {
// We sometimes call end_statement() on errors without explicitly knowing whether a stmt transaction
// caused the error or not. If it didn't, don't reset any FK violation counters.
return Ok(());
}
// Reset the deferred foreign key violations counter to the value it had at the start of the statement.
// This is used to ensure that if an interactive transaction had deferred FK violations, they are not lost.
connection.fk_deferred_violations.store(
self.fk_deferred_violations_when_stmt_started
.load(Ordering::Acquire),
Ordering::SeqCst,
);
Ok(())
}
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
/// Action to take at the end of a statement subtransaction.
pub enum EndStatement {
/// Release (commit) the savepoint -- effectively removing the savepoint as it is no longer needed for undo purposes.
ReleaseSavepoint,
/// Rollback (abort) to the newest savepoint: read pages from the subjournal and restore them to the page cache.
/// This is used to undo the changes made by the statement.
RollbackSavepoint,
}
impl Register {
pub fn get_value(&self) -> &Value {
match self {
Register::Value(v) => v,
Register::Record(r) => {
assert!(!r.is_invalidated());
r.as_blob_value()
}
_ => panic!("register holds unexpected value: {self:?}"),
}
}
}
#[macro_export]
macro_rules! must_be_btree_cursor {
($cursor_id:expr, $cursor_ref:expr, $state:expr, $insn_name:expr) => {{
let (_, cursor_type) = $cursor_ref.get($cursor_id).unwrap();
if matches!(
cursor_type,
CursorType::BTreeTable(_)
| CursorType::BTreeIndex(_)
| CursorType::MaterializedView(_, _)
) {
$crate::get_cursor!($state, $cursor_id)
} else {
panic!("{} on unexpected cursor", $insn_name)
}
}};
}
/// This macro is necessary to help the borrow checker see that we are only accessing the state.cursors field
/// and nothing else.
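///
/// Typical use inside an instruction handler (illustrative):
///
/// ```ignore
/// let cursor = get_cursor!(state, cursor_id);
/// ```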
#[macro_export]
macro_rules! get_cursor {
($state:expr, $cursor_id:expr) => {
$state
.cursors
.get_mut($cursor_id)
.unwrap_or_else(|| panic!("cursor id {} out of bounds", $cursor_id))
.as_mut()
.unwrap_or_else(|| panic!("cursor id {} is None", $cursor_id))
};
}
/// Tracks the state of explain mode execution, including which subprograms need to be processed.
#[derive(Default)]
pub struct ExplainState {
/// Program counter positions in the parent program where `Insn::Program` instructions occur.
parent_program_pcs: Vec<usize>,
/// Index of the subprogram currently being processed, if any.
current_subprogram_index: Option<usize>,
/// PC value when we started processing the current subprogram, to detect if we need to reset.
subprogram_start_pc: Option<usize>,
}
pub struct Program {
pub max_registers: usize,
// We store original indices because we don't want to create a new vec from
// the ProgramBuilder.
pub insns: Vec<(Insn, usize)>,
pub cursor_ref: Vec<(Option<CursorKey>, CursorType)>,
pub comments: Vec<(InsnReference, &'static str)>,
pub parameters: crate::parameters::Parameters,
pub connection: Arc<Connection>,
pub n_change: AtomicI64,
pub change_cnt_on: bool,
pub result_columns: Vec<ResultSetColumn>,
pub table_references: TableReferences,
pub sql: String,
/// Whether the program accesses the database.
/// Used to determine whether we need to check for schema changes when
/// starting a transaction.
pub accesses_db: bool,
/// In SQLite, whether statement subtransactions will be used for executing a program (`usesStmtJournal`)
/// is determined by the parser flags "mayAbort" and "isMultiWrite". Essentially this means that the individual
/// statement may need to be aborted due to a constraint conflict, etc. instead of the entire transaction.
pub needs_stmt_subtransactions: bool,
pub trigger: Option<Arc<Trigger>>,
pub resolve_type: ResolveType,
pub explain_state: RwLock<ExplainState>,
}
impl Program {
fn get_pager_from_database_index(&self, idx: &usize) -> Arc<Pager> {
self.connection.get_pager_from_database_index(idx)
}
pub fn step(
&self,
state: &mut ProgramState,
mv_store: Option<&Arc<MvStore>>,
pager: Arc<Pager>,
query_mode: QueryMode,
waker: Option<&Waker>,
) -> Result<StepResult> {
match query_mode {
QueryMode::Normal => self.normal_step(state, mv_store, pager, waker),
QueryMode::Explain => self.explain_step(state, mv_store, pager),
QueryMode::ExplainQueryPlan => self.explain_query_plan_step(state, mv_store, pager),
}
}
fn explain_step(
&self,
state: &mut ProgramState,
_mv_store: Option<&Arc<MvStore>>,
pager: Arc<Pager>,
) -> Result<StepResult> {
debug_assert!(state.column_count() == EXPLAIN_COLUMNS.len());
if self.connection.is_closed() {
// Connection is closed for whatever reason, rollback the transaction.
let state = self.connection.get_tx_state();
if let TransactionState::Write { .. } = state {
pager.rollback_tx(&self.connection);
}
return Err(LimboError::InternalError("Connection closed".to_string()));
}
if state.is_interrupted() {
return Ok(StepResult::Interrupt);
}
// FIXME: do we need this?
state.metrics.vm_steps = state.metrics.vm_steps.saturating_add(1);
let mut explain_state = self.explain_state.write();
// Check if we're processing a subprogram
if let Some(sub_idx) = explain_state.current_subprogram_index {
if sub_idx >= explain_state.parent_program_pcs.len() {
// All subprograms processed
*explain_state = ExplainState::default();
return Ok(StepResult::Done);
}
let parent_pc = explain_state.parent_program_pcs[sub_idx];
let Insn::Program { program: p, .. } = &self.insns[parent_pc].0 else {
panic!("Expected program insn at pc {parent_pc}");
};
let p = &mut p.write().program;
let subprogram_insn_count = p.insns.len();
// Check if the subprogram has already finished (PC is out of bounds)
// This can happen if the subprogram finished in a previous call but we're being called again
if state.pc as usize >= subprogram_insn_count {
// Subprogram is done, move to next one
explain_state.subprogram_start_pc = None;
if sub_idx + 1 < explain_state.parent_program_pcs.len() {
explain_state.current_subprogram_index = Some(sub_idx + 1);
state.pc = 0;
drop(explain_state);
return self.explain_step(state, _mv_store, pager);
} else {
*explain_state = ExplainState::default();
return Ok(StepResult::Done);
}
}
// Reset PC to 0 only when starting a new subprogram (when subprogram_start_pc is None)
// Once we've started, let the subprogram manage its own PC through its explain_step
if explain_state.subprogram_start_pc.is_none() {
state.pc = 0;
explain_state.subprogram_start_pc = Some(0);
}
// Process the subprogram - it will handle its own explain_step internally
// The subprogram's explain_step will process all its instructions (including any nested subprograms)
// and return StepResult::Row for each instruction, then StepResult::Done when finished
let result = p.step(state, None, pager.clone(), QueryMode::Explain, None)?;
match result {
StepResult::Done => {
// This subprogram is done, move to next one
explain_state.subprogram_start_pc = None; // Clear the start PC marker
if sub_idx + 1 < explain_state.parent_program_pcs.len() {
// Move to next subprogram
explain_state.current_subprogram_index = Some(sub_idx + 1);
// Reset PC to 0 for the next subprogram
state.pc = 0;
// Recursively call to process the next subprogram
drop(explain_state);
return self.explain_step(state, _mv_store, pager);
} else {
// All subprograms done
*explain_state = ExplainState::default();
return Ok(StepResult::Done);
}
}
StepResult::Row => {
// Output a row from the subprogram
// The subprogram's step already set up the registers with PC starting at 0
// Don't reset subprogram_start_pc - we're still processing this subprogram
drop(explain_state);
return Ok(StepResult::Row);
}
other => {
drop(explain_state);
return Ok(other);
}
}
}
// We're processing the parent program
if state.pc as usize >= self.insns.len() {
// Parent program is done, start processing subprograms
if explain_state.parent_program_pcs.is_empty() {
// No subprograms to process
*explain_state = ExplainState::default();
return Ok(StepResult::Done);
}
// Start processing the first subprogram
explain_state.current_subprogram_index = Some(0);
explain_state.subprogram_start_pc = None; // Will be set when we actually start processing
state.pc = 0; // Reset PC to 0 for the first subprogram
drop(explain_state);
return self.explain_step(state, _mv_store, pager);
}
let (current_insn, _) = &self.insns[state.pc as usize];
if matches!(current_insn, Insn::Program { .. }) {
explain_state.parent_program_pcs.push(state.pc as usize);
}
let (opcode, p1, p2, p3, p4, p5, comment) = insn_to_row_with_comment(
self,
current_insn,
self.comments
.iter()
.find(|(offset, _)| *offset == state.pc)
.map(|(_, comment)| comment)
.copied(),
);
state.registers[0] = Register::Value(Value::Integer(state.pc as i64));
state.registers[1] = Register::Value(Value::from_text(opcode));
state.registers[2] = Register::Value(Value::Integer(p1 as i64));
state.registers[3] = Register::Value(Value::Integer(p2 as i64));
state.registers[4] = Register::Value(Value::Integer(p3 as i64));
state.registers[5] = Register::Value(p4);
state.registers[6] = Register::Value(Value::Integer(p5 as i64));
state.registers[7] = Register::Value(Value::from_text(comment));
state.result_row = Some(Row {
values: &state.registers[0] as *const Register,
count: EXPLAIN_COLUMNS.len(),
});
state.pc += 1;
Ok(StepResult::Row)
}
fn explain_query_plan_step(
&self,
state: &mut ProgramState,
_mv_store: Option<&Arc<MvStore>>,
pager: Arc<Pager>,
) -> Result<StepResult> {
debug_assert!(state.column_count() == EXPLAIN_QUERY_PLAN_COLUMNS.len());
loop {
if self.connection.is_closed() {
// Connection is closed for whatever reason, rollback the transaction.
let state = self.connection.get_tx_state();
if let TransactionState::Write { .. } = state {
pager.rollback_tx(&self.connection);
}
return Err(LimboError::InternalError("Connection closed".to_string()));
}
if state.is_interrupted() {
return Ok(StepResult::Interrupt);
}
// FIXME: do we need this?
state.metrics.vm_steps = state.metrics.vm_steps.saturating_add(1);
if state.pc as usize >= self.insns.len() {
return Ok(StepResult::Done);
}
let Insn::Explain { p1, p2, detail } = &self.insns[state.pc as usize].0 else {
state.pc += 1;
continue;
};
state.registers[0] = Register::Value(Value::Integer(*p1 as i64));
state.registers[1] =
Register::Value(Value::Integer(p2.as_ref().map(|p| *p).unwrap_or(0) as i64));
state.registers[2] = Register::Value(Value::Integer(0));
state.registers[3] = Register::Value(Value::from_text(detail.clone()));
state.result_row = Some(Row {
values: &state.registers[0] as *const Register,
count: EXPLAIN_QUERY_PLAN_COLUMNS.len(),
});
state.pc += 1;
return Ok(StepResult::Row);
}
}
#[instrument(skip_all, level = Level::DEBUG)]
fn normal_step(
&self,
state: &mut ProgramState,
mv_store: Option<&Arc<MvStore>>,
pager: Arc<Pager>,
waker: Option<&Waker>,
) -> Result<StepResult> {
let enable_tracing = tracing::enabled!(tracing::Level::TRACE);
loop {
if self.connection.is_closed() {
// Connection is closed for whatever reason, rollback the transaction.
let state = self.connection.get_tx_state();
if let TransactionState::Write { .. } = state {
pager.rollback_tx(&self.connection);
}
return Err(LimboError::InternalError("Connection closed".to_string()));
}
if state.is_interrupted() {
self.abort(mv_store, &pager, None, state);
return Ok(StepResult::Interrupt);
}
if let Some(io) = &state.io_completions {
if !io.finished() {
io.set_waker(waker);
return Ok(StepResult::IO);
}
if let Some(err) = io.get_error() {
let err = err.into();
self.abort(mv_store, &pager, Some(&err), state);
return Err(err);
}
state.io_completions = None;
}
// invalidate row
let _ = state.result_row.take();
let (insn, _) = &self.insns[state.pc as usize];
let insn_function = insn.to_function();
if enable_tracing {
trace_insn(self, state.pc as InsnReference, insn);
}
// Always increment VM steps for every loop iteration
state.metrics.vm_steps = state.metrics.vm_steps.saturating_add(1);
match insn_function(self, state, insn, &pager, mv_store) {
Ok(InsnFunctionStepResult::Step) => {
// Instruction completed, moving to next
state.metrics.insn_executed = state.metrics.insn_executed.saturating_add(1);
}
Ok(InsnFunctionStepResult::Done) => {
// Instruction completed execution
state.metrics.insn_executed = state.metrics.insn_executed.saturating_add(1);
state.auto_txn_cleanup = TxnCleanup::None;
return Ok(StepResult::Done);
}
Ok(InsnFunctionStepResult::IO(io)) => {
// Instruction not complete - waiting for I/O, will resume at same PC
io.set_waker(waker);
state.io_completions = Some(io);
return Ok(StepResult::IO);
}
Ok(InsnFunctionStepResult::Row) => {
// Instruction completed (ResultRow already incremented PC)
state.metrics.insn_executed = state.metrics.insn_executed.saturating_add(1);
return Ok(StepResult::Row);
}
Err(LimboError::Busy) => {
// Instruction blocked - will retry at same PC
return Ok(StepResult::Busy);
}
Err(err) => {
self.abort(mv_store, &pager, Some(&err), state);
return Err(err);
}
}
}
}
#[instrument(skip_all, level = Level::DEBUG)]
fn apply_view_deltas(
&self,
state: &mut ProgramState,
rollback: bool,
pager: &Arc<Pager>,
) -> Result<IOResult<()>> {
use crate::types::IOResult;
loop {
match &state.view_delta_state {
ViewDeltaCommitState::NotStarted => {
if self.connection.view_transaction_states.is_empty() {
return Ok(IOResult::Done(()));
}
if rollback {
// On rollback, just clear and done
self.connection.view_transaction_states.clear();
return Ok(IOResult::Done(()));
}
// Not a rollback - proceed with processing
let schema = self.connection.schema.read();
// Collect materialized views - they should all have storage
let mut views = Vec::new();
for view_name in self.connection.view_transaction_states.get_view_names() {
if let Some(view_mutex) = schema.get_materialized_view(&view_name) {
let view = view_mutex.lock();
let root_page = view.get_root_page();
// Materialized views should always have storage (root_page != 0)
assert!(
root_page != 0,
"Materialized view '{view_name}' should have a root page"
);
views.push(view_name);
}
}
state.view_delta_state = ViewDeltaCommitState::Processing {
views,
current_index: 0,
};
}
ViewDeltaCommitState::Processing {
views,
current_index,
} => {
// At this point we know it's not a rollback
if *current_index >= views.len() {
// All done, clear the transaction states
self.connection.view_transaction_states.clear();
state.view_delta_state = ViewDeltaCommitState::Done;
return Ok(IOResult::Done(()));
}
let view_name = &views[*current_index];
let table_deltas = self
.connection
.view_transaction_states
.get(view_name)
.unwrap()
.get_table_deltas();
let schema = self.connection.schema.read();
if let Some(view_mutex) = schema.get_materialized_view(view_name) {
let mut view = view_mutex.lock();
// Create a DeltaSet from the per-table deltas
let mut delta_set = crate::incremental::compiler::DeltaSet::new();
for (table_name, delta) in table_deltas {
delta_set.insert(table_name, delta);
}
// Handle I/O from merge_delta - pass pager, circuit will create its own cursor
match view.merge_delta(delta_set, pager.clone())? {
IOResult::Done(_) => {
// Move to next view
state.view_delta_state = ViewDeltaCommitState::Processing {
views: views.clone(),
current_index: current_index + 1,
};
}
IOResult::IO(io) => {
// Return I/O, will resume at same index
return Ok(IOResult::IO(io));
}
}
}
}
ViewDeltaCommitState::Done => {
return Ok(IOResult::Done(()));
}
}
}
}
pub fn commit_txn(
&self,
pager: Arc<Pager>,
program_state: &mut ProgramState,
mv_store: Option<&Arc<MvStore>>,
rollback: bool,
) -> Result<IOResult<()>> {
// Apply view deltas with I/O handling
match self.apply_view_deltas(program_state, rollback, &pager)? {
IOResult::IO(io) => return Ok(IOResult::IO(io)),
IOResult::Done(_) => {}
}
// Reset state for next use
program_state.view_delta_state = ViewDeltaCommitState::NotStarted;
if self.connection.get_tx_state() == TransactionState::None {
// No need to do any work here if not in tx. Current MVCC logic doesn't work with this assumption,
// hence the mv_store.is_none() check.
return Ok(IOResult::Done(()));
}
if self.connection.is_nested_stmt() {
// We don't want to commit on nested statements. Let parent handle it.
return Ok(IOResult::Done(()));
}
if let Some(mv_store) = mv_store {
let conn = self.connection.clone();
let auto_commit = conn.auto_commit.load(Ordering::SeqCst);
if auto_commit {
// FIXME: we don't want to commit stuff from other programs.
if matches!(program_state.commit_state, CommitState::Ready) {
let Some(tx_id) = conn.get_mv_tx_id() else {
return Ok(IOResult::Done(()));
};
let state_machine = mv_store.commit_tx(tx_id, &conn).unwrap();
program_state.commit_state = CommitState::CommitingMvcc { state_machine };
}
let CommitState::CommitingMvcc { state_machine } = &mut program_state.commit_state
else {
panic!("invalid state for mvcc commit step")
};
match self.step_end_mvcc_txn(state_machine, mv_store)? {
IOResult::Done(_) => {
assert!(state_machine.is_finalized());
*conn.mv_tx.write() = None;
conn.set_tx_state(TransactionState::None);
program_state.commit_state = CommitState::Ready;
return Ok(IOResult::Done(()));
}
IOResult::IO(io) => {
return Ok(IOResult::IO(io));
}
}
}
Ok(IOResult::Done(()))
} else {
let connection = self.connection.clone();
let auto_commit = connection.auto_commit.load(Ordering::SeqCst);
tracing::debug!(
"Halt auto_commit {}, state={:?}",
auto_commit,
program_state.commit_state
);
if matches!(program_state.commit_state, CommitState::Committing) {
let TransactionState::Write { .. } = connection.get_tx_state() else {
unreachable!("invalid state for write commit step")
};
self.step_end_write_txn(
&pager,
&mut program_state.commit_state,
&connection,
rollback,
)
} else if auto_commit {
let current_state = connection.get_tx_state();
tracing::trace!("Auto-commit state: {:?}", current_state);
match current_state {
TransactionState::Write { .. } => self.step_end_write_txn(
&pager,
&mut program_state.commit_state,
&connection,
rollback,
),
TransactionState::Read => {
connection.set_tx_state(TransactionState::None);
pager.end_read_tx();
Ok(IOResult::Done(()))
}
TransactionState::None => Ok(IOResult::Done(())),
TransactionState::PendingUpgrade => {
panic!("Unexpected transaction state: {current_state:?} during auto-commit",)
}
}
} else {
if self.change_cnt_on {
self.connection
.set_changes(self.n_change.load(Ordering::SeqCst));
}
Ok(IOResult::Done(()))
}
}
}
#[instrument(skip(self, pager, connection), level = Level::DEBUG)]
fn step_end_write_txn(
&self,
pager: &Arc<Pager>,
commit_state: &mut CommitState,
connection: &Connection,
rollback: bool,
) -> Result<IOResult<()>> {
let cacheflush_status = if !rollback {
pager.commit_tx(connection)?
} else {
pager.rollback_tx(connection);
IOResult::Done(PagerCommitResult::Rollback)
};
match cacheflush_status {
IOResult::Done(_) => {
if self.change_cnt_on {
self.connection
.set_changes(self.n_change.load(Ordering::SeqCst));
}
connection.set_tx_state(TransactionState::None);
*commit_state = CommitState::Ready;
}
IOResult::IO(io) => {
tracing::trace!("Cacheflush IO");
*commit_state = CommitState::Committing;
return Ok(IOResult::IO(io));
}
}
Ok(IOResult::Done(()))
}
#[instrument(skip(self, commit_state, mv_store), level = Level::DEBUG)]
fn step_end_mvcc_txn(
&self,
commit_state: &mut StateMachine<CommitStateMachine<LocalClock>>,
mv_store: &Arc<MvStore>,
) -> Result<IOResult<()>> {
commit_state.step(mv_store)
}
/// Aborts the program due to various conditions (explicit error, interrupt, or reset of an unfinished statement) by rolling back the transaction.
/// This method is a no-op if the program has already finished (either aborted or executed to completion).
pub fn abort(
&self,
mv_store: Option<&Arc<MvStore>>,
pager: &Arc<Pager>,
err: Option<&LimboError>,
state: &mut ProgramState,
) {
if self.is_trigger_subprogram() {
self.connection.end_trigger_execution();
}
// Errors from nested statements are handled by the parent statement.
if !self.connection.is_nested_stmt() && !self.is_trigger_subprogram() {
if err.is_some() {
// Any error apart from deferred FK violations causes the statement subtransaction to roll back.
let res =
state.end_statement(&self.connection, pager, EndStatement::RollbackSavepoint);
if let Err(e) = res {
tracing::error!("Error rolling back statement: {}", e);
}
}
match err {
// Transaction errors, e.g. trying to start a nested transaction, do not cause a rollback.
Some(LimboError::TxError(_)) => {}
// Table locked errors, e.g. trying to checkpoint in an interactive transaction, do not cause a rollback.
Some(LimboError::TableLocked) => {}
// Busy errors do not cause a rollback.
Some(LimboError::Busy) => {}
// Constraint errors do not cause a rollback of the transaction by default;
// Instead individual statement subtransactions will roll back and these are handled in op_auto_commit
// and op_halt.
Some(LimboError::Constraint(_)) => {}
_ => {
if state.auto_txn_cleanup != TxnCleanup::None || err.is_some() {
if let Some(mv_store) = mv_store {
if let Some(tx_id) = self.connection.get_mv_tx_id() {
self.connection.auto_commit.store(true, Ordering::SeqCst);
mv_store.rollback_tx(tx_id, pager.clone(), &self.connection);
}
} else {
pager.rollback_tx(&self.connection);
self.connection.auto_commit.store(true, Ordering::SeqCst);
}
self.connection.set_tx_state(TransactionState::None);
}
}
}
}
state.auto_txn_cleanup = TxnCleanup::None;
}
pub fn is_trigger_subprogram(&self) -> bool {
self.trigger.is_some()
}
}
fn make_record(registers: &[Register], start_reg: &usize, count: &usize) -> ImmutableRecord {
let regs = &registers[*start_reg..*start_reg + *count];
ImmutableRecord::from_registers(regs, regs.len())
}
pub fn registers_to_ref_values<'a>(
registers: &'a [Register],
) -> impl ExactSizeIterator<Item = ValueRef<'a>> {
registers.iter().map(|reg| reg.get_value().as_ref())
}
#[instrument(skip(program), level = Level::DEBUG)]
fn trace_insn(program: &Program, addr: InsnReference, insn: &Insn) {
tracing::trace!(
"\n{}",
explain::insn_to_str(
program,
addr,
insn,
String::new(),
program
.comments
.iter()
.find(|(offset, _)| *offset == addr)
.map(|(_, comment)| comment)
.copied()
)
);
}
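/// Conversion from a borrowed [`Value`] in a result [`Row`] into a concrete Rust type.
/// Implemented for the primitive types that [`Row::get`] can return.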
pub trait FromValueRow<'a> {
fn from_value(value: &'a Value) -> Result<Self>
where
Self: Sized + 'a;
}
impl<'a> FromValueRow<'a> for i64 {
fn from_value(value: &'a Value) -> Result<Self> {
match value {
Value::Integer(i) => Ok(*i),
_ => Err(LimboError::ConversionError("Expected integer value".into())),
}
}
}
impl<'a> FromValueRow<'a> for f64 {
fn from_value(value: &'a Value) -> Result<Self> {
match value {
Value::Float(f) => Ok(*f),
_ => Err(LimboError::ConversionError("Expected float value".into())),
}
}
}
impl<'a> FromValueRow<'a> for String {
fn from_value(value: &'a Value) -> Result<Self> {
match value {
Value::Text(s) => Ok(s.as_str().to_string()),
_ => Err(LimboError::ConversionError("Expected text value".into())),
}
}
}
impl<'a> FromValueRow<'a> for &'a str {
fn from_value(value: &'a Value) -> Result<Self> {
match value {
Value::Text(s) => Ok(s.as_str()),
_ => Err(LimboError::ConversionError("Expected text value".into())),
}
}
}
impl<'a> FromValueRow<'a> for &'a Value {
fn from_value(value: &'a Value) -> Result<Self> {
Ok(value)
}
}
impl Row {
pub fn get<'a, T: FromValueRow<'a> + 'a>(&'a self, idx: usize) -> Result<T> {
let value = unsafe { self.values.add(idx).as_ref().unwrap() };
let value = match value {
Register::Value(value) => value,
_ => unreachable!("a row should be formed of values only"),
};
T::from_value(value)
}
pub fn get_value(&self, idx: usize) -> &Value {
let value = unsafe { self.values.add(idx).as_ref().unwrap() };
match value {
Register::Value(value) => value,
_ => unreachable!("a row should be formed of values only"),
}
}
pub fn get_values(&self) -> impl Iterator<Item = &Value> {
let values = unsafe { std::slice::from_raw_parts(self.values, self.count) };
// These registers should all hold owned values
// TODO: add check for this
values.iter().map(|v| v.get_value())
}
pub fn len(&self) -> usize {
self.count
}
}