diff --git a/cli/app.rs b/cli/app.rs index d23e25771..9ae6b220e 100644 --- a/cli/app.rs +++ b/cli/app.rs @@ -860,15 +860,34 @@ impl Limbo { indent_count: usize, curr_insn: &str, prev_insn: &str, + p1: &str, + unclosed_begin_subrtns: &mut Vec, ) -> usize { let indent_count = match prev_insn { "Rewind" | "Last" | "SorterSort" | "SeekGE" | "SeekGT" | "SeekLE" - | "SeekLT" => indent_count + 1, + | "SeekLT" | "BeginSubrtn" => indent_count + 1, _ => indent_count, }; + // The corresponding closing instruction for BeginSubrtn is Return, + // but Return is also used for other purposes, so we need to track pairs of + // BeginSubrtn and Return that share the same 1st parameter (the subroutine register). + if curr_insn == "BeginSubrtn" { + unclosed_begin_subrtns.push(p1.to_string()); + } + match curr_insn { "Next" | "SorterNext" | "Prev" => indent_count - 1, + "Return" => { + let matching_begin_subrtn = + unclosed_begin_subrtns.iter().position(|b| b == p1); + if let Some(matching_begin_subrtn) = matching_begin_subrtn { + unclosed_begin_subrtns.remove(matching_begin_subrtn); + indent_count - 1 + } else { + indent_count + } + } _ => indent_count, } } @@ -883,16 +902,24 @@ impl Limbo { let mut prev_insn: String = "".to_string(); let mut indent_count = 0; let indent = " "; + let mut unclosed_begin_subrtns = vec![]; loop { row_step_result_query!(self, sql, rows, statistics, { let row = rows.row().unwrap(); let insn = row.get_value(1).to_string(); - indent_count = get_explain_indent(indent_count, &insn, &prev_insn); + let p1 = row.get_value(2).to_string(); + indent_count = get_explain_indent( + indent_count, + &insn, + &prev_insn, + &p1, + &mut unclosed_begin_subrtns, + ); let _ = self.writeln(format!( "{:<4} {:<17} {:<4} {:<4} {:<4} {:<13} {:<2} {}", row.get_value(0).to_string(), &(indent.repeat(indent_count) + &insn), - row.get_value(2).to_string(), + p1, row.get_value(3).to_string(), row.get_value(4).to_string(), row.get_value(5).to_string(), diff --git a/core/lib.rs b/core/lib.rs index 3a6974c93..ae8b0e233 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -2779,7 +2779,7 @@ impl Statement { .program .table_references .find_table_by_internal_id(*table) - .map(|table_ref| Cow::Borrowed(table_ref.get_name())), + .map(|(_, table_ref)| Cow::Borrowed(table_ref.get_name())), _ => None, } } @@ -2808,7 +2808,7 @@ impl Statement { column: column_idx, .. } => { - let table_ref = self + let (_, table_ref) = self .program .table_references .find_table_by_internal_id(*table)?; diff --git a/core/translate/aggregation.rs b/core/translate/aggregation.rs index 9a3a98ce3..366332508 100644 --- a/core/translate/aggregation.rs +++ b/core/translate/aggregation.rs @@ -107,7 +107,7 @@ fn emit_collseq_if_needed( // If no explicit collation, check if this is a column with table-defined collation if let ast::Expr::Column { table, column, .. } = expr { - if let Some(table_ref) = referenced_tables.find_table_by_internal_id(*table) { + if let Some((_, table_ref)) = referenced_tables.find_table_by_internal_id(*table) { if let Some(table_column) = table_ref.get_column_at(*column) { if let Some(collation) = &table_column.collation { program.emit_insn(Insn::CollSeq { diff --git a/core/translate/collate.rs b/core/translate/collate.rs index 89804c470..425de8a8b 100644 --- a/core/translate/collate.rs +++ b/core/translate/collate.rs @@ -105,7 +105,7 @@ pub fn get_collseq_from_expr( return Ok(WalkControl::SkipChildren); } Expr::Column { table, column, .. } => { - let table_ref = referenced_tables + let (_, table_ref) = referenced_tables .find_table_by_internal_id(*table) .ok_or_else(|| crate::LimboError::ParseError("table not found".to_string()))?; let column = table_ref @@ -117,7 +117,7 @@ pub fn get_collseq_from_expr( return Ok(WalkControl::Continue); } Expr::RowId { table, .. } => { - let table_ref = referenced_tables + let (_, table_ref) = referenced_tables .find_table_by_internal_id(*table) .ok_or_else(|| crate::LimboError::ParseError("table not found".to_string()))?; if let Some(btree) = table_ref.btree() { diff --git a/core/translate/display.rs b/core/translate/display.rs index 9a0bd5d6d..5939407dc 100644 --- a/core/translate/display.rs +++ b/core/translate/display.rs @@ -251,7 +251,7 @@ pub struct PlanContext<'a>(pub &'a [&'a TableReferences]); // Definitely not perfect yet impl ToSqlContext for PlanContext<'_> { fn get_column_name(&self, table_id: TableInternalId, col_idx: usize) -> Option> { - let table = self + let (_, table) = self .0 .iter() .find_map(|table_ref| table_ref.find_table_by_internal_id(table_id))?; @@ -270,7 +270,8 @@ impl ToSqlContext for PlanContext<'_> { match (joined_table, outer_query) { (Some(table), None) => Some(&table.identifier), (None, Some(table)) => Some(&table.identifier), - _ => unreachable!(), + (Some(table), Some(_)) => Some(&table.identifier), + (None, None) => unreachable!(), } } } diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index ff22d1035..b60e2ddf2 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -21,7 +21,7 @@ use super::plan::{ Distinctness, JoinOrderMember, Operation, Scan, SelectPlan, TableReferences, UpdatePlan, }; use super::select::emit_simple_count; -use super::subquery::emit_subqueries; +use super::subquery::emit_from_clause_subqueries; use crate::error::SQLITE_CONSTRAINT_PRIMARYKEY; use crate::function::Func; use crate::schema::{BTreeTable, Column, Schema, Table, ROWID_SENTINEL}; @@ -243,7 +243,7 @@ pub fn emit_program( } #[instrument(skip_all, level = Level::DEBUG)] -fn emit_program_for_select( +pub fn emit_program_for_select( program: &mut ProgramBuilder, resolver: &Resolver, mut plan: SelectPlan, @@ -280,8 +280,8 @@ pub fn emit_query<'a>( return Ok(reg_result_cols_start); } - // Emit subqueries first so the results can be read in the main query loop. - emit_subqueries(program, t_ctx, &mut plan.table_references)?; + // Emit FROM clause subqueries first so the results can be read in the main query loop. + emit_from_clause_subqueries(program, t_ctx, &mut plan.table_references)?; // No rows will be read from source table loops if there is a constant false condition eg. WHERE 0 // however an aggregation might still happen, @@ -364,6 +364,8 @@ pub fn emit_query<'a>( plan.group_by.as_ref(), OperationMode::SELECT, &plan.where_clause, + &plan.join_order, + &mut plan.non_from_clause_subqueries, )?; if plan.is_simple_count() { @@ -380,6 +382,7 @@ pub fn emit_query<'a>( &plan.where_clause, None, OperationMode::SELECT, + &mut plan.non_from_clause_subqueries, )?; // Process result columns and expressions in the inner loop @@ -463,6 +466,8 @@ fn emit_program_for_delete( None, OperationMode::DELETE, &plan.where_clause, + &[JoinOrderMember::default()], + &mut [], )?; // Set up main query execution loop @@ -474,6 +479,7 @@ fn emit_program_for_delete( &plan.where_clause, None, OperationMode::DELETE, + &mut [], )?; emit_delete_insns( @@ -963,6 +969,8 @@ fn emit_program_for_update( None, mode.clone(), &plan.where_clause, + &[JoinOrderMember::default()], + &mut [], )?; // Prepare index cursors @@ -999,6 +1007,7 @@ fn emit_program_for_update( &plan.where_clause, temp_cursor_id, mode.clone(), + &mut [], )?; let target_table_cursor_id = diff --git a/core/translate/expr.rs b/core/translate/expr.rs index e082ca6ae..929a62a4e 100644 --- a/core/translate/expr.rs +++ b/core/translate/expr.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use tracing::{instrument, Level}; -use turso_parser::ast::{self, As, Expr, UnaryOperator}; +use turso_parser::ast::{self, As, Expr, SubqueryType, UnaryOperator}; use super::emitter::Resolver; use super::optimizer::Optimizable; @@ -288,6 +288,25 @@ pub fn translate_condition_expr( resolver: &Resolver, ) -> Result<()> { match expr { + ast::Expr::SubqueryResult { query_type, .. } => match query_type { + SubqueryType::Exists { result_reg } => { + emit_cond_jump(program, condition_metadata, *result_reg); + } + SubqueryType::In { .. } => { + let result_reg = program.alloc_register(); + translate_expr(program, Some(referenced_tables), expr, result_reg, resolver)?; + emit_cond_jump(program, condition_metadata, result_reg); + } + SubqueryType::RowValue { num_regs, .. } => { + if *num_regs != 1 { + // A query like SELECT * FROM t WHERE (SELECT ...) must return a single column. + crate::bail_parse_error!("sub-select returns {num_regs} columns - expected 1"); + } + let result_reg = program.alloc_register(); + translate_expr(program, Some(referenced_tables), expr, result_reg, resolver)?; + emit_cond_jump(program, condition_metadata, result_reg); + } + }, ast::Expr::Register(_) => { crate::bail_parse_error!("Register in WHERE clause is currently unused. Consider removing Resolver::expr_to_reg_cache and using Expr::Register instead"); } @@ -593,6 +612,149 @@ pub fn translate_expr( } match expr { + ast::Expr::SubqueryResult { + lhs, + not_in, + query_type, + .. + } => { + match query_type { + SubqueryType::Exists { result_reg } => { + program.emit_insn(Insn::Copy { + src_reg: *result_reg, + dst_reg: target_register, + extra_amount: 0, + }); + Ok(target_register) + } + SubqueryType::In { cursor_id } => { + // jump here when we can definitely skip the row + let label_skip_row = program.allocate_label(); + // jump here when we can definitely include the row + let label_include_row = program.allocate_label(); + // jump here when we need to make extra null-related checks, because sql null is the greatest thing ever + let label_null_rewind = program.allocate_label(); + let label_null_checks_loop_start = program.allocate_label(); + let label_null_checks_next = program.allocate_label(); + program.emit_insn(Insn::Integer { + value: 0, + dest: target_register, + }); + let lhs_columns = match unwrap_parens(lhs.as_ref().unwrap())? { + ast::Expr::Parenthesized(exprs) => { + exprs.iter().map(|e| e.as_ref()).collect() + } + expr => vec![expr], + }; + let lhs_column_count = lhs_columns.len(); + let lhs_column_regs_start = program.alloc_registers(lhs_column_count); + for (i, lhs_column) in lhs_columns.iter().enumerate() { + translate_expr( + program, + referenced_tables, + lhs_column, + lhs_column_regs_start + i, + resolver, + )?; + if !lhs_column.is_nonnull(referenced_tables.as_ref().unwrap()) { + program.emit_insn(Insn::IsNull { + reg: lhs_column_regs_start + i, + target_pc: if *not_in { + label_null_rewind + } else { + label_skip_row + }, + }); + } + } + if *not_in { + // WHERE ... NOT IN (SELECT ...) + // We must skip the row if we find a match. + program.emit_insn(Insn::Found { + cursor_id: *cursor_id, + target_pc: label_skip_row, + record_reg: lhs_column_regs_start, + num_regs: lhs_column_count, + }); + // Ok, so Found didn't return a match. + // Because SQL NULL, we need do extra checks to see if we can include the row. + // Consider: + // 1. SELECT * FROM T WHERE 1 NOT IN (SELECT NULL), + // 2. SELECT * FROM T WHERE 1 IN (SELECT NULL) -- or anything else where the subquery evaluates to NULL. + // _Both_ of these queries should return nothing, because... SQL NULL. + // The same goes for e.g. SELECT * FROM T WHERE (1,1) NOT IN (SELECT NULL, NULL). + // However, it does _NOT_ apply for SELECT * FROM T WHERE (1,1) NOT IN (SELECT NULL, 1). + // BUT: it DOES apply for SELECT * FROM T WHERE (2,2) NOT IN ((1,1), (NULL, NULL))!!! + // Ergo: if the subquery result has _ANY_ tuples with all NULLs, we need to NOT include the row. + // + // So, if we didn't found a match (and hence, so far, our 'NOT IN' condition still applies), + // we must still rewind the subquery's ephemeral index cursor and go through ALL rows and compare each LHS column (with !=) to the corresponding column in the ephemeral index. + // Comparison instructions have the default behavior that if either operand is NULL, the comparison is completely skipped. + // That means: if we, for ANY row in the ephemeral index, get through all the != comparisons without jumping, + // it means our subquery result has a tuple that is exactly NULL (or (NULL, NULL) etc.), + // in which case we need to NOT include the row. + // If ALL the rows jump at one of the != comparisons, it means our subquery result has no tuples with all NULLs -> we can include the row. + program.preassign_label_to_next_insn(label_null_rewind); + program.emit_insn(Insn::Rewind { + cursor_id: *cursor_id, + pc_if_empty: label_include_row, + }); + program.preassign_label_to_next_insn(label_null_checks_loop_start); + let column_check_reg = program.alloc_register(); + for i in 0..lhs_column_count { + program.emit_insn(Insn::Column { + cursor_id: *cursor_id, + column: i, + dest: column_check_reg, + default: None, + }); + program.emit_insn(Insn::Ne { + lhs: lhs_column_regs_start + i, + rhs: column_check_reg, + target_pc: label_null_checks_next, + flags: CmpInsFlags::default(), + collation: program.curr_collation(), + }); + } + program.emit_insn(Insn::Goto { + target_pc: label_skip_row, + }); + program.preassign_label_to_next_insn(label_null_checks_next); + program.emit_insn(Insn::Next { + cursor_id: *cursor_id, + pc_if_next: label_null_checks_loop_start, + }) + } else { + // WHERE ... IN (SELECT ...) + // We can skip the row if we don't find a match + program.emit_insn(Insn::NotFound { + cursor_id: *cursor_id, + target_pc: label_skip_row, + record_reg: lhs_column_regs_start, + num_regs: lhs_column_count, + }); + } + program.preassign_label_to_next_insn(label_include_row); + program.emit_insn(Insn::Integer { + value: 1, + dest: target_register, + }); + program.preassign_label_to_next_insn(label_skip_row); + Ok(target_register) + } + SubqueryType::RowValue { + result_reg_start, + num_regs, + } => { + program.emit_insn(Insn::Copy { + src_reg: *result_reg_start, + dst_reg: target_register, + extra_amount: num_regs - 1, + }); + Ok(target_register) + } + } + } ast::Expr::Between { .. } => { unreachable!("expression should have been rewritten in optmizer") } @@ -724,7 +886,9 @@ pub fn translate_expr( ast::Expr::DoublyQualified(_, _, _) => { crate::bail_parse_error!("DoublyQualified should have been rewritten in optimizer") } - ast::Expr::Exists(_) => crate::bail_parse_error!("EXISTS in WHERE clause is not supported"), + ast::Expr::Exists(_) => { + crate::bail_parse_error!("EXISTS is not supported in this position") + } ast::Expr::FunctionCall { name, distinctness: _, @@ -1948,7 +2112,7 @@ pub fn translate_expr( } }; - let table = referenced_tables + let (is_from_outer_query_scope, table) = referenced_tables .unwrap() .find_table_by_internal_id(*table_ref_id) .unwrap_or_else(|| { @@ -1969,14 +2133,34 @@ pub fn translate_expr( // If we have a covering index, we don't have an open table cursor so we read from the index cursor. match &table { Table::BTree(_) => { - let table_cursor_id = if use_covering_index { - None + let (table_cursor_id, index_cursor_id) = if is_from_outer_query_scope { + // Due to a limitation of our translation system, a subquery that references an outer query table + // cannot know whether a table cursor, index cursor, or both were opened for that table reference. + // Hence: currently we first try to resolve a table cursor, and if that fails, + // we resolve an index cursor. + if let Some(table_cursor_id) = + program.resolve_cursor_id_safe(&CursorKey::table(*table_ref_id)) + { + (Some(table_cursor_id), None) + } else { + ( + None, + Some(program.resolve_any_index_cursor_id_for_table(*table_ref_id)), + ) + } } else { - Some(program.resolve_cursor_id(&CursorKey::table(*table_ref_id))) + let table_cursor_id = if use_covering_index { + None + } else { + Some(program.resolve_cursor_id(&CursorKey::table(*table_ref_id))) + }; + let index_cursor_id = index.map(|index| { + program + .resolve_cursor_id(&CursorKey::index(*table_ref_id, index.clone())) + }); + (table_cursor_id, index_cursor_id) }; - let index_cursor_id = index.map(|index| { - program.resolve_cursor_id(&CursorKey::index(*table_ref_id, index.clone())) - }); + if *is_rowid_alias { if let Some(index_cursor_id) = index_cursor_id { program.emit_insn(Insn::IdxRowId { @@ -1992,22 +2176,28 @@ pub fn translate_expr( unreachable!("Either index or table cursor must be opened"); } } else { - let read_cursor = if use_covering_index { - index_cursor_id.expect( - "index cursor should be opened when use_covering_index=true", - ) + let read_from_index = if is_from_outer_query_scope { + index_cursor_id.is_some() } else { - table_cursor_id.expect( - "table cursor should be opened when use_covering_index=false", - ) + use_covering_index }; - let column = if use_covering_index { - let index = index.expect( - "index cursor should be opened when use_covering_index=true", + let read_cursor = if read_from_index { + index_cursor_id.expect("index cursor should be opened") + } else { + table_cursor_id.expect("table cursor should be opened") + }; + let column = if read_from_index { + let index = program.resolve_index_for_cursor_id( + index_cursor_id.expect("index cursor should be opened"), ); - index.column_table_pos_to_index_pos(*column).unwrap_or_else(|| { - panic!("covering index {} does not contain column number {} of table {}", index.name, column, table_ref_id) - }) + index + .column_table_pos_to_index_pos(*column) + .unwrap_or_else(|| { + panic!( + "index {} does not contain column number {} of table {}", + index.name, column, table_ref_id + ) + }) } else { *column }; @@ -2142,10 +2332,10 @@ pub fn translate_expr( Ok(result_reg) } ast::Expr::InSelect { .. } => { - crate::bail_parse_error!("IN (...subquery) in WHERE clause is not supported") + crate::bail_parse_error!("IN (...subquery) is not supported in this position") } ast::Expr::InTable { .. } => { - crate::bail_parse_error!("Table expression in WHERE clause is not supported") + crate::bail_parse_error!("Table expression is not supported in this position") } ast::Expr::IsNull(expr) => { let reg = program.alloc_register(); @@ -2183,7 +2373,7 @@ pub fn translate_expr( } ast::Expr::Literal(lit) => emit_literal(program, lit, target_register), ast::Expr::Name(_) => { - crate::bail_parse_error!("ast::Expr::Name in WHERE clause is not supported") + crate::bail_parse_error!("ast::Expr::Name is not supported in this position") } ast::Expr::NotNull(expr) => { let reg = program.alloc_register(); @@ -2230,7 +2420,7 @@ pub fn translate_expr( } ast::Expr::Raise(_, _) => crate::bail_parse_error!("RAISE is not supported"), ast::Expr::Subquery(_) => { - crate::bail_parse_error!("Subquery in WHERE clause is not supported") + crate::bail_parse_error!("Subquery is not supported in this position") } ast::Expr::Unary(op, expr) => match (op, expr.as_ref()) { (UnaryOperator::Positive, expr) => { @@ -3140,12 +3330,12 @@ pub fn as_binary_components( /// Recursively unwrap parentheses from an expression /// e.g. (((t.x > 5))) -> t.x > 5 -fn unwrap_parens(expr: &ast::Expr) -> Result<&ast::Expr> { +pub fn unwrap_parens(expr: &ast::Expr) -> Result<&ast::Expr> { match expr { ast::Expr::Column { .. } => Ok(expr), ast::Expr::Parenthesized(exprs) => match exprs.len() { 1 => unwrap_parens(exprs.first().unwrap()), - _ => crate::bail_parse_error!("expected single expression in parentheses"), + _ => Ok(expr), // If the expression is e.g. (x, y), as used in e.g. (x, y) IN (SELECT ...), return as is. }, _ => Ok(expr), } @@ -3182,6 +3372,11 @@ where match func(expr)? { WalkControl::Continue => { match expr { + ast::Expr::SubqueryResult { lhs, .. } => { + if let Some(lhs) = lhs { + walk_expr(lhs, func)?; + } + } ast::Expr::Between { lhs, start, end, .. } => { @@ -3710,6 +3905,11 @@ where match func(expr)? { WalkControl::Continue => { match expr { + ast::Expr::SubqueryResult { lhs, .. } => { + if let Some(lhs) = lhs { + walk_expr_mut(lhs, func)?; + } + } ast::Expr::Between { lhs, start, end, .. } => { @@ -3887,7 +4087,7 @@ pub fn get_expr_affinity( match expr { ast::Expr::Column { table, column, .. } => { if let Some(tables) = referenced_tables { - if let Some(table_ref) = tables.find_table_by_internal_id(*table) { + if let Some((_, table_ref)) = tables.find_table_by_internal_id(*table) { if let Some(col) = table_ref.get_column_at(*column) { return col.affinity(); } diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index 32afc1c21..dbc383829 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -19,7 +19,11 @@ use super::{ }, }; use crate::translate::{ - collate::get_collseq_from_expr, emitter::UpdateRowSource, window::emit_window_loop_source, + collate::get_collseq_from_expr, + emitter::UpdateRowSource, + plan::{EvalAt, NonFromClauseSubquery}, + subquery::emit_non_from_clause_subquery, + window::emit_window_loop_source, }; use crate::{ schema::{Affinity, Index, IndexColumn, Table}, @@ -116,6 +120,7 @@ pub fn init_distinct(program: &mut ProgramBuilder, plan: &SelectPlan) -> Result< } /// Initialize resources needed for the source operators (tables, joins, etc) +#[allow(clippy::too_many_arguments)] pub fn init_loop( program: &mut ProgramBuilder, t_ctx: &mut TranslateCtx, @@ -124,6 +129,8 @@ pub fn init_loop( group_by: Option<&GroupBy>, mode: OperationMode, where_clause: &[WhereTerm], + join_order: &[JoinOrderMember], + subqueries: &mut [NonFromClauseSubquery], ) -> Result<()> { assert!( t_ctx.meta_left_joins.len() == tables.joined_tables().len(), @@ -397,9 +404,25 @@ pub fn init_loop( } } + for subquery in subqueries.iter_mut().filter(|s| !s.has_been_evaluated()) { + let eval_at = subquery.get_eval_at(join_order)?; + if eval_at != EvalAt::BeforeLoop { + continue; + } + let plan = subquery.consume_plan(EvalAt::BeforeLoop); + + emit_non_from_clause_subquery( + program, + t_ctx, + *plan, + &subquery.query_type, + subquery.correlated, + )?; + } + for cond in where_clause .iter() - .filter(|c| c.should_eval_before_loop(&[JoinOrderMember::default()])) + .filter(|c| c.should_eval_before_loop(join_order, subqueries)) { let jump_target = program.allocate_label(); let meta = ConditionMetadata { @@ -418,6 +441,7 @@ pub fn init_loop( /// Set up the main query execution loop /// For example in the case of a nested table scan, this means emitting the Rewind instruction /// for all tables involved, outermost first. +#[allow(clippy::too_many_arguments)] pub fn open_loop( program: &mut ProgramBuilder, t_ctx: &mut TranslateCtx, @@ -426,6 +450,7 @@ pub fn open_loop( predicates: &[WhereTerm], temp_cursor_id: Option, mode: OperationMode, + subqueries: &mut [NonFromClauseSubquery], ) -> Result<()> { for (join_index, join) in join_order.iter().enumerate() { let joined_table_index = join.original_idx; @@ -666,6 +691,25 @@ pub fn open_loop( } } + for subquery in subqueries.iter_mut().filter(|s| !s.has_been_evaluated()) { + assert!(subquery.correlated, "subquery must be correlated"); + let eval_at = subquery.get_eval_at(join_order)?; + + if eval_at != EvalAt::Loop(join_index) { + continue; + } + + let plan = subquery.consume_plan(eval_at); + + emit_non_from_clause_subquery( + program, + t_ctx, + *plan, + &subquery.query_type, + subquery.correlated, + )?; + } + // First emit outer join conditions, if any. emit_conditions( program, @@ -676,6 +720,7 @@ pub fn open_loop( join_index, next, true, + subqueries, )?; // Set the match flag to true if this is a LEFT JOIN. @@ -706,9 +751,20 @@ pub fn open_loop( join_index, next, false, + subqueries, )?; } + if subqueries.iter().any(|s| !s.has_been_evaluated()) { + crate::bail_parse_error!( + "all subqueries should have already been emitted, but found {} unevaluated subqueries", + subqueries + .iter() + .filter(|s| !s.has_been_evaluated()) + .count() + ); + } + Ok(()) } @@ -722,11 +778,12 @@ fn emit_conditions( join_index: usize, next: BranchOffset, from_outer_join: bool, + subqueries: &[NonFromClauseSubquery], ) -> Result<()> { for cond in predicates .iter() .filter(|cond| cond.from_outer_join.is_some() == from_outer_join) - .filter(|cond| cond.should_eval_at_loop(join_index, join_order)) + .filter(|cond| cond.should_eval_at_loop(join_index, join_order, subqueries)) { let jump_target_when_true = program.allocate_label(); let condition_metadata = ConditionMetadata { diff --git a/core/translate/optimizer/constraints.rs b/core/translate/optimizer/constraints.rs index 486c36687..a1740f2f9 100644 --- a/core/translate/optimizer/constraints.rs +++ b/core/translate/optimizer/constraints.rs @@ -9,7 +9,7 @@ use crate::{ translate::{ collate::get_collseq_from_expr, expr::as_binary_components, - plan::{JoinOrderMember, TableReferences, WhereTerm}, + plan::{JoinOrderMember, NonFromClauseSubquery, TableReferences, WhereTerm}, planner::{table_mask_from_expr, TableMask}, }, util::exprs_are_equivalent, @@ -188,6 +188,7 @@ pub fn constraints_from_where_clause( where_clause: &[WhereTerm], table_references: &TableReferences, available_indexes: &HashMap>>, + subqueries: &[NonFromClauseSubquery], ) -> Result> { let mut constraints = Vec::new(); @@ -241,7 +242,7 @@ pub fn constraints_from_where_clause( where_clause_pos: (i, BinaryExprSide::Rhs), operator, table_col_pos: *column, - lhs_mask: table_mask_from_expr(rhs, table_references)?, + lhs_mask: table_mask_from_expr(rhs, table_references, subqueries)?, selectivity: estimate_selectivity(table_column, operator), usable: true, }); @@ -258,7 +259,7 @@ pub fn constraints_from_where_clause( where_clause_pos: (i, BinaryExprSide::Rhs), operator, table_col_pos: rowid_alias_column.unwrap(), - lhs_mask: table_mask_from_expr(rhs, table_references)?, + lhs_mask: table_mask_from_expr(rhs, table_references, subqueries)?, selectivity: estimate_selectivity(table_column, operator), usable: true, }); @@ -274,7 +275,7 @@ pub fn constraints_from_where_clause( where_clause_pos: (i, BinaryExprSide::Lhs), operator: opposite_cmp_op(operator), table_col_pos: *column, - lhs_mask: table_mask_from_expr(lhs, table_references)?, + lhs_mask: table_mask_from_expr(lhs, table_references, subqueries)?, selectivity: estimate_selectivity(table_column, operator), usable: true, }); @@ -288,7 +289,7 @@ pub fn constraints_from_where_clause( where_clause_pos: (i, BinaryExprSide::Lhs), operator: opposite_cmp_op(operator), table_col_pos: rowid_alias_column.unwrap(), - lhs_mask: table_mask_from_expr(lhs, table_references)?, + lhs_mask: table_mask_from_expr(lhs, table_references, subqueries)?, selectivity: estimate_selectivity(table_column, operator), usable: true, }); diff --git a/core/translate/optimizer/join.rs b/core/translate/optimizer/join.rs index 79b80174a..89e5d7d3f 100644 --- a/core/translate/optimizer/join.rs +++ b/core/translate/optimizer/join.rs @@ -540,9 +540,13 @@ mod tests { let where_clause = vec![]; let access_methods_arena = RefCell::new(Vec::new()); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); let result = compute_best_join_order( table_references.joined_tables(), @@ -569,9 +573,13 @@ mod tests { let where_clause = vec![]; let access_methods_arena = RefCell::new(Vec::new()); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); // SELECT * from test_table // expecting best_best_plan() not to do any work due to empty where clause. @@ -610,9 +618,13 @@ mod tests { let table_references = TableReferences::new(joined_tables, vec![]); let access_methods_arena = RefCell::new(Vec::new()); let available_indexes = HashMap::new(); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); // SELECT * FROM test_table WHERE id = 42 // expecting a RowidEq access method because id is a rowid alias. @@ -678,9 +690,13 @@ mod tests { }); available_indexes.insert("test_table".to_string(), VecDeque::from([index])); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); // SELECT * FROM test_table WHERE id = 42 // expecting an IndexScan access method because id is a primary key with an index let result = compute_best_join_order( @@ -757,9 +773,13 @@ mod tests { let table_references = TableReferences::new(joined_tables, vec![]); let access_methods_arena = RefCell::new(Vec::new()); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); let result = compute_best_join_order( table_references.joined_tables(), @@ -932,9 +952,13 @@ mod tests { let table_references = TableReferences::new(joined_tables, vec![]); let access_methods_arena = RefCell::new(Vec::new()); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); let result = compute_best_join_order( table_references.joined_tables(), @@ -1041,9 +1065,13 @@ mod tests { let table_references = TableReferences::new(joined_tables, vec![]); let available_indexes = HashMap::new(); let access_methods_arena = RefCell::new(Vec::new()); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); let BestJoinOrderResult { best_plan, .. } = compute_best_join_order( table_references.joined_tables(), @@ -1147,9 +1175,13 @@ mod tests { let table_references = TableReferences::new(joined_tables, vec![]); let access_methods_arena = RefCell::new(Vec::new()); let available_indexes = HashMap::new(); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); let result = compute_best_join_order( table_references.joined_tables(), @@ -1231,9 +1263,13 @@ mod tests { let table_references = TableReferences::new(joined_tables, vec![]); let access_methods_arena = RefCell::new(Vec::new()); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); // Run the optimizer let BestJoinOrderResult { best_plan, .. } = compute_best_join_order( @@ -1352,9 +1388,13 @@ mod tests { let table_references = TableReferences::new(joined_tables, vec![]); let access_methods_arena = RefCell::new(Vec::new()); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); let BestJoinOrderResult { best_plan, .. } = compute_best_join_order( table_references.joined_tables(), @@ -1460,9 +1500,13 @@ mod tests { let table_references = TableReferences::new(joined_tables, vec![]); let access_methods_arena = RefCell::new(Vec::new()); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); let BestJoinOrderResult { best_plan, .. } = compute_best_join_order( table_references.joined_tables(), @@ -1586,9 +1630,13 @@ mod tests { let table_references = TableReferences::new(joined_tables, vec![]); let access_methods_arena = RefCell::new(Vec::new()); - let table_constraints = - constraints_from_where_clause(&where_clause, &table_references, &available_indexes) - .unwrap(); + let table_constraints = constraints_from_where_clause( + &where_clause, + &table_references, + &available_indexes, + &[], + ) + .unwrap(); let BestJoinOrderResult { best_plan, .. } = compute_best_join_order( table_references.joined_tables(), diff --git a/core/translate/optimizer/mod.rs b/core/translate/optimizer/mod.rs index 798cd50f3..32e7868f5 100644 --- a/core/translate/optimizer/mod.rs +++ b/core/translate/optimizer/mod.rs @@ -23,8 +23,8 @@ use crate::{ constraints::{RangeConstraintRef, SeekRangeConstraint, TableConstraints}, }, plan::{ - ColumnUsedMask, OuterQueryReference, QueryDestination, ResultSetColumn, Scan, - SeekKeyComponent, + ColumnUsedMask, NonFromClauseSubquery, OuterQueryReference, QueryDestination, + ResultSetColumn, Scan, SeekKeyComponent, }, }, types::SeekOp, @@ -89,6 +89,7 @@ pub fn optimize_select_plan(plan: &mut SelectPlan, schema: &Schema) -> Result<() &mut plan.where_clause, &mut plan.order_by, &mut plan.group_by, + &plan.non_from_clause_subqueries, )?; if let Some(best_join_order) = best_join_order { @@ -114,6 +115,7 @@ fn optimize_delete_plan(plan: &mut DeletePlan, schema: &Schema) -> Result<()> { &mut plan.where_clause, &mut plan.order_by, &mut None, + &[], )?; Ok(()) @@ -138,6 +140,7 @@ fn optimize_update_plan( &mut plan.where_clause, &mut plan.order_by, &mut None, + &[], )?; let table_ref = &mut plan.table_references.joined_tables_mut()[0]; @@ -301,6 +304,7 @@ fn add_ephemeral_table_to_update_plan( distinctness: super::plan::Distinctness::NonDistinct, values: vec![], window: None, + non_from_clause_subqueries: vec![], }; plan.ephemeral_plan = Some(ephemeral_plan); @@ -336,6 +340,7 @@ fn optimize_table_access( where_clause: &mut [WhereTerm], order_by: &mut Vec<(Box, SortOrder)>, group_by: &mut Option, + subqueries: &[NonFromClauseSubquery], ) -> Result>> { if table_references.joined_tables().len() > TableReferences::MAX_JOINED_TABLES { crate::bail_parse_error!( @@ -345,8 +350,12 @@ fn optimize_table_access( } let access_methods_arena = RefCell::new(Vec::new()); let maybe_order_target = compute_order_target(order_by, group_by.as_mut()); - let constraints_per_table = - constraints_from_where_clause(where_clause, table_references, available_indexes)?; + let constraints_per_table = constraints_from_where_clause( + where_clause, + table_references, + available_indexes, + subqueries, + )?; // Currently the expressions we evaluate as constraints are binary expressions that will never be true for a NULL operand. // If there are any constraints on the right hand side table of an outer join that are not part of the outer join condition, @@ -766,6 +775,7 @@ impl Optimizable for ast::Expr { /// by writing more complex code. fn is_nonnull(&self, tables: &TableReferences) -> bool { match self { + Expr::SubqueryResult { .. } => false, Expr::Between { lhs, start, end, .. } => lhs.is_nonnull(tables) && start.is_nonnull(tables) && end.is_nonnull(tables), @@ -804,7 +814,9 @@ impl Optimizable for ast::Expr { return true; } - let table_ref = tables.find_joined_table_by_internal_id(*table).unwrap(); + let (_, table_ref) = tables + .find_table_by_internal_id(*table) + .expect("table not found"); let columns = table_ref.columns(); let column = &columns[*column]; column.primary_key || column.notnull @@ -843,6 +855,7 @@ impl Optimizable for ast::Expr { /// Returns true if the expression is a constant i.e. does not depend on variables or columns etc. fn is_constant(&self, resolver: &Resolver<'_>) -> bool { match self { + Expr::SubqueryResult { .. } => false, Expr::Between { lhs, start, end, .. } => { diff --git a/core/translate/plan.rs b/core/translate/plan.rs index aa506ca76..384a69158 100644 --- a/core/translate/plan.rs +++ b/core/translate/plan.rs @@ -1,5 +1,7 @@ use std::{cmp::Ordering, sync::Arc}; -use turso_parser::ast::{self, FrameBound, FrameClause, FrameExclude, FrameMode, SortOrder}; +use turso_parser::ast::{ + self, FrameBound, FrameClause, FrameExclude, FrameMode, SortOrder, SubqueryType, +}; use crate::{ function::AggFunc, @@ -36,12 +38,12 @@ impl ResultSetColumn { } match &self.expr { ast::Expr::Column { table, column, .. } => { - let table_ref = tables.find_table_by_internal_id(*table).unwrap(); + let (_, table_ref) = tables.find_table_by_internal_id(*table).unwrap(); table_ref.get_column_at(*column).unwrap().name.as_deref() } ast::Expr::RowId { table, .. } => { // If there is a rowid alias column, use its name - let table_ref = tables.find_table_by_internal_id(*table).unwrap(); + let (_, table_ref) = tables.find_table_by_internal_id(*table).unwrap(); if let Table::BTree(table) = &table_ref { if let Some(rowid_alias_column) = table.get_rowid_alias_column() { if let Some(name) = &rowid_alias_column.1.name { @@ -106,28 +108,41 @@ pub struct WhereTerm { } impl WhereTerm { - pub fn should_eval_before_loop(&self, join_order: &[JoinOrderMember]) -> bool { + pub fn should_eval_before_loop( + &self, + join_order: &[JoinOrderMember], + subqueries: &[NonFromClauseSubquery], + ) -> bool { if self.consumed { return false; } - let Ok(eval_at) = self.eval_at(join_order) else { + let Ok(eval_at) = self.eval_at(join_order, subqueries) else { return false; }; eval_at == EvalAt::BeforeLoop } - pub fn should_eval_at_loop(&self, loop_idx: usize, join_order: &[JoinOrderMember]) -> bool { + pub fn should_eval_at_loop( + &self, + loop_idx: usize, + join_order: &[JoinOrderMember], + subqueries: &[NonFromClauseSubquery], + ) -> bool { if self.consumed { return false; } - let Ok(eval_at) = self.eval_at(join_order) else { + let Ok(eval_at) = self.eval_at(join_order, subqueries) else { return false; }; eval_at == EvalAt::Loop(loop_idx) } - fn eval_at(&self, join_order: &[JoinOrderMember]) -> Result { - determine_where_to_eval_term(self, join_order) + fn eval_at( + &self, + join_order: &[JoinOrderMember], + subqueries: &[NonFromClauseSubquery], + ) -> Result { + determine_where_to_eval_term(self, join_order, subqueries) } } @@ -225,6 +240,20 @@ pub enum QueryDestination { /// The table that will be used to store the results. table: Arc, }, + /// The result of an EXISTS subquery are stored in a single register. + ExistsSubqueryResult { + /// The register that holds the result of the EXISTS subquery. + result_reg: usize, + }, + /// The results of a subquery that is neither 'EXISTS' nor 'IN' are stored in a range of registers. + RowValueSubqueryResult { + /// The start register of the range that holds the result of the subquery. + result_reg_start: usize, + /// The number of registers that hold the result of the subquery. + num_regs: usize, + }, + /// Decision made at some point after query plan construction. + Unset, } impl QueryDestination { @@ -345,6 +374,8 @@ pub struct SelectPlan { /// The window definition and all window functions associated with it. There is at most one /// window per SELECT. If the original query contains more, they are pushed down into subqueries. pub window: Option, + /// Subqueries that appear in any part of the query apart from the FROM clause + pub non_from_clause_subqueries: Vec, } impl SelectPlan { @@ -356,6 +387,15 @@ impl SelectPlan { self.aggregates.iter().map(|agg| agg.args.len()).sum() } + /// Whether this query or any of its subqueries reference columns from the outer query. + pub fn is_correlated(&self) -> bool { + self.table_references + .outer_query_refs() + .iter() + .any(|t| t.is_used()) + || self.non_from_clause_subqueries.iter().any(|s| s.correlated) + } + /// Reference: https://github.com/sqlite/sqlite/blob/5db695197b74580c777b37ab1b787531f15f7f9f/src/select.c#L8613 /// /// Checks to see if the query is of the format `SELECT count(*) FROM ` @@ -659,17 +699,22 @@ impl TableReferences { .find(|t| t.internal_id == internal_id) } - /// Returns an immutable reference to the [Table] with the given internal ID. - pub fn find_table_by_internal_id(&self, internal_id: TableInternalId) -> Option<&Table> { + /// Returns an immutable reference to the [Table] with the given internal ID, + /// plus a boolean indicating whether the table is a joined table from the current query scope (false), + /// or an outer query reference (true). + pub fn find_table_by_internal_id( + &self, + internal_id: TableInternalId, + ) -> Option<(bool, &Table)> { self.joined_tables .iter() .find(|t| t.internal_id == internal_id) - .map(|t| &t.table) + .map(|t| (false, &t.table)) .or_else(|| { self.outer_query_refs .iter() .find(|t| t.internal_id == internal_id) - .map(|t| &t.table) + .map(|t| (true, &t.table)) }) } @@ -790,6 +835,12 @@ impl ColumnUsedMask { } } +impl std::ops::BitOrAssign<&Self> for ColumnUsedMask { + fn bitor_assign(&mut self, rhs: &Self) { + self.0 |= &rhs.0; + } +} + #[derive(Clone, Debug)] #[allow(clippy::large_enum_variant)] pub enum Operation { @@ -1299,6 +1350,84 @@ pub struct WindowFunction { pub original_expr: Expr, } +#[derive(Debug, Clone)] +pub enum SubqueryState { + /// The subquery has not been evaluated yet. + /// The 'plan' field is only optional because it is .take()'d when the the subquery + /// is translated into bytecode. + Unevaluated { plan: Option> }, + /// The subquery has been evaluated. + /// The [evaluated_at] field contains the loop index where the subquery was evaluated. + /// The query plan struct no longer exists because translating the plan currently + /// requires an ownership transfer. + Evaluated { evaluated_at: EvalAt }, +} + +#[derive(Debug, Clone)] +/// A subquery that is not part of the `FROM` clause. +/// This is used for subqueries in the WHERE clause, HAVING clause, ORDER BY clause, LIMIT clause, OFFSET clause, etc. +/// Currently only subqueries in the WHERE clause are supported. +pub struct NonFromClauseSubquery { + pub internal_id: TableInternalId, + pub query_type: SubqueryType, + pub state: SubqueryState, + pub correlated: bool, +} + +impl NonFromClauseSubquery { + /// Returns true if the subquery has been evaluated (translated into bytecode). + pub fn has_been_evaluated(&self) -> bool { + matches!(self.state, SubqueryState::Evaluated { .. }) + } + + /// Returns the loop index where the subquery should be evaluated in this particular join order. + /// If the subquery references tables from the parent query, it will be evaluated at the right-most + /// nested loop whose table it references. + pub fn get_eval_at(&self, join_order: &[JoinOrderMember]) -> Result { + let mut eval_at = EvalAt::BeforeLoop; + let SubqueryState::Unevaluated { plan } = &self.state else { + crate::bail_parse_error!("subquery has already been evaluated"); + }; + let used_outer_refs = plan + .as_ref() + .unwrap() + .table_references + .outer_query_refs() + .iter() + .filter(|t| t.is_used()); + + for outer_ref in used_outer_refs { + let Some(loop_idx) = join_order + .iter() + .position(|t| t.table_id == outer_ref.internal_id) + else { + continue; + }; + eval_at = eval_at.max(EvalAt::Loop(loop_idx)); + } + for subquery in plan.as_ref().unwrap().non_from_clause_subqueries.iter() { + let eval_at_inner = subquery.get_eval_at(join_order)?; + eval_at = eval_at.max(eval_at_inner); + } + Ok(eval_at) + } + + /// Consumes the plan and returns it, and sets the subquery to the evaluated state. + /// This is used when the subquery is translated into bytecode. + pub fn consume_plan(&mut self, evaluated_at: EvalAt) -> Box { + match &mut self.state { + SubqueryState::Unevaluated { plan } => { + let plan = plan.take().unwrap(); + self.state = SubqueryState::Evaluated { evaluated_at }; + plan + } + SubqueryState::Evaluated { .. } => { + panic!("subquery has already been evaluated"); + } + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/core/translate/planner.rs b/core/translate/planner.rs index f7d930bd6..0409ae786 100644 --- a/core/translate/planner.rs +++ b/core/translate/planner.rs @@ -13,6 +13,7 @@ use super::{ use crate::translate::{ emitter::Resolver, expr::{BindingBehavior, WalkControl}, + plan::{NonFromClauseSubquery, SubqueryState}, }; use crate::{ ast::Limit, @@ -776,6 +777,7 @@ pub fn parse_where( pub fn determine_where_to_eval_term( term: &WhereTerm, join_order: &[JoinOrderMember], + subqueries: &[NonFromClauseSubquery], ) -> Result { if let Some(table_id) = term.from_outer_join { return Ok(EvalAt::Loop( @@ -786,7 +788,7 @@ pub fn determine_where_to_eval_term( )); } - determine_where_to_eval_expr(&term.expr, join_order) + determine_where_to_eval_expr(&term.expr, join_order, subqueries) } /// A bitmask representing a set of tables in a query plan. @@ -889,6 +891,7 @@ impl TableMask { pub fn table_mask_from_expr( top_level_expr: &Expr, table_references: &TableReferences, + subqueries: &[NonFromClauseSubquery], ) -> Result { let mut mask = TableMask::new(); walk_expr(top_level_expr, &mut |expr: &Expr| -> Result { @@ -910,6 +913,34 @@ pub fn table_mask_from_expr( crate::bail_parse_error!("table not found in joined_tables"); } } + // Given something like WHERE t.a = (SELECT ...), we can only evaluate that expression + // when all both table 't' and all outer scope tables referenced by the subquery OR its nested subqueries are in scope. + // Hence, the tables referenced in subqueries must be added to the table mask. + Expr::SubqueryResult { subquery_id, .. } => { + let Some(subquery) = subqueries.iter().find(|s| s.internal_id == *subquery_id) + else { + crate::bail_parse_error!("subquery not found"); + }; + let SubqueryState::Unevaluated { plan } = &subquery.state else { + crate::bail_parse_error!("subquery has already been evaluated"); + }; + let used_outer_query_refs = plan + .as_ref() + .unwrap() + .table_references + .outer_query_refs() + .iter() + .filter(|t| t.is_used()); + for outer_query_ref in used_outer_query_refs { + if let Some(table_idx) = table_references + .joined_tables() + .iter() + .position(|t| t.internal_id == outer_query_ref.internal_id) + { + mask.add_table(table_idx); + } + } + } _ => {} } Ok(WalkControl::Continue) @@ -921,17 +952,51 @@ pub fn table_mask_from_expr( pub fn determine_where_to_eval_expr( top_level_expr: &Expr, join_order: &[JoinOrderMember], + subqueries: &[NonFromClauseSubquery], ) -> Result { + // If the expression references no tables, it can be evaluated before any table loops are opened. let mut eval_at: EvalAt = EvalAt::BeforeLoop; walk_expr(top_level_expr, &mut |expr: &Expr| -> Result { match expr { Expr::Column { table, .. } | Expr::RowId { table, .. } => { - let join_idx = join_order - .iter() - .position(|t| t.table_id == *table) - .unwrap_or(usize::MAX); + let Some(join_idx) = join_order.iter().position(|t| t.table_id == *table) else { + // Must be an outer query reference; in that case, the table is already in scope. + return Ok(WalkControl::Continue); + }; eval_at = eval_at.max(EvalAt::Loop(join_idx)); } + // Given something like WHERE t.a = (SELECT ...), we can only evaluate that expression + // when all both table 't' and all outer scope tables referenced by the subquery OR its nested subqueries are in scope. + Expr::SubqueryResult { subquery_id, .. } => { + let Some(subquery) = subqueries.iter().find(|s| s.internal_id == *subquery_id) + else { + crate::bail_parse_error!("subquery not found"); + }; + match &subquery.state { + SubqueryState::Evaluated { evaluated_at } => { + eval_at = eval_at.max(*evaluated_at); + } + SubqueryState::Unevaluated { plan } => { + let used_outer_refs = plan + .as_ref() + .unwrap() + .table_references + .outer_query_refs() + .iter() + .filter(|t| t.is_used()); + for outer_ref in used_outer_refs { + let Some(join_idx) = join_order + .iter() + .position(|t| t.table_id == outer_ref.internal_id) + else { + continue; + }; + eval_at = eval_at.max(EvalAt::Loop(join_idx)); + } + return Ok(WalkControl::Continue); + } + } + } _ => {} } Ok(WalkControl::Continue) diff --git a/core/translate/result_row.rs b/core/translate/result_row.rs index e29ea3531..244e51d17 100644 --- a/core/translate/result_row.rs +++ b/core/translate/result_row.rs @@ -143,6 +143,24 @@ pub fn emit_result_row_and_limit( end_offset: BranchOffset::Offset(0), }); } + QueryDestination::ExistsSubqueryResult { result_reg } => { + program.emit_insn(Insn::Integer { + value: 1, + dest: *result_reg, + }); + } + QueryDestination::RowValueSubqueryResult { + result_reg_start, + num_regs, + } => { + assert!(plan.result_columns.len() == *num_regs, "Row value subqueries should have the same number of result columns as the number of registers"); + program.emit_insn(Insn::Copy { + src_reg: result_columns_start_reg, + dst_reg: *result_reg_start, + extra_amount: num_regs - 1, + }); + } + QueryDestination::Unset => unreachable!("Unset query destination should not be reached"), } if plan.limit.is_some() { diff --git a/core/translate/select.rs b/core/translate/select.rs index 59b6ff6cb..0c9ebf2de 100644 --- a/core/translate/select.rs +++ b/core/translate/select.rs @@ -13,6 +13,7 @@ use crate::translate::planner::{ break_predicate_at_and_boundaries, parse_from, parse_limit, parse_where, resolve_window_and_aggregate_functions, }; +use crate::translate::subquery::plan_subqueries_from_where_clause; use crate::translate::window::plan_windows; use crate::util::normalize_ident; use crate::vdbe::builder::ProgramBuilderOpts; @@ -273,6 +274,7 @@ fn prepare_one_select_plan( distinctness: Distinctness::from_ast(distinctness.as_ref()), values: vec![], window: None, + non_from_clause_subqueries: vec![], }; let mut windows = Vec::with_capacity(window_clause.len()); @@ -502,6 +504,25 @@ fn prepare_one_select_plan( )?; } + // TODO: support subqueries in the following positions: + // - result column of a select, e.g.: SELECT x = (SELECT ...) FROM t + // - GROUP BY clause, e.g.: SELECT * FROM t GROUP BY (SELECT ...) <-- however nonsensical that might be... + // - HAVING clause, e.g.: SELECT * FROM t GROUP BY x HAVING x = (SELECT ...) + // - ORDER BY clause, e.g.: SELECT * FROM t ORDER BY x = (SELECT ...) + // - LIMIT clause, e.g.: SELECT * FROM t LIMIT (SELECT ...) + // - OFFSET clause, e.g.: SELECT * FROM t OFFSET (SELECT ...) + // + // in these positions, unlike in the WHERE clause, the subquery cannot reference columns from the outer query, + // so we don't need to collect outer query references. + plan_subqueries_from_where_clause( + program, + &mut plan.non_from_clause_subqueries, + &mut plan.table_references, + resolver, + &mut plan.where_clause, + connection, + )?; + // Return the unoptimized query plan Ok(plan) } @@ -555,6 +576,7 @@ fn prepare_one_select_plan( .map(|values| values.iter().map(|value| *value.clone()).collect()) .collect(), window: None, + non_from_clause_subqueries: vec![], }; Ok(plan) diff --git a/core/translate/subquery.rs b/core/translate/subquery.rs index d883b0052..3cf7e3843 100644 --- a/core/translate/subquery.rs +++ b/core/translate/subquery.rs @@ -1,8 +1,26 @@ +use std::sync::Arc; + +use turso_parser::ast::{self, SortOrder, SubqueryType}; + use crate::{ emit_explain, - schema::Table, - vdbe::{builder::ProgramBuilder, insn::Insn}, - QueryMode, Result, + schema::{Index, IndexColumn, Table}, + translate::{ + collate::get_collseq_from_expr, + emitter::emit_program_for_select, + expr::{unwrap_parens, walk_expr_mut, WalkControl}, + optimizer::optimize_select_plan, + plan::{ + ColumnUsedMask, NonFromClauseSubquery, OuterQueryReference, Plan, SubqueryState, + WhereTerm, + }, + select::prepare_select_plan, + }, + vdbe::{ + builder::{CursorType, ProgramBuilder}, + insn::Insn, + }, + Connection, QueryMode, Result, }; use super::{ @@ -11,9 +29,326 @@ use super::{ plan::{Operation, QueryDestination, Search, SelectPlan, TableReferences}, }; +/// Compute query plans for subqueries in the WHERE clause. +/// The AST expression containing the subquery ([ast::Expr::Exists], [ast::Expr::Subquery], [ast::Expr::InSelect]) is replaced with a [ast::Expr::SubqueryResult] expression. +/// The [ast::Expr::SubqueryResult] expression contains the subquery ID, the left-hand side expression (only applicable to IN subqueries), the NOT IN flag (only applicable to IN subqueries), and the subquery type. +/// The computed plans are stored in the [NonFromClauseSubquery] structs on the [SelectPlan], and evaluated at the appropriate time during the translation of the main query. +/// The appropriate time is determined by whether the subquery is correlated or uncorrelated; +/// if it is uncorrelated, it can be evaluated as early as possible, but if it is correlated, it must be evaluated after all of its dependencies from the +/// outer query are 'in scope', i.e. their cursors are open and rewound. +pub fn plan_subqueries_from_where_clause( + program: &mut ProgramBuilder, + out_subqueries: &mut Vec, + referenced_tables: &mut TableReferences, + resolver: &Resolver, + where_terms: &mut [WhereTerm], + connection: &Arc, +) -> Result<()> { + // A WHERE clause subquery can reference columns from the outer query, + // including nested cases where a subquery inside a subquery references columns from its parent's parent + // and so on. + let get_outer_query_refs = |referenced_tables: &TableReferences| { + referenced_tables + .joined_tables() + .iter() + .map(|t| OuterQueryReference { + table: t.table.clone(), + identifier: t.identifier.clone(), + internal_id: t.internal_id, + col_used_mask: ColumnUsedMask::default(), + }) + .chain( + referenced_tables + .outer_query_refs() + .iter() + .map(|t| OuterQueryReference { + table: t.table.clone(), + identifier: t.identifier.clone(), + internal_id: t.internal_id, + col_used_mask: ColumnUsedMask::default(), + }), + ) + .collect::>() + }; + + // Walk the WHERE clause and replace subqueries with [ast::Expr::SubqueryResult] expressions. + for where_term in where_terms.iter_mut() { + walk_expr_mut( + &mut where_term.expr, + &mut |expr: &mut ast::Expr| -> Result { + match expr { + ast::Expr::Exists(_) => { + let subquery_id = program.table_reference_counter.next(); + let outer_query_refs = get_outer_query_refs(referenced_tables); + + let result_reg = program.alloc_register(); + let subquery_type = SubqueryType::Exists { result_reg }; + let result_expr = ast::Expr::SubqueryResult { + subquery_id, + lhs: None, + not_in: false, + query_type: subquery_type.clone(), + }; + let ast::Expr::Exists(subselect) = std::mem::replace(expr, result_expr) + else { + unreachable!(); + }; + + let plan = prepare_select_plan( + subselect, + resolver, + program, + &outer_query_refs, + QueryDestination::ExistsSubqueryResult { result_reg }, + connection, + )?; + let Plan::Select(mut plan) = plan else { + crate::bail_parse_error!( + "compound SELECT queries not supported yet in WHERE clause subqueries" + ); + }; + optimize_select_plan(&mut plan, resolver.schema)?; + // EXISTS subqueries are satisfied after at most 1 row has been returned. + plan.limit = Some(Box::new(ast::Expr::Literal(ast::Literal::Numeric( + "1".to_string(), + )))); + let correlated = plan.is_correlated(); + out_subqueries.push(NonFromClauseSubquery { + internal_id: subquery_id, + query_type: subquery_type, + state: SubqueryState::Unevaluated { + plan: Some(Box::new(plan)), + }, + correlated, + }); + Ok(WalkControl::Continue) + } + ast::Expr::Subquery(_) => { + let subquery_id = program.table_reference_counter.next(); + let outer_query_refs = get_outer_query_refs(referenced_tables); + + let result_expr = ast::Expr::SubqueryResult { + subquery_id, + lhs: None, + not_in: false, + // Placeholder values because the number of columns returned is not known until the plan is prepared. + // These are replaced below after planning. + query_type: SubqueryType::RowValue { + result_reg_start: 0, + num_regs: 0, + }, + }; + let ast::Expr::Subquery(subselect) = std::mem::replace(expr, result_expr) + else { + unreachable!(); + }; + let plan = prepare_select_plan( + subselect, + resolver, + program, + &outer_query_refs, + QueryDestination::Unset, + connection, + )?; + let Plan::Select(mut plan) = plan else { + crate::bail_parse_error!( + "compound SELECT queries not supported yet in WHERE clause subqueries" + ); + }; + optimize_select_plan(&mut plan, resolver.schema)?; + let reg_count = plan.result_columns.len(); + let reg_start = program.alloc_registers(reg_count); + + plan.query_destination = QueryDestination::RowValueSubqueryResult { + result_reg_start: reg_start, + num_regs: reg_count, + }; + // RowValue subqueries are satisfied after at most 1 row has been returned, + // as they are used in comparisons with a scalar or a tuple of scalars like (x,y) = (SELECT ...) or x = (SELECT ...). + plan.limit = Some(Box::new(ast::Expr::Literal(ast::Literal::Numeric( + "1".to_string(), + )))); + + let ast::Expr::SubqueryResult { + subquery_id, + lhs: None, + not_in: false, + query_type: + SubqueryType::RowValue { + result_reg_start, + num_regs, + }, + } = &mut *expr + else { + unreachable!(); + }; + *result_reg_start = reg_start; + *num_regs = reg_count; + + let correlated = plan.is_correlated(); + + out_subqueries.push(NonFromClauseSubquery { + internal_id: *subquery_id, + query_type: SubqueryType::RowValue { + result_reg_start: reg_start, + num_regs: reg_count, + }, + state: SubqueryState::Unevaluated { + plan: Some(Box::new(plan)), + }, + correlated, + }); + Ok(WalkControl::Continue) + } + ast::Expr::InSelect { .. } => { + let subquery_id = program.table_reference_counter.next(); + let outer_query_refs = get_outer_query_refs(referenced_tables); + + let ast::Expr::InSelect { lhs, not, rhs } = + std::mem::replace(expr, ast::Expr::Literal(ast::Literal::Null)) + else { + unreachable!(); + }; + let plan = prepare_select_plan( + rhs, + resolver, + program, + &outer_query_refs, + QueryDestination::Unset, + connection, + )?; + let Plan::Select(mut plan) = plan else { + crate::bail_parse_error!( + "compound SELECT queries not supported yet in WHERE clause subqueries" + ); + }; + optimize_select_plan(&mut plan, resolver.schema)?; + // e.g. (x,y) IN (SELECT ...) + // or x IN (SELECT ...) + let lhs_column_count = match unwrap_parens(lhs.as_ref())? { + ast::Expr::Parenthesized(exprs) => exprs.len(), + _ => 1, + }; + if lhs_column_count != plan.result_columns.len() { + crate::bail_parse_error!( + "lhs of IN subquery must have the same number of columns as the subquery" + ); + } + + let mut columns = plan + .result_columns + .iter() + .enumerate() + .map(|(i, c)| IndexColumn { + name: c.name(&plan.table_references).unwrap_or("").to_string(), + order: SortOrder::Asc, + pos_in_table: i, + collation: None, + default: None, + }) + .collect::>(); + + for (i, column) in columns.iter_mut().enumerate() { + column.collation = get_collseq_from_expr( + &plan.result_columns[i].expr, + &plan.table_references, + )?; + } + + let ephemeral_index = Arc::new(Index { + columns, + name: format!("ephemeral_index_where_sub_{subquery_id}"), + table_name: String::new(), + ephemeral: true, + has_rowid: false, + root_page: 0, + unique: false, + where_clause: None, + }); + + let cursor_id = program + .alloc_cursor_id(CursorType::BTreeIndex(ephemeral_index.clone())); + + plan.query_destination = QueryDestination::EphemeralIndex { + cursor_id, + index: ephemeral_index.clone(), + is_delete: false, + }; + + *expr = ast::Expr::SubqueryResult { + subquery_id, + lhs: Some(lhs), + not_in: not, + query_type: SubqueryType::In { cursor_id }, + }; + + let correlated = plan.is_correlated(); + + out_subqueries.push(NonFromClauseSubquery { + internal_id: subquery_id, + query_type: SubqueryType::In { cursor_id }, + state: SubqueryState::Unevaluated { + plan: Some(Box::new(plan)), + }, + correlated, + }); + Ok(WalkControl::Continue) + } + _ => Ok(WalkControl::Continue), + } + }, + )?; + } + + update_column_used_masks(referenced_tables, out_subqueries); + + Ok(()) +} + +/// We make decisions about when to evaluate expressions or whether to use covering indexes based on +/// which columns of a table have been referenced. +/// Since subquery nesting is arbitrarily deep, a reference to a column must propagate recursively +/// up to the parent. Example: +/// +/// SELECT * FROM t WHERE EXISTS (SELECT * FROM u WHERE EXISTS (SELECT * FROM v WHERE v.foo = t.foo)) +/// +/// In this case, t.foo is referenced in the innermost subquery, so the top level query must be notified +/// that t.foo has been used. +fn update_column_used_masks( + table_refs: &mut TableReferences, + subqueries: &mut [NonFromClauseSubquery], +) { + for subquery in subqueries.iter_mut() { + let SubqueryState::Unevaluated { plan } = &mut subquery.state else { + panic!("subquery has already been evaluated"); + }; + let Some(child_plan) = plan.as_mut() else { + panic!("subquery has no plan"); + }; + + for child_outer_query_ref in child_plan + .table_references + .outer_query_refs() + .iter() + .filter(|t| t.is_used()) + { + if let Some(joined_table) = + table_refs.find_joined_table_by_internal_id_mut(child_outer_query_ref.internal_id) + { + joined_table.col_used_mask |= &child_outer_query_ref.col_used_mask; + } + if let Some(outer_query_ref) = table_refs + .find_outer_query_ref_by_internal_id_mut(child_outer_query_ref.internal_id) + { + outer_query_ref.col_used_mask |= &child_outer_query_ref.col_used_mask; + } + } + } +} + /// Emit the subqueries contained in the FROM clause. /// This is done first so the results can be read in the main query loop. -pub fn emit_subqueries( +pub fn emit_from_clause_subqueries( program: &mut ProgramBuilder, t_ctx: &mut TranslateCtx, tables: &mut TableReferences, @@ -60,7 +395,7 @@ pub fn emit_subqueries( if let Table::FromClauseSubquery(from_clause_subquery) = &mut table_reference.table { // Emit the subquery and get the start register of the result columns. let result_columns_start = - emit_subquery(program, &mut from_clause_subquery.plan, t_ctx)?; + emit_from_clause_subquery(program, &mut from_clause_subquery.plan, t_ctx)?; // Set the start register of the subquery's result columns. // This is done so that translate_expr() can read the result columns of the subquery, // as if it were reading from a regular table. @@ -72,9 +407,9 @@ pub fn emit_subqueries( Ok(()) } -/// Emit a subquery and return the start register of the result columns. +/// Emit a FROM clause subquery and return the start register of the result columns. /// This is done by emitting a coroutine that stores the result columns in sequential registers. -/// Each subquery in a FROM clause has its own separate SelectPlan which is wrapped in a coroutine. +/// Each FROM clause subquery has its own separate SelectPlan which is wrapped in a coroutine. /// /// The resulting bytecode from a subquery is mostly exactly the same as a regular query, except: /// - it ends in an EndCoroutine instead of a Halt. @@ -85,7 +420,7 @@ pub fn emit_subqueries( /// /// Since a subquery has its own SelectPlan, it can contain nested subqueries, /// which can contain even more nested subqueries, etc. -pub fn emit_subquery( +pub fn emit_from_clause_subquery( program: &mut ProgramBuilder, plan: &mut SelectPlan, t_ctx: &mut TranslateCtx, @@ -102,7 +437,7 @@ pub fn emit_subquery( // The parent query will use this register to reinitialize the coroutine when it needs to run multiple times. *coroutine_implementation_start = coroutine_implementation_start_offset; } - _ => unreachable!("emit_subquery called on non-subquery"), + _ => unreachable!("emit_from_clause_subquery called on non-subquery"), } let end_coroutine_label = program.allocate_label(); let mut metadata = TranslateCtx { @@ -137,3 +472,88 @@ pub fn emit_subquery( program.preassign_label_to_next_insn(subquery_body_end_label); Ok(result_column_start_reg) } + +/// Translate a subquery that is not part of the FROM clause. +/// If a subquery is uncorrelated (i.e. does not reference columns from the outer query), +/// it will be executed only once. +/// +/// If it is correlated (i.e. references columns from the outer query), +/// it will be executed for each row of the outer query. +/// +/// The result of the subquery is stored in: +/// +/// - a single register for EXISTS subqueries, +/// - a range of registers for RowValue subqueries, +/// - an ephemeral index for IN subqueries. +pub fn emit_non_from_clause_subquery( + program: &mut ProgramBuilder, + t_ctx: &mut TranslateCtx, + plan: SelectPlan, + query_type: &SubqueryType, + is_correlated: bool, +) -> Result<()> { + program.incr_nesting(); + + let label_skip_after_first_run = if !is_correlated { + let label = program.allocate_label(); + program.emit_insn(Insn::Once { + target_pc_when_reentered: label, + }); + Some(label) + } else { + None + }; + + match query_type { + SubqueryType::Exists { result_reg, .. } => { + let subroutine_reg = program.alloc_register(); + program.emit_insn(Insn::BeginSubrtn { + dest: subroutine_reg, + dest_end: None, + }); + program.emit_insn(Insn::Integer { + value: 0, + dest: *result_reg, + }); + emit_program_for_select(program, &t_ctx.resolver, plan)?; + program.emit_insn(Insn::Return { + return_reg: subroutine_reg, + can_fallthrough: true, + }); + } + SubqueryType::In { cursor_id } => { + program.emit_insn(Insn::OpenEphemeral { + cursor_id: *cursor_id, + is_table: false, + }); + emit_program_for_select(program, &t_ctx.resolver, plan)?; + } + SubqueryType::RowValue { + result_reg_start, + num_regs, + } => { + let subroutine_reg = program.alloc_register(); + program.emit_insn(Insn::BeginSubrtn { + dest: subroutine_reg, + dest_end: None, + }); + for result_reg in *result_reg_start..*result_reg_start + *num_regs { + program.emit_insn(Insn::Null { + dest: result_reg, + dest_end: None, + }); + } + emit_program_for_select(program, &t_ctx.resolver, plan)?; + program.emit_insn(Insn::Return { + return_reg: subroutine_reg, + can_fallthrough: true, + }); + } + } + if let Some(label) = label_skip_after_first_run { + program.preassign_label_to_next_insn(label); + } + + program.decr_nesting(); + Ok(()) +} diff --git a/core/translate/values.rs b/core/translate/values.rs index b74483f7f..aedca0a0b 100644 --- a/core/translate/values.rs +++ b/core/translate/values.rs @@ -24,6 +24,17 @@ pub fn emit_values( } QueryDestination::EphemeralIndex { .. } => emit_toplevel_values(program, plan, t_ctx)?, QueryDestination::EphemeralTable { .. } => unreachable!(), + QueryDestination::ExistsSubqueryResult { result_reg } => { + program.emit_insn(Insn::Integer { + value: 1, + dest: result_reg, + }); + result_reg + } + QueryDestination::RowValueSubqueryResult { .. } => { + emit_toplevel_values(program, plan, t_ctx)? + } + QueryDestination::Unset => unreachable!("Unset query destination should not be reached"), }; Ok(reg_result_cols_start) } @@ -168,6 +179,24 @@ fn emit_values_to_destination( emit_values_to_index(program, plan, start_reg, row_len); } QueryDestination::EphemeralTable { .. } => unreachable!(), + QueryDestination::ExistsSubqueryResult { result_reg } => { + program.emit_insn(Insn::Integer { + value: 1, + dest: *result_reg, + }); + } + QueryDestination::RowValueSubqueryResult { + result_reg_start, + num_regs, + } => { + assert!(row_len == *num_regs, "Row value subqueries should have the same number of result columns as the number of registers"); + program.emit_insn(Insn::Copy { + src_reg: start_reg, + dst_reg: *result_reg_start, + extra_amount: num_regs - 1, + }); + } + QueryDestination::Unset => unreachable!("Unset query destination should not be reached"), } } diff --git a/core/translate/window.rs b/core/translate/window.rs index 7ab80207d..ad62e31fc 100644 --- a/core/translate/window.rs +++ b/core/translate/window.rs @@ -208,6 +208,7 @@ fn prepare_window_subquery( distinctness: Distinctness::NonDistinct, values: vec![], window: None, + non_from_clause_subqueries: vec![], }; prepare_window_subquery( diff --git a/core/vdbe/builder.rs b/core/vdbe/builder.rs index 3ac364226..c3bff8423 100644 --- a/core/vdbe/builder.rs +++ b/core/vdbe/builder.rs @@ -842,6 +842,42 @@ impl ProgramBuilder { .unwrap_or_else(|| panic!("Cursor not found: {key:?}")) } + /// Resolve the first allocated index cursor for a given table reference. + /// This method exists due to a limitation of our translation system where + /// a subquery that references an outer query table cannot know whether a + /// table cursor, index cursor, or both were opened for that table reference. + /// Hence: currently we first try to resolve a table cursor, and if that fails, + /// we resolve an index cursor via this method. + pub fn resolve_any_index_cursor_id_for_table(&self, table_ref_id: TableInternalId) -> CursorID { + self.cursor_ref + .iter() + .position(|(k, _)| { + k.as_ref() + .is_some_and(|k| k.table_reference_id == table_ref_id && k.index.is_some()) + }) + .unwrap_or_else(|| panic!("No index cursor found for table {table_ref_id}")) + } + + /// Resolve the [Index] that a given cursor is associated with. + pub fn resolve_index_for_cursor_id(&self, cursor_id: CursorID) -> Arc { + let cursor_ref = &self + .cursor_ref + .get(cursor_id) + .unwrap_or_else(|| panic!("Cursor not found: {cursor_id}")) + .1; + let CursorType::BTreeIndex(index) = cursor_ref else { + panic!("Cursor is not an index: {cursor_id}"); + }; + index.clone() + } + + /// Get the [CursorType] of a given cursor. + pub fn get_cursor_type(&self, cursor_id: CursorID) -> Option<&CursorType> { + self.cursor_ref + .get(cursor_id) + .map(|(_, cursor_type)| cursor_type) + } + pub fn set_collation(&mut self, c: Option<(CollationSeq, bool)>) { self.collation = c } diff --git a/parser/src/ast.rs b/parser/src/ast.rs index 092292745..54829c663 100644 --- a/parser/src/ast.rs +++ b/parser/src/ast.rs @@ -476,6 +476,40 @@ pub enum Expr { Unary(UnaryOperator, Box), /// Parameters Variable(String), + /// Subqueries from e.g. the WHERE clause are planned separately + /// and their results will be placed in registers or in an ephemeral index + /// pointed to by this type. + SubqueryResult { + /// Internal "opaque" identifier for the subquery. When the translator encounters + /// a [Expr::SubqueryResult], it needs to know which subquery in the corresponding + /// query plan it references. + subquery_id: TableInternalId, + /// Left-hand side expression for IN subqueries. + /// This property plus 'not_in' are only relevant for IN subqueries, + /// and the reason they are not included in the [SubqueryType] enum is so that + /// we don't have to clone this Box. + lhs: Option>, + /// Whether the IN subquery is a NOT IN subquery. + not_in: bool, + /// The type of subquery. + query_type: SubqueryType, + }, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum SubqueryType { + /// EXISTS subquery; result is stored in a single register. + Exists { result_reg: usize }, + /// Row value subquery; result is stored in a range of registers. + /// Example: x = (SELECT ...) or (x, y) = (SELECT ...) + RowValue { + result_reg_start: usize, + num_regs: usize, + }, + /// IN subquery; result is stored in an ephemeral index. + /// Example: x IN (SELECT ...) + In { cursor_id: usize }, } impl Expr { diff --git a/parser/src/ast/fmt.rs b/parser/src/ast/fmt.rs index e36602bfd..8d997d746 100644 --- a/parser/src/ast/fmt.rs +++ b/parser/src/ast/fmt.rs @@ -707,6 +707,10 @@ impl ToTokens for Expr { context: &C, ) -> Result<(), S::Error> { match self { + Self::SubqueryResult { .. } => { + // FIXME: what to put here? This is a highly "artificial" AST node that has no meaning when stringified. + Ok(()) + } Self::Between { lhs, not, diff --git a/perf/tpc-h/queries/11.sql b/perf/tpc-h/queries/11.sql index f64407357..8335255fc 100644 --- a/perf/tpc-h/queries/11.sql +++ b/perf/tpc-h/queries/11.sql @@ -1,4 +1,4 @@ --- LIMBO_SKIP: subquery in where not supported +-- LIMBO_SKIP: subquery in HAVING clausenot supported select diff --git a/perf/tpc-h/queries/16.sql b/perf/tpc-h/queries/16.sql index 10095d24d..898497353 100644 --- a/perf/tpc-h/queries/16.sql +++ b/perf/tpc-h/queries/16.sql @@ -1,4 +1,4 @@ --- LIMBO_SKIP: subquery in where not supported + select diff --git a/perf/tpc-h/queries/17.sql b/perf/tpc-h/queries/17.sql index 2a9a24c5a..5d5a7e315 100644 --- a/perf/tpc-h/queries/17.sql +++ b/perf/tpc-h/queries/17.sql @@ -1,4 +1,4 @@ --- LIMBO_SKIP: subquery in where not supported +-- LIMBO_SKIP: query 17 is slow as hell in both Turso and Sqlite select diff --git a/perf/tpc-h/queries/18.sql b/perf/tpc-h/queries/18.sql index 158ac6b9b..deadd7690 100644 --- a/perf/tpc-h/queries/18.sql +++ b/perf/tpc-h/queries/18.sql @@ -1,4 +1,4 @@ --- LIMBO_SKIP: subquery in where not supported + select diff --git a/perf/tpc-h/queries/2.sql b/perf/tpc-h/queries/2.sql index 2eb53b209..3f4f34922 100644 --- a/perf/tpc-h/queries/2.sql +++ b/perf/tpc-h/queries/2.sql @@ -1,4 +1,4 @@ --- LIMBO_SKIP: subquery in where not supported + select diff --git a/perf/tpc-h/queries/20.sql b/perf/tpc-h/queries/20.sql index c7fe1545d..58aacd056 100644 --- a/perf/tpc-h/queries/20.sql +++ b/perf/tpc-h/queries/20.sql @@ -1,4 +1,4 @@ --- LIMBO_SKIP: subquery in where not supported +-- LIMBO_SKIP: query 20 is slow as hell in both Turso and Sqlite select diff --git a/perf/tpc-h/queries/21.sql b/perf/tpc-h/queries/21.sql index 39e94a308..d616c00df 100644 --- a/perf/tpc-h/queries/21.sql +++ b/perf/tpc-h/queries/21.sql @@ -1,4 +1,4 @@ --- LIMBO_SKIP: subquery in where not supported + select diff --git a/perf/tpc-h/queries/22.sql b/perf/tpc-h/queries/22.sql index dc40c8b77..89278cc39 100644 --- a/perf/tpc-h/queries/22.sql +++ b/perf/tpc-h/queries/22.sql @@ -1,4 +1,4 @@ --- LIMBO_SKIP: subquery in where not supported +-- LIMBO_SKIP: query 22 is slow as hell in both Turso and Sqlite select diff --git a/perf/tpc-h/queries/4.sql b/perf/tpc-h/queries/4.sql index b461356fe..7d8c06d9f 100644 --- a/perf/tpc-h/queries/4.sql +++ b/perf/tpc-h/queries/4.sql @@ -1,4 +1,4 @@ --- LIMBO_SKIP: subquery in where not supported + select diff --git a/testing/subquery.test b/testing/subquery.test index 3a909df34..2848c64e9 100644 --- a/testing/subquery.test +++ b/testing/subquery.test @@ -446,4 +446,196 @@ do_execsql_test_on_specific_db {:memory:} subquery-cte-available-in-arbitrary-de from cte join cte3 ) join cte2 ); -} {112} \ No newline at end of file +} {112} + +# WHERE CLAUSE SUBQUERIES + +# EXISTS/NOT EXISTS + +do_execsql_test_on_specific_db {:memory:} subquery-exists-basic { + create table test(a); + insert into test values (1); + select * from test where exists (select 0); + select * from test where not exists (select 0 where false); +} {1 +1} + +# Trivial example: get all users if there are any products +do_execsql_test_on_specific_db {:memory:} subquery-exists-uncorrelated { + create table products(id, name, price); + create table users(id, name); + insert into products values (1, 'hat', 50), (2, 'cap', 75), (3, 'shirt', 100); + insert into users values (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'); + + select * from users where exists (select 1 from products); +} {1|Alice +2|Bob +3|Charlie} + +# Get all products when there are no users +do_execsql_test_on_specific_db {:memory:} subquery-not-exists-uncorrelated { + create table products(id, name, price); + create table users(id, name); + insert into products values (1, 'hat', 50), (2, 'cap', 75); + + select * from products where not exists (select 1 from users); +} {1|hat|50 +2|cap|75} + +# Get products that have a matching category +do_execsql_test_on_specific_db {:memory:} subquery-exists-correlated { + create table products(id, name, category_id); + create table categories(id, name); + insert into products values (1, 'hat', 10), (2, 'cap', 20), (3, 'shirt', 10); + insert into categories values (10, 'Clothing'), (30, 'Electronics'); + + select * from products p where exists ( + select * from categories c where c.id = p.category_id + ); +} {1|hat|10 +3|shirt|10} + +# Get users who have no orders +do_execsql_test_on_specific_db {:memory:} subquery-not-exists-correlated { + create table users(id, name, age); + create table orders(id, user_id, amount); + insert into users values (1, 'Alice', 25), (2, 'Bob', 30), (3, 'Charlie', 35); + insert into orders values (1, 1, 100), (2, 3, 200); + + select * from users u where not exists ( + select * from orders o where o.user_id = u.id + ); +} {2|Bob|30} + +# Get products that belong to the 'Clothing' category +do_execsql_test_on_specific_db {:memory:} subquery-exists-with-conditions { + create table categories(id, name); + create table products(id, name, price, category_id); + insert into categories values (1, 'Clothing'), (2, 'Electronics'); + insert into products values (1, 'hat', 50, 1), (2, 'cap', 25, 1), (3, 'macbook', 75, 2); + + select * from products p where exists ( + select * from categories c + where c.id = p.category_id and c.name = 'Clothing' + ); +} {1|hat|50|1 +2|cap|25|1} + +# Get users who have products with high-rated reviews +do_execsql_test_on_specific_db {:memory:} subquery-nested-exists { + create table users(id, name); + create table products(id, name, user_id); + create table reviews(id, product_id, rating); + insert into users values (1, 'Alice'), (2, 'Bob'); + insert into products values (1, 'hat', 1), (2, 'cap', 2); + insert into reviews values (1, 1, 5); + + select * from users u where exists ( + select * from products p where p.user_id = u.id and exists ( + select 1 from reviews r where r.product_id = p.id and r.rating >= 4 + ) + ); +} {1|Alice} + +# Get products that have tags (none exist, so empty result) +do_execsql_test_on_specific_db {:memory:} subquery-exists-empty-result { + create table products(id, name); + create table tags(product_id, tag); + insert into products values (1, 'hat'), (2, 'cap'); + + select * from products p where exists ( + select * from tags t where t.product_id = p.id + ); +} {} + +# Get users whose emails are not in the blocked list +do_execsql_test_on_specific_db {:memory:} subquery-not-exists-all-match { + create table users(id, email); + create table blocked_emails(email); + insert into users values (1, 'alice@test.com'), (2, 'bob@test.com'); + insert into blocked_emails values ('spam@test.com'); + + select * from users u where not exists ( + select * from blocked_emails b where b.email = u.email + ); +} {1|alice@test.com +2|bob@test.com} + +# SCALAR SUBQUERIES + +# Get products with price higher than average price +do_execsql_test_on_specific_db {:memory:} subquery-scalar-comparison { + create table products(id, name, price); + insert into products values (1, 'hat', 50), (2, 'cap', 25), (3, 'jacket', 75); + + select * from products where price >= ( + select avg(price) from products + ); +} {1|hat|50 +3|jacket|75} + +# Get users with the highest score +do_execsql_test_on_specific_db {:memory:} subquery-scalar-max { + create table users(id, name, score); + insert into users values (1, 'Alice', 85), (2, 'Bob', 92), (3, 'Charlie', 92); + + select * from users where score = ( + select max(score) from users + ); +} {2|Bob|92 +3|Charlie|92} + +# (x,y) IN SUBQUERIES + +# Get products from specific categories using IN +do_execsql_test_on_specific_db {:memory:} subquery-in-single-column { + create table products(id, name, category_id); + create table categories(id, name); + insert into categories values (1, 'Clothing'), (2, 'Electronics'), (3, 'Books'); + insert into products values (1, 'hat', 1), (2, 'laptop', 2), (3, 'novel', 3), (4, 'cap', 1); + + select * from products where category_id in ( + select id from categories where name in ('Clothing', 'Electronics') + ); +} {1|hat|1 +2|laptop|2 +4|cap|1} + +# Get products NOT in discontinued categories +do_execsql_test_on_specific_db {:memory:} subquery-not-in-single-column { + create table products(id, name, category_id); + create table discontinued_categories(category_id); + insert into products values (1, 'hat', 1), (2, 'laptop', 2), (3, 'book', 3); + insert into discontinued_categories values (2); + + select * from products where category_id not in ( + select category_id from discontinued_categories + ); +} {1|hat|1 +3|book|3} + +# Get order items matching specific product-quantity combinations +do_execsql_test_on_specific_db {:memory:} subquery-in-multiple-columns { + create table order_items(order_id, product_id, quantity); + create table special_offers(product_id, min_quantity); + insert into order_items values (1, 10, 5), (2, 20, 3), (3, 10, 2), (4, 30, 1); + insert into special_offers values (10, 5), (20, 3); + + select * from order_items where (product_id, quantity) in ( + select product_id, min_quantity from special_offers + ); +} {1|10|5 +2|20|3} + +# Get users whose (name, age) combination is not in the restricted list +do_execsql_test_on_specific_db {:memory:} subquery-not-in-multiple-columns { + create table users(id, name, age); + create table restricted_profiles(name, age); + insert into users values (1, 'Alice', 25), (2, 'Bob', 30), (3, 'Charlie', 25); + insert into restricted_profiles values ('Bob', 30); + + select * from users where (name, age) not in ( + select name, age from restricted_profiles + ); +} {1|Alice|25 +3|Charlie|25} \ No newline at end of file diff --git a/tests/fuzz/mod.rs b/tests/fuzz/mod.rs index 9f06f20fc..9cfb06ea7 100644 --- a/tests/fuzz/mod.rs +++ b/tests/fuzz/mod.rs @@ -4203,4 +4203,685 @@ mod fuzz_tests { Ok(()) } + + #[test] + /// Tests for correlated and uncorrelated subqueries occurring in the WHERE clause of a SELECT statement. + pub fn table_subquery_fuzz() { + let _ = env_logger::try_init(); + let (mut rng, seed) = rng_from_time_or_env(); + log::info!("table_subquery_fuzz seed: {seed}"); + + // Constants for fuzzing parameters + const NUM_FUZZ_ITERATIONS: usize = 2000; + const MAX_ROWS_PER_TABLE: usize = 100; + const MIN_ROWS_PER_TABLE: usize = 5; + const MAX_SUBQUERY_DEPTH: usize = 4; + + let db = TempDatabase::new_empty(true); + let limbo_conn = db.connect_limbo(); + let sqlite_conn = rusqlite::Connection::open_in_memory().unwrap(); + + let mut debug_ddl_dml_string = String::new(); + + // Create 3 simple tables + let table_schemas = [ + "CREATE TABLE t1 (id INT PRIMARY KEY, value1 INTEGER, value2 INTEGER);", + "CREATE TABLE t2 (id INT PRIMARY KEY, ref_id INTEGER, data INTEGER);", + "CREATE TABLE t3 (id INT PRIMARY KEY, category INTEGER, amount INTEGER);", + ]; + + for schema in &table_schemas { + debug_ddl_dml_string.push_str(schema); + limbo_exec_rows(&db, &limbo_conn, schema); + sqlite_exec_rows(&sqlite_conn, schema); + } + + // Populate tables with random data + for table_num in 1..=3 { + let num_rows = rng.random_range(MIN_ROWS_PER_TABLE..=MAX_ROWS_PER_TABLE); + for i in 1..=num_rows { + let insert_sql = match table_num { + 1 => format!( + "INSERT INTO t1 VALUES ({}, {}, {});", + i, + rng.random_range(-10..20), + rng.random_range(-5..15) + ), + 2 => format!( + "INSERT INTO t2 VALUES ({}, {}, {});", + i, + rng.random_range(1..=num_rows), // ref_id references t1 approximately + rng.random_range(-5..10) + ), + 3 => format!( + "INSERT INTO t3 VALUES ({}, {}, {});", + i, + rng.random_range(1..5), // category 1-4 + rng.random_range(0..100) + ), + _ => unreachable!(), + }; + log::debug!("{insert_sql}"); + debug_ddl_dml_string.push_str(&insert_sql); + limbo_exec_rows(&db, &limbo_conn, &insert_sql); + sqlite_exec_rows(&sqlite_conn, &insert_sql); + } + } + + log::info!("DDL/DML to reproduce manually:\n{debug_ddl_dml_string}"); + + // Helper function to generate random simple WHERE condition + let gen_simple_where = |rng: &mut ChaCha8Rng, table: &str| -> String { + let conditions = match table { + "t1" => vec![ + format!("value1 > {}", rng.random_range(-5..15)), + format!("value2 < {}", rng.random_range(-5..15)), + format!("id <= {}", rng.random_range(1..20)), + "value1 IS NOT NULL".to_string(), + ], + "t2" => vec![ + format!("data > {}", rng.random_range(-3..8)), + format!("ref_id = {}", rng.random_range(1..15)), + format!("id < {}", rng.random_range(5..25)), + "data IS NOT NULL".to_string(), + ], + "t3" => vec![ + format!("category = {}", rng.random_range(1..5)), + format!("amount > {}", rng.random_range(0..50)), + format!("id <= {}", rng.random_range(1..20)), + "amount IS NOT NULL".to_string(), + ], + _ => vec!["1=1".to_string()], + }; + conditions[rng.random_range(0..conditions.len())].clone() + }; + + // Helper function to generate simple subquery + fn gen_subquery(rng: &mut ChaCha8Rng, depth: usize, outer_table: Option<&str>) -> String { + if depth > MAX_SUBQUERY_DEPTH { + return "SELECT 1".to_string(); + } + + let gen_simple_where_inner = |rng: &mut ChaCha8Rng, table: &str| -> String { + let conditions = match table { + "t1" => vec![ + format!("value1 > {}", rng.random_range(-5..15)), + format!("value2 < {}", rng.random_range(-5..15)), + format!("id <= {}", rng.random_range(1..20)), + "value1 IS NOT NULL".to_string(), + ], + "t2" => vec![ + format!("data > {}", rng.random_range(-3..8)), + format!("ref_id = {}", rng.random_range(1..15)), + format!("id < {}", rng.random_range(5..25)), + "data IS NOT NULL".to_string(), + ], + "t3" => vec![ + format!("category = {}", rng.random_range(1..5)), + format!("amount > {}", rng.random_range(0..50)), + format!("id <= {}", rng.random_range(1..20)), + "amount IS NOT NULL".to_string(), + ], + _ => vec!["1=1".to_string()], + }; + conditions[rng.random_range(0..conditions.len())].clone() + }; + + // Helper function to generate correlated WHERE conditions + let gen_correlated_where = + |rng: &mut ChaCha8Rng, inner_table: &str, outer_table: &str| -> String { + match (outer_table, inner_table) { + ("t1", "t2") => { + // t2.ref_id relates to t1.id + let conditions = [ + format!("{inner_table}.ref_id = {outer_table}.id"), + format!("{inner_table}.id < {outer_table}.value1"), + format!("{inner_table}.data > {outer_table}.value2"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t1", "t3") => { + let conditions = [ + format!("{inner_table}.id = {outer_table}.id"), + format!("{inner_table}.category < {outer_table}.value1"), + format!("{inner_table}.amount > {outer_table}.value2"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t2", "t1") => { + let conditions = [ + format!("{inner_table}.id = {outer_table}.ref_id"), + format!("{inner_table}.value1 > {outer_table}.data"), + format!("{inner_table}.value2 < {outer_table}.id"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t2", "t3") => { + let conditions = [ + format!("{inner_table}.id = {outer_table}.id"), + format!("{inner_table}.category = {outer_table}.ref_id"), + format!("{inner_table}.amount > {outer_table}.data"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t3", "t1") => { + let conditions = [ + format!("{inner_table}.id = {outer_table}.id"), + format!("{inner_table}.value1 > {outer_table}.category"), + format!("{inner_table}.value2 < {outer_table}.amount"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t3", "t2") => { + let conditions = [ + format!("{inner_table}.id = {outer_table}.id"), + format!("{inner_table}.ref_id = {outer_table}.category"), + format!("{inner_table}.data < {outer_table}.amount"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + _ => "1=1".to_string(), + } + }; + + let subquery_types = [ + // Simple scalar subqueries - single column only for safe nesting + "SELECT MAX(amount) FROM t3".to_string(), + "SELECT MIN(value1) FROM t1".to_string(), + "SELECT COUNT(*) FROM t2".to_string(), + "SELECT AVG(amount) FROM t3".to_string(), + "SELECT id FROM t1".to_string(), + "SELECT ref_id FROM t2".to_string(), + "SELECT category FROM t3".to_string(), + // Subqueries with WHERE - single column only + format!( + "SELECT MAX(amount) FROM t3 WHERE {}", + gen_simple_where_inner(rng, "t3") + ), + format!( + "SELECT value1 FROM t1 WHERE {}", + gen_simple_where_inner(rng, "t1") + ), + format!( + "SELECT ref_id FROM t2 WHERE {}", + gen_simple_where_inner(rng, "t2") + ), + ]; + + let base_query = &subquery_types[rng.random_range(0..subquery_types.len())]; + + // Add correlated conditions if outer_table is provided and sometimes + let final_query = if let Some(outer_table) = outer_table { + if rng.random_bool(0.4) { + // 40% chance for correlation + // Extract the inner table from the base query + let inner_table = if base_query.contains("FROM t1") { + "t1" + } else if base_query.contains("FROM t2") { + "t2" + } else if base_query.contains("FROM t3") { + "t3" + } else { + return base_query.clone(); // fallback + }; + + let correlated_condition = gen_correlated_where(rng, inner_table, outer_table); + + if base_query.contains("WHERE") { + format!("{base_query} AND {correlated_condition}") + } else { + format!("{base_query} WHERE {correlated_condition}") + } + } else { + base_query.clone() + } + } else { + base_query.clone() + }; + + // Sometimes add nesting - but use scalar subquery for nesting to avoid column count issues + if depth < 1 && rng.random_bool(0.2) { + // Reduced probability and depth + let nested = gen_scalar_subquery(rng, 0, outer_table); + if final_query.contains("WHERE") { + format!("{final_query} AND id IN ({nested})") + } else { + format!("{final_query} WHERE id IN ({nested})") + } + } else { + final_query + } + } + + // Helper function to generate scalar subquery (single column only) + fn gen_scalar_subquery( + rng: &mut ChaCha8Rng, + depth: usize, + outer_table: Option<&str>, + ) -> String { + if depth > MAX_SUBQUERY_DEPTH { + // Reduced nesting depth + return "SELECT 1".to_string(); + } + + let gen_simple_where_inner = |rng: &mut ChaCha8Rng, table: &str| -> String { + let conditions = match table { + "t1" => vec![ + format!("value1 > {}", rng.random_range(-5..15)), + format!("value2 < {}", rng.random_range(-5..15)), + format!("id <= {}", rng.random_range(1..20)), + "value1 IS NOT NULL".to_string(), + ], + "t2" => vec![ + format!("data > {}", rng.random_range(-3..8)), + format!("ref_id = {}", rng.random_range(1..15)), + format!("id < {}", rng.random_range(5..25)), + "data IS NOT NULL".to_string(), + ], + "t3" => vec![ + format!("category = {}", rng.random_range(1..5)), + format!("amount > {}", rng.random_range(0..50)), + format!("id <= {}", rng.random_range(1..20)), + "amount IS NOT NULL".to_string(), + ], + _ => vec!["1=1".to_string()], + }; + conditions[rng.random_range(0..conditions.len())].clone() + }; + + // Helper function to generate correlated WHERE conditions + let gen_correlated_where = + |rng: &mut ChaCha8Rng, inner_table: &str, outer_table: &str| -> String { + match (outer_table, inner_table) { + ("t1", "t2") => { + // t2.ref_id relates to t1.id + let conditions = [ + format!("{inner_table}.ref_id = {outer_table}.id"), + format!("{inner_table}.id < {outer_table}.value1"), + format!("{inner_table}.data > {outer_table}.value2"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t1", "t3") => { + let conditions = [ + format!("{inner_table}.id = {outer_table}.id"), + format!("{inner_table}.category < {outer_table}.value1"), + format!("{inner_table}.amount > {outer_table}.value2"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t2", "t1") => { + let conditions = [ + format!("{inner_table}.id = {outer_table}.ref_id"), + format!("{inner_table}.value1 > {outer_table}.data"), + format!("{inner_table}.value2 < {outer_table}.id"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t2", "t3") => { + let conditions = [ + format!("{inner_table}.id = {outer_table}.id"), + format!("{inner_table}.category = {outer_table}.ref_id"), + format!("{inner_table}.amount > {outer_table}.data"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t3", "t1") => { + let conditions = [ + format!("{inner_table}.id = {outer_table}.id"), + format!("{inner_table}.value1 > {outer_table}.category"), + format!("{inner_table}.value2 < {outer_table}.amount"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t3", "t2") => { + let conditions = [ + format!("{inner_table}.id = {outer_table}.id"), + format!("{inner_table}.ref_id = {outer_table}.category"), + format!("{inner_table}.data < {outer_table}.amount"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + _ => "1=1".to_string(), + } + }; + + let scalar_subquery_types = [ + // Only scalar subqueries - single column only + "SELECT MAX(amount) FROM t3".to_string(), + "SELECT MIN(value1) FROM t1".to_string(), + "SELECT COUNT(*) FROM t2".to_string(), + "SELECT AVG(amount) FROM t3".to_string(), + "SELECT id FROM t1".to_string(), + "SELECT ref_id FROM t2".to_string(), + "SELECT category FROM t3".to_string(), + // Scalar subqueries with WHERE + format!( + "SELECT MAX(amount) FROM t3 WHERE {}", + gen_simple_where_inner(rng, "t3") + ), + format!( + "SELECT value1 FROM t1 WHERE {}", + gen_simple_where_inner(rng, "t1") + ), + format!( + "SELECT ref_id FROM t2 WHERE {}", + gen_simple_where_inner(rng, "t2") + ), + ]; + + let base_query = + &scalar_subquery_types[rng.random_range(0..scalar_subquery_types.len())]; + + // Add correlated conditions if outer_table is provided and sometimes + let final_query = if let Some(outer_table) = outer_table { + if rng.random_bool(0.4) { + // 40% chance for correlation + // Extract the inner table from the base query + let inner_table = if base_query.contains("FROM t1") { + "t1" + } else if base_query.contains("FROM t2") { + "t2" + } else if base_query.contains("FROM t3") { + "t3" + } else { + return base_query.clone(); // fallback + }; + + let correlated_condition = gen_correlated_where(rng, inner_table, outer_table); + + if base_query.contains("WHERE") { + format!("{base_query} AND {correlated_condition}") + } else { + format!("{base_query} WHERE {correlated_condition}") + } + } else { + base_query.clone() + } + } else { + base_query.clone() + }; + + // Sometimes add nesting + if depth < 1 && rng.random_bool(0.2) { + // Reduced probability and depth + let nested = gen_scalar_subquery(rng, depth + 1, outer_table); + if final_query.contains("WHERE") { + format!("{final_query} AND id IN ({nested})") + } else { + format!("{final_query} WHERE id IN ({nested})") + } + } else { + final_query + } + } + + for iter_num in 0..NUM_FUZZ_ITERATIONS { + let main_table = ["t1", "t2", "t3"][rng.random_range(0..3)]; + + let query_type = rng.random_range(0..6); // Increased from 4 to 6 for new correlated query types + let query = match query_type { + 0 => { + // Comparison subquery: WHERE column (SELECT ...) + let column = match main_table { + "t1" => ["value1", "value2", "id"][rng.random_range(0..3)], + "t2" => ["data", "ref_id", "id"][rng.random_range(0..3)], + "t3" => ["amount", "category", "id"][rng.random_range(0..3)], + _ => "id", + }; + let op = [">", "<", ">=", "<=", "=", "<>"][rng.random_range(0..6)]; + let subquery = gen_scalar_subquery(&mut rng, 0, Some(main_table)); + format!("SELECT * FROM {main_table} WHERE {column} {op} ({subquery})",) + } + 1 => { + // EXISTS subquery: WHERE [NOT] EXISTS (SELECT ...) + let not_exists = if rng.random_bool(0.3) { "NOT " } else { "" }; + let subquery = gen_subquery(&mut rng, 0, Some(main_table)); + format!("SELECT * FROM {main_table} WHERE {not_exists}EXISTS ({subquery})",) + } + 2 => { + // IN subquery with single column: WHERE column [NOT] IN (SELECT ...) + let not_in = if rng.random_bool(0.3) { "NOT " } else { "" }; + let column = match main_table { + "t1" => ["value1", "value2", "id"][rng.random_range(0..3)], + "t2" => ["data", "ref_id", "id"][rng.random_range(0..3)], + "t3" => ["amount", "category", "id"][rng.random_range(0..3)], + _ => "id", + }; + let subquery = gen_scalar_subquery(&mut rng, 0, Some(main_table)); + format!("SELECT * FROM {main_table} WHERE {column} {not_in}IN ({subquery})",) + } + 3 => { + // IN subquery with tuple: WHERE (col1, col2) [NOT] IN (SELECT col1, col2 ...) + let not_in = if rng.random_bool(0.3) { "NOT " } else { "" }; + let (columns, sub_columns) = match main_table { + "t1" => { + if rng.random_bool(0.5) { + ("(id, value1)", "SELECT id, value1 FROM t1") + } else { + ("id", "SELECT id FROM t1") + } + } + "t2" => { + if rng.random_bool(0.5) { + ("(ref_id, data)", "SELECT ref_id, data FROM t2") + } else { + ("ref_id", "SELECT ref_id FROM t2") + } + } + "t3" => { + if rng.random_bool(0.5) { + ("(id, category)", "SELECT id, category FROM t3") + } else { + ("id", "SELECT id FROM t3") + } + } + _ => ("id", "SELECT id FROM t1"), + }; + let subquery = if rng.random_bool(0.5) { + sub_columns.to_string() + } else { + let base = sub_columns; + let table_for_where = base.split("FROM ").nth(1).unwrap_or("t1"); + format!( + "{} WHERE {}", + base, + gen_simple_where(&mut rng, table_for_where) + ) + }; + format!("SELECT * FROM {main_table} WHERE {columns} {not_in}IN ({subquery})",) + } + 4 => { + // Correlated EXISTS subquery: WHERE [NOT] EXISTS (SELECT ... WHERE correlation) + let not_exists = if rng.random_bool(0.3) { "NOT " } else { "" }; + + // Choose a different table for the subquery to ensure correlation is meaningful + let inner_tables = match main_table { + "t1" => ["t2", "t3"], + "t2" => ["t1", "t3"], + "t3" => ["t1", "t2"], + _ => ["t1", "t2"], + }; + let inner_table = inner_tables[rng.random_range(0..inner_tables.len())]; + + // Generate correlated condition + let correlated_condition = match (main_table, inner_table) { + ("t1", "t2") => { + let conditions = [ + format!("{inner_table}.ref_id = {main_table}.id"), + format!("{inner_table}.id < {main_table}.value1"), + format!("{inner_table}.data > {main_table}.value2"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t1", "t3") => { + let conditions = [ + format!("{inner_table}.id = {main_table}.id"), + format!("{inner_table}.category < {main_table}.value1"), + format!("{inner_table}.amount > {main_table}.value2"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t2", "t1") => { + let conditions = [ + format!("{inner_table}.id = {main_table}.ref_id"), + format!("{inner_table}.value1 > {main_table}.data"), + format!("{inner_table}.value2 < {main_table}.id"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t2", "t3") => { + let conditions = [ + format!("{inner_table}.id = {main_table}.id"), + format!("{inner_table}.category = {main_table}.ref_id"), + format!("{inner_table}.amount > {main_table}.data"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t3", "t1") => { + let conditions = [ + format!("{inner_table}.id = {main_table}.id"), + format!("{inner_table}.value1 > {main_table}.category"), + format!("{inner_table}.value2 < {main_table}.amount"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t3", "t2") => { + let conditions = [ + format!("{inner_table}.id = {main_table}.id"), + format!("{inner_table}.ref_id = {main_table}.category"), + format!("{inner_table}.data < {main_table}.amount"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + _ => "1=1".to_string(), + }; + + format!( + "SELECT * FROM {main_table} WHERE {not_exists}EXISTS (SELECT 1 FROM {inner_table} WHERE {correlated_condition})", + ) + } + 5 => { + // Correlated comparison subquery: WHERE column (SELECT ... WHERE correlation) + let column = match main_table { + "t1" => ["value1", "value2", "id"][rng.random_range(0..3)], + "t2" => ["data", "ref_id", "id"][rng.random_range(0..3)], + "t3" => ["amount", "category", "id"][rng.random_range(0..3)], + _ => "id", + }; + let op = [">", "<", ">=", "<=", "=", "<>"][rng.random_range(0..6)]; + + // Choose a different table for the subquery + let inner_tables = match main_table { + "t1" => ["t2", "t3"], + "t2" => ["t1", "t3"], + "t3" => ["t1", "t2"], + _ => ["t1", "t2"], + }; + let inner_table = inner_tables[rng.random_range(0..inner_tables.len())]; + + // Choose what to select from inner table + let select_column = match inner_table { + "t1" => ["value1", "value2", "id"][rng.random_range(0..3)], + "t2" => ["data", "ref_id", "id"][rng.random_range(0..3)], + "t3" => ["amount", "category", "id"][rng.random_range(0..3)], + _ => "id", + }; + + // Generate correlated condition + let correlated_condition = match (main_table, inner_table) { + ("t1", "t2") => { + let conditions = [ + format!("{inner_table}.ref_id = {main_table}.id"), + format!("{inner_table}.id < {main_table}.value1"), + format!("{inner_table}.data > {main_table}.value2"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t1", "t3") => { + let conditions = [ + format!("{inner_table}.id = {main_table}.id"), + format!("{inner_table}.category < {main_table}.value1"), + format!("{inner_table}.amount > {main_table}.value2"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t2", "t1") => { + let conditions = [ + format!("{inner_table}.id = {main_table}.ref_id"), + format!("{inner_table}.value1 > {main_table}.data"), + format!("{inner_table}.value2 < {main_table}.id"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t2", "t3") => { + let conditions = [ + format!("{inner_table}.id = {main_table}.id"), + format!("{inner_table}.category = {main_table}.ref_id"), + format!("{inner_table}.amount > {main_table}.data"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t3", "t1") => { + let conditions = [ + format!("{inner_table}.id = {main_table}.id"), + format!("{inner_table}.value1 > {main_table}.category"), + format!("{inner_table}.value2 < {main_table}.amount"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + ("t3", "t2") => { + let conditions = [ + format!("{inner_table}.id = {main_table}.id"), + format!("{inner_table}.ref_id = {main_table}.category"), + format!("{inner_table}.data < {main_table}.amount"), + ]; + conditions[rng.random_range(0..conditions.len())].clone() + } + _ => "1=1".to_string(), + }; + + format!( + "SELECT * FROM {main_table} WHERE {column} {op} (SELECT {select_column} FROM {inner_table} WHERE {correlated_condition})", + ) + } + _ => unreachable!(), + }; + + log::info!( + "Iteration {}/{NUM_FUZZ_ITERATIONS}: Query: {query}", + iter_num + 1, + ); + + let limbo_results = limbo_exec_rows(&db, &limbo_conn, &query); + let sqlite_results = sqlite_exec_rows(&sqlite_conn, &query); + + // Check if results match + if limbo_results.len() != sqlite_results.len() { + panic!( + "Row count mismatch for query: {}\nLimbo: {} rows, SQLite: {} rows\nLimbo: {:?}\nSQLite: {:?}\nSeed: {}\n\n DDL/DML to reproduce manually:\n{}", + query, limbo_results.len(), sqlite_results.len(), limbo_results, sqlite_results, seed, debug_ddl_dml_string + ); + } + + // Check if all rows match (order might be different) + // Since Value doesn't implement Ord, we'll check containment both ways + let all_limbo_in_sqlite = limbo_results.iter().all(|limbo_row| { + sqlite_results + .iter() + .any(|sqlite_row| limbo_row == sqlite_row) + }); + let all_sqlite_in_limbo = sqlite_results.iter().all(|sqlite_row| { + limbo_results + .iter() + .any(|limbo_row| sqlite_row == limbo_row) + }); + + if !all_limbo_in_sqlite || !all_sqlite_in_limbo { + panic!( + "Results mismatch for query: {query}\nLimbo: {limbo_results:?}\nSQLite: {sqlite_results:?}\nSeed: {seed}", + ); + } + } + } }