diff --git a/COMPAT.md b/COMPAT.md index e14c04005..57ce2dcd9 100644 --- a/COMPAT.md +++ b/COMPAT.md @@ -200,8 +200,9 @@ Feature support of [sqlite expr syntax](https://www.sqlite.org/lang_expr.html). | IS (NOT) | Yes | | | IS (NOT) DISTINCT FROM | Yes | | | (NOT) BETWEEN ... AND ... | Yes | Expression is rewritten in the optimizer | -| (NOT) IN (subquery) | No | | -| (NOT) EXISTS (subquery) | No | | +| (NOT) IN (SELECT...) | Yes | | +| (NOT) EXISTS (SELECT...) | Yes | | +| x (SELECT...)) | Partial | Only scalar subqueries supported, i.e. not (x,y) = (SELECT...) | CASE WHEN THEN ELSE END | Yes | | | RAISE | No | | diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index c8c99e9c4..1f61da880 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -35,8 +35,9 @@ use crate::translate::fkeys::{ emit_fk_delete_parent_existence_checks, emit_guarded_fk_decrement, emit_parent_key_change_checks, open_read_index, open_read_table, stabilize_new_row_for_fk, }; -use crate::translate::plan::{DeletePlan, JoinedTable, Plan, QueryDestination, Search}; +use crate::translate::plan::{DeletePlan, EvalAt, JoinedTable, Plan, QueryDestination, Search}; use crate::translate::planner::ROWID_STRS; +use crate::translate::subquery::emit_non_from_clause_subquery; use crate::translate::values::emit_values; use crate::translate::window::{emit_window_results, init_window, WindowMetadata}; use crate::util::{exprs_are_equivalent, normalize_ident}; @@ -278,6 +279,27 @@ pub fn emit_query<'a>( return Ok(reg_result_cols_start); } + // Evaluate uncorrelated subqueries as early as possible, because even LIMIT can reference a subquery. + for subquery in plan + .non_from_clause_subqueries + .iter_mut() + .filter(|s| !s.has_been_evaluated()) + { + let eval_at = subquery.get_eval_at(&plan.join_order)?; + if eval_at != EvalAt::BeforeLoop { + continue; + } + let plan = subquery.consume_plan(EvalAt::BeforeLoop); + + emit_non_from_clause_subquery( + program, + t_ctx, + *plan, + &subquery.query_type, + subquery.correlated, + )?; + } + // Emit FROM clause subqueries first so the results can be read in the main query loop. emit_from_clause_subqueries(program, t_ctx, &mut plan.table_references)?; diff --git a/core/translate/group_by.rs b/core/translate/group_by.rs index 19f91c5bc..aa5e5dcdc 100644 --- a/core/translate/group_by.rs +++ b/core/translate/group_by.rs @@ -321,6 +321,8 @@ fn collect_non_aggregate_expressions<'a>( Ok(()) } +/// Collects columns from different parts of a SELECT that are needed for +/// GROUP BY. fn collect_result_columns<'a>( root_expr: &'a ast::Expr, plan: &SelectPlan, @@ -337,6 +339,29 @@ fn collect_result_columns<'a>( result_columns.push(expr); } } + // SubqueryResult is an exception because we can't "extract" columns from it + // unlike other expressions like function calls or direct column references, + // so we must add it so that the subquery result gets collected to the GROUP BY + // columns. + // + // However, if the subquery is of the form: 'aggregate_result IN (SELECT...)', we need to skip it because the aggregation + // is done later. + ast::Expr::SubqueryResult { lhs, .. } => { + if let Some(ref lhs) = lhs { + let mut lhs_contains_agg = false; + walk_expr(lhs, &mut |expr: &ast::Expr| -> Result { + if plan.aggregates.iter().any(|a| a.original_expr == *expr) { + lhs_contains_agg = true; + return Ok(WalkControl::SkipChildren); + } + Ok(WalkControl::Continue) + })?; + if lhs_contains_agg { + return Ok(WalkControl::SkipChildren); + } + } + result_columns.push(expr); + } _ => { if plan.aggregates.iter().any(|a| a.original_expr == *expr) { return Ok(WalkControl::SkipChildren); diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index 8cdedf6f8..619a64fd4 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -418,22 +418,6 @@ pub fn init_loop( } } - for subquery in subqueries.iter_mut().filter(|s| !s.has_been_evaluated()) { - let eval_at = subquery.get_eval_at(join_order)?; - if eval_at != EvalAt::BeforeLoop { - continue; - } - let plan = subquery.consume_plan(EvalAt::BeforeLoop); - - emit_non_from_clause_subquery( - program, - t_ctx, - *plan, - &subquery.query_type, - subquery.correlated, - )?; - } - for cond in where_clause .iter() .filter(|c| c.should_eval_before_loop(join_order, subqueries)) diff --git a/core/translate/plan.rs b/core/translate/plan.rs index 32daaa8b2..2b9f98e59 100644 --- a/core/translate/plan.rs +++ b/core/translate/plan.rs @@ -1390,6 +1390,16 @@ pub enum SubqueryState { Evaluated { evaluated_at: EvalAt }, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SubqueryPosition { + ResultColumn, + Where, + GroupBy, + Having, + OrderBy, + LimitOffset, +} + #[derive(Debug, Clone)] /// A subquery that is not part of the `FROM` clause. /// This is used for subqueries in the WHERE clause, HAVING clause, ORDER BY clause, LIMIT clause, OFFSET clause, etc. diff --git a/core/translate/select.rs b/core/translate/select.rs index 7c9d29ea5..3ec65af30 100644 --- a/core/translate/select.rs +++ b/core/translate/select.rs @@ -13,7 +13,7 @@ use crate::translate::planner::{ break_predicate_at_and_boundaries, parse_from, parse_limit, parse_where, resolve_window_and_aggregate_functions, }; -use crate::translate::subquery::plan_subqueries_from_where_clause; +use crate::translate::subquery::plan_subqueries_from_select_plan; use crate::translate::window::plan_windows; use crate::util::normalize_ident; use crate::vdbe::builder::ProgramBuilderOpts; @@ -504,24 +504,7 @@ fn prepare_one_select_plan( )?; } - // TODO: support subqueries in the following positions: - // - result column of a select, e.g.: SELECT x = (SELECT ...) FROM t - // - GROUP BY clause, e.g.: SELECT * FROM t GROUP BY (SELECT ...) <-- however nonsensical that might be... - // - HAVING clause, e.g.: SELECT * FROM t GROUP BY x HAVING x = (SELECT ...) - // - ORDER BY clause, e.g.: SELECT * FROM t ORDER BY x = (SELECT ...) - // - LIMIT clause, e.g.: SELECT * FROM t LIMIT (SELECT ...) - // - OFFSET clause, e.g.: SELECT * FROM t OFFSET (SELECT ...) - // - // in these positions, unlike in the WHERE clause, the subquery cannot reference columns from the outer query, - // so we don't need to collect outer query references. - plan_subqueries_from_where_clause( - program, - &mut plan.non_from_clause_subqueries, - &mut plan.table_references, - resolver, - &mut plan.where_clause, - connection, - )?; + plan_subqueries_from_select_plan(program, &mut plan, resolver, connection)?; // Return the unoptimized query plan Ok(plan) diff --git a/core/translate/subquery.rs b/core/translate/subquery.rs index 84b2ef939..42ef2e616 100644 --- a/core/translate/subquery.rs +++ b/core/translate/subquery.rs @@ -11,8 +11,8 @@ use crate::{ expr::{unwrap_parens, walk_expr_mut, WalkControl}, optimizer::optimize_select_plan, plan::{ - ColumnUsedMask, NonFromClauseSubquery, OuterQueryReference, Plan, SubqueryState, - WhereTerm, + ColumnUsedMask, NonFromClauseSubquery, OuterQueryReference, Plan, SubqueryPosition, + SubqueryState, }, select::prepare_select_plan, }, @@ -29,22 +29,117 @@ use super::{ plan::{Operation, QueryDestination, Search, SelectPlan, TableReferences}, }; -/// Compute query plans for subqueries in the WHERE clause. +// Compute query plans for subqueries occurring in any position other than the FROM clause. +// This includes the WHERE clause, HAVING clause, GROUP BY clause, ORDER BY clause, LIMIT clause, and OFFSET clause. /// The AST expression containing the subquery ([ast::Expr::Exists], [ast::Expr::Subquery], [ast::Expr::InSelect]) is replaced with a [ast::Expr::SubqueryResult] expression. /// The [ast::Expr::SubqueryResult] expression contains the subquery ID, the left-hand side expression (only applicable to IN subqueries), the NOT IN flag (only applicable to IN subqueries), and the subquery type. /// The computed plans are stored in the [NonFromClauseSubquery] structs on the [SelectPlan], and evaluated at the appropriate time during the translation of the main query. /// The appropriate time is determined by whether the subquery is correlated or uncorrelated; /// if it is uncorrelated, it can be evaluated as early as possible, but if it is correlated, it must be evaluated after all of its dependencies from the /// outer query are 'in scope', i.e. their cursors are open and rewound. -pub fn plan_subqueries_from_where_clause( +pub fn plan_subqueries_from_select_plan( + program: &mut ProgramBuilder, + plan: &mut SelectPlan, + resolver: &Resolver, + connection: &Arc, +) -> Result<()> { + // WHERE + plan_subqueries_with_outer_query_access( + program, + &mut plan.non_from_clause_subqueries, + &mut plan.table_references, + resolver, + plan.where_clause.iter_mut().map(|t| &mut t.expr), + connection, + SubqueryPosition::Where, + )?; + + // GROUP BY + if let Some(group_by) = &mut plan.group_by { + plan_subqueries_with_outer_query_access( + program, + &mut plan.non_from_clause_subqueries, + &mut plan.table_references, + resolver, + group_by.exprs.iter_mut(), + connection, + SubqueryPosition::GroupBy, + )?; + if let Some(having) = group_by.having.as_mut() { + plan_subqueries_with_outer_query_access( + program, + &mut plan.non_from_clause_subqueries, + &mut plan.table_references, + resolver, + having.iter_mut(), + connection, + SubqueryPosition::Having, + )?; + } + } + + // Result columns + plan_subqueries_with_outer_query_access( + program, + &mut plan.non_from_clause_subqueries, + &mut plan.table_references, + resolver, + plan.result_columns.iter_mut().map(|c| &mut c.expr), + connection, + SubqueryPosition::ResultColumn, + )?; + + // ORDER BY + plan_subqueries_with_outer_query_access( + program, + &mut plan.non_from_clause_subqueries, + &mut plan.table_references, + resolver, + plan.order_by.iter_mut().map(|(expr, _)| &mut **expr), + connection, + SubqueryPosition::OrderBy, + )?; + + // LIMIT and OFFSET cannot reference columns from the outer query + let get_outer_query_refs = |_: &TableReferences| vec![]; + { + let mut subquery_parser = get_subquery_parser( + program, + &mut plan.non_from_clause_subqueries, + &mut plan.table_references, + resolver, + connection, + get_outer_query_refs, + SubqueryPosition::LimitOffset, + ); + // Limit + if let Some(limit) = &mut plan.limit { + walk_expr_mut(limit, &mut subquery_parser)?; + } + // Offset + if let Some(offset) = &mut plan.offset { + walk_expr_mut(offset, &mut subquery_parser)?; + } + } + + update_column_used_masks( + &mut plan.table_references, + &mut plan.non_from_clause_subqueries, + ); + Ok(()) +} + +/// Compute query plans for subqueries in the WHERE clause and HAVING clause (both of which have access to the outer query scope) +fn plan_subqueries_with_outer_query_access<'a>( program: &mut ProgramBuilder, out_subqueries: &mut Vec, referenced_tables: &mut TableReferences, resolver: &Resolver, - where_terms: &mut [WhereTerm], + exprs: impl Iterator, connection: &Arc, + position: SubqueryPosition, ) -> Result<()> { - // A WHERE clause subquery can reference columns from the outer query, + // Most subqueries can reference columns from the outer query, // including nested cases where a subquery inside a subquery references columns from its parent's parent // and so on. let get_outer_query_refs = |referenced_tables: &TableReferences| { @@ -71,241 +166,261 @@ pub fn plan_subqueries_from_where_clause( .collect::>() }; - // Walk the WHERE clause and replace subqueries with [ast::Expr::SubqueryResult] expressions. - for where_term in where_terms.iter_mut() { - walk_expr_mut( - &mut where_term.expr, - &mut |expr: &mut ast::Expr| -> Result { - match expr { - ast::Expr::Exists(_) => { - let subquery_id = program.table_reference_counter.next(); - let outer_query_refs = get_outer_query_refs(referenced_tables); - - let result_reg = program.alloc_register(); - let subquery_type = SubqueryType::Exists { result_reg }; - let result_expr = ast::Expr::SubqueryResult { - subquery_id, - lhs: None, - not_in: false, - query_type: subquery_type.clone(), - }; - let ast::Expr::Exists(subselect) = std::mem::replace(expr, result_expr) - else { - unreachable!(); - }; - - let plan = prepare_select_plan( - subselect, - resolver, - program, - &outer_query_refs, - QueryDestination::ExistsSubqueryResult { result_reg }, - connection, - )?; - let Plan::Select(mut plan) = plan else { - crate::bail_parse_error!( - "compound SELECT queries not supported yet in WHERE clause subqueries" - ); - }; - optimize_select_plan(&mut plan, resolver.schema)?; - // EXISTS subqueries are satisfied after at most 1 row has been returned. - plan.limit = Some(Box::new(ast::Expr::Literal(ast::Literal::Numeric( - "1".to_string(), - )))); - let correlated = plan.is_correlated(); - out_subqueries.push(NonFromClauseSubquery { - internal_id: subquery_id, - query_type: subquery_type, - state: SubqueryState::Unevaluated { - plan: Some(Box::new(plan)), - }, - correlated, - }); - Ok(WalkControl::Continue) - } - ast::Expr::Subquery(_) => { - let subquery_id = program.table_reference_counter.next(); - let outer_query_refs = get_outer_query_refs(referenced_tables); - - let result_expr = ast::Expr::SubqueryResult { - subquery_id, - lhs: None, - not_in: false, - // Placeholder values because the number of columns returned is not known until the plan is prepared. - // These are replaced below after planning. - query_type: SubqueryType::RowValue { - result_reg_start: 0, - num_regs: 0, - }, - }; - let ast::Expr::Subquery(subselect) = std::mem::replace(expr, result_expr) - else { - unreachable!(); - }; - let plan = prepare_select_plan( - subselect, - resolver, - program, - &outer_query_refs, - QueryDestination::Unset, - connection, - )?; - let Plan::Select(mut plan) = plan else { - crate::bail_parse_error!( - "compound SELECT queries not supported yet in WHERE clause subqueries" - ); - }; - optimize_select_plan(&mut plan, resolver.schema)?; - let reg_count = plan.result_columns.len(); - let reg_start = program.alloc_registers(reg_count); - - plan.query_destination = QueryDestination::RowValueSubqueryResult { - result_reg_start: reg_start, - num_regs: reg_count, - }; - // RowValue subqueries are satisfied after at most 1 row has been returned, - // as they are used in comparisons with a scalar or a tuple of scalars like (x,y) = (SELECT ...) or x = (SELECT ...). - plan.limit = Some(Box::new(ast::Expr::Literal(ast::Literal::Numeric( - "1".to_string(), - )))); - - let ast::Expr::SubqueryResult { - subquery_id, - lhs: None, - not_in: false, - query_type: - SubqueryType::RowValue { - result_reg_start, - num_regs, - }, - } = &mut *expr - else { - unreachable!(); - }; - *result_reg_start = reg_start; - *num_regs = reg_count; - - let correlated = plan.is_correlated(); - - out_subqueries.push(NonFromClauseSubquery { - internal_id: *subquery_id, - query_type: SubqueryType::RowValue { - result_reg_start: reg_start, - num_regs: reg_count, - }, - state: SubqueryState::Unevaluated { - plan: Some(Box::new(plan)), - }, - correlated, - }); - Ok(WalkControl::Continue) - } - ast::Expr::InSelect { .. } => { - let subquery_id = program.table_reference_counter.next(); - let outer_query_refs = get_outer_query_refs(referenced_tables); - - let ast::Expr::InSelect { lhs, not, rhs } = - std::mem::replace(expr, ast::Expr::Literal(ast::Literal::Null)) - else { - unreachable!(); - }; - let plan = prepare_select_plan( - rhs, - resolver, - program, - &outer_query_refs, - QueryDestination::Unset, - connection, - )?; - let Plan::Select(mut plan) = plan else { - crate::bail_parse_error!( - "compound SELECT queries not supported yet in WHERE clause subqueries" - ); - }; - optimize_select_plan(&mut plan, resolver.schema)?; - // e.g. (x,y) IN (SELECT ...) - // or x IN (SELECT ...) - let lhs_column_count = match unwrap_parens(lhs.as_ref())? { - ast::Expr::Parenthesized(exprs) => exprs.len(), - _ => 1, - }; - if lhs_column_count != plan.result_columns.len() { - crate::bail_parse_error!( - "lhs of IN subquery must have the same number of columns as the subquery" - ); - } - - let mut columns = plan - .result_columns - .iter() - .enumerate() - .map(|(i, c)| IndexColumn { - name: c.name(&plan.table_references).unwrap_or("").to_string(), - order: SortOrder::Asc, - pos_in_table: i, - collation: None, - default: None, - }) - .collect::>(); - - for (i, column) in columns.iter_mut().enumerate() { - column.collation = get_collseq_from_expr( - &plan.result_columns[i].expr, - &plan.table_references, - )?; - } - - let ephemeral_index = Arc::new(Index { - columns, - name: format!("ephemeral_index_where_sub_{subquery_id}"), - table_name: String::new(), - ephemeral: true, - has_rowid: false, - root_page: 0, - unique: false, - where_clause: None, - index_method: None, - }); - - let cursor_id = program - .alloc_cursor_id(CursorType::BTreeIndex(ephemeral_index.clone())); - - plan.query_destination = QueryDestination::EphemeralIndex { - cursor_id, - index: ephemeral_index.clone(), - is_delete: false, - }; - - *expr = ast::Expr::SubqueryResult { - subquery_id, - lhs: Some(lhs), - not_in: not, - query_type: SubqueryType::In { cursor_id }, - }; - - let correlated = plan.is_correlated(); - - out_subqueries.push(NonFromClauseSubquery { - internal_id: subquery_id, - query_type: SubqueryType::In { cursor_id }, - state: SubqueryState::Unevaluated { - plan: Some(Box::new(plan)), - }, - correlated, - }); - Ok(WalkControl::Continue) - } - _ => Ok(WalkControl::Continue), - } - }, - )?; + let mut subquery_parser = get_subquery_parser( + program, + out_subqueries, + referenced_tables, + resolver, + connection, + get_outer_query_refs, + position, + ); + for expr in exprs { + walk_expr_mut(expr, &mut subquery_parser)?; } - update_column_used_masks(referenced_tables, out_subqueries); - Ok(()) } +/// Create a closure that will walk the AST and replace subqueries with [ast::Expr::SubqueryResult] expressions. +fn get_subquery_parser<'a>( + program: &'a mut ProgramBuilder, + out_subqueries: &'a mut Vec, + referenced_tables: &'a mut TableReferences, + resolver: &'a Resolver, + connection: &'a Arc, + get_outer_query_refs: fn(&TableReferences) -> Vec, + position: SubqueryPosition, +) -> impl FnMut(&mut ast::Expr) -> Result + 'a { + move |expr: &mut ast::Expr| -> Result { + match expr { + ast::Expr::Exists(_) => { + let subquery_id = program.table_reference_counter.next(); + let outer_query_refs = get_outer_query_refs(referenced_tables); + + let result_reg = program.alloc_register(); + let subquery_type = SubqueryType::Exists { result_reg }; + let result_expr = ast::Expr::SubqueryResult { + subquery_id, + lhs: None, + not_in: false, + query_type: subquery_type.clone(), + }; + let ast::Expr::Exists(subselect) = std::mem::replace(expr, result_expr) else { + unreachable!(); + }; + + let plan = prepare_select_plan( + subselect, + resolver, + program, + &outer_query_refs, + QueryDestination::ExistsSubqueryResult { result_reg }, + connection, + )?; + let Plan::Select(mut plan) = plan else { + crate::bail_parse_error!( + "compound SELECT queries not supported yet in WHERE clause subqueries" + ); + }; + optimize_select_plan(&mut plan, resolver.schema)?; + // EXISTS subqueries are satisfied after at most 1 row has been returned. + plan.limit = Some(Box::new(ast::Expr::Literal(ast::Literal::Numeric( + "1".to_string(), + )))); + let correlated = plan.is_correlated(); + out_subqueries.push(NonFromClauseSubquery { + internal_id: subquery_id, + query_type: subquery_type, + state: SubqueryState::Unevaluated { + plan: Some(Box::new(plan)), + }, + correlated, + }); + Ok(WalkControl::Continue) + } + ast::Expr::Subquery(_) => { + let subquery_id = program.table_reference_counter.next(); + let outer_query_refs = get_outer_query_refs(referenced_tables); + + let result_expr = ast::Expr::SubqueryResult { + subquery_id, + lhs: None, + not_in: false, + // Placeholder values because the number of columns returned is not known until the plan is prepared. + // These are replaced below after planning. + query_type: SubqueryType::RowValue { + result_reg_start: 0, + num_regs: 0, + }, + }; + let ast::Expr::Subquery(subselect) = std::mem::replace(expr, result_expr) else { + unreachable!(); + }; + let plan = prepare_select_plan( + subselect, + resolver, + program, + &outer_query_refs, + QueryDestination::Unset, + connection, + )?; + let Plan::Select(mut plan) = plan else { + crate::bail_parse_error!( + "compound SELECT queries not supported yet in WHERE clause subqueries" + ); + }; + optimize_select_plan(&mut plan, resolver.schema)?; + let reg_count = plan.result_columns.len(); + let reg_start = program.alloc_registers(reg_count); + + plan.query_destination = QueryDestination::RowValueSubqueryResult { + result_reg_start: reg_start, + num_regs: reg_count, + }; + // RowValue subqueries are satisfied after at most 1 row has been returned, + // as they are used in comparisons with a scalar or a tuple of scalars like (x,y) = (SELECT ...) or x = (SELECT ...). + plan.limit = Some(Box::new(ast::Expr::Literal(ast::Literal::Numeric( + "1".to_string(), + )))); + + let ast::Expr::SubqueryResult { + subquery_id, + lhs: None, + not_in: false, + query_type: + SubqueryType::RowValue { + result_reg_start, + num_regs, + }, + } = &mut *expr + else { + unreachable!(); + }; + *result_reg_start = reg_start; + *num_regs = reg_count; + + let correlated = plan.is_correlated(); + + out_subqueries.push(NonFromClauseSubquery { + internal_id: *subquery_id, + query_type: SubqueryType::RowValue { + result_reg_start: reg_start, + num_regs: reg_count, + }, + state: SubqueryState::Unevaluated { + plan: Some(Box::new(plan)), + }, + correlated, + }); + Ok(WalkControl::Continue) + } + ast::Expr::InSelect { .. } => { + let subquery_id = program.table_reference_counter.next(); + let outer_query_refs = get_outer_query_refs(referenced_tables); + + let ast::Expr::InSelect { lhs, not, rhs } = + std::mem::replace(expr, ast::Expr::Literal(ast::Literal::Null)) + else { + unreachable!(); + }; + let plan = prepare_select_plan( + rhs, + resolver, + program, + &outer_query_refs, + QueryDestination::Unset, + connection, + )?; + let Plan::Select(mut plan) = plan else { + crate::bail_parse_error!( + "compound SELECT queries not supported yet in WHERE clause subqueries" + ); + }; + optimize_select_plan(&mut plan, resolver.schema)?; + // e.g. (x,y) IN (SELECT ...) + // or x IN (SELECT ...) + let lhs_column_count = match unwrap_parens(lhs.as_ref())? { + ast::Expr::Parenthesized(exprs) => exprs.len(), + _ => 1, + }; + if lhs_column_count != plan.result_columns.len() { + crate::bail_parse_error!( + "lhs of IN subquery must have the same number of columns as the subquery" + ); + } + + let mut columns = plan + .result_columns + .iter() + .enumerate() + .map(|(i, c)| IndexColumn { + name: c.name(&plan.table_references).unwrap_or("").to_string(), + order: SortOrder::Asc, + pos_in_table: i, + collation: None, + default: None, + }) + .collect::>(); + + for (i, column) in columns.iter_mut().enumerate() { + column.collation = get_collseq_from_expr( + &plan.result_columns[i].expr, + &plan.table_references, + )?; + } + + let ephemeral_index = Arc::new(Index { + columns, + name: format!("ephemeral_index_where_sub_{subquery_id}"), + table_name: String::new(), + ephemeral: true, + has_rowid: false, + root_page: 0, + unique: false, + where_clause: None, + index_method: None, + }); + + let cursor_id = + program.alloc_cursor_id(CursorType::BTreeIndex(ephemeral_index.clone())); + + plan.query_destination = QueryDestination::EphemeralIndex { + cursor_id, + index: ephemeral_index.clone(), + is_delete: false, + }; + + *expr = ast::Expr::SubqueryResult { + subquery_id, + lhs: Some(lhs), + not_in: not, + query_type: SubqueryType::In { cursor_id }, + }; + + let correlated = plan.is_correlated(); + + if correlated && position == SubqueryPosition::Having { + crate::bail_parse_error!( + "correlated IN subqueries in HAVING clause are not supported yet" + ); + } + + out_subqueries.push(NonFromClauseSubquery { + internal_id: subquery_id, + query_type: SubqueryType::In { cursor_id }, + state: SubqueryState::Unevaluated { + plan: Some(Box::new(plan)), + }, + correlated, + }); + Ok(WalkControl::Continue) + } + _ => Ok(WalkControl::Continue), + } + } +} + /// We make decisions about when to evaluate expressions or whether to use covering indexes based on /// which columns of a table have been referenced. /// Since subquery nesting is arbitrarily deep, a reference to a column must propagate recursively diff --git a/testing/subquery.test b/testing/subquery.test index 2848c64e9..261defd47 100644 --- a/testing/subquery.test +++ b/testing/subquery.test @@ -638,4 +638,283 @@ do_execsql_test_on_specific_db {:memory:} subquery-not-in-multiple-columns { select name, age from restricted_profiles ); } {1|Alice|25 -3|Charlie|25} \ No newline at end of file +3|Charlie|25} + +# SUBQUERIES IN OTHER POSITIONS (result columns, GROUP BY, ORDER BY, HAVING, LIMIT, OFFSET) + +# Uncorrelated subquery in result column +do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-result-column { + create table employees(id, name, dept_id); + create table company_info(total_depts); + insert into employees values (1, 'Alice', 10), (2, 'Bob', 20); + insert into company_info values (5); + + select id, name, (select total_depts from company_info) as total_depts from employees; +} {1|Alice|5 +2|Bob|5} + +# Correlated subquery in result column +do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-result-column { + create table employees(id, name, dept_id); + create table departments(id, name); + insert into employees values (1, 'Alice', 10), (2, 'Bob', 20); + insert into departments values (10, 'Sales'), (20, 'Engineering'); + + select id, name, (select name from departments where id = dept_id) as dept_name from employees; +} {1|Alice|Sales +2|Bob|Engineering} + +# Correlated subquery in result column with join +do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-result-column-with-join { + create table employees(id, name, dept_id, manager_id); + create table departments(id, name); + create table managers(id, name); + insert into employees values (1, 'Alice', 10, 100), (2, 'Bob', 20, 200); + insert into departments values (10, 'Sales'), (20, 'Engineering'); + insert into managers values (100, 'Carol'), (200, 'Dave'); + + select e.id, e.name, m.name as manager, (select name from departments where id = e.dept_id) as dept_name + from employees e join managers m on e.manager_id = m.id; +} {1|Alice|Carol|Sales +2|Bob|Dave|Engineering} + +# Uncorrelated IN-subquery in result column +do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-result-column-in { + create table employees(id, name, dept_id); + create table special_depts(dept_id); + insert into employees values (1, 'Alice', 10), (2, 'Bob', 20), (3, 'Charlie', 30); + insert into special_depts values (10), (20); + + select id, name, dept_id in (select dept_id from special_depts) as is_special from employees; +} {1|Alice|1 +2|Bob|1 +3|Charlie|0} + +# Correlated IN-subquery in result column +do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-result-column-in { + create table employees(id, name, dept_id); + create table dept_awards(dept_id, employee_id); + insert into employees values (1, 'Alice', 10), (2, 'Bob', 20), (3, 'Charlie', 10); + insert into dept_awards values (10, 1), (20, 2); + + select id, name, id in (select employee_id from dept_awards where dept_id = employees.dept_id) as has_award from employees; +} {1|Alice|1 +2|Bob|1 +3|Charlie|0} + +# Uncorrelated subquery in GROUP BY clause +do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-group-by { + create table sales(id, amount, region_id); + create table grouping_config(group_column); + insert into sales values (1, 100, 1), (2, 200, 1), (3, 150, 2); + insert into grouping_config values ('region_id'); + + select region_id, sum(amount) + from sales + group by (select case when group_column = 'region_id' then region_id else amount end from grouping_config); +} {1|300 +2|150} + +# Correlated subquery in GROUP BY clause +do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-group-by { + create table sales(id, amount, region_id); + create table regions(id, name); + insert into sales values (1, 100, 1), (2, 200, 1), (3, 150, 2); + insert into regions values (1, 'North'), (2, 'South'); + + select (select name from regions where id = region_id) as region, sum(amount) + from sales + group by (select name from regions where id = region_id); +} {North|300 +South|150} + +# Correlated subquery in GROUP BY clause with join +do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-group-by-with-join { + create table sales(id, amount, region_id, salesperson_id); + create table regions(id, name); + create table salespeople(id, name); + insert into sales values (1, 100, 1, 10), (2, 200, 1, 20), (3, 150, 2, 10); + insert into regions values (1, 'North'), (2, 'South'); + insert into salespeople values (10, 'Alice'), (20, 'Bob'); + + select (select name from regions where id = s.region_id) as region, sp.name as salesperson, sum(amount) + from sales s join salespeople sp on s.salesperson_id = sp.id + group by (select name from regions where id = s.region_id), sp.name; +} {North|Alice|100 +North|Bob|200 +South|Alice|150} + +# Uncorrelated subquery in ORDER BY clause +do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-order-by { + create table products(id, name, category_id); + create table sort_config(sort_order); + insert into products values (1, 'hat', 2), (2, 'laptop', 1), (3, 'book', 3); + insert into sort_config values ('asc'); + + select id, name from products + order by (select case when sort_order = 'asc' then id else -id end from sort_config); +} {1|hat +2|laptop +3|book} + +# Correlated subquery in ORDER BY clause +do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-order-by { + create table products(id, name, category_id); + create table categories(id, priority); + insert into products values (1, 'hat', 2), (2, 'laptop', 1), (3, 'book', 3); + insert into categories values (1, 10), (2, 20), (3, 5); + + select id, name from products + order by (select priority from categories where id = category_id); +} {3|book +2|laptop +1|hat} + +# Correlated subquery in ORDER BY clause with join +do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-order-by-with-join { + create table products(id, name, category_id, supplier_id); + create table categories(id, priority); + create table suppliers(id, name); + insert into products values (1, 'hat', 2, 100), (2, 'laptop', 1, 200), (3, 'book', 3, 100); + insert into categories values (1, 10), (2, 20), (3, 5); + insert into suppliers values (100, 'SupplierA'), (200, 'SupplierB'); + + select p.id, p.name, s.name as supplier + from products p join suppliers s on p.supplier_id = s.id + order by (select priority from categories where id = p.category_id); +} {3|book|SupplierA +2|laptop|SupplierB +1|hat|SupplierA} + +# Uncorrelated IN-subquery in ORDER BY clause +do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-order-by-in { + create table products(id, name, category_id); + create table priority_categories(category_id); + insert into products values (1, 'hat', 2), (2, 'laptop', 1), (3, 'book', 3); + insert into priority_categories values (1), (3); + + select id, name from products + order by category_id in (select category_id from priority_categories) desc, id; +} {2|laptop +3|book +1|hat} + +# Correlated IN-subquery in ORDER BY clause +do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-order-by-in { + create table products(id, name, category_id); + create table category_priorities(category_id, priority); + insert into products values (1, 'hat', 2), (2, 'laptop', 1), (3, 'book', 3); + insert into category_priorities values (1, 10), (2, 20), (3, 5); + + select id, name from products + order by category_id in (select category_id from category_priorities where priority > 8), id; +} {3|book +1|hat +2|laptop} + +# Uncorrelated subquery in HAVING clause +do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-having { + create table orders(id, customer_id, amount); + create table vip_threshold(min_amount); + insert into orders values (1, 100, 50), (2, 100, 150), (3, 200, 30); + insert into vip_threshold values (100); + + select customer_id, sum(amount) as total + from orders + group by customer_id + having total > (select min_amount from vip_threshold); +} {100|200} + +# Uncorrelated IN-subquery in HAVING clause +do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-having-in { + create table orders(id, customer_id, amount); + create table target_totals(total_amount); + insert into orders values (1, 100, 50), (2, 100, 150), (3, 200, 30), (4, 300, 100); + insert into target_totals values (200), (100); + + select customer_id, sum(amount) as total + from orders + group by customer_id + having total in (select total_amount from target_totals); +} {100|200 +300|100} + +# Correlated subquery in HAVING clause +do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-having { + create table orders(id, customer_id, amount); + create table customer_thresholds(customer_id, min_amount); + insert into orders values (1, 100, 50), (2, 100, 150), (3, 200, 30), (4, 200, 80); + insert into customer_thresholds values (100, 100), (200, 150); + + select customer_id, sum(amount) as total + from orders + group by customer_id + having total > (select min_amount from customer_thresholds where customer_thresholds.customer_id = orders.customer_id); +} {100|200} + +# Correlated IN-subquery in HAVING clause +# FIXME: currently disabled +# do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-having-in { +# create table orders(id, customer_id, amount); +# create table customer_targets(customer_id, target_amount); +# insert into orders values (1, 100, 50), (2, 100, 150), (3, 200, 30), (4, 200, 80); +# insert into customer_targets values (100, 200), (100, 250), (200, 110); +# +# select customer_id, sum(amount) as total +# from orders +# group by customer_id +# having total in (select target_amount from customer_targets where customer_targets.customer_id = orders.customer_id); +# } {100|200 +# 200|110} + +# Correlated subquery in HAVING clause with join +do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-having-with-join { + create table orders(id, customer_id, amount, region_id); + create table customer_thresholds(customer_id, min_amount); + create table regions(id, name); + insert into orders values (1, 100, 50, 1), (2, 100, 150, 1), (3, 200, 30, 2), (4, 200, 80, 2); + insert into customer_thresholds values (100, 100), (200, 150); + insert into regions values (1, 'East'), (2, 'West'); + + select o.customer_id, r.name as region, sum(o.amount) as total + from orders o join regions r on o.region_id = r.id + group by o.customer_id, r.name + having total > (select min_amount from customer_thresholds where customer_thresholds.customer_id = o.customer_id); +} {100|East|200} + +# Correlated IN-subquery in HAVING clause with join +# do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-subquery-in-having-with-join { +# create table orders(id, customer_id, amount, region_id); +# create table customer_thresholds(customer_id, min_amount); +# create table regions(id, name); +# insert into orders values (1, 100, 50, 1), (2, 100, 150, 1), (3, 200, 30, 2), (4, 200, 80, 2); +# insert into customer_thresholds values (100, 100), (200, 150); +# insert into regions values (1, 'East'), (2, 'West'); +# +# select o.customer_id, r.name as region, sum(o.amount) as total +# from orders o join regions r on o.region_id = r.id +# group by o.customer_id, r.name +# having total + 40 in (select min_amount from customer_thresholds where customer_thresholds.customer_id = o.customer_id); +# } {200|West|110} + +# Uncorrelated subquery in LIMIT clause +do_execsql_test_on_specific_db {:memory:} subquery-in-limit { + create table items(id, name); + create table config(max_results); + insert into items values (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'); + insert into config values (2); + + select * from items limit (select max_results from config); +} {1|a +2|b} + +# Uncorrelated subquery in OFFSET clause +do_execsql_test_on_specific_db {:memory:} subquery-in-offset { + create table items(id, name); + create table config(skip_count); + insert into items values (1, 'a'), (2, 'b'), (3, 'c'); + insert into config values (1); + + select * from items limit 2 offset (select skip_count from config); +} {2|b +3|c} \ No newline at end of file diff --git a/tests/fuzz/mod.rs b/tests/fuzz/mod.rs index a7c0e8a9e..8085ba588 100644 --- a/tests/fuzz/mod.rs +++ b/tests/fuzz/mod.rs @@ -4295,7 +4295,12 @@ mod fuzz_tests { }; // Helper function to generate simple subquery - fn gen_subquery(rng: &mut ChaCha8Rng, depth: usize, outer_table: Option<&str>) -> String { + fn gen_subquery( + rng: &mut ChaCha8Rng, + depth: usize, + outer_table: Option<&str>, + allowed_outer_cols: Option<&[&str]>, + ) -> String { if depth > MAX_SUBQUERY_DEPTH { return "SELECT 1".to_string(); } @@ -4328,56 +4333,96 @@ mod fuzz_tests { // Helper function to generate correlated WHERE conditions let gen_correlated_where = |rng: &mut ChaCha8Rng, inner_table: &str, outer_table: &str| -> String { + let pick = + |rng: &mut ChaCha8Rng, mut candidates: Vec<(String, &'static str)>| { + let filtered: Vec = if let Some(allowed) = allowed_outer_cols { + candidates + .drain(..) + .filter(|(_, col)| allowed.contains(col)) + .map(|(cond, _)| cond) + .collect() + } else { + candidates.drain(..).map(|(cond, _)| cond).collect() + }; + if filtered.is_empty() { + "1=1".to_string() + } else { + filtered[rng.random_range(0..filtered.len())].clone() + } + }; match (outer_table, inner_table) { - ("t1", "t2") => { - // t2.ref_id relates to t1.id - let conditions = [ - format!("{inner_table}.ref_id = {outer_table}.id"), - format!("{inner_table}.id < {outer_table}.value1"), - format!("{inner_table}.data > {outer_table}.value2"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } - ("t1", "t3") => { - let conditions = [ - format!("{inner_table}.id = {outer_table}.id"), - format!("{inner_table}.category < {outer_table}.value1"), - format!("{inner_table}.amount > {outer_table}.value2"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } - ("t2", "t1") => { - let conditions = [ - format!("{inner_table}.id = {outer_table}.ref_id"), - format!("{inner_table}.value1 > {outer_table}.data"), - format!("{inner_table}.value2 < {outer_table}.id"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } - ("t2", "t3") => { - let conditions = [ - format!("{inner_table}.id = {outer_table}.id"), - format!("{inner_table}.category = {outer_table}.ref_id"), - format!("{inner_table}.amount > {outer_table}.data"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } - ("t3", "t1") => { - let conditions = [ - format!("{inner_table}.id = {outer_table}.id"), - format!("{inner_table}.value1 > {outer_table}.category"), - format!("{inner_table}.value2 < {outer_table}.amount"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } - ("t3", "t2") => { - let conditions = [ - format!("{inner_table}.id = {outer_table}.id"), - format!("{inner_table}.ref_id = {outer_table}.category"), - format!("{inner_table}.data < {outer_table}.amount"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } + ("t1", "t2") => pick( + rng, + vec![ + (format!("{inner_table}.ref_id = {outer_table}.id"), "id"), + (format!("{inner_table}.id < {outer_table}.value1"), "value1"), + ( + format!("{inner_table}.data > {outer_table}.value2"), + "value2", + ), + ], + ), + ("t1", "t3") => pick( + rng, + vec![ + (format!("{inner_table}.id = {outer_table}.id"), "id"), + ( + format!("{inner_table}.category < {outer_table}.value1"), + "value1", + ), + ( + format!("{inner_table}.amount > {outer_table}.value2"), + "value2", + ), + ], + ), + ("t2", "t1") => pick( + rng, + vec![ + (format!("{inner_table}.id = {outer_table}.ref_id"), "ref_id"), + (format!("{inner_table}.value1 > {outer_table}.data"), "data"), + (format!("{inner_table}.value2 < {outer_table}.id"), "id"), + ], + ), + ("t2", "t3") => pick( + rng, + vec![ + (format!("{inner_table}.id = {outer_table}.id"), "id"), + ( + format!("{inner_table}.category = {outer_table}.ref_id"), + "ref_id", + ), + (format!("{inner_table}.amount > {outer_table}.data"), "data"), + ], + ), + ("t3", "t1") => pick( + rng, + vec![ + (format!("{inner_table}.id = {outer_table}.id"), "id"), + ( + format!("{inner_table}.value1 > {outer_table}.category"), + "category", + ), + ( + format!("{inner_table}.value2 < {outer_table}.amount"), + "amount", + ), + ], + ), + ("t3", "t2") => pick( + rng, + vec![ + (format!("{inner_table}.id = {outer_table}.id"), "id"), + ( + format!("{inner_table}.ref_id = {outer_table}.category"), + "category", + ), + ( + format!("{inner_table}.data < {outer_table}.amount"), + "amount", + ), + ], + ), _ => "1=1".to_string(), } }; @@ -4410,7 +4455,11 @@ mod fuzz_tests { // Add correlated conditions if outer_table is provided and sometimes let final_query = if let Some(outer_table) = outer_table { - if rng.random_bool(0.4) { + let can_correlate = match allowed_outer_cols { + Some(cols) => !cols.is_empty(), + None => true, + }; + if can_correlate && rng.random_bool(0.4) { // 40% chance for correlation // Extract the inner table from the base query let inner_table = if base_query.contains("FROM t1") { @@ -4440,7 +4489,7 @@ mod fuzz_tests { // Sometimes add nesting - but use scalar subquery for nesting to avoid column count issues if depth < 1 && rng.random_bool(0.2) { // Reduced probability and depth - let nested = gen_scalar_subquery(rng, 0, outer_table); + let nested = gen_scalar_subquery(rng, 0, outer_table, allowed_outer_cols); if final_query.contains("WHERE") { format!("{final_query} AND id IN ({nested})") } else { @@ -4456,6 +4505,7 @@ mod fuzz_tests { rng: &mut ChaCha8Rng, depth: usize, outer_table: Option<&str>, + allowed_outer_cols: Option<&[&str]>, ) -> String { if depth > MAX_SUBQUERY_DEPTH { // Reduced nesting depth @@ -4490,56 +4540,96 @@ mod fuzz_tests { // Helper function to generate correlated WHERE conditions let gen_correlated_where = |rng: &mut ChaCha8Rng, inner_table: &str, outer_table: &str| -> String { + let pick = + |rng: &mut ChaCha8Rng, mut candidates: Vec<(String, &'static str)>| { + let filtered: Vec = if let Some(allowed) = allowed_outer_cols { + candidates + .drain(..) + .filter(|(_, col)| allowed.contains(col)) + .map(|(cond, _)| cond) + .collect() + } else { + candidates.drain(..).map(|(cond, _)| cond).collect() + }; + if filtered.is_empty() { + "1=1".to_string() + } else { + filtered[rng.random_range(0..filtered.len())].clone() + } + }; match (outer_table, inner_table) { - ("t1", "t2") => { - // t2.ref_id relates to t1.id - let conditions = [ - format!("{inner_table}.ref_id = {outer_table}.id"), - format!("{inner_table}.id < {outer_table}.value1"), - format!("{inner_table}.data > {outer_table}.value2"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } - ("t1", "t3") => { - let conditions = [ - format!("{inner_table}.id = {outer_table}.id"), - format!("{inner_table}.category < {outer_table}.value1"), - format!("{inner_table}.amount > {outer_table}.value2"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } - ("t2", "t1") => { - let conditions = [ - format!("{inner_table}.id = {outer_table}.ref_id"), - format!("{inner_table}.value1 > {outer_table}.data"), - format!("{inner_table}.value2 < {outer_table}.id"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } - ("t2", "t3") => { - let conditions = [ - format!("{inner_table}.id = {outer_table}.id"), - format!("{inner_table}.category = {outer_table}.ref_id"), - format!("{inner_table}.amount > {outer_table}.data"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } - ("t3", "t1") => { - let conditions = [ - format!("{inner_table}.id = {outer_table}.id"), - format!("{inner_table}.value1 > {outer_table}.category"), - format!("{inner_table}.value2 < {outer_table}.amount"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } - ("t3", "t2") => { - let conditions = [ - format!("{inner_table}.id = {outer_table}.id"), - format!("{inner_table}.ref_id = {outer_table}.category"), - format!("{inner_table}.data < {outer_table}.amount"), - ]; - conditions[rng.random_range(0..conditions.len())].clone() - } + ("t1", "t2") => pick( + rng, + vec![ + (format!("{inner_table}.ref_id = {outer_table}.id"), "id"), + (format!("{inner_table}.id < {outer_table}.value1"), "value1"), + ( + format!("{inner_table}.data > {outer_table}.value2"), + "value2", + ), + ], + ), + ("t1", "t3") => pick( + rng, + vec![ + (format!("{inner_table}.id = {outer_table}.id"), "id"), + ( + format!("{inner_table}.category < {outer_table}.value1"), + "value1", + ), + ( + format!("{inner_table}.amount > {outer_table}.value2"), + "value2", + ), + ], + ), + ("t2", "t1") => pick( + rng, + vec![ + (format!("{inner_table}.id = {outer_table}.ref_id"), "ref_id"), + (format!("{inner_table}.value1 > {outer_table}.data"), "data"), + (format!("{inner_table}.value2 < {outer_table}.id"), "id"), + ], + ), + ("t2", "t3") => pick( + rng, + vec![ + (format!("{inner_table}.id = {outer_table}.id"), "id"), + ( + format!("{inner_table}.category = {outer_table}.ref_id"), + "ref_id", + ), + (format!("{inner_table}.amount > {outer_table}.data"), "data"), + ], + ), + ("t3", "t1") => pick( + rng, + vec![ + (format!("{inner_table}.id = {outer_table}.id"), "id"), + ( + format!("{inner_table}.value1 > {outer_table}.category"), + "category", + ), + ( + format!("{inner_table}.value2 < {outer_table}.amount"), + "amount", + ), + ], + ), + ("t3", "t2") => pick( + rng, + vec![ + (format!("{inner_table}.id = {outer_table}.id"), "id"), + ( + format!("{inner_table}.ref_id = {outer_table}.category"), + "category", + ), + ( + format!("{inner_table}.data < {outer_table}.amount"), + "amount", + ), + ], + ), _ => "1=1".to_string(), } }; @@ -4573,7 +4663,11 @@ mod fuzz_tests { // Add correlated conditions if outer_table is provided and sometimes let final_query = if let Some(outer_table) = outer_table { - if rng.random_bool(0.4) { + let can_correlate = match allowed_outer_cols { + Some(cols) => !cols.is_empty(), + None => true, + }; + if can_correlate && rng.random_bool(0.4) { // 40% chance for correlation // Extract the inner table from the base query let inner_table = if base_query.contains("FROM t1") { @@ -4603,7 +4697,7 @@ mod fuzz_tests { // Sometimes add nesting if depth < 1 && rng.random_bool(0.2) { // Reduced probability and depth - let nested = gen_scalar_subquery(rng, depth + 1, outer_table); + let nested = gen_scalar_subquery(rng, depth + 1, outer_table, allowed_outer_cols); if final_query.contains("WHERE") { format!("{final_query} AND id IN ({nested})") } else { @@ -4614,11 +4708,110 @@ mod fuzz_tests { } } + // Helper to generate a SELECT-list expression as a scalar subquery, optionally correlated + fn gen_selectlist_scalar_expr(rng: &mut ChaCha8Rng, outer_table: &str) -> String { + // Reuse scalar subquery generator; return the inner SELECT (without wrapping) + gen_scalar_subquery(rng, 0, Some(outer_table), None) + } + + // Helper to generate a GROUP BY expression which may include a correlated scalar subquery + fn gen_group_by_expr(rng: &mut ChaCha8Rng, main_table: &str) -> String { + // Either a plain column or a correlated scalar subquery + if rng.random_bool(0.4) { + // Prefer plain columns most of the time to keep GROUP BY semantics simple + match main_table { + "t1" => ["id", "value1", "value2"][rng.random_range(0..3)].to_string(), + "t2" => ["id", "ref_id", "data"][rng.random_range(0..3)].to_string(), + "t3" => ["id", "category", "amount"][rng.random_range(0..3)].to_string(), + _ => "id".to_string(), + } + } else { + // If GROUP BY is present, a subquery that references outer columns would be invalid + // unless it only references GROUP BY columns; since this subquery becomes the + // grouping expression itself, disallow correlation entirely here. + format!( + "({})", + gen_scalar_subquery(rng, 0, Some(main_table), Some(&[])) + ) + } + } + + // Helper to generate a HAVING condition comparing an aggregate to a scalar subquery + fn gen_having_condition( + rng: &mut ChaCha8Rng, + main_table: &str, + allowed_outer_cols: Option<&[&str]>, + ) -> String { + let (agg_func, agg_col) = match main_table { + "t1" => [ + ("SUM", "value1"), + ("SUM", "value2"), + ("MAX", "value1"), + ("MAX", "value2"), + ("MIN", "value1"), + ("MIN", "value2"), + ("COUNT", "*"), + ][rng.random_range(0..7)], + "t2" => [ + ("SUM", "data"), + ("MAX", "data"), + ("MIN", "data"), + ("COUNT", "*"), + ][rng.random_range(0..4)], + "t3" => [ + ("SUM", "amount"), + ("MAX", "amount"), + ("MIN", "amount"), + ("COUNT", "*"), + ][rng.random_range(0..4)], + _ => ("COUNT", "*"), + }; + let op = [">", "<", ">=", "<=", "=", "<>"][rng.random_range(0..6)]; + let rhs = gen_scalar_subquery(rng, 0, Some(main_table), allowed_outer_cols); + if agg_col == "*" { + format!("COUNT(*) {op} ({rhs})") + } else { + format!("{agg_func}({agg_col}) {op} ({rhs})") + } + } + + // Helper to generate LIMIT/OFFSET clause (optionally empty). Expressions may be subqueries. + fn gen_limit_offset_clause(rng: &mut ChaCha8Rng) -> String { + // 50% of the time, no LIMIT/OFFSET + if rng.random_bool(0.5) { + return String::new(); + } + + fn gen_limit_like_expr(rng: &mut ChaCha8Rng) -> String { + // Small literal or a scalar subquery from a random table + if rng.random_bool(0.6) { + // Keep literal sizes modest + format!("{}", rng.random_range(0..20)) + } else { + let which = rng.random_range(0..3); + match which { + 0 => "(SELECT COUNT(*) FROM t1)".to_string(), + 1 => "(SELECT COUNT(*) FROM t2)".to_string(), + _ => "(SELECT COUNT(*) FROM t3)".to_string(), + } + } + } + + let mut clause = String::new(); + let limit_expr = gen_limit_like_expr(rng); + clause.push_str(&format!(" LIMIT {limit_expr}",)); + if rng.random_bool(0.5) { + let offset_expr = gen_limit_like_expr(rng); + clause.push_str(&format!(" OFFSET {offset_expr}",)); + } + clause + } + for iter_num in 0..NUM_FUZZ_ITERATIONS { let main_table = ["t1", "t2", "t3"][rng.random_range(0..3)]; - let query_type = rng.random_range(0..6); // Increased from 4 to 6 for new correlated query types - let query = match query_type { + let query_type = rng.random_range(0..8); // Add GROUP BY/HAVING variants + let mut query = match query_type { 0 => { // Comparison subquery: WHERE column (SELECT ...) let column = match main_table { @@ -4628,13 +4821,13 @@ mod fuzz_tests { _ => "id", }; let op = [">", "<", ">=", "<=", "=", "<>"][rng.random_range(0..6)]; - let subquery = gen_scalar_subquery(&mut rng, 0, Some(main_table)); + let subquery = gen_scalar_subquery(&mut rng, 0, Some(main_table), None); format!("SELECT * FROM {main_table} WHERE {column} {op} ({subquery})",) } 1 => { // EXISTS subquery: WHERE [NOT] EXISTS (SELECT ...) let not_exists = if rng.random_bool(0.3) { "NOT " } else { "" }; - let subquery = gen_subquery(&mut rng, 0, Some(main_table)); + let subquery = gen_subquery(&mut rng, 0, Some(main_table), None); format!("SELECT * FROM {main_table} WHERE {not_exists}EXISTS ({subquery})",) } 2 => { @@ -4646,7 +4839,7 @@ mod fuzz_tests { "t3" => ["amount", "category", "id"][rng.random_range(0..3)], _ => "id", }; - let subquery = gen_scalar_subquery(&mut rng, 0, Some(main_table)); + let subquery = gen_scalar_subquery(&mut rng, 0, Some(main_table), None); format!("SELECT * FROM {main_table} WHERE {column} {not_in}IN ({subquery})",) } 3 => { @@ -4843,8 +5036,93 @@ mod fuzz_tests { "SELECT * FROM {main_table} WHERE {column} {op} (SELECT {select_column} FROM {inner_table} WHERE {correlated_condition})", ) } + 6 => { + // Aggregated query with GROUP BY and optional HAVING; allow subqueries in GROUP BY/HAVING + let group_expr = gen_group_by_expr(&mut rng, main_table); + // Only GROUP BY columns may be referenced by correlated subqueries + let allowed_outer_cols: Vec<&str> = match group_expr.as_str() { + "id" => vec!["id"], + "value1" => vec!["value1"], + "value2" => vec!["value2"], + "ref_id" => vec!["ref_id"], + "data" => vec!["data"], + "category" => vec!["category"], + "amount" => vec!["amount"], + _ => Vec::new(), + }; + let (agg_func, agg_col) = match main_table { + "t1" => [ + ("SUM", "value1"), + ("SUM", "value2"), + ("MAX", "value1"), + ("MAX", "value2"), + ("COUNT", "*"), + ][rng.random_range(0..5)], + "t2" => [("SUM", "data"), ("MAX", "data"), ("COUNT", "*")] + [rng.random_range(0..3)], + "t3" => [("SUM", "amount"), ("MAX", "amount"), ("COUNT", "*")] + [rng.random_range(0..3)], + _ => ("COUNT", "*"), + }; + let mut q; + if agg_col == "*" { + q = format!("SELECT {group_expr} AS g, COUNT(*) AS c FROM {main_table}"); + } else { + q = format!("SELECT {group_expr} AS g, {agg_func}({agg_col}) AS a FROM {main_table}"); + } + if rng.random_bool(0.5) { + q.push_str(&format!( + " WHERE {}", + gen_simple_where(&mut rng, main_table) + )); + } + q.push_str(&format!(" GROUP BY {group_expr}")); + if rng.random_bool(0.4) { + q.push_str(&format!( + " HAVING {}", + gen_having_condition(&mut rng, main_table, Some(&allowed_outer_cols)) + )); + } + q + } + 7 => { + // Simple GROUP BY without HAVING (baseline support); may use subquery in GROUP BY + let group_expr = gen_group_by_expr(&mut rng, main_table); + let select_expr = if rng.random_bool(0.5) { + // Use aggregate + match main_table { + "t1" => "SUM(value1) AS s".to_string(), + "t2" => "SUM(data) AS s".to_string(), + _ => "SUM(amount) AS s".to_string(), + } + } else { + "COUNT(*) AS c".to_string() + }; + let mut q = + format!("SELECT {group_expr} AS g, {select_expr} FROM {main_table}"); + if rng.random_bool(0.5) { + q.push_str(&format!( + " WHERE {}", + gen_simple_where(&mut rng, main_table) + )); + } + q.push_str(&format!(" GROUP BY {group_expr}")); + q + } _ => unreachable!(), }; + // Optionally inject a SELECT-list scalar subquery into non-aggregated SELECT * queries + if query.starts_with("SELECT * FROM ") && rng.random_bool(0.4) { + let sel_expr = gen_selectlist_scalar_expr(&mut rng, main_table); + let replacement = "SELECT *, (".to_string() + &sel_expr + ") AS s_sub FROM "; + query = query.replacen("SELECT * FROM ", &replacement, 1); + } + + // Optionally append LIMIT/OFFSET (with or without subqueries) + let limit_clause = gen_limit_offset_clause(&mut rng); + if !limit_clause.is_empty() { + query.push_str(&limit_clause); + } log::info!( "Iteration {}/{NUM_FUZZ_ITERATIONS}: Query: {query}",