Merge 'Support subqueries in all positions of a SELECT statement' from Jussi Saurio

Follow-up to #3847.
Adds support for subqueries in all other positions of a SELECT (the
result list, GROUP BY, ORDER BY, HAVING, LIMIT, OFFSET).
Turns out I am a sql noob and didn't realize that correlated subqueries
are supported in basically all positions except LIMIT/OFFSET, so added
support for those too + accompanying TCL tests.
Thankfully the abstractions introduced in #3847 carry over to this very
well so the code change is relatively small (over half of the diff is
tests and a lot of the remaining diff is just moving logic around).

Reviewed-by: Preston Thorpe <preston@turso.tech>

Closes #3852
This commit is contained in:
Jussi Saurio
2025-10-29 10:19:39 +02:00
committed by GitHub
9 changed files with 1080 additions and 383 deletions

View File

@@ -200,8 +200,9 @@ Feature support of [sqlite expr syntax](https://www.sqlite.org/lang_expr.html).
| IS (NOT) | Yes | |
| IS (NOT) DISTINCT FROM | Yes | |
| (NOT) BETWEEN ... AND ... | Yes | Expression is rewritten in the optimizer |
| (NOT) IN (subquery) | No | |
| (NOT) EXISTS (subquery) | No | |
| (NOT) IN (SELECT...) | Yes | |
| (NOT) EXISTS (SELECT...) | Yes | |
| x <operator> (SELECT...)) | Partial | Only scalar subqueries supported, i.e. not (x,y) = (SELECT...)
| CASE WHEN THEN ELSE END | Yes | |
| RAISE | No | |

View File

@@ -35,8 +35,9 @@ use crate::translate::fkeys::{
emit_fk_delete_parent_existence_checks, emit_guarded_fk_decrement,
emit_parent_key_change_checks, open_read_index, open_read_table, stabilize_new_row_for_fk,
};
use crate::translate::plan::{DeletePlan, JoinedTable, Plan, QueryDestination, Search};
use crate::translate::plan::{DeletePlan, EvalAt, JoinedTable, Plan, QueryDestination, Search};
use crate::translate::planner::ROWID_STRS;
use crate::translate::subquery::emit_non_from_clause_subquery;
use crate::translate::values::emit_values;
use crate::translate::window::{emit_window_results, init_window, WindowMetadata};
use crate::util::{exprs_are_equivalent, normalize_ident};
@@ -278,6 +279,27 @@ pub fn emit_query<'a>(
return Ok(reg_result_cols_start);
}
// Evaluate uncorrelated subqueries as early as possible, because even LIMIT can reference a subquery.
for subquery in plan
.non_from_clause_subqueries
.iter_mut()
.filter(|s| !s.has_been_evaluated())
{
let eval_at = subquery.get_eval_at(&plan.join_order)?;
if eval_at != EvalAt::BeforeLoop {
continue;
}
let plan = subquery.consume_plan(EvalAt::BeforeLoop);
emit_non_from_clause_subquery(
program,
t_ctx,
*plan,
&subquery.query_type,
subquery.correlated,
)?;
}
// Emit FROM clause subqueries first so the results can be read in the main query loop.
emit_from_clause_subqueries(program, t_ctx, &mut plan.table_references)?;

View File

@@ -321,6 +321,8 @@ fn collect_non_aggregate_expressions<'a>(
Ok(())
}
/// Collects columns from different parts of a SELECT that are needed for
/// GROUP BY.
fn collect_result_columns<'a>(
root_expr: &'a ast::Expr,
plan: &SelectPlan,
@@ -337,6 +339,29 @@ fn collect_result_columns<'a>(
result_columns.push(expr);
}
}
// SubqueryResult is an exception because we can't "extract" columns from it
// unlike other expressions like function calls or direct column references,
// so we must add it so that the subquery result gets collected to the GROUP BY
// columns.
//
// However, if the subquery is of the form: 'aggregate_result IN (SELECT...)', we need to skip it because the aggregation
// is done later.
ast::Expr::SubqueryResult { lhs, .. } => {
if let Some(ref lhs) = lhs {
let mut lhs_contains_agg = false;
walk_expr(lhs, &mut |expr: &ast::Expr| -> Result<WalkControl> {
if plan.aggregates.iter().any(|a| a.original_expr == *expr) {
lhs_contains_agg = true;
return Ok(WalkControl::SkipChildren);
}
Ok(WalkControl::Continue)
})?;
if lhs_contains_agg {
return Ok(WalkControl::SkipChildren);
}
}
result_columns.push(expr);
}
_ => {
if plan.aggregates.iter().any(|a| a.original_expr == *expr) {
return Ok(WalkControl::SkipChildren);

View File

@@ -418,22 +418,6 @@ pub fn init_loop(
}
}
for subquery in subqueries.iter_mut().filter(|s| !s.has_been_evaluated()) {
let eval_at = subquery.get_eval_at(join_order)?;
if eval_at != EvalAt::BeforeLoop {
continue;
}
let plan = subquery.consume_plan(EvalAt::BeforeLoop);
emit_non_from_clause_subquery(
program,
t_ctx,
*plan,
&subquery.query_type,
subquery.correlated,
)?;
}
for cond in where_clause
.iter()
.filter(|c| c.should_eval_before_loop(join_order, subqueries))

View File

@@ -1390,6 +1390,16 @@ pub enum SubqueryState {
Evaluated { evaluated_at: EvalAt },
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SubqueryPosition {
ResultColumn,
Where,
GroupBy,
Having,
OrderBy,
LimitOffset,
}
#[derive(Debug, Clone)]
/// A subquery that is not part of the `FROM` clause.
/// This is used for subqueries in the WHERE clause, HAVING clause, ORDER BY clause, LIMIT clause, OFFSET clause, etc.

View File

@@ -13,7 +13,7 @@ use crate::translate::planner::{
break_predicate_at_and_boundaries, parse_from, parse_limit, parse_where,
resolve_window_and_aggregate_functions,
};
use crate::translate::subquery::plan_subqueries_from_where_clause;
use crate::translate::subquery::plan_subqueries_from_select_plan;
use crate::translate::window::plan_windows;
use crate::util::normalize_ident;
use crate::vdbe::builder::ProgramBuilderOpts;
@@ -504,24 +504,7 @@ fn prepare_one_select_plan(
)?;
}
// TODO: support subqueries in the following positions:
// - result column of a select, e.g.: SELECT x = (SELECT ...) FROM t
// - GROUP BY clause, e.g.: SELECT * FROM t GROUP BY (SELECT ...) <-- however nonsensical that might be...
// - HAVING clause, e.g.: SELECT * FROM t GROUP BY x HAVING x = (SELECT ...)
// - ORDER BY clause, e.g.: SELECT * FROM t ORDER BY x = (SELECT ...)
// - LIMIT clause, e.g.: SELECT * FROM t LIMIT (SELECT ...)
// - OFFSET clause, e.g.: SELECT * FROM t OFFSET (SELECT ...)
//
// in these positions, unlike in the WHERE clause, the subquery cannot reference columns from the outer query,
// so we don't need to collect outer query references.
plan_subqueries_from_where_clause(
program,
&mut plan.non_from_clause_subqueries,
&mut plan.table_references,
resolver,
&mut plan.where_clause,
connection,
)?;
plan_subqueries_from_select_plan(program, &mut plan, resolver, connection)?;
// Return the unoptimized query plan
Ok(plan)

View File

@@ -11,8 +11,8 @@ use crate::{
expr::{unwrap_parens, walk_expr_mut, WalkControl},
optimizer::optimize_select_plan,
plan::{
ColumnUsedMask, NonFromClauseSubquery, OuterQueryReference, Plan, SubqueryState,
WhereTerm,
ColumnUsedMask, NonFromClauseSubquery, OuterQueryReference, Plan, SubqueryPosition,
SubqueryState,
},
select::prepare_select_plan,
},
@@ -29,22 +29,117 @@ use super::{
plan::{Operation, QueryDestination, Search, SelectPlan, TableReferences},
};
/// Compute query plans for subqueries in the WHERE clause.
// Compute query plans for subqueries occurring in any position other than the FROM clause.
// This includes the WHERE clause, HAVING clause, GROUP BY clause, ORDER BY clause, LIMIT clause, and OFFSET clause.
/// The AST expression containing the subquery ([ast::Expr::Exists], [ast::Expr::Subquery], [ast::Expr::InSelect]) is replaced with a [ast::Expr::SubqueryResult] expression.
/// The [ast::Expr::SubqueryResult] expression contains the subquery ID, the left-hand side expression (only applicable to IN subqueries), the NOT IN flag (only applicable to IN subqueries), and the subquery type.
/// The computed plans are stored in the [NonFromClauseSubquery] structs on the [SelectPlan], and evaluated at the appropriate time during the translation of the main query.
/// The appropriate time is determined by whether the subquery is correlated or uncorrelated;
/// if it is uncorrelated, it can be evaluated as early as possible, but if it is correlated, it must be evaluated after all of its dependencies from the
/// outer query are 'in scope', i.e. their cursors are open and rewound.
pub fn plan_subqueries_from_where_clause(
pub fn plan_subqueries_from_select_plan(
program: &mut ProgramBuilder,
plan: &mut SelectPlan,
resolver: &Resolver,
connection: &Arc<Connection>,
) -> Result<()> {
// WHERE
plan_subqueries_with_outer_query_access(
program,
&mut plan.non_from_clause_subqueries,
&mut plan.table_references,
resolver,
plan.where_clause.iter_mut().map(|t| &mut t.expr),
connection,
SubqueryPosition::Where,
)?;
// GROUP BY
if let Some(group_by) = &mut plan.group_by {
plan_subqueries_with_outer_query_access(
program,
&mut plan.non_from_clause_subqueries,
&mut plan.table_references,
resolver,
group_by.exprs.iter_mut(),
connection,
SubqueryPosition::GroupBy,
)?;
if let Some(having) = group_by.having.as_mut() {
plan_subqueries_with_outer_query_access(
program,
&mut plan.non_from_clause_subqueries,
&mut plan.table_references,
resolver,
having.iter_mut(),
connection,
SubqueryPosition::Having,
)?;
}
}
// Result columns
plan_subqueries_with_outer_query_access(
program,
&mut plan.non_from_clause_subqueries,
&mut plan.table_references,
resolver,
plan.result_columns.iter_mut().map(|c| &mut c.expr),
connection,
SubqueryPosition::ResultColumn,
)?;
// ORDER BY
plan_subqueries_with_outer_query_access(
program,
&mut plan.non_from_clause_subqueries,
&mut plan.table_references,
resolver,
plan.order_by.iter_mut().map(|(expr, _)| &mut **expr),
connection,
SubqueryPosition::OrderBy,
)?;
// LIMIT and OFFSET cannot reference columns from the outer query
let get_outer_query_refs = |_: &TableReferences| vec![];
{
let mut subquery_parser = get_subquery_parser(
program,
&mut plan.non_from_clause_subqueries,
&mut plan.table_references,
resolver,
connection,
get_outer_query_refs,
SubqueryPosition::LimitOffset,
);
// Limit
if let Some(limit) = &mut plan.limit {
walk_expr_mut(limit, &mut subquery_parser)?;
}
// Offset
if let Some(offset) = &mut plan.offset {
walk_expr_mut(offset, &mut subquery_parser)?;
}
}
update_column_used_masks(
&mut plan.table_references,
&mut plan.non_from_clause_subqueries,
);
Ok(())
}
/// Compute query plans for subqueries in the WHERE clause and HAVING clause (both of which have access to the outer query scope)
fn plan_subqueries_with_outer_query_access<'a>(
program: &mut ProgramBuilder,
out_subqueries: &mut Vec<NonFromClauseSubquery>,
referenced_tables: &mut TableReferences,
resolver: &Resolver,
where_terms: &mut [WhereTerm],
exprs: impl Iterator<Item = &'a mut ast::Expr>,
connection: &Arc<Connection>,
position: SubqueryPosition,
) -> Result<()> {
// A WHERE clause subquery can reference columns from the outer query,
// Most subqueries can reference columns from the outer query,
// including nested cases where a subquery inside a subquery references columns from its parent's parent
// and so on.
let get_outer_query_refs = |referenced_tables: &TableReferences| {
@@ -71,241 +166,261 @@ pub fn plan_subqueries_from_where_clause(
.collect::<Vec<_>>()
};
// Walk the WHERE clause and replace subqueries with [ast::Expr::SubqueryResult] expressions.
for where_term in where_terms.iter_mut() {
walk_expr_mut(
&mut where_term.expr,
&mut |expr: &mut ast::Expr| -> Result<WalkControl> {
match expr {
ast::Expr::Exists(_) => {
let subquery_id = program.table_reference_counter.next();
let outer_query_refs = get_outer_query_refs(referenced_tables);
let result_reg = program.alloc_register();
let subquery_type = SubqueryType::Exists { result_reg };
let result_expr = ast::Expr::SubqueryResult {
subquery_id,
lhs: None,
not_in: false,
query_type: subquery_type.clone(),
};
let ast::Expr::Exists(subselect) = std::mem::replace(expr, result_expr)
else {
unreachable!();
};
let plan = prepare_select_plan(
subselect,
resolver,
program,
&outer_query_refs,
QueryDestination::ExistsSubqueryResult { result_reg },
connection,
)?;
let Plan::Select(mut plan) = plan else {
crate::bail_parse_error!(
"compound SELECT queries not supported yet in WHERE clause subqueries"
);
};
optimize_select_plan(&mut plan, resolver.schema)?;
// EXISTS subqueries are satisfied after at most 1 row has been returned.
plan.limit = Some(Box::new(ast::Expr::Literal(ast::Literal::Numeric(
"1".to_string(),
))));
let correlated = plan.is_correlated();
out_subqueries.push(NonFromClauseSubquery {
internal_id: subquery_id,
query_type: subquery_type,
state: SubqueryState::Unevaluated {
plan: Some(Box::new(plan)),
},
correlated,
});
Ok(WalkControl::Continue)
}
ast::Expr::Subquery(_) => {
let subquery_id = program.table_reference_counter.next();
let outer_query_refs = get_outer_query_refs(referenced_tables);
let result_expr = ast::Expr::SubqueryResult {
subquery_id,
lhs: None,
not_in: false,
// Placeholder values because the number of columns returned is not known until the plan is prepared.
// These are replaced below after planning.
query_type: SubqueryType::RowValue {
result_reg_start: 0,
num_regs: 0,
},
};
let ast::Expr::Subquery(subselect) = std::mem::replace(expr, result_expr)
else {
unreachable!();
};
let plan = prepare_select_plan(
subselect,
resolver,
program,
&outer_query_refs,
QueryDestination::Unset,
connection,
)?;
let Plan::Select(mut plan) = plan else {
crate::bail_parse_error!(
"compound SELECT queries not supported yet in WHERE clause subqueries"
);
};
optimize_select_plan(&mut plan, resolver.schema)?;
let reg_count = plan.result_columns.len();
let reg_start = program.alloc_registers(reg_count);
plan.query_destination = QueryDestination::RowValueSubqueryResult {
result_reg_start: reg_start,
num_regs: reg_count,
};
// RowValue subqueries are satisfied after at most 1 row has been returned,
// as they are used in comparisons with a scalar or a tuple of scalars like (x,y) = (SELECT ...) or x = (SELECT ...).
plan.limit = Some(Box::new(ast::Expr::Literal(ast::Literal::Numeric(
"1".to_string(),
))));
let ast::Expr::SubqueryResult {
subquery_id,
lhs: None,
not_in: false,
query_type:
SubqueryType::RowValue {
result_reg_start,
num_regs,
},
} = &mut *expr
else {
unreachable!();
};
*result_reg_start = reg_start;
*num_regs = reg_count;
let correlated = plan.is_correlated();
out_subqueries.push(NonFromClauseSubquery {
internal_id: *subquery_id,
query_type: SubqueryType::RowValue {
result_reg_start: reg_start,
num_regs: reg_count,
},
state: SubqueryState::Unevaluated {
plan: Some(Box::new(plan)),
},
correlated,
});
Ok(WalkControl::Continue)
}
ast::Expr::InSelect { .. } => {
let subquery_id = program.table_reference_counter.next();
let outer_query_refs = get_outer_query_refs(referenced_tables);
let ast::Expr::InSelect { lhs, not, rhs } =
std::mem::replace(expr, ast::Expr::Literal(ast::Literal::Null))
else {
unreachable!();
};
let plan = prepare_select_plan(
rhs,
resolver,
program,
&outer_query_refs,
QueryDestination::Unset,
connection,
)?;
let Plan::Select(mut plan) = plan else {
crate::bail_parse_error!(
"compound SELECT queries not supported yet in WHERE clause subqueries"
);
};
optimize_select_plan(&mut plan, resolver.schema)?;
// e.g. (x,y) IN (SELECT ...)
// or x IN (SELECT ...)
let lhs_column_count = match unwrap_parens(lhs.as_ref())? {
ast::Expr::Parenthesized(exprs) => exprs.len(),
_ => 1,
};
if lhs_column_count != plan.result_columns.len() {
crate::bail_parse_error!(
"lhs of IN subquery must have the same number of columns as the subquery"
);
}
let mut columns = plan
.result_columns
.iter()
.enumerate()
.map(|(i, c)| IndexColumn {
name: c.name(&plan.table_references).unwrap_or("").to_string(),
order: SortOrder::Asc,
pos_in_table: i,
collation: None,
default: None,
})
.collect::<Vec<_>>();
for (i, column) in columns.iter_mut().enumerate() {
column.collation = get_collseq_from_expr(
&plan.result_columns[i].expr,
&plan.table_references,
)?;
}
let ephemeral_index = Arc::new(Index {
columns,
name: format!("ephemeral_index_where_sub_{subquery_id}"),
table_name: String::new(),
ephemeral: true,
has_rowid: false,
root_page: 0,
unique: false,
where_clause: None,
index_method: None,
});
let cursor_id = program
.alloc_cursor_id(CursorType::BTreeIndex(ephemeral_index.clone()));
plan.query_destination = QueryDestination::EphemeralIndex {
cursor_id,
index: ephemeral_index.clone(),
is_delete: false,
};
*expr = ast::Expr::SubqueryResult {
subquery_id,
lhs: Some(lhs),
not_in: not,
query_type: SubqueryType::In { cursor_id },
};
let correlated = plan.is_correlated();
out_subqueries.push(NonFromClauseSubquery {
internal_id: subquery_id,
query_type: SubqueryType::In { cursor_id },
state: SubqueryState::Unevaluated {
plan: Some(Box::new(plan)),
},
correlated,
});
Ok(WalkControl::Continue)
}
_ => Ok(WalkControl::Continue),
}
},
)?;
let mut subquery_parser = get_subquery_parser(
program,
out_subqueries,
referenced_tables,
resolver,
connection,
get_outer_query_refs,
position,
);
for expr in exprs {
walk_expr_mut(expr, &mut subquery_parser)?;
}
update_column_used_masks(referenced_tables, out_subqueries);
Ok(())
}
/// Create a closure that will walk the AST and replace subqueries with [ast::Expr::SubqueryResult] expressions.
fn get_subquery_parser<'a>(
program: &'a mut ProgramBuilder,
out_subqueries: &'a mut Vec<NonFromClauseSubquery>,
referenced_tables: &'a mut TableReferences,
resolver: &'a Resolver,
connection: &'a Arc<Connection>,
get_outer_query_refs: fn(&TableReferences) -> Vec<OuterQueryReference>,
position: SubqueryPosition,
) -> impl FnMut(&mut ast::Expr) -> Result<WalkControl> + 'a {
move |expr: &mut ast::Expr| -> Result<WalkControl> {
match expr {
ast::Expr::Exists(_) => {
let subquery_id = program.table_reference_counter.next();
let outer_query_refs = get_outer_query_refs(referenced_tables);
let result_reg = program.alloc_register();
let subquery_type = SubqueryType::Exists { result_reg };
let result_expr = ast::Expr::SubqueryResult {
subquery_id,
lhs: None,
not_in: false,
query_type: subquery_type.clone(),
};
let ast::Expr::Exists(subselect) = std::mem::replace(expr, result_expr) else {
unreachable!();
};
let plan = prepare_select_plan(
subselect,
resolver,
program,
&outer_query_refs,
QueryDestination::ExistsSubqueryResult { result_reg },
connection,
)?;
let Plan::Select(mut plan) = plan else {
crate::bail_parse_error!(
"compound SELECT queries not supported yet in WHERE clause subqueries"
);
};
optimize_select_plan(&mut plan, resolver.schema)?;
// EXISTS subqueries are satisfied after at most 1 row has been returned.
plan.limit = Some(Box::new(ast::Expr::Literal(ast::Literal::Numeric(
"1".to_string(),
))));
let correlated = plan.is_correlated();
out_subqueries.push(NonFromClauseSubquery {
internal_id: subquery_id,
query_type: subquery_type,
state: SubqueryState::Unevaluated {
plan: Some(Box::new(plan)),
},
correlated,
});
Ok(WalkControl::Continue)
}
ast::Expr::Subquery(_) => {
let subquery_id = program.table_reference_counter.next();
let outer_query_refs = get_outer_query_refs(referenced_tables);
let result_expr = ast::Expr::SubqueryResult {
subquery_id,
lhs: None,
not_in: false,
// Placeholder values because the number of columns returned is not known until the plan is prepared.
// These are replaced below after planning.
query_type: SubqueryType::RowValue {
result_reg_start: 0,
num_regs: 0,
},
};
let ast::Expr::Subquery(subselect) = std::mem::replace(expr, result_expr) else {
unreachable!();
};
let plan = prepare_select_plan(
subselect,
resolver,
program,
&outer_query_refs,
QueryDestination::Unset,
connection,
)?;
let Plan::Select(mut plan) = plan else {
crate::bail_parse_error!(
"compound SELECT queries not supported yet in WHERE clause subqueries"
);
};
optimize_select_plan(&mut plan, resolver.schema)?;
let reg_count = plan.result_columns.len();
let reg_start = program.alloc_registers(reg_count);
plan.query_destination = QueryDestination::RowValueSubqueryResult {
result_reg_start: reg_start,
num_regs: reg_count,
};
// RowValue subqueries are satisfied after at most 1 row has been returned,
// as they are used in comparisons with a scalar or a tuple of scalars like (x,y) = (SELECT ...) or x = (SELECT ...).
plan.limit = Some(Box::new(ast::Expr::Literal(ast::Literal::Numeric(
"1".to_string(),
))));
let ast::Expr::SubqueryResult {
subquery_id,
lhs: None,
not_in: false,
query_type:
SubqueryType::RowValue {
result_reg_start,
num_regs,
},
} = &mut *expr
else {
unreachable!();
};
*result_reg_start = reg_start;
*num_regs = reg_count;
let correlated = plan.is_correlated();
out_subqueries.push(NonFromClauseSubquery {
internal_id: *subquery_id,
query_type: SubqueryType::RowValue {
result_reg_start: reg_start,
num_regs: reg_count,
},
state: SubqueryState::Unevaluated {
plan: Some(Box::new(plan)),
},
correlated,
});
Ok(WalkControl::Continue)
}
ast::Expr::InSelect { .. } => {
let subquery_id = program.table_reference_counter.next();
let outer_query_refs = get_outer_query_refs(referenced_tables);
let ast::Expr::InSelect { lhs, not, rhs } =
std::mem::replace(expr, ast::Expr::Literal(ast::Literal::Null))
else {
unreachable!();
};
let plan = prepare_select_plan(
rhs,
resolver,
program,
&outer_query_refs,
QueryDestination::Unset,
connection,
)?;
let Plan::Select(mut plan) = plan else {
crate::bail_parse_error!(
"compound SELECT queries not supported yet in WHERE clause subqueries"
);
};
optimize_select_plan(&mut plan, resolver.schema)?;
// e.g. (x,y) IN (SELECT ...)
// or x IN (SELECT ...)
let lhs_column_count = match unwrap_parens(lhs.as_ref())? {
ast::Expr::Parenthesized(exprs) => exprs.len(),
_ => 1,
};
if lhs_column_count != plan.result_columns.len() {
crate::bail_parse_error!(
"lhs of IN subquery must have the same number of columns as the subquery"
);
}
let mut columns = plan
.result_columns
.iter()
.enumerate()
.map(|(i, c)| IndexColumn {
name: c.name(&plan.table_references).unwrap_or("").to_string(),
order: SortOrder::Asc,
pos_in_table: i,
collation: None,
default: None,
})
.collect::<Vec<_>>();
for (i, column) in columns.iter_mut().enumerate() {
column.collation = get_collseq_from_expr(
&plan.result_columns[i].expr,
&plan.table_references,
)?;
}
let ephemeral_index = Arc::new(Index {
columns,
name: format!("ephemeral_index_where_sub_{subquery_id}"),
table_name: String::new(),
ephemeral: true,
has_rowid: false,
root_page: 0,
unique: false,
where_clause: None,
index_method: None,
});
let cursor_id =
program.alloc_cursor_id(CursorType::BTreeIndex(ephemeral_index.clone()));
plan.query_destination = QueryDestination::EphemeralIndex {
cursor_id,
index: ephemeral_index.clone(),
is_delete: false,
};
*expr = ast::Expr::SubqueryResult {
subquery_id,
lhs: Some(lhs),
not_in: not,
query_type: SubqueryType::In { cursor_id },
};
let correlated = plan.is_correlated();
if correlated && position == SubqueryPosition::Having {
crate::bail_parse_error!(
"correlated IN subqueries in HAVING clause are not supported yet"
);
}
out_subqueries.push(NonFromClauseSubquery {
internal_id: subquery_id,
query_type: SubqueryType::In { cursor_id },
state: SubqueryState::Unevaluated {
plan: Some(Box::new(plan)),
},
correlated,
});
Ok(WalkControl::Continue)
}
_ => Ok(WalkControl::Continue),
}
}
}
/// We make decisions about when to evaluate expressions or whether to use covering indexes based on
/// which columns of a table have been referenced.
/// Since subquery nesting is arbitrarily deep, a reference to a column must propagate recursively

View File

@@ -638,4 +638,283 @@ do_execsql_test_on_specific_db {:memory:} subquery-not-in-multiple-columns {
select name, age from restricted_profiles
);
} {1|Alice|25
3|Charlie|25}
3|Charlie|25}
# SUBQUERIES IN OTHER POSITIONS (result columns, GROUP BY, ORDER BY, HAVING, LIMIT, OFFSET)
# Uncorrelated subquery in result column
do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-result-column {
create table employees(id, name, dept_id);
create table company_info(total_depts);
insert into employees values (1, 'Alice', 10), (2, 'Bob', 20);
insert into company_info values (5);
select id, name, (select total_depts from company_info) as total_depts from employees;
} {1|Alice|5
2|Bob|5}
# Correlated subquery in result column
do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-result-column {
create table employees(id, name, dept_id);
create table departments(id, name);
insert into employees values (1, 'Alice', 10), (2, 'Bob', 20);
insert into departments values (10, 'Sales'), (20, 'Engineering');
select id, name, (select name from departments where id = dept_id) as dept_name from employees;
} {1|Alice|Sales
2|Bob|Engineering}
# Correlated subquery in result column with join
do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-result-column-with-join {
create table employees(id, name, dept_id, manager_id);
create table departments(id, name);
create table managers(id, name);
insert into employees values (1, 'Alice', 10, 100), (2, 'Bob', 20, 200);
insert into departments values (10, 'Sales'), (20, 'Engineering');
insert into managers values (100, 'Carol'), (200, 'Dave');
select e.id, e.name, m.name as manager, (select name from departments where id = e.dept_id) as dept_name
from employees e join managers m on e.manager_id = m.id;
} {1|Alice|Carol|Sales
2|Bob|Dave|Engineering}
# Uncorrelated IN-subquery in result column
do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-result-column-in {
create table employees(id, name, dept_id);
create table special_depts(dept_id);
insert into employees values (1, 'Alice', 10), (2, 'Bob', 20), (3, 'Charlie', 30);
insert into special_depts values (10), (20);
select id, name, dept_id in (select dept_id from special_depts) as is_special from employees;
} {1|Alice|1
2|Bob|1
3|Charlie|0}
# Correlated IN-subquery in result column
do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-result-column-in {
create table employees(id, name, dept_id);
create table dept_awards(dept_id, employee_id);
insert into employees values (1, 'Alice', 10), (2, 'Bob', 20), (3, 'Charlie', 10);
insert into dept_awards values (10, 1), (20, 2);
select id, name, id in (select employee_id from dept_awards where dept_id = employees.dept_id) as has_award from employees;
} {1|Alice|1
2|Bob|1
3|Charlie|0}
# Uncorrelated subquery in GROUP BY clause
do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-group-by {
create table sales(id, amount, region_id);
create table grouping_config(group_column);
insert into sales values (1, 100, 1), (2, 200, 1), (3, 150, 2);
insert into grouping_config values ('region_id');
select region_id, sum(amount)
from sales
group by (select case when group_column = 'region_id' then region_id else amount end from grouping_config);
} {1|300
2|150}
# Correlated subquery in GROUP BY clause
do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-group-by {
create table sales(id, amount, region_id);
create table regions(id, name);
insert into sales values (1, 100, 1), (2, 200, 1), (3, 150, 2);
insert into regions values (1, 'North'), (2, 'South');
select (select name from regions where id = region_id) as region, sum(amount)
from sales
group by (select name from regions where id = region_id);
} {North|300
South|150}
# Correlated subquery in GROUP BY clause with join
do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-group-by-with-join {
create table sales(id, amount, region_id, salesperson_id);
create table regions(id, name);
create table salespeople(id, name);
insert into sales values (1, 100, 1, 10), (2, 200, 1, 20), (3, 150, 2, 10);
insert into regions values (1, 'North'), (2, 'South');
insert into salespeople values (10, 'Alice'), (20, 'Bob');
select (select name from regions where id = s.region_id) as region, sp.name as salesperson, sum(amount)
from sales s join salespeople sp on s.salesperson_id = sp.id
group by (select name from regions where id = s.region_id), sp.name;
} {North|Alice|100
North|Bob|200
South|Alice|150}
# Uncorrelated subquery in ORDER BY clause
do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-order-by {
create table products(id, name, category_id);
create table sort_config(sort_order);
insert into products values (1, 'hat', 2), (2, 'laptop', 1), (3, 'book', 3);
insert into sort_config values ('asc');
select id, name from products
order by (select case when sort_order = 'asc' then id else -id end from sort_config);
} {1|hat
2|laptop
3|book}
# Correlated subquery in ORDER BY clause
do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-order-by {
create table products(id, name, category_id);
create table categories(id, priority);
insert into products values (1, 'hat', 2), (2, 'laptop', 1), (3, 'book', 3);
insert into categories values (1, 10), (2, 20), (3, 5);
select id, name from products
order by (select priority from categories where id = category_id);
} {3|book
2|laptop
1|hat}
# Correlated subquery in ORDER BY clause with join
do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-order-by-with-join {
create table products(id, name, category_id, supplier_id);
create table categories(id, priority);
create table suppliers(id, name);
insert into products values (1, 'hat', 2, 100), (2, 'laptop', 1, 200), (3, 'book', 3, 100);
insert into categories values (1, 10), (2, 20), (3, 5);
insert into suppliers values (100, 'SupplierA'), (200, 'SupplierB');
select p.id, p.name, s.name as supplier
from products p join suppliers s on p.supplier_id = s.id
order by (select priority from categories where id = p.category_id);
} {3|book|SupplierA
2|laptop|SupplierB
1|hat|SupplierA}
# Uncorrelated IN-subquery in ORDER BY clause
do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-order-by-in {
create table products(id, name, category_id);
create table priority_categories(category_id);
insert into products values (1, 'hat', 2), (2, 'laptop', 1), (3, 'book', 3);
insert into priority_categories values (1), (3);
select id, name from products
order by category_id in (select category_id from priority_categories) desc, id;
} {2|laptop
3|book
1|hat}
# Correlated IN-subquery in ORDER BY clause
do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-order-by-in {
create table products(id, name, category_id);
create table category_priorities(category_id, priority);
insert into products values (1, 'hat', 2), (2, 'laptop', 1), (3, 'book', 3);
insert into category_priorities values (1, 10), (2, 20), (3, 5);
select id, name from products
order by category_id in (select category_id from category_priorities where priority > 8), id;
} {3|book
1|hat
2|laptop}
# Uncorrelated subquery in HAVING clause
do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-having {
create table orders(id, customer_id, amount);
create table vip_threshold(min_amount);
insert into orders values (1, 100, 50), (2, 100, 150), (3, 200, 30);
insert into vip_threshold values (100);
select customer_id, sum(amount) as total
from orders
group by customer_id
having total > (select min_amount from vip_threshold);
} {100|200}
# Uncorrelated IN-subquery in HAVING clause
do_execsql_test_on_specific_db {:memory:} subquery-uncorrelated-in-having-in {
create table orders(id, customer_id, amount);
create table target_totals(total_amount);
insert into orders values (1, 100, 50), (2, 100, 150), (3, 200, 30), (4, 300, 100);
insert into target_totals values (200), (100);
select customer_id, sum(amount) as total
from orders
group by customer_id
having total in (select total_amount from target_totals);
} {100|200
300|100}
# Correlated subquery in HAVING clause
do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-having {
create table orders(id, customer_id, amount);
create table customer_thresholds(customer_id, min_amount);
insert into orders values (1, 100, 50), (2, 100, 150), (3, 200, 30), (4, 200, 80);
insert into customer_thresholds values (100, 100), (200, 150);
select customer_id, sum(amount) as total
from orders
group by customer_id
having total > (select min_amount from customer_thresholds where customer_thresholds.customer_id = orders.customer_id);
} {100|200}
# Correlated IN-subquery in HAVING clause
# FIXME: currently disabled
# do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-having-in {
# create table orders(id, customer_id, amount);
# create table customer_targets(customer_id, target_amount);
# insert into orders values (1, 100, 50), (2, 100, 150), (3, 200, 30), (4, 200, 80);
# insert into customer_targets values (100, 200), (100, 250), (200, 110);
#
# select customer_id, sum(amount) as total
# from orders
# group by customer_id
# having total in (select target_amount from customer_targets where customer_targets.customer_id = orders.customer_id);
# } {100|200
# 200|110}
# Correlated subquery in HAVING clause with join
do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-having-with-join {
create table orders(id, customer_id, amount, region_id);
create table customer_thresholds(customer_id, min_amount);
create table regions(id, name);
insert into orders values (1, 100, 50, 1), (2, 100, 150, 1), (3, 200, 30, 2), (4, 200, 80, 2);
insert into customer_thresholds values (100, 100), (200, 150);
insert into regions values (1, 'East'), (2, 'West');
select o.customer_id, r.name as region, sum(o.amount) as total
from orders o join regions r on o.region_id = r.id
group by o.customer_id, r.name
having total > (select min_amount from customer_thresholds where customer_thresholds.customer_id = o.customer_id);
} {100|East|200}
# Correlated IN-subquery in HAVING clause with join
# do_execsql_test_on_specific_db {:memory:} subquery-correlated-in-subquery-in-having-with-join {
# create table orders(id, customer_id, amount, region_id);
# create table customer_thresholds(customer_id, min_amount);
# create table regions(id, name);
# insert into orders values (1, 100, 50, 1), (2, 100, 150, 1), (3, 200, 30, 2), (4, 200, 80, 2);
# insert into customer_thresholds values (100, 100), (200, 150);
# insert into regions values (1, 'East'), (2, 'West');
#
# select o.customer_id, r.name as region, sum(o.amount) as total
# from orders o join regions r on o.region_id = r.id
# group by o.customer_id, r.name
# having total + 40 in (select min_amount from customer_thresholds where customer_thresholds.customer_id = o.customer_id);
# } {200|West|110}
# Uncorrelated subquery in LIMIT clause
do_execsql_test_on_specific_db {:memory:} subquery-in-limit {
create table items(id, name);
create table config(max_results);
insert into items values (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd');
insert into config values (2);
select * from items limit (select max_results from config);
} {1|a
2|b}
# Uncorrelated subquery in OFFSET clause
do_execsql_test_on_specific_db {:memory:} subquery-in-offset {
create table items(id, name);
create table config(skip_count);
insert into items values (1, 'a'), (2, 'b'), (3, 'c');
insert into config values (1);
select * from items limit 2 offset (select skip_count from config);
} {2|b
3|c}

View File

@@ -4295,7 +4295,12 @@ mod fuzz_tests {
};
// Helper function to generate simple subquery
fn gen_subquery(rng: &mut ChaCha8Rng, depth: usize, outer_table: Option<&str>) -> String {
fn gen_subquery(
rng: &mut ChaCha8Rng,
depth: usize,
outer_table: Option<&str>,
allowed_outer_cols: Option<&[&str]>,
) -> String {
if depth > MAX_SUBQUERY_DEPTH {
return "SELECT 1".to_string();
}
@@ -4328,56 +4333,96 @@ mod fuzz_tests {
// Helper function to generate correlated WHERE conditions
let gen_correlated_where =
|rng: &mut ChaCha8Rng, inner_table: &str, outer_table: &str| -> String {
let pick =
|rng: &mut ChaCha8Rng, mut candidates: Vec<(String, &'static str)>| {
let filtered: Vec<String> = if let Some(allowed) = allowed_outer_cols {
candidates
.drain(..)
.filter(|(_, col)| allowed.contains(col))
.map(|(cond, _)| cond)
.collect()
} else {
candidates.drain(..).map(|(cond, _)| cond).collect()
};
if filtered.is_empty() {
"1=1".to_string()
} else {
filtered[rng.random_range(0..filtered.len())].clone()
}
};
match (outer_table, inner_table) {
("t1", "t2") => {
// t2.ref_id relates to t1.id
let conditions = [
format!("{inner_table}.ref_id = {outer_table}.id"),
format!("{inner_table}.id < {outer_table}.value1"),
format!("{inner_table}.data > {outer_table}.value2"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t1", "t3") => {
let conditions = [
format!("{inner_table}.id = {outer_table}.id"),
format!("{inner_table}.category < {outer_table}.value1"),
format!("{inner_table}.amount > {outer_table}.value2"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t2", "t1") => {
let conditions = [
format!("{inner_table}.id = {outer_table}.ref_id"),
format!("{inner_table}.value1 > {outer_table}.data"),
format!("{inner_table}.value2 < {outer_table}.id"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t2", "t3") => {
let conditions = [
format!("{inner_table}.id = {outer_table}.id"),
format!("{inner_table}.category = {outer_table}.ref_id"),
format!("{inner_table}.amount > {outer_table}.data"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t3", "t1") => {
let conditions = [
format!("{inner_table}.id = {outer_table}.id"),
format!("{inner_table}.value1 > {outer_table}.category"),
format!("{inner_table}.value2 < {outer_table}.amount"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t3", "t2") => {
let conditions = [
format!("{inner_table}.id = {outer_table}.id"),
format!("{inner_table}.ref_id = {outer_table}.category"),
format!("{inner_table}.data < {outer_table}.amount"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t1", "t2") => pick(
rng,
vec![
(format!("{inner_table}.ref_id = {outer_table}.id"), "id"),
(format!("{inner_table}.id < {outer_table}.value1"), "value1"),
(
format!("{inner_table}.data > {outer_table}.value2"),
"value2",
),
],
),
("t1", "t3") => pick(
rng,
vec![
(format!("{inner_table}.id = {outer_table}.id"), "id"),
(
format!("{inner_table}.category < {outer_table}.value1"),
"value1",
),
(
format!("{inner_table}.amount > {outer_table}.value2"),
"value2",
),
],
),
("t2", "t1") => pick(
rng,
vec![
(format!("{inner_table}.id = {outer_table}.ref_id"), "ref_id"),
(format!("{inner_table}.value1 > {outer_table}.data"), "data"),
(format!("{inner_table}.value2 < {outer_table}.id"), "id"),
],
),
("t2", "t3") => pick(
rng,
vec![
(format!("{inner_table}.id = {outer_table}.id"), "id"),
(
format!("{inner_table}.category = {outer_table}.ref_id"),
"ref_id",
),
(format!("{inner_table}.amount > {outer_table}.data"), "data"),
],
),
("t3", "t1") => pick(
rng,
vec![
(format!("{inner_table}.id = {outer_table}.id"), "id"),
(
format!("{inner_table}.value1 > {outer_table}.category"),
"category",
),
(
format!("{inner_table}.value2 < {outer_table}.amount"),
"amount",
),
],
),
("t3", "t2") => pick(
rng,
vec![
(format!("{inner_table}.id = {outer_table}.id"), "id"),
(
format!("{inner_table}.ref_id = {outer_table}.category"),
"category",
),
(
format!("{inner_table}.data < {outer_table}.amount"),
"amount",
),
],
),
_ => "1=1".to_string(),
}
};
@@ -4410,7 +4455,11 @@ mod fuzz_tests {
// Add correlated conditions if outer_table is provided and sometimes
let final_query = if let Some(outer_table) = outer_table {
if rng.random_bool(0.4) {
let can_correlate = match allowed_outer_cols {
Some(cols) => !cols.is_empty(),
None => true,
};
if can_correlate && rng.random_bool(0.4) {
// 40% chance for correlation
// Extract the inner table from the base query
let inner_table = if base_query.contains("FROM t1") {
@@ -4440,7 +4489,7 @@ mod fuzz_tests {
// Sometimes add nesting - but use scalar subquery for nesting to avoid column count issues
if depth < 1 && rng.random_bool(0.2) {
// Reduced probability and depth
let nested = gen_scalar_subquery(rng, 0, outer_table);
let nested = gen_scalar_subquery(rng, 0, outer_table, allowed_outer_cols);
if final_query.contains("WHERE") {
format!("{final_query} AND id IN ({nested})")
} else {
@@ -4456,6 +4505,7 @@ mod fuzz_tests {
rng: &mut ChaCha8Rng,
depth: usize,
outer_table: Option<&str>,
allowed_outer_cols: Option<&[&str]>,
) -> String {
if depth > MAX_SUBQUERY_DEPTH {
// Reduced nesting depth
@@ -4490,56 +4540,96 @@ mod fuzz_tests {
// Helper function to generate correlated WHERE conditions
let gen_correlated_where =
|rng: &mut ChaCha8Rng, inner_table: &str, outer_table: &str| -> String {
let pick =
|rng: &mut ChaCha8Rng, mut candidates: Vec<(String, &'static str)>| {
let filtered: Vec<String> = if let Some(allowed) = allowed_outer_cols {
candidates
.drain(..)
.filter(|(_, col)| allowed.contains(col))
.map(|(cond, _)| cond)
.collect()
} else {
candidates.drain(..).map(|(cond, _)| cond).collect()
};
if filtered.is_empty() {
"1=1".to_string()
} else {
filtered[rng.random_range(0..filtered.len())].clone()
}
};
match (outer_table, inner_table) {
("t1", "t2") => {
// t2.ref_id relates to t1.id
let conditions = [
format!("{inner_table}.ref_id = {outer_table}.id"),
format!("{inner_table}.id < {outer_table}.value1"),
format!("{inner_table}.data > {outer_table}.value2"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t1", "t3") => {
let conditions = [
format!("{inner_table}.id = {outer_table}.id"),
format!("{inner_table}.category < {outer_table}.value1"),
format!("{inner_table}.amount > {outer_table}.value2"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t2", "t1") => {
let conditions = [
format!("{inner_table}.id = {outer_table}.ref_id"),
format!("{inner_table}.value1 > {outer_table}.data"),
format!("{inner_table}.value2 < {outer_table}.id"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t2", "t3") => {
let conditions = [
format!("{inner_table}.id = {outer_table}.id"),
format!("{inner_table}.category = {outer_table}.ref_id"),
format!("{inner_table}.amount > {outer_table}.data"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t3", "t1") => {
let conditions = [
format!("{inner_table}.id = {outer_table}.id"),
format!("{inner_table}.value1 > {outer_table}.category"),
format!("{inner_table}.value2 < {outer_table}.amount"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t3", "t2") => {
let conditions = [
format!("{inner_table}.id = {outer_table}.id"),
format!("{inner_table}.ref_id = {outer_table}.category"),
format!("{inner_table}.data < {outer_table}.amount"),
];
conditions[rng.random_range(0..conditions.len())].clone()
}
("t1", "t2") => pick(
rng,
vec![
(format!("{inner_table}.ref_id = {outer_table}.id"), "id"),
(format!("{inner_table}.id < {outer_table}.value1"), "value1"),
(
format!("{inner_table}.data > {outer_table}.value2"),
"value2",
),
],
),
("t1", "t3") => pick(
rng,
vec![
(format!("{inner_table}.id = {outer_table}.id"), "id"),
(
format!("{inner_table}.category < {outer_table}.value1"),
"value1",
),
(
format!("{inner_table}.amount > {outer_table}.value2"),
"value2",
),
],
),
("t2", "t1") => pick(
rng,
vec![
(format!("{inner_table}.id = {outer_table}.ref_id"), "ref_id"),
(format!("{inner_table}.value1 > {outer_table}.data"), "data"),
(format!("{inner_table}.value2 < {outer_table}.id"), "id"),
],
),
("t2", "t3") => pick(
rng,
vec![
(format!("{inner_table}.id = {outer_table}.id"), "id"),
(
format!("{inner_table}.category = {outer_table}.ref_id"),
"ref_id",
),
(format!("{inner_table}.amount > {outer_table}.data"), "data"),
],
),
("t3", "t1") => pick(
rng,
vec![
(format!("{inner_table}.id = {outer_table}.id"), "id"),
(
format!("{inner_table}.value1 > {outer_table}.category"),
"category",
),
(
format!("{inner_table}.value2 < {outer_table}.amount"),
"amount",
),
],
),
("t3", "t2") => pick(
rng,
vec![
(format!("{inner_table}.id = {outer_table}.id"), "id"),
(
format!("{inner_table}.ref_id = {outer_table}.category"),
"category",
),
(
format!("{inner_table}.data < {outer_table}.amount"),
"amount",
),
],
),
_ => "1=1".to_string(),
}
};
@@ -4573,7 +4663,11 @@ mod fuzz_tests {
// Add correlated conditions if outer_table is provided and sometimes
let final_query = if let Some(outer_table) = outer_table {
if rng.random_bool(0.4) {
let can_correlate = match allowed_outer_cols {
Some(cols) => !cols.is_empty(),
None => true,
};
if can_correlate && rng.random_bool(0.4) {
// 40% chance for correlation
// Extract the inner table from the base query
let inner_table = if base_query.contains("FROM t1") {
@@ -4603,7 +4697,7 @@ mod fuzz_tests {
// Sometimes add nesting
if depth < 1 && rng.random_bool(0.2) {
// Reduced probability and depth
let nested = gen_scalar_subquery(rng, depth + 1, outer_table);
let nested = gen_scalar_subquery(rng, depth + 1, outer_table, allowed_outer_cols);
if final_query.contains("WHERE") {
format!("{final_query} AND id IN ({nested})")
} else {
@@ -4614,11 +4708,110 @@ mod fuzz_tests {
}
}
// Helper to generate a SELECT-list expression as a scalar subquery, optionally correlated
fn gen_selectlist_scalar_expr(rng: &mut ChaCha8Rng, outer_table: &str) -> String {
// Reuse scalar subquery generator; return the inner SELECT (without wrapping)
gen_scalar_subquery(rng, 0, Some(outer_table), None)
}
// Helper to generate a GROUP BY expression which may include a correlated scalar subquery
fn gen_group_by_expr(rng: &mut ChaCha8Rng, main_table: &str) -> String {
// Either a plain column or a correlated scalar subquery
if rng.random_bool(0.4) {
// Prefer plain columns most of the time to keep GROUP BY semantics simple
match main_table {
"t1" => ["id", "value1", "value2"][rng.random_range(0..3)].to_string(),
"t2" => ["id", "ref_id", "data"][rng.random_range(0..3)].to_string(),
"t3" => ["id", "category", "amount"][rng.random_range(0..3)].to_string(),
_ => "id".to_string(),
}
} else {
// If GROUP BY is present, a subquery that references outer columns would be invalid
// unless it only references GROUP BY columns; since this subquery becomes the
// grouping expression itself, disallow correlation entirely here.
format!(
"({})",
gen_scalar_subquery(rng, 0, Some(main_table), Some(&[]))
)
}
}
// Helper to generate a HAVING condition comparing an aggregate to a scalar subquery
fn gen_having_condition(
rng: &mut ChaCha8Rng,
main_table: &str,
allowed_outer_cols: Option<&[&str]>,
) -> String {
let (agg_func, agg_col) = match main_table {
"t1" => [
("SUM", "value1"),
("SUM", "value2"),
("MAX", "value1"),
("MAX", "value2"),
("MIN", "value1"),
("MIN", "value2"),
("COUNT", "*"),
][rng.random_range(0..7)],
"t2" => [
("SUM", "data"),
("MAX", "data"),
("MIN", "data"),
("COUNT", "*"),
][rng.random_range(0..4)],
"t3" => [
("SUM", "amount"),
("MAX", "amount"),
("MIN", "amount"),
("COUNT", "*"),
][rng.random_range(0..4)],
_ => ("COUNT", "*"),
};
let op = [">", "<", ">=", "<=", "=", "<>"][rng.random_range(0..6)];
let rhs = gen_scalar_subquery(rng, 0, Some(main_table), allowed_outer_cols);
if agg_col == "*" {
format!("COUNT(*) {op} ({rhs})")
} else {
format!("{agg_func}({agg_col}) {op} ({rhs})")
}
}
// Helper to generate LIMIT/OFFSET clause (optionally empty). Expressions may be subqueries.
fn gen_limit_offset_clause(rng: &mut ChaCha8Rng) -> String {
// 50% of the time, no LIMIT/OFFSET
if rng.random_bool(0.5) {
return String::new();
}
fn gen_limit_like_expr(rng: &mut ChaCha8Rng) -> String {
// Small literal or a scalar subquery from a random table
if rng.random_bool(0.6) {
// Keep literal sizes modest
format!("{}", rng.random_range(0..20))
} else {
let which = rng.random_range(0..3);
match which {
0 => "(SELECT COUNT(*) FROM t1)".to_string(),
1 => "(SELECT COUNT(*) FROM t2)".to_string(),
_ => "(SELECT COUNT(*) FROM t3)".to_string(),
}
}
}
let mut clause = String::new();
let limit_expr = gen_limit_like_expr(rng);
clause.push_str(&format!(" LIMIT {limit_expr}",));
if rng.random_bool(0.5) {
let offset_expr = gen_limit_like_expr(rng);
clause.push_str(&format!(" OFFSET {offset_expr}",));
}
clause
}
for iter_num in 0..NUM_FUZZ_ITERATIONS {
let main_table = ["t1", "t2", "t3"][rng.random_range(0..3)];
let query_type = rng.random_range(0..6); // Increased from 4 to 6 for new correlated query types
let query = match query_type {
let query_type = rng.random_range(0..8); // Add GROUP BY/HAVING variants
let mut query = match query_type {
0 => {
// Comparison subquery: WHERE column <op> (SELECT ...)
let column = match main_table {
@@ -4628,13 +4821,13 @@ mod fuzz_tests {
_ => "id",
};
let op = [">", "<", ">=", "<=", "=", "<>"][rng.random_range(0..6)];
let subquery = gen_scalar_subquery(&mut rng, 0, Some(main_table));
let subquery = gen_scalar_subquery(&mut rng, 0, Some(main_table), None);
format!("SELECT * FROM {main_table} WHERE {column} {op} ({subquery})",)
}
1 => {
// EXISTS subquery: WHERE [NOT] EXISTS (SELECT ...)
let not_exists = if rng.random_bool(0.3) { "NOT " } else { "" };
let subquery = gen_subquery(&mut rng, 0, Some(main_table));
let subquery = gen_subquery(&mut rng, 0, Some(main_table), None);
format!("SELECT * FROM {main_table} WHERE {not_exists}EXISTS ({subquery})",)
}
2 => {
@@ -4646,7 +4839,7 @@ mod fuzz_tests {
"t3" => ["amount", "category", "id"][rng.random_range(0..3)],
_ => "id",
};
let subquery = gen_scalar_subquery(&mut rng, 0, Some(main_table));
let subquery = gen_scalar_subquery(&mut rng, 0, Some(main_table), None);
format!("SELECT * FROM {main_table} WHERE {column} {not_in}IN ({subquery})",)
}
3 => {
@@ -4843,8 +5036,93 @@ mod fuzz_tests {
"SELECT * FROM {main_table} WHERE {column} {op} (SELECT {select_column} FROM {inner_table} WHERE {correlated_condition})",
)
}
6 => {
// Aggregated query with GROUP BY and optional HAVING; allow subqueries in GROUP BY/HAVING
let group_expr = gen_group_by_expr(&mut rng, main_table);
// Only GROUP BY columns may be referenced by correlated subqueries
let allowed_outer_cols: Vec<&str> = match group_expr.as_str() {
"id" => vec!["id"],
"value1" => vec!["value1"],
"value2" => vec!["value2"],
"ref_id" => vec!["ref_id"],
"data" => vec!["data"],
"category" => vec!["category"],
"amount" => vec!["amount"],
_ => Vec::new(),
};
let (agg_func, agg_col) = match main_table {
"t1" => [
("SUM", "value1"),
("SUM", "value2"),
("MAX", "value1"),
("MAX", "value2"),
("COUNT", "*"),
][rng.random_range(0..5)],
"t2" => [("SUM", "data"), ("MAX", "data"), ("COUNT", "*")]
[rng.random_range(0..3)],
"t3" => [("SUM", "amount"), ("MAX", "amount"), ("COUNT", "*")]
[rng.random_range(0..3)],
_ => ("COUNT", "*"),
};
let mut q;
if agg_col == "*" {
q = format!("SELECT {group_expr} AS g, COUNT(*) AS c FROM {main_table}");
} else {
q = format!("SELECT {group_expr} AS g, {agg_func}({agg_col}) AS a FROM {main_table}");
}
if rng.random_bool(0.5) {
q.push_str(&format!(
" WHERE {}",
gen_simple_where(&mut rng, main_table)
));
}
q.push_str(&format!(" GROUP BY {group_expr}"));
if rng.random_bool(0.4) {
q.push_str(&format!(
" HAVING {}",
gen_having_condition(&mut rng, main_table, Some(&allowed_outer_cols))
));
}
q
}
7 => {
// Simple GROUP BY without HAVING (baseline support); may use subquery in GROUP BY
let group_expr = gen_group_by_expr(&mut rng, main_table);
let select_expr = if rng.random_bool(0.5) {
// Use aggregate
match main_table {
"t1" => "SUM(value1) AS s".to_string(),
"t2" => "SUM(data) AS s".to_string(),
_ => "SUM(amount) AS s".to_string(),
}
} else {
"COUNT(*) AS c".to_string()
};
let mut q =
format!("SELECT {group_expr} AS g, {select_expr} FROM {main_table}");
if rng.random_bool(0.5) {
q.push_str(&format!(
" WHERE {}",
gen_simple_where(&mut rng, main_table)
));
}
q.push_str(&format!(" GROUP BY {group_expr}"));
q
}
_ => unreachable!(),
};
// Optionally inject a SELECT-list scalar subquery into non-aggregated SELECT * queries
if query.starts_with("SELECT * FROM ") && rng.random_bool(0.4) {
let sel_expr = gen_selectlist_scalar_expr(&mut rng, main_table);
let replacement = "SELECT *, (".to_string() + &sel_expr + ") AS s_sub FROM ";
query = query.replacen("SELECT * FROM ", &replacement, 1);
}
// Optionally append LIMIT/OFFSET (with or without subqueries)
let limit_clause = gen_limit_offset_clause(&mut rng);
if !limit_clause.is_empty() {
query.push_str(&limit_clause);
}
log::info!(
"Iteration {}/{NUM_FUZZ_ITERATIONS}: Query: {query}",