From b47c214a5eed43a26fd3382e81ee517955112e1f Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Thu, 27 Mar 2025 21:48:32 +0200 Subject: [PATCH 01/11] fix aggregation functions without group by --- core/translate/emitter.rs | 8 ++++++++ core/translate/main_loop.rs | 19 ++++++++++++++++++- core/translate/result_row.rs | 4 +++- core/translate/subquery.rs | 1 + 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index e4d05bfaa..b473715fe 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -62,6 +62,8 @@ pub struct TranslateCtx<'a> { pub label_main_loop_end: Option, // First register of the aggregation results pub reg_agg_start: Option, + // Register to track if we set non aggregate cols to first encountered row in non group by agg statement + pub reg_agg_flag: Option, // First register of the result columns of the query pub reg_result_cols_start: Option, // The register holding the limit value, if any. @@ -115,6 +117,7 @@ fn prologue<'a>( labels_main_loop: (0..table_count).map(|_| LoopLabels::new(program)).collect(), label_main_loop_end: None, reg_agg_start: None, + reg_agg_flag: None, reg_limit: None, reg_offset: None, reg_limit_offset_sum: None, @@ -242,6 +245,11 @@ pub fn emit_query<'a>( target_pc: after_main_loop_label, }); } + if !plan.aggregates.is_empty() && plan.group_by.is_none() { + let flag = program.alloc_register(); + program.emit_int(0, flag); + t_ctx.reg_agg_flag = Some(flag); + } // Allocate registers for result columns t_ctx.reg_result_cols_start = Some(program.alloc_registers(plan.result_columns.len())); diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index 7b51a2328..95601bb9a 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -677,6 +677,18 @@ fn emit_loop_source( &t_ctx.resolver, )?; } + + if let Some(flag) = t_ctx.reg_agg_flag { + let offset = program.offset().add(plan.result_columns.len() as u32); + + program.emit_insn(Insn::If { + reg: flag, + target_pc: offset, + jump_if_null: false, + }); + } + let col_start = t_ctx.reg_result_cols_start.unwrap(); + for (i, rc) in plan.result_columns.iter().enumerate() { if rc.contains_aggregates { // Do nothing, aggregates are computed above @@ -684,7 +696,9 @@ fn emit_loop_source( // it will be computed after the aggregations are finalized. continue; } - let reg = start_reg + num_aggs + i; + + let reg = col_start + i; + translate_expr( program, Some(&plan.table_references), @@ -693,6 +707,9 @@ fn emit_loop_source( &t_ctx.resolver, )?; } + if let Some(flag) = t_ctx.reg_agg_flag { + program.emit_int(1, flag); + } Ok(()) } LoopEmitTarget::QueryResult => { diff --git a/core/translate/result_row.rs b/core/translate/result_row.rs index ad8454c25..7988d0417 100644 --- a/core/translate/result_row.rs +++ b/core/translate/result_row.rs @@ -25,7 +25,9 @@ pub fn emit_select_result( } let start_reg = t_ctx.reg_result_cols_start.unwrap(); - for (i, rc) in plan.result_columns.iter().enumerate() { + for (i, rc) in plan.result_columns.iter().enumerate().filter(|(_, rc)| { + t_ctx.reg_agg_flag.is_some() && rc.contains_aggregates || t_ctx.reg_agg_flag.is_none() + }) { let reg = start_reg + i; translate_expr( program, diff --git a/core/translate/subquery.rs b/core/translate/subquery.rs index 1730312be..f81b20788 100644 --- a/core/translate/subquery.rs +++ b/core/translate/subquery.rs @@ -75,6 +75,7 @@ pub fn emit_subquery<'a>( meta_left_joins: (0..plan.table_references.len()).map(|_| None).collect(), meta_sort: None, reg_agg_start: None, + reg_agg_flag: None, reg_result_cols_start: None, result_column_indexes_in_orderby_sorter: (0..plan.result_columns.len()).collect(), result_columns_to_skip_in_orderby_sorter: None, From 352fa6fd34f42a4dfbe326f01ab7093ed63b0ac7 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Thu, 27 Mar 2025 22:05:34 +0200 Subject: [PATCH 02/11] cargo fmt --- core/translate/main_loop.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index 95601bb9a..da922ce20 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -677,7 +677,7 @@ fn emit_loop_source( &t_ctx.resolver, )?; } - + if let Some(flag) = t_ctx.reg_agg_flag { let offset = program.offset().add(plan.result_columns.len() as u32); From 36fe859d7d8e2752c049db0ca16db7aad1c234e9 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Fri, 28 Mar 2025 00:52:53 +0200 Subject: [PATCH 03/11] create if only if non aggregate columns present --- core/translate/emitter.rs | 5 ++++- core/translate/main_loop.rs | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index b473715fe..ef3c2b9ad 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -245,7 +245,10 @@ pub fn emit_query<'a>( target_pc: after_main_loop_label, }); } - if !plan.aggregates.is_empty() && plan.group_by.is_none() { + if !plan.aggregates.is_empty() + && plan.group_by.is_none() + && plan.result_columns.iter().any(|c| !c.contains_aggregates) + { let flag = program.alloc_register(); program.emit_int(0, flag); t_ctx.reg_agg_flag = Some(flag); diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index da922ce20..24b057219 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -678,12 +678,11 @@ fn emit_loop_source( )?; } + let if_label = program.allocate_label(); if let Some(flag) = t_ctx.reg_agg_flag { - let offset = program.offset().add(plan.result_columns.len() as u32); - program.emit_insn(Insn::If { reg: flag, - target_pc: offset, + target_pc: if_label, jump_if_null: false, }); } @@ -707,6 +706,7 @@ fn emit_loop_source( &t_ctx.resolver, )?; } + program.resolve_label(if_label, program.offset()); if let Some(flag) = t_ctx.reg_agg_flag { program.emit_int(1, flag); } From 4fd1dcdc73da1e075159c23c66acd9541f6ba3c3 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Fri, 28 Mar 2025 01:06:09 +0200 Subject: [PATCH 04/11] small refine --- core/translate/main_loop.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index 24b057219..ea7b2f4ef 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -678,14 +678,18 @@ fn emit_loop_source( )?; } - let if_label = program.allocate_label(); - if let Some(flag) = t_ctx.reg_agg_flag { + let if_label = if let Some(flag) = t_ctx.reg_agg_flag { + let if_label = program.allocate_label(); program.emit_insn(Insn::If { reg: flag, target_pc: if_label, jump_if_null: false, }); - } + Some(if_label) + } else { + None + }; + let col_start = t_ctx.reg_result_cols_start.unwrap(); for (i, rc) in plan.result_columns.iter().enumerate() { @@ -706,10 +710,12 @@ fn emit_loop_source( &t_ctx.resolver, )?; } - program.resolve_label(if_label, program.offset()); - if let Some(flag) = t_ctx.reg_agg_flag { + if let Some(label) = if_label { + program.resolve_label(label, program.offset()); + let flag = t_ctx.reg_agg_flag.unwrap(); program.emit_int(1, flag); } + Ok(()) } LoopEmitTarget::QueryResult => { From 2bcdd4e4042972c951dea9da032b1a6090148b37 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Sun, 30 Mar 2025 18:39:19 +0300 Subject: [PATCH 05/11] non group by cols are displayed in group by agg statements --- core/translate/emitter.rs | 4 +- core/translate/group_by.rs | 103 +++++++++++++++++++++++++++++------- core/translate/main_loop.rs | 41 +++++++++++--- 3 files changed, 120 insertions(+), 28 deletions(-) diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index ef3c2b9ad..0eaeef58f 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -262,8 +262,8 @@ pub fn emit_query<'a>( init_order_by(program, t_ctx, order_by)?; } - if let Some(ref mut group_by) = plan.group_by { - init_group_by(program, t_ctx, group_by, &plan.aggregates)?; + if let Some(ref group_by) = plan.group_by { + init_group_by(program, t_ctx, group_by, &plan)?; } init_loop( program, diff --git a/core/translate/group_by.rs b/core/translate/group_by.rs index 13d860f16..86d6087e2 100644 --- a/core/translate/group_by.rs +++ b/core/translate/group_by.rs @@ -50,15 +50,22 @@ pub fn init_group_by( program: &mut ProgramBuilder, t_ctx: &mut TranslateCtx, group_by: &GroupBy, - aggregates: &[Aggregate], + plan: &SelectPlan, ) -> Result<()> { - let num_aggs = aggregates.len(); + let num_aggs = plan.aggregates.len(); + + // Calculate this count only once + let non_aggregate_count = plan + .result_columns + .iter() + .filter(|rc| !rc.contains_aggregates) + .count(); let sort_cursor = program.alloc_cursor_id(None, CursorType::Sorter); let reg_abort_flag = program.alloc_register(); let reg_group_exprs_cmp = program.alloc_registers(group_by.exprs.len()); - let reg_group_exprs_acc = program.alloc_registers(group_by.exprs.len()); + let reg_group_exprs_acc = program.alloc_registers(non_aggregate_count); let reg_agg_exprs_start = program.alloc_registers(num_aggs); let reg_sorter_key = program.alloc_register(); @@ -71,7 +78,7 @@ pub fn init_group_by( } program.emit_insn(Insn::SorterOpen { cursor_id: sort_cursor, - columns: aggregates.len() + group_by.exprs.len(), + columns: non_aggregate_count + plan.aggregates.len(), order: Record::new(order), }); @@ -156,14 +163,23 @@ pub fn emit_group_by<'a>( let group_by = plan.group_by.as_ref().unwrap(); + // Calculate these values once + let non_aggregate_count = plan + .result_columns + .iter() + .filter(|rc| !rc.contains_aggregates) + .count(); + + let agg_args_count = plan + .aggregates + .iter() + .map(|agg| agg.args.len()) + .sum::(); + // all group by columns and all arguments of agg functions are in the sorter. // the sort keys are the group by columns (the aggregation within groups is done based on how long the sort keys remain the same) - let sorter_column_count = group_by.exprs.len() - + plan - .aggregates - .iter() - .map(|agg| agg.args.len()) - .sum::(); + let sorter_column_count = non_aggregate_count + agg_args_count; + // sorter column names do not matter let ty = crate::schema::Type::Null; let pseudo_columns = (0..sorter_column_count) @@ -238,11 +254,6 @@ pub fn emit_group_by<'a>( }); // New group, move current group by columns into the comparison register - program.emit_insn(Insn::Move { - source_reg: groups_start_reg, - dest_reg: reg_group_exprs_cmp, - count: group_by.exprs.len(), - }); program.add_comment( program.offset(), @@ -253,6 +264,12 @@ pub fn emit_group_by<'a>( return_reg: reg_subrtn_acc_output_return_offset, }); + program.emit_insn(Insn::Move { + source_reg: groups_start_reg, + dest_reg: reg_group_exprs_cmp, + count: group_by.exprs.len(), + }); + program.add_comment(program.offset(), "check abort flag"); program.emit_insn(Insn::IfPos { reg: reg_abort_flag, @@ -269,7 +286,7 @@ pub fn emit_group_by<'a>( // Accumulate the values into the aggregations program.resolve_label(agg_step_label, program.offset()); let start_reg = t_ctx.reg_agg_start.unwrap(); - let mut cursor_index = group_by.exprs.len(); + let mut cursor_index = non_aggregate_count; for (i, agg) in plan.aggregates.iter().enumerate() { let agg_result_reg = start_reg + i; translate_aggregation_step_groupby( @@ -296,7 +313,7 @@ pub fn emit_group_by<'a>( }); // Read the group by columns for a finished group - for i in 0..group_by.exprs.len() { + for i in 0..non_aggregate_count { let key_reg = reg_group_exprs_acc + i; let sorter_column_index = i; program.emit_insn(Insn::Column { @@ -363,6 +380,12 @@ pub fn emit_group_by<'a>( }); } + // Cache expressions we need multiple times + let filtered_results = plan + .result_columns + .iter() + .filter(|rc| !rc.contains_aggregates) + .collect::>(); // we now have the group by columns in registers (group_exprs_start_register..group_exprs_start_register + group_by.len() - 1) // and the agg results in (agg_start_reg..agg_start_reg + aggregates.len() - 1) // we need to call translate_expr on each result column, but replace the expr with a register copy in case any part of the @@ -373,6 +396,24 @@ pub fn emit_group_by<'a>( .expr_to_reg_cache .push((expr, reg_group_exprs_acc + i)); } + + // Offset for the next expressions after group_by + let mut offset = group_by.exprs.len(); + + for rc in filtered_results.iter() { + let expr = &rc.expr; + + // skip cols that are already in group by + if !matches!(expr, ast::Expr::Column { .. }) + || !is_column_in_group_by(expr, &group_by.exprs) + { + t_ctx + .resolver + .expr_to_reg_cache + .push((expr, reg_group_exprs_acc + offset)); + offset += 1; + } + } for (i, agg) in plan.aggregates.iter().enumerate() { t_ctx .resolver @@ -420,7 +461,7 @@ pub fn emit_group_by<'a>( let start_reg = reg_group_exprs_acc; program.emit_insn(Insn::Null { dest: start_reg, - dest_end: Some(start_reg + group_by.exprs.len() + plan.aggregates.len() - 1), + dest_end: Some(start_reg + non_aggregate_count + plan.aggregates.len() - 1), }); program.emit_insn(Insn::Integer { @@ -668,3 +709,29 @@ pub fn translate_aggregation_step_groupby( }; Ok(dest) } + +pub fn is_column_in_group_by(expr: &ast::Expr, group_by_exprs: &[ast::Expr]) -> bool { + if let ast::Expr::Column { + database: _, + table: _, + column: col, + is_rowid_alias: _, + } = expr + { + group_by_exprs.iter().any(|ex| { + if let ast::Expr::Column { + database: _, + table: _, + column: group_col, + is_rowid_alias: _, + } = ex + { + col == group_col + } else { + false + } + }) + } else { + false + } +} diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index ea7b2f4ef..9c58e193d 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -15,6 +15,7 @@ use super::{ aggregation::translate_aggregation_step, emitter::{OperationMode, TranslateCtx}, expr::{translate_condition_expr, translate_expr, ConditionMetadata}, + group_by::is_column_in_group_by, order_by::{order_by_sorter_insert, sorter_insert}, plan::{ IterationDirection, Operation, Search, SelectPlan, SelectQueryType, TableReference, @@ -599,7 +600,12 @@ fn emit_loop_source( LoopEmitTarget::GroupBySorter => { let group_by = plan.group_by.as_ref().unwrap(); let aggregates = &plan.aggregates; - let sort_keys_count = group_by.exprs.len(); + let non_aggregate_columns = plan + .result_columns + .iter() + .filter(|rc| !rc.contains_aggregates) + .collect::>(); + let sort_keys_count = non_aggregate_columns.len(); let aggregate_arguments_count = plan .aggregates .iter() @@ -621,6 +627,25 @@ fn emit_loop_source( &t_ctx.resolver, )?; } + + if group_by.exprs.len() + aggregates.len() != plan.result_columns.len() { + for rc in non_aggregate_columns.iter() { + let expr = &rc.expr; + if !is_column_in_group_by(expr, &group_by.exprs) { + let key_reg = cur_reg; + cur_reg += 1; + translate_expr( + program, + Some(&plan.table_references), + expr, + key_reg, + &t_ctx.resolver, + )?; + } + } + } + // Process non-aggregate result columns that aren't already in group_by + // Then we have the aggregate arguments. for agg in aggregates.iter() { // Here we are collecting scalars for the group by sorter, which will include @@ -692,14 +717,14 @@ fn emit_loop_source( let col_start = t_ctx.reg_result_cols_start.unwrap(); - for (i, rc) in plan.result_columns.iter().enumerate() { - if rc.contains_aggregates { - // Do nothing, aggregates are computed above - // if this result column is e.g. something like sum(x) + 1 or length(sum(x)), we do not want to translate that (+1) or length() yet, - // it will be computed after the aggregations are finalized. - continue; - } + // Process only non-aggregate columns + let non_agg_columns = plan + .result_columns + .iter() + .enumerate() + .filter(|(_, rc)| !rc.contains_aggregates); + for (i, rc) in non_agg_columns { let reg = col_start + i; translate_expr( From 816cbacc9caa5fc820b356b611f93dd5a1ce4dee Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Mon, 31 Mar 2025 01:56:04 +0300 Subject: [PATCH 06/11] some smartie optimizations --- core/translate/group_by.rs | 27 +++---------------- core/translate/optimizer.rs | 53 +++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 23 deletions(-) diff --git a/core/translate/group_by.rs b/core/translate/group_by.rs index 86d6087e2..e00edf455 100644 --- a/core/translate/group_by.rs +++ b/core/translate/group_by.rs @@ -6,6 +6,7 @@ use crate::{ function::AggFunc, schema::{Column, PseudoTable}, types::{OwnedValue, Record}, + util::exprs_are_equivalent, vdbe::{ builder::{CursorType, ProgramBuilder}, insn::Insn, @@ -711,27 +712,7 @@ pub fn translate_aggregation_step_groupby( } pub fn is_column_in_group_by(expr: &ast::Expr, group_by_exprs: &[ast::Expr]) -> bool { - if let ast::Expr::Column { - database: _, - table: _, - column: col, - is_rowid_alias: _, - } = expr - { - group_by_exprs.iter().any(|ex| { - if let ast::Expr::Column { - database: _, - table: _, - column: group_col, - is_rowid_alias: _, - } = ex - { - col == group_col - } else { - false - } - }) - } else { - false - } + group_by_exprs + .iter() + .any(|expr2| exprs_are_equivalent(expr, expr2)) } diff --git a/core/translate/optimizer.rs b/core/translate/optimizer.rs index 6fa7f9619..dfd7ab16d 100644 --- a/core/translate/optimizer.rs +++ b/core/translate/optimizer.rs @@ -4,6 +4,7 @@ use limbo_sqlite3_parser::ast; use crate::{ schema::{Index, Schema}, + util::exprs_are_equivalent, Result, }; @@ -43,6 +44,8 @@ fn optimize_select_plan(plan: &mut SelectPlan, schema: &Schema) -> Result<()> { eliminate_unnecessary_orderby(plan, schema)?; + eliminate_orderby_like_groupby(plan)?; + Ok(()) } @@ -117,6 +120,51 @@ fn query_is_already_ordered_by( } } +fn eliminate_orderby_like_groupby(plan: &mut SelectPlan) -> Result<()> { + if plan.order_by.is_none() | plan.group_by.is_none() { + return Ok(()); + } + if plan.table_references.len() == 0 { + return Ok(()); + } + + let o = plan.order_by.as_mut().unwrap(); + let g = plan.group_by.as_mut().unwrap(); + + let mut insert_pos = 0; + let mut i = 0; + + while i < o.len() { + let (key, order) = &o[i]; + + if matches!(order, Direction::Descending) { + i += 1; + continue; + } + if let Some(pos) = g + .exprs + .iter() + .position(|expr| exprs_are_equivalent(expr, key)) + { + if pos != insert_pos { + let mut current_pos = pos; + while current_pos > insert_pos { + g.exprs.swap(current_pos, current_pos - 1); + current_pos -= 1; + } + } + insert_pos += 1; + o.remove(i); + } else { + i += 1; + } + } + if o.is_empty() { + plan.order_by = None + } + Ok(()) +} + fn eliminate_unnecessary_orderby(plan: &mut SelectPlan, schema: &Schema) -> Result<()> { if plan.order_by.is_none() { return Ok(()); @@ -125,6 +173,11 @@ fn eliminate_unnecessary_orderby(plan: &mut SelectPlan, schema: &Schema) -> Resu return Ok(()); } + // if pk will be removed later + if plan.group_by.is_some() { + return Ok(()); + } + let o = plan.order_by.as_mut().unwrap(); if o.len() != 1 { From 91ceab16268e5c7063670e8e6c214d061a8bdbb0 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Thu, 3 Apr 2025 16:53:13 +0300 Subject: [PATCH 07/11] improve naming and add comments for context --- core/translate/emitter.rs | 10 +++--- core/translate/group_by.rs | 41 ++++++++++-------------- core/translate/main_loop.rs | 8 ++--- core/translate/optimizer.rs | 62 ++++++++++++++++++++++++------------ core/translate/result_row.rs | 9 +++++- core/translate/subquery.rs | 2 +- 6 files changed, 78 insertions(+), 54 deletions(-) diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index 0eaeef58f..80a6db6d5 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -62,8 +62,10 @@ pub struct TranslateCtx<'a> { pub label_main_loop_end: Option, // First register of the aggregation results pub reg_agg_start: Option, - // Register to track if we set non aggregate cols to first encountered row in non group by agg statement - pub reg_agg_flag: Option, + // In non-group-by statements with aggregations (e.g. SELECT foo, bar, sum(baz) FROM t), + // we want to emit the non-aggregate columns (foo and bar) only once. + // This register is a flag that tracks whether we have already done that. + pub reg_nonagg_emit_once_flag: Option, // First register of the result columns of the query pub reg_result_cols_start: Option, // The register holding the limit value, if any. @@ -117,7 +119,7 @@ fn prologue<'a>( labels_main_loop: (0..table_count).map(|_| LoopLabels::new(program)).collect(), label_main_loop_end: None, reg_agg_start: None, - reg_agg_flag: None, + reg_nonagg_emit_once_flag: None, reg_limit: None, reg_offset: None, reg_limit_offset_sum: None, @@ -251,7 +253,7 @@ pub fn emit_query<'a>( { let flag = program.alloc_register(); program.emit_int(0, flag); - t_ctx.reg_agg_flag = Some(flag); + t_ctx.reg_nonagg_emit_once_flag = Some(flag); } // Allocate registers for result columns diff --git a/core/translate/group_by.rs b/core/translate/group_by.rs index e00edf455..ce268671d 100644 --- a/core/translate/group_by.rs +++ b/core/translate/group_by.rs @@ -38,8 +38,8 @@ pub struct GroupByMetadata { pub reg_sorter_key: usize, // Register holding a flag to abort the grouping process if necessary pub reg_abort_flag: usize, - // Register holding the start of the accumulator group registers (i.e. the groups, not the aggregates) - pub reg_group_exprs_acc: usize, + // Register holding the start of the non aggregate query members (all columns except aggregate arguments) + pub reg_non_aggregate_exprs_acc: usize, // Starting index of the register(s) that hold the comparison result between the current row and the previous row // The comparison result is used to determine if the current row belongs to the same group as the previous row // Each group by expression has a corresponding register @@ -55,7 +55,6 @@ pub fn init_group_by( ) -> Result<()> { let num_aggs = plan.aggregates.len(); - // Calculate this count only once let non_aggregate_count = plan .result_columns .iter() @@ -66,7 +65,7 @@ pub fn init_group_by( let reg_abort_flag = program.alloc_register(); let reg_group_exprs_cmp = program.alloc_registers(group_by.exprs.len()); - let reg_group_exprs_acc = program.alloc_registers(non_aggregate_count); + let reg_non_aggregate_exprs_acc = program.alloc_registers(non_aggregate_count); let reg_agg_exprs_start = program.alloc_registers(num_aggs); let reg_sorter_key = program.alloc_register(); @@ -118,7 +117,7 @@ pub fn init_group_by( label_acc_indicator_set_flag_true: program.allocate_label(), reg_subrtn_acc_clear_return_offset, reg_abort_flag, - reg_group_exprs_acc, + reg_non_aggregate_exprs_acc, reg_group_exprs_cmp, reg_sorter_key, }); @@ -154,7 +153,7 @@ pub fn emit_group_by<'a>( sort_cursor, reg_group_exprs_cmp, reg_subrtn_acc_clear_return_offset, - reg_group_exprs_acc, + reg_non_aggregate_exprs_acc, reg_abort_flag, reg_sorter_key, label_subrtn_acc_clear, @@ -164,7 +163,6 @@ pub fn emit_group_by<'a>( let group_by = plan.group_by.as_ref().unwrap(); - // Calculate these values once let non_aggregate_count = plan .result_columns .iter() @@ -177,7 +175,7 @@ pub fn emit_group_by<'a>( .map(|agg| agg.args.len()) .sum::(); - // all group by columns and all arguments of agg functions are in the sorter. + // all non-aggregate columns and all arguments of agg functions are in the sorter // the sort keys are the group by columns (the aggregation within groups is done based on how long the sort keys remain the same) let sorter_column_count = non_aggregate_count + agg_args_count; @@ -254,8 +252,6 @@ pub fn emit_group_by<'a>( target_pc_gt: program.offset().add(1u32), }); - // New group, move current group by columns into the comparison register - program.add_comment( program.offset(), "check if ended group had data, and output if so", @@ -265,6 +261,7 @@ pub fn emit_group_by<'a>( return_reg: reg_subrtn_acc_output_return_offset, }); + // New group, move current group by columns into the comparison register program.emit_insn(Insn::Move { source_reg: groups_start_reg, dest_reg: reg_group_exprs_cmp, @@ -313,9 +310,9 @@ pub fn emit_group_by<'a>( jump_if_null: false, }); - // Read the group by columns for a finished group + // Read the non-aggregate columns for a finished group for i in 0..non_aggregate_count { - let key_reg = reg_group_exprs_acc + i; + let key_reg = reg_non_aggregate_exprs_acc + i; let sorter_column_index = i; program.emit_insn(Insn::Column { cursor_id: pseudo_cursor, @@ -381,12 +378,10 @@ pub fn emit_group_by<'a>( }); } - // Cache expressions we need multiple times - let filtered_results = plan + let non_aggregate_result_columns = plan .result_columns .iter() - .filter(|rc| !rc.contains_aggregates) - .collect::>(); + .filter(|rc| !rc.contains_aggregates); // we now have the group by columns in registers (group_exprs_start_register..group_exprs_start_register + group_by.len() - 1) // and the agg results in (agg_start_reg..agg_start_reg + aggregates.len() - 1) // we need to call translate_expr on each result column, but replace the expr with a register copy in case any part of the @@ -395,23 +390,21 @@ pub fn emit_group_by<'a>( t_ctx .resolver .expr_to_reg_cache - .push((expr, reg_group_exprs_acc + i)); + .push((expr, reg_non_aggregate_exprs_acc + i)); } - // Offset for the next expressions after group_by + // Register offset for the non-aggregate expressions that are not part of GROUP BY let mut offset = group_by.exprs.len(); - for rc in filtered_results.iter() { + for rc in non_aggregate_result_columns { let expr = &rc.expr; // skip cols that are already in group by - if !matches!(expr, ast::Expr::Column { .. }) - || !is_column_in_group_by(expr, &group_by.exprs) - { + if !is_column_in_group_by(expr, &group_by.exprs) { t_ctx .resolver .expr_to_reg_cache - .push((expr, reg_group_exprs_acc + offset)); + .push((expr, reg_non_aggregate_exprs_acc + offset)); offset += 1; } } @@ -459,7 +452,7 @@ pub fn emit_group_by<'a>( program.add_comment(program.offset(), "clear accumulator subroutine start"); program.resolve_label(label_subrtn_acc_clear, program.offset()); - let start_reg = reg_group_exprs_acc; + let start_reg = reg_non_aggregate_exprs_acc; program.emit_insn(Insn::Null { dest: start_reg, dest_end: Some(start_reg + non_aggregate_count + plan.aggregates.len() - 1), diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index 9c58e193d..5a3e1c126 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -628,6 +628,7 @@ fn emit_loop_source( )?; } + // Process non-aggregate result columns that aren't already in group_by if group_by.exprs.len() + aggregates.len() != plan.result_columns.len() { for rc in non_aggregate_columns.iter() { let expr = &rc.expr; @@ -644,7 +645,6 @@ fn emit_loop_source( } } } - // Process non-aggregate result columns that aren't already in group_by // Then we have the aggregate arguments. for agg in aggregates.iter() { @@ -703,7 +703,7 @@ fn emit_loop_source( )?; } - let if_label = if let Some(flag) = t_ctx.reg_agg_flag { + let label_emit_nonagg_only_once = if let Some(flag) = t_ctx.reg_nonagg_emit_once_flag { let if_label = program.allocate_label(); program.emit_insn(Insn::If { reg: flag, @@ -735,9 +735,9 @@ fn emit_loop_source( &t_ctx.resolver, )?; } - if let Some(label) = if_label { + if let Some(label) = label_emit_nonagg_only_once { program.resolve_label(label, program.offset()); - let flag = t_ctx.reg_agg_flag.unwrap(); + let flag = t_ctx.reg_nonagg_emit_once_flag.unwrap(); program.emit_int(1, flag); } diff --git a/core/translate/optimizer.rs b/core/translate/optimizer.rs index dfd7ab16d..5321e0fa0 100644 --- a/core/translate/optimizer.rs +++ b/core/translate/optimizer.rs @@ -128,38 +128,58 @@ fn eliminate_orderby_like_groupby(plan: &mut SelectPlan) -> Result<()> { return Ok(()); } - let o = plan.order_by.as_mut().unwrap(); - let g = plan.group_by.as_mut().unwrap(); + let order_by_clauses = plan.order_by.as_mut().unwrap(); + let group_by_clauses = plan.group_by.as_mut().unwrap(); - let mut insert_pos = 0; - let mut i = 0; + let mut group_by_insert_position = 0; + let mut order_index = 0; - while i < o.len() { - let (key, order) = &o[i]; + // This function optimizes query execution by eliminating duplicate expressions between ORDER BY and GROUP BY clauses + // When the same column appears in both clauses, we can avoid redundant sorting operations + // The function reorders GROUP BY expressions and removes redundant ORDER BY expressions to ensure consistent ordering + while order_index < order_by_clauses.len() { + let (order_expr, direction) = &order_by_clauses[order_index]; - if matches!(order, Direction::Descending) { - i += 1; + // Skip descending orders as they require separate sorting + if matches!(direction, Direction::Descending) { + order_index += 1; continue; } - if let Some(pos) = g + + // Check if the current ORDER BY expression matches any expression in the GROUP BY clause + if let Some(group_expr_position) = group_by_clauses .exprs .iter() - .position(|expr| exprs_are_equivalent(expr, key)) + .position(|expr| exprs_are_equivalent(expr, order_expr)) { - if pos != insert_pos { - let mut current_pos = pos; - while current_pos > insert_pos { - g.exprs.swap(current_pos, current_pos - 1); - current_pos -= 1; + // If we found a matching expression in GROUP BY, we need to ensure it's in the correct position + // to preserve the ordering specified by ORDER BY clauses + + // Move the matching GROUP BY expression to the current insertion position + // This effectively "bubbles up" the expression to maintain proper ordering + if group_expr_position != group_by_insert_position { + let mut current_position = group_expr_position; + + // Swap expressions to move the matching one to the correct position + while current_position > group_by_insert_position { + group_by_clauses + .exprs + .swap(current_position, current_position - 1); + current_position -= 1; } } - insert_pos += 1; - o.remove(i); + + group_by_insert_position += 1; + + // Remove this expression from ORDER BY since it's now handled by GROUP BY + order_by_clauses.remove(order_index); + // Note: We don't increment order_index here because removal shifts all elements } else { - i += 1; + // If not found in GROUP BY, move to next ORDER BY expression + order_index += 1; } } - if o.is_empty() { + if order_by_clauses.is_empty() { plan.order_by = None } Ok(()) @@ -173,7 +193,9 @@ fn eliminate_unnecessary_orderby(plan: &mut SelectPlan, schema: &Schema) -> Resu return Ok(()); } - // if pk will be removed later + // If GROUP BY clause is present, we can't rely on already ordered columns because GROUP BY reorders the data + // This early return prevents the elimination of ORDER BY when GROUP BY exists, as sorting must be applied after grouping + // And if ORDER BY clause duplicates GROUP BY we handle it later in fn eliminate_orderby_like_groupby if plan.group_by.is_some() { return Ok(()); } diff --git a/core/translate/result_row.rs b/core/translate/result_row.rs index 7988d0417..dc24cee67 100644 --- a/core/translate/result_row.rs +++ b/core/translate/result_row.rs @@ -26,7 +26,14 @@ pub fn emit_select_result( let start_reg = t_ctx.reg_result_cols_start.unwrap(); for (i, rc) in plan.result_columns.iter().enumerate().filter(|(_, rc)| { - t_ctx.reg_agg_flag.is_some() && rc.contains_aggregates || t_ctx.reg_agg_flag.is_none() + // For aggregate queries, we handle columns differently; example: select id, first_name, sum(age) from users limit 1; + // 1. Columns with aggregates (e.g., sum(age)) are computed in each iteration of aggregation + // 2. Non-aggregate columns (e.g., id, first_name) are only computed once in the first iteration + // This filter ensures we only emit expressions for non aggregate columns once, + // preserving previously calculated values while updating aggregate results + // For all other queries where reg_nonagg_emit_once_flag is none we do nothing. + t_ctx.reg_nonagg_emit_once_flag.is_some() && rc.contains_aggregates + || t_ctx.reg_nonagg_emit_once_flag.is_none() }) { let reg = start_reg + i; translate_expr( diff --git a/core/translate/subquery.rs b/core/translate/subquery.rs index f81b20788..87ddddd63 100644 --- a/core/translate/subquery.rs +++ b/core/translate/subquery.rs @@ -75,7 +75,7 @@ pub fn emit_subquery<'a>( meta_left_joins: (0..plan.table_references.len()).map(|_| None).collect(), meta_sort: None, reg_agg_start: None, - reg_agg_flag: None, + reg_nonagg_emit_once_flag: None, reg_result_cols_start: None, result_column_indexes_in_orderby_sorter: (0..plan.result_columns.len()).collect(), result_columns_to_skip_in_orderby_sorter: None, From 34a132fcd3845623031a7d17bde745b13f357ce0 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Thu, 3 Apr 2025 21:14:22 +0300 Subject: [PATCH 08/11] fix output when group by is not part of resulting set --- core/translate/group_by.rs | 142 +++++++++++++++++++++++------------- core/translate/main_loop.rs | 78 ++++++++++++-------- 2 files changed, 139 insertions(+), 81 deletions(-) diff --git a/core/translate/group_by.rs b/core/translate/group_by.rs index ce268671d..a36e82864 100644 --- a/core/translate/group_by.rs +++ b/core/translate/group_by.rs @@ -1,4 +1,4 @@ -use std::rc::Rc; +use std::{collections::HashMap, rc::Rc}; use limbo_sqlite3_parser::ast; @@ -44,6 +44,9 @@ pub struct GroupByMetadata { // The comparison result is used to determine if the current row belongs to the same group as the previous row // Each group by expression has a corresponding register pub reg_group_exprs_cmp: usize, + // Columns that not part of GROUP BY clause and not arguments of Aggregation function. + // Heavy calculation and needed in different functions, so it is reasonable to do it once and save. + pub non_group_by_non_agg_column_count: Option, } /// Initialize resources needed for GROUP BY processing @@ -120,6 +123,7 @@ pub fn init_group_by( reg_non_aggregate_exprs_acc, reg_group_exprs_cmp, reg_sorter_key, + non_group_by_non_agg_column_count: None, }); Ok(()) } @@ -158,28 +162,58 @@ pub fn emit_group_by<'a>( reg_sorter_key, label_subrtn_acc_clear, label_acc_indicator_set_flag_true, + non_group_by_non_agg_column_count, .. } = *t_ctx.meta_group_by.as_mut().unwrap(); - let group_by = plan.group_by.as_ref().unwrap(); - let non_aggregate_count = plan - .result_columns - .iter() - .filter(|rc| !rc.contains_aggregates) - .count(); - let agg_args_count = plan .aggregates .iter() .map(|agg| agg.args.len()) .sum::(); + let group_by_count = group_by.exprs.len(); + let non_group_by_non_agg_column_count = non_group_by_non_agg_column_count.unwrap(); + // Count of GROUP BY columns that appear in the result set + let group_by_colls_in_result_set = + plan.result_columns.len() - non_group_by_non_agg_column_count - plan.aggregates.len(); - // all non-aggregate columns and all arguments of agg functions are in the sorter - // the sort keys are the group by columns (the aggregation within groups is done based on how long the sort keys remain the same) - let sorter_column_count = non_aggregate_count + agg_args_count; + // We have to know which group by expr present in resulting set + let group_by_expr_in_res_cols: Vec = group_by + .exprs + .iter() + .map(|expr| { + plan.result_columns + .iter() + .any(|e| exprs_are_equivalent(&e.expr, expr)) + }) + .collect(); - // sorter column names do not matter + // Create a map from sorter column index to result register + // This helps track where each column from the sorter should be stored + let mut column_register_mapping = HashMap::new(); + let mut next_reg = reg_non_aggregate_exprs_acc; + + // Map GROUP BY columns that are in the result set to registers + for (i, is_in_result) in group_by_expr_in_res_cols.iter().enumerate() { + if *is_in_result { + column_register_mapping.insert(i, next_reg); + next_reg += 1; + } + } + + // Handle other non-aggregate columns that aren't part of GROUP BY and not part of Aggregation function + for i in group_by_count..group_by_count + non_group_by_non_agg_column_count { + column_register_mapping.insert(i, next_reg); + next_reg += 1; + } + + // Calculate total number of columns in the sorter + // The sorter contains all GROUP BY columns, aggregate arguments, and other columns + let sorter_column_count = agg_args_count + group_by_count + non_group_by_non_agg_column_count; + + // Create pseudo-columns for the pseudo-table + // (these are placeholders as we only care about structure, not semantics) let ty = crate::schema::Type::Null; let pseudo_columns = (0..sorter_column_count) .map(|_| Column { @@ -193,7 +227,8 @@ pub fn emit_group_by<'a>( }) .collect::>(); - // A pseudo table is a "fake" table to which we read one row at a time from the sorter + // Create a pseudo-table to read one row at a time from the sorter + // This allows us to use standard table access operations on the sorted data let pseudo_table = Rc::new(PseudoTable { columns: pseudo_columns, }); @@ -281,10 +316,10 @@ pub fn emit_group_by<'a>( return_reg: reg_subrtn_acc_clear_return_offset, }); - // Accumulate the values into the aggregations + // Process each aggregate function for the current row program.resolve_label(agg_step_label, program.offset()); let start_reg = t_ctx.reg_agg_start.unwrap(); - let mut cursor_index = non_aggregate_count; + let mut cursor_index = group_by_count + non_group_by_non_agg_column_count; // Skipping all columns in sorter that not an aggregation arguments for (i, agg) in plan.aggregates.iter().enumerate() { let agg_result_reg = start_reg + i; translate_aggregation_step_groupby( @@ -299,7 +334,8 @@ pub fn emit_group_by<'a>( cursor_index += agg.args.len(); } - // We only emit the group by columns if we are going to start a new group (i.e. the prev group will not accumulate any more values into the aggregations) + // We only need to store non-aggregate columns once per group + // Skip if we've already stored them for this group program.add_comment( program.offset(), "don't emit group columns if continuing existing group", @@ -310,17 +346,16 @@ pub fn emit_group_by<'a>( jump_if_null: false, }); - // Read the non-aggregate columns for a finished group - for i in 0..non_aggregate_count { - let key_reg = reg_non_aggregate_exprs_acc + i; - let sorter_column_index = i; + // Read non-aggregate columns from the current row + for (sorter_column_index, dest_reg) in column_register_mapping.iter() { program.emit_insn(Insn::Column { cursor_id: pseudo_cursor, - column: sorter_column_index, - dest: key_reg, + column: *sorter_column_index, + dest: *dest_reg, }); } + // Mark that we've stored data for this group program.resolve_label(label_acc_indicator_set_flag_true, program.offset()); program.add_comment(program.offset(), "indicate data in accumulator"); program.emit_insn(Insn::Integer { @@ -328,6 +363,7 @@ pub fn emit_group_by<'a>( dest: reg_data_in_acc_flag, }); + // Continue to the next row in the sorter program.emit_insn(Insn::SorterNext { cursor_id: sort_cursor, pc_if_next: label_grouping_loop_start, @@ -355,18 +391,22 @@ pub fn emit_group_by<'a>( program.resolve_label(label_subrtn_acc_output, program.offset()); + // Only output a row if there's data in the accumulator program.add_comment(program.offset(), "output group by row subroutine start"); program.emit_insn(Insn::IfPos { reg: reg_data_in_acc_flag, target_pc: label_agg_final, decrement_by: 0, }); + + // If no data, return without outputting a row let group_by_end_without_emitting_row_label = program.allocate_label(); program.resolve_label(group_by_end_without_emitting_row_label, program.offset()); program.emit_insn(Insn::Return { return_reg: reg_subrtn_acc_output_return_offset, }); + // Finalize aggregate values for output let agg_start_reg = t_ctx.reg_agg_start.unwrap(); // Resolve the label for the start of the group by output row subroutine program.resolve_label(label_agg_final, program.offset()); @@ -378,36 +418,29 @@ pub fn emit_group_by<'a>( }); } - let non_aggregate_result_columns = plan - .result_columns - .iter() - .filter(|rc| !rc.contains_aggregates); - // we now have the group by columns in registers (group_exprs_start_register..group_exprs_start_register + group_by.len() - 1) - // and the agg results in (agg_start_reg..agg_start_reg + aggregates.len() - 1) - // we need to call translate_expr on each result column, but replace the expr with a register copy in case any part of the - // result column expression matches a) a group by column or b) an aggregation result. + // Map GROUP BY expressions to their registers in the result set for (i, expr) in group_by.exprs.iter().enumerate() { - t_ctx - .resolver - .expr_to_reg_cache - .push((expr, reg_non_aggregate_exprs_acc + i)); - } - - // Register offset for the non-aggregate expressions that are not part of GROUP BY - let mut offset = group_by.exprs.len(); - - for rc in non_aggregate_result_columns { - let expr = &rc.expr; - - // skip cols that are already in group by - if !is_column_in_group_by(expr, &group_by.exprs) { - t_ctx - .resolver - .expr_to_reg_cache - .push((expr, reg_non_aggregate_exprs_acc + offset)); - offset += 1; + if group_by_expr_in_res_cols[i] { + if let Some(reg) = &column_register_mapping.get(&i) { + t_ctx.resolver.expr_to_reg_cache.push((expr, **reg)); + } } } + + // Map non-aggregate, non-GROUP BY columns to their registers + let non_agg_cols = plan + .result_columns + .iter() + .filter(|rc| !rc.contains_aggregates && !is_column_in_group_by(&rc.expr, &group_by.exprs)); + + for (idx, rc) in non_agg_cols.enumerate() { + let sorter_idx = group_by_count + idx; + if let Some(®) = column_register_mapping.get(&sorter_idx) { + t_ctx.resolver.expr_to_reg_cache.push((&rc.expr, reg)); + } + } + + // Map aggregate expressions to their result registers for (i, agg) in plan.aggregates.iter().enumerate() { t_ctx .resolver @@ -450,12 +483,21 @@ pub fn emit_group_by<'a>( return_reg: reg_subrtn_acc_output_return_offset, }); + // Subroutine to clear accumulators for a new group program.add_comment(program.offset(), "clear accumulator subroutine start"); program.resolve_label(label_subrtn_acc_clear, program.offset()); let start_reg = reg_non_aggregate_exprs_acc; + + // Reset all accumulator registers to NULL program.emit_insn(Insn::Null { dest: start_reg, - dest_end: Some(start_reg + non_aggregate_count + plan.aggregates.len() - 1), + dest_end: Some( + start_reg + + non_group_by_non_agg_column_count + + group_by_colls_in_result_set + + plan.aggregates.len() + - 1, + ), }); program.emit_insn(Insn::Integer { diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index 5a3e1c126..50f5a948c 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -598,24 +598,48 @@ fn emit_loop_source( ) -> Result<()> { match emit_target { LoopEmitTarget::GroupBySorter => { + // This function creates a sorter for GROUP BY operations by allocating registers and + // translating expressions for three types of columns: + // 1) GROUP BY columns (used as sorting keys) + // 2) non-aggregate, non-GROUP BY columns + // 3) aggregate function arguments let group_by = plan.group_by.as_ref().unwrap(); let aggregates = &plan.aggregates; - let non_aggregate_columns = plan + + // Identify columns in the result set that are neither in GROUP BY nor contain aggregates + let non_group_by_non_agg_expr = plan .result_columns .iter() - .filter(|rc| !rc.contains_aggregates) + .filter(|rc| { + !rc.contains_aggregates && !is_column_in_group_by(&rc.expr, &group_by.exprs) + }) + .map(|rc| &rc.expr) .collect::>(); - let sort_keys_count = non_aggregate_columns.len(); + + // Store the count of non-GROUP BY, non-aggregate columns in the metadata + // This will be used later during aggregation processing + t_ctx.meta_group_by.as_mut().map(|meta| { + meta.non_group_by_non_agg_column_count = Some(non_group_by_non_agg_expr.len()); + meta + }); + + // Calculate the total number of arguments used across all aggregate functions let aggregate_arguments_count = plan .aggregates .iter() .map(|agg| agg.args.len()) .sum::(); - let column_count = sort_keys_count + aggregate_arguments_count; + + // Calculate total number of registers needed for all columns in the sorter + let column_count = + group_by.exprs.len() + aggregate_arguments_count + non_group_by_non_agg_expr.len(); + + // Allocate a contiguous block of registers for all columns let start_reg = program.alloc_registers(column_count); let mut cur_reg = start_reg; - // The group by sorter rows will contain the grouping keys first. They are also the sort keys. + // Step 1: Process GROUP BY columns first + // These will be the first columns in the sorter and serve as sort keys for expr in group_by.exprs.iter() { let key_reg = cur_reg; cur_reg += 1; @@ -628,32 +652,27 @@ fn emit_loop_source( )?; } - // Process non-aggregate result columns that aren't already in group_by - if group_by.exprs.len() + aggregates.len() != plan.result_columns.len() { - for rc in non_aggregate_columns.iter() { - let expr = &rc.expr; - if !is_column_in_group_by(expr, &group_by.exprs) { - let key_reg = cur_reg; - cur_reg += 1; - translate_expr( - program, - Some(&plan.table_references), - expr, - key_reg, - &t_ctx.resolver, - )?; - } - } + // Step 2: Process columns that aren't part of GROUP BY and don't contain aggregates + // Example: SELECT col1, col2, SUM(col3) FROM table GROUP BY col1 + // Here col2 would be processed in this loop if it's in the result set + for expr in non_group_by_non_agg_expr.iter() { + let key_reg = cur_reg; + cur_reg += 1; + translate_expr( + program, + Some(&plan.table_references), + expr, + key_reg, + &t_ctx.resolver, + )?; } - // Then we have the aggregate arguments. + // Step 3: Process arguments for all aggregate functions + // For each aggregate, translate all its argument expressions for agg in aggregates.iter() { - // Here we are collecting scalars for the group by sorter, which will include - // both the group by expressions and the aggregate arguments. - // e.g. in `select u.first_name, sum(u.age) from users group by u.first_name` - // the sorter will have two scalars: u.first_name and u.age. - // these are then sorted by u.first_name, and for each u.first_name, we sum the u.age. - // the actual aggregation is done later. + // For a query like: SELECT group_col, SUM(val1), AVG(val2) FROM table GROUP BY group_col + // we'll process val1 and val2 here, storing them in the sorter so they're available + // when computing the aggregates after sorting by group_col for expr in agg.args.iter() { let agg_reg = cur_reg; cur_reg += 1; @@ -667,9 +686,6 @@ fn emit_loop_source( } } - // TODO: although it's less often useful, SQLite does allow for expressions in the SELECT that are not part of a GROUP BY or aggregate. - // We currently ignore those and only emit the GROUP BY keys and aggregate arguments. This should be fixed. - let group_by_metadata = t_ctx.meta_group_by.as_ref().unwrap(); sorter_insert( From 5632b15a449cb1e291d7ca114575dfcb2608358a Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Thu, 3 Apr 2025 21:51:10 +0300 Subject: [PATCH 09/11] add tests --- testing/agg-functions.test | 21 +++++++++++++++++++++ testing/testing | 0 2 files changed, 21 insertions(+) create mode 100644 testing/testing diff --git a/testing/agg-functions.test b/testing/agg-functions.test index 52cf2865c..f1a85dde5 100755 --- a/testing/agg-functions.test +++ b/testing/agg-functions.test @@ -99,6 +99,27 @@ do_execsql_test select-agg-binary-unary-positive { SELECT min(age) + +max(age) FROM users; } {101} +do_execsql_test select-non-agg-cols-should-be-not-null { + SELECT id, first_name, sum(age) FROM users LIMIT 1; +} {1|Jamie|503960} + +do_execsql_test select-with-group-by-and-agg-1 { + SELECT id, first_name, avg(age) FROM users group by last_name limit 1; +} {274|Debra|66.25} + +do_execsql_test select-with-group-by-and-agg-2 { + select first_name, last_name from users where state = 'AL' group by last_name limit 10; +} {Jay|Acosta +Daniel|Adams +Aaron|Baker +Sharon|Becker +Kim|Berg +Donald|Bishop +Brian|Bradford +Jesus|Bradley +John|Brown +Hunter|Burke} + do_execsql_test select-agg-json-array { SELECT json_group_array(name) FROM products; } {["hat","cap","shirt","sweater","sweatshirt","shorts","jeans","sneakers","boots","coat","accessories"]} diff --git a/testing/testing b/testing/testing new file mode 100644 index 000000000..e69de29bb From d4b8fa17f8612a65f5aa5c659a04015cbb5ebfd3 Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Thu, 3 Apr 2025 22:06:46 +0300 Subject: [PATCH 10/11] fix tests --- core/translate/group_by.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/core/translate/group_by.rs b/core/translate/group_by.rs index a36e82864..5adcf658f 100644 --- a/core/translate/group_by.rs +++ b/core/translate/group_by.rs @@ -174,9 +174,6 @@ pub fn emit_group_by<'a>( .sum::(); let group_by_count = group_by.exprs.len(); let non_group_by_non_agg_column_count = non_group_by_non_agg_column_count.unwrap(); - // Count of GROUP BY columns that appear in the result set - let group_by_colls_in_result_set = - plan.result_columns.len() - non_group_by_non_agg_column_count - plan.aggregates.len(); // We have to know which group by expr present in resulting set let group_by_expr_in_res_cols: Vec = group_by @@ -492,10 +489,7 @@ pub fn emit_group_by<'a>( program.emit_insn(Insn::Null { dest: start_reg, dest_end: Some( - start_reg - + non_group_by_non_agg_column_count - + group_by_colls_in_result_set - + plan.aggregates.len() + start_reg + non_group_by_non_agg_column_count + group_by_count + plan.aggregates.len() - 1, ), }); From 0c9464e3fc3bc80a44a4ca01ab327865ad42b99a Mon Sep 17 00:00:00 2001 From: Ihor Andrianov Date: Sat, 5 Apr 2025 15:15:10 +0300 Subject: [PATCH 11/11] reduce vec allocations, add comments for magic ifs --- core/translate/emitter.rs | 6 +++- core/translate/group_by.rs | 56 ++++++++++++++++++++----------------- core/translate/main_loop.rs | 12 ++++---- 3 files changed, 40 insertions(+), 34 deletions(-) diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index 80a6db6d5..8422cea0f 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -247,12 +247,16 @@ pub fn emit_query<'a>( target_pc: after_main_loop_label, }); } + + // For non-grouped aggregation queries that also have non-aggregate columns, + // we need to ensure non-aggregate columns are only emitted once. + // This flag helps track whether we've already emitted these columns. if !plan.aggregates.is_empty() && plan.group_by.is_none() && plan.result_columns.iter().any(|c| !c.contains_aggregates) { let flag = program.alloc_register(); - program.emit_int(0, flag); + program.emit_int(0, flag); // Initialize flag to 0 (not yet emitted) t_ctx.reg_nonagg_emit_once_flag = Some(flag); } diff --git a/core/translate/group_by.rs b/core/translate/group_by.rs index 5adcf658f..68f732cbb 100644 --- a/core/translate/group_by.rs +++ b/core/translate/group_by.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, rc::Rc}; +use std::rc::Rc; use limbo_sqlite3_parser::ast; @@ -176,32 +176,29 @@ pub fn emit_group_by<'a>( let non_group_by_non_agg_column_count = non_group_by_non_agg_column_count.unwrap(); // We have to know which group by expr present in resulting set - let group_by_expr_in_res_cols: Vec = group_by - .exprs - .iter() - .map(|expr| { - plan.result_columns - .iter() - .any(|e| exprs_are_equivalent(&e.expr, expr)) - }) - .collect(); + let group_by_expr_in_res_cols = group_by.exprs.iter().map(|expr| { + plan.result_columns + .iter() + .any(|e| exprs_are_equivalent(&e.expr, expr)) + }); // Create a map from sorter column index to result register // This helps track where each column from the sorter should be stored - let mut column_register_mapping = HashMap::new(); + let mut column_register_mapping = + vec![None; group_by_count + non_group_by_non_agg_column_count]; let mut next_reg = reg_non_aggregate_exprs_acc; // Map GROUP BY columns that are in the result set to registers - for (i, is_in_result) in group_by_expr_in_res_cols.iter().enumerate() { - if *is_in_result { - column_register_mapping.insert(i, next_reg); + for (i, is_in_result) in group_by_expr_in_res_cols.clone().enumerate() { + if is_in_result { + column_register_mapping[i] = Some(next_reg); next_reg += 1; } } // Handle other non-aggregate columns that aren't part of GROUP BY and not part of Aggregation function for i in group_by_count..group_by_count + non_group_by_non_agg_column_count { - column_register_mapping.insert(i, next_reg); + column_register_mapping[i] = Some(next_reg); next_reg += 1; } @@ -344,12 +341,14 @@ pub fn emit_group_by<'a>( }); // Read non-aggregate columns from the current row - for (sorter_column_index, dest_reg) in column_register_mapping.iter() { - program.emit_insn(Insn::Column { - cursor_id: pseudo_cursor, - column: *sorter_column_index, - dest: *dest_reg, - }); + for (sorter_column_index, dest_reg) in column_register_mapping.iter().enumerate() { + if let Some(dest_reg) = dest_reg { + program.emit_insn(Insn::Column { + cursor_id: pseudo_cursor, + column: sorter_column_index, + dest: *dest_reg, + }); + } } // Mark that we've stored data for this group @@ -416,10 +415,15 @@ pub fn emit_group_by<'a>( } // Map GROUP BY expressions to their registers in the result set - for (i, expr) in group_by.exprs.iter().enumerate() { - if group_by_expr_in_res_cols[i] { - if let Some(reg) = &column_register_mapping.get(&i) { - t_ctx.resolver.expr_to_reg_cache.push((expr, **reg)); + for (i, (expr, is_in_result)) in group_by + .exprs + .iter() + .zip(group_by_expr_in_res_cols) + .enumerate() + { + if is_in_result { + if let Some(reg) = &column_register_mapping.get(i).and_then(|opt| *opt) { + t_ctx.resolver.expr_to_reg_cache.push((expr, *reg)); } } } @@ -432,7 +436,7 @@ pub fn emit_group_by<'a>( for (idx, rc) in non_agg_cols.enumerate() { let sorter_idx = group_by_count + idx; - if let Some(®) = column_register_mapping.get(&sorter_idx) { + if let Some(reg) = column_register_mapping.get(sorter_idx).and_then(|opt| *opt) { t_ctx.resolver.expr_to_reg_cache.push((&rc.expr, reg)); } } diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index 50f5a948c..f6f139d26 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -613,13 +613,12 @@ fn emit_loop_source( .filter(|rc| { !rc.contains_aggregates && !is_column_in_group_by(&rc.expr, &group_by.exprs) }) - .map(|rc| &rc.expr) - .collect::>(); - + .map(|rc| &rc.expr); + let non_agg_count = non_group_by_non_agg_expr.clone().count(); // Store the count of non-GROUP BY, non-aggregate columns in the metadata // This will be used later during aggregation processing t_ctx.meta_group_by.as_mut().map(|meta| { - meta.non_group_by_non_agg_column_count = Some(non_group_by_non_agg_expr.len()); + meta.non_group_by_non_agg_column_count = Some(non_agg_count); meta }); @@ -631,8 +630,7 @@ fn emit_loop_source( .sum::(); // Calculate total number of registers needed for all columns in the sorter - let column_count = - group_by.exprs.len() + aggregate_arguments_count + non_group_by_non_agg_expr.len(); + let column_count = group_by.exprs.len() + aggregate_arguments_count + non_agg_count; // Allocate a contiguous block of registers for all columns let start_reg = program.alloc_registers(column_count); @@ -655,7 +653,7 @@ fn emit_loop_source( // Step 2: Process columns that aren't part of GROUP BY and don't contain aggregates // Example: SELECT col1, col2, SUM(col3) FROM table GROUP BY col1 // Here col2 would be processed in this loop if it's in the result set - for expr in non_group_by_non_agg_expr.iter() { + for expr in non_group_by_non_agg_expr { let key_reg = cur_reg; cur_reg += 1; translate_expr(