mirror of
https://github.com/aljazceru/turso.git
synced 2025-12-17 00:24:21 +01:00
1064 lines
40 KiB
Rust
1064 lines
40 KiB
Rust
use crate::schema::{BTreeTable, Table};
|
||
use crate::translate::aggregation::{translate_aggregation_step, AggArgumentSource};
|
||
use crate::translate::collate::{get_collseq_from_expr, CollationSeq};
|
||
use crate::translate::emitter::{Resolver, TranslateCtx};
|
||
use crate::translate::expr::{walk_expr, walk_expr_mut, WalkControl};
|
||
use crate::translate::order_by::order_by_sorter_insert;
|
||
use crate::translate::plan::{
|
||
Aggregate, Distinctness, JoinOrderMember, JoinedTable, QueryDestination, ResultSetColumn,
|
||
SelectPlan, TableReferences, Window,
|
||
};
|
||
use crate::translate::planner::resolve_window_and_aggregate_functions;
|
||
use crate::translate::result_row::emit_select_result;
|
||
use crate::types::KeyInfo;
|
||
use crate::util::exprs_are_equivalent;
|
||
use crate::vdbe::builder::{CursorType, ProgramBuilder, TableRefIdCounter};
|
||
use crate::vdbe::insn::{InsertFlags, Insn};
|
||
use crate::vdbe::{BranchOffset, CursorID};
|
||
use crate::Result;
|
||
use std::mem;
|
||
use std::sync::Arc;
|
||
use turso_parser::ast::Name;
|
||
use turso_parser::ast::{Expr, FunctionTail, Literal, Over, SortOrder, TableInternalId};
|
||
|
||
/// Database id written into every `Expr::Column` that this module generates to reference a
/// window subquery. `prepare_window_subquery` asserts that the subquery it creates reports
/// this same id.
const SUBQUERY_DATABASE_ID: usize = 0;
|
||
|
||
struct WindowSubqueryContext<'a> {
|
||
resolver: &'a Resolver<'a>,
|
||
subquery_order_by: &'a mut Vec<(Box<Expr>, SortOrder)>,
|
||
subquery_result_columns: &'a mut Vec<ResultSetColumn>,
|
||
subquery_id: &'a TableInternalId,
|
||
}
|
||
|
||
/// Rewrite a `SELECT` plan for window function processing.
|
||
///
|
||
/// A `SELECT` may reference multiple window definitions, but internally, each `SELECT` plan
|
||
/// operates on **exactly one** window. Multiple window functions may reference the same window.
|
||
///
|
||
/// The original plan is rewritten into a series of nested subqueries, each bound to a single
|
||
/// window definition. Each subquery produces rows in the order determined by its parent window
|
||
/// definition. The innermost subquery does not have any window assigned to it; instead,
|
||
/// the FROM, WHERE, GROUP BY, and HAVING clauses from the original query are pushed down to it.
|
||
/// The outermost query retains ORDER BY, LIMIT, and OFFSET.
|
||
///
|
||
/// # Examples
|
||
/// ```sql
|
||
/// -- Example 1: Query with one window
|
||
/// SELECT
|
||
/// a+1,
|
||
/// max(b) OVER (PARTITION BY c ORDER BY d),
|
||
/// min(c) OVER (PARTITION BY c ORDER BY d)
|
||
/// FROM t1
|
||
/// ORDER BY e;
|
||
///
|
||
/// -- Rewritten form
|
||
/// SELECT
|
||
/// a+1,
|
||
/// max(b) OVER (PARTITION BY c ORDER BY d),
|
||
/// min(c) OVER (PARTITION BY c ORDER BY d)
|
||
/// FROM (SELECT a, b, c, d, e FROM t1 ORDER BY c, d)
|
||
/// ORDER BY e;
|
||
///
|
||
/// -- Example 2: Query with multiple windows
|
||
/// SELECT
|
||
/// a,
|
||
/// max(b) OVER (PARTITION BY c ORDER BY d),
|
||
/// min(c) OVER (PARTITION BY e ORDER BY f)
|
||
/// FROM t1;
|
||
///
|
||
/// -- Rewritten form
|
||
/// SELECT
|
||
/// a,
|
||
/// max(b) OVER (PARTITION BY c ORDER BY d) AS w1,
|
||
/// w2
|
||
/// FROM (
|
||
/// SELECT
|
||
/// a,
|
||
/// b,
|
||
/// c,
|
||
/// d,
|
||
/// min(c) OVER (PARTITION BY e ORDER BY f) AS w2
|
||
/// FROM (SELECT a, b, c, d, e, f FROM t1 ORDER BY e, f)
|
||
/// ORDER BY c, d
|
||
/// );
|
||
/// ```
|
||
pub fn plan_windows(
|
||
plan: &mut SelectPlan,
|
||
resolver: &Resolver,
|
||
table_ref_counter: &mut TableRefIdCounter,
|
||
windows: &mut Vec<Window>,
|
||
) -> crate::Result<()> {
|
||
// Remove named windows that are not referenced by any function, as they can be ignored.
|
||
windows.retain(|w| !w.functions.is_empty());
|
||
|
||
if !windows.is_empty() {
|
||
// Sanity check: this should never happen because the syntax disallows combining VALUES with windows
|
||
assert!(
|
||
plan.values.is_empty(),
|
||
"VALUES clause with windows is not supported"
|
||
);
|
||
}
|
||
|
||
prepare_window_subquery(plan, resolver, table_ref_counter, windows, 0)
|
||
}
|
||
|
||
fn prepare_window_subquery(
|
||
outer_plan: &mut SelectPlan,
|
||
resolver: &Resolver,
|
||
table_ref_counter: &mut TableRefIdCounter,
|
||
windows: &mut Vec<Window>,
|
||
processed_window_count: usize,
|
||
) -> crate::Result<()> {
|
||
if windows.is_empty() {
|
||
return Ok(());
|
||
}
|
||
|
||
let mut current_window = windows.swap_remove(0);
|
||
let mut subquery_result_columns = Vec::new();
|
||
let mut subquery_order_by = Vec::new();
|
||
let subquery_id = table_ref_counter.next();
|
||
|
||
if current_window.name.is_none() {
|
||
// This is part of normalizing the window definition. The remaining logic lives in
|
||
// `rewrite_expr_referencing_current_window`, which replaces inline window definitions
|
||
// with references by name.
|
||
//
|
||
// The goal is to always work with named windows instead of a mix of named and
|
||
// inline ones. This way, we don’t need to rewrite expressions embedded in inline
|
||
// definitions (there might be many equivalent definitions per subquery). Instead,
|
||
// we rewrite the named definition once, and all associated window functions
|
||
// require no additional processing.
|
||
//
|
||
// At this stage, window definitions and window functions are already bound,
|
||
// so this normalization is purely to keep the plan valid.
|
||
//
|
||
// If the generated name is not unique across the entire query, that’s acceptable —
|
||
// the final plan always associates exactly one window with one subquery.
|
||
current_window.name = Some(format!("window_{processed_window_count}"));
|
||
}
|
||
|
||
let mut ctx = WindowSubqueryContext {
|
||
resolver,
|
||
subquery_order_by: &mut subquery_order_by,
|
||
subquery_result_columns: &mut subquery_result_columns,
|
||
subquery_id: &subquery_id,
|
||
};
|
||
|
||
// Build the ORDER BY clause for the subquery by concatenating the window’s PARTITION BY
|
||
// columns with its ORDER BY columns.This ensures that rows in the subquery are returned
|
||
// in the correct order for partitioning and window function evaluation.
|
||
for expr in current_window.partition_by.iter_mut() {
|
||
append_order_by(outer_plan, expr, &SortOrder::Asc, &mut ctx)?;
|
||
current_window.deduplicated_partition_by_len = Some(ctx.subquery_result_columns.len())
|
||
}
|
||
for (expr, order) in current_window.order_by.iter_mut() {
|
||
append_order_by(outer_plan, expr, order, &mut ctx)?;
|
||
}
|
||
|
||
// Rewrite expressions from the outer query’s result columns and ORDER BY clause so that
|
||
// they reference the subquery instead. The original expressions are included in the
|
||
// subquery’s result columns.
|
||
for col in outer_plan.result_columns.iter_mut() {
|
||
rewrite_terminal_expr(
|
||
&mut outer_plan.aggregates,
|
||
&mut col.expr,
|
||
&mut current_window,
|
||
&mut ctx,
|
||
)?;
|
||
}
|
||
for (expr, _) in outer_plan.order_by.iter_mut() {
|
||
rewrite_terminal_expr(
|
||
&mut outer_plan.aggregates,
|
||
expr,
|
||
&mut current_window,
|
||
&mut ctx,
|
||
)?;
|
||
}
|
||
|
||
// When there is no ORDER BY or PARTITION BY clause, the window function takes zero arguments,
|
||
// and no other columns are selected (e.g., "SELECT count() OVER () FROM products"),
|
||
// `subquery_result_columns` may be empty. Add a constant expression to keep the query valid.
|
||
if subquery_result_columns.is_empty() {
|
||
subquery_result_columns.push(ResultSetColumn {
|
||
expr: Expr::Literal(Literal::Numeric("0".to_string())),
|
||
alias: None,
|
||
contains_aggregates: false,
|
||
});
|
||
}
|
||
|
||
let new_join_order = vec![JoinOrderMember {
|
||
table_id: subquery_id,
|
||
original_idx: 0,
|
||
is_outer: false,
|
||
}];
|
||
let new_table_references = TableReferences::new(vec![], vec![]);
|
||
|
||
let mut inner_plan = SelectPlan {
|
||
join_order: mem::replace(&mut outer_plan.join_order, new_join_order),
|
||
table_references: mem::replace(&mut outer_plan.table_references, new_table_references),
|
||
result_columns: subquery_result_columns,
|
||
where_clause: mem::take(&mut outer_plan.where_clause),
|
||
group_by: mem::take(&mut outer_plan.group_by),
|
||
order_by: subquery_order_by,
|
||
aggregates: mem::take(&mut outer_plan.aggregates),
|
||
limit: None,
|
||
offset: None,
|
||
contains_constant_false_condition: false,
|
||
query_destination: QueryDestination::placeholder_for_subquery(),
|
||
distinctness: Distinctness::NonDistinct,
|
||
values: vec![],
|
||
window: None,
|
||
non_from_clause_subqueries: vec![],
|
||
};
|
||
|
||
prepare_window_subquery(
|
||
&mut inner_plan,
|
||
resolver,
|
||
table_ref_counter,
|
||
windows,
|
||
processed_window_count + 1,
|
||
)?;
|
||
|
||
let subquery = JoinedTable::new_subquery(
|
||
format!("window_subquery_{processed_window_count}"),
|
||
inner_plan,
|
||
None,
|
||
subquery_id,
|
||
)?;
|
||
|
||
// Verify that the subquery has the expected database ID.
|
||
// This is required to ensure that assumptions in `rewrite_terminal_expr` are valid.
|
||
assert_eq!(
|
||
subquery.database_id, SUBQUERY_DATABASE_ID,
|
||
"expected subquery database id to be {SUBQUERY_DATABASE_ID}"
|
||
);
|
||
|
||
outer_plan.window = Some(current_window);
|
||
outer_plan.table_references.add_joined_table(subquery);
|
||
|
||
Ok(())
|
||
}
|
||
|
||
fn append_order_by(
|
||
plan: &mut SelectPlan,
|
||
expr: &mut Expr,
|
||
sort_order: &SortOrder,
|
||
ctx: &mut WindowSubqueryContext,
|
||
) -> crate::Result<()> {
|
||
ctx.subquery_order_by
|
||
.push((Box::new(expr.clone()), *sort_order));
|
||
|
||
let contains_aggregates =
|
||
resolve_window_and_aggregate_functions(expr, ctx.resolver, &mut plan.aggregates, None)?;
|
||
rewrite_expr_as_subquery_column(expr, ctx, contains_aggregates);
|
||
Ok(())
|
||
}
|
||
|
||
fn rewrite_terminal_expr(
|
||
aggregates: &mut Vec<Aggregate>,
|
||
top_level_expr: &mut Expr,
|
||
current_window: &mut Window,
|
||
ctx: &mut WindowSubqueryContext,
|
||
) -> crate::Result<WalkControl> {
|
||
walk_expr_mut(
|
||
top_level_expr,
|
||
&mut |expr: &mut Expr| -> crate::Result<WalkControl> {
|
||
match expr {
|
||
Expr::FunctionCall { filter_over, .. }
|
||
| Expr::FunctionCallStar { filter_over, .. } => {
|
||
if filter_over.over_clause.is_none() {
|
||
// If the expression is a standard aggregate (non-window), push it down
|
||
// to the subquery.
|
||
if aggregates
|
||
.iter()
|
||
.any(|a| exprs_are_equivalent(&a.original_expr, expr))
|
||
{
|
||
rewrite_expr_as_subquery_column(expr, ctx, true);
|
||
}
|
||
} else if let Some(window_function) = current_window
|
||
.functions
|
||
.iter_mut()
|
||
.find(|f| exprs_are_equivalent(&f.original_expr, expr))
|
||
{
|
||
// If the expression is a window function tied to the current window,
|
||
// do not push it to the subquery. Instead, rewrite it so its child
|
||
// expressions reference the subquery where needed.
|
||
rewrite_expr_referencing_current_window(
|
||
aggregates,
|
||
current_window
|
||
.name
|
||
.clone()
|
||
.expect("current_window must always have a name here"),
|
||
ctx,
|
||
expr,
|
||
)?;
|
||
window_function.original_expr = expr.clone();
|
||
|
||
// At this point, the expression and all its children now reference the subquery,
|
||
// so further traversal is unnecessary.
|
||
return Ok(WalkControl::SkipChildren);
|
||
} else {
|
||
// This is a window function referencing a different window (not the current one).
|
||
// Push the entire expression to the subquery; it will be rewritten later.
|
||
rewrite_expr_as_subquery_column(expr, ctx, false);
|
||
}
|
||
}
|
||
Expr::RowId { .. } | Expr::Column { .. } => {
|
||
rewrite_expr_as_subquery_column(expr, ctx, false);
|
||
}
|
||
_ => {}
|
||
}
|
||
|
||
Ok(WalkControl::Continue)
|
||
},
|
||
)
|
||
}
|
||
|
||
fn rewrite_expr_referencing_current_window(
|
||
aggregates: &mut Vec<Aggregate>,
|
||
window_name: String,
|
||
ctx: &mut WindowSubqueryContext,
|
||
expr: &mut Expr,
|
||
) -> crate::Result<()> {
|
||
fn normalize_over_clause(filter_over: &mut FunctionTail, window_name: &str) {
|
||
// FILTER clause is not supported yet. Proper checks elsewhere return appropriate
|
||
// error messages, and this ensures that nothing slips through unnoticed.
|
||
assert!(
|
||
filter_over.filter_clause.is_none(),
|
||
"FILTER in window functions is not supported"
|
||
);
|
||
|
||
// Replace inline OVER clause with a reference to the named window.
|
||
// The window name may be user-provided or planner-generated.
|
||
*filter_over = FunctionTail {
|
||
filter_clause: None,
|
||
over_clause: Some(Over::Name(Name::exact(window_name.to_string()))),
|
||
};
|
||
}
|
||
|
||
match expr {
|
||
Expr::FunctionCall {
|
||
name: _,
|
||
distinctness: _,
|
||
args,
|
||
order_by,
|
||
filter_over,
|
||
} => {
|
||
for arg in args.iter_mut() {
|
||
let contains_aggregates =
|
||
resolve_window_and_aggregate_functions(arg, ctx.resolver, aggregates, None)?;
|
||
rewrite_expr_as_subquery_column(arg, ctx, contains_aggregates);
|
||
}
|
||
assert!(
|
||
order_by.is_empty(),
|
||
"ORDER BY in window functions is not supported"
|
||
);
|
||
normalize_over_clause(filter_over, &window_name);
|
||
}
|
||
Expr::FunctionCallStar {
|
||
filter_over,
|
||
name: _,
|
||
} => {
|
||
normalize_over_clause(filter_over, &window_name);
|
||
}
|
||
_ => unreachable!("only functions can reference windows"),
|
||
}
|
||
Ok(())
|
||
}
|
||
|
||
/// Rewrites an expression into a reference to a subquery column.
|
||
/// If the expression was already pushed down, reuses the existing column index.
|
||
/// Otherwise, adds it as a new column in the subquery's result set.
|
||
fn rewrite_expr_as_subquery_column(
|
||
expr: &mut Expr,
|
||
ctx: &mut WindowSubqueryContext,
|
||
contains_aggregates: bool,
|
||
) {
|
||
let (column_idx, existing) = match ctx
|
||
.subquery_result_columns
|
||
.iter()
|
||
.position(|col| exprs_are_equivalent(&col.expr, expr))
|
||
{
|
||
Some(pos) => (pos, true),
|
||
None => (ctx.subquery_result_columns.len(), false),
|
||
};
|
||
|
||
let subquery_ref = Expr::Column {
|
||
database: Some(SUBQUERY_DATABASE_ID),
|
||
table: *ctx.subquery_id,
|
||
column: column_idx,
|
||
is_rowid_alias: false,
|
||
};
|
||
|
||
if existing {
|
||
*expr = subquery_ref;
|
||
} else {
|
||
let subquery_expr = mem::replace(expr, subquery_ref);
|
||
ctx.subquery_result_columns.push(ResultSetColumn {
|
||
expr: subquery_expr,
|
||
alias: None,
|
||
contains_aggregates,
|
||
});
|
||
}
|
||
}
|
||
|
||
#[derive(Debug)]
|
||
pub struct WindowMetadata<'a> {
|
||
pub labels: WindowLabels,
|
||
pub registers: WindowRegisters,
|
||
pub cursors: WindowCursors,
|
||
/// Number of input columns in the source subquery.
|
||
pub src_column_count: usize,
|
||
/// Maps expressions in the current query that reference subquery columns
|
||
/// to their corresponding column indexes in the subquery’s result.
|
||
pub expressions_referencing_subquery: Vec<(&'a Expr, usize)>,
|
||
pub buffer_table_name: String,
|
||
}
|
||
|
||
#[derive(Debug)]
|
||
pub struct WindowLabels {
|
||
/// Address of the subroutine for flushing buffered rows
|
||
pub flush_buffer: BranchOffset,
|
||
/// Address of the end of window processing
|
||
pub window_processing_end: BranchOffset,
|
||
}
|
||
|
||
/// Registers reserved for window processing.
#[derive(Debug)]
pub struct WindowRegisters {
    /// ROWID of the last row inserted into the buffer table. NULL means the first row of a
    /// new partition has not been inserted yet.
    pub rowid: usize,
    /// First register of the array holding the partition key values of the current
    /// partition. `None` when the window has no PARTITION BY terms.
    pub partition_start: Option<usize>,
    /// First register of the array holding one accumulator state per window function
    /// (populated by `AggStep` during aggregation).
    pub acc_start: usize,
    /// First register of the array holding the current accumulator result per window
    /// function (populated by `AggValue`, which computes results without clearing the
    /// accumulators).
    pub acc_result_start: usize,
    /// Holds the address to return to once all buffered rows have been flushed.
    pub flush_buffer_return_offset: usize,
    /// First of the consecutive registers holding the column values of the current row
    /// read from the subquery.
    pub src_columns_start: usize,
    /// First register of the array holding the column values that must be passed on from
    /// the subquery to the parent query.
    pub result_columns_start: usize,
    /// First register of the array holding the ORDER BY values of the current row. Used to
    /// decide whether the current row is a "peer" of the previous one, i.e. has identical
    /// ORDER BY values. `None` when the window has no ORDER BY terms.
    pub new_order_by_columns_start: Option<usize>,
    /// First register of the array holding the ORDER BY values of the previous row, which
    /// the current row is compared against. `None` when the window has no ORDER BY terms.
    pub prev_order_by_columns_start: Option<usize>,
}
|
||
|
||
#[derive(Debug)]
|
||
pub struct WindowCursors {
|
||
/// Cursor used to read from the ephemeral buffer table
|
||
pub buffer_read: CursorID,
|
||
/// Cursor used to write to the ephemeral buffer table
|
||
pub buffer_write: CursorID,
|
||
}
|
||
|
||
pub fn init_window<'a>(
|
||
program: &mut ProgramBuilder,
|
||
t_ctx: &mut TranslateCtx<'a>,
|
||
window: &'a Window,
|
||
plan: &SelectPlan,
|
||
result_columns: &'a [ResultSetColumn],
|
||
order_by: &'a [(Box<Expr>, SortOrder)],
|
||
) -> crate::Result<()> {
|
||
let joined_tables = &plan.joined_tables();
|
||
assert_eq!(joined_tables.len(), 1, "expected only one joined table");
|
||
|
||
let src_table = &joined_tables[0];
|
||
let reg_src_columns_start =
|
||
if let Table::FromClauseSubquery(from_clause_subquery) = &src_table.table {
|
||
from_clause_subquery
|
||
.result_columns_start_reg
|
||
.expect("Subquery result_columns_start_reg must be set")
|
||
} else {
|
||
panic!(
|
||
"expected source table to be a FromClauseSubquery, but got: {:?}",
|
||
src_table.table
|
||
);
|
||
};
|
||
|
||
let src_columns = src_table.columns().to_vec();
|
||
let src_column_count = src_columns.len();
|
||
let window_name = window.name.clone().expect("window name is missing");
|
||
let partition_by_len = window.partition_by.len();
|
||
let order_by_len = window.order_by.len();
|
||
let window_function_count = window.functions.len();
|
||
|
||
// An ephemeral table used to buffer rows for the current frame
|
||
let buffer_table = Arc::new(BTreeTable {
|
||
root_page: 0,
|
||
// TODO: Generating the name this way may cause collisions with real tables in the
|
||
// attached database. Other ephemeral tables are created similarly, so it’s left
|
||
// as-is for now. Ideally, there should be a way to mark tables as ephemeral so
|
||
// they can be handled differently from regular tables.
|
||
name: format!("buffer_table_{window_name}"),
|
||
has_rowid: true,
|
||
primary_key_columns: vec![],
|
||
columns: src_columns,
|
||
is_strict: false,
|
||
unique_sets: vec![],
|
||
has_autoincrement: false,
|
||
foreign_keys: vec![],
|
||
});
|
||
let cursor_buffer_read = program.alloc_cursor_id(CursorType::BTreeTable(buffer_table.clone()));
|
||
let cursor_buffer_write = program.alloc_cursor_id(CursorType::BTreeTable(buffer_table.clone()));
|
||
program.emit_insn(Insn::OpenEphemeral {
|
||
cursor_id: cursor_buffer_read,
|
||
is_table: true,
|
||
});
|
||
program.emit_insn(Insn::OpenDup {
|
||
original_cursor_id: cursor_buffer_read,
|
||
new_cursor_id: cursor_buffer_write,
|
||
});
|
||
|
||
// Window function processing is similar to aggregation processing in how results are mapped
|
||
// to registers. Each function expression is stored in `expr_to_reg_cache` along with its
|
||
// result register. Later, when bytecode generation encounters the expression, the value is
|
||
// copied from the result register instead of generating code to evaluate the expression.
|
||
let reg_acc_start = program.alloc_registers(window_function_count);
|
||
let reg_acc_result_start = program.alloc_registers(window_function_count);
|
||
for (i, func) in window.functions.iter().enumerate() {
|
||
t_ctx
|
||
.resolver
|
||
.expr_to_reg_cache
|
||
.push((&func.original_expr, reg_acc_result_start + i));
|
||
}
|
||
|
||
// The same approach applies to expressions referencing the subquery (columns).
|
||
// Instead of reading directly from the subquery, we redirect them to the corresponding
|
||
// result registers. This is necessary because rows are buffered in an ephemeral table and
|
||
// returned according to the rules of the window definition.
|
||
let expressions_referencing_subquery =
|
||
collect_expressions_referencing_subquery(result_columns, order_by, &src_table.internal_id)?;
|
||
let reg_col_start = program.alloc_registers(expressions_referencing_subquery.len());
|
||
for (i, (expr, _)) in expressions_referencing_subquery.iter().enumerate() {
|
||
t_ctx
|
||
.resolver
|
||
.expr_to_reg_cache
|
||
.push((expr, reg_col_start + i));
|
||
}
|
||
|
||
t_ctx.meta_window = Some(WindowMetadata {
|
||
labels: WindowLabels {
|
||
flush_buffer: program.allocate_label(),
|
||
window_processing_end: program.allocate_label(),
|
||
},
|
||
registers: WindowRegisters {
|
||
rowid: program.alloc_registers_and_init_w_null(1),
|
||
partition_start: if partition_by_len > 0 {
|
||
Some(program.alloc_registers_and_init_w_null(partition_by_len))
|
||
} else {
|
||
None
|
||
},
|
||
acc_start: reg_acc_start,
|
||
acc_result_start: reg_acc_result_start,
|
||
flush_buffer_return_offset: program.alloc_register(),
|
||
src_columns_start: reg_src_columns_start,
|
||
result_columns_start: reg_col_start,
|
||
prev_order_by_columns_start: alloc_optional_registers(program, order_by_len),
|
||
new_order_by_columns_start: alloc_optional_registers(program, order_by_len),
|
||
},
|
||
cursors: WindowCursors {
|
||
buffer_read: cursor_buffer_read,
|
||
buffer_write: cursor_buffer_write,
|
||
},
|
||
src_column_count,
|
||
expressions_referencing_subquery,
|
||
buffer_table_name: buffer_table.name.clone(),
|
||
});
|
||
|
||
Ok(())
|
||
}
|
||
|
||
fn alloc_optional_registers(program: &mut ProgramBuilder, count: usize) -> Option<usize> {
|
||
if count > 0 {
|
||
Some(program.alloc_registers(count))
|
||
} else {
|
||
None
|
||
}
|
||
}
|
||
|
||
fn collect_expressions_referencing_subquery<'a>(
|
||
result_columns: &'a [ResultSetColumn],
|
||
order_by: &'a [(Box<Expr>, SortOrder)],
|
||
subquery_id: &TableInternalId,
|
||
) -> crate::Result<Vec<(&'a Expr, usize)>> {
|
||
let mut expressions_referencing_subquery: Vec<(&'a Expr, usize)> = Vec::new();
|
||
|
||
for root_expr in result_columns
|
||
.iter()
|
||
.map(|col| &col.expr)
|
||
.chain(order_by.iter().map(|(e, _)| e.as_ref()))
|
||
{
|
||
walk_expr(
|
||
root_expr,
|
||
&mut |expr: &Expr| -> crate::Result<WalkControl> {
|
||
match expr {
|
||
Expr::FunctionCall { filter_over, .. }
|
||
| Expr::FunctionCallStar { filter_over, .. } => {
|
||
if filter_over.over_clause.is_some() {
|
||
return Ok(WalkControl::SkipChildren);
|
||
}
|
||
}
|
||
Expr::Column { column, table, .. } => {
|
||
assert_eq!(
|
||
table, subquery_id,
|
||
"only subquery columns can be referenced"
|
||
);
|
||
if expressions_referencing_subquery
|
||
.iter()
|
||
.all(|(_, existing_column)| column != existing_column)
|
||
{
|
||
expressions_referencing_subquery.push((expr, *column));
|
||
}
|
||
}
|
||
_ => {}
|
||
};
|
||
Ok(WalkControl::Continue)
|
||
},
|
||
)?;
|
||
}
|
||
|
||
Ok(expressions_referencing_subquery)
|
||
}
|
||
|
||
/// Emits bytecode to process a single row of the window’s input (always a subquery).
|
||
///
|
||
/// Note:
|
||
/// The **buffer table** mentioned below is an ephemeral B-tree that temporarily
|
||
/// stores rows for the current window frame.
|
||
///
|
||
/// High-level overview:
|
||
/// - Each row from the subquery is read, and its ORDER BY columns are loaded into
|
||
/// dedicated registers for comparison and partitioning purposes.
|
||
/// - If the row starts a new partition (based on PARTITION BY columns), the buffer
|
||
/// and accumulators are flushed or reset as needed.
|
||
/// - Rows are compared against the previous row to determine if they are "peers"
|
||
/// (i.e., have the same ORDER BY values). Non-peer rows may trigger flushing
|
||
/// of intermediate results.
|
||
/// - The row is then inserted into the window’s buffer table.
|
||
/// - Aggregate steps for any window functions are executed.
|
||
pub fn emit_window_loop_source(
|
||
program: &mut ProgramBuilder,
|
||
t_ctx: &mut TranslateCtx,
|
||
plan: &SelectPlan,
|
||
) -> crate::Result<()> {
|
||
let WindowMetadata {
|
||
labels,
|
||
registers,
|
||
cursors,
|
||
src_column_count: input_column_count,
|
||
buffer_table_name,
|
||
..
|
||
} = t_ctx.meta_window.as_ref().expect("missing window metadata");
|
||
let window = plan.window.as_ref().expect("missing window");
|
||
|
||
emit_load_order_by_columns(program, window, registers);
|
||
emit_flush_buffer_if_new_partition(program, labels, registers, window, plan)?;
|
||
emit_reset_state_if_new_partition(program, registers, window);
|
||
emit_flush_buffer_if_not_peer(program, labels, registers, window, plan)?;
|
||
emit_insert_row_into_buffer(
|
||
program,
|
||
registers,
|
||
cursors,
|
||
input_column_count,
|
||
buffer_table_name,
|
||
);
|
||
emit_aggregation_step(program, window, &t_ctx.resolver, plan, registers)?;
|
||
|
||
Ok(())
|
||
}
|
||
|
||
fn emit_flush_buffer_if_new_partition(
|
||
program: &mut ProgramBuilder,
|
||
labels: &WindowLabels,
|
||
registers: &WindowRegisters,
|
||
window: &Window,
|
||
plan: &SelectPlan,
|
||
) -> Result<()> {
|
||
if let Some(reg_partition_start) = registers.partition_start {
|
||
let same_partition_label = program.allocate_label();
|
||
let new_partition_label = program.allocate_label();
|
||
|
||
// Compare the first `deduplicated_partition_by_len` source columns with the saved
|
||
// partition keys. If they differ, this row starts a new partition and we flush the buffer.
|
||
let partition_by_len = window
|
||
.deduplicated_partition_by_len
|
||
.expect("deduplicated_partition_by_len must exist");
|
||
|
||
program.add_comment(
|
||
program.offset(),
|
||
"compare partition keys to detect new partition",
|
||
);
|
||
let mut compare_key_info = (0..window.partition_by.len())
|
||
.map(|_| KeyInfo {
|
||
sort_order: SortOrder::Asc,
|
||
collation: CollationSeq::default(),
|
||
})
|
||
.collect::<Vec<_>>();
|
||
for (i, c) in compare_key_info
|
||
.iter_mut()
|
||
.enumerate()
|
||
.take(window.partition_by.len())
|
||
{
|
||
let maybe_collation =
|
||
get_collseq_from_expr(&window.partition_by[i], &plan.table_references)?;
|
||
c.collation = maybe_collation.unwrap_or_default();
|
||
}
|
||
program.emit_insn(Insn::Compare {
|
||
start_reg_a: registers.src_columns_start,
|
||
start_reg_b: reg_partition_start,
|
||
count: partition_by_len,
|
||
key_info: compare_key_info,
|
||
});
|
||
program.emit_insn(Insn::Jump {
|
||
target_pc_lt: new_partition_label,
|
||
target_pc_eq: same_partition_label,
|
||
target_pc_gt: new_partition_label,
|
||
});
|
||
|
||
program.resolve_label(new_partition_label, program.offset());
|
||
program.add_comment(program.offset(), "detected new partition");
|
||
program.emit_insn(Insn::Gosub {
|
||
target_pc: labels.flush_buffer,
|
||
return_reg: registers.flush_buffer_return_offset,
|
||
});
|
||
// Reset rowid to signal the start of processing a new partition.
|
||
program.emit_insn(Insn::Null {
|
||
dest: registers.rowid,
|
||
dest_end: None,
|
||
});
|
||
program.emit_insn(Insn::Copy {
|
||
src_reg: registers.src_columns_start,
|
||
dst_reg: reg_partition_start,
|
||
extra_amount: partition_by_len - 1,
|
||
});
|
||
|
||
program.resolve_label(same_partition_label, program.offset());
|
||
}
|
||
|
||
Ok(())
|
||
}
|
||
|
||
fn emit_reset_state_if_new_partition(
|
||
program: &mut ProgramBuilder,
|
||
registers: &WindowRegisters,
|
||
window: &Window,
|
||
) {
|
||
let label_skip_reset_state = program.allocate_label();
|
||
|
||
// If rowid is null, it means we are starting a new partition. It was either set by the code
|
||
// initializing window processing or by code detecting the start of a new partition.
|
||
program.emit_insn(Insn::NotNull {
|
||
reg: registers.rowid,
|
||
target_pc: label_skip_reset_state,
|
||
});
|
||
if let Some(dst_reg_start) = registers.new_order_by_columns_start {
|
||
// Initialize previous ORDER BY values for the new partition. The first row of the
|
||
// partition is compared to itself, not to the row from the previous partition.
|
||
program.add_comment(
|
||
program.offset(),
|
||
"initialize previous peer register for new partition",
|
||
);
|
||
program.emit_insn(Insn::Copy {
|
||
src_reg: dst_reg_start,
|
||
dst_reg: registers
|
||
.prev_order_by_columns_start
|
||
.expect("prev_order_by_columns_start must exist"),
|
||
extra_amount: window.order_by.len() - 1,
|
||
});
|
||
}
|
||
// Since this is a new partition, we must reset accumulator registers.
|
||
program.add_comment(program.offset(), "reset accumulator registers");
|
||
program.emit_insn(Insn::Null {
|
||
dest: registers.acc_start,
|
||
dest_end: Some(registers.acc_start + window.functions.len() - 1),
|
||
});
|
||
|
||
program.preassign_label_to_next_insn(label_skip_reset_state);
|
||
}
|
||
|
||
fn emit_flush_buffer_if_not_peer(
|
||
program: &mut ProgramBuilder,
|
||
labels: &WindowLabels,
|
||
registers: &WindowRegisters,
|
||
window: &Window,
|
||
plan: &SelectPlan,
|
||
) -> Result<()> {
|
||
if let Some(reg_new_order_by_columns_start) = registers.new_order_by_columns_start {
|
||
let label_peer = program.allocate_label();
|
||
let label_not_peer = program.allocate_label();
|
||
let order_by_len = window.order_by.len();
|
||
let reg_prev_order_by_columns_start = registers
|
||
.prev_order_by_columns_start
|
||
.expect("prev_order_by_columns_start must exist");
|
||
|
||
program.add_comment(program.offset(), "compare ORDER BY columns to detect peer");
|
||
let mut compare_key_info = (0..window.order_by.len())
|
||
.map(|_| KeyInfo {
|
||
sort_order: SortOrder::Asc,
|
||
collation: CollationSeq::default(),
|
||
})
|
||
.collect::<Vec<_>>();
|
||
for (i, c) in compare_key_info
|
||
.iter_mut()
|
||
.enumerate()
|
||
.take(window.order_by.len())
|
||
{
|
||
let maybe_collation =
|
||
get_collseq_from_expr(&window.order_by[i].0, &plan.table_references)?;
|
||
c.collation = maybe_collation.unwrap_or_default();
|
||
}
|
||
program.emit_insn(Insn::Compare {
|
||
start_reg_a: reg_prev_order_by_columns_start,
|
||
start_reg_b: reg_new_order_by_columns_start,
|
||
count: order_by_len,
|
||
key_info: compare_key_info,
|
||
});
|
||
program.emit_insn(Insn::Jump {
|
||
target_pc_lt: label_not_peer,
|
||
target_pc_eq: label_peer,
|
||
target_pc_gt: label_not_peer,
|
||
});
|
||
|
||
program.resolve_label(label_not_peer, program.offset());
|
||
program.add_comment(program.offset(), "detected non-peer row");
|
||
program.emit_insn(Insn::Gosub {
|
||
target_pc: labels.flush_buffer,
|
||
return_reg: registers.flush_buffer_return_offset,
|
||
});
|
||
program.emit_insn(Insn::Copy {
|
||
src_reg: reg_new_order_by_columns_start,
|
||
dst_reg: reg_prev_order_by_columns_start,
|
||
extra_amount: order_by_len - 1,
|
||
});
|
||
|
||
program.resolve_label(label_peer, program.offset());
|
||
}
|
||
|
||
Ok(())
|
||
}
|
||
|
||
fn emit_load_order_by_columns(
|
||
program: &mut ProgramBuilder,
|
||
window: &Window,
|
||
registers: &WindowRegisters,
|
||
) {
|
||
if let Some(reg_new_order_by_columns_start) = registers.new_order_by_columns_start {
|
||
// Source columns are deduplicated and may appear in a different order than
|
||
// the ORDER BY terms. Therefore, we must restore the original ORDER BY layout
|
||
// here by copying the values into an array of registers.
|
||
for (i, (expr, _)) in window.order_by.iter().enumerate() {
|
||
match expr {
|
||
Expr::Column { column, .. } => {
|
||
program.emit_insn(Insn::Copy {
|
||
src_reg: registers.src_columns_start + column,
|
||
dst_reg: reg_new_order_by_columns_start + i,
|
||
extra_amount: 0,
|
||
});
|
||
}
|
||
_ => unreachable!("expected Column, got {:?}", expr),
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
fn emit_insert_row_into_buffer(
|
||
program: &mut ProgramBuilder,
|
||
registers: &WindowRegisters,
|
||
cursors: &WindowCursors,
|
||
input_column_count: &usize,
|
||
table_name: &str,
|
||
) {
|
||
let reg_record = program.alloc_register();
|
||
|
||
program.emit_insn(Insn::MakeRecord {
|
||
start_reg: registers.src_columns_start,
|
||
count: *input_column_count,
|
||
dest_reg: reg_record,
|
||
index_name: None,
|
||
affinity_str: None,
|
||
});
|
||
program.emit_insn(Insn::NewRowid {
|
||
cursor: cursors.buffer_write,
|
||
rowid_reg: registers.rowid,
|
||
prev_largest_reg: 0,
|
||
});
|
||
program.emit_insn(Insn::Insert {
|
||
cursor: cursors.buffer_write,
|
||
key_reg: registers.rowid,
|
||
record_reg: reg_record,
|
||
flag: InsertFlags::new(),
|
||
table_name: table_name.to_string(),
|
||
});
|
||
}
|
||
|
||
fn emit_aggregation_step(
|
||
program: &mut ProgramBuilder,
|
||
window: &Window,
|
||
resolver: &Resolver,
|
||
plan: &SelectPlan,
|
||
registers: &WindowRegisters,
|
||
) -> crate::Result<()> {
|
||
for (i, func) in window.functions.iter().enumerate() {
|
||
// The aggregation step is performed incrementally as each row from the subquery is
|
||
// processed. Therefore, we don’t need to access the buffer table and can obtain argument
|
||
// values directly by evaluating the expressions that reference the subquery result columns.
|
||
let args = match &func.original_expr {
|
||
Expr::FunctionCall { args, .. } => args.iter().map(|a| (**a).clone()).collect(),
|
||
Expr::FunctionCallStar { .. } => vec![],
|
||
_ => unreachable!("All window functions should be either FunctionCall or FunctionCallStar expressions"),
|
||
};
|
||
|
||
let reg_acc_start = registers.acc_start + i;
|
||
translate_aggregation_step(
|
||
program,
|
||
&plan.table_references,
|
||
AggArgumentSource::new_from_expression(&func.func, &args, &Distinctness::NonDistinct),
|
||
reg_acc_start,
|
||
resolver,
|
||
)?;
|
||
}
|
||
|
||
Ok(())
|
||
}
|
||
|
||
/// Emits bytecode to output all buffered rows produced by window processing.
|
||
///
|
||
/// The generated code has two possible entry points:
|
||
/// * **Fallthrough mode** (normal flow): After all source rows have been processed,
|
||
/// this code executes inline to flush any remaining buffered rows, then continues execution.
|
||
/// * **Subroutine mode** (jump into `labels.flush_buffer`): In this case the code
|
||
/// returns control to the address stored in `registers.flush_buffer_return_offset`
|
||
/// once all buffered rows are processed.
|
||
pub fn emit_window_results(
|
||
program: &mut ProgramBuilder,
|
||
t_ctx: &mut TranslateCtx,
|
||
plan: &SelectPlan,
|
||
) -> crate::Result<()> {
|
||
let WindowMetadata {
|
||
labels,
|
||
registers,
|
||
cursors,
|
||
..
|
||
} = t_ctx.meta_window.as_ref().expect("missing window metadata");
|
||
let window = plan.window.as_ref().expect("missing window");
|
||
|
||
let label_empty = program.allocate_label();
|
||
let label_window_processing_end = labels.window_processing_end;
|
||
let reg_flush_buffer_return_offset = registers.flush_buffer_return_offset;
|
||
let cursor_buffer_read = cursors.buffer_read;
|
||
|
||
// All source rows have already been processed at this point.
|
||
// In fallthrough mode, we are not returning to a caller — we just flush
|
||
// the buffered rows and continue execution.
|
||
program.add_comment(program.offset(), "return remaining buffered rows");
|
||
program.emit_insn(Insn::Null {
|
||
dest: registers.flush_buffer_return_offset,
|
||
dest_end: None,
|
||
});
|
||
|
||
// If control jumps here (labels.flush_buffer), we are in subroutine mode.
|
||
// In that case, after flushing the buffer, execution will return to the
|
||
// address stored in `flush_buffer_return_offset`.
|
||
program.preassign_label_to_next_insn(labels.flush_buffer);
|
||
|
||
program.emit_insn(Insn::Rewind {
|
||
cursor_id: cursor_buffer_read,
|
||
pc_if_empty: label_empty,
|
||
});
|
||
|
||
emit_return_buffered_rows(program, window, t_ctx, plan)?;
|
||
|
||
program.resolve_label(label_empty, program.offset());
|
||
|
||
program.emit_insn(Insn::ResetSorter {
|
||
cursor_id: cursor_buffer_read,
|
||
});
|
||
program.emit_insn(Insn::Return {
|
||
return_reg: reg_flush_buffer_return_offset,
|
||
can_fallthrough: true,
|
||
});
|
||
|
||
program.preassign_label_to_next_insn(label_window_processing_end);
|
||
|
||
Ok(())
|
||
}
|
||
|
||
fn emit_return_buffered_rows(
|
||
program: &mut ProgramBuilder,
|
||
window: &Window,
|
||
t_ctx: &mut TranslateCtx,
|
||
plan: &SelectPlan,
|
||
) -> crate::Result<()> {
|
||
let WindowMetadata {
|
||
labels,
|
||
registers,
|
||
cursors,
|
||
expressions_referencing_subquery,
|
||
..
|
||
} = t_ctx.meta_window.as_ref().expect("missing window metadata");
|
||
|
||
for (i, func) in window.functions.iter().enumerate() {
|
||
program.emit_insn(Insn::AggValue {
|
||
acc_reg: registers.acc_start + i,
|
||
dest_reg: registers.acc_result_start + i,
|
||
func: func.func.clone(),
|
||
});
|
||
}
|
||
|
||
let label_skip_returning_row = program.allocate_label();
|
||
let label_loop_start = program.allocate_label();
|
||
program.preassign_label_to_next_insn(label_loop_start);
|
||
|
||
// Propagate subquery result column values to the outer query (if any) or directly to
|
||
// the final output that will be returned to the user, by copying them from the buffer table
|
||
// into the dedicated registers.
|
||
for (i, (_, col_idx)) in expressions_referencing_subquery.iter().enumerate() {
|
||
let reg_result = registers.result_columns_start + i;
|
||
program.emit_column_or_rowid(cursors.buffer_read, *col_idx, reg_result);
|
||
}
|
||
t_ctx.resolver.enable_expr_to_reg_cache();
|
||
|
||
match plan.order_by.is_empty() {
|
||
true => {
|
||
emit_select_result(
|
||
program,
|
||
&t_ctx.resolver,
|
||
plan,
|
||
Some(labels.window_processing_end),
|
||
Some(label_skip_returning_row),
|
||
t_ctx.reg_nonagg_emit_once_flag,
|
||
t_ctx.reg_offset,
|
||
t_ctx.reg_result_cols_start.unwrap(),
|
||
t_ctx.limit_ctx,
|
||
)?;
|
||
}
|
||
false => {
|
||
order_by_sorter_insert(program, t_ctx, plan)?;
|
||
}
|
||
}
|
||
|
||
program.resolve_label(label_skip_returning_row, program.offset());
|
||
|
||
if let Distinctness::Distinct { ctx } = &plan.distinctness {
|
||
let distinct_ctx = ctx.as_ref().expect("distinct context must exist");
|
||
program.preassign_label_to_next_insn(distinct_ctx.label_on_conflict);
|
||
}
|
||
|
||
program.emit_insn(Insn::Next {
|
||
cursor_id: cursors.buffer_read,
|
||
pc_if_next: label_loop_start,
|
||
});
|
||
|
||
Ok(())
|
||
}
|