From 6924424f11bc5c5f02c3da816e49c96d5cd32e66 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Sat, 19 Apr 2025 12:22:40 +0300 Subject: [PATCH] optimizer: add highly unintelligent heuristics-based cost estimation --- core/translate/optimizer.rs | 160 ++++++++++++++++++++++++++++-------- 1 file changed, 125 insertions(+), 35 deletions(-) diff --git a/core/translate/optimizer.rs b/core/translate/optimizer.rs index 8e796c175..872758257 100644 --- a/core/translate/optimizer.rs +++ b/core/translate/optimizer.rs @@ -713,14 +713,80 @@ fn opposite_cmp_op(op: ast::Operator) -> ast::Operator { } /// Struct used for scoring index scans -/// Currently we just score by the number of index columns that can be utilized -/// in the scan, i.e. no statistics are used. +/// Currently we just estimate cost in a really dumb way, +/// i.e. no statistics are used. struct IndexScore { index: Option>, - score: usize, + cost: f64, constraints: Vec, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct IndexInfo { + unique: bool, + column_count: usize, +} + +const ESTIMATED_HARDCODED_ROWS_PER_TABLE: f64 = 1000.0; + +/// Unbelievably dumb cost estimate for rows scanned by an index scan. +fn dumb_cost_estimator( + index_info: Option, + constraints: &[IndexConstraint], + is_inner_loop: bool, + is_ephemeral: bool, +) -> f64 { + // assume that the outer table always does a full table scan :) + // this discourages building ephemeral indexes on the outer table + // (since a scan reads TABLE_ROWS rows, so an ephemeral index on the outer table would both read TABLE_ROWS rows to build the index and then seek the index) + // but encourages building it on the inner table because it's only built once but the inner loop is run as many times as the outer loop has iterations. + let loop_multiplier = if is_inner_loop { + ESTIMATED_HARDCODED_ROWS_PER_TABLE + } else { + 1.0 + }; + + // If we are building an ephemeral index, we assume we will scan the entire source table to build it. + // Non-ephemeral indexes don't need to be built. + let cost_to_build_index = is_ephemeral as usize as f64 * ESTIMATED_HARDCODED_ROWS_PER_TABLE; + + let Some(index_info) = index_info else { + return cost_to_build_index + ESTIMATED_HARDCODED_ROWS_PER_TABLE * loop_multiplier; + }; + + let final_constraint_is_range = constraints + .last() + .map_or(false, |c| c.operator != ast::Operator::Equals); + let equalities_count = constraints + .iter() + .take(if final_constraint_is_range { + constraints.len() - 1 + } else { + constraints.len() + }) + .count() as f64; + + let selectivity = match ( + index_info.unique, + index_info.column_count as f64, + equalities_count, + ) { + // no equalities: let's assume range query selectivity is 0.4. if final constraint is not range and there are no equalities, it means full table scan incoming + (_, _, 0.0) => { + if final_constraint_is_range { + 0.4 + } else { + 1.0 + } + } + // on an unique index if we have equalities across all index columns, assume very high selectivity + (true, index_cols, eq_count) if eq_count == index_cols => 0.01 * eq_count, + // some equalities: let's assume each equality has a selectivity of 0.1 and range query selectivity is 0.4 + (_, _, eq_count) => (eq_count * 0.1) * if final_constraint_is_range { 0.4 } else { 1.0 }, + }; + cost_to_build_index + selectivity * ESTIMATED_HARDCODED_ROWS_PER_TABLE * loop_multiplier +} + /// Try to extract an index search from the WHERE clause /// Returns an optional [Search] struct if an index search can be extracted, otherwise returns None. pub fn try_extract_index_search_from_where_clause( @@ -747,10 +813,11 @@ pub fn try_extract_index_search_from_where_clause( // 3. constrain the index columns in the order that they appear in the index // - e.g. if the index is on (a,b,c) then we can use all of "a = 1 AND b = 2 AND c = 3" to constrain the index scan, // - but if the where clause is "a = 1 and c = 3" then we can only use "a = 1". + let cost_of_full_table_scan = dumb_cost_estimator(None, &[], table_index != 0, false); let mut constraints_cur = vec![]; let mut best_index = IndexScore { index: None, - score: 0, + cost: cost_of_full_table_scan, constraints: vec![], }; @@ -759,10 +826,18 @@ pub fn try_extract_index_search_from_where_clause( find_index_constraints(where_clause, table_index, index, &mut constraints_cur)?; // naive scoring since we don't have statistics: prefer the index where we can use the most columns // e.g. if we can use all columns of an index on (a,b), it's better than an index of (c,d,e) where we can only use c. - let score = constraints_cur.len(); - if score > best_index.score { + let cost = dumb_cost_estimator( + Some(IndexInfo { + unique: index.unique, + column_count: index.columns.len(), + }), + &constraints_cur, + table_index != 0, + false, + ); + if cost < best_index.cost { best_index.index = Some(Arc::clone(index)); - best_index.score = score; + best_index.cost = cost; best_index.constraints.clear(); best_index.constraints.append(&mut constraints_cur); } @@ -873,6 +948,45 @@ fn get_column_position_in_index( Ok(index.column_table_pos_to_index_pos(*column)) } +fn is_potential_index_constraint(term: &WhereTerm, table_index: usize) -> bool { + // Skip terms that cannot be evaluated at this table's loop level + if !term.should_eval_at_loop(table_index) { + return false; + } + // Skip terms that are not binary comparisons + let Ok(ast::Expr::Binary(lhs, operator, rhs)) = unwrap_parens(&term.expr) else { + return false; + }; + // Only consider index scans for binary ops that are comparisons + if !matches!( + *operator, + ast::Operator::Equals + | ast::Operator::Greater + | ast::Operator::GreaterEquals + | ast::Operator::Less + | ast::Operator::LessEquals + ) { + return false; + } + + // If both lhs and rhs refer to columns from this table, we can't use this constraint + // because we can't use the index to satisfy the condition. + // Examples: + // - WHERE t.x > t.y + // - WHERE t.x + 1 > t.y - 5 + // - WHERE t.x = (t.x) + let Ok(eval_at_left) = determine_where_to_eval_expr(&lhs) else { + return false; + }; + let Ok(eval_at_right) = determine_where_to_eval_expr(&rhs) else { + return false; + }; + if eval_at_left == EvalAt::Loop(table_index) && eval_at_right == EvalAt::Loop(table_index) { + return false; + } + true +} + /// Find all [IndexConstraint]s for a given WHERE clause /// Constraints are appended as long as they constrain the index in column order. /// E.g. for index (a,b,c) to be fully used, there must be a [WhereTerm] for each of a, b, and c. @@ -886,37 +1000,13 @@ fn find_index_constraints( for position_in_index in 0..index.columns.len() { let mut found = false; for (position_in_where_clause, term) in where_clause.iter().enumerate() { - // Skip terms that cannot be evaluated at this table's loop level - if !term.should_eval_at_loop(table_index) { - continue; - } - // Skip terms that are not binary comparisons - let ast::Expr::Binary(lhs, operator, rhs) = unwrap_parens(&term.expr)? else { - continue; - }; - // Only consider index scans for binary ops that are comparisons - if !matches!( - *operator, - ast::Operator::Equals - | ast::Operator::Greater - | ast::Operator::GreaterEquals - | ast::Operator::Less - | ast::Operator::LessEquals - ) { + if !is_potential_index_constraint(term, table_index) { continue; } - // If both lhs and rhs refer to columns from this table, we can't use this constraint - // because we can't use the index to satisfy the condition. - // Examples: - // - WHERE t.x > t.y - // - WHERE t.x + 1 > t.y - 5 - // - WHERE t.x = (t.x) - if determine_where_to_eval_expr(&lhs)? == EvalAt::Loop(table_index) - && determine_where_to_eval_expr(&rhs)? == EvalAt::Loop(table_index) - { - continue; - } + let ast::Expr::Binary(lhs, operator, rhs) = unwrap_parens(&term.expr)? else { + panic!("expected binary expression"); + }; // Check if lhs is a column that is in the i'th position of the index if Some(position_in_index) == get_column_position_in_index(lhs, table_index, index)? {