From 5f724d6b2e9c9976beedf98a4c6a0381d107fb74 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Sat, 3 May 2025 15:17:16 +0300 Subject: [PATCH] Add more comments to join ordering logic --- core/translate/optimizer.rs | 78 ++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 18 deletions(-) diff --git a/core/translate/optimizer.rs b/core/translate/optimizer.rs index 701b865a8..306975ac3 100644 --- a/core/translate/optimizer.rs +++ b/core/translate/optimizer.rs @@ -358,7 +358,7 @@ enum AccessMethodKind { } /// A bitmask representing which tables are in the join. -/// For example, if the bitmask is 0b1101, then the tables 0, 1, and 3 are in the join. +/// For example, if the bitmask is 0b1101, then the tables 0, 2, and 3 are in the join. /// Since this is a mask, the tables aren't ordered. /// This is used for memoizing the best way to join a subset of N tables. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -580,12 +580,12 @@ fn compute_best_join_order( maybe_order_target: Option<&OrderTarget>, access_methods_cache: &mut HashMap, ) -> Result> { + // Skip work if we have no tables to consider. if table_references.is_empty() { return Ok(None); } - let n = table_references.len(); - let mut best_plan_memo: HashMap = HashMap::new(); + let num_tables = table_references.len(); // Compute naive left-to-right plan to use as pruning threshold let naive_plan = compute_naive_left_deep_plan( @@ -595,6 +595,10 @@ fn compute_best_join_order( maybe_order_target, access_methods_cache, )?; + + // Keep track of both 1. the best plan overall (not considering sorting), and 2. the best ordered plan (which might not be the same). + // We assign Some Cost (tm) to any required sort operation, so the best ordered plan may end up being + // the one we choose, if the cost reduction from avoiding sorting brings it below the cost of the overall best one. let mut best_ordered_plan: Option = None; let mut best_plan_is_also_ordered = if let Some(ref order_target) = maybe_order_target { plan_satisfies_order_target( @@ -606,6 +610,8 @@ fn compute_best_join_order( } else { false }; + + // If we have one table, then the "naive left-to-right plan" is always the best. if table_references.len() == 1 { return Ok(Some(BestJoinOrderResult { best_plan: naive_plan, @@ -613,11 +619,16 @@ fn compute_best_join_order( })); } let mut best_plan = naive_plan; - let mut join_order = Vec::with_capacity(n); + + // Reuse a single mutable join order to avoid allocating join orders per permutation. + let mut join_order = Vec::with_capacity(num_tables); join_order.push(JoinOrderMember { table_no: 0, is_outer: false, }); + + // Keep track of the current best cost so we can short-circuit planning for subplans + // that already exceed the cost of the current best plan. let cost_upper_bound = best_plan.cost; let cost_upper_bound_ordered = { if best_plan_is_also_ordered { @@ -627,8 +638,16 @@ fn compute_best_join_order( } }; - // Base cases: 1-table subsets - for i in 0..n { + // Keep track of the best plan for a given subset of tables. + // Consider this example: we have tables a,b,c,d to join. + // if we find that 'b JOIN a' is better than 'a JOIN b', then we don't need to even try + // to do 'a JOIN b JOIN c', because we know 'b JOIN a JOIN c' is going to be better. + // This is due to the commutativity and associativity of inner joins. + let mut best_plan_memo: HashMap = HashMap::new(); + + // Dynamic programming base case: calculate the best way to access each single table, as if + // there were no other tables. + for i in 0..num_tables { let mut mask = JoinBitmask::new(0); mask.set(i); let table_ref = &table_references[i]; @@ -656,6 +675,10 @@ fn compute_best_join_order( } join_order.clear(); + // As mentioned, inner joins are commutative. Outer joins are NOT. + // Example: + // "a LEFT JOIN b" can NOT be reordered as "b LEFT JOIN a". + // If there are outer joins in the plan, ensure correct ordering. let left_join_illegal_map = { let left_join_count = table_references .iter() @@ -687,27 +710,37 @@ fn compute_best_join_order( } }; - // Build larger plans - for subset_size in 2..=n { - for mask in generate_join_bitmasks(n, subset_size) { + // Now that we have our single-table base cases, we can start considering join subsets of 2 tables and more. + // Try to join each single table to each other table. + for subset_size in 2..=num_tables { + for mask in generate_join_bitmasks(num_tables, subset_size) { + // Keep track of the best way to join this subset of tables. + // Take the (a,b,c,d) example from above: + // E.g. for "a JOIN b JOIN c", the possibilities are (a,b,c), (a,c,b), (b,a,c) and so on. + // If we find out (b,a,c) is the best way to join these three, then we ONLY need to compute + // the cost of (b,a,c,d) in the final step, because (a,b,c,d) (and all others) are guaranteed to be worse. let mut best_for_mask: Option = None; + // also keep track of the best plan for this subset that orders the rows in an Interesting Way (tm), + // i.e. allows us to eliminate sort operations downstream. let (mut best_ordered_for_mask, mut best_for_mask_is_also_ordered) = (None, false); - // Try all possible RHS base tables in this mask - for rhs_idx in 0..n { + + // Try to join all subsets (masks) with all other tables. + // In this block, LHS is always (n-1) tables, and RHS is a single table. + for rhs_idx in 0..num_tables { let rhs_bit = 1 << rhs_idx; - // Make sure rhs is in this mask + // If the RHS table isn't a member of this join subset, skip. if mask.0 & rhs_bit == 0 { continue; } - // LHS = mask - rhs + // If there are no other tables except RHS, skip. let lhs_mask = JoinBitmask(mask.0 ^ rhs_bit); if lhs_mask.0 == 0 { continue; } - // Skip illegal left join ordering + // If this join ordering would violate LEFT JOIN ordering restrictions, skip. if let Some(illegal_lhs) = left_join_illegal_map .as_ref() .and_then(|deps| deps.get(&rhs_idx)) @@ -718,6 +751,8 @@ fn compute_best_join_order( } } + // If the already cached plan for this subset was too crappy to consider, + // then joining it with RHS won't help. Skip. let Some(lhs) = best_plan_memo.get(&lhs_mask) else { continue; }; @@ -728,6 +763,7 @@ fn compute_best_join_order( indexes_ref = indexes; } + // Build a JoinOrder out of the table bitmask we are now considering. for table_no in lhs.table_numbers.iter() { join_order.push(JoinOrderMember { table_no: *table_no, @@ -746,6 +782,7 @@ fn compute_best_join_order( }); assert!(join_order.len() == subset_size); + // Calculate the best way to join LHS with RHS. let rel = join_lhs_tables_to_rhs_table( Some(lhs), rhs_idx, @@ -758,6 +795,9 @@ fn compute_best_join_order( )?; join_order.clear(); + // Since cost_upper_bound_ordered is always >= to cost_upper_bound, + // if the cost we calculated for this plan is worse than cost_upper_bound_ordered, + // this join subset is already worse than our best plan for the ENTIRE query, so skip. if rel.cost >= cost_upper_bound_ordered { continue; } @@ -773,7 +813,9 @@ fn compute_best_join_order( false }; + // If this plan is worse than our overall best, it might still be the best ordered plan. if rel.cost >= cost_upper_bound { + // But if it isn't, skip. if !satisfies_order_target { continue; } @@ -792,7 +834,7 @@ fn compute_best_join_order( if let Some(rel) = best_ordered_for_mask.take() { let cost = rel.cost; - let has_all_tables = mask.0.count_ones() as usize == n; + let has_all_tables = mask.0.count_ones() as usize == num_tables; if has_all_tables && cost_upper_bound_ordered > cost { best_ordered_plan = Some(rel); } @@ -800,7 +842,7 @@ fn compute_best_join_order( if let Some(rel) = best_for_mask.take() { let cost = rel.cost; - let has_all_tables = mask.0.count_ones() as usize == n; + let has_all_tables = mask.0.count_ones() as usize == num_tables; if has_all_tables { if cost_upper_bound > cost { best_plan = rel; @@ -3049,7 +3091,7 @@ mod tests { ) .unwrap() .unwrap(); - /// Should just be a table scan access method + // Should just be a table scan access method assert!(matches!( access_methods_cache[&best_plan.best_access_methods[0]].kind, AccessMethodKind::TableScan { iter_dir } @@ -3615,7 +3657,7 @@ mod tests { )); } - let mut table_references = { + let table_references = { let mut refs = vec![_create_table_reference(dim_tables[0].clone(), None)]; refs.extend(dim_tables.iter().skip(1).map(|t| { _create_table_reference(