diff --git a/core/translate/expr.rs b/core/translate/expr.rs index d9e1c281d..7d5f60bc0 100644 --- a/core/translate/expr.rs +++ b/core/translate/expr.rs @@ -30,6 +30,7 @@ pub struct ConditionMetadata { pub jump_if_condition_is_true: bool, pub jump_target_when_true: BranchOffset, pub jump_target_when_false: BranchOffset, + pub jump_target_when_null: BranchOffset, } /// Container for register locations of values that can be referenced in RETURNING expressions @@ -154,128 +155,127 @@ macro_rules! expect_arguments_even { /// /// This is extracted from the original conditional implementation to be reusable. /// The logic exactly matches the original conditional InList implementation. +/// +/// An IN expression has one of the following formats: +/// ```sql +/// x IN (y1, y2,...,yN) +/// x IN (subquery) (Not yet implemented) +/// ``` +/// The result of an IN operator is one of TRUE, FALSE, or NULL. A NULL result +/// means that it cannot be determined if the LHS is contained in the RHS due +/// to the presence of NULL values. +/// +/// Currently, we do a simple full-scan, yet it's not ideal when there are many rows +/// on RHS. (Check sqlite's in-operator.md) +/// +/// Algorithm: +/// 1. Set the null-flag to false +/// 2. For each row in the RHS: +/// - Compare LHS and RHS +/// - If LHS matches RHS, returns TRUE +/// - If the comparison results in NULL, set the null-flag to true +/// 3. If the null-flag is true, return NULL +/// 4. Return FALSE +/// +/// A "NOT IN" operator is computed by first computing the equivalent IN +/// operator, then interchanging the TRUE and FALSE results. +// todo: Check right affinities #[instrument(skip(program, referenced_tables, resolver), level = Level::DEBUG)] fn translate_in_list( program: &mut ProgramBuilder, referenced_tables: Option<&TableReferences>, lhs: &ast::Expr, rhs: &[Box], - not: bool, condition_metadata: ConditionMetadata, + // dest if null should be in ConditionMetadata resolver: &Resolver, ) -> Result<()> { - // lhs is e.g. a column reference - // rhs is an Option> - // If rhs is None, it means the IN expression is always false, i.e. tbl.id IN (). - // If rhs is Some, it means the IN expression has a list of values to compare against, e.g. tbl.id IN (1, 2, 3). - // - // The IN expression is equivalent to a series of OR expressions. - // For example, `a IN (1, 2, 3)` is equivalent to `a = 1 OR a = 2 OR a = 3`. - // The NOT IN expression is equivalent to a series of AND expressions. - // For example, `a NOT IN (1, 2, 3)` is equivalent to `a != 1 AND a != 2 AND a != 3`. - // - // SQLite typically optimizes IN expressions to use a binary search on an ephemeral index if there are many values. - // For now we don't have the plumbing to do that, so we'll just emit a series of comparisons, - // which is what SQLite also does for small lists of values. - // TODO: Let's refactor this later to use a more efficient implementation conditionally based on the number of values. + let lhs_reg = if let Expr::Parenthesized(v) = lhs { + program.alloc_registers(v.len()) + } else { + program.alloc_register() + }; + let _ = translate_expr(program, referenced_tables, lhs, lhs_reg, resolver)?; + let mut check_null_reg = 0; + let label_ok = program.allocate_label(); - if rhs.is_empty() { - // If rhs is None, IN expressions are always false and NOT IN expressions are always true. - if not { - // On a trivially true NOT IN () expression we can only jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'; otherwise me must fall through. - // This is because in a more complex condition we might need to evaluate the rest of the condition. - // Note that we are already breaking up our WHERE clauses into a series of terms at "AND" boundaries, so right now we won't be running into cases where jumping on true would be incorrect, - // but once we have e.g. parenthesization and more complex conditions, not having this 'if' here would introduce a bug. - if condition_metadata.jump_if_condition_is_true { - program.emit_insn(Insn::Goto { - target_pc: condition_metadata.jump_target_when_true, - }); - } - } else { - program.emit_insn(Insn::Goto { - target_pc: condition_metadata.jump_target_when_false, - }); - } - return Ok(()); + if condition_metadata.jump_target_when_false != condition_metadata.jump_target_when_null { + check_null_reg = program.alloc_register(); + program.emit_insn(Insn::BitAnd { + lhs: lhs_reg, + rhs: lhs_reg, + dest: check_null_reg, + }); } - // The left hand side only needs to be evaluated once we have a list of values to compare against. - let lhs_reg = program.alloc_register(); - let _ = translate_expr(program, referenced_tables, lhs, lhs_reg, resolver)?; + for (i, expr) in rhs.iter().enumerate() { + let last_condition = i == rhs.len() - 1; + let rhs_reg = program.alloc_register(); + let _ = translate_expr(program, referenced_tables, expr, rhs_reg, resolver)?; - // The difference between a local jump and an "upper level" jump is that for example in this case: - // WHERE foo IN (1,2,3) OR bar = 5, - // we can immediately jump to the 'jump_target_when_true' label of the ENTIRE CONDITION if foo = 1, foo = 2, or foo = 3 without evaluating the bar = 5 condition. - // This is why in Binary-OR expressions we set jump_if_condition_is_true to true for the first condition. - // However, in this example: - // WHERE foo IN (1,2,3) AND bar = 5, - // we can't jump to the 'jump_target_when_true' label of the entire condition foo = 1, foo = 2, or foo = 3, because we still need to evaluate the bar = 5 condition later. - // This is why in that case we just jump over the rest of the IN conditions in this "local" branch which evaluates the IN condition. - let jump_target_when_true = if condition_metadata.jump_if_condition_is_true { - condition_metadata.jump_target_when_true - } else { - program.allocate_label() - }; + if check_null_reg != 0 && expr.can_be_null() { + program.emit_insn(Insn::BitAnd { + lhs: check_null_reg, + rhs: rhs_reg, + dest: check_null_reg, + }); + } - if !not { - // If it's an IN expression, we need to jump to the 'jump_target_when_true' label if any of the conditions are true. - for (i, expr) in rhs.iter().enumerate() { - let rhs_reg = program.alloc_register(); - let last_condition = i == rhs.len() - 1; - let _ = translate_expr(program, referenced_tables, expr, rhs_reg, resolver)?; - // If this is not the last condition, we need to jump to the 'jump_target_when_true' label if the condition is true. - if !last_condition { + if !last_condition + || condition_metadata.jump_target_when_false != condition_metadata.jump_target_when_null + { + if lhs_reg != rhs_reg { program.emit_insn(Insn::Eq { lhs: lhs_reg, rhs: rhs_reg, - target_pc: jump_target_when_true, + target_pc: label_ok, + // Use affinity instead flags: CmpInsFlags::default(), collation: program.curr_collation(), }); } else { - // If this is the last condition, we need to jump to the 'jump_target_when_false' label if there is no match. - program.emit_insn(Insn::Ne { - lhs: lhs_reg, - rhs: rhs_reg, - target_pc: condition_metadata.jump_target_when_false, - flags: CmpInsFlags::default().jump_if_null(), - collation: program.curr_collation(), + program.emit_insn(Insn::NotNull { + reg: lhs_reg, + target_pc: label_ok, }); } - } - // If we got here, then the last condition was a match, so we jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'. - // If not, we can just fall through without emitting an unnecessary instruction. - if condition_metadata.jump_if_condition_is_true { - program.emit_insn(Insn::Goto { - target_pc: condition_metadata.jump_target_when_true, - }); - } - } else { - // If it's a NOT IN expression, we need to jump to the 'jump_target_when_false' label if any of the conditions are true. - for expr in rhs.iter() { - let rhs_reg = program.alloc_register(); - let _ = translate_expr(program, referenced_tables, expr, rhs_reg, resolver)?; - program.emit_insn(Insn::Eq { + // sqlite3VdbeChangeP5(v, zAff[0]); + } else if lhs_reg != rhs_reg { + program.emit_insn(Insn::Ne { lhs: lhs_reg, rhs: rhs_reg, target_pc: condition_metadata.jump_target_when_false, - flags: CmpInsFlags::default().jump_if_null(), + flags: CmpInsFlags::default(), collation: program.curr_collation(), }); - } - // If we got here, then none of the conditions were a match, so we jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'. - // If not, we can just fall through without emitting an unnecessary instruction. - if condition_metadata.jump_if_condition_is_true { - program.emit_insn(Insn::Goto { - target_pc: condition_metadata.jump_target_when_true, + } else { + program.emit_insn(Insn::IsNull { + reg: lhs_reg, + target_pc: condition_metadata.jump_target_when_false, }); } } - if !condition_metadata.jump_if_condition_is_true { - program.preassign_label_to_next_insn(jump_target_when_true); + if check_null_reg != 0 { + program.emit_insn(Insn::IsNull { + reg: check_null_reg, + target_pc: condition_metadata.jump_target_when_null, + }); + program.emit_insn(Insn::Goto { + target_pc: condition_metadata.jump_target_when_false, + }); } + program.resolve_label(label_ok, program.offset()); + + // by default if IN expression is true we just continue to the next instruction + if condition_metadata.jump_if_condition_is_true { + program.emit_insn(Insn::Goto { + target_pc: condition_metadata.jump_target_when_true, + }); + } + // todo: deallocate check_null_reg + Ok(()) } @@ -402,16 +402,55 @@ pub fn translate_condition_expr( translate_expr(program, Some(referenced_tables), expr, reg, resolver)?; emit_cond_jump(program, condition_metadata, reg); } + ast::Expr::InList { lhs, not, rhs } => { + let ConditionMetadata { + jump_if_condition_is_true, + jump_target_when_true, + jump_target_when_false, + jump_target_when_null, + } = condition_metadata; + + // Adjust targets if `NOT IN` + let (adjusted_metadata, not_true_label, not_false_label) = if *not { + let not_true_label = program.allocate_label(); + let not_false_label = program.allocate_label(); + ( + ConditionMetadata { + jump_if_condition_is_true, + jump_target_when_true: not_true_label, + jump_target_when_false: not_false_label, + jump_target_when_null, + }, + Some(not_true_label), + Some(not_false_label), + ) + } else { + (condition_metadata, None, None) + }; + translate_in_list( program, Some(referenced_tables), lhs, rhs, - *not, - condition_metadata, + adjusted_metadata, resolver, )?; + + if *not { + // When IN is TRUE (match found), NOT IN should be FALSE + program.resolve_label(not_true_label.unwrap(), program.offset()); + program.emit_insn(Insn::Goto { + target_pc: jump_target_when_false, + }); + + // When IN is FALSE (no match), NOT IN should be TRUE + program.resolve_label(not_false_label.unwrap(), program.offset()); + program.emit_insn(Insn::Goto { + target_pc: jump_target_when_true, + }); + } } ast::Expr::Like { not, .. } => { let cur_reg = program.alloc_register(); @@ -2029,26 +2068,29 @@ pub fn translate_expr( // but wrap it with appropriate expression context handling let result_reg = target_register; - // Set result to NULL initially (matches SQLite behavior) - program.emit_insn(Insn::Null { - dest: result_reg, + let dest_if_false = program.allocate_label(); + let dest_if_null = program.allocate_label(); + let dest_if_true = program.allocate_label(); + + // Ideally we wouldn't need a tmp register, but currently if an IN expression + // is used inside an aggregator the target_register is cleared on every iteration, + // losing the state of the aggregator. + let tmp = program.alloc_register(); + program.emit_no_constant_insn(Insn::Null { + dest: tmp, dest_end: None, }); - let dest_if_false = program.allocate_label(); - let label_integer_conversion = program.allocate_label(); - - // Call the core InList logic with expression-appropriate condition metadata translate_in_list( program, referenced_tables, lhs, rhs, - *not, ConditionMetadata { jump_if_condition_is_true: false, - jump_target_when_true: label_integer_conversion, // will be resolved below + jump_target_when_true: dest_if_true, jump_target_when_false: dest_if_false, + jump_target_when_null: dest_if_null, }, resolver, )?; @@ -2056,27 +2098,30 @@ pub fn translate_expr( // condition true: set result to 1 program.emit_insn(Insn::Integer { value: 1, - dest: result_reg, - }); - program.emit_insn(Insn::Goto { - target_pc: label_integer_conversion, + dest: tmp, }); // False path: set result to 0 program.resolve_label(dest_if_false, program.offset()); - program.emit_insn(Insn::Integer { - value: 0, - dest: result_reg, - }); - - program.resolve_label(label_integer_conversion, program.offset()); // Force integer conversion with AddImm 0 program.emit_insn(Insn::AddImm { - register: result_reg, + register: tmp, value: 0, }); + if *not { + program.emit_insn(Insn::Not { + reg: tmp, + dest: tmp, + }); + } + program.resolve_label(dest_if_null, program.offset()); + program.emit_insn(Insn::Copy { + src_reg: tmp, + dst_reg: result_reg, + extra_amount: 0, + }); Ok(result_reg) } ast::Expr::InSelect { .. } => { diff --git a/core/translate/group_by.rs b/core/translate/group_by.rs index 94d8dee03..19f91c5bc 100644 --- a/core/translate/group_by.rs +++ b/core/translate/group_by.rs @@ -786,6 +786,8 @@ pub fn group_by_emit_row_phase<'a>( jump_if_condition_is_true: false, jump_target_when_false: labels.label_group_by_end_without_emitting_row, jump_target_when_true: if_true_target, + // treat null result has false for now + jump_target_when_null: labels.label_group_by_end_without_emitting_row, }, &t_ctx.resolver, )?; diff --git a/core/translate/index.rs b/core/translate/index.rs index 8aced50d9..5fa07dc0d 100644 --- a/core/translate/index.rs +++ b/core/translate/index.rs @@ -245,6 +245,7 @@ pub fn translate_create_index( jump_if_condition_is_true: false, jump_target_when_false: label, jump_target_when_true: BranchOffset::Placeholder, + jump_target_when_null: label, }, resolver, )?; diff --git a/core/translate/main_loop.rs b/core/translate/main_loop.rs index 03e00f01d..1f9d0a069 100644 --- a/core/translate/main_loop.rs +++ b/core/translate/main_loop.rs @@ -388,6 +388,7 @@ pub fn init_loop( jump_if_condition_is_true: false, jump_target_when_true: jump_target, jump_target_when_false: t_ctx.label_main_loop_end.unwrap(), + jump_target_when_null: t_ctx.label_main_loop_end.unwrap(), }; translate_condition_expr(program, tables, &cond.expr, meta, &t_ctx.resolver)?; program.preassign_label_to_next_insn(jump_target); @@ -713,6 +714,7 @@ fn emit_conditions( jump_if_condition_is_true: false, jump_target_when_true, jump_target_when_false: next, + jump_target_when_null: next, }; translate_condition_expr( program, @@ -727,7 +729,7 @@ fn emit_conditions( Ok(()) } -/// SQLite (and so Limbo) processes joins as a nested loop. +/// SQLite (and so Turso) processes joins as a nested loop. /// The loop may emit rows to various destinations depending on the query: /// - a GROUP BY sorter (grouping is done by sorting based on the GROUP BY keys and aggregating while the GROUP BY keys match) /// - a GROUP BY phase with no sorting (when the rows are already in the order required by the GROUP BY keys) diff --git a/core/vdbe/builder.rs b/core/vdbe/builder.rs index 70e98eb00..82710377e 100644 --- a/core/vdbe/builder.rs +++ b/core/vdbe/builder.rs @@ -341,6 +341,14 @@ impl ProgramBuilder { self.insns.push((insn, self.insns.len())); } + /// Emit an instruction that is guaranteed not to be in any constant span. + /// This ensures the instruction won't be hoisted when emit_constant_insns is called. + #[instrument(skip(self), level = Level::DEBUG)] + pub fn emit_no_constant_insn(&mut self, insn: Insn) { + self.constant_span_end_all(); + self.emit_insn(insn); + } + pub fn close_cursors(&mut self, cursors: &[CursorID]) { for cursor in cursors { self.emit_insn(Insn::Close { cursor_id: *cursor }); diff --git a/core/vdbe/insn.rs b/core/vdbe/insn.rs index 9a5bab21c..64bae3947 100644 --- a/core/vdbe/insn.rs +++ b/core/vdbe/insn.rs @@ -853,10 +853,12 @@ pub enum Insn { db: usize, }, + /// Make a copy of register src..src+extra_amount into dst..dst+extra_amount. Copy { src_reg: usize, dst_reg: usize, - extra_amount: usize, // 0 extra_amount means we include src_reg, dst_reg..=dst_reg+amount = src_reg..=src_reg+amount + /// 0 extra_amount means we include src_reg, dst_reg..=dst_reg+amount = src_reg..=src_reg+amount + extra_amount: usize, }, /// Allocate a new b-tree. diff --git a/parser/src/ast.rs b/parser/src/ast.rs index 81b47a967..5f4449afb 100644 --- a/parser/src/ast.rs +++ b/parser/src/ast.rs @@ -542,6 +542,17 @@ impl Expr { pub fn raise(resolve_type: ResolveType, expr: Option) -> Expr { Expr::Raise(resolve_type, expr.map(Box::new)) } + + pub fn can_be_null(&self) -> bool { + // todo: better handling columns. Check sqlite3ExprCanBeNull + match self { + Expr::Literal(literal) => !matches!( + literal, + Literal::Numeric(_) | Literal::String(_) | Literal::Blob(_) + ), + _ => true, + } + } } /// SQL literal diff --git a/parser/src/parser.rs b/parser/src/parser.rs index b4188df44..bbab464ff 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -1723,11 +1723,23 @@ impl<'a> Parser<'a> { _ => { let exprs = self.parse_expr_list()?; eat_expect!(self, TK_RP); - Box::new(Expr::InList { - lhs: result, - not, - rhs: exprs, - }) + // Expressions in the form: + // lhs IN () + // lhs NOT IN () + // can be simplified to constants 0 (false) and 1 (true), respectively. + // + // todo: should check if lhs has a function. If so, this optimization cannot + // be done. + if exprs.is_empty() { + let name = if not { "1" } else { "0" }; + Box::new(Expr::Literal(Literal::Numeric(name.into()))) + } else { + Box::new(Expr::InList { + lhs: result, + rhs: exprs, + not, + }) + } } } } diff --git a/testing/select.test b/testing/select.test index f70d0ada7..5dff582f5 100755 --- a/testing/select.test +++ b/testing/select.test @@ -913,6 +913,19 @@ do_execsql_test_on_specific_db {:memory:} select-in-simple { } {1 0} +do_execsql_test_on_specific_db {:memory:} select-in-with-nulls { + SELECT 4 IN (1, 4, null); + SELECT 4 NOT IN (1, 4, null); +} {1 +0} + +# All should be null +do_execsql_test_on_specific_db {:memory:} select-in-with-nulls-2 { +SELECT 1 IN (2, 3, null); +SELECT 1 NOT IN (2, 3, null); +SELECT null in (null); +} {\n\n} + do_execsql_test_on_specific_db {:memory:} select-in-complex { CREATE TABLE test_table (id INTEGER, category TEXT, value INTEGER); INSERT INTO test_table VALUES (1, 'A', 10), (2, 'B', 20), (3, 'A', 30), (4, 'C', 40);