Merge 'Fix IN operator NULL handling' from Diego Reis

Closes #3277
Basically changed what we used to do to match what and how SQLite does.

Closes #3606
This commit is contained in:
Jussi Saurio
2025-10-10 10:17:57 +03:00
committed by GitHub
9 changed files with 213 additions and 117 deletions

View File

@@ -30,6 +30,7 @@ pub struct ConditionMetadata {
pub jump_if_condition_is_true: bool,
pub jump_target_when_true: BranchOffset,
pub jump_target_when_false: BranchOffset,
pub jump_target_when_null: BranchOffset,
}
/// Container for register locations of values that can be referenced in RETURNING expressions
@@ -154,128 +155,127 @@ macro_rules! expect_arguments_even {
///
/// This is extracted from the original conditional implementation to be reusable.
/// The logic exactly matches the original conditional InList implementation.
///
/// An IN expression has one of the following formats:
/// ```sql
/// x IN (y1, y2,...,yN)
/// x IN (subquery) (Not yet implemented)
/// ```
/// The result of an IN operator is one of TRUE, FALSE, or NULL. A NULL result
/// means that it cannot be determined if the LHS is contained in the RHS due
/// to the presence of NULL values.
///
/// Currently, we do a simple full-scan, yet it's not ideal when there are many rows
/// on RHS. (Check sqlite's in-operator.md)
///
/// Algorithm:
/// 1. Set the null-flag to false
/// 2. For each row in the RHS:
/// - Compare LHS and RHS
/// - If LHS matches RHS, returns TRUE
/// - If the comparison results in NULL, set the null-flag to true
/// 3. If the null-flag is true, return NULL
/// 4. Return FALSE
///
/// A "NOT IN" operator is computed by first computing the equivalent IN
/// operator, then interchanging the TRUE and FALSE results.
// todo: Check right affinities
#[instrument(skip(program, referenced_tables, resolver), level = Level::DEBUG)]
fn translate_in_list(
program: &mut ProgramBuilder,
referenced_tables: Option<&TableReferences>,
lhs: &ast::Expr,
rhs: &[Box<ast::Expr>],
not: bool,
condition_metadata: ConditionMetadata,
// dest if null should be in ConditionMetadata
resolver: &Resolver,
) -> Result<()> {
// lhs is e.g. a column reference
// rhs is an Option<Vec<Expr>>
// If rhs is None, it means the IN expression is always false, i.e. tbl.id IN ().
// If rhs is Some, it means the IN expression has a list of values to compare against, e.g. tbl.id IN (1, 2, 3).
//
// The IN expression is equivalent to a series of OR expressions.
// For example, `a IN (1, 2, 3)` is equivalent to `a = 1 OR a = 2 OR a = 3`.
// The NOT IN expression is equivalent to a series of AND expressions.
// For example, `a NOT IN (1, 2, 3)` is equivalent to `a != 1 AND a != 2 AND a != 3`.
//
// SQLite typically optimizes IN expressions to use a binary search on an ephemeral index if there are many values.
// For now we don't have the plumbing to do that, so we'll just emit a series of comparisons,
// which is what SQLite also does for small lists of values.
// TODO: Let's refactor this later to use a more efficient implementation conditionally based on the number of values.
let lhs_reg = if let Expr::Parenthesized(v) = lhs {
program.alloc_registers(v.len())
} else {
program.alloc_register()
};
let _ = translate_expr(program, referenced_tables, lhs, lhs_reg, resolver)?;
let mut check_null_reg = 0;
let label_ok = program.allocate_label();
if rhs.is_empty() {
// If rhs is None, IN expressions are always false and NOT IN expressions are always true.
if not {
// On a trivially true NOT IN () expression we can only jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'; otherwise me must fall through.
// This is because in a more complex condition we might need to evaluate the rest of the condition.
// Note that we are already breaking up our WHERE clauses into a series of terms at "AND" boundaries, so right now we won't be running into cases where jumping on true would be incorrect,
// but once we have e.g. parenthesization and more complex conditions, not having this 'if' here would introduce a bug.
if condition_metadata.jump_if_condition_is_true {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_true,
});
}
} else {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_false,
});
}
return Ok(());
if condition_metadata.jump_target_when_false != condition_metadata.jump_target_when_null {
check_null_reg = program.alloc_register();
program.emit_insn(Insn::BitAnd {
lhs: lhs_reg,
rhs: lhs_reg,
dest: check_null_reg,
});
}
// The left hand side only needs to be evaluated once we have a list of values to compare against.
let lhs_reg = program.alloc_register();
let _ = translate_expr(program, referenced_tables, lhs, lhs_reg, resolver)?;
for (i, expr) in rhs.iter().enumerate() {
let last_condition = i == rhs.len() - 1;
let rhs_reg = program.alloc_register();
let _ = translate_expr(program, referenced_tables, expr, rhs_reg, resolver)?;
// The difference between a local jump and an "upper level" jump is that for example in this case:
// WHERE foo IN (1,2,3) OR bar = 5,
// we can immediately jump to the 'jump_target_when_true' label of the ENTIRE CONDITION if foo = 1, foo = 2, or foo = 3 without evaluating the bar = 5 condition.
// This is why in Binary-OR expressions we set jump_if_condition_is_true to true for the first condition.
// However, in this example:
// WHERE foo IN (1,2,3) AND bar = 5,
// we can't jump to the 'jump_target_when_true' label of the entire condition foo = 1, foo = 2, or foo = 3, because we still need to evaluate the bar = 5 condition later.
// This is why in that case we just jump over the rest of the IN conditions in this "local" branch which evaluates the IN condition.
let jump_target_when_true = if condition_metadata.jump_if_condition_is_true {
condition_metadata.jump_target_when_true
} else {
program.allocate_label()
};
if check_null_reg != 0 && expr.can_be_null() {
program.emit_insn(Insn::BitAnd {
lhs: check_null_reg,
rhs: rhs_reg,
dest: check_null_reg,
});
}
if !not {
// If it's an IN expression, we need to jump to the 'jump_target_when_true' label if any of the conditions are true.
for (i, expr) in rhs.iter().enumerate() {
let rhs_reg = program.alloc_register();
let last_condition = i == rhs.len() - 1;
let _ = translate_expr(program, referenced_tables, expr, rhs_reg, resolver)?;
// If this is not the last condition, we need to jump to the 'jump_target_when_true' label if the condition is true.
if !last_condition {
if !last_condition
|| condition_metadata.jump_target_when_false != condition_metadata.jump_target_when_null
{
if lhs_reg != rhs_reg {
program.emit_insn(Insn::Eq {
lhs: lhs_reg,
rhs: rhs_reg,
target_pc: jump_target_when_true,
target_pc: label_ok,
// Use affinity instead
flags: CmpInsFlags::default(),
collation: program.curr_collation(),
});
} else {
// If this is the last condition, we need to jump to the 'jump_target_when_false' label if there is no match.
program.emit_insn(Insn::Ne {
lhs: lhs_reg,
rhs: rhs_reg,
target_pc: condition_metadata.jump_target_when_false,
flags: CmpInsFlags::default().jump_if_null(),
collation: program.curr_collation(),
program.emit_insn(Insn::NotNull {
reg: lhs_reg,
target_pc: label_ok,
});
}
}
// If we got here, then the last condition was a match, so we jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'.
// If not, we can just fall through without emitting an unnecessary instruction.
if condition_metadata.jump_if_condition_is_true {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_true,
});
}
} else {
// If it's a NOT IN expression, we need to jump to the 'jump_target_when_false' label if any of the conditions are true.
for expr in rhs.iter() {
let rhs_reg = program.alloc_register();
let _ = translate_expr(program, referenced_tables, expr, rhs_reg, resolver)?;
program.emit_insn(Insn::Eq {
// sqlite3VdbeChangeP5(v, zAff[0]);
} else if lhs_reg != rhs_reg {
program.emit_insn(Insn::Ne {
lhs: lhs_reg,
rhs: rhs_reg,
target_pc: condition_metadata.jump_target_when_false,
flags: CmpInsFlags::default().jump_if_null(),
flags: CmpInsFlags::default(),
collation: program.curr_collation(),
});
}
// If we got here, then none of the conditions were a match, so we jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'.
// If not, we can just fall through without emitting an unnecessary instruction.
if condition_metadata.jump_if_condition_is_true {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_true,
} else {
program.emit_insn(Insn::IsNull {
reg: lhs_reg,
target_pc: condition_metadata.jump_target_when_false,
});
}
}
if !condition_metadata.jump_if_condition_is_true {
program.preassign_label_to_next_insn(jump_target_when_true);
if check_null_reg != 0 {
program.emit_insn(Insn::IsNull {
reg: check_null_reg,
target_pc: condition_metadata.jump_target_when_null,
});
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_false,
});
}
program.resolve_label(label_ok, program.offset());
// by default if IN expression is true we just continue to the next instruction
if condition_metadata.jump_if_condition_is_true {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_true,
});
}
// todo: deallocate check_null_reg
Ok(())
}
@@ -402,16 +402,55 @@ pub fn translate_condition_expr(
translate_expr(program, Some(referenced_tables), expr, reg, resolver)?;
emit_cond_jump(program, condition_metadata, reg);
}
ast::Expr::InList { lhs, not, rhs } => {
let ConditionMetadata {
jump_if_condition_is_true,
jump_target_when_true,
jump_target_when_false,
jump_target_when_null,
} = condition_metadata;
// Adjust targets if `NOT IN`
let (adjusted_metadata, not_true_label, not_false_label) = if *not {
let not_true_label = program.allocate_label();
let not_false_label = program.allocate_label();
(
ConditionMetadata {
jump_if_condition_is_true,
jump_target_when_true: not_true_label,
jump_target_when_false: not_false_label,
jump_target_when_null,
},
Some(not_true_label),
Some(not_false_label),
)
} else {
(condition_metadata, None, None)
};
translate_in_list(
program,
Some(referenced_tables),
lhs,
rhs,
*not,
condition_metadata,
adjusted_metadata,
resolver,
)?;
if *not {
// When IN is TRUE (match found), NOT IN should be FALSE
program.resolve_label(not_true_label.unwrap(), program.offset());
program.emit_insn(Insn::Goto {
target_pc: jump_target_when_false,
});
// When IN is FALSE (no match), NOT IN should be TRUE
program.resolve_label(not_false_label.unwrap(), program.offset());
program.emit_insn(Insn::Goto {
target_pc: jump_target_when_true,
});
}
}
ast::Expr::Like { not, .. } => {
let cur_reg = program.alloc_register();
@@ -2029,26 +2068,29 @@ pub fn translate_expr(
// but wrap it with appropriate expression context handling
let result_reg = target_register;
// Set result to NULL initially (matches SQLite behavior)
program.emit_insn(Insn::Null {
dest: result_reg,
let dest_if_false = program.allocate_label();
let dest_if_null = program.allocate_label();
let dest_if_true = program.allocate_label();
// Ideally we wouldn't need a tmp register, but currently if an IN expression
// is used inside an aggregator the target_register is cleared on every iteration,
// losing the state of the aggregator.
let tmp = program.alloc_register();
program.emit_no_constant_insn(Insn::Null {
dest: tmp,
dest_end: None,
});
let dest_if_false = program.allocate_label();
let label_integer_conversion = program.allocate_label();
// Call the core InList logic with expression-appropriate condition metadata
translate_in_list(
program,
referenced_tables,
lhs,
rhs,
*not,
ConditionMetadata {
jump_if_condition_is_true: false,
jump_target_when_true: label_integer_conversion, // will be resolved below
jump_target_when_true: dest_if_true,
jump_target_when_false: dest_if_false,
jump_target_when_null: dest_if_null,
},
resolver,
)?;
@@ -2056,27 +2098,30 @@ pub fn translate_expr(
// condition true: set result to 1
program.emit_insn(Insn::Integer {
value: 1,
dest: result_reg,
});
program.emit_insn(Insn::Goto {
target_pc: label_integer_conversion,
dest: tmp,
});
// False path: set result to 0
program.resolve_label(dest_if_false, program.offset());
program.emit_insn(Insn::Integer {
value: 0,
dest: result_reg,
});
program.resolve_label(label_integer_conversion, program.offset());
// Force integer conversion with AddImm 0
program.emit_insn(Insn::AddImm {
register: result_reg,
register: tmp,
value: 0,
});
if *not {
program.emit_insn(Insn::Not {
reg: tmp,
dest: tmp,
});
}
program.resolve_label(dest_if_null, program.offset());
program.emit_insn(Insn::Copy {
src_reg: tmp,
dst_reg: result_reg,
extra_amount: 0,
});
Ok(result_reg)
}
ast::Expr::InSelect { .. } => {

View File

@@ -786,6 +786,8 @@ pub fn group_by_emit_row_phase<'a>(
jump_if_condition_is_true: false,
jump_target_when_false: labels.label_group_by_end_without_emitting_row,
jump_target_when_true: if_true_target,
// treat null result has false for now
jump_target_when_null: labels.label_group_by_end_without_emitting_row,
},
&t_ctx.resolver,
)?;

View File

@@ -245,6 +245,7 @@ pub fn translate_create_index(
jump_if_condition_is_true: false,
jump_target_when_false: label,
jump_target_when_true: BranchOffset::Placeholder,
jump_target_when_null: label,
},
resolver,
)?;

View File

@@ -388,6 +388,7 @@ pub fn init_loop(
jump_if_condition_is_true: false,
jump_target_when_true: jump_target,
jump_target_when_false: t_ctx.label_main_loop_end.unwrap(),
jump_target_when_null: t_ctx.label_main_loop_end.unwrap(),
};
translate_condition_expr(program, tables, &cond.expr, meta, &t_ctx.resolver)?;
program.preassign_label_to_next_insn(jump_target);
@@ -713,6 +714,7 @@ fn emit_conditions(
jump_if_condition_is_true: false,
jump_target_when_true,
jump_target_when_false: next,
jump_target_when_null: next,
};
translate_condition_expr(
program,
@@ -727,7 +729,7 @@ fn emit_conditions(
Ok(())
}
/// SQLite (and so Limbo) processes joins as a nested loop.
/// SQLite (and so Turso) processes joins as a nested loop.
/// The loop may emit rows to various destinations depending on the query:
/// - a GROUP BY sorter (grouping is done by sorting based on the GROUP BY keys and aggregating while the GROUP BY keys match)
/// - a GROUP BY phase with no sorting (when the rows are already in the order required by the GROUP BY keys)

View File

@@ -341,6 +341,14 @@ impl ProgramBuilder {
self.insns.push((insn, self.insns.len()));
}
/// Emit an instruction that is guaranteed not to be in any constant span.
/// This ensures the instruction won't be hoisted when emit_constant_insns is called.
#[instrument(skip(self), level = Level::DEBUG)]
pub fn emit_no_constant_insn(&mut self, insn: Insn) {
self.constant_span_end_all();
self.emit_insn(insn);
}
pub fn close_cursors(&mut self, cursors: &[CursorID]) {
for cursor in cursors {
self.emit_insn(Insn::Close { cursor_id: *cursor });

View File

@@ -853,10 +853,12 @@ pub enum Insn {
db: usize,
},
/// Make a copy of register src..src+extra_amount into dst..dst+extra_amount.
Copy {
src_reg: usize,
dst_reg: usize,
extra_amount: usize, // 0 extra_amount means we include src_reg, dst_reg..=dst_reg+amount = src_reg..=src_reg+amount
/// 0 extra_amount means we include src_reg, dst_reg..=dst_reg+amount = src_reg..=src_reg+amount
extra_amount: usize,
},
/// Allocate a new b-tree.

View File

@@ -542,6 +542,17 @@ impl Expr {
pub fn raise(resolve_type: ResolveType, expr: Option<Expr>) -> Expr {
Expr::Raise(resolve_type, expr.map(Box::new))
}
pub fn can_be_null(&self) -> bool {
// todo: better handling columns. Check sqlite3ExprCanBeNull
match self {
Expr::Literal(literal) => !matches!(
literal,
Literal::Numeric(_) | Literal::String(_) | Literal::Blob(_)
),
_ => true,
}
}
}
/// SQL literal

View File

@@ -1723,11 +1723,23 @@ impl<'a> Parser<'a> {
_ => {
let exprs = self.parse_expr_list()?;
eat_expect!(self, TK_RP);
Box::new(Expr::InList {
lhs: result,
not,
rhs: exprs,
})
// Expressions in the form:
// lhs IN ()
// lhs NOT IN ()
// can be simplified to constants 0 (false) and 1 (true), respectively.
//
// todo: should check if lhs has a function. If so, this optimization cannot
// be done.
if exprs.is_empty() {
let name = if not { "1" } else { "0" };
Box::new(Expr::Literal(Literal::Numeric(name.into())))
} else {
Box::new(Expr::InList {
lhs: result,
rhs: exprs,
not,
})
}
}
}
}

View File

@@ -913,6 +913,19 @@ do_execsql_test_on_specific_db {:memory:} select-in-simple {
} {1
0}
do_execsql_test_on_specific_db {:memory:} select-in-with-nulls {
SELECT 4 IN (1, 4, null);
SELECT 4 NOT IN (1, 4, null);
} {1
0}
# All should be null
do_execsql_test_on_specific_db {:memory:} select-in-with-nulls-2 {
SELECT 1 IN (2, 3, null);
SELECT 1 NOT IN (2, 3, null);
SELECT null in (null);
} {\n\n}
do_execsql_test_on_specific_db {:memory:} select-in-complex {
CREATE TABLE test_table (id INTEGER, category TEXT, value INTEGER);
INSERT INTO test_table VALUES (1, 'A', 10), (2, 'B', 20), (3, 'A', 30), (4, 'C', 40);