Merge 'IN queries' from Glauber Costa

Merge 'IN queries' from Glauber Costa

Implement IN queries.
It is currently as todo!(), but my main motivation is that scavenging
for EXPLAINs, that pattern, at least in simple queries like SELECT ...
IN (1,2,3) uses the AddImm instruction we just added.

Reviewed-by: Jussi Saurio <jussi.saurio@gmail.com>

Closes #2342
This commit is contained in:
Preston Thorpe
2025-07-31 10:00:18 -04:00
committed by GitHub
6 changed files with 258 additions and 117 deletions

View File

@@ -415,7 +415,7 @@ Modifiers:
| Opcode | Status | Comment |
|----------------|--------|---------|
| Add | Yes | |
| AddImm | No | |
| AddImm | Yes | |
| Affinity | No | |
| AggFinal | Yes | |
| AggStep | Yes | |

View File

@@ -141,6 +141,138 @@ macro_rules! expect_arguments_even {
}};
}
/// Core implementation of IN expression logic that can be used in both conditional and expression contexts.
/// This follows SQLite's approach where a single core function handles all InList cases.
///
/// This is extracted from the original conditional implementation to be reusable.
/// The logic exactly matches the original conditional InList implementation.
#[instrument(skip(program, referenced_tables, resolver), level = Level::DEBUG)]
fn translate_in_list(
program: &mut ProgramBuilder,
referenced_tables: Option<&TableReferences>,
lhs: &ast::Expr,
rhs: &Option<Vec<ast::Expr>>,
not: bool,
condition_metadata: ConditionMetadata,
resolver: &Resolver,
) -> Result<()> {
// lhs is e.g. a column reference
// rhs is an Option<Vec<Expr>>
// If rhs is None, it means the IN expression is always false, i.e. tbl.id IN ().
// If rhs is Some, it means the IN expression has a list of values to compare against, e.g. tbl.id IN (1, 2, 3).
//
// The IN expression is equivalent to a series of OR expressions.
// For example, `a IN (1, 2, 3)` is equivalent to `a = 1 OR a = 2 OR a = 3`.
// The NOT IN expression is equivalent to a series of AND expressions.
// For example, `a NOT IN (1, 2, 3)` is equivalent to `a != 1 AND a != 2 AND a != 3`.
//
// SQLite typically optimizes IN expressions to use a binary search on an ephemeral index if there are many values.
// For now we don't have the plumbing to do that, so we'll just emit a series of comparisons,
// which is what SQLite also does for small lists of values.
// TODO: Let's refactor this later to use a more efficient implementation conditionally based on the number of values.
if rhs.is_none() {
// If rhs is None, IN expressions are always false and NOT IN expressions are always true.
if not {
// On a trivially true NOT IN () expression we can only jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'; otherwise me must fall through.
// This is because in a more complex condition we might need to evaluate the rest of the condition.
// Note that we are already breaking up our WHERE clauses into a series of terms at "AND" boundaries, so right now we won't be running into cases where jumping on true would be incorrect,
// but once we have e.g. parenthesization and more complex conditions, not having this 'if' here would introduce a bug.
if condition_metadata.jump_if_condition_is_true {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_true,
});
}
} else {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_false,
});
}
return Ok(());
}
// The left hand side only needs to be evaluated once we have a list of values to compare against.
let lhs_reg = program.alloc_register();
let _ = translate_expr(program, referenced_tables, lhs, lhs_reg, resolver)?;
let rhs = rhs.as_ref().unwrap();
// The difference between a local jump and an "upper level" jump is that for example in this case:
// WHERE foo IN (1,2,3) OR bar = 5,
// we can immediately jump to the 'jump_target_when_true' label of the ENTIRE CONDITION if foo = 1, foo = 2, or foo = 3 without evaluating the bar = 5 condition.
// This is why in Binary-OR expressions we set jump_if_condition_is_true to true for the first condition.
// However, in this example:
// WHERE foo IN (1,2,3) AND bar = 5,
// we can't jump to the 'jump_target_when_true' label of the entire condition foo = 1, foo = 2, or foo = 3, because we still need to evaluate the bar = 5 condition later.
// This is why in that case we just jump over the rest of the IN conditions in this "local" branch which evaluates the IN condition.
let jump_target_when_true = if condition_metadata.jump_if_condition_is_true {
condition_metadata.jump_target_when_true
} else {
program.allocate_label()
};
if !not {
// If it's an IN expression, we need to jump to the 'jump_target_when_true' label if any of the conditions are true.
for (i, expr) in rhs.iter().enumerate() {
let rhs_reg = program.alloc_register();
let last_condition = i == rhs.len() - 1;
let _ = translate_expr(program, referenced_tables, expr, rhs_reg, resolver)?;
// If this is not the last condition, we need to jump to the 'jump_target_when_true' label if the condition is true.
if !last_condition {
program.emit_insn(Insn::Eq {
lhs: lhs_reg,
rhs: rhs_reg,
target_pc: jump_target_when_true,
flags: CmpInsFlags::default(),
collation: program.curr_collation(),
});
} else {
// If this is the last condition, we need to jump to the 'jump_target_when_false' label if there is no match.
program.emit_insn(Insn::Ne {
lhs: lhs_reg,
rhs: rhs_reg,
target_pc: condition_metadata.jump_target_when_false,
flags: CmpInsFlags::default().jump_if_null(),
collation: program.curr_collation(),
});
}
}
// If we got here, then the last condition was a match, so we jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'.
// If not, we can just fall through without emitting an unnecessary instruction.
if condition_metadata.jump_if_condition_is_true {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_true,
});
}
} else {
// If it's a NOT IN expression, we need to jump to the 'jump_target_when_false' label if any of the conditions are true.
for expr in rhs.iter() {
let rhs_reg = program.alloc_register();
let _ = translate_expr(program, referenced_tables, expr, rhs_reg, resolver)?;
program.emit_insn(Insn::Eq {
lhs: lhs_reg,
rhs: rhs_reg,
target_pc: condition_metadata.jump_target_when_false,
flags: CmpInsFlags::default().jump_if_null(),
collation: program.curr_collation(),
});
}
// If we got here, then none of the conditions were a match, so we jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'.
// If not, we can just fall through without emitting an unnecessary instruction.
if condition_metadata.jump_if_condition_is_true {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_true,
});
}
}
if !condition_metadata.jump_if_condition_is_true {
program.preassign_label_to_next_insn(jump_target_when_true);
}
Ok(())
}
#[instrument(skip(program, referenced_tables, expr, resolver), level = Level::DEBUG)]
pub fn translate_condition_expr(
program: &mut ProgramBuilder,
@@ -219,121 +351,15 @@ pub fn translate_condition_expr(
emit_cond_jump(program, condition_metadata, reg);
}
ast::Expr::InList { lhs, not, rhs } => {
// lhs is e.g. a column reference
// rhs is an Option<Vec<Expr>>
// If rhs is None, it means the IN expression is always false, i.e. tbl.id IN ().
// If rhs is Some, it means the IN expression has a list of values to compare against, e.g. tbl.id IN (1, 2, 3).
//
// The IN expression is equivalent to a series of OR expressions.
// For example, `a IN (1, 2, 3)` is equivalent to `a = 1 OR a = 2 OR a = 3`.
// The NOT IN expression is equivalent to a series of AND expressions.
// For example, `a NOT IN (1, 2, 3)` is equivalent to `a != 1 AND a != 2 AND a != 3`.
//
// SQLite typically optimizes IN expressions to use a binary search on an ephemeral index if there are many values.
// For now we don't have the plumbing to do that, so we'll just emit a series of comparisons,
// which is what SQLite also does for small lists of values.
// TODO: Let's refactor this later to use a more efficient implementation conditionally based on the number of values.
if rhs.is_none() {
// If rhs is None, IN expressions are always false and NOT IN expressions are always true.
if *not {
// On a trivially true NOT IN () expression we can only jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'; otherwise me must fall through.
// This is because in a more complex condition we might need to evaluate the rest of the condition.
// Note that we are already breaking up our WHERE clauses into a series of terms at "AND" boundaries, so right now we won't be running into cases where jumping on true would be incorrect,
// but once we have e.g. parenthesization and more complex conditions, not having this 'if' here would introduce a bug.
if condition_metadata.jump_if_condition_is_true {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_true,
});
}
} else {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_false,
});
}
return Ok(());
}
// The left hand side only needs to be evaluated once we have a list of values to compare against.
let lhs_reg = program.alloc_register();
let _ = translate_expr(program, Some(referenced_tables), lhs, lhs_reg, resolver)?;
let rhs = rhs.as_ref().unwrap();
// The difference between a local jump and an "upper level" jump is that for example in this case:
// WHERE foo IN (1,2,3) OR bar = 5,
// we can immediately jump to the 'jump_target_when_true' label of the ENTIRE CONDITION if foo = 1, foo = 2, or foo = 3 without evaluating the bar = 5 condition.
// This is why in Binary-OR expressions we set jump_if_condition_is_true to true for the first condition.
// However, in this example:
// WHERE foo IN (1,2,3) AND bar = 5,
// we can't jump to the 'jump_target_when_true' label of the entire condition foo = 1, foo = 2, or foo = 3, because we still need to evaluate the bar = 5 condition later.
// This is why in that case we just jump over the rest of the IN conditions in this "local" branch which evaluates the IN condition.
let jump_target_when_true = if condition_metadata.jump_if_condition_is_true {
condition_metadata.jump_target_when_true
} else {
program.allocate_label()
};
if !*not {
// If it's an IN expression, we need to jump to the 'jump_target_when_true' label if any of the conditions are true.
for (i, expr) in rhs.iter().enumerate() {
let rhs_reg = program.alloc_register();
let last_condition = i == rhs.len() - 1;
let _ =
translate_expr(program, Some(referenced_tables), expr, rhs_reg, resolver)?;
// If this is not the last condition, we need to jump to the 'jump_target_when_true' label if the condition is true.
if !last_condition {
program.emit_insn(Insn::Eq {
lhs: lhs_reg,
rhs: rhs_reg,
target_pc: jump_target_when_true,
flags: CmpInsFlags::default(),
collation: program.curr_collation(),
});
} else {
// If this is the last condition, we need to jump to the 'jump_target_when_false' label if there is no match.
program.emit_insn(Insn::Ne {
lhs: lhs_reg,
rhs: rhs_reg,
target_pc: condition_metadata.jump_target_when_false,
flags: CmpInsFlags::default().jump_if_null(),
collation: program.curr_collation(),
});
}
}
// If we got here, then the last condition was a match, so we jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'.
// If not, we can just fall through without emitting an unnecessary instruction.
if condition_metadata.jump_if_condition_is_true {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_true,
});
}
} else {
// If it's a NOT IN expression, we need to jump to the 'jump_target_when_false' label if any of the conditions are true.
for expr in rhs.iter() {
let rhs_reg = program.alloc_register();
let _ =
translate_expr(program, Some(referenced_tables), expr, rhs_reg, resolver)?;
program.emit_insn(Insn::Eq {
lhs: lhs_reg,
rhs: rhs_reg,
target_pc: condition_metadata.jump_target_when_false,
flags: CmpInsFlags::default().jump_if_null(),
collation: program.curr_collation(),
});
}
// If we got here, then none of the conditions were a match, so we jump to the 'jump_target_when_true' label if 'jump_if_condition_is_true'.
// If not, we can just fall through without emitting an unnecessary instruction.
if condition_metadata.jump_if_condition_is_true {
program.emit_insn(Insn::Goto {
target_pc: condition_metadata.jump_target_when_true,
});
}
}
if !condition_metadata.jump_if_condition_is_true {
program.preassign_label_to_next_insn(jump_target_when_true);
}
translate_in_list(
program,
Some(referenced_tables),
lhs,
rhs,
*not,
condition_metadata,
resolver,
)?;
}
ast::Expr::Like { not, .. } => {
let cur_reg = program.alloc_register();
@@ -2047,7 +2073,61 @@ pub fn translate_expr(
}
Ok(target_register)
}
ast::Expr::InList { .. } => todo!(),
ast::Expr::InList { lhs, rhs, not } => {
// Following SQLite's approach: use the same core logic as conditional InList,
// but wrap it with appropriate expression context handling
let result_reg = target_register;
// Set result to NULL initially (matches SQLite behavior)
program.emit_insn(Insn::Null {
dest: result_reg,
dest_end: None,
});
let dest_if_false = program.allocate_label();
let label_integer_conversion = program.allocate_label();
// Call the core InList logic with expression-appropriate condition metadata
translate_in_list(
program,
referenced_tables,
lhs,
rhs,
*not,
ConditionMetadata {
jump_if_condition_is_true: false,
jump_target_when_true: label_integer_conversion, // will be resolved below
jump_target_when_false: dest_if_false,
},
resolver,
)?;
// condition true: set result to 1
program.emit_insn(Insn::Integer {
value: 1,
dest: result_reg,
});
program.emit_insn(Insn::Goto {
target_pc: label_integer_conversion,
});
// False path: set result to 0
program.resolve_label(dest_if_false, program.offset());
program.emit_insn(Insn::Integer {
value: 0,
dest: result_reg,
});
program.resolve_label(label_integer_conversion, program.offset());
// Force integer conversion with AddImm 0
program.emit_insn(Insn::AddImm {
register: result_reg,
value: 0,
});
Ok(result_reg)
}
ast::Expr::InSelect { .. } => todo!(),
ast::Expr::InTable { .. } => todo!(),
ast::Expr::IsNull(expr) => {

View File

@@ -6197,6 +6197,37 @@ pub fn op_shift_left(
Ok(InsnFunctionStepResult::Step)
}
pub fn op_add_imm(
program: &Program,
state: &mut ProgramState,
insn: &Insn,
pager: &Rc<Pager>,
mv_store: Option<&Rc<MvStore>>,
) -> Result<InsnFunctionStepResult> {
let Insn::AddImm { register, value } = insn else {
unreachable!("unexpected Insn {:?}", insn)
};
let current = &state.registers[*register];
let current_value = match current {
Register::Value(val) => val,
Register::Aggregate(_) => &Value::Null,
Register::Record(_) => &Value::Null,
};
let int_val = match current_value {
Value::Integer(i) => i + value,
Value::Float(f) => (*f as i64) + value,
Value::Text(s) => s.as_str().parse::<i64>().unwrap_or(0) + value,
Value::Blob(_) => *value, // BLOB becomes the added value
Value::Null => *value, // NULL becomes the added value
};
state.registers[*register] = Register::Value(Value::Integer(int_val));
state.pc += 1;
Ok(InsnFunctionStepResult::Step)
}
pub fn op_variable(
program: &Program,
state: &mut ProgramState,

View File

@@ -1358,6 +1358,15 @@ pub fn insn_to_str(
0,
format!("r[{dest}]=r[{lhs}] << r[{rhs}]"),
),
Insn::AddImm { register, value } => (
"AddImm",
*register as i32,
*value as i32,
0,
Value::build_text(""),
0,
format!("r[{register}]=r[{register}]+{value}"),
),
Insn::Variable { index, dest } => (
"Variable",
usize::from(*index) as i32,

View File

@@ -877,6 +877,14 @@ pub enum Insn {
dest: usize,
},
/// Add immediate value to register and force integer conversion.
/// Add the constant P2 to the value in register P1. The result is always an integer.
/// To force any register to be an integer, just add 0.
AddImm {
register: usize, // P1: target register
value: i64, // P2: immediate value to add
},
/// Get parameter variable.
Variable {
index: NonZero<usize>,
@@ -1106,6 +1114,7 @@ impl Insn {
Insn::ParseSchema { .. } => execute::op_parse_schema,
Insn::ShiftRight { .. } => execute::op_shift_right,
Insn::ShiftLeft { .. } => execute::op_shift_left,
Insn::AddImm { .. } => execute::op_add_imm,
Insn::Variable { .. } => execute::op_variable,
Insn::ZeroOrNull { .. } => execute::op_zero_or_null,
Insn::Not { .. } => execute::op_not,

View File

@@ -687,3 +687,15 @@ do_execsql_test_skip_lines_on_specific_db 1 {:memory:} select-double-quotes-lite
SELECT "literal_string" AS col;
} {literal_string}
do_execsql_test_on_specific_db {:memory:} select-in-simple {
SELECT 1 IN (1, 2, 3);
SELECT 4 IN (1, 2, 3);
} {1
0}
do_execsql_test_on_specific_db {:memory:} select-in-complex {
CREATE TABLE test_table (id INTEGER, category TEXT, value INTEGER);
INSERT INTO test_table VALUES (1, 'A', 10), (2, 'B', 20), (3, 'A', 30), (4, 'C', 40);
SELECT * FROM test_table WHERE category IN ('A', 'B') AND value IN (10, 30, 40);
} {1|A|10
3|A|30}