Merge 'Index search fixes' from Nikita Sivukhin

This PR bundles 2 fixes:
1. Index search must skip NULL values
2. UPDATE must avoid using index which column is used in the SET clause
    * This was an optimization to not do full scan in case of `UPDATE t
SET ... WHERE col = ?` but instead of doing this hacks we must properly
load updated row set to the ephemeral index and flush it after update
will be finished instead of modifying BTree inplace
    * So, for now we completely remove this optimization and quitely
wait for proper optimization to land

Reviewed-by: Jussi Saurio <jussi.saurio@gmail.com>

Closes #3459
This commit is contained in:
Preston Thorpe
2025-09-30 12:34:52 -04:00
committed by GitHub
6 changed files with 288 additions and 146 deletions

View File

@@ -551,90 +551,89 @@ pub fn open_loop(
);
// Open the loop for the index search.
// Rowid equality point lookups are handled with a SeekRowid instruction which does not loop, since it is a single row lookup.
if let Search::RowidEq { cmp_expr } = search {
let src_reg = program.alloc_register();
translate_expr(
program,
Some(table_references),
cmp_expr,
src_reg,
&t_ctx.resolver,
)?;
program.emit_insn(Insn::SeekRowid {
cursor_id: table_cursor_id
.expect("Search::RowidEq requires a table cursor"),
src_reg,
target_pc: next,
});
} else {
// Otherwise, it's an index/rowid scan, i.e. first a seek is performed and then a scan until the comparison expression is not satisfied anymore.
if let Search::Seek {
index: Some(index), ..
} = search
{
if index.ephemeral {
let table_has_rowid = if let Table::BTree(btree) = &table.table {
btree.has_rowid
} else {
false
};
Some(emit_autoindex(
program,
index,
table_cursor_id
.expect("an ephemeral index must have a source table cursor"),
index_cursor_id
.expect("an ephemeral index must have an index cursor"),
table_has_rowid,
)?)
} else {
index_cursor_id
match search {
Search::RowidEq { cmp_expr } => {
let src_reg = program.alloc_register();
translate_expr(
program,
Some(table_references),
cmp_expr,
src_reg,
&t_ctx.resolver,
)?;
program.emit_insn(Insn::SeekRowid {
cursor_id: table_cursor_id
.expect("Search::RowidEq requires a table cursor"),
src_reg,
target_pc: next,
});
}
Search::Seek { index, .. } => {
// Otherwise, it's an index/rowid scan, i.e. first a seek is performed and then a scan until the comparison expression is not satisfied anymore.
if let Some(index) = index {
if index.ephemeral {
let table_has_rowid = if let Table::BTree(btree) = &table.table {
btree.has_rowid
} else {
false
};
let _ = emit_autoindex(
program,
index,
table_cursor_id.expect(
"an ephemeral index must have a source table cursor",
),
index_cursor_id
.expect("an ephemeral index must have an index cursor"),
table_has_rowid,
)?;
}
}
} else {
index_cursor_id
};
let is_index = index_cursor_id.is_some();
let seek_cursor_id = temp_cursor_id.unwrap_or_else(|| {
index_cursor_id.unwrap_or_else(|| {
table_cursor_id
.expect("Either ephemeral or index or table cursor must be opened")
})
});
let Search::Seek { seek_def, .. } = search else {
unreachable!("Rowid equality point lookup should have been handled above");
};
let seek_cursor_id = temp_cursor_id.unwrap_or_else(|| {
index_cursor_id.unwrap_or_else(|| {
table_cursor_id.expect(
"Either ephemeral or index or table cursor must be opened",
)
})
});
let Search::Seek { seek_def, .. } = search else {
unreachable!(
"Rowid equality point lookup should have been handled above"
);
};
let start_reg = program.alloc_registers(seek_def.key.len());
emit_seek(
program,
table_references,
seek_def,
t_ctx,
seek_cursor_id,
start_reg,
loop_end,
is_index,
)?;
emit_seek_termination(
program,
table_references,
seek_def,
t_ctx,
seek_cursor_id,
start_reg,
loop_start,
loop_end,
is_index,
)?;
let start_reg = program.alloc_registers(seek_def.key.len());
emit_seek(
program,
table_references,
seek_def,
t_ctx,
seek_cursor_id,
start_reg,
loop_end,
index.as_ref(),
)?;
emit_seek_termination(
program,
table_references,
seek_def,
t_ctx,
seek_cursor_id,
start_reg,
loop_start,
loop_end,
index.as_ref(),
)?;
if let Some(index_cursor_id) = index_cursor_id {
if let Some(table_cursor_id) = table_cursor_id {
// Don't do a btree table seek until it's actually necessary to read from the table.
program.emit_insn(Insn::DeferredSeek {
index_cursor_id,
table_cursor_id,
});
if let Some(index_cursor_id) = index_cursor_id {
if let Some(table_cursor_id) = table_cursor_id {
// Don't do a btree table seek until it's actually necessary to read from the table.
program.emit_insn(Insn::DeferredSeek {
index_cursor_id,
table_cursor_id,
});
}
}
}
}
@@ -1141,23 +1140,53 @@ fn emit_seek(
seek_cursor_id: usize,
start_reg: usize,
loop_end: BranchOffset,
is_index: bool,
seek_index: Option<&Arc<Index>>,
) -> Result<()> {
let is_index = seek_index.is_some();
let Some(seek) = seek_def.seek.as_ref() else {
// If there is no seek key, we start from the first or last row of the index,
// depending on the iteration direction.
//
// Also, if we will encounter NULLs in the index at the beginning of iteration (Forward + Asc OR Backward + Desc)
// then, we must explicitly skip them as seek always has some bound condition over indexed column (e.g. c < ?, c >= ?, ...)
//
// note that table seek has some rules to convert seek key values to the integer affinity + it has some logic to check for explicit NULL searches
// so, we emit simple Rewind/Last in case of search over table BTree
// (this is safe as table BTree keys are always non-null single integer)
match seek_def.iter_dir {
IterationDirection::Forwards => {
program.emit_insn(Insn::Rewind {
cursor_id: seek_cursor_id,
pc_if_empty: loop_end,
});
if seek_index.is_some_and(|index| index.columns[0].order == SortOrder::Asc) {
program.emit_null(start_reg, None);
program.emit_insn(Insn::SeekGT {
is_index,
cursor_id: seek_cursor_id,
start_reg,
num_regs: 1,
target_pc: loop_end,
});
} else {
program.emit_insn(Insn::Rewind {
cursor_id: seek_cursor_id,
pc_if_empty: loop_end,
});
}
}
IterationDirection::Backwards => {
program.emit_insn(Insn::Last {
cursor_id: seek_cursor_id,
pc_if_empty: loop_end,
});
if seek_index.is_some_and(|index| index.columns[0].order == SortOrder::Desc) {
program.emit_null(start_reg, None);
program.emit_insn(Insn::SeekLT {
is_index,
cursor_id: seek_cursor_id,
start_reg,
num_regs: 1,
target_pc: loop_end,
});
} else {
program.emit_insn(Insn::Last {
cursor_id: seek_cursor_id,
pc_if_empty: loop_end,
});
}
}
}
return Ok(());
@@ -1254,10 +1283,37 @@ fn emit_seek_termination(
start_reg: usize,
loop_start: BranchOffset,
loop_end: BranchOffset,
is_index: bool,
seek_index: Option<&Arc<Index>>,
) -> Result<()> {
let is_index = seek_index.is_some();
let Some(termination) = seek_def.termination.as_ref() else {
program.preassign_label_to_next_insn(loop_start);
// If we will encounter NULLs in the index at the end of iteration (Forward + Desc OR Backward + Asc)
// then, we must explicitly stop before them as seek always has some bound condition over indexed column (e.g. c < ?, c >= ?, ...)
match seek_def.iter_dir {
IterationDirection::Forwards => {
if seek_index.is_some_and(|index| index.columns[0].order == SortOrder::Desc) {
program.emit_null(start_reg, None);
program.emit_insn(Insn::IdxGE {
cursor_id: seek_cursor_id,
start_reg,
num_regs: 1,
target_pc: loop_end,
});
}
}
IterationDirection::Backwards => {
if seek_index.is_some_and(|index| index.columns[0].order == SortOrder::Asc) {
program.emit_null(start_reg, None);
program.emit_insn(Insn::IdxLE {
cursor_id: seek_cursor_id,
start_reg,
num_regs: 1,
target_pc: loop_end,
});
}
}
}
return Ok(());
};

View File

@@ -142,10 +142,6 @@ fn optimize_update_plan(plan: &mut UpdatePlan, schema: &Schema) -> Result<()> {
if !plan.indexes_to_update.iter().any(|i| Arc::ptr_eq(index, i)) {
return Ok(());
}
// Fine to use index if we aren't going to be iterating over it, since it returns at most 1 row.
if table_ref.op.returns_max_1_row() {
return Ok(());
}
// Otherwise, fall back to a table scan.
table_ref.op = Operation::Scan(Scan::BTreeTable {
iter_dir: IterationDirection::Forwards,

View File

@@ -795,19 +795,6 @@ impl Operation {
Operation::Search(Search::Seek { index, .. }) => index.as_ref(),
}
}
pub fn returns_max_1_row(&self) -> bool {
match self {
Operation::Scan(_) => false,
Operation::Search(Search::RowidEq { .. }) => true,
Operation::Search(Search::Seek { index, seek_def }) => {
let Some(index) = index else {
return false;
};
index.unique && seek_def.seek.as_ref().is_some_and(|seek| seek.op.eq_only())
}
}
}
}
impl JoinedTable {

View File

@@ -787,3 +787,28 @@ do_execsql_test_on_specific_db {:memory:} rowid-references {
AND `oid` = 2
AND [oid] = 2;
} {2|2|2|2|2|2|2|2|2|2|2|2}
do_execsql_test_on_specific_db {:memory:} null-in-search {
CREATE TABLE t_x_asc (id INTEGER PRIMARY KEY, x);
CREATE INDEX t_x_asc_idx ON t_x_asc(x ASC);
CREATE TABLE t_x_desc (id INTEGER PRIMARY KEY, x);
CREATE INDEX t_x_desc_idx ON t_x_desc(x DESC);
INSERT INTO t_x_asc VALUES (1, NULL), (2, 2), (10, 10);
INSERT INTO t_x_desc VALUES (1, NULL), (2, 2), (10, 10);
SELECT * FROM t_x_asc WHERE x > 5 ORDER BY x ASC;
SELECT * FROM t_x_asc WHERE x > 5 ORDER BY x DESC;
SELECT * FROM t_x_asc WHERE x < 5 ORDER BY x ASC;
SELECT * FROM t_x_asc WHERE x < 5 ORDER BY x DESC;
SELECT * FROM t_x_desc WHERE x > 5 ORDER BY x ASC;
SELECT * FROM t_x_desc WHERE x > 5 ORDER BY x DESC;
SELECT * FROM t_x_desc WHERE x < 5 ORDER BY x ASC;
SELECT * FROM t_x_desc WHERE x < 5 ORDER BY x DESC;
} {10|10
10|10
2|2
2|2
10|10
10|10
2|2
2|2}

View File

@@ -672,6 +672,15 @@ mod tests {
#[test]
pub fn partial_index_mutation_and_upsert_fuzz() {
index_mutation_upsert_fuzz(1.0, 1);
}
#[test]
pub fn simple_index_mutation_and_upsert_fuzz() {
index_mutation_upsert_fuzz(0.0, 0);
}
fn index_mutation_upsert_fuzz(partial_index_prob: f64, conflict_chain_max_len: u32) {
let _ = env_logger::try_init();
const OUTER_ITERS: usize = 5;
const INNER_ITERS: usize = 500;
@@ -723,6 +732,7 @@ mod tests {
let functions = ["lower", "upper", "length"];
let num_pidx = rng.random_range(0..=3);
let mut conflict_match_targets = vec!["".to_string(), "(id)".to_string()];
let mut idx_ddls: Vec<String> = Vec::new();
for i in 0..num_pidx {
// Pick 1 or 2 key columns; always include "k" sometimes to get frequent conflicts.
@@ -790,13 +800,24 @@ mod tests {
parts.join(" AND ")
};
let ddl = format!(
"CREATE UNIQUE INDEX idx_p{}_{} ON t({}) WHERE {}",
outer,
i,
key_cols.join(","),
pred
);
let ddl = if rng.random_bool(partial_index_prob) {
format!(
"CREATE UNIQUE INDEX idx_p{}_{} ON t({}) WHERE {}",
outer,
i,
key_cols.join(","),
pred
)
} else {
// ON CONFLICT (...) can use only column set with non-partial UNIQUE constraint
conflict_match_targets.push(format!("({})", key_cols.join(",")));
format!(
"CREATE UNIQUE INDEX idx_p{}_{} ON t({})",
outer,
i,
key_cols.join(","),
)
};
idx_ddls.push(ddl.clone());
// Create in both engines
println!("{ddl};");
@@ -817,9 +838,11 @@ mod tests {
vals.push(v);
}
let ins = format!("INSERT INTO t VALUES ({})", vals.join(", "));
println!("{ins}; -- seed");
// Execute on both; ignore errors due to partial unique conflicts (keep seeding going)
let _ = sqlite.execute(&ins, rusqlite::params![]);
let _ = limbo_exec_rows_fallible(&limbo_db, &limbo_conn, &ins);
let sqlite_res = sqlite.execute(&ins, rusqlite::params![]);
let limbo_res = limbo_exec_rows_fallible(&limbo_db, &limbo_conn, &ins);
assert!(sqlite_res.is_ok() == limbo_res.is_ok());
}
for _ in 0..INNER_ITERS {
@@ -909,37 +932,43 @@ mod tests {
});
}
}
if rng.random_bool(0.3) {
// 30% chance ON CONFLICT DO UPDATE SET ...
let mut set_list = Vec::new();
let num_set = rng.random_range(1..=cols_list.len());
let set_cols = cols_list
.choose_multiple(&mut rng, num_set)
.cloned()
.collect::<Vec<_>>();
for c in set_cols.iter() {
let v = if c == "k" {
format!("'{}'", K_POOL.choose(&mut rng).unwrap())
} else if rng.random_bool(0.2) {
"NULL".into()
} else {
rng.random_range(-5..=15).to_string()
};
set_list.push(format!("{c} = {v}"));
let chain_length = rng.random_range(0..=conflict_chain_max_len);
let mut on_conflict = String::new();
for _ in 0..chain_length {
let idx = rng.random_range(0..conflict_match_targets.len());
let target = &conflict_match_targets[idx];
if rng.random_bool(0.8) {
let mut set_list = Vec::new();
let num_set = rng.random_range(1..=cols_list.len());
let set_cols = cols_list
.choose_multiple(&mut rng, num_set)
.cloned()
.collect::<Vec<_>>();
for c in set_cols.iter() {
let v = if c == "k" {
format!("'{}'", K_POOL.choose(&mut rng).unwrap())
} else if rng.random_bool(0.2) {
"NULL".into()
} else {
rng.random_range(-5..=15).to_string()
};
set_list.push(format!("{c} = {v}"));
}
on_conflict.push_str(&format!(
" ON CONFLICT{} DO UPDATE SET {}",
target,
set_list.join(", ")
));
} else {
on_conflict.push_str(&format!(" ON CONFLICT{target} DO NOTHING"));
}
format!(
"INSERT INTO t({}) VALUES({}) ON CONFLICT DO UPDATE SET {}",
cols_list.join(","),
vals_list.join(","),
set_list.join(", ")
)
} else {
format!(
"INSERT INTO t({}) VALUES({}) ON CONFLICT DO NOTHING",
cols_list.join(","),
vals_list.join(",")
)
}
format!(
"INSERT INTO t({}) VALUES({}) {}",
cols_list.join(","),
vals_list.join(","),
on_conflict
)
}
_ => unreachable!(),
};

View File

@@ -1,4 +1,4 @@
use crate::common::{self, maybe_setup_tracing};
use crate::common::{self, limbo_exec_rows, maybe_setup_tracing};
use crate::common::{compare_string, do_flush, TempDatabase};
use log::debug;
use std::io::{Read, Seek, Write};
@@ -790,3 +790,52 @@ fn test_insert_with_column_names() -> anyhow::Result<()> {
Ok(())
}
#[test]
pub fn delete_search_op_ignore_nulls() {
let limbo = TempDatabase::new_empty(true);
let conn = limbo.db.connect().unwrap();
for sql in [
"CREATE TABLE t (id INTEGER PRIMARY KEY AUTOINCREMENT, c INT);",
"CREATE UNIQUE INDEX t_idx ON t(c);",
"INSERT INTO t VALUES (NULL, NULL)",
"DELETE FROM t WHERE c < -1;",
] {
conn.execute(sql).unwrap();
}
assert_eq!(
vec![vec![
rusqlite::types::Value::Integer(1),
rusqlite::types::Value::Null
]],
limbo_exec_rows(&limbo, &conn, "SELECT * FROM t ORDER BY id")
);
}
#[test]
pub fn delete_eq_correct() {
let limbo = TempDatabase::new_empty(true);
let conn = limbo.db.connect().unwrap();
for sql in [
"CREATE TABLE t (id INTEGER PRIMARY KEY AUTOINCREMENT, c INT);",
"CREATE UNIQUE INDEX t_idx ON t(c);",
"INSERT INTO t VALUES (null, -1);",
"INSERT INTO t VALUES (null, -2);",
"UPDATE t SET c = NULL WHERE c = -1;",
] {
conn.execute(sql).unwrap();
}
assert_eq!(
vec![
vec![
rusqlite::types::Value::Integer(1),
rusqlite::types::Value::Null
],
vec![
rusqlite::types::Value::Integer(2),
rusqlite::types::Value::Integer(-2),
]
],
limbo_exec_rows(&limbo, &conn, "SELECT * FROM t ORDER BY id")
);
}