Fix DISTINCT with ORDER BY

We had a bug where we were checking for duplicates in the DISTINCT
index based on both the result column count plus any ORDER BY columns
not present in the DISTINCT clause.

This is wrong, so fix it by only using the result columns for the
dedupe check.
This commit is contained in:
Jussi Saurio
2025-08-15 14:58:14 +03:00
parent a99c8a8ca0
commit d2cfe06aa5
3 changed files with 132 additions and 4 deletions

View File

@@ -185,7 +185,6 @@ pub fn order_by_sorter_insert(
)?;
}
let mut cur_reg = start_reg + order_by_len;
let mut translated_result_col_count = 0;
for (i, rc) in result_columns.iter().enumerate() {
// If the result column is an exact duplicate of a sort key, we skip it.
if sort_metadata
@@ -203,15 +202,71 @@ pub fn order_by_sorter_insert(
cur_reg,
resolver,
)?;
translated_result_col_count += 1;
cur_reg += 1;
}
// Handle SELECT DISTINCT deduplication
if let Distinctness::Distinct { ctx } = &plan.distinctness {
let distinct_ctx = ctx.as_ref().expect("distinct context must exist");
let num_regs = order_by_len + translated_result_col_count;
distinct_ctx.emit_deduplication_insns(program, num_regs, start_reg);
// For distinctness checking with Insn::Found, we need a contiguous run of registers containing all the result columns.
// The emitted columns are in the ORDER BY sorter order, which may be different from the SELECT order, and obviously the
// ORDER BY clause may not have all the result columns.
// Hence, we need to allocate new registers and Copy from the existing ones to make a contiguous run of registers.
let mut needs_reordering = false;
// Check if result columns in sorter are in SELECT order
let mut prev = None;
for (select_idx, _rc) in result_columns.iter().enumerate() {
let sorter_idx = sort_metadata
.remappings
.get(select_idx)
.expect("remapping must exist for all result columns")
.orderby_sorter_idx;
if prev.is_some_and(|p| sorter_idx != p + 1) {
needs_reordering = true;
break;
}
prev = Some(sorter_idx);
}
if needs_reordering {
// Allocate registers for reordered result columns.
// TODO: it may be possible to optimize this to minimize the number of Insn::Copy we do, but for now
// we will just allocate a new reg for every result column.
let reordered_start_reg = program.alloc_registers(result_columns.len());
for (select_idx, _rc) in result_columns.iter().enumerate() {
let src_reg = sort_metadata
.remappings
.get(select_idx)
.map(|r| start_reg + r.orderby_sorter_idx)
.expect("remapping must exist for all result columns");
let dst_reg = reordered_start_reg + select_idx;
program.emit_insn(Insn::Copy {
src_reg,
dst_reg,
extra_amount: 0,
});
}
distinct_ctx.emit_deduplication_insns(
program,
result_columns.len(),
reordered_start_reg,
);
} else {
// Result columns are already in SELECT order, use them directly
let start_reg = sort_metadata
.remappings
.first()
.map(|r| start_reg + r.orderby_sorter_idx)
.expect("remapping must exist for all result columns");
distinct_ctx.emit_deduplication_insns(program, result_columns.len(), start_reg);
}
}
let SortMetadata {

View File

@@ -211,3 +211,10 @@ do_execsql_test orderby_desc_regression_verify_order {
} {99
98
97}
do_execsql_test_on_specific_db {:memory:} distinct_orderby_regression {
CREATE TABLE t (a,b,c,d);
INSERT INTO t VALUES (1,2,3,4),(2,3,4,5);
SELECT DISTINCT c,b FROM t ORDER BY d,b;
} {3|2
4|3}

View File

@@ -1720,6 +1720,72 @@ mod tests {
}
}
#[test]
pub fn fuzz_distinct() {
let db = TempDatabase::new_empty(true);
let limbo_conn = db.connect_limbo();
let sqlite_conn = rusqlite::Connection::open_in_memory().unwrap();
let (mut rng, seed) = rng_from_time_or_env();
tracing::info!("fuzz_distinct seed: {}", seed);
let columns = ["a", "b", "c", "d", "e"];
// Create table with 3 integer columns
let create_table = format!("CREATE TABLE t ({})", columns.join(", "));
limbo_exec_rows(&db, &limbo_conn, &create_table);
sqlite_exec_rows(&sqlite_conn, &create_table);
// Insert some random data
for _ in 0..1000 {
let values = (0..columns.len())
.map(|_| rng.random_range(1..3)) // intentionally narrow range
.collect::<Vec<_>>();
let query = format!(
"INSERT INTO t VALUES ({})",
values
.iter()
.map(|v| v.to_string())
.collect::<Vec<_>>()
.join(",")
);
limbo_exec_rows(&db, &limbo_conn, &query);
sqlite_exec_rows(&sqlite_conn, &query);
}
// Test different DISTINCT + ORDER BY combinations
for _ in 0..300 {
// Randomly select columns for DISTINCT
let num_distinct_cols = rng.random_range(1..=columns.len());
let mut available_cols = columns.to_vec();
let mut distinct_cols = Vec::with_capacity(num_distinct_cols);
for _ in 0..num_distinct_cols {
let idx = rng.random_range(0..available_cols.len());
distinct_cols.push(available_cols.remove(idx));
}
let distinct_cols = distinct_cols.join(", ");
// Randomly select columns for ORDER BY
let num_order_cols = rng.random_range(1..=columns.len());
let mut available_cols = columns.to_vec();
let mut order_cols = Vec::with_capacity(num_order_cols);
for _ in 0..num_order_cols {
let idx = rng.random_range(0..available_cols.len());
order_cols.push(available_cols.remove(idx));
}
let order_cols = order_cols.join(", ");
let query = format!("SELECT DISTINCT {distinct_cols} FROM t ORDER BY {order_cols}");
let limbo = limbo_exec_rows(&db, &limbo_conn, &query);
let sqlite = sqlite_exec_rows(&sqlite_conn, &query);
assert_eq!(limbo, sqlite, "seed: {seed}, query: {query}");
}
}
#[test]
#[ignore]
pub fn fuzz_long_create_table_drop_table_alter_table() {