From 6a66053ca8d96bdb6a52f30a5762f7629ef62856 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 5 Aug 2025 13:38:58 -0500 Subject: [PATCH 1/2] make sure value comparisons for min and max are collation aware They currently aren't, which isn't right. --- core/vdbe/execute.rs | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 497218cb8..a29af4b5e 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -117,6 +117,25 @@ pub type InsnFunction = fn( Option<&Arc>, ) -> Result; +/// Compare two values using the specified collation for text values. +/// Non-text values are compared using their natural ordering. +fn compare_with_collation( + lhs: &Value, + rhs: &Value, + collation: Option, +) -> std::cmp::Ordering { + match (lhs, rhs) { + (Value::Text(lhs_text), Value::Text(rhs_text)) => { + if let Some(coll) = collation { + coll.compare_strings(lhs_text.as_str(), rhs_text.as_str()) + } else { + lhs.cmp(rhs) + } + } + _ => lhs.cmp(rhs), + } +} + pub enum InsnFunctionStepResult { Done, IO, @@ -3364,7 +3383,13 @@ pub fn op_agg_step( }; let new_value = col.get_owned_value(); - if *new_value != Value::Null && acc.as_ref().is_none_or(|acc| new_value > acc) { + if *new_value != Value::Null + && acc.as_ref().is_none_or(|acc| { + use std::cmp::Ordering; + compare_with_collation(new_value, acc, state.current_collation) + == Ordering::Greater + }) + { *acc = Some(new_value.clone()); } } @@ -3382,7 +3407,13 @@ pub fn op_agg_step( let new_value = col.get_owned_value(); - if *new_value != Value::Null && acc.as_ref().is_none_or(|acc| new_value < acc) { + if *new_value != Value::Null + && acc.as_ref().is_none_or(|acc| { + use std::cmp::Ordering; + compare_with_collation(new_value, acc, state.current_collation) + == Ordering::Less + }) + { *acc = Some(new_value.clone()); } } From d1be7ad0bb1f10d2086698a100d44acc6989c183 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sat, 2 Aug 2025 17:41:53 -0500 Subject: [PATCH 2/2] implement the collseq bytecode instruction SQLite generates those in aggregations like min / max with collation information either in the table definition or in the column expression. We currently generate the wrong result here, and properly generating the bytecode instruction fixes it. --- COMPAT.md | 2 +- core/translate/aggregation.rs | 34 ++++++++++++++++++++++++++++++++++ core/vdbe/execute.rs | 23 +++++++++++++++++++++++ core/vdbe/explain.rs | 9 +++++++++ core/vdbe/insn.rs | 17 +++++++++++++++++ core/vdbe/mod.rs | 5 +++++ testing/collate.test | 24 ++++++++++++++++++++++++ 7 files changed, 113 insertions(+), 1 deletion(-) diff --git a/COMPAT.md b/COMPAT.md index a988ca829..e72304a30 100644 --- a/COMPAT.md +++ b/COMPAT.md @@ -431,7 +431,7 @@ Modifiers: | Checkpoint | Yes | | | Clear | No | | | Close | Yes | | -| CollSeq | No | | +| CollSeq | Yes | | | Column | Yes | | | Compare | Yes | | | Concat | Yes | | diff --git a/core/translate/aggregation.rs b/core/translate/aggregation.rs index 14870fbdf..bc2eb9260 100644 --- a/core/translate/aggregation.rs +++ b/core/translate/aggregation.rs @@ -2,6 +2,7 @@ use turso_sqlite3_parser::ast; use crate::{ function::AggFunc, + translate::collate::CollationSeq, vdbe::{ builder::ProgramBuilder, insn::{IdxInsertFlags, Insn}, @@ -60,6 +61,37 @@ pub fn emit_ungrouped_aggregation<'a>( Ok(()) } +fn emit_collseq_if_needed( + program: &mut ProgramBuilder, + referenced_tables: &TableReferences, + expr: &ast::Expr, +) { + // Check if this is a column expression with explicit COLLATE clause + if let ast::Expr::Collate(_, collation_name) = expr { + if let Ok(collation) = CollationSeq::new(collation_name) { + program.emit_insn(Insn::CollSeq { + reg: None, + collation, + }); + } + return; + } + + // If no explicit collation, check if this is a column with table-defined collation + if let ast::Expr::Column { table, column, .. } = expr { + if let Some(table_ref) = referenced_tables.find_table_by_internal_id(*table) { + if let Some(table_column) = table_ref.get_column_at(*column) { + if let Some(collation) = &table_column.collation { + program.emit_insn(Insn::CollSeq { + reg: None, + collation: *collation, + }); + } + } + } + } +} + /// Emits the bytecode for handling duplicates in a distinct aggregate. /// This is used in both GROUP BY and non-GROUP BY aggregations to jump over /// the AggStep that would otherwise accumulate the same value multiple times. @@ -196,6 +228,7 @@ pub fn translate_aggregation_step( let expr_reg = program.alloc_register(); let _ = translate_expr(program, Some(referenced_tables), expr, expr_reg, resolver)?; handle_distinct(program, agg, expr_reg); + emit_collseq_if_needed(program, referenced_tables, expr); program.emit_insn(Insn::AggStep { acc_reg: target_register, col: expr_reg, @@ -212,6 +245,7 @@ pub fn translate_aggregation_step( let expr_reg = program.alloc_register(); let _ = translate_expr(program, Some(referenced_tables), expr, expr_reg, resolver)?; handle_distinct(program, agg, expr_reg); + emit_collseq_if_needed(program, referenced_tables, expr); program.emit_insn(Insn::AggStep { acc_reg: target_register, col: expr_reg, diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index a29af4b5e..44fb05a12 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -6114,6 +6114,29 @@ pub fn op_is_null( Ok(InsnFunctionStepResult::Step) } +pub fn op_coll_seq( + _program: &Program, + state: &mut ProgramState, + insn: &Insn, + _pager: &Rc, + _mv_store: Option<&Arc>, +) -> Result { + let Insn::CollSeq { reg, collation } = insn else { + unreachable!("unexpected Insn {:?}", insn) + }; + + // Set the current collation sequence for use by subsequent functions + state.current_collation = Some(*collation); + + // If P1 is not zero, initialize that register to 0 + if let Some(reg_idx) = reg { + state.registers[*reg_idx] = Register::Value(Value::Integer(0)); + } + + state.pc += 1; + Ok(InsnFunctionStepResult::Step) +} + pub fn op_page_count( program: &Program, state: &mut ProgramState, diff --git a/core/vdbe/explain.rs b/core/vdbe/explain.rs index b891632ae..11353bf1e 100644 --- a/core/vdbe/explain.rs +++ b/core/vdbe/explain.rs @@ -1636,6 +1636,15 @@ pub fn insn_to_str( 0, format!("rename_table({from}, {to})"), ), + Insn::CollSeq { reg, collation } => ( + "CollSeq", + reg.unwrap_or(0) as i32, + 0, + 0, + Value::build_text(collation.to_string().as_str()), + 0, + format!("collation={collation}"), + ), }; format!( "{:<4} {:<17} {:<4} {:<4} {:<4} {:<13} {:<2} {}", diff --git a/core/vdbe/insn.rs b/core/vdbe/insn.rs index d3a326943..7d81bc223 100644 --- a/core/vdbe/insn.rs +++ b/core/vdbe/insn.rs @@ -864,6 +864,22 @@ pub enum Insn { /// Jump to this PC if the register is null (P2). target_pc: BranchOffset, }, + + /// Set the collation sequence for the next function call. + /// P4 is a pointer to a CollationSeq. If the next call to a user function + /// or aggregate calls sqlite3GetFuncCollSeq(), this collation sequence will + /// be returned. This is used by the built-in min(), max() and nullif() + /// functions. + /// + /// If P1 is not zero, then it is a register that a subsequent min() or + /// max() aggregate will set to 1 if the current row is not the minimum or + /// maximum. The P1 register is initialized to 0 by this instruction. + CollSeq { + /// Optional register to initialize to 0 (P1). + reg: Option, + /// The collation sequence to set (P4). + collation: CollationSeq, + }, ParseSchema { db: usize, where_clause: Option, @@ -1121,6 +1137,7 @@ impl Insn { Insn::DropTable { .. } => execute::op_drop_table, Insn::Close { .. } => execute::op_close, Insn::IsNull { .. } => execute::op_is_null, + Insn::CollSeq { .. } => execute::op_coll_seq, Insn::ParseSchema { .. } => execute::op_parse_schema, Insn::ShiftRight { .. } => execute::op_shift_right, Insn::ShiftLeft { .. } => execute::op_shift_left, diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index efc838839..99d115095 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -29,6 +29,7 @@ use crate::{ function::{AggFunc, FuncCtx}, state_machine::StateTransition, storage::sqlite3_ondisk::SmallVec, + translate::collate::CollationSeq, translate::plan::TableReferences, types::{IOResult, RawSlice, TextRef}, vdbe::execute::{ @@ -260,6 +261,8 @@ pub struct ProgramState { op_insert_state: OpInsertState, op_no_conflict_state: OpNoConflictState, seek_state: OpSeekState, + /// Current collation sequence set by OP_CollSeq instruction + current_collation: Option, } impl ProgramState { @@ -291,6 +294,7 @@ impl ProgramState { op_insert_state: OpInsertState::Insert, op_no_conflict_state: OpNoConflictState::Start, seek_state: OpSeekState::Start, + current_collation: None, } } @@ -330,6 +334,7 @@ impl ProgramState { self.regex_cache.like.clear(); self.interrupted = false; self.parameters.clear(); + self.current_collation = None; #[cfg(feature = "json")] self.json_cache.clear() } diff --git a/testing/collate.test b/testing/collate.test index 653b42397..2b9aa3ff3 100755 --- a/testing/collate.test +++ b/testing/collate.test @@ -50,3 +50,27 @@ do_execsql_test_in_memory_any_error collate_unique_constraint { CREATE TABLE t (a TEXT COLLATE NOCASE PRIMARY KEY); INSERT INTO t VALUES ('lol'), ('LOL'), ('lOl'); } + +do_execsql_test_on_specific_db {:memory:} collate_aggregation_default_binary { + create table fruits(name collate binary); + insert into fruits(name) values ('Apple') ,('banana') ,('CHERRY'); + select max(name) from fruits; +} {banana} + +do_execsql_test_on_specific_db {:memory:} collate_aggregation_default_nocase { + create table fruits(name collate nocase); + insert into fruits(name) values ('Apple') ,('banana') ,('CHERRY'); + select max(name) from fruits; +} {CHERRY} + +do_execsql_test_on_specific_db {:memory:} collate_aggregation_explicit_binary { + create table fruits(name collate nocase); + insert into fruits(name) values ('Apple') ,('banana') ,('CHERRY'); + select max(name collate binary) from fruits; +} {banana} + +do_execsql_test_on_specific_db {:memory:} collate_aggregation_explicit_nocase { + create table fruits(name collate binary); + insert into fruits(name) values ('Apple') ,('banana') ,('CHERRY'); + select max(name collate nocase) from fruits; +} {CHERRY}