Merge 'Coll seq' from Glauber Costa

Implement the CollSeq vdbe opcode.

Reviewed-by: Preston Thorpe <preston@turso.tech>

Closes #2454
This commit is contained in:
Jussi Saurio
2025-08-06 11:28:27 +03:00
committed by GitHub
7 changed files with 146 additions and 3 deletions

View File

@@ -431,7 +431,7 @@ Modifiers:
| Checkpoint | Yes | |
| Clear | No | |
| Close | Yes | |
| CollSeq | No | |
| CollSeq | Yes | |
| Column | Yes | |
| Compare | Yes | |
| Concat | Yes | |

View File

@@ -2,6 +2,7 @@ use turso_sqlite3_parser::ast;
use crate::{
function::AggFunc,
translate::collate::CollationSeq,
vdbe::{
builder::ProgramBuilder,
insn::{IdxInsertFlags, Insn},
@@ -60,6 +61,37 @@ pub fn emit_ungrouped_aggregation<'a>(
Ok(())
}
fn emit_collseq_if_needed(
program: &mut ProgramBuilder,
referenced_tables: &TableReferences,
expr: &ast::Expr,
) {
// Check if this is a column expression with explicit COLLATE clause
if let ast::Expr::Collate(_, collation_name) = expr {
if let Ok(collation) = CollationSeq::new(collation_name) {
program.emit_insn(Insn::CollSeq {
reg: None,
collation,
});
}
return;
}
// If no explicit collation, check if this is a column with table-defined collation
if let ast::Expr::Column { table, column, .. } = expr {
if let Some(table_ref) = referenced_tables.find_table_by_internal_id(*table) {
if let Some(table_column) = table_ref.get_column_at(*column) {
if let Some(collation) = &table_column.collation {
program.emit_insn(Insn::CollSeq {
reg: None,
collation: *collation,
});
}
}
}
}
}
/// Emits the bytecode for handling duplicates in a distinct aggregate.
/// This is used in both GROUP BY and non-GROUP BY aggregations to jump over
/// the AggStep that would otherwise accumulate the same value multiple times.
@@ -196,6 +228,7 @@ pub fn translate_aggregation_step(
let expr_reg = program.alloc_register();
let _ = translate_expr(program, Some(referenced_tables), expr, expr_reg, resolver)?;
handle_distinct(program, agg, expr_reg);
emit_collseq_if_needed(program, referenced_tables, expr);
program.emit_insn(Insn::AggStep {
acc_reg: target_register,
col: expr_reg,
@@ -212,6 +245,7 @@ pub fn translate_aggregation_step(
let expr_reg = program.alloc_register();
let _ = translate_expr(program, Some(referenced_tables), expr, expr_reg, resolver)?;
handle_distinct(program, agg, expr_reg);
emit_collseq_if_needed(program, referenced_tables, expr);
program.emit_insn(Insn::AggStep {
acc_reg: target_register,
col: expr_reg,

View File

@@ -117,6 +117,25 @@ pub type InsnFunction = fn(
Option<&Arc<MvStore>>,
) -> Result<InsnFunctionStepResult>;
/// Compare two values using the specified collation for text values.
/// Non-text values are compared using their natural ordering.
fn compare_with_collation(
lhs: &Value,
rhs: &Value,
collation: Option<CollationSeq>,
) -> std::cmp::Ordering {
match (lhs, rhs) {
(Value::Text(lhs_text), Value::Text(rhs_text)) => {
if let Some(coll) = collation {
coll.compare_strings(lhs_text.as_str(), rhs_text.as_str())
} else {
lhs.cmp(rhs)
}
}
_ => lhs.cmp(rhs),
}
}
pub enum InsnFunctionStepResult {
Done,
IO,
@@ -3364,7 +3383,13 @@ pub fn op_agg_step(
};
let new_value = col.get_owned_value();
if *new_value != Value::Null && acc.as_ref().is_none_or(|acc| new_value > acc) {
if *new_value != Value::Null
&& acc.as_ref().is_none_or(|acc| {
use std::cmp::Ordering;
compare_with_collation(new_value, acc, state.current_collation)
== Ordering::Greater
})
{
*acc = Some(new_value.clone());
}
}
@@ -3382,7 +3407,13 @@ pub fn op_agg_step(
let new_value = col.get_owned_value();
if *new_value != Value::Null && acc.as_ref().is_none_or(|acc| new_value < acc) {
if *new_value != Value::Null
&& acc.as_ref().is_none_or(|acc| {
use std::cmp::Ordering;
compare_with_collation(new_value, acc, state.current_collation)
== Ordering::Less
})
{
*acc = Some(new_value.clone());
}
}
@@ -6083,6 +6114,29 @@ pub fn op_is_null(
Ok(InsnFunctionStepResult::Step)
}
pub fn op_coll_seq(
_program: &Program,
state: &mut ProgramState,
insn: &Insn,
_pager: &Rc<Pager>,
_mv_store: Option<&Arc<MvStore>>,
) -> Result<InsnFunctionStepResult> {
let Insn::CollSeq { reg, collation } = insn else {
unreachable!("unexpected Insn {:?}", insn)
};
// Set the current collation sequence for use by subsequent functions
state.current_collation = Some(*collation);
// If P1 is not zero, initialize that register to 0
if let Some(reg_idx) = reg {
state.registers[*reg_idx] = Register::Value(Value::Integer(0));
}
state.pc += 1;
Ok(InsnFunctionStepResult::Step)
}
pub fn op_page_count(
program: &Program,
state: &mut ProgramState,

View File

@@ -1636,6 +1636,15 @@ pub fn insn_to_str(
0,
format!("rename_table({from}, {to})"),
),
Insn::CollSeq { reg, collation } => (
"CollSeq",
reg.unwrap_or(0) as i32,
0,
0,
Value::build_text(collation.to_string().as_str()),
0,
format!("collation={collation}"),
),
};
format!(
"{:<4} {:<17} {:<4} {:<4} {:<4} {:<13} {:<2} {}",

View File

@@ -864,6 +864,22 @@ pub enum Insn {
/// Jump to this PC if the register is null (P2).
target_pc: BranchOffset,
},
/// Set the collation sequence for the next function call.
/// P4 is a pointer to a CollationSeq. If the next call to a user function
/// or aggregate calls sqlite3GetFuncCollSeq(), this collation sequence will
/// be returned. This is used by the built-in min(), max() and nullif()
/// functions.
///
/// If P1 is not zero, then it is a register that a subsequent min() or
/// max() aggregate will set to 1 if the current row is not the minimum or
/// maximum. The P1 register is initialized to 0 by this instruction.
CollSeq {
/// Optional register to initialize to 0 (P1).
reg: Option<usize>,
/// The collation sequence to set (P4).
collation: CollationSeq,
},
ParseSchema {
db: usize,
where_clause: Option<String>,
@@ -1121,6 +1137,7 @@ impl Insn {
Insn::DropTable { .. } => execute::op_drop_table,
Insn::Close { .. } => execute::op_close,
Insn::IsNull { .. } => execute::op_is_null,
Insn::CollSeq { .. } => execute::op_coll_seq,
Insn::ParseSchema { .. } => execute::op_parse_schema,
Insn::ShiftRight { .. } => execute::op_shift_right,
Insn::ShiftLeft { .. } => execute::op_shift_left,

View File

@@ -29,6 +29,7 @@ use crate::{
function::{AggFunc, FuncCtx},
state_machine::StateTransition,
storage::sqlite3_ondisk::SmallVec,
translate::collate::CollationSeq,
translate::plan::TableReferences,
types::{IOResult, RawSlice, TextRef},
vdbe::execute::{
@@ -260,6 +261,8 @@ pub struct ProgramState {
op_insert_state: OpInsertState,
op_no_conflict_state: OpNoConflictState,
seek_state: OpSeekState,
/// Current collation sequence set by OP_CollSeq instruction
current_collation: Option<CollationSeq>,
}
impl ProgramState {
@@ -291,6 +294,7 @@ impl ProgramState {
op_insert_state: OpInsertState::Insert,
op_no_conflict_state: OpNoConflictState::Start,
seek_state: OpSeekState::Start,
current_collation: None,
}
}
@@ -330,6 +334,7 @@ impl ProgramState {
self.regex_cache.like.clear();
self.interrupted = false;
self.parameters.clear();
self.current_collation = None;
#[cfg(feature = "json")]
self.json_cache.clear()
}

View File

@@ -50,3 +50,27 @@ do_execsql_test_in_memory_any_error collate_unique_constraint {
CREATE TABLE t (a TEXT COLLATE NOCASE PRIMARY KEY);
INSERT INTO t VALUES ('lol'), ('LOL'), ('lOl');
}
do_execsql_test_on_specific_db {:memory:} collate_aggregation_default_binary {
create table fruits(name collate binary);
insert into fruits(name) values ('Apple') ,('banana') ,('CHERRY');
select max(name) from fruits;
} {banana}
do_execsql_test_on_specific_db {:memory:} collate_aggregation_default_nocase {
create table fruits(name collate nocase);
insert into fruits(name) values ('Apple') ,('banana') ,('CHERRY');
select max(name) from fruits;
} {CHERRY}
do_execsql_test_on_specific_db {:memory:} collate_aggregation_explicit_binary {
create table fruits(name collate nocase);
insert into fruits(name) values ('Apple') ,('banana') ,('CHERRY');
select max(name collate binary) from fruits;
} {banana}
do_execsql_test_on_specific_db {:memory:} collate_aggregation_explicit_nocase {
create table fruits(name collate binary);
insert into fruits(name) values ('Apple') ,('banana') ,('CHERRY');
select max(name collate nocase) from fruits;
} {CHERRY}