fix substr handling with utf-8 and blobs

This commit is contained in:
pedrocarlo
2025-09-29 20:16:33 -03:00
parent 9788f6d005
commit ddfe56bbb9

View File

@@ -8410,31 +8410,96 @@ impl Value {
start_value: &Value,
length_value: Option<&Value>,
) -> Value {
if let (Some(str), Value::Integer(start)) = (value.cast_text(), start_value) {
let str_len = str.len() as i64;
/// Function is stabilized but not released for version 1.88 \
/// https://doc.rust-lang.org/src/core/str/mod.rs.html#453
const fn ceil_char_boundary(s: &str, index: usize) -> usize {
const fn is_utf8_char_boundary(c: u8) -> bool {
// This is bit magic equivalent to: b < 128 || b >= 192
(c as i8) >= -0x40
}
if index >= s.len() {
s.len()
} else {
let mut i = index;
while i < s.len() {
if is_utf8_char_boundary(s.as_bytes()[i]) {
break;
}
i += 1;
}
// The character boundary will be within four bytes of the index
debug_assert!(i <= index + 3);
i
}
}
fn calculate_postions(
start: i64,
bytes_len: usize,
length_value: Option<&Value>,
) -> (usize, usize) {
let bytes_len = bytes_len as i64;
// The left-most character of X is number 1.
// If Y is negative then the first character of the substring is found by counting from the right rather than the left.
let first_position = if *start < 0 {
str_len.saturating_sub((*start).abs())
let first_position = if start < 0 {
bytes_len.saturating_sub((start).abs())
} else {
*start - 1
start - 1
};
// If Z is negative then the abs(Z) characters preceding the Y-th character are returned.
let last_position = match length_value {
Some(Value::Integer(length)) => first_position + *length,
_ => str_len,
_ => bytes_len,
};
let (start, end) = if first_position <= last_position {
(first_position, last_position)
} else {
(last_position, first_position)
};
Value::build_text(
&str.as_str()[start.clamp(-0, str_len) as usize..end.clamp(0, str_len) as usize],
(
start.clamp(-0, bytes_len) as usize,
end.clamp(0, bytes_len) as usize,
)
} else {
Value::Null
}
let start_value = start_value.exec_cast("INT");
let length_value = length_value.map(|value| value.exec_cast("INT"));
match (value, start_value) {
(Value::Blob(b), Value::Integer(start)) => {
let (start, end) = calculate_postions(start, b.len(), length_value.as_ref());
Value::from_blob(b[start..end].to_vec())
}
(value, Value::Integer(start)) => {
if let Some(text) = value.cast_text() {
let (mut start, mut end) =
calculate_postions(start, text.len(), length_value.as_ref());
// https://github.com/sqlite/sqlite/blob/a248d84f/src/func.c#L417
let s = text.as_str();
let mut start_byte_idx = 0;
end = end - start;
while start > 0 {
start_byte_idx = ceil_char_boundary(s, start_byte_idx + 1);
start -= 1;
}
let mut end_byte_idx = start_byte_idx;
while end > 0 {
end_byte_idx = ceil_char_boundary(s, end_byte_idx + 1);
end -= 1;
}
Value::build_text(&s[start_byte_idx..end_byte_idx])
} else {
Value::Null
}
}
_ => Value::Null,
}
}