Files
turso/core/io/mod.rs
Pekka Enberg 87d3f74e6e Merge 'Evict page from cache if page is unlocked and unloaded' from Pedro Muniz
Because we can abort a read_page completion, this means a page can be in
the cache but be unloaded and unlocked. However, if we do not evict that
page from the page cache, we will return an unloaded page later which
will trigger assertions later on. This is worsened by the fact that page
cache is not per `Statement`, so you can abort a completion in one
Statement, and trigger some error in the next one if we don't evict the
page in these circumstances.
Also, to propagate IO errors we need to return the Error from
IOCompletions on step.

Closes #2785
2025-09-02 09:08:12 +03:00

529 lines
15 KiB
Rust

use crate::storage::buffer_pool::ArenaBuffer;
use crate::storage::sqlite3_ondisk::WAL_FRAME_HEADER_SIZE;
use crate::{BufferPool, CompletionError, Result};
use bitflags::bitflags;
use cfg_block::cfg_block;
use std::cell::RefCell;
use std::fmt;
use std::ptr::NonNull;
use std::sync::{Arc, OnceLock};
use std::{fmt::Debug, pin::Pin};
pub trait File: Send + Sync {
fn lock_file(&self, exclusive: bool) -> Result<()>;
fn unlock_file(&self) -> Result<()>;
fn pread(&self, pos: u64, c: Completion) -> Result<Completion>;
fn pwrite(&self, pos: u64, buffer: Arc<Buffer>, c: Completion) -> Result<Completion>;
fn sync(&self, c: Completion) -> Result<Completion>;
fn pwritev(&self, pos: u64, buffers: Vec<Arc<Buffer>>, c: Completion) -> Result<Completion> {
use std::sync::atomic::{AtomicUsize, Ordering};
if buffers.is_empty() {
c.complete(0);
return Ok(c);
}
if buffers.len() == 1 {
return self.pwrite(pos, buffers[0].clone(), c);
}
// naive default implementation can be overridden on backends where it makes sense to
let mut pos = pos;
let outstanding = Arc::new(AtomicUsize::new(buffers.len()));
let total_written = Arc::new(AtomicUsize::new(0));
for buf in buffers {
let len = buf.len();
let child_c = {
let c_main = c.clone();
let outstanding = outstanding.clone();
let total_written = total_written.clone();
let _cloned = buf.clone();
Completion::new_write(move |n| {
if let Ok(n) = n {
// reference buffer in callback to ensure alive for async io
let _buf = _cloned.clone();
// accumulate bytes actually reported by the backend
total_written.fetch_add(n as usize, Ordering::Relaxed);
if outstanding.fetch_sub(1, Ordering::AcqRel) == 1 {
// last one finished
c_main.complete(total_written.load(Ordering::Acquire) as i32);
}
}
})
};
if let Err(e) = self.pwrite(pos, buf.clone(), child_c) {
// best-effort: mark as abort so caller won't wait forever
// TODO: when we have `pwrite` and other I/O methods return CompletionError
// instead of LimboError, store the error inside
c.abort();
return Err(e);
}
pos += len as u64;
}
Ok(c)
}
fn size(&self) -> Result<u64>;
fn truncate(&self, len: u64, c: Completion) -> Result<Completion>;
}
#[derive(Debug, Copy, Clone, PartialEq)]
pub struct OpenFlags(i32);
bitflags! {
impl OpenFlags: i32 {
const None = 0b00000000;
const Create = 0b0000001;
const ReadOnly = 0b0000010;
}
}
impl Default for OpenFlags {
fn default() -> Self {
Self::Create
}
}
pub trait IO: Clock + Send + Sync {
fn open_file(&self, path: &str, flags: OpenFlags, direct: bool) -> Result<Arc<dyn File>>;
// remove_file is used in the sync-engine
fn remove_file(&self, path: &str) -> Result<()>;
fn run_once(&self) -> Result<()> {
Ok(())
}
fn wait_for_completion(&self, c: Completion) -> Result<()> {
while !c.finished() {
self.run_once()?
}
if let Some(Some(err)) = c.inner.result.get().copied() {
return Err(err.into());
}
Ok(())
}
fn generate_random_number(&self) -> i64 {
let mut buf = [0u8; 8];
getrandom::getrandom(&mut buf).unwrap();
i64::from_ne_bytes(buf)
}
fn get_memory_io(&self) -> Arc<MemoryIO> {
Arc::new(MemoryIO::new())
}
fn register_fixed_buffer(&self, _ptr: NonNull<u8>, _len: usize) -> Result<u32> {
Err(crate::LimboError::InternalError(
"unsupported operation".to_string(),
))
}
}
pub type ReadComplete = dyn Fn(Result<(Arc<Buffer>, i32), CompletionError>);
pub type WriteComplete = dyn Fn(Result<i32, CompletionError>);
pub type SyncComplete = dyn Fn(Result<i32, CompletionError>);
pub type TruncateComplete = dyn Fn(Result<i32, CompletionError>);
#[must_use]
#[derive(Debug, Clone)]
pub struct Completion {
inner: Arc<CompletionInner>,
}
#[derive(Debug)]
struct CompletionInner {
completion_type: CompletionType,
/// None means we completed successfully
// Thread safe with OnceLock
result: std::sync::OnceLock<Option<CompletionError>>,
}
impl Debug for CompletionType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Read(..) => f.debug_tuple("Read").finish(),
Self::Write(..) => f.debug_tuple("Write").finish(),
Self::Sync(..) => f.debug_tuple("Sync").finish(),
Self::Truncate(..) => f.debug_tuple("Truncate").finish(),
}
}
}
pub enum CompletionType {
Read(ReadCompletion),
Write(WriteCompletion),
Sync(SyncCompletion),
Truncate(TruncateCompletion),
}
impl Completion {
pub fn new(completion_type: CompletionType) -> Self {
Self {
inner: Arc::new(CompletionInner {
completion_type,
result: OnceLock::new(),
}),
}
}
pub fn new_write<F>(complete: F) -> Self
where
F: Fn(Result<i32, CompletionError>) + 'static,
{
Self::new(CompletionType::Write(WriteCompletion::new(Box::new(
complete,
))))
}
pub fn new_read<F>(buf: Arc<Buffer>, complete: F) -> Self
where
F: Fn(Result<(Arc<Buffer>, i32), CompletionError>) + 'static,
{
Self::new(CompletionType::Read(ReadCompletion::new(
buf,
Box::new(complete),
)))
}
pub fn new_sync<F>(complete: F) -> Self
where
F: Fn(Result<i32, CompletionError>) + 'static,
{
Self::new(CompletionType::Sync(SyncCompletion::new(Box::new(
complete,
))))
}
pub fn new_trunc<F>(complete: F) -> Self
where
F: Fn(Result<i32, CompletionError>) + 'static,
{
Self::new(CompletionType::Truncate(TruncateCompletion::new(Box::new(
complete,
))))
}
/// Create a dummy completed completion
pub fn new_dummy() -> Self {
let c = Self::new_write(|_| {});
c.complete(0);
c
}
pub fn is_completed(&self) -> bool {
self.inner.result.get().is_some_and(|val| val.is_none())
}
pub fn has_error(&self) -> bool {
self.inner.result.get().is_some_and(|val| val.is_some())
}
pub fn get_error(&self) -> Option<CompletionError> {
self.inner.result.get().and_then(|res| *res)
}
/// Checks if the Completion completed or errored
pub fn finished(&self) -> bool {
self.inner.result.get().is_some()
}
pub fn complete(&self, result: i32) {
if self.inner.result.set(None).is_ok() {
let result = Ok(result);
match &self.inner.completion_type {
CompletionType::Read(r) => r.callback(result),
CompletionType::Write(w) => w.callback(result),
CompletionType::Sync(s) => s.callback(result), // fix
CompletionType::Truncate(t) => t.callback(result),
};
}
}
pub fn error(&self, err: CompletionError) {
if self.inner.result.set(Some(err)).is_ok() {
let result = Err(err);
match &self.inner.completion_type {
CompletionType::Read(r) => r.callback(result),
CompletionType::Write(w) => w.callback(result),
CompletionType::Sync(s) => s.callback(result), // fix
CompletionType::Truncate(t) => t.callback(result),
};
}
}
pub fn abort(&self) {
self.error(CompletionError::Aborted);
}
/// only call this method if you are sure that the completion is
/// a ReadCompletion, panics otherwise
pub fn as_read(&self) -> &ReadCompletion {
match self.inner.completion_type {
CompletionType::Read(ref r) => r,
_ => unreachable!(),
}
}
/// only call this method if you are sure that the completion is
/// a WriteCompletion, panics otherwise
pub fn as_write(&self) -> &WriteCompletion {
match self.inner.completion_type {
CompletionType::Write(ref w) => w,
_ => unreachable!(),
}
}
}
pub struct ReadCompletion {
pub buf: Arc<Buffer>,
pub complete: Box<ReadComplete>,
}
impl ReadCompletion {
pub fn new(buf: Arc<Buffer>, complete: Box<ReadComplete>) -> Self {
Self { buf, complete }
}
pub fn buf(&self) -> &Buffer {
&self.buf
}
pub fn callback(&self, bytes_read: Result<i32, CompletionError>) {
(self.complete)(bytes_read.map(|b| (self.buf.clone(), b)));
}
pub fn buf_arc(&self) -> Arc<Buffer> {
self.buf.clone()
}
}
pub struct WriteCompletion {
pub complete: Box<WriteComplete>,
}
impl WriteCompletion {
pub fn new(complete: Box<WriteComplete>) -> Self {
Self { complete }
}
pub fn callback(&self, bytes_written: Result<i32, CompletionError>) {
(self.complete)(bytes_written);
}
}
pub struct SyncCompletion {
pub complete: Box<SyncComplete>,
}
impl SyncCompletion {
pub fn new(complete: Box<SyncComplete>) -> Self {
Self { complete }
}
pub fn callback(&self, res: Result<i32, CompletionError>) {
(self.complete)(res);
}
}
pub struct TruncateCompletion {
pub complete: Box<TruncateComplete>,
}
impl TruncateCompletion {
pub fn new(complete: Box<TruncateComplete>) -> Self {
Self { complete }
}
pub fn callback(&self, res: Result<i32, CompletionError>) {
(self.complete)(res);
}
}
pub type BufferData = Pin<Box<[u8]>>;
pub enum Buffer {
Heap(BufferData),
Pooled(ArenaBuffer),
}
impl Debug for Buffer {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Pooled(p) => write!(f, "Pooled(len={})", p.logical_len()),
Self::Heap(buf) => write!(f, "{buf:?}: {}", buf.len()),
}
}
}
impl Drop for Buffer {
fn drop(&mut self) {
let len = self.len();
if let Self::Heap(buf) = self {
TEMP_BUFFER_CACHE.with(|cache| {
let mut cache = cache.borrow_mut();
// take ownership of the buffer by swapping it with a dummy
let buffer = std::mem::replace(buf, Pin::new(vec![].into_boxed_slice()));
cache.return_buffer(buffer, len);
});
}
}
}
impl Buffer {
pub fn new(data: Vec<u8>) -> Self {
tracing::trace!("buffer::new({:?})", data);
Self::Heap(Pin::new(data.into_boxed_slice()))
}
/// Returns the index of the underlying `Arena` if it was registered with
/// io_uring. Only for use with `UringIO` backend.
pub fn fixed_id(&self) -> Option<u32> {
match self {
Self::Heap { .. } => None,
Self::Pooled(buf) => buf.fixed_id(),
}
}
pub fn new_pooled(buf: ArenaBuffer) -> Self {
Self::Pooled(buf)
}
pub fn new_temporary(size: usize) -> Self {
TEMP_BUFFER_CACHE.with(|cache| {
if let Some(buffer) = cache.borrow_mut().get_buffer(size) {
Self::Heap(buffer)
} else {
Self::Heap(Pin::new(vec![0; size].into_boxed_slice()))
}
})
}
pub fn len(&self) -> usize {
match self {
Self::Heap(buf) => buf.len(),
Self::Pooled(buf) => buf.logical_len(),
}
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn as_slice(&self) -> &[u8] {
match self {
Self::Heap(buf) => {
// SAFETY: The buffer is guaranteed to be valid for the lifetime of the slice
unsafe { std::slice::from_raw_parts(buf.as_ptr(), buf.len()) }
}
Self::Pooled(buf) => buf,
}
}
#[allow(clippy::mut_from_ref)]
pub fn as_mut_slice(&self) -> &mut [u8] {
unsafe { std::slice::from_raw_parts_mut(self.as_mut_ptr(), self.len()) }
}
#[inline]
pub fn as_ptr(&self) -> *const u8 {
match self {
Self::Heap(buf) => buf.as_ptr(),
Self::Pooled(buf) => buf.as_ptr(),
}
}
#[inline]
pub fn as_mut_ptr(&self) -> *mut u8 {
match self {
Self::Heap(buf) => buf.as_ptr() as *mut u8,
Self::Pooled(buf) => buf.as_ptr() as *mut u8,
}
}
}
thread_local! {
/// thread local cache to re-use temporary buffers to prevent churn when pool overflows
pub static TEMP_BUFFER_CACHE: RefCell<TempBufferCache> = RefCell::new(TempBufferCache::new());
}
/// A cache for temporary or any additional `Buffer` allocations beyond
/// what the `BufferPool` has room for, or for use before the pool is
/// fully initialized.
pub(crate) struct TempBufferCache {
/// The `[Database::page_size]` at the time the cache is initiated.
page_size: usize,
/// Cache of buffers of size `self.page_size`.
page_buffers: Vec<BufferData>,
/// Cache of buffers of size `self.page_size` + WAL_FRAME_HEADER_SIZE.
wal_frame_buffers: Vec<BufferData>,
/// Maximum number of buffers that will live in each cache.
max_cached: usize,
}
impl TempBufferCache {
const DEFAULT_MAX_CACHE_SIZE: usize = 256;
fn new() -> Self {
Self {
page_size: BufferPool::DEFAULT_PAGE_SIZE,
page_buffers: Vec::with_capacity(8),
wal_frame_buffers: Vec::with_capacity(8),
max_cached: Self::DEFAULT_MAX_CACHE_SIZE,
}
}
/// If the `[Database::page_size]` is set, any temporary buffers that might
/// exist prior need to be cleared and new `page_size` needs to be saved.
pub fn reinit_cache(&mut self, page_size: usize) {
self.page_buffers.clear();
self.wal_frame_buffers.clear();
self.page_size = page_size;
}
fn get_buffer(&mut self, size: usize) -> Option<BufferData> {
match size {
sz if sz == self.page_size => self.page_buffers.pop(),
sz if sz == (self.page_size + WAL_FRAME_HEADER_SIZE) => self.wal_frame_buffers.pop(),
_ => None,
}
}
fn return_buffer(&mut self, buff: BufferData, len: usize) {
let sz = self.page_size;
let cache = match len {
n if n.eq(&sz) => &mut self.page_buffers,
n if n.eq(&(sz + WAL_FRAME_HEADER_SIZE)) => &mut self.wal_frame_buffers,
_ => return,
};
if self.max_cached > cache.len() {
cache.push(buff);
}
}
}
cfg_block! {
#[cfg(all(target_os = "linux", feature = "io_uring"))] {
mod io_uring;
#[cfg(feature = "fs")]
pub use io_uring::UringIO;
}
#[cfg(target_family = "unix")] {
mod unix;
#[cfg(feature = "fs")]
pub use unix::UnixIO;
pub use unix::UnixIO as PlatformIO;
pub use PlatformIO as SyscallIO;
}
#[cfg(not(any(target_family = "unix", target_os = "android", target_os = "ios")))] {
mod generic;
pub use generic::GenericIO as PlatformIO;
pub use PlatformIO as SyscallIO;
}
}
mod memory;
#[cfg(feature = "fs")]
mod vfs;
pub use memory::MemoryIO;
pub mod clock;
mod common;
pub use clock::Clock;