use crate::storage::buffer_pool::ArenaBuffer; use crate::storage::sqlite3_ondisk::WAL_FRAME_HEADER_SIZE; use crate::{BufferPool, Result}; use bitflags::bitflags; use cfg_block::cfg_block; use std::fmt; use std::ptr::NonNull; use std::sync::Arc; use std::{cell::Cell, fmt::Debug, pin::Pin}; pub trait File: Send + Sync { fn lock_file(&self, exclusive: bool) -> Result<()>; fn unlock_file(&self) -> Result<()>; fn pread(&self, pos: usize, c: Completion) -> Result; fn pwrite(&self, pos: usize, buffer: Arc, c: Completion) -> Result; fn sync(&self, c: Completion) -> Result; fn pwritev(&self, pos: usize, buffers: Vec>, c: Completion) -> Result { use std::sync::atomic::{AtomicUsize, Ordering}; if buffers.is_empty() { c.complete(0); return Ok(c); } if buffers.len() == 1 { return self.pwrite(pos, buffers[0].clone(), c); } // naive default implementation can be overridden on backends where it makes sense to let mut pos = pos; let outstanding = Arc::new(AtomicUsize::new(buffers.len())); let total_written = Arc::new(AtomicUsize::new(0)); for buf in buffers { let len = buf.len(); let child_c = { let c_main = c.clone(); let outstanding = outstanding.clone(); let total_written = total_written.clone(); let _cloned = buf.clone(); Completion::new_write(move |n| { // reference buffer in callback to ensure alive for async io let _buf = _cloned.clone(); // accumulate bytes actually reported by the backend total_written.fetch_add(n as usize, Ordering::Relaxed); if outstanding.fetch_sub(1, Ordering::AcqRel) == 1 { // last one finished c_main.complete(total_written.load(Ordering::Acquire) as i32); } }) }; if let Err(e) = self.pwrite(pos, buf.clone(), child_c) { // best-effort: mark as done so caller won't wait forever c.complete(-1); return Err(e); } pos += len; } Ok(c) } fn size(&self) -> Result; fn truncate(&self, len: usize, c: Completion) -> Result; fn copy_to(&self, io: &dyn IO, path: &str) -> Result<()> { // Open or create the destination file let dest_file = io.open_file(path, OpenFlags::Create, false)?; // Get the size of the source file let file_size = self.size()? as usize; if file_size == 0 { return Ok(()); } // use 1MB chunk size const BUFFER_SIZE: usize = 1024 * 1024; let mut pos = 0; while pos < file_size { let chunk_size = (file_size - pos).min(BUFFER_SIZE); // Read from source let read_buffer = Arc::new(Buffer::allocate(chunk_size, Rc::new(|_| {}))); let read_completion = self.pread( pos, Completion::new_read(read_buffer.clone(), move |_, _| {}), )?; // Wait for read to complete io.wait_for_completion(read_completion)?; // Write to destination let write_completion = dest_file.pwrite(pos, read_buffer, Completion::new_write(|_| {}))?; io.wait_for_completion(write_completion)?; pos += chunk_size; } let sync_completion = dest_file.sync(Completion::new_sync(|_| {}))?; io.wait_for_completion(sync_completion)?; Ok(()) } } #[derive(Debug, Copy, Clone, PartialEq)] pub struct OpenFlags(i32); bitflags! { impl OpenFlags: i32 { const None = 0b00000000; const Create = 0b0000001; const ReadOnly = 0b0000010; } } impl Default for OpenFlags { fn default() -> Self { Self::Create } } pub trait IO: Clock + Send + Sync { fn open_file(&self, path: &str, flags: OpenFlags, direct: bool) -> Result>; fn run_once(&self) -> Result<()>; fn wait_for_completion(&self, c: Completion) -> Result<()>; fn generate_random_number(&self) -> i64; fn get_memory_io(&self) -> Arc; fn register_fixed_buffer(&self, _ptr: NonNull, _len: usize) -> Result { Err(crate::LimboError::InternalError( "unsupported operation".to_string(), )) } } pub type Complete = dyn Fn(Arc, i32); pub type WriteComplete = dyn Fn(i32); pub type SyncComplete = dyn Fn(i32); pub type TruncateComplete = dyn Fn(i32); #[must_use] #[derive(Clone)] pub struct Completion { inner: Arc, } struct CompletionInner { pub completion_type: CompletionType, is_completed: Cell, } pub enum CompletionType { Read(ReadCompletion), Write(WriteCompletion), Sync(SyncCompletion), Truncate(TruncateCompletion), } pub struct ReadCompletion { pub buf: Arc, pub complete: Box, } impl Completion { pub fn new(completion_type: CompletionType) -> Self { Self { inner: Arc::new(CompletionInner { completion_type, is_completed: Cell::new(false), }), } } pub fn new_write(complete: F) -> Self where F: Fn(i32) + 'static, { Self::new(CompletionType::Write(WriteCompletion::new(Box::new( complete, )))) } pub fn new_read(buf: Arc, complete: F) -> Self where F: Fn(Arc, i32) + 'static, { Self::new(CompletionType::Read(ReadCompletion::new( buf, Box::new(complete), ))) } pub fn new_sync(complete: F) -> Self where F: Fn(i32) + 'static, { Self::new(CompletionType::Sync(SyncCompletion::new(Box::new( complete, )))) } pub fn new_trunc(complete: F) -> Self where F: Fn(i32) + 'static, { Self::new(CompletionType::Truncate(TruncateCompletion::new(Box::new( complete, )))) } pub fn is_completed(&self) -> bool { self.inner.is_completed.get() } pub fn complete(&self, result: i32) { if !self.inner.is_completed.get() { match &self.inner.completion_type { CompletionType::Read(r) => r.complete(result), CompletionType::Write(w) => w.complete(result), CompletionType::Sync(s) => s.complete(result), // fix CompletionType::Truncate(t) => t.complete(result), }; self.inner.is_completed.set(true); } } /// only call this method if you are sure that the completion is /// a ReadCompletion, panics otherwise pub fn as_read(&self) -> &ReadCompletion { match self.inner.completion_type { CompletionType::Read(ref r) => r, _ => unreachable!(), } } /// only call this method if you are sure that the completion is /// a WriteCompletion, panics otherwise pub fn as_write(&self) -> &WriteCompletion { match self.inner.completion_type { CompletionType::Write(ref w) => w, _ => unreachable!(), } } } pub struct WriteCompletion { pub complete: Box, } pub struct SyncCompletion { pub complete: Box, } impl ReadCompletion { pub fn new(buf: Arc, complete: Box) -> Self { Self { buf, complete } } pub fn buf(&self) -> &Buffer { &self.buf } pub fn complete(&self, bytes_read: i32) { (self.complete)(self.buf.clone(), bytes_read); } } impl WriteCompletion { pub fn new(complete: Box) -> Self { Self { complete } } pub fn complete(&self, bytes_written: i32) { (self.complete)(bytes_written); } } impl SyncCompletion { pub fn new(complete: Box) -> Self { Self { complete } } pub fn complete(&self, res: i32) { (self.complete)(res); } } pub struct TruncateCompletion { pub complete: Box, } impl TruncateCompletion { pub fn new(complete: Box) -> Self { Self { complete } } pub fn complete(&self, res: i32) { (self.complete)(res); } } pub type BufferData = Pin>; pub enum Buffer { Heap(BufferData), Pooled(ArenaBuffer), } impl Debug for Buffer { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Pooled(p) => write!(f, "{p:?}"), Self::Heap(buf) => write!(f, "{buf:?}: {}", buf.len()), } } } impl Clone for Buffer { fn clone(&self) -> Self { match self { Self::Heap(buf) => { let len = buf.len(); Self::Heap(Vec::from(&buf[..len]).into_boxed_slice().into()) } Self::Pooled(buf) => { // Clone pooled buffers as heap buffers let data = Vec::from(buf.as_slice()); Self::Heap(Pin::new(data.into_boxed_slice())) } } } } impl Drop for Buffer { fn drop(&mut self) { let len = self.len(); if let Self::Heap(buf) = self { TEMP_BUFFER_CACHE.with(|cache| { let mut cache = cache.borrow_mut(); // take ownership of the buffer by swapping it with a dummy let buffer = std::mem::replace(buf, Pin::new(vec![].into_boxed_slice())); cache.return_buffer(buffer, len); }); } } } impl Buffer { pub fn new(data: Vec) -> Self { tracing::trace!("buffer::new({:?})", data); Self::Heap(Pin::new(data.into_boxed_slice())) } pub fn fixed_id(&self) -> Option { match self { Self::Heap { .. } => None, Self::Pooled(buf) => buf.fixed_id(), } } pub fn new_pooled(buf: ArenaBuffer) -> Self { tracing::trace!("new_pooled({:?})", buf); Self::Pooled(buf) } pub fn new_temporary(size: usize) -> Self { TEMP_BUFFER_CACHE.with(|cache| { if let Some(buffer) = cache.borrow_mut().get_buffer(size) { Self::Heap(buffer) } else { Self::Heap(Pin::new(vec![0; size].into_boxed_slice())) } }) } pub fn len(&self) -> usize { match self { Self::Heap(buf) => buf.len(), Self::Pooled(buf) => buf.logical_len(), } } pub fn is_empty(&self) -> bool { self.len() == 0 } pub fn as_slice(&self) -> &[u8] { match self { Self::Heap(buf) => { // SAFETY: The buffer is guaranteed to be valid for the lifetime of the slice unsafe { std::slice::from_raw_parts(buf.as_ptr(), buf.len()) } } Self::Pooled(buf) => buf, } } #[allow(clippy::mut_from_ref)] pub fn as_mut_slice(&self) -> &mut [u8] { match self { Self::Heap(buf) => { // SAFETY: The buffer is guaranteed to be valid for the lifetime of the slice unsafe { std::slice::from_raw_parts_mut(buf.as_ptr() as *mut u8, buf.len()) } } Self::Pooled(buf) => unsafe { std::slice::from_raw_parts_mut(buf.as_ptr() as *mut u8, buf.len()) }, } } pub fn as_mut_ptr(&mut self) -> *mut u8 { match self { Self::Heap(buf) => buf.as_mut_ptr(), Self::Pooled(buf) => buf.as_mut_ptr(), } } } thread_local! { /// thread local cache to re-use temporary buffers to prevent churn when pool overflows pub static TEMP_BUFFER_CACHE: RefCell = RefCell::new(TempBufferCache::new()); } pub(crate) struct TempBufferCache { // Buffers indexed by size, we only cache common sizes page_size: usize, page_buffers: Vec, wal_frame_buffers: Vec, max_cached: usize, } impl TempBufferCache { fn new() -> Self { Self { page_size: BufferPool::DEFAULT_PAGE_SIZE, page_buffers: Vec::with_capacity(8), wal_frame_buffers: Vec::with_capacity(8), max_cached: 512, } } pub fn reinit_cache(&mut self, page_size: usize) { self.page_buffers.clear(); self.wal_frame_buffers.clear(); self.page_size = page_size; } fn get_buffer(&mut self, size: usize) -> Option { match size { sz if sz == self.page_size => self.page_buffers.pop(), sz if sz == (self.page_size + WAL_FRAME_HEADER_SIZE) => self.wal_frame_buffers.pop(), _ => None, } } fn return_buffer(&mut self, buff: BufferData, len: usize) { let sz = self.page_size; let cache = match len { n if n.eq(&sz) => &mut self.page_buffers, n if n.eq(&(sz + WAL_FRAME_HEADER_SIZE)) => &mut self.wal_frame_buffers, _ => return, }; if self.max_cached > cache.len() { cache.push(buff); } } } cfg_block! { #[cfg(all(target_os = "linux", feature = "io_uring"))] { mod io_uring; #[cfg(feature = "fs")] pub use io_uring::UringIO; } #[cfg(target_family = "unix")] { mod unix; #[cfg(feature = "fs")] pub use unix::UnixIO; pub use unix::UnixIO as PlatformIO; pub use PlatformIO as SyscallIO; } #[cfg(any(target_os = "android", target_os = "ios"))] { mod unix; #[cfg(feature = "fs")] pub use unix::UnixIO; pub use unix::UnixIO as SyscallIO; pub use unix::UnixIO as PlatformIO; } #[cfg(target_os = "windows")] { mod windows; pub use windows::WindowsIO as PlatformIO; pub use PlatformIO as SyscallIO; } #[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows", target_os = "android", target_os = "ios")))] { mod generic; pub use generic::GenericIO as PlatformIO; pub use PlatformIO as SyscallIO; } } mod memory; #[cfg(feature = "fs")] mod vfs; pub use memory::MemoryIO; pub mod clock; mod common; pub use clock::Clock;