mirror of
https://github.com/aljazceru/turso.git
synced 2025-12-28 05:24:22 +01:00
Problem: There are several problems with our current statically allocated `BufferPool`. 1. You cannot open two databases with different page sizes in the same process, because the `BufferPool`'s `Arena`s are locked forever into the page size of the first database. This is the case regardless of whether the two `Database`s are open at the same time or the first is closed before the second is opened. 2. Because of this, it is impossible to even write Rust tests for different page sizes, assuming the test uses a single process. Solution: Make `Database` own its `BufferPool` instead of the pool being statically allocated, so this problem goes away. Note that I didn't touch the still statically-allocated `TEMP_BUFFER_CACHE`, because it should continue to work regardless of this change. It should only be a problem if the user has two or more databases with different page sizes open simultaneously: `TEMP_BUFFER_CACHE` only supports one pool of a given page size at a time, so the rest of the allocations will go through the global allocator instead. Notes: I extracted this change out from #2569, because I didn't want it to be smuggled in without being reviewed as an individual piece.
522 lines
15 KiB
Rust
522 lines
15 KiB
Rust
use crate::storage::buffer_pool::ArenaBuffer;
|
|
use crate::storage::sqlite3_ondisk::WAL_FRAME_HEADER_SIZE;
|
|
use crate::{BufferPool, Result};
|
|
use bitflags::bitflags;
|
|
use cfg_block::cfg_block;
|
|
use std::cell::RefCell;
|
|
use std::fmt;
|
|
use std::ptr::NonNull;
|
|
use std::sync::Arc;
|
|
use std::{cell::Cell, fmt::Debug, pin::Pin};
|
|
|
|
pub trait File: Send + Sync {
|
|
fn lock_file(&self, exclusive: bool) -> Result<()>;
|
|
fn unlock_file(&self) -> Result<()>;
|
|
fn pread(&self, pos: usize, c: Completion) -> Result<Completion>;
|
|
fn pwrite(&self, pos: usize, buffer: Arc<Buffer>, c: Completion) -> Result<Completion>;
|
|
fn sync(&self, c: Completion) -> Result<Completion>;
|
|
fn pwritev(&self, pos: usize, buffers: Vec<Arc<Buffer>>, c: Completion) -> Result<Completion> {
|
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
if buffers.is_empty() {
|
|
c.complete(0);
|
|
return Ok(c);
|
|
}
|
|
if buffers.len() == 1 {
|
|
return self.pwrite(pos, buffers[0].clone(), c);
|
|
}
|
|
// naive default implementation can be overridden on backends where it makes sense to
|
|
let mut pos = pos;
|
|
let outstanding = Arc::new(AtomicUsize::new(buffers.len()));
|
|
let total_written = Arc::new(AtomicUsize::new(0));
|
|
|
|
for buf in buffers {
|
|
let len = buf.len();
|
|
let child_c = {
|
|
let c_main = c.clone();
|
|
let outstanding = outstanding.clone();
|
|
let total_written = total_written.clone();
|
|
let _cloned = buf.clone();
|
|
Completion::new_write(move |n| {
|
|
// reference buffer in callback to ensure alive for async io
|
|
let _buf = _cloned.clone();
|
|
// accumulate bytes actually reported by the backend
|
|
total_written.fetch_add(n as usize, Ordering::Relaxed);
|
|
if outstanding.fetch_sub(1, Ordering::AcqRel) == 1 {
|
|
// last one finished
|
|
c_main.complete(total_written.load(Ordering::Acquire) as i32);
|
|
}
|
|
})
|
|
};
|
|
if let Err(e) = self.pwrite(pos, buf.clone(), child_c) {
|
|
// best-effort: mark as done so caller won't wait forever
|
|
c.complete(-1);
|
|
return Err(e);
|
|
}
|
|
pos += len;
|
|
}
|
|
Ok(c)
|
|
}
|
|
fn size(&self) -> Result<u64>;
|
|
fn truncate(&self, len: usize, c: Completion) -> Result<Completion>;
|
|
fn copy_to(&self, io: &dyn IO, path: &str) -> Result<()> {
|
|
// Open or create the destination file
|
|
let dest_file = io.open_file(path, OpenFlags::Create, false)?;
|
|
// Get the size of the source file
|
|
let file_size = self.size()? as usize;
|
|
if file_size == 0 {
|
|
return Ok(());
|
|
}
|
|
|
|
// use 1MB chunk size
|
|
const BUFFER_SIZE: usize = 1024 * 1024;
|
|
let mut pos = 0;
|
|
|
|
while pos < file_size {
|
|
let chunk_size = (file_size - pos).min(BUFFER_SIZE);
|
|
// Read from source
|
|
let read_buffer = Arc::new(Buffer::new_temporary(chunk_size));
|
|
let read_completion = self.pread(
|
|
pos,
|
|
Completion::new_read(read_buffer.clone(), move |_, _| {}),
|
|
)?;
|
|
|
|
// Wait for read to complete
|
|
io.wait_for_completion(read_completion)?;
|
|
|
|
// Write to destination
|
|
let write_completion =
|
|
dest_file.pwrite(pos, read_buffer, Completion::new_write(|_| {}))?;
|
|
io.wait_for_completion(write_completion)?;
|
|
|
|
pos += chunk_size;
|
|
}
|
|
let sync_completion = dest_file.sync(Completion::new_sync(|_| {}))?;
|
|
io.wait_for_completion(sync_completion)?;
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Copy, Clone, PartialEq)]
|
|
pub struct OpenFlags(i32);
|
|
|
|
bitflags! {
|
|
impl OpenFlags: i32 {
|
|
const None = 0b00000000;
|
|
const Create = 0b0000001;
|
|
const ReadOnly = 0b0000010;
|
|
}
|
|
}
|
|
|
|
impl Default for OpenFlags {
|
|
fn default() -> Self {
|
|
Self::Create
|
|
}
|
|
}
|
|
|
|
pub trait IO: Clock + Send + Sync {
|
|
fn open_file(&self, path: &str, flags: OpenFlags, direct: bool) -> Result<Arc<dyn File>>;
|
|
|
|
fn run_once(&self) -> Result<()>;
|
|
|
|
fn wait_for_completion(&self, c: Completion) -> Result<()>;
|
|
|
|
fn generate_random_number(&self) -> i64;
|
|
|
|
fn get_memory_io(&self) -> Arc<MemoryIO>;
|
|
|
|
fn register_fixed_buffer(&self, _ptr: NonNull<u8>, _len: usize) -> Result<u32> {
|
|
Err(crate::LimboError::InternalError(
|
|
"unsupported operation".to_string(),
|
|
))
|
|
}
|
|
}
|
|
|
|
pub type ReadComplete = dyn Fn(Arc<Buffer>, i32);
|
|
pub type WriteComplete = dyn Fn(i32);
|
|
pub type SyncComplete = dyn Fn(i32);
|
|
pub type TruncateComplete = dyn Fn(i32);
|
|
|
|
#[must_use]
|
|
#[derive(Debug, Clone)]
|
|
pub struct Completion {
|
|
inner: Arc<CompletionInner>,
|
|
}
|
|
|
|
#[derive(Debug)]
|
|
struct CompletionInner {
|
|
pub completion_type: CompletionType,
|
|
is_completed: Cell<bool>,
|
|
}
|
|
|
|
impl Debug for CompletionType {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
match self {
|
|
Self::Read(..) => f.debug_tuple("Read").finish(),
|
|
Self::Write(..) => f.debug_tuple("Write").finish(),
|
|
Self::Sync(..) => f.debug_tuple("Sync").finish(),
|
|
Self::Truncate(..) => f.debug_tuple("Truncate").finish(),
|
|
}
|
|
}
|
|
}
|
|
|
|
pub enum CompletionType {
|
|
Read(ReadCompletion),
|
|
Write(WriteCompletion),
|
|
Sync(SyncCompletion),
|
|
Truncate(TruncateCompletion),
|
|
}
|
|
|
|
pub struct ReadCompletion {
|
|
pub buf: Arc<Buffer>,
|
|
pub complete: Box<ReadComplete>,
|
|
}
|
|
|
|
impl Completion {
|
|
pub fn new(completion_type: CompletionType) -> Self {
|
|
Self {
|
|
inner: Arc::new(CompletionInner {
|
|
completion_type,
|
|
is_completed: Cell::new(false),
|
|
}),
|
|
}
|
|
}
|
|
|
|
pub fn new_write<F>(complete: F) -> Self
|
|
where
|
|
F: Fn(i32) + 'static,
|
|
{
|
|
Self::new(CompletionType::Write(WriteCompletion::new(Box::new(
|
|
complete,
|
|
))))
|
|
}
|
|
|
|
pub fn new_read<F>(buf: Arc<Buffer>, complete: F) -> Self
|
|
where
|
|
F: Fn(Arc<Buffer>, i32) + 'static,
|
|
{
|
|
Self::new(CompletionType::Read(ReadCompletion::new(
|
|
buf,
|
|
Box::new(complete),
|
|
)))
|
|
}
|
|
pub fn new_sync<F>(complete: F) -> Self
|
|
where
|
|
F: Fn(i32) + 'static,
|
|
{
|
|
Self::new(CompletionType::Sync(SyncCompletion::new(Box::new(
|
|
complete,
|
|
))))
|
|
}
|
|
|
|
pub fn new_trunc<F>(complete: F) -> Self
|
|
where
|
|
F: Fn(i32) + 'static,
|
|
{
|
|
Self::new(CompletionType::Truncate(TruncateCompletion::new(Box::new(
|
|
complete,
|
|
))))
|
|
}
|
|
|
|
/// Create a dummy completed completion
|
|
pub fn new_dummy() -> Self {
|
|
let c = Self::new_write(|_| {});
|
|
c.complete(0);
|
|
c
|
|
}
|
|
|
|
pub fn is_completed(&self) -> bool {
|
|
self.inner.is_completed.get()
|
|
}
|
|
|
|
pub fn complete(&self, result: i32) {
|
|
if !self.inner.is_completed.get() {
|
|
match &self.inner.completion_type {
|
|
CompletionType::Read(r) => r.complete(result),
|
|
CompletionType::Write(w) => w.complete(result),
|
|
CompletionType::Sync(s) => s.complete(result), // fix
|
|
CompletionType::Truncate(t) => t.complete(result),
|
|
};
|
|
self.inner.is_completed.set(true);
|
|
}
|
|
}
|
|
|
|
/// only call this method if you are sure that the completion is
|
|
/// a ReadCompletion, panics otherwise
|
|
pub fn as_read(&self) -> &ReadCompletion {
|
|
match self.inner.completion_type {
|
|
CompletionType::Read(ref r) => r,
|
|
_ => unreachable!(),
|
|
}
|
|
}
|
|
|
|
/// only call this method if you are sure that the completion is
|
|
/// a WriteCompletion, panics otherwise
|
|
pub fn as_write(&self) -> &WriteCompletion {
|
|
match self.inner.completion_type {
|
|
CompletionType::Write(ref w) => w,
|
|
_ => unreachable!(),
|
|
}
|
|
}
|
|
}
|
|
|
|
pub struct WriteCompletion {
|
|
pub complete: Box<WriteComplete>,
|
|
}
|
|
|
|
pub struct SyncCompletion {
|
|
pub complete: Box<SyncComplete>,
|
|
}
|
|
|
|
impl ReadCompletion {
|
|
pub fn new(buf: Arc<Buffer>, complete: Box<ReadComplete>) -> Self {
|
|
Self { buf, complete }
|
|
}
|
|
|
|
pub fn buf(&self) -> &Buffer {
|
|
&self.buf
|
|
}
|
|
|
|
pub fn complete(&self, bytes_read: i32) {
|
|
(self.complete)(self.buf.clone(), bytes_read);
|
|
}
|
|
}
|
|
|
|
impl WriteCompletion {
|
|
pub fn new(complete: Box<WriteComplete>) -> Self {
|
|
Self { complete }
|
|
}
|
|
|
|
pub fn complete(&self, bytes_written: i32) {
|
|
(self.complete)(bytes_written);
|
|
}
|
|
}
|
|
|
|
impl SyncCompletion {
|
|
pub fn new(complete: Box<SyncComplete>) -> Self {
|
|
Self { complete }
|
|
}
|
|
|
|
pub fn complete(&self, res: i32) {
|
|
(self.complete)(res);
|
|
}
|
|
}
|
|
|
|
pub struct TruncateCompletion {
|
|
pub complete: Box<TruncateComplete>,
|
|
}
|
|
|
|
impl TruncateCompletion {
|
|
pub fn new(complete: Box<TruncateComplete>) -> Self {
|
|
Self { complete }
|
|
}
|
|
|
|
pub fn complete(&self, res: i32) {
|
|
(self.complete)(res);
|
|
}
|
|
}
|
|
|
|
pub type BufferData = Pin<Box<[u8]>>;
|
|
|
|
pub enum Buffer {
|
|
Heap(BufferData),
|
|
Pooled(ArenaBuffer),
|
|
}
|
|
|
|
impl Debug for Buffer {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
match self {
|
|
Self::Pooled(p) => write!(f, "Pooled(len={})", p.logical_len()),
|
|
Self::Heap(buf) => write!(f, "{buf:?}: {}", buf.len()),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Drop for Buffer {
|
|
fn drop(&mut self) {
|
|
let len = self.len();
|
|
if let Self::Heap(buf) = self {
|
|
TEMP_BUFFER_CACHE.with(|cache| {
|
|
let mut cache = cache.borrow_mut();
|
|
// take ownership of the buffer by swapping it with a dummy
|
|
let buffer = std::mem::replace(buf, Pin::new(vec![].into_boxed_slice()));
|
|
cache.return_buffer(buffer, len);
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Buffer {
|
|
pub fn new(data: Vec<u8>) -> Self {
|
|
tracing::trace!("buffer::new({:?})", data);
|
|
Self::Heap(Pin::new(data.into_boxed_slice()))
|
|
}
|
|
|
|
/// Returns the index of the underlying `Arena` if it was registered with
|
|
/// io_uring. Only for use with `UringIO` backend.
|
|
pub fn fixed_id(&self) -> Option<u32> {
|
|
match self {
|
|
Self::Heap { .. } => None,
|
|
Self::Pooled(buf) => buf.fixed_id(),
|
|
}
|
|
}
|
|
|
|
pub fn new_pooled(buf: ArenaBuffer) -> Self {
|
|
Self::Pooled(buf)
|
|
}
|
|
|
|
pub fn new_temporary(size: usize) -> Self {
|
|
TEMP_BUFFER_CACHE.with(|cache| {
|
|
if let Some(buffer) = cache.borrow_mut().get_buffer(size) {
|
|
Self::Heap(buffer)
|
|
} else {
|
|
Self::Heap(Pin::new(vec![0; size].into_boxed_slice()))
|
|
}
|
|
})
|
|
}
|
|
|
|
pub fn len(&self) -> usize {
|
|
match self {
|
|
Self::Heap(buf) => buf.len(),
|
|
Self::Pooled(buf) => buf.logical_len(),
|
|
}
|
|
}
|
|
|
|
pub fn is_empty(&self) -> bool {
|
|
self.len() == 0
|
|
}
|
|
|
|
pub fn as_slice(&self) -> &[u8] {
|
|
match self {
|
|
Self::Heap(buf) => {
|
|
// SAFETY: The buffer is guaranteed to be valid for the lifetime of the slice
|
|
unsafe { std::slice::from_raw_parts(buf.as_ptr(), buf.len()) }
|
|
}
|
|
Self::Pooled(buf) => buf,
|
|
}
|
|
}
|
|
|
|
#[allow(clippy::mut_from_ref)]
|
|
pub fn as_mut_slice(&self) -> &mut [u8] {
|
|
unsafe { std::slice::from_raw_parts_mut(self.as_mut_ptr(), self.len()) }
|
|
}
|
|
#[inline]
|
|
pub fn as_ptr(&self) -> *const u8 {
|
|
match self {
|
|
Self::Heap(buf) => buf.as_ptr(),
|
|
Self::Pooled(buf) => buf.as_ptr(),
|
|
}
|
|
}
|
|
#[inline]
|
|
pub fn as_mut_ptr(&self) -> *mut u8 {
|
|
match self {
|
|
Self::Heap(buf) => buf.as_ptr() as *mut u8,
|
|
Self::Pooled(buf) => buf.as_ptr() as *mut u8,
|
|
}
|
|
}
|
|
}
|
|
|
|
thread_local! {
|
|
/// thread local cache to re-use temporary buffers to prevent churn when pool overflows
|
|
pub static TEMP_BUFFER_CACHE: RefCell<TempBufferCache> = RefCell::new(TempBufferCache::new());
|
|
}
|
|
|
|
/// A cache for temporary or any additional `Buffer` allocations beyond
|
|
/// what the `BufferPool` has room for, or for use before the pool is
|
|
/// fully initialized.
|
|
pub(crate) struct TempBufferCache {
|
|
/// The `[Database::page_size]` at the time the cache is initiated.
|
|
page_size: usize,
|
|
/// Cache of buffers of size `self.page_size`.
|
|
page_buffers: Vec<BufferData>,
|
|
/// Cache of buffers of size `self.page_size` + WAL_FRAME_HEADER_SIZE.
|
|
wal_frame_buffers: Vec<BufferData>,
|
|
/// Maximum number of buffers that will live in each cache.
|
|
max_cached: usize,
|
|
}
|
|
|
|
impl TempBufferCache {
|
|
const DEFAULT_MAX_CACHE_SIZE: usize = 256;
|
|
|
|
fn new() -> Self {
|
|
Self {
|
|
page_size: BufferPool::DEFAULT_PAGE_SIZE,
|
|
page_buffers: Vec::with_capacity(8),
|
|
wal_frame_buffers: Vec::with_capacity(8),
|
|
max_cached: Self::DEFAULT_MAX_CACHE_SIZE,
|
|
}
|
|
}
|
|
|
|
/// If the `[Database::page_size]` is set, any temporary buffers that might
|
|
/// exist prior need to be cleared and new `page_size` needs to be saved.
|
|
pub fn reinit_cache(&mut self, page_size: usize) {
|
|
self.page_buffers.clear();
|
|
self.wal_frame_buffers.clear();
|
|
self.page_size = page_size;
|
|
}
|
|
|
|
fn get_buffer(&mut self, size: usize) -> Option<BufferData> {
|
|
match size {
|
|
sz if sz == self.page_size => self.page_buffers.pop(),
|
|
sz if sz == (self.page_size + WAL_FRAME_HEADER_SIZE) => self.wal_frame_buffers.pop(),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
fn return_buffer(&mut self, buff: BufferData, len: usize) {
|
|
let sz = self.page_size;
|
|
let cache = match len {
|
|
n if n.eq(&sz) => &mut self.page_buffers,
|
|
n if n.eq(&(sz + WAL_FRAME_HEADER_SIZE)) => &mut self.wal_frame_buffers,
|
|
_ => return,
|
|
};
|
|
if self.max_cached > cache.len() {
|
|
cache.push(buff);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Platform backend selection. Exactly one of the unix, android/ios, windows
// and generic groups must be active for any target, because each of them
// defines the `PlatformIO` and `SyscallIO` aliases (and the first two both
// declare `mod unix`). The io_uring group is additive on Linux.
cfg_block! {
    #[cfg(all(target_os = "linux", feature = "io_uring"))] {
        mod io_uring;
        #[cfg(feature = "fs")]
        pub use io_uring::UringIO;
    }

    // Android and iOS also report `target_family = "unix"`. They are excluded
    // here because they have their own group below; without the exclusion
    // `mod unix` and the aliases would be declared twice on those targets.
    #[cfg(all(target_family = "unix", not(any(target_os = "android", target_os = "ios"))))] {
        mod unix;
        #[cfg(feature = "fs")]
        pub use unix::UnixIO;
        pub use unix::UnixIO as PlatformIO;
        pub use PlatformIO as SyscallIO;
    }

    #[cfg(any(target_os = "android", target_os = "ios"))] {
        mod unix;
        #[cfg(feature = "fs")]
        pub use unix::UnixIO;
        pub use unix::UnixIO as SyscallIO;
        pub use unix::UnixIO as PlatformIO;
    }

    #[cfg(target_os = "windows")] {
        mod windows;
        pub use windows::WindowsIO as PlatformIO;
        pub use PlatformIO as SyscallIO;
    }

    // Fallback for everything that is neither unix-family nor Windows. The
    // condition is expressed through `target_family` so that unix systems
    // other than Linux and macOS do not match both this group and the unix one.
    #[cfg(not(any(target_family = "unix", target_os = "windows")))] {
        mod generic;
        pub use generic::GenericIO as PlatformIO;
        pub use PlatformIO as SyscallIO;
    }
}
|
|
|
|
mod memory;
|
|
#[cfg(feature = "fs")]
|
|
mod vfs;
|
|
pub use memory::MemoryIO;
|
|
pub mod clock;
|
|
mod common;
|
|
pub use clock::Clock;
|