From 8bf2898c47cdb291efe1dccbb05174fb10a92c4d Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 11:33:24 -0400 Subject: [PATCH 01/62] Add truncate method to IO trait and Truncate completion variant --- core/io/mod.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/core/io/mod.rs b/core/io/mod.rs index b58c17ab6..0113db0eb 100644 --- a/core/io/mod.rs +++ b/core/io/mod.rs @@ -19,6 +19,7 @@ pub trait File: Send + Sync { -> Result; fn sync(&self, c: Completion) -> Result; fn size(&self) -> Result; + fn truncate(&self, len: u64, c: Completion) -> Result>; } #[derive(Debug, Copy, Clone, PartialEq)] @@ -53,6 +54,7 @@ pub trait IO: Clock + Send + Sync { pub type Complete = dyn Fn(Arc>, i32); pub type WriteComplete = dyn Fn(i32); pub type SyncComplete = dyn Fn(i32); +pub type TruncateComplete = dyn Fn(i32); #[must_use] #[derive(Clone)] @@ -69,6 +71,7 @@ pub enum CompletionType { Read(ReadCompletion), Write(WriteCompletion), Sync(SyncCompletion), + Truncate(TruncateCompletion), } pub struct ReadCompletion { @@ -122,6 +125,7 @@ impl Completion { CompletionType::Read(r) => r.complete(result), CompletionType::Write(w) => w.complete(result), CompletionType::Sync(s) => s.complete(result), // fix + CompletionType::Truncate(t) => t.complete(result), }; self.inner.is_completed.set(true); } @@ -191,6 +195,20 @@ impl SyncCompletion { } } +pub struct TruncateCompletion { + pub complete: Box, +} + +impl TruncateCompletion { + pub fn new(complete: Box) -> Self { + Self { complete } + } + + pub fn complete(&self, res: i32) { + (self.complete)(res); + } +} + pub type BufferData = Pin>; pub type BufferDropFn = Rc; From 73fe4ffa0625cd54a155705fdc134725e98ac174 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 11:34:08 -0400 Subject: [PATCH 02/62] Add truncate method to DatabaseStorage trait --- core/storage/database.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/core/storage/database.rs b/core/storage/database.rs index 
8e539a5f3..422663027 100644 --- a/core/storage/database.rs +++ b/core/storage/database.rs @@ -18,6 +18,7 @@ pub trait DatabaseStorage: Send + Sync { ) -> Result; fn sync(&self, c: Completion) -> Result; fn size(&self) -> Result; + fn truncate(&self, len: u64, c: Completion) -> Result<()>; } #[cfg(feature = "fs")] @@ -69,6 +70,12 @@ impl DatabaseStorage for DatabaseFile { fn size(&self) -> Result { self.file.size() } + + #[instrument(skip_all, level = Level::INFO)] + fn truncate(&self, len: u64, c: Completion) -> Result<()> { + self.file.truncate(len, c)?; + Ok(()) + } } #[cfg(feature = "fs")] @@ -122,6 +129,12 @@ impl DatabaseStorage for FileMemoryStorage { fn size(&self) -> Result { self.file.size() } + + #[instrument(skip_all, level = Level::INFO)] + fn truncate(&self, len: u64, c: Completion) -> Result<()> { + let _ = self.file.truncate(len, c)?; + Ok(()) + } } impl FileMemoryStorage { From a4121c8cea010c56c9a7eb0023088511a4bfbda5 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 11:34:29 -0400 Subject: [PATCH 03/62] Impl truncate method for generic IO --- core/io/generic.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/core/io/generic.rs b/core/io/generic.rs index 15ce52564..ac40e5757 100644 --- a/core/io/generic.rs +++ b/core/io/generic.rs @@ -121,6 +121,13 @@ impl File for GenericFile { Ok(c) } + fn truncate(&self, len: u64, c: Completion) -> Result> { + let mut file = self.file.borrow_mut(); + file.set_len(len).map_err(|err| LimboError::IOError(err))?; + c.complete(0); + Ok(Arc::new(c)) + } + fn size(&self) -> Result { let file = self.file.borrow(); Ok(file.metadata().unwrap().len()) From fa002cba5939440a4ae8ab915831b26aa15f6fc6 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 11:36:18 -0400 Subject: [PATCH 04/62] Impl truncate method for io_uring IO --- core/io/io_uring.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/core/io/io_uring.rs b/core/io/io_uring.rs index 2ccb3e30a..b0f1ed269 100644 --- 
a/core/io/io_uring.rs +++ b/core/io/io_uring.rs @@ -353,6 +353,18 @@ impl File for UringFile { fn size(&self) -> Result { Ok(self.file.metadata()?.len()) } + + fn truncate(&self, len: u64, c: Completion) -> Result> { + let mut io = self.io.borrow_mut(); + let truncate = with_fd!(self, |fd| { + io_uring::opcode::Ftruncate::new(fd, len) + .build() + .user_data(io.ring.get_key()) + }); + let c = Arc::new(c); + io.ring.submit_entry(&truncate, c.clone()); + Ok(c) + } } impl Drop for UringFile { From d30157518f1c501e6c41ef91b533e78376a3b1b0 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 11:37:19 -0400 Subject: [PATCH 05/62] Impl truncate method for VFS io module and create plumbing in extension lib --- core/io/vfs.rs | 14 ++++++++++++++ extensions/core/src/vfs_modules.rs | 4 ++++ extensions/tests/src/lib.rs | 5 +++++ macros/src/ext/vfs_derive.rs | 17 +++++++++++++++++ 4 files changed, 40 insertions(+) diff --git a/core/io/vfs.rs b/core/io/vfs.rs index 1f941e2c2..a34d8dda1 100644 --- a/core/io/vfs.rs +++ b/core/io/vfs.rs @@ -165,6 +165,20 @@ impl File for VfsFileImpl { Ok(result as u64) } } + + fn truncate(&self, len: u64, c: Completion) -> Result> { + if self.vfs.is_null() { + return Err(LimboError::ExtensionError("VFS is null".to_string())); + } + let vfs = unsafe { &*self.vfs }; + let result = unsafe { (vfs.truncate)(self.file, len as i64) }; + if result.is_error() { + Err(LimboError::ExtensionError("truncate failed".to_string())) + } else { + c.complete(0); + Ok(Arc::new(c)) + } + } } impl Drop for VfsMod { diff --git a/extensions/core/src/vfs_modules.rs b/extensions/core/src/vfs_modules.rs index f07d2eb81..1a5e48b25 100644 --- a/extensions/core/src/vfs_modules.rs +++ b/extensions/core/src/vfs_modules.rs @@ -41,6 +41,7 @@ pub trait VfsFile: Send + Sync { fn read(&mut self, buf: &mut [u8], count: usize, offset: i64) -> ExtResult; fn write(&mut self, buf: &[u8], count: usize, offset: i64) -> ExtResult; fn sync(&self) -> ExtResult<()>; + fn 
truncate(&self, len: i64) -> ExtResult<()>; fn size(&self) -> i64; } @@ -59,6 +60,7 @@ pub struct VfsImpl { pub run_once: VfsRunOnce, pub current_time: VfsGetCurrentTime, pub gen_random_number: VfsGenerateRandomNumber, + pub truncate: VfsTruncate, } pub type RegisterVfsFn = @@ -81,6 +83,8 @@ pub type VfsWrite = pub type VfsSync = unsafe extern "C" fn(file: *const c_void) -> i32; +pub type VfsTruncate = unsafe extern "C" fn(file: *const c_void, len: i64) -> ResultCode; + pub type VfsLock = unsafe extern "C" fn(file: *const c_void, exclusive: bool) -> ResultCode; pub type VfsUnlock = unsafe extern "C" fn(file: *const c_void) -> ResultCode; diff --git a/extensions/tests/src/lib.rs b/extensions/tests/src/lib.rs index e235c2f46..88ff9f26a 100644 --- a/extensions/tests/src/lib.rs +++ b/extensions/tests/src/lib.rs @@ -314,6 +314,11 @@ impl VfsFile for TestFile { self.file.sync_all().map_err(|_| ResultCode::Error) } + fn truncate(&self, len: i64) -> ExtResult<()> { + log::debug!("truncating file with testing VFS to length: {len}"); + self.file.set_len(len as u64).map_err(|_| ResultCode::Error) + } + fn size(&self) -> i64 { self.file.metadata().map(|m| m.len() as i64).unwrap_or(-1) } diff --git a/macros/src/ext/vfs_derive.rs b/macros/src/ext/vfs_derive.rs index 0ab9d52fc..eb2115ac1 100644 --- a/macros/src/ext/vfs_derive.rs +++ b/macros/src/ext/vfs_derive.rs @@ -11,6 +11,7 @@ pub fn derive_vfs_module(input: TokenStream) -> TokenStream { let close_fn_name = format_ident!("{}_close", struct_name); let read_fn_name = format_ident!("{}_read", struct_name); let write_fn_name = format_ident!("{}_write", struct_name); + let trunc_fn_name = format_ident!("{}_truncate", struct_name); let lock_fn_name = format_ident!("{}_lock", struct_name); let unlock_fn_name = format_ident!("{}_unlock", struct_name); let sync_fn_name = format_ident!("{}_sync", struct_name); @@ -36,6 +37,7 @@ pub fn derive_vfs_module(input: TokenStream) -> TokenStream { unlock: #unlock_fn_name, sync: #sync_fn_name, 
size: #size_fn_name, + truncate: #trunc_fn_name, run_once: #run_once_fn_name, gen_random_number: #generate_random_number_fn_name, current_time: #get_current_time_fn_name, @@ -59,6 +61,7 @@ pub fn derive_vfs_module(input: TokenStream) -> TokenStream { unlock: #unlock_fn_name, sync: #sync_fn_name, size: #size_fn_name, + truncate: #trunc_fn_name, run_once: #run_once_fn_name, gen_random_number: #generate_random_number_fn_name, current_time: #get_current_time_fn_name, @@ -188,6 +191,20 @@ pub fn derive_vfs_module(input: TokenStream) -> TokenStream { 0 } + #[no_mangle] + pub unsafe extern "C" fn #trunc_fn_name(file_ptr: *const ::std::ffi::c_void, len: i64) -> i32 { + if file_ptr.is_null() { + return -1; + } + let vfs_file: &mut ::turso_ext::VfsFileImpl = &mut *(file_ptr as *mut ::turso_ext::VfsFileImpl); + let file: &mut <#struct_name as ::turso_ext::VfsExtension>::File = + &mut *(vfs_file.file as *mut <#struct_name as ::turso_ext::VfsExtension>::File); + if <#struct_name as ::turso_ext::VfsExtension>::File::truncate(file, len).is_err() { + return -1; + } + 0 + } + #[no_mangle] pub unsafe extern "C" fn #size_fn_name(file_ptr: *const ::std::ffi::c_void) -> i64 { if file_ptr.is_null() { From ce1897cb4a8fbfaa1bbc6438d20b70b827e7c67a Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 11:58:11 -0400 Subject: [PATCH 06/62] Impl truncate for the remainder of IO modules --- core/io/memory.rs | 14 ++++++++++++++ core/io/unix.rs | 15 +++++++++++++++ core/io/windows.rs | 7 +++++++ 3 files changed, 36 insertions(+) diff --git a/core/io/memory.rs b/core/io/memory.rs index aed9531c1..93c248571 100644 --- a/core/io/memory.rs +++ b/core/io/memory.rs @@ -174,6 +174,20 @@ impl File for MemoryFile { Ok(c) } + fn truncate(&self, len: u64, c: Completion) -> Result> { + let new_size = len as usize; + if new_size < self.size.get() { + // Truncate pages + unsafe { + let pages = &mut *self.pages.get(); + pages.retain(|&k, _| k * PAGE_SIZE < new_size); + } + } + 
self.size.set(new_size); + c.complete(0); + Ok(Arc::new(c)) + } + fn size(&self) -> Result { Ok(self.size.get() as u64) } diff --git a/core/io/unix.rs b/core/io/unix.rs index cfe6d37fe..0eee99321 100644 --- a/core/io/unix.rs +++ b/core/io/unix.rs @@ -450,6 +450,21 @@ impl File for UnixFile<'_> { let file = self.file.lock().unwrap(); Ok(file.metadata()?.len()) } + + #[instrument(err, skip_all, level = Level::INFO)] + fn truncate(&self, len: u64, c: Completion) -> Result> { + let file = self.file.borrow(); + let result = file.set_len(len); + let c = Arc::new(c); + match result { + Ok(()) => { + trace!("file truncated to len=({})", len); + c.complete(0); + Ok(c) + } + Err(e) => Err(e.into()), + } + } } impl Drop for UnixFile<'_> { diff --git a/core/io/windows.rs b/core/io/windows.rs index 701e61269..ed21257fd 100644 --- a/core/io/windows.rs +++ b/core/io/windows.rs @@ -123,6 +123,13 @@ impl File for WindowsFile { } #[instrument(err, skip_all, level = Level::TRACE)] + fn truncate(&self, len: u64, c: Completion) -> Result<()> { + let mut file = self.file.borrow_mut(); + file.set_len(len).map_err(LimboError::IOError)?; + c.complete(0); + Ok(()) + } + fn size(&self) -> Result { let file = self.file.read(); Ok(file.metadata().unwrap().len()) From bbb47fd58a3556e02a0245158806384f87de2874 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 13:00:33 -0400 Subject: [PATCH 07/62] Add truncate method to wasm bindings module --- bindings/wasm/lib.rs | 457 +++++++++++++++++++++++++++++++ bindings/wasm/node/src/vfs.cjs | 37 +++ bindings/wasm/web/src/web-vfs.js | 33 +++ 3 files changed, 527 insertions(+) create mode 100644 bindings/wasm/lib.rs create mode 100644 bindings/wasm/node/src/vfs.cjs create mode 100644 bindings/wasm/web/src/web-vfs.js diff --git a/bindings/wasm/lib.rs b/bindings/wasm/lib.rs new file mode 100644 index 000000000..23ddebf68 --- /dev/null +++ b/bindings/wasm/lib.rs @@ -0,0 +1,457 @@ +#[cfg(all(feature = "web", feature = "nodejs"))] 
+compile_error!("Features 'web' and 'nodejs' cannot be enabled at the same time"); + +use js_sys::{Array, Object}; +use std::cell::RefCell; +use std::sync::Arc; +use turso_core::{Clock, Instant, OpenFlags, Result}; +use wasm_bindgen::prelude::*; + +#[allow(dead_code)] +#[wasm_bindgen] +pub struct Database { + db: Arc, + conn: Arc, +} + +#[allow(clippy::arc_with_non_send_sync)] +#[wasm_bindgen] +impl Database { + #[wasm_bindgen(constructor)] + pub fn new(path: &str) -> Database { + let io: Arc = Arc::new(PlatformIO { vfs: VFS::new() }); + let file = io.open_file(path, OpenFlags::Create, false).unwrap(); + let db_file = Arc::new(DatabaseFile::new(file)); + let db = turso_core::Database::open(io, path, db_file, false, false).unwrap(); + let conn = db.connect().unwrap(); + Database { db, conn } + } + + #[wasm_bindgen] + pub fn exec(&self, _sql: &str) { + self.conn.execute(_sql).unwrap(); + } + + #[wasm_bindgen] + pub fn prepare(&self, _sql: &str) -> Statement { + let stmt = self.conn.prepare(_sql).unwrap(); + Statement::new(RefCell::new(stmt), false) + } +} + +#[wasm_bindgen] +pub struct RowIterator { + inner: RefCell, +} + +#[wasm_bindgen] +impl RowIterator { + fn new(inner: RefCell) -> Self { + Self { inner } + } + + #[wasm_bindgen] + #[allow(clippy::should_implement_trait)] + pub fn next(&mut self) -> JsValue { + let mut stmt = self.inner.borrow_mut(); + match stmt.step() { + Ok(turso_core::StepResult::Row) => { + let row = stmt.row().unwrap(); + let row_array = Array::new(); + for value in row.get_values() { + let value = to_js_value(value); + row_array.push(&value); + } + JsValue::from(row_array) + } + Ok(turso_core::StepResult::IO) => JsValue::UNDEFINED, + Ok(turso_core::StepResult::Done) | Ok(turso_core::StepResult::Interrupt) => { + JsValue::UNDEFINED + } + + Ok(turso_core::StepResult::Busy) => JsValue::UNDEFINED, + Err(e) => panic!("Error: {e:?}"), + } + } +} + +#[wasm_bindgen] +pub struct Statement { + inner: RefCell, + raw: bool, +} + +#[wasm_bindgen] +impl 
Statement { + fn new(inner: RefCell, raw: bool) -> Self { + Self { inner, raw } + } + + #[wasm_bindgen] + pub fn raw(mut self, toggle: Option) -> Self { + self.raw = toggle.unwrap_or(true); + self + } + + pub fn get(&self) -> JsValue { + let mut stmt = self.inner.borrow_mut(); + match stmt.step() { + Ok(turso_core::StepResult::Row) => { + let row = stmt.row().unwrap(); + let row_array = js_sys::Array::new(); + for value in row.get_values() { + let value = to_js_value(value); + row_array.push(&value); + } + JsValue::from(row_array) + } + + Ok(turso_core::StepResult::IO) + | Ok(turso_core::StepResult::Done) + | Ok(turso_core::StepResult::Interrupt) + | Ok(turso_core::StepResult::Busy) => JsValue::UNDEFINED, + Err(e) => panic!("Error: {e:?}"), + } + } + + pub fn all(&self) -> js_sys::Array { + let array = js_sys::Array::new(); + loop { + let mut stmt = self.inner.borrow_mut(); + match stmt.step() { + Ok(turso_core::StepResult::Row) => { + let row = stmt.row().unwrap(); + let row_array = js_sys::Array::new(); + for value in row.get_values() { + let value = to_js_value(value); + row_array.push(&value); + } + array.push(&row_array); + } + Ok(turso_core::StepResult::IO) => {} + Ok(turso_core::StepResult::Interrupt) => break, + Ok(turso_core::StepResult::Done) => break, + Ok(turso_core::StepResult::Busy) => break, + Err(e) => panic!("Error: {e:?}"), + } + } + array + } + + #[wasm_bindgen] + pub fn iterate(self) -> JsValue { + let iterator = RowIterator::new(self.inner); + let iterator_obj = Object::new(); + + // Define the next method that will be called by JavaScript + let next_fn = js_sys::Function::new_with_args( + "", + "const value = this.iterator.next(); + const done = value === undefined; + return { + value, + done + };", + ); + + js_sys::Reflect::set(&iterator_obj, &JsValue::from_str("next"), &next_fn).unwrap(); + + js_sys::Reflect::set( + &iterator_obj, + &JsValue::from_str("iterator"), + &JsValue::from(iterator), + ) + .unwrap(); + + let symbol_iterator = 
js_sys::Function::new_no_args("return this;"); + js_sys::Reflect::set(&iterator_obj, &js_sys::Symbol::iterator(), &symbol_iterator).unwrap(); + + JsValue::from(iterator_obj) + } +} + +fn to_js_value(value: &turso_core::Value) -> JsValue { + match value { + turso_core::Value::Null => JsValue::null(), + turso_core::Value::Integer(i) => { + let i = *i; + if i >= i32::MIN as i64 && i <= i32::MAX as i64 { + JsValue::from(i as i32) + } else { + JsValue::from(i) + } + } + turso_core::Value::Float(f) => JsValue::from(*f), + turso_core::Value::Text(t) => JsValue::from_str(t.as_str()), + turso_core::Value::Blob(b) => js_sys::Uint8Array::from(b.as_slice()).into(), + } +} + +pub struct File { + vfs: VFS, + fd: i32, +} + +unsafe impl Send for File {} +unsafe impl Sync for File {} + +#[allow(dead_code)] +impl File { + fn new(vfs: VFS, fd: i32) -> Self { + Self { vfs, fd } + } +} + +impl turso_core::File for File { + fn lock_file(&self, _exclusive: bool) -> Result<()> { + // TODO + Ok(()) + } + + fn unlock_file(&self) -> Result<()> { + // TODO + Ok(()) + } + + fn pread( + &self, + pos: usize, + c: Arc, + ) -> Result> { + let r = match c.completion_type { + turso_core::CompletionType::Read(ref r) => r, + _ => unreachable!(), + }; + let nr = { + let mut buf = r.buf_mut(); + let buf: &mut [u8] = buf.as_mut_slice(); + self.vfs.pread(self.fd, buf, pos) + }; + r.complete(nr); + #[allow(clippy::arc_with_non_send_sync)] + Ok(c) + } + + fn pwrite( + &self, + pos: usize, + buffer: Arc>, + c: Arc, + ) -> Result> { + let w = match c.completion_type { + turso_core::CompletionType::Write(ref w) => w, + _ => unreachable!(), + }; + let buf = buffer.borrow(); + let buf: &[u8] = buf.as_slice(); + self.vfs.pwrite(self.fd, buf, pos); + w.complete(buf.len() as i32); + #[allow(clippy::arc_with_non_send_sync)] + Ok(c) + } + + fn sync(&self, c: Arc) -> Result> { + self.vfs.sync(self.fd); + c.complete(0); + #[allow(clippy::arc_with_non_send_sync)] + Ok(c) + } + + fn size(&self) -> Result { + 
Ok(self.vfs.size(self.fd)) + } + + fn truncate(&self, len: u64, c: turso_core::Completion) -> Result> { + self.vfs.truncate(self.fd, len as usize); + c.complete(0); + #[allow(clippy::arc_with_non_send_sync)] + Ok(Arc::new(c)) + } +} + +pub struct PlatformIO { + vfs: VFS, +} +unsafe impl Send for PlatformIO {} +unsafe impl Sync for PlatformIO {} + +impl Clock for PlatformIO { + fn now(&self) -> Instant { + let date = Date::new(); + let ms_since_epoch = date.getTime(); + + Instant { + secs: (ms_since_epoch / 1000.0) as i64, + micros: ((ms_since_epoch % 1000.0) * 1000.0) as u32, + } + } +} + +impl turso_core::IO for PlatformIO { + fn open_file( + &self, + path: &str, + _flags: OpenFlags, + _direct: bool, + ) -> Result> { + let fd = self.vfs.open(path, "a+"); + Ok(Arc::new(File { + vfs: VFS::new(), + fd, + })) + } + + fn wait_for_completion(&self, c: Arc) -> Result<()> { + while !c.is_completed() { + self.run_once()?; + } + Ok(()) + } + + fn run_once(&self) -> Result<()> { + Ok(()) + } + + fn generate_random_number(&self) -> i64 { + let mut buf = [0u8; 8]; + getrandom::getrandom(&mut buf).unwrap(); + i64::from_ne_bytes(buf) + } + + fn get_memory_io(&self) -> Arc { + Arc::new(turso_core::MemoryIO::new()) + } +} + +#[wasm_bindgen] +extern "C" { + type Date; + + #[wasm_bindgen(constructor)] + fn new() -> Date; + + #[wasm_bindgen(method, getter)] + fn toISOString(this: &Date) -> String; + + #[wasm_bindgen(method)] + fn getTime(this: &Date) -> f64; +} + +pub struct DatabaseFile { + file: Arc, +} + +unsafe impl Send for DatabaseFile {} +unsafe impl Sync for DatabaseFile {} + +impl DatabaseFile { + pub fn new(file: Arc) -> Self { + Self { file } + } +} + +impl turso_core::DatabaseStorage for DatabaseFile { + fn read_page(&self, page_idx: usize, c: turso_core::Completion) -> Result<()> { + let r = match c.completion_type { + turso_core::CompletionType::Read(ref r) => r, + _ => unreachable!(), + }; + let size = r.buf().len(); + assert!(page_idx > 0); + if 
!(512..=65536).contains(&size) || size & (size - 1) != 0 { + return Err(turso_core::LimboError::NotADB); + } + let pos = (page_idx - 1) * size; + self.file.pread(pos, c.into())?; + Ok(()) + } + + fn write_page( + &self, + page_idx: usize, + buffer: Arc>, + c: turso_core::Completion, + ) -> Result<()> { + let size = buffer.borrow().len(); + let pos = (page_idx - 1) * size; + self.file.pwrite(pos, buffer, c.into())?; + Ok(()) + } + + fn sync(&self, c: turso_core::Completion) -> Result<()> { + let _ = self.file.sync(c.into())?; + Ok(()) + } + + fn size(&self) -> Result { + self.file.size() + } + + fn truncate(&self, len: u64, c: turso_core::Completion) -> Result<()> { + self.file.truncate(len, c); + Ok(()) + } +} + +#[cfg(all(feature = "web", not(feature = "nodejs")))] +#[wasm_bindgen(module = "/web/src/web-vfs.js")] +extern "C" { + type VFS; + #[wasm_bindgen(constructor)] + fn new() -> VFS; + + #[wasm_bindgen(method)] + fn open(this: &VFS, path: &str, flags: &str) -> i32; + + #[wasm_bindgen(method)] + fn close(this: &VFS, fd: i32) -> bool; + + #[wasm_bindgen(method)] + fn pwrite(this: &VFS, fd: i32, buffer: &[u8], offset: usize) -> i32; + + #[wasm_bindgen(method)] + fn pread(this: &VFS, fd: i32, buffer: &mut [u8], offset: usize) -> i32; + + #[wasm_bindgen(method)] + fn size(this: &VFS, fd: i32) -> u64; + + #[wasm_bindgen(method)] + fn truncate(this: &VFS, fd: i32, len: usize); + + #[wasm_bindgen(method)] + fn sync(this: &VFS, fd: i32); +} + +#[cfg(all(feature = "nodejs", not(feature = "web")))] +#[wasm_bindgen(module = "/node/src/vfs.cjs")] +extern "C" { + type VFS; + #[wasm_bindgen(constructor)] + fn new() -> VFS; + + #[wasm_bindgen(method)] + fn open(this: &VFS, path: &str, flags: &str) -> i32; + + #[wasm_bindgen(method)] + fn close(this: &VFS, fd: i32) -> bool; + + #[wasm_bindgen(method)] + fn pwrite(this: &VFS, fd: i32, buffer: &[u8], offset: usize) -> i32; + + #[wasm_bindgen(method)] + fn pread(this: &VFS, fd: i32, buffer: &mut [u8], offset: usize) -> i32; + + 
#[wasm_bindgen(method)] + fn size(this: &VFS, fd: i32) -> u64; + + #[wasm_bindgen(method)] + fn truncate(this: &VFS, fd: i32, len: usize); + + #[wasm_bindgen(method)] + fn sync(this: &VFS, fd: i32); +} + +#[wasm_bindgen(start)] +pub fn init() { + console_error_panic_hook::set_once(); +} diff --git a/bindings/wasm/node/src/vfs.cjs b/bindings/wasm/node/src/vfs.cjs new file mode 100644 index 000000000..2501c512d --- /dev/null +++ b/bindings/wasm/node/src/vfs.cjs @@ -0,0 +1,37 @@ +const fs = require('node:fs'); + +class VFS { + constructor() { + } + + open(path, flags) { + return fs.openSync(path, flags); + } + + close(fd) { + fs.closeSync(fd); + } + + pread(fd, buffer, offset) { + return fs.readSync(fd, buffer, 0, buffer.length, offset); + } + + pwrite(fd, buffer, offset) { + return fs.writeSync(fd, buffer, 0, buffer.length, offset); + } + + size(fd) { + let stats = fs.fstatSync(fd); + return BigInt(stats.size); + } + + sync(fd) { + fs.fsyncSync(fd); + } + + truncate(fd, size) { + fs.ftruncateSync(fd, size) + } +} + +module.exports = { VFS }; diff --git a/bindings/wasm/web/src/web-vfs.js b/bindings/wasm/web/src/web-vfs.js new file mode 100644 index 000000000..21bdf9c0a --- /dev/null +++ b/bindings/wasm/web/src/web-vfs.js @@ -0,0 +1,33 @@ +export class VFS { + constructor() { + return self.vfs; + } + + open(path, flags) { + return self.vfs.open(path); + } + + close(fd) { + return self.vfs.close(fd); + } + + pread(fd, buffer, offset) { + return self.vfs.pread(fd, buffer, offset); + } + + pwrite(fd, buffer, offset) { + return self.vfs.pwrite(fd, buffer, offset); + } + + size(fd) { + return self.vfs.size(fd); + } + + sync(fd) { + return self.vfs.sync(fd); + } + + truncate(fd, size) { + return self.vfs.truncate(fd, size) + } +} From 52f63b2af089b9a23dec04fae01ac1bdeaf57fc0 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 23:25:11 -0400 Subject: [PATCH 08/62] Impl truncate for simulator --- simulator/runner/file.rs | 32 ++++++++++++++++++++++++++++++++ 1 file
changed, 32 insertions(+) diff --git a/simulator/runner/file.rs b/simulator/runner/file.rs index 1be2cb48b..f78ae3287 100644 --- a/simulator/runner/file.rs +++ b/simulator/runner/file.rs @@ -225,6 +225,38 @@ impl File for SimulatorFile { fn size(&self) -> Result { self.inner.size() } + + fn truncate( + &self, + len: usize, + mut c: turso_core::Completion, + ) -> Result> { + if self.fault.get() { + return Err(turso_core::LimboError::InternalError( + FAULT_ERROR_MSG.into(), + )); + } + if let Some(latency) = self.generate_latency_duration() { + let CompletionType::Truncate(truncate_completion) = &mut c.completion_type else { + unreachable!(); + }; + let before = self.rng.borrow_mut().gen_bool(0.5); + let dummy_complete = Box::new(|_| {}); + let prev_complete = + std::mem::replace(&mut truncate_completion.complete, dummy_complete); + let new_complete = move |res| { + if before { + std::thread::sleep(latency); + } + (prev_complete)(res); + if !before { + std::thread::sleep(latency); + } + }; + truncate_completion.complete = Box::new(new_complete); + }; + self.inner.truncate(len, c) + } } impl Drop for SimulatorFile { From 3be8bb374d61053c6943802adc27a1cedbc3942c Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 23:25:47 -0400 Subject: [PATCH 09/62] Use usize for truncate method in place of u64 --- core/io/io_uring.rs | 4 ++-- core/io/memory.rs | 2 +- core/io/mod.rs | 2 +- core/io/unix.rs | 4 ++-- core/io/vfs.rs | 2 +- macros/src/ext/vfs_derive.rs | 8 ++++---- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/core/io/io_uring.rs b/core/io/io_uring.rs index b0f1ed269..be8083032 100644 --- a/core/io/io_uring.rs +++ b/core/io/io_uring.rs @@ -354,10 +354,10 @@ impl File for UringFile { Ok(self.file.metadata()?.len()) } - fn truncate(&self, len: u64, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Completion) -> Result> { let mut io = self.io.borrow_mut(); let truncate = with_fd!(self, |fd| { - io_uring::opcode::Ftruncate::new(fd, 
len) + io_uring::opcode::Ftruncate::new(fd, len as u64) .build() .user_data(io.ring.get_key()) }); diff --git a/core/io/memory.rs b/core/io/memory.rs index 93c248571..037a0b0f7 100644 --- a/core/io/memory.rs +++ b/core/io/memory.rs @@ -174,7 +174,7 @@ impl File for MemoryFile { Ok(c) } - fn truncate(&self, len: u64, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Completion) -> Result> { let new_size = len as usize; if new_size < self.size.get() { // Truncate pages diff --git a/core/io/mod.rs b/core/io/mod.rs index 0113db0eb..10b64339d 100644 --- a/core/io/mod.rs +++ b/core/io/mod.rs @@ -19,7 +19,7 @@ pub trait File: Send + Sync { -> Result; fn sync(&self, c: Completion) -> Result; fn size(&self) -> Result; - fn truncate(&self, len: u64, c: Completion) -> Result>; + fn truncate(&self, len: usize, c: Completion) -> Result>; } #[derive(Debug, Copy, Clone, PartialEq)] diff --git a/core/io/unix.rs b/core/io/unix.rs index 0eee99321..a98943836 100644 --- a/core/io/unix.rs +++ b/core/io/unix.rs @@ -452,9 +452,9 @@ impl File for UnixFile<'_> { } #[instrument(err, skip_all, level = Level::INFO)] - fn truncate(&self, len: u64, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Completion) -> Result> { let file = self.file.borrow(); - let result = file.set_len(len); + let result = file.set_len(len as u64); let c = Arc::new(c); match result { Ok(()) => { diff --git a/core/io/vfs.rs b/core/io/vfs.rs index a34d8dda1..50873636e 100644 --- a/core/io/vfs.rs +++ b/core/io/vfs.rs @@ -166,7 +166,7 @@ impl File for VfsFileImpl { } } - fn truncate(&self, len: u64, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Completion) -> Result> { if self.vfs.is_null() { return Err(LimboError::ExtensionError("VFS is null".to_string())); } diff --git a/macros/src/ext/vfs_derive.rs b/macros/src/ext/vfs_derive.rs index eb2115ac1..5e813410a 100644 --- a/macros/src/ext/vfs_derive.rs +++ b/macros/src/ext/vfs_derive.rs @@ -192,17 +192,17 @@ pub fn 
derive_vfs_module(input: TokenStream) -> TokenStream { } #[no_mangle] - pub unsafe extern "C" fn #trunc_fn_name(file_ptr: *const ::std::ffi::c_void, len: i64) -> i32 { + pub unsafe extern "C" fn #trunc_fn_name(file_ptr: *const ::std::ffi::c_void, len: i64) -> ::turso_ext::ResultCode { if file_ptr.is_null() { - return -1; + return ::turso_ext::ResultCode::Error; } let vfs_file: &mut ::turso_ext::VfsFileImpl = &mut *(file_ptr as *mut ::turso_ext::VfsFileImpl); let file: &mut <#struct_name as ::turso_ext::VfsExtension>::File = &mut *(vfs_file.file as *mut <#struct_name as ::turso_ext::VfsExtension>::File); if <#struct_name as ::turso_ext::VfsExtension>::File::truncate(file, len).is_err() { - return -1; + return ::turso_ext::ResultCode::Error; } - 0 + ::turso_ext::ResultCode::OK } #[no_mangle] From 272a63b56246deeab3cfd98461208c386dd6149f Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 23:26:07 -0400 Subject: [PATCH 10/62] Fix truncate impl in JS bindings --- bindings/javascript/src/lib.rs | 4 ++++ bindings/wasm/lib.rs | 10 +++++++--- core/storage/database.rs | 6 +++--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/bindings/javascript/src/lib.rs b/bindings/javascript/src/lib.rs index 45ce3b958..2c9f874ef 100644 --- a/bindings/javascript/src/lib.rs +++ b/bindings/javascript/src/lib.rs @@ -726,6 +726,10 @@ impl turso_core::DatabaseStorage for DatabaseFile { fn size(&self) -> turso_core::Result { self.file.size() } + fn truncate(&self, len: usize, c: turso_core::Completion) -> turso_core::Result<()> { + let _ = self.file.truncate(len, c)?; + Ok(()) + } } #[inline] diff --git a/bindings/wasm/lib.rs b/bindings/wasm/lib.rs index 23ddebf68..40bafe9b5 100644 --- a/bindings/wasm/lib.rs +++ b/bindings/wasm/lib.rs @@ -261,7 +261,11 @@ impl turso_core::File for File { Ok(self.vfs.size(self.fd)) } - fn truncate(&self, len: u64, c: turso_core::Completion) -> Result> { + fn truncate( + &self, + len: usize, + c: turso_core::Completion, + ) -> Result> { 
self.vfs.truncate(self.fd, len as usize); c.complete(0); #[allow(clippy::arc_with_non_send_sync)] @@ -387,8 +391,8 @@ impl turso_core::DatabaseStorage for DatabaseFile { self.file.size() } - fn truncate(&self, len: u64, c: turso_core::Completion) -> Result<()> { - self.file.truncate(len, c); + fn truncate(&self, len: usize, c: turso_core::Completion) -> Result<()> { + self.file.truncate(len, c)?; Ok(()) } } diff --git a/core/storage/database.rs b/core/storage/database.rs index 422663027..17f17439b 100644 --- a/core/storage/database.rs +++ b/core/storage/database.rs @@ -18,7 +18,7 @@ pub trait DatabaseStorage: Send + Sync { ) -> Result; fn sync(&self, c: Completion) -> Result; fn size(&self) -> Result; - fn truncate(&self, len: u64, c: Completion) -> Result<()>; + fn truncate(&self, len: usize, c: Completion) -> Result<()>; } #[cfg(feature = "fs")] @@ -72,7 +72,7 @@ impl DatabaseStorage for DatabaseFile { } #[instrument(skip_all, level = Level::INFO)] - fn truncate(&self, len: u64, c: Completion) -> Result<()> { + fn truncate(&self, len: usize, c: Completion) -> Result<()> { self.file.truncate(len, c)?; Ok(()) } @@ -131,7 +131,7 @@ impl DatabaseStorage for FileMemoryStorage { } #[instrument(skip_all, level = Level::INFO)] - fn truncate(&self, len: u64, c: Completion) -> Result<()> { + fn truncate(&self, len: usize, c: Completion) -> Result<()> { let _ = self.file.truncate(len, c)?; Ok(()) } From edbc29e2a6d803b5af00c32b55bc35d8277d1b6d Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 23:27:03 -0400 Subject: [PATCH 11/62] Add restart_header and truncate for WAL and tests to prep for truncate checkpoint mode --- core/io/mod.rs | 10 ++- core/storage/wal.rs | 145 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 153 insertions(+), 2 deletions(-) diff --git a/core/io/mod.rs b/core/io/mod.rs index 10b64339d..c1b2e8ac7 100644 --- a/core/io/mod.rs +++ b/core/io/mod.rs @@ -19,7 +19,7 @@ pub trait File: Send + Sync { -> Result; fn sync(&self, c: 
Completion) -> Result; fn size(&self) -> Result; - fn truncate(&self, len: usize, c: Completion) -> Result>; + fn truncate(&self, len: usize, c: Arc) -> Result>; } #[derive(Debug, Copy, Clone, PartialEq)] @@ -116,6 +116,14 @@ impl Completion { )))) } + pub fn new_trunc(complete: F) -> Self + where + F: Fn(i32) + 'static, + { + Self::new(CompletionType::Truncate(TruncateCompletion::new(Box::new( + complete, + )))) + } pub fn is_completed(&self) -> bool { self.inner.is_completed.get() } diff --git a/core/storage/wal.rs b/core/storage/wal.rs index c166d8017..e1cea6635 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -17,7 +17,7 @@ use std::{ }; use crate::fast_lock::SpinLock; -use crate::io::{File, IO}; +use crate::io::{CompletionType, File, IO}; use crate::result::LimboResult; use crate::storage::sqlite3_ondisk::{ begin_read_wal_frame, begin_read_wal_frame_raw, finish_read_page, prepare_wal_frame, @@ -255,6 +255,7 @@ pub trait Wal { fn get_max_frame(&self) -> u64; fn get_min_frame(&self) -> u64; fn rollback(&mut self) -> Result<()>; + fn truncate(&mut self) -> Result<()>; } /// A dummy WAL implementation that does nothing. 
@@ -351,6 +352,9 @@ impl Wal for DummyWAL { fn rollback(&mut self) -> Result<()> { Ok(()) } + fn truncate(&mut self) -> Result<()> { + Ok(()) + } } // Syncing requires a state machine because we need to schedule a sync and then wait until it is @@ -1077,6 +1081,32 @@ impl Wal for WalFile { shared.last_checksum = self.last_checksum; Ok(()) } + + #[instrument(skip_all, level = Level::INFO)] + fn truncate(&mut self) -> Result<()> { + tracing::debug!("truncate_wal_file"); + let shared = self.get_shared(); + let busy = !shared.write_lock.write(); + if busy { + return Err(LimboError::Busy); + } + let busy = !shared.checkpoint_lock.write(); + if busy { + shared.write_lock.unlock(); + return Err(LimboError::Busy); + } + let shared = self.shared.clone(); + let io = self.io.clone(); + let c = Completion::new_trunc(move |_| { + let shared = shared.clone(); + let shared = unsafe { shared.get().as_mut().unwrap() }; + let io = io.clone(); + let _ = shared.restart_wal_header(&io); + }); + let shared = self.get_shared(); + shared.file.truncate(WAL_HEADER_SIZE, c.into())?; + Ok(()) + } } impl WalFile { @@ -1264,4 +1294,117 @@ impl WalFileShared { pub fn page_size(&self) -> u32 { self.wal_header.lock().page_size } + + /// Called after a successful RESTART/TRUNCATE checkpoint when + /// all frames are back‑filled. + /// + /// sqlite3/src/wal.c + /// The following is guaranteed when this function is called: + /// + /// a) the WRITER lock is held, + /// b) the entire log file has been checkpointed, and + /// c) any existing readers are reading exclusively from the database + /// file - there are no readers that may attempt to read a frame from + /// the log file. + /// + /// This function updates the shared-memory structures so that the next + /// client to write to the database (which may be this one) does so by + /// writing frames into the start of the log file. 
+ fn restart_wal_header(&mut self, io: &Arc) -> Result<()> { + // bump checkpoint sequence + let mut hdr = self.wal_header.lock(); + hdr.checkpoint_seq = hdr.checkpoint_seq.wrapping_add(1); + + // reset frame counters + hdr.checksum_1 = 0; + hdr.checksum_2 = 0; + self.max_frame.store(0, Ordering::SeqCst); + self.nbackfills.store(0, Ordering::SeqCst); + + // update salts. increment the first and generate a new random one for the second + hdr.salt_1 = hdr.salt_1.wrapping_add(1); // aSalt[0]++ + hdr.salt_2 = io.generate_random_number() as u32; + + // rewrite header on disk + sqlite3_ondisk::begin_write_wal_header(&self.file, &hdr)?; + + // clear per‑page caches + self.frame_cache.lock().clear(); + self.pages_in_frames.lock().clear(); + self.last_checksum = (hdr.checksum_1, hdr.checksum_2); + + // reset read‑marks + self.read_locks[0].value.store(0, Ordering::SeqCst); // always 0 + self.read_locks[1].value.store(0, Ordering::SeqCst); // available + for lock in &self.read_locks[2..] { + lock.value.store(READMARK_NOT_USED, Ordering::SeqCst); + } + Ok(()) + } +} + +#[cfg(test)] +pub mod test { + use crate::{ + storage::sqlite3_ondisk::WAL_HEADER_SIZE, Completion, CompletionType, Database, MemoryIO, + IO, + }; + use std::{cell::Cell, rc::Rc, sync::Arc}; + use tempfile::TempDir; + + #[allow(clippy::arc_with_non_send_sync)] + fn get_database() -> Arc { + let mut path = TempDir::new().unwrap().keep(); + path.push("test.db"); + { + let connection = rusqlite::Connection::open(&path).unwrap(); + connection + .pragma_update(None, "journal_mode", "wal") + .unwrap(); + } + let io: Arc = Arc::new(MemoryIO::new()); + let db = Database::open_file(io.clone(), path.to_str().unwrap(), false, false).unwrap(); + db + } + #[test] + fn test_truncate_file() { + let db = get_database(); + let conn = db.connect().unwrap(); + conn.execute("create table test (id integer primary key, value text)") + .unwrap(); + let _ = conn.execute("insert into test (value) values ('test1'), ('test2'), 
('test3')"); + let wal = db.maybe_shared_wal.write(); + let wal_file = wal.as_ref().unwrap(); + let file = unsafe { &mut *wal_file.get() }; + let done = Rc::new(Cell::new(false)); + let _done = done.clone(); + let _ = file.file.truncate( + WAL_HEADER_SIZE, + Completion::new(CompletionType::Truncate( + crate::io::TruncateCompletion::new(Box::new(move |_| { + let done = _done.clone(); + done.set(true); + })), + )), + ); + assert!(file.file.size().unwrap() == WAL_HEADER_SIZE as u64); + assert!(done.get()); + } + + #[test] + fn test_truncate_wal() { + let db = get_database(); + let conn = db.connect().unwrap(); + conn.execute("create table test (id integer primary key, value text)") + .unwrap(); + for _i in 0..100 { + let _ = conn.execute("insert into test (value) values ('test1'), ('test2'), ('test3')"); + } + let file = conn.pager.borrow_mut(); + let _ = file.cacheflush(); + let mut wal = file.wal.borrow_mut(); + wal.truncate().unwrap(); + drop(wal); + assert_eq!(file.wal_frame_count().unwrap(), 0); + } } From f22313575f9fd2ab54fb047a228e14de0f448510 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 23:51:36 -0400 Subject: [PATCH 12/62] Fix windows method to take usize in place of u64 --- core/io/windows.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/io/windows.rs b/core/io/windows.rs index ed21257fd..263093af6 100644 --- a/core/io/windows.rs +++ b/core/io/windows.rs @@ -123,7 +123,7 @@ impl File for WindowsFile { } #[instrument(err, skip_all, level = Level::TRACE)] - fn truncate(&self, len: u64, c: Completion) -> Result<()> { + fn truncate(&self, len: usize, c: Completion) -> Result> { let mut file = self.file.borrow_mut(); file.set_len(len).map_err(LimboError::IOError)?; c.complete(0); From cfbe9d5b9e394901e31b4963f5e0f33f5958a764 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 17 Jul 2025 23:59:00 -0400 Subject: [PATCH 13/62] Fix clippy error casting same size --- core/io/generic.rs | 5 +++-- core/io/memory.rs | 7 
+++---- core/io/windows.rs | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/core/io/generic.rs b/core/io/generic.rs index ac40e5757..005de9553 100644 --- a/core/io/generic.rs +++ b/core/io/generic.rs @@ -121,9 +121,10 @@ impl File for GenericFile { Ok(c) } - fn truncate(&self, len: u64, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Completion) -> Result> { let mut file = self.file.borrow_mut(); - file.set_len(len).map_err(|err| LimboError::IOError(err))?; + file.set_len(len as u64) + .map_err(|err| LimboError::IOError(err))?; c.complete(0); Ok(Arc::new(c)) } diff --git a/core/io/memory.rs b/core/io/memory.rs index 037a0b0f7..ee9903801 100644 --- a/core/io/memory.rs +++ b/core/io/memory.rs @@ -175,15 +175,14 @@ impl File for MemoryFile { } fn truncate(&self, len: usize, c: Completion) -> Result> { - let new_size = len as usize; - if new_size < self.size.get() { + if len < self.size.get() { // Truncate pages unsafe { let pages = &mut *self.pages.get(); - pages.retain(|&k, _| k * PAGE_SIZE < new_size); + pages.retain(|&k, _| k * PAGE_SIZE < len); } } - self.size.set(new_size); + self.size.set(len); c.complete(0); Ok(Arc::new(c)) } diff --git a/core/io/windows.rs b/core/io/windows.rs index 263093af6..418faf59f 100644 --- a/core/io/windows.rs +++ b/core/io/windows.rs @@ -125,7 +125,7 @@ impl File for WindowsFile { #[instrument(err, skip_all, level = Level::TRACE)] fn truncate(&self, len: usize, c: Completion) -> Result> { let mut file = self.file.borrow_mut(); - file.set_len(len).map_err(LimboError::IOError)?; + file.set_len(len as u64).map_err(LimboError::IOError)?; c.complete(0); Ok(()) } From dcf346b9b092ac88ed066ff780536b316492db03 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Fri, 18 Jul 2025 00:20:06 -0400 Subject: [PATCH 14/62] Update apparently very old branch to new api --- bindings/javascript/src/lib.rs | 2 +- bindings/wasm/lib.rs | 8 ++++---- core/io/generic.rs | 4 ++-- core/io/io_uring.rs | 3 +-- core/io/memory.rs | 4 
++-- core/io/unix.rs | 3 +-- core/io/vfs.rs | 4 ++-- core/io/windows.rs | 4 ++-- core/storage/database.rs | 6 +++--- core/storage/wal.rs | 17 ++++++----------- simulator/runner/file.rs | 31 +++++++++++-------------------- 11 files changed, 35 insertions(+), 51 deletions(-) diff --git a/bindings/javascript/src/lib.rs b/bindings/javascript/src/lib.rs index 2c9f874ef..7a8ed46c1 100644 --- a/bindings/javascript/src/lib.rs +++ b/bindings/javascript/src/lib.rs @@ -726,7 +726,7 @@ impl turso_core::DatabaseStorage for DatabaseFile { fn size(&self) -> turso_core::Result { self.file.size() } - fn truncate(&self, len: usize, c: turso_core::Completion) -> turso_core::Result<()> { + fn truncate(&self, len: usize, c: Arc) -> turso_core::Result<()> { let _ = self.file.truncate(len, c)?; Ok(()) } diff --git a/bindings/wasm/lib.rs b/bindings/wasm/lib.rs index 40bafe9b5..2e9e49984 100644 --- a/bindings/wasm/lib.rs +++ b/bindings/wasm/lib.rs @@ -264,12 +264,12 @@ impl turso_core::File for File { fn truncate( &self, len: usize, - c: turso_core::Completion, + c: Arc, ) -> Result> { - self.vfs.truncate(self.fd, len as usize); + self.vfs.truncate(self.fd, len); c.complete(0); #[allow(clippy::arc_with_non_send_sync)] - Ok(Arc::new(c)) + Ok(c) } } @@ -391,7 +391,7 @@ impl turso_core::DatabaseStorage for DatabaseFile { self.file.size() } - fn truncate(&self, len: usize, c: turso_core::Completion) -> Result<()> { + fn truncate(&self, len: usize, c: Arc) -> Result<()> { self.file.truncate(len, c)?; Ok(()) } diff --git a/core/io/generic.rs b/core/io/generic.rs index 005de9553..cb70ea068 100644 --- a/core/io/generic.rs +++ b/core/io/generic.rs @@ -121,12 +121,12 @@ impl File for GenericFile { Ok(c) } - fn truncate(&self, len: usize, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Arc) -> Result> { let mut file = self.file.borrow_mut(); file.set_len(len as u64) .map_err(|err| LimboError::IOError(err))?; c.complete(0); - Ok(Arc::new(c)) + Ok(c) } fn size(&self) -> Result { diff 
--git a/core/io/io_uring.rs b/core/io/io_uring.rs index be8083032..4b02aa516 100644 --- a/core/io/io_uring.rs +++ b/core/io/io_uring.rs @@ -354,14 +354,13 @@ impl File for UringFile { Ok(self.file.metadata()?.len()) } - fn truncate(&self, len: usize, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Arc) -> Result> { let mut io = self.io.borrow_mut(); let truncate = with_fd!(self, |fd| { io_uring::opcode::Ftruncate::new(fd, len as u64) .build() .user_data(io.ring.get_key()) }); - let c = Arc::new(c); io.ring.submit_entry(&truncate, c.clone()); Ok(c) } diff --git a/core/io/memory.rs b/core/io/memory.rs index ee9903801..df9161453 100644 --- a/core/io/memory.rs +++ b/core/io/memory.rs @@ -174,7 +174,7 @@ impl File for MemoryFile { Ok(c) } - fn truncate(&self, len: usize, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Arc) -> Result> { if len < self.size.get() { // Truncate pages unsafe { @@ -184,7 +184,7 @@ impl File for MemoryFile { } self.size.set(len); c.complete(0); - Ok(Arc::new(c)) + Ok(c) } fn size(&self) -> Result { diff --git a/core/io/unix.rs b/core/io/unix.rs index a98943836..0d388044b 100644 --- a/core/io/unix.rs +++ b/core/io/unix.rs @@ -452,10 +452,9 @@ impl File for UnixFile<'_> { } #[instrument(err, skip_all, level = Level::INFO)] - fn truncate(&self, len: usize, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Arc) -> Result> { let file = self.file.borrow(); let result = file.set_len(len as u64); - let c = Arc::new(c); match result { Ok(()) => { trace!("file truncated to len=({})", len); diff --git a/core/io/vfs.rs b/core/io/vfs.rs index 50873636e..25a52263a 100644 --- a/core/io/vfs.rs +++ b/core/io/vfs.rs @@ -166,7 +166,7 @@ impl File for VfsFileImpl { } } - fn truncate(&self, len: usize, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Arc) -> Result> { if self.vfs.is_null() { return Err(LimboError::ExtensionError("VFS is null".to_string())); } @@ -176,7 +176,7 @@ impl File for VfsFileImpl 
{ Err(LimboError::ExtensionError("truncate failed".to_string())) } else { c.complete(0); - Ok(Arc::new(c)) + Ok(c) } } } diff --git a/core/io/windows.rs b/core/io/windows.rs index 418faf59f..2ae910b9b 100644 --- a/core/io/windows.rs +++ b/core/io/windows.rs @@ -123,11 +123,11 @@ impl File for WindowsFile { } #[instrument(err, skip_all, level = Level::TRACE)] - fn truncate(&self, len: usize, c: Completion) -> Result> { + fn truncate(&self, len: usize, c: Arc) -> Result> { let mut file = self.file.borrow_mut(); file.set_len(len as u64).map_err(LimboError::IOError)?; c.complete(0); - Ok(()) + Ok(c) } fn size(&self) -> Result { diff --git a/core/storage/database.rs b/core/storage/database.rs index 17f17439b..c7560f8fe 100644 --- a/core/storage/database.rs +++ b/core/storage/database.rs @@ -18,7 +18,7 @@ pub trait DatabaseStorage: Send + Sync { ) -> Result; fn sync(&self, c: Completion) -> Result; fn size(&self) -> Result; - fn truncate(&self, len: usize, c: Completion) -> Result<()>; + fn truncate(&self, len: usize, c: Arc) -> Result<()>; } #[cfg(feature = "fs")] @@ -72,7 +72,7 @@ impl DatabaseStorage for DatabaseFile { } #[instrument(skip_all, level = Level::INFO)] - fn truncate(&self, len: usize, c: Completion) -> Result<()> { + fn truncate(&self, len: usize, c: Arc) -> Result<()> { self.file.truncate(len, c)?; Ok(()) } @@ -131,7 +131,7 @@ impl DatabaseStorage for FileMemoryStorage { } #[instrument(skip_all, level = Level::INFO)] - fn truncate(&self, len: usize, c: Completion) -> Result<()> { + fn truncate(&self, len: usize, c: Arc) -> Result<()> { let _ = self.file.truncate(len, c)?; Ok(()) } diff --git a/core/storage/wal.rs b/core/storage/wal.rs index e1cea6635..9cb997f9f 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -17,7 +17,7 @@ use std::{ }; use crate::fast_lock::SpinLock; -use crate::io::{CompletionType, File, IO}; +use crate::io::{File, IO}; use crate::result::LimboResult; use crate::storage::sqlite3_ondisk::{ begin_read_wal_frame, 
begin_read_wal_frame_raw, finish_read_page, prepare_wal_frame, @@ -1345,10 +1345,7 @@ impl WalFileShared { #[cfg(test)] pub mod test { - use crate::{ - storage::sqlite3_ondisk::WAL_HEADER_SIZE, Completion, CompletionType, Database, MemoryIO, - IO, - }; + use crate::{storage::sqlite3_ondisk::WAL_HEADER_SIZE, Completion, Database, MemoryIO, IO}; use std::{cell::Cell, rc::Rc, sync::Arc}; use tempfile::TempDir; @@ -1380,12 +1377,10 @@ pub mod test { let _done = done.clone(); let _ = file.file.truncate( WAL_HEADER_SIZE, - Completion::new(CompletionType::Truncate( - crate::io::TruncateCompletion::new(Box::new(move |_| { - let done = _done.clone(); - done.set(true); - })), - )), + Arc::new(Completion::new_trunc(move |_| { + let done = _done.clone(); + done.set(true); + })), ); assert!(file.file.size().unwrap() == WAL_HEADER_SIZE as u64); assert!(done.get()); diff --git a/simulator/runner/file.rs b/simulator/runner/file.rs index f78ae3287..928b51d5a 100644 --- a/simulator/runner/file.rs +++ b/simulator/runner/file.rs @@ -229,33 +229,24 @@ impl File for SimulatorFile { fn truncate( &self, len: usize, - mut c: turso_core::Completion, + c: Arc, ) -> Result> { if self.fault.get() { return Err(turso_core::LimboError::InternalError( FAULT_ERROR_MSG.into(), )); } - if let Some(latency) = self.generate_latency_duration() { - let CompletionType::Truncate(truncate_completion) = &mut c.completion_type else { - unreachable!(); - }; - let before = self.rng.borrow_mut().gen_bool(0.5); - let dummy_complete = Box::new(|_| {}); - let prev_complete = - std::mem::replace(&mut truncate_completion.complete, dummy_complete); - let new_complete = move |res| { - if before { - std::thread::sleep(latency); - } - (prev_complete)(res); - if !before { - std::thread::sleep(latency); - } - }; - truncate_completion.complete = Box::new(new_complete); + let c = if let Some(latency) = self.generate_latency_duration() { + let cloned_c = c.clone(); + let op = Box::new(move |file: &SimulatorFile| 
file.inner.truncate(len, cloned_c)); + self.queued_io + .borrow_mut() + .push(DelayedIo { time: latency, op }); + c + } else { + self.inner.truncate(len, c)? }; - self.inner.truncate(len, c) + Ok(c) } } From 9c75872827b2d57f4328d43a5e46c78832bf89a7 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Fri, 18 Jul 2025 14:46:19 -0400 Subject: [PATCH 15/62] Make writing wal header able to be sync --- bindings/javascript/src/lib.rs | 9 ++++++--- bindings/wasm/lib.rs | 18 +++++++++--------- core/io/generic.rs | 2 +- core/io/io_uring.rs | 2 +- core/io/memory.rs | 2 +- core/io/mod.rs | 2 +- core/io/unix.rs | 4 ++-- core/io/vfs.rs | 2 +- core/io/windows.rs | 2 +- core/storage/database.rs | 12 +++++------- core/storage/sqlite3_ondisk.rs | 3 ++- core/storage/wal.rs | 32 ++++++++++++++++---------------- simulator/runner/file.rs | 6 +----- 13 files changed, 47 insertions(+), 49 deletions(-) diff --git a/bindings/javascript/src/lib.rs b/bindings/javascript/src/lib.rs index 7a8ed46c1..470bdfbc0 100644 --- a/bindings/javascript/src/lib.rs +++ b/bindings/javascript/src/lib.rs @@ -726,9 +726,12 @@ impl turso_core::DatabaseStorage for DatabaseFile { fn size(&self) -> turso_core::Result { self.file.size() } - fn truncate(&self, len: usize, c: Arc) -> turso_core::Result<()> { - let _ = self.file.truncate(len, c)?; - Ok(()) + fn truncate( + &self, + len: usize, + c: turso_core::Completion, + ) -> turso_core::Result { + self.file.truncate(len, c) } } diff --git a/bindings/wasm/lib.rs b/bindings/wasm/lib.rs index 2e9e49984..1996d073f 100644 --- a/bindings/wasm/lib.rs +++ b/bindings/wasm/lib.rs @@ -216,8 +216,8 @@ impl turso_core::File for File { fn pread( &self, pos: usize, - c: Arc, - ) -> Result> { + c: turso_core::Completion, + ) -> Result { let r = match c.completion_type { turso_core::CompletionType::Read(ref r) => r, _ => unreachable!(), @@ -236,8 +236,8 @@ impl turso_core::File for File { &self, pos: usize, buffer: Arc>, - c: Arc, - ) -> Result> { + c: turso_core::Completion, + ) -> 
Result { let w = match c.completion_type { turso_core::CompletionType::Write(ref w) => w, _ => unreachable!(), @@ -250,7 +250,7 @@ impl turso_core::File for File { Ok(c) } - fn sync(&self, c: Arc) -> Result> { + fn sync(&self, c: turso_core::Completion) -> Result { self.vfs.sync(self.fd); c.complete(0); #[allow(clippy::arc_with_non_send_sync)] @@ -264,8 +264,8 @@ impl turso_core::File for File { fn truncate( &self, len: usize, - c: Arc, - ) -> Result> { + c: turso_core::Completion, + ) -> Result { self.vfs.truncate(self.fd, len); c.complete(0); #[allow(clippy::arc_with_non_send_sync)] @@ -305,7 +305,7 @@ impl turso_core::IO for PlatformIO { })) } - fn wait_for_completion(&self, c: Arc) -> Result<()> { + fn wait_for_completion(&self, c: turso_core::Completion) -> Result<()> { while !c.is_completed() { self.run_once()?; } @@ -391,7 +391,7 @@ impl turso_core::DatabaseStorage for DatabaseFile { self.file.size() } - fn truncate(&self, len: usize, c: Arc) -> Result<()> { + fn truncate(&self, len: usize, c: turso_core::Completion) -> Result<()> { self.file.truncate(len, c)?; Ok(()) } diff --git a/core/io/generic.rs b/core/io/generic.rs index cb70ea068..5bfda1db0 100644 --- a/core/io/generic.rs +++ b/core/io/generic.rs @@ -121,7 +121,7 @@ impl File for GenericFile { Ok(c) } - fn truncate(&self, len: usize, c: Arc) -> Result> { + fn truncate(&self, len: usize, c: Completion) -> Result { let mut file = self.file.borrow_mut(); file.set_len(len as u64) .map_err(|err| LimboError::IOError(err))?; diff --git a/core/io/io_uring.rs b/core/io/io_uring.rs index 4b02aa516..fe3e8d6dd 100644 --- a/core/io/io_uring.rs +++ b/core/io/io_uring.rs @@ -354,7 +354,7 @@ impl File for UringFile { Ok(self.file.metadata()?.len()) } - fn truncate(&self, len: usize, c: Arc) -> Result> { + fn truncate(&self, len: usize, c: Completion) -> Result { let mut io = self.io.borrow_mut(); let truncate = with_fd!(self, |fd| { io_uring::opcode::Ftruncate::new(fd, len as u64) diff --git a/core/io/memory.rs 
b/core/io/memory.rs index df9161453..7dbf05d50 100644 --- a/core/io/memory.rs +++ b/core/io/memory.rs @@ -174,7 +174,7 @@ impl File for MemoryFile { Ok(c) } - fn truncate(&self, len: usize, c: Arc) -> Result> { + fn truncate(&self, len: usize, c: Completion) -> Result { if len < self.size.get() { // Truncate pages unsafe { diff --git a/core/io/mod.rs b/core/io/mod.rs index c1b2e8ac7..82ef51313 100644 --- a/core/io/mod.rs +++ b/core/io/mod.rs @@ -19,7 +19,7 @@ pub trait File: Send + Sync { -> Result; fn sync(&self, c: Completion) -> Result; fn size(&self) -> Result; - fn truncate(&self, len: usize, c: Arc) -> Result>; + fn truncate(&self, len: usize, c: Completion) -> Result; } #[derive(Debug, Copy, Clone, PartialEq)] diff --git a/core/io/unix.rs b/core/io/unix.rs index 0d388044b..cf7245b88 100644 --- a/core/io/unix.rs +++ b/core/io/unix.rs @@ -452,8 +452,8 @@ impl File for UnixFile<'_> { } #[instrument(err, skip_all, level = Level::INFO)] - fn truncate(&self, len: usize, c: Arc) -> Result> { - let file = self.file.borrow(); + fn truncate(&self, len: usize, c: Completion) -> Result { + let file = self.file.lock().unwrap(); let result = file.set_len(len as u64); match result { Ok(()) => { diff --git a/core/io/vfs.rs b/core/io/vfs.rs index 25a52263a..beb535aff 100644 --- a/core/io/vfs.rs +++ b/core/io/vfs.rs @@ -166,7 +166,7 @@ impl File for VfsFileImpl { } } - fn truncate(&self, len: usize, c: Arc) -> Result> { + fn truncate(&self, len: usize, c: Completion) -> Result { if self.vfs.is_null() { return Err(LimboError::ExtensionError("VFS is null".to_string())); } diff --git a/core/io/windows.rs b/core/io/windows.rs index 2ae910b9b..b7c5b06cc 100644 --- a/core/io/windows.rs +++ b/core/io/windows.rs @@ -123,7 +123,7 @@ impl File for WindowsFile { } #[instrument(err, skip_all, level = Level::TRACE)] - fn truncate(&self, len: usize, c: Arc) -> Result> { + fn truncate(&self, len: usize, c: Completion) -> Result { let mut file = self.file.borrow_mut(); file.set_len(len as 
u64).map_err(LimboError::IOError)?; c.complete(0); diff --git a/core/storage/database.rs b/core/storage/database.rs index c7560f8fe..1ff74c95e 100644 --- a/core/storage/database.rs +++ b/core/storage/database.rs @@ -18,7 +18,7 @@ pub trait DatabaseStorage: Send + Sync { ) -> Result; fn sync(&self, c: Completion) -> Result; fn size(&self) -> Result; - fn truncate(&self, len: usize, c: Arc) -> Result<()>; + fn truncate(&self, len: usize, c: Completion) -> Result; } #[cfg(feature = "fs")] @@ -72,9 +72,8 @@ impl DatabaseStorage for DatabaseFile { } #[instrument(skip_all, level = Level::INFO)] - fn truncate(&self, len: usize, c: Arc) -> Result<()> { - self.file.truncate(len, c)?; - Ok(()) + fn truncate(&self, len: usize, c: Completion) -> Result { + self.file.truncate(len, c) } } @@ -131,9 +130,8 @@ impl DatabaseStorage for FileMemoryStorage { } #[instrument(skip_all, level = Level::INFO)] - fn truncate(&self, len: usize, c: Arc) -> Result<()> { - let _ = self.file.truncate(len, c)?; - Ok(()) + fn truncate(&self, len: usize, c: Completion) -> Result { + self.file.truncate(len, c) } } diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index 46cb738b4..829f049b6 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -1695,7 +1695,8 @@ pub fn begin_write_wal_header(io: &Arc, header: &WalHeader) -> Result< }; #[allow(clippy::arc_with_non_send_sync)] let c = Completion::new_write(write_complete); - io.pwrite(0, buffer.clone(), c) + let c = io.pwrite(0, buffer.clone(), c.clone())?; + Ok(c) } /// Checks if payload will overflow a cell based on the maximum allowed size. 
diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 9cb997f9f..d99724c95 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1101,10 +1101,14 @@ impl Wal for WalFile { let shared = shared.clone(); let shared = unsafe { shared.get().as_mut().unwrap() }; let io = io.clone(); - let _ = shared.restart_wal_header(&io); + shared.restart_wal_header(&io).unwrap(); }); let shared = self.get_shared(); - shared.file.truncate(WAL_HEADER_SIZE, c.into())?; + let c_cloned = c.clone(); + let c = shared.file.truncate(WAL_HEADER_SIZE, c_cloned.clone())?; + // ensure that the header is written and not just this completion fires + self.io.wait_for_completion(c_cloned)?; + self.io.wait_for_completion(c)?; Ok(()) } } @@ -1260,19 +1264,14 @@ impl WalFileShared { let c = sqlite3_ondisk::begin_write_wal_header(&file, &wal_header)?; // TODO: for now wait for completion io.wait_for_completion(c)?; - let header = Arc::new(SpinLock::new(wal_header)); - let checksum = { - let checksum = header.lock(); - (checksum.checksum_1, checksum.checksum_2) - }; - tracing::debug!("new_shared(header={:?})", header); + tracing::debug!("new_shared(header={:?})", wal_header); let shared = WalFileShared { - wal_header: header, + wal_header: Arc::new(SpinLock::new(wal_header)), min_frame: AtomicU64::new(0), max_frame: AtomicU64::new(0), nbackfills: AtomicU64::new(0), frame_cache: Arc::new(SpinLock::new(HashMap::new())), - last_checksum: checksum, + last_checksum: (checksums.0, checksums.1), file, pages_in_frames: Arc::new(SpinLock::new(Vec::new())), read_locks: array::from_fn(|_| LimboRwLock { @@ -1326,16 +1325,17 @@ impl WalFileShared { hdr.salt_2 = io.generate_random_number() as u32; // rewrite header on disk - sqlite3_ondisk::begin_write_wal_header(&self.file, &hdr)?; + let c = sqlite3_ondisk::begin_write_wal_header(&self.file, &hdr)?; + // TODO: for now wait for completion + io.wait_for_completion(c)?; - // clear per‑page caches self.frame_cache.lock().clear(); 
self.pages_in_frames.lock().clear(); self.last_checksum = (hdr.checksum_1, hdr.checksum_2); // reset read‑marks - self.read_locks[0].value.store(0, Ordering::SeqCst); // always 0 - self.read_locks[1].value.store(0, Ordering::SeqCst); // available + self.read_locks[0].value.store(0, Ordering::SeqCst); + self.read_locks[1].value.store(0, Ordering::SeqCst); for lock in &self.read_locks[2..] { lock.value.store(READMARK_NOT_USED, Ordering::SeqCst); } @@ -1377,10 +1377,10 @@ pub mod test { let _done = done.clone(); let _ = file.file.truncate( WAL_HEADER_SIZE, - Arc::new(Completion::new_trunc(move |_| { + Completion::new_trunc(move |_| { let done = _done.clone(); done.set(true); - })), + }), ); assert!(file.file.size().unwrap() == WAL_HEADER_SIZE as u64); assert!(done.get()); diff --git a/simulator/runner/file.rs b/simulator/runner/file.rs index 928b51d5a..ba3680333 100644 --- a/simulator/runner/file.rs +++ b/simulator/runner/file.rs @@ -226,11 +226,7 @@ impl File for SimulatorFile { self.inner.size() } - fn truncate( - &self, - len: usize, - c: Arc, - ) -> Result> { + fn truncate(&self, len: usize, c: turso_core::Completion) -> Result { if self.fault.get() { return Err(turso_core::LimboError::InternalError( FAULT_ERROR_MSG.into(), From ad286bb8732c35710e4a5516391dfcbb326ffc44 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Fri, 18 Jul 2025 15:01:46 -0400 Subject: [PATCH 16/62] Use new wait_for_completion for sync IO --- core/storage/wal.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index d99724c95..69515f97b 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1262,9 +1262,13 @@ impl WalFileShared { wal_header.checksum_1 = checksums.0; wal_header.checksum_2 = checksums.1; let c = sqlite3_ondisk::begin_write_wal_header(&file, &wal_header)?; - // TODO: for now wait for completion + let header = Arc::new(SpinLock::new(wal_header)); + let checksum = { + let checksum = header.lock(); + 
(checksum.checksum_1, checksum.checksum_2) + }; io.wait_for_completion(c)?; - tracing::debug!("new_shared(header={:?})", wal_header); + tracing::debug!("new_shared(header={:?})", header); let shared = WalFileShared { wal_header: Arc::new(SpinLock::new(wal_header)), min_frame: AtomicU64::new(0), @@ -1326,7 +1330,6 @@ impl WalFileShared { // rewrite header on disk let c = sqlite3_ondisk::begin_write_wal_header(&self.file, &hdr)?; - // TODO: for now wait for completion io.wait_for_completion(c)?; self.frame_cache.lock().clear(); From 441319e774e83f3241f422d7ae27ec53c2b5a9c9 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Fri, 18 Jul 2025 15:27:19 -0400 Subject: [PATCH 17/62] Fix todo! in vfs IO preventing write wal header from using wait_for_completion --- core/io/vfs.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/core/io/vfs.rs b/core/io/vfs.rs index beb535aff..256ba4494 100644 --- a/core/io/vfs.rs +++ b/core/io/vfs.rs @@ -43,10 +43,8 @@ impl IO for VfsMod { Ok(()) } - fn wait_for_completion(&self, c: Completion) -> Result<()> { - while !c.is_completed() { - self.run_once()?; - } + fn wait_for_completion(&self, _c: Completion) -> Result<()> { + // for the moment anyway, this is currently a sync api Ok(()) } From f78c6af51ab4b78c03789f4a16b4ccae6e08ed9a Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 14:17:54 -0400 Subject: [PATCH 18/62] Display error when unable to open file from within cli --- cli/app.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cli/app.rs b/cli/app.rs index fa98d04a8..2b422af86 100644 --- a/cli/app.rs +++ b/cli/app.rs @@ -613,8 +613,8 @@ impl Limbo { std::process::exit(0) } Command::Open(args) => { - if self.open_db(&args.path, args.vfs_name.as_deref()).is_err() { - let _ = self.writeln("Error: Unable to open database file."); + if let Err(e) = self.open_db(&args.path, args.vfs_name.as_deref()) { + let _ = self.writeln(e.to_string()); } } Command::Schema(args) => { From 
6b71efa5b6ce05babd5582b8ada0af1ff6dd8063 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 14:18:34 -0400 Subject: [PATCH 19/62] Accept mode argument for checkpoint in connection api --- core/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/lib.rs b/core/lib.rs index 2c1e73beb..ec0b7a56f 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -1188,13 +1188,13 @@ impl Connection { Ok(()) } - pub fn checkpoint(&self) -> Result { + pub fn checkpoint(&self, mode: CheckpointMode) -> Result { if self.closed.get() { return Err(LimboError::InternalError("Connection closed".to_string())); } self.pager .borrow() - .wal_checkpoint(self.wal_checkpoint_disabled.get()) + .wal_checkpoint(self.wal_checkpoint_disabled.get(), mode) } /// Close a connection and checkpoint. From 1a9b7ef76edb467a84a7a387fe56b7815cf68127 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 14:19:08 -0400 Subject: [PATCH 20/62] Add support for truncate, restart and full checkpointing methods --- core/storage/pager.rs | 13 +- core/storage/wal.rs | 738 +++++++++++++++++++++++++++++++++++------- 2 files changed, 620 insertions(+), 131 deletions(-) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 3a73a96a1..1fe798ab6 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -1283,17 +1283,18 @@ impl Pager { _attempts += 1; } } - self.wal_checkpoint(wal_checkpoint_disabled)?; + self.wal_checkpoint(wal_checkpoint_disabled, CheckpointMode::Passive)?; Ok(()) } #[instrument(skip_all, level = Level::DEBUG)] - pub fn wal_checkpoint(&self, wal_checkpoint_disabled: bool) -> Result { + pub fn wal_checkpoint( + &self, + wal_checkpoint_disabled: bool, + mode: CheckpointMode, + ) -> Result { if wal_checkpoint_disabled { - return Ok(CheckpointResult { - num_wal_frames: 0, - num_checkpointed_frames: 0, - }); + return Ok(CheckpointResult::default()); } let checkpoint_result = self.io.block(|| { diff --git a/core/storage/wal.rs b/core/storage/wal.rs 
index 69515f97b..322c2f8cf 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -19,6 +19,7 @@ use std::{ use crate::fast_lock::SpinLock; use crate::io::{File, IO}; use crate::result::LimboResult; +use crate::storage::header_accessor; use crate::storage::sqlite3_ondisk::{ begin_read_wal_frame, begin_read_wal_frame_raw, finish_read_page, prepare_wal_frame, WAL_FRAME_HEADER_SIZE, WAL_HEADER_SIZE, @@ -255,7 +256,6 @@ pub trait Wal { fn get_max_frame(&self) -> u64; fn get_min_frame(&self) -> u64; fn rollback(&mut self) -> Result<()>; - fn truncate(&mut self) -> Result<()>; } /// A dummy WAL implementation that does nothing. @@ -352,9 +352,6 @@ impl Wal for DummyWAL { fn rollback(&mut self) -> Result<()> { Ok(()) } - fn truncate(&mut self) -> Result<()> { - Ok(()) - } } // Syncing requires a state machine because we need to schedule a sync and then wait until it is @@ -432,6 +429,8 @@ pub struct WalFile { /// Private copy of WalHeader pub header: WalHeader, + + checkpoint_guard: Option, } impl fmt::Debug for WalFile { @@ -474,7 +473,10 @@ pub struct WalFileShared { /// read_locks is a list of read locks that can coexist with the max_frame number stored in /// value. There is a limited amount because and unbounded amount of connections could be /// fatal. Therefore, for now we copy how SQLite behaves with limited amounts of read max - /// frames that is equal to 5 + /// frames that is equal to 5. + /// read_locks[0] is the exclusive read lock that is always 0 except during a checkpoint where + /// the log is restarted. read_locks[1] is the 'default reader' slot that always contains the + /// current max_frame. pub read_locks: [LimboRwLock; 5], /// There is only one write allowed in WAL mode. This lock takes care of ensuring there is only /// one used. 
@@ -498,6 +500,70 @@ impl fmt::Debug for WalFileShared { } } +enum CheckpointGuard { + Writer { ptr: Arc> }, + Read0 { ptr: Arc> }, +} + +impl CheckpointGuard { + fn new(ptr: Arc>, mode: CheckpointMode) -> Result { + let shared = &mut unsafe { &mut *ptr.get() }; + if !shared.checkpoint_lock.write() { + tracing::trace!("CheckpointGuard::new: checkpoint lock failed, returning Busy"); + // exclusive lock on checkpoint lock + return Err(LimboError::Busy); + } + match mode { + CheckpointMode::Passive | CheckpointMode::Full => { + let read0 = &mut shared.read_locks[0]; + if !read0.write() { + shared.checkpoint_lock.unlock(); + tracing::trace!("CheckpointGuard::new: read0 lock failed, returning Busy"); + // exclusive lock on slot‑0 + return Err(LimboError::Busy); + } + Ok(Self::Read0 { ptr }) + } + CheckpointMode::Restart | CheckpointMode::Truncate => { + if !shared.write_lock.write() { + shared.checkpoint_lock.unlock(); + tracing::trace!("CheckpointGuard::new: read0 lock failed, returning Busy"); + return Err(LimboError::Busy); + } + Ok(Self::Writer { ptr }) + } + } + } +} + +/// Small macro to remove the writer lock in the event that a checkpoint errors out and would +/// otherwise leak the lock. Only used during checkpointing. +macro_rules! ensure_unlock { + ($self:ident, $fun:expr) => { + $fun.inspect_err(|e| { + tracing::trace!( + "CheckpointGuard::ensure_unlock: error occurred, releaseing held locks: {e}" + ); + let _ = $self.checkpoint_guard.take(); + })?; + }; +} + +impl Drop for CheckpointGuard { + fn drop(&mut self) { + match self { + CheckpointGuard::Writer { ptr: shared } => unsafe { + (*shared.get()).write_lock.unlock(); + (*shared.get()).checkpoint_lock.unlock(); + }, + CheckpointGuard::Read0 { ptr: shared } => unsafe { + (*shared.get()).read_locks[0].unlock(); + (*shared.get()).checkpoint_lock.unlock(); + }, + } + } +} + impl Wal for WalFile { /// Begin a read transaction. 
#[instrument(skip_all, level = Level::DEBUG)] @@ -510,7 +576,7 @@ impl Wal for WalFile { let mut max_read_mark_index = -1; // Find the largest mark we can find, ignore frames that are impossible to be in range and // that are not set - for (index, lock) in self.get_shared().read_locks.iter().enumerate() { + for (index, lock) in self.get_shared().read_locks.iter().enumerate().skip(1) { let this_mark = lock.value.load(Ordering::SeqCst); if this_mark > max_read_mark && this_mark <= max_frame_in_wal as u32 { max_read_mark = this_mark; @@ -520,7 +586,7 @@ impl Wal for WalFile { // If we didn't find any mark or we can update, let's update them if (max_read_mark as u64) < max_frame_in_wal || max_read_mark_index == -1 { - for (index, lock) in self.get_shared().read_locks.iter_mut().enumerate() { + for (index, lock) in self.get_shared().read_locks.iter_mut().enumerate().skip(1) { let busy = !lock.write(); if !busy { // If this was busy then it must mean >1 threads tried to set this read lock @@ -824,14 +890,14 @@ impl Wal for WalFile { write_counter: Rc>, mode: CheckpointMode, ) -> Result> { - assert!( - matches!(mode, CheckpointMode::Passive), - "only passive mode supported for now" - ); 'checkpoint_loop: loop { let state = self.ongoing_checkpoint.state; tracing::debug!(?state); match state { + // Acquire the relevant exclusive lock (slot‑0 or WRITER) + // and checkpoint_lock so no other checkpointer can run. + // fsync WAL if there are unapplied frames. + // Decide the largest frame we are allowed to back‑fill. CheckpointState::Start => { // TODO(pere): check what frames are safe to checkpoint between many readers! let (shared_max, nbackfills) = { @@ -846,31 +912,18 @@ impl Wal for WalFile { // we return the previous number of backfilled pages from last checkpoint. 
return Ok(IOResult::Done(self.prev_checkpoint)); } - self.ongoing_checkpoint.min_frame = nbackfills + 1; let shared = self.get_shared(); let busy = !shared.checkpoint_lock.write(); if busy { return Err(LimboError::Busy); } - let mut max_safe_frame = shared.max_frame.load(Ordering::SeqCst); - for (read_lock_idx, read_lock) in shared.read_locks.iter_mut().enumerate() { - let this_mark = read_lock.value.load(Ordering::SeqCst); - if this_mark < max_safe_frame as u32 { - let busy = !read_lock.write(); - if !busy { - let new_mark = if read_lock_idx == 0 { - max_safe_frame as u32 - } else { - READMARK_NOT_USED - }; - read_lock.value.store(new_mark, Ordering::SeqCst); - read_lock.unlock(); - } else { - max_safe_frame = this_mark as u64; - } - } + // acquire either the read0 or write lock depending on the checkpoint mode + if self.checkpoint_guard.is_none() { + let guard = CheckpointGuard::new(self.shared.clone(), mode)?; + self.checkpoint_guard = Some(guard); } - self.ongoing_checkpoint.max_frame = max_safe_frame; + self.ongoing_checkpoint.max_frame = self.determine_max_safe_checkpoint_frame(); + self.ongoing_checkpoint.min_frame = nbackfills + 1; self.ongoing_checkpoint.current_page = 0; self.ongoing_checkpoint.state = CheckpointState::ReadFrame; tracing::trace!( @@ -879,6 +932,8 @@ impl Wal for WalFile { self.ongoing_checkpoint.max_frame, ); } + // Find the next page that has a frame in the safe interval and + // schedule a read of that frame. 
CheckpointState::ReadFrame => { let shared = self.get_shared(); let min_frame = self.ongoing_checkpoint.min_frame; @@ -907,12 +962,14 @@ impl Wal for WalFile { *frame ); self.ongoing_checkpoint.page.get().id = page as usize; - - let c = self.read_frame( - *frame, - self.ongoing_checkpoint.page.clone(), - self.buffer_pool.clone(), - )?; + ensure_unlock!( + self, + self.read_frame( + *frame, + self.ongoing_checkpoint.page.clone(), + self.buffer_pool.clone(), + ) + ); self.ongoing_checkpoint.state = CheckpointState::WaitReadFrame; continue 'checkpoint_loop; } @@ -954,6 +1011,10 @@ impl Wal for WalFile { self.ongoing_checkpoint.state = CheckpointState::Done; } } + // All eligible frames copied to the db file. + // Update nBackfills. + // In Reset or Truncate mode, we need to restart and possibly truncate the log. + // Release all locks and return the current num of wal frames and the amount we backfilled CheckpointState::Done => { if *write_counter.borrow() > 0 { return Ok(IOResult::IO); @@ -963,40 +1024,38 @@ impl Wal for WalFile { let max_frame = shared.max_frame.load(Ordering::SeqCst); let nbackfills = shared.nbackfills.load(Ordering::SeqCst); - // Record two num pages fields to return as checkpoint result to caller. - // Ref: pnLog, pnCkpt on https://www.sqlite.org/c3ref/wal_checkpoint_v2.html - let frames_in_wal = max_frame.saturating_sub(nbackfills); - let frames_checkpointed = self - .ongoing_checkpoint - .max_frame - .saturating_sub(self.ongoing_checkpoint.min_frame - 1); - let checkpoint_result = - CheckpointResult::new(frames_in_wal, frames_checkpointed); - let everything_backfilled = shared.max_frame.load(Ordering::SeqCst) - == self.ongoing_checkpoint.max_frame; - shared + let (checkpoint_result, everything_backfilled) = { + let shared = self.get_shared(); + let current_mx = shared.max_frame.load(Ordering::SeqCst); + // Record two num pages fields to return as checkpoint result to caller. 
+ // Ref: pnLog, pnCkpt on https://www.sqlite.org/c3ref/wal_checkpoint_v2.html + let frames_in_wal = max_frame.saturating_sub(nbackfills); + let frames_checkpointed = self + .ongoing_checkpoint + .max_frame + .saturating_sub(self.ongoing_checkpoint.min_frame - 1); + let checkpoint_result = + CheckpointResult::new(frames_in_wal, frames_checkpointed); + let everything_backfilled = shared.max_frame.load(Ordering::SeqCst) + == self.ongoing_checkpoint.max_frame; + (checkpoint_result, everything_backfilled) + }; + // we will just overwrite nbackfills with 0 if we are resetting + self.get_shared() .nbackfills .store(self.ongoing_checkpoint.max_frame, Ordering::SeqCst); if everything_backfilled { - // TODO: Even in Passive mode, if everything was backfilled we should - // truncate and fsync the *db file* - - // To properly reset the *wal file* we will need restart and/or truncate mode. - // Currently, it will grow the WAL file indefinetly, but don't resetting is better than breaking. - // Check: https://github.com/sqlite/sqlite/blob/2bd9f69d40dd240c4122c6d02f1ff447e7b5c098/src/wal.c#L2193 - if !matches!(mode, CheckpointMode::Passive) { - // Here we know that we backfilled everything, therefore we can safely - // reset the wal. - shared.frame_cache.lock().clear(); - shared.pages_in_frames.lock().clear(); - shared.max_frame.store(0, Ordering::SeqCst); - shared.nbackfills.store(0, Ordering::SeqCst); - // TODO: if all frames were backfilled into the db file, calls fsync - // TODO(pere): truncate wal file here. 
- } + match mode { + CheckpointMode::Restart | CheckpointMode::Truncate => { + ensure_unlock!(self, self.restart_log(mode)); + } + _ => {} + }; + ensure_unlock!(self, self.trunc_db_file(pager)); } self.prev_checkpoint = checkpoint_result; self.ongoing_checkpoint.state = CheckpointState::Start; + let _ = self.checkpoint_guard.take(); return Ok(IOResult::Done(checkpoint_result)); } } @@ -1081,36 +1140,6 @@ impl Wal for WalFile { shared.last_checksum = self.last_checksum; Ok(()) } - - #[instrument(skip_all, level = Level::INFO)] - fn truncate(&mut self) -> Result<()> { - tracing::debug!("truncate_wal_file"); - let shared = self.get_shared(); - let busy = !shared.write_lock.write(); - if busy { - return Err(LimboError::Busy); - } - let busy = !shared.checkpoint_lock.write(); - if busy { - shared.write_lock.unlock(); - return Err(LimboError::Busy); - } - let shared = self.shared.clone(); - let io = self.io.clone(); - let c = Completion::new_trunc(move |_| { - let shared = shared.clone(); - let shared = unsafe { shared.get().as_mut().unwrap() }; - let io = io.clone(); - shared.restart_wal_header(&io).unwrap(); - }); - let shared = self.get_shared(); - let c_cloned = c.clone(); - let c = shared.file.truncate(WAL_HEADER_SIZE, c_cloned.clone())?; - // ensure that the header is written and not just this completion fires - self.io.wait_for_completion(c_cloned)?; - self.io.wait_for_completion(c)?; - Ok(()) - } } impl WalFile { @@ -1154,6 +1183,7 @@ impl WalFile { max_frame_read_lock_index: 0, last_checksum, prev_checkpoint: CheckpointResult::default(), + checkpoint_guard: None, start_pages_in_frames: 0, header: *header, } @@ -1200,6 +1230,111 @@ impl WalFile { self.sync_state.set(SyncState::NotSyncing); self.syncing.set(false); } + + // coordinate among many readers what the maximum safe frame is for us + // to backfill when checkpointing. 
+ fn determine_max_safe_checkpoint_frame(&self) -> u64 { + let shared = self.get_shared(); + let mut max_safe_frame = shared.max_frame.load(Ordering::SeqCst); + // If a reader is positioned before max_frame we either: + // bump its mark up if we can take the slot write-lock OR + // lower max_frame if the reader is busy and we cannot overtake + for (read_lock_idx, read_lock) in shared.read_locks.iter_mut().enumerate().skip(1) { + let this_mark = read_lock.value.load(Ordering::SeqCst); + if this_mark < max_safe_frame as u32 { + let busy = !read_lock.write(); + if !busy { + let new_mark = if read_lock_idx == 1 { + max_safe_frame as u32 + } else { + READMARK_NOT_USED + }; + read_lock.value.store(new_mark, Ordering::SeqCst); + read_lock.unlock(); + } else { + max_safe_frame = this_mark as u64; + } + } + } + max_safe_frame + } + + /// Called once the entire WAL has been back‑filled in RESTART or TRUNCATE mode. + /// Must be invoked while writer and checkpoint locks are still held. + fn restart_log(&mut self, mode: CheckpointMode) -> Result<()> { + turso_assert!( + matches!(mode, CheckpointMode::Restart | CheckpointMode::Truncate), + "CheckpointMode must be Restart or Truncate" + ); + turso_assert!( + matches!(self.checkpoint_guard, Some(CheckpointGuard::Writer { .. 
})), + "We must hold writer and checkpoint locks to restart the log" + ); + tracing::info!("restart_log(mode={mode:?})"); + { + // Block all readers + let shared = self.get_shared(); + for idx in 1..shared.read_locks.len() { + let lock = &mut shared.read_locks[idx]; + if !lock.write() { + // Reader is active, cannot proceed + return Err(LimboError::Busy); + } + lock.value.store(READMARK_NOT_USED, Ordering::SeqCst); + lock.unlock(); + } + } + + // reinitialize in‑memory state + ensure_unlock!(self, self.get_shared().restart_wal_header(&self.io, mode)); + // For TRUNCATE: physically shrink the WAL to 0 B + if matches!(mode, CheckpointMode::Truncate) { + let c = Completion::new_trunc(|_| { + tracing::trace!("WAL file truncated to 0 B"); + }); + ensure_unlock!(self, self.get_shared().file.truncate(0, c)); + + let c = Completion::new_sync(|_| { + tracing::trace!("WAL file synced after truncation"); + }); + // fsync after truncation + self.get_shared().file.sync(c.into())?; + } + + // release read‑locks 1..4 + { + let shared = self.get_shared(); + for idx in 1..shared.read_locks.len() { + shared.read_locks[idx].unlock(); + } + } + + self.max_frame = 0; + self.min_frame = 0; + Ok(()) + } + + /// Truncate the db file to the size of the database in pages. + /// to be called after any checkpoint where we are able to backfill + /// all frames in the WAL. + fn trunc_db_file(&self, pager: &Pager) -> Result<()> { + let db_size = header_accessor::get_database_size(pager)? as u64; + let sz_db = db_size * (self.page_size() as u64); + if pager.db_file.size()? 
!= sz_db { + let _ = pager.db_file.truncate( + sz_db as usize, + Completion::new_trunc(move |_| { + tracing::trace!("db file truncated to {} bytes", sz_db); + }) + .into(), + ); + } + // then fsync + pager.db_file.sync(Completion::new_sync(move |_| { + tracing::trace!("db file syncd after truncating"); + }))?; + Ok(()) + } } impl WalFileShared { @@ -1269,20 +1404,25 @@ impl WalFileShared { }; io.wait_for_completion(c)?; tracing::debug!("new_shared(header={:?})", header); + let read_locks = array::from_fn(|_| LimboRwLock { + lock: AtomicU32::new(NO_LOCK), + nreads: AtomicU32::new(0), + value: AtomicU32::new(READMARK_NOT_USED), + }); + // slots 0 and 1 begin at 0 + read_locks[0].value.store(0, Ordering::SeqCst); + read_locks[1].value.store(0, Ordering::SeqCst); + let shared = WalFileShared { wal_header: Arc::new(SpinLock::new(wal_header)), min_frame: AtomicU64::new(0), max_frame: AtomicU64::new(0), nbackfills: AtomicU64::new(0), frame_cache: Arc::new(SpinLock::new(HashMap::new())), - last_checksum: (checksums.0, checksums.1), + last_checksum: checksum, file, pages_in_frames: Arc::new(SpinLock::new(Vec::new())), - read_locks: array::from_fn(|_| LimboRwLock { - lock: AtomicU32::new(NO_LOCK), - nreads: AtomicU32::new(0), - value: AtomicU32::new(READMARK_NOT_USED), - }), + read_locks, write_lock: LimboRwLock { lock: AtomicU32::new(NO_LOCK), nreads: AtomicU32::new(0), @@ -1298,8 +1438,8 @@ impl WalFileShared { self.wal_header.lock().page_size } - /// Called after a successful RESTART/TRUNCATE checkpoint when - /// all frames are back‑filled. + /// Called after a successful RESTART/TRUNCATE mode checkpoint + /// when all frames are back‑filled. /// /// sqlite3/src/wal.c /// The following is guaranteed when this function is called: @@ -1313,7 +1453,7 @@ impl WalFileShared { /// This function updates the shared-memory structures so that the next /// client to write to the database (which may be this one) does so by /// writing frames into the start of the log file. 
- fn restart_wal_header(&mut self, io: &Arc) -> Result<()> { + fn restart_wal_header(&mut self, io: &Arc, mode: CheckpointMode) -> Result<()> { // bump checkpoint sequence let mut hdr = self.wal_header.lock(); hdr.checkpoint_seq = hdr.checkpoint_seq.wrapping_add(1); @@ -1327,11 +1467,26 @@ impl WalFileShared { // update salts. increment the first and generate a new random one for the second hdr.salt_1 = hdr.salt_1.wrapping_add(1); // aSalt[0]++ hdr.salt_2 = io.generate_random_number() as u32; + let native = cfg!(target_endian = "big"); + let header_prefix = &hdr.as_bytes()[..WAL_HEADER_SIZE - 2 * 4]; + let (c1, c2) = checksum_wal(header_prefix, &hdr, (0, 0), native); + hdr.checksum_1 = c1; + hdr.checksum_2 = c2; + self.last_checksum = (c1, c2); - // rewrite header on disk - let c = sqlite3_ondisk::begin_write_wal_header(&self.file, &hdr)?; - io.wait_for_completion(c)?; - + // for RESTART, we write a new header to the WAL file. truncate will simply + // write it in memory and let the following writer append it to the empty WAL file + if !matches!(mode, CheckpointMode::Truncate) { + // if we are truncating the WAL, we don't bother writing a new header + let c = sqlite3_ondisk::begin_write_wal_header(&self.file, &hdr)?; + io.wait_for_completion(c)?; + self.file.sync( + Completion::new_sync(move |_| { + tracing::trace!("WAL header synced after restart"); + }) + .into(), + )?; + } self.frame_cache.lock().clear(); self.pages_in_frames.lock().clear(); self.last_checksum = (hdr.checksum_1, hdr.checksum_2); @@ -1348,13 +1503,21 @@ impl WalFileShared { #[cfg(test)] pub mod test { - use crate::{storage::sqlite3_ondisk::WAL_HEADER_SIZE, Completion, Database, MemoryIO, IO}; - use std::{cell::Cell, rc::Rc, sync::Arc}; - use tempfile::TempDir; - + use crate::{ + result::LimboResult, storage::sqlite3_ondisk::WAL_HEADER_SIZE, types::IOResult, + CheckpointMode, CheckpointResult, Completion, Connection, Database, PlatformIO, Wal, + WalFileShared, IO, + }; + use std::{ + cell::{Cell, 
RefCell, UnsafeCell}, + os::unix::fs::MetadataExt, + rc::Rc, + sync::{atomic::Ordering, Arc}, + }; #[allow(clippy::arc_with_non_send_sync)] - fn get_database() -> Arc { - let mut path = TempDir::new().unwrap().keep(); + fn get_database() -> (Arc, std::path::PathBuf) { + let mut path = tempfile::tempdir().unwrap().keep(); + let dbpath = path.clone(); path.push("test.db"); { let connection = rusqlite::Connection::open(&path).unwrap(); @@ -1362,13 +1525,14 @@ pub mod test { .pragma_update(None, "journal_mode", "wal") .unwrap(); } - let io: Arc = Arc::new(MemoryIO::new()); + let io: Arc = Arc::new(PlatformIO::new().unwrap()); let db = Database::open_file(io.clone(), path.to_str().unwrap(), false, false).unwrap(); - db + // db + tmp directory + (db, dbpath) } #[test] fn test_truncate_file() { - let db = get_database(); + let (db, _path) = get_database(); let conn = db.connect().unwrap(); conn.execute("create table test (id integer primary key, value text)") .unwrap(); @@ -1390,19 +1554,343 @@ pub mod test { } #[test] - fn test_truncate_wal() { - let db = get_database(); + fn test_truncate_checkpoint() { + let (db, path) = get_database(); + let mut walpath = path.clone().into_os_string().into_string().unwrap(); + walpath.push_str("/test.db-wal"); + let walpath = std::path::PathBuf::from(walpath); + let conn = db.connect().unwrap(); conn.execute("create table test (id integer primary key, value text)") .unwrap(); - for _i in 0..100 { - let _ = conn.execute("insert into test (value) values ('test1'), ('test2'), ('test3')"); + for _i in 0..25 { + let _ = conn.execute("insert into test (value) values (randomblob(1024)), (randomblob(1024)), (randomblob(1024))"); } - let file = conn.pager.borrow_mut(); - let _ = file.cacheflush(); - let mut wal = file.wal.borrow_mut(); - wal.truncate().unwrap(); + let pager = conn.pager.borrow_mut(); + let _ = pager.cacheflush(); + let mut wal = pager.wal.borrow_mut(); + + let stat = std::fs::metadata(&walpath).unwrap(); + let meta_before = 
std::fs::metadata(&walpath).unwrap(); + #[cfg(not(unix))] + let bytes_before = meta_before.len(); + #[cfg(unix)] + let blocks_before = meta_before.blocks(); + run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Truncate); drop(wal); - assert_eq!(file.wal_frame_count().unwrap(), 0); + + assert_eq!(pager.wal_frame_count().unwrap(), 0); + + tracing::info!("wal filepath: {walpath:?}, size: {}", stat.len()); + let meta_after = std::fs::metadata(&walpath).unwrap(); + + #[cfg(unix)] + { + let blocks_after = meta_after.blocks(); + assert_ne!( + blocks_before, blocks_after, + "WAL file should not have been empty before checkpoint" + ); + assert_eq!( + blocks_after, 0, + "WAL file should be truncated to 0 bytes, but is {blocks_after} blocks", + ); + } + #[cfg(not(unix))] + { + let bytes_after = meta_after.len(); + assert_ne!( + bytes_before, bytes_after, + "WAL file should not have been empty before checkpoint" + ); + // On Windows, we check the size in bytes + assert_eq!( + bytes_after, 0, + "WAL file should be truncated to 0 bytes, but is {bytes_after} bytes", + ); + } + std::fs::remove_dir_all(path).unwrap(); + } + + fn bulk_inserts(conn: &Arc, n_txns: usize, rows_per_txn: usize) { + for _ in 0..n_txns { + conn.execute("begin transaction").unwrap(); + for i in 0..rows_per_txn { + conn.execute(format!("insert into test(value) values ('v{i}')")) + .unwrap(); + } + conn.execute("commit").unwrap(); + } + } + + fn run_checkpoint_until_done( + wal: &mut dyn Wal, + pager: &crate::Pager, + mode: CheckpointMode, + ) -> CheckpointResult { + let wc = Rc::new(RefCell::new(0usize)); + loop { + match wal.checkpoint(pager, wc.clone(), mode).unwrap() { + IOResult::Done(r) => return r, + IOResult::IO => { + pager.io.run_once().unwrap(); + } + } + } + } + + #[test] + fn test_full_checkpoint_mode() { + let (db, path) = get_database(); + let conn = db.connect().unwrap(); + conn.execute("create table test(id integer primary key, value text)") + .unwrap(); + let mut walpath = 
path.clone().into_os_string().into_string().unwrap(); + walpath.push_str("/test.db-wal"); + let walpath = std::path::PathBuf::from(walpath); + + // Produce multiple WAL frames. + bulk_inserts(&conn, 25, 2); + conn.pager.borrow_mut().cacheflush().unwrap(); + + let wal_shared = db.maybe_shared_wal.read().as_ref().unwrap().clone(); + let (before_max, before_backfills) = unsafe { + let s = &*wal_shared.get(); + ( + s.max_frame.load(Ordering::SeqCst), + s.nbackfills.load(Ordering::SeqCst), + ) + }; + assert!(before_max > 0); + assert_eq!(before_backfills, 0); + + let meta_before = std::fs::metadata(&walpath).unwrap(); + #[cfg(not(unix))] + let bytes_before = meta_before.len(); + #[cfg(unix)] + let blocks_before = meta_before.blocks(); + + // Run FULL checkpoint. + { + let pager_ref = conn.pager.borrow(); + let mut wal = pager_ref.wal.borrow_mut(); + let result = run_checkpoint_until_done(&mut *wal, &pager_ref, CheckpointMode::Full); + assert_eq!(result.num_wal_frames, before_max); + assert_eq!(result.num_checkpointed_frames, before_max); + } + + // Validate state after FULL: max_frame unchanged; nbackfills == max_frame; not restarted. 
+ let (after_max, after_backfills, read_mark0) = unsafe { + let s = &*wal_shared.get(); + ( + s.max_frame.load(Ordering::SeqCst), + s.nbackfills.load(Ordering::SeqCst), + s.read_locks[0].value.load(Ordering::SeqCst), + ) + }; + assert_eq!(after_max, before_max, "Full should not reset WAL sequence"); + assert_eq!(after_backfills, after_max); + assert_eq!( + read_mark0, 0, + "Restart was NOT performed, read mark 0 stays 0" + ); + let meta_after = std::fs::metadata(&walpath).unwrap(); + #[cfg(unix)] + { + let blocks_after = meta_after.blocks(); + assert_eq!( + blocks_before, blocks_after, + "WAL file should not change size after full checkpoint" + ); + } + #[cfg(not(unix))] + { + let bytes_after = meta_after.len(); + assert_eq!( + bytes_before, bytes_after, + "WAL file should not change size after full checkpoint" + ); + } + + // Append another transaction -> frame id should become previous_max + 1 + let prev_max = after_max; + conn.execute("insert into test(value) values ('after_full')") + .unwrap(); + conn.pager + .borrow_mut() + .wal + .borrow_mut() + .finish_append_frames_commit() + .unwrap(); + + let (new_max, _) = unsafe { + let s = &*wal_shared.get(); + ( + s.max_frame.load(Ordering::SeqCst), + s.nbackfills.load(Ordering::SeqCst), + ) + }; + assert_eq!( + new_max, + prev_max + 1, + "WAL should continue appending not restart" + ); + std::fs::remove_dir_all(path).unwrap(); + } + + fn wal_header_snapshot(shared: &Arc>) -> (u32, u32, u32, u32) { + // (checkpoint_seq, salt1, salt2, page_size) + unsafe { + let hdr = (*shared.get()).wal_header.lock(); + (hdr.checkpoint_seq, hdr.salt_1, hdr.salt_2, hdr.page_size) + } + } + + #[test] + fn test_restart_checkpoint_resets_sequence() { + let (db, path) = get_database(); + + let mut walpath = path.clone().into_os_string().into_string().unwrap(); + walpath.push_str("/test.db-wal"); + let walpath = std::path::PathBuf::from(walpath); + + let conn = db.connect().unwrap(); + conn.execute("create table test(id integer primary key, 
value text)") + .unwrap(); + bulk_inserts(&conn.clone(), 20, 3); + conn.pager.borrow_mut().cacheflush().unwrap(); + + let wal_shared = db.maybe_shared_wal.read().as_ref().unwrap().clone(); + let (seq0, salt10, salt20, _ps) = wal_header_snapshot(&wal_shared); + let (before_max, before_backfills) = unsafe { + let s = &*wal_shared.get(); + ( + s.max_frame.load(Ordering::SeqCst), + s.nbackfills.load(Ordering::SeqCst), + ) + }; + assert!(before_max > 0); + assert_eq!(before_backfills, 0); + + let meta_before = std::fs::metadata(&walpath).unwrap(); + + #[cfg(not(unix))] + let bytes_before = meta_before.len(); + #[cfg(unix)] + let blocks_before = meta_before.blocks(); + + { + let pager_ref = conn.pager.borrow(); + let mut wal = pager_ref.wal.borrow_mut(); + let r = run_checkpoint_until_done(&mut *wal, &pager_ref, CheckpointMode::Restart); + assert_eq!(r.num_wal_frames, before_max); + assert_eq!(r.num_checkpointed_frames, before_max); + } + + // After restart: max_frame == 0, nbackfills == 0, salts changed, seq incremented. + let (seq1, salt11, salt21, _ps2) = wal_header_snapshot(&wal_shared); + assert_eq!(seq1, seq0.wrapping_add(1), "checkpoint_seq increments"); + assert_ne!(salt21, salt20); + assert_eq!(salt11, salt10.wrapping_add(1), "salt_1 should increment"); + let (after_max, after_backfills) = unsafe { + let s = &*wal_shared.get(); + ( + s.max_frame.load(Ordering::SeqCst), + s.nbackfills.load(Ordering::SeqCst), + ) + }; + assert_eq!(after_max, 0); + assert_eq!(after_backfills, 0); + + // Next write should create frame_id = 1 again. 
+ conn.execute("insert into test(value) values ('post_restart')") + .unwrap(); + conn.pager + .borrow_mut() + .wal + .borrow_mut() + .finish_append_frames_commit() + .unwrap(); + let new_max = unsafe { (&*wal_shared.get()).max_frame.load(Ordering::SeqCst) }; + assert_eq!(new_max, 1, "Sequence restarted at 1"); + + let meta_after = std::fs::metadata(&walpath).unwrap(); + #[cfg(unix)] + { + let blocks_after = meta_after.blocks(); + assert_eq!( + blocks_before, blocks_after, + "WAL file should not change size after full checkpoint" + ); + } + #[cfg(not(unix))] + { + let bytes_after = meta_after.len(); + assert_eq!( + bytes_before, bytes_after, + "WAL file should not change size after full checkpoint" + ); + } + std::fs::remove_dir_all(path).unwrap(); + } + + #[test] + fn test_passive_partial_then_complete() { + let (db, _tmp) = get_database(); + let conn1 = db.connect().unwrap(); + let conn2 = db.connect().unwrap(); + + conn1 + .execute("create table test(id integer primary key, value text)") + .unwrap(); + bulk_inserts(&conn1.clone(), 15, 2); + conn1.pager.borrow_mut().cacheflush().unwrap(); + + // Force a read transaction that will freeze a lower read mark + { + let pager = conn2.pager.borrow_mut(); + let mut wal2 = pager.wal.borrow_mut(); + assert!(matches!(wal2.begin_read_tx().unwrap(), LimboResult::Ok)); + } + + // generate more frames that the reader will not see. 
+ bulk_inserts(&conn1.clone(), 15, 2); + conn1.pager.borrow_mut().cacheflush().unwrap(); + + // Run passive checkpoint, expect partial + let (res1, max_before) = { + let pager = conn1.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + let res = run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Passive); + let maxf = unsafe { + (&*db.maybe_shared_wal.read().as_ref().unwrap().get()) + .max_frame + .load(Ordering::SeqCst) + }; + (res, maxf) + }; + assert_eq!(res1.num_wal_frames, max_before); + assert!( + res1.num_checkpointed_frames < res1.num_wal_frames, + "Partial backfill expected, {} : {}", + res1.num_checkpointed_frames, + res1.num_wal_frames + ); + + // Release reader + { + let pager = conn2.pager.borrow_mut(); + let wal2 = pager.wal.borrow_mut(); + wal2.end_read_tx().unwrap(); + } + + // Second passive checkpoint should finish + let pager = conn1.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + let res2 = run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Passive); + assert_eq!( + res2.num_checkpointed_frames, res2.num_wal_frames, + "Second checkpoint completes remaining frames" + ); } } From f4becd1296614efcf8cecbdb3af3443a09cbc4bd Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 14:19:34 -0400 Subject: [PATCH 21/62] Allow using !passive checkpoint methods in pragma wal_checkpoint --- core/translate/pragma.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/core/translate/pragma.rs b/core/translate/pragma.rs index 5265943ae..4c3c64d58 100644 --- a/core/translate/pragma.rs +++ b/core/translate/pragma.rs @@ -344,12 +344,6 @@ fn query_pragma( _ => CheckpointMode::Passive, }; - if !matches!(mode, CheckpointMode::Passive) { - return Err(LimboError::ParseError( - "only Passive mode supported".to_string(), - )); - } - program.alloc_registers(2); program.emit_insn(Insn::Checkpoint { database: 0, From 7643ef97a6e524acf2ef9f12db252be308f414fb Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 
14:21:04 -0400 Subject: [PATCH 22/62] Pass checkpoint mode from sqlite3 c api argument --- core/vdbe/execute.rs | 4 ++-- sqlite3/src/lib.rs | 13 ++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index d793f7b74..4f1a2b617 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -325,13 +325,13 @@ pub fn op_checkpoint( ) -> Result { let Insn::Checkpoint { database: _, - checkpoint_mode: _, + checkpoint_mode, dest, } = insn else { unreachable!("unexpected Insn {:?}", insn) }; - let result = program.connection.checkpoint(); + let result = program.connection.checkpoint(*checkpoint_mode); match result { Ok(CheckpointResult { num_wal_frames: num_wal_pages, diff --git a/sqlite3/src/lib.rs b/sqlite3/src/lib.rs index d9e562554..e216ccb5c 100644 --- a/sqlite3/src/lib.rs +++ b/sqlite3/src/lib.rs @@ -3,7 +3,7 @@ use std::ffi::{self, CStr, CString}; use tracing::trace; -use turso_core::{LimboError, Value}; +use turso_core::{CheckpointMode, LimboError, Value}; use std::sync::{Arc, Mutex}; @@ -1108,7 +1108,7 @@ pub unsafe extern "C" fn sqlite3_wal_checkpoint( pub unsafe extern "C" fn sqlite3_wal_checkpoint_v2( db: *mut sqlite3, _db_name: *const ffi::c_char, - _mode: ffi::c_int, + mode: ffi::c_int, _log_size: *mut ffi::c_int, _checkpoint_count: *mut ffi::c_int, ) -> ffi::c_int { @@ -1118,7 +1118,14 @@ pub unsafe extern "C" fn sqlite3_wal_checkpoint_v2( let db: &mut sqlite3 = &mut *db; let db = db.inner.lock().unwrap(); // TODO: Checkpointing modes and reporting back log size and checkpoint count to caller. 
- if db.conn.checkpoint().is_err() { + let chkptmode = match mode { + SQLITE_CHECKPOINT_PASSIVE => CheckpointMode::Passive, + SQLITE_CHECKPOINT_RESTART => CheckpointMode::Restart, + SQLITE_CHECKPOINT_TRUNCATE => CheckpointMode::Truncate, + SQLITE_CHECKPOINT_FULL => CheckpointMode::Full, + _ => return SQLITE_MISUSE, // Unsupported mode + }; + if db.conn.checkpoint(chkptmode).is_err() { return SQLITE_ERROR; } SQLITE_OK From 436747536c0fc4b1b9e12d91edc1964b28fbada3 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 14:23:03 -0400 Subject: [PATCH 23/62] Add integration test for truncate checkpointing wal --- .../query_processing/test_write_path.rs | 40 ++++++++++++++++++- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/tests/integration/query_processing/test_write_path.rs b/tests/integration/query_processing/test_write_path.rs index df6042452..310acb9ed 100644 --- a/tests/integration/query_processing/test_write_path.rs +++ b/tests/integration/query_processing/test_write_path.rs @@ -2,8 +2,10 @@ use crate::common::{self, maybe_setup_tracing}; use crate::common::{compare_string, do_flush, TempDatabase}; use log::debug; use std::io::{Read, Seek, Write}; +use std::os::unix::fs::MetadataExt; +use std::path::PathBuf; use std::sync::Arc; -use turso_core::{Connection, Database, LimboError, Row, Statement, StepResult, Value}; +use turso_core::{CheckpointMode, Connection, Database, Row, Statement, StepResult, Value}; const WAL_HEADER_SIZE: usize = 32; const WAL_FRAME_HEADER_SIZE: usize = 24; @@ -285,7 +287,7 @@ fn test_wal_checkpoint() -> anyhow::Result<()> { for i in 0..iterations { let insert_query = format!("INSERT INTO test VALUES ({i})"); do_flush(&conn, &tmp_db)?; - conn.checkpoint()?; + conn.checkpoint(CheckpointMode::Passive)?; run_query(&tmp_db, &conn, &insert_query)?; } @@ -301,6 +303,40 @@ fn test_wal_checkpoint() -> anyhow::Result<()> { Ok(()) } +#[test] +#[ignore] +fn test_wal_checkpoint_truncate() -> anyhow::Result<()> { + let _ = 
env_logger::try_init(); + let tmp_db = + TempDatabase::new_with_rusqlite("CREATE TABLE test (x INTEGER PRIMARY KEY);", false); + let iterations = 100_usize; + let conn = tmp_db.connect_limbo(); + + for i in 0..iterations { + let insert_query = format!("INSERT INTO test VALUES ({i})"); + do_flush(&conn, &tmp_db)?; + run_query(&tmp_db, &conn, &insert_query)?; + } + conn.checkpoint(CheckpointMode::Truncate)?; + let file = PathBuf::from(format!("{}-wal", tmp_db.path.to_string_lossy())); + assert!(file.exists(), "WAL file should exist after checkpoint"); + let size = file.metadata()?.blocks(); + assert_eq!( + size, 0, + "WAL file should be truncated to a small size, found: {size}", + ); + // assert data is all in table + for i in 0..iterations { + let select_query = format!("SELECT * FROM test WHERE x = {i}"); + run_query_on_row(&tmp_db, &conn, &select_query, |row: &Row| { + let id = row.get::(0).unwrap(); + assert_eq!(i as i64, id); + })?; + } + do_flush(&conn, &tmp_db)?; + Ok(()) +} + #[test] fn test_wal_restart() -> anyhow::Result<()> { let _ = env_logger::try_init(); From 9b7e5ed2928b06b40e9f12a5578ac15e447612c7 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 14:45:06 -0400 Subject: [PATCH 24/62] Trunc db file after backfilling everything in calling method --- core/storage/pager.rs | 25 +++++++++++++++++++++-- core/storage/wal.rs | 46 ++++++++++++------------------------------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 1fe798ab6..7435ea98d 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -4,7 +4,7 @@ use crate::storage::buffer_pool::BufferPool; use crate::storage::database::DatabaseStorage; use crate::storage::header_accessor; use crate::storage::sqlite3_ondisk::{ - self, parse_wal_frame_header, DatabaseHeader, PageContent, PageType, + self, parse_wal_frame_header, DatabaseHeader, PageContent, PageType, DEFAULT_PAGE_SIZE, }; use 
crate::storage::wal::{CheckpointResult, Wal}; use crate::types::{IOResult, WalInsertInfo}; @@ -1300,10 +1300,31 @@ impl Pager { let checkpoint_result = self.io.block(|| { self.wal .borrow_mut() - .checkpoint(self, Rc::new(RefCell::new(0)), CheckpointMode::Passive) + .checkpoint(self, Rc::new(RefCell::new(0)), mode) .map_err(|err| panic!("error while clearing cache {err}")) })?; + if checkpoint_result.everything_backfilled() { + let db_size = header_accessor::get_database_size(self)?; + let page_size = self.page_size.get().unwrap_or(DEFAULT_PAGE_SIZE); + let expected = (db_size * page_size) as u64; + if expected < self.db_file.size()? { + self.db_file.truncate( + expected as usize, + Completion::new_trunc(move |_| { + tracing::trace!( + "Database file truncated to expected size: {} bytes", + expected + ); + }) + .into(), + )?; + self.db_file.sync(Completion::new_sync(move |_| { + tracing::trace!("Database file syncd after truncation"); + }))?; + } + } + // TODO: only clear cache of things that are really invalidated self.page_cache .write() diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 322c2f8cf..7fff12b7a 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -19,7 +19,6 @@ use std::{ use crate::fast_lock::SpinLock; use crate::io::{File, IO}; use crate::result::LimboResult; -use crate::storage::header_accessor; use crate::storage::sqlite3_ondisk::{ begin_read_wal_frame, begin_read_wal_frame_raw, finish_read_page, prepare_wal_frame, WAL_FRAME_HEADER_SIZE, WAL_HEADER_SIZE, @@ -55,6 +54,9 @@ impl CheckpointResult { num_checkpointed_frames: n_ckpt, } } + pub const fn everything_backfilled(&self) -> bool { + self.num_wal_frames > 0 && self.num_wal_frames == self.num_checkpointed_frames + } } #[derive(Debug, Copy, Clone, EnumString)] @@ -1025,8 +1027,6 @@ impl Wal for WalFile { let nbackfills = shared.nbackfills.load(Ordering::SeqCst); let (checkpoint_result, everything_backfilled) = { - let shared = self.get_shared(); - let current_mx = 
shared.max_frame.load(Ordering::SeqCst); // Record two num pages fields to return as checkpoint result to caller. // Ref: pnLog, pnCkpt on https://www.sqlite.org/c3ref/wal_checkpoint_v2.html let frames_in_wal = max_frame.saturating_sub(nbackfills); @@ -1044,16 +1044,18 @@ impl Wal for WalFile { self.get_shared() .nbackfills .store(self.ongoing_checkpoint.max_frame, Ordering::SeqCst); - if everything_backfilled { - match mode { - CheckpointMode::Restart | CheckpointMode::Truncate => { - ensure_unlock!(self, self.restart_log(mode)); - } - _ => {} - }; - ensure_unlock!(self, self.trunc_db_file(pager)); + if everything_backfilled + && matches!(mode, CheckpointMode::Restart | CheckpointMode::Truncate) + { + ensure_unlock!(self, self.restart_log(mode)); } self.prev_checkpoint = checkpoint_result; + // we cannot truncate the db file here, because we are currently inside a + // mut borrow of pager.wal, and accessing the header will attempt a borrow, + // so the caller will determine if: + // a. the max frame == num wal frames + // b. the db file size != num of db pages * page_size + // and truncate the db file if necessary. self.ongoing_checkpoint.state = CheckpointState::Start; let _ = self.checkpoint_guard.take(); return Ok(IOResult::Done(checkpoint_result)); @@ -1313,28 +1315,6 @@ impl WalFile { self.min_frame = 0; Ok(()) } - - /// Truncate the db file to the size of the database in pages. - /// to be called after any checkpoint where we are able to backfill - /// all frames in the WAL. - fn trunc_db_file(&self, pager: &Pager) -> Result<()> { - let db_size = header_accessor::get_database_size(pager)? as u64; - let sz_db = db_size * (self.page_size() as u64); - if pager.db_file.size()? 
!= sz_db { - let _ = pager.db_file.truncate( - sz_db as usize, - Completion::new_trunc(move |_| { - tracing::trace!("db file truncated to {} bytes", sz_db); - }) - .into(), - ); - } - // then fsync - pager.db_file.sync(Completion::new_sync(move |_| { - tracing::trace!("db file syncd after truncating"); - }))?; - Ok(()) - } } impl WalFileShared { From 8ca37b71b6146964411c335de67c4b02a10d4209 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 15:05:24 -0400 Subject: [PATCH 25/62] Ensure we properly hold and release read locks in log restart method and fix tests --- core/storage/wal.rs | 73 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 7fff12b7a..8f0879339 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1076,7 +1076,7 @@ impl Wal for WalFile { syncing.set(false); }); let shared = self.get_shared(); - let c = shared.file.sync(completion)?; + let _c = shared.file.sync(completion)?; self.sync_state.set(SyncState::Syncing); Ok(IOResult::IO) } @@ -1279,28 +1279,51 @@ impl WalFile { for idx in 1..shared.read_locks.len() { let lock = &mut shared.read_locks[idx]; if !lock.write() { + // release everything we got so far + for j in 1..idx { + shared.read_locks[j].unlock(); + } // Reader is active, cannot proceed return Err(LimboError::Busy); } lock.value.store(READMARK_NOT_USED, Ordering::SeqCst); - lock.unlock(); } } + let handle_err = |e: &LimboError| { + // release all read locks we just acquired, the caller will take care of the others + let shared = self.get_shared(); + for idx in 1..shared.read_locks.len() { + shared.read_locks[idx].unlock(); + } + tracing::error!( + "Failed to restart WAL header: {:?}, releasing read locks", + e + ); + }; // reinitialize in‑memory state - ensure_unlock!(self, self.get_shared().restart_wal_header(&self.io, mode)); + self.get_shared() + .restart_wal_header(&self.io, mode) + .inspect_err(|e| { + 
handle_err(e); + })?; // For TRUNCATE: physically shrink the WAL to 0 B if matches!(mode, CheckpointMode::Truncate) { let c = Completion::new_trunc(|_| { tracing::trace!("WAL file truncated to 0 B"); }); - ensure_unlock!(self, self.get_shared().file.truncate(0, c)); + let shared = self.get_shared(); + shared.file.truncate(0, c.clone()).inspect_err(|e| { + handle_err(e); + })?; let c = Completion::new_sync(|_| { tracing::trace!("WAL file synced after truncation"); }); // fsync after truncation - self.get_shared().file.sync(c.into())?; + shared.file.sync(c).inspect_err(|e| { + handle_err(e); + })?; } // release read‑locks 1..4 @@ -1485,8 +1508,8 @@ impl WalFileShared { pub mod test { use crate::{ result::LimboResult, storage::sqlite3_ondisk::WAL_HEADER_SIZE, types::IOResult, - CheckpointMode, CheckpointResult, Completion, Connection, Database, PlatformIO, Wal, - WalFileShared, IO, + CheckpointMode, CheckpointResult, Completion, Connection, Database, LimboError, PlatformIO, + Wal, WalFileShared, IO, }; use std::{ cell::{Cell, RefCell, UnsafeCell}, @@ -1534,7 +1557,7 @@ pub mod test { } #[test] - fn test_truncate_checkpoint() { + fn test_wal_truncate_checkpoint() { let (db, path) = get_database(); let mut walpath = path.clone().into_os_string().into_string().unwrap(); walpath.push_str("/test.db-wal"); @@ -1620,7 +1643,7 @@ pub mod test { } #[test] - fn test_full_checkpoint_mode() { + fn test_wal_full_checkpoint_mode() { let (db, path) = get_database(); let conn = db.connect().unwrap(); conn.execute("create table test(id integer primary key, value text)") @@ -1727,7 +1750,7 @@ pub mod test { } #[test] - fn test_restart_checkpoint_resets_sequence() { + fn test_wal_restart_checkpoint_resets_sequence() { let (db, path) = get_database(); let mut walpath = path.clone().into_os_string().into_string().unwrap(); @@ -1815,7 +1838,7 @@ pub mod test { } #[test] - fn test_passive_partial_then_complete() { + fn test_wal_passive_partial_then_complete() { let (db, _tmp) = 
get_database(); let conn1 = db.connect().unwrap(); let conn2 = db.connect().unwrap(); @@ -1830,7 +1853,7 @@ pub mod test { { let pager = conn2.pager.borrow_mut(); let mut wal2 = pager.wal.borrow_mut(); - assert!(matches!(wal2.begin_read_tx().unwrap(), LimboResult::Ok)); + assert!(matches!(wal2.begin_read_tx().unwrap().0, LimboResult::Ok)); } // generate more frames that the reader will not see. @@ -1861,7 +1884,7 @@ pub mod test { { let pager = conn2.pager.borrow_mut(); let wal2 = pager.wal.borrow_mut(); - wal2.end_read_tx().unwrap(); + wal2.end_read_tx(); } // Second passive checkpoint should finish @@ -1873,4 +1896,28 @@ pub mod test { "Second checkpoint completes remaining frames" ); } + + #[test] + fn test_wal_restart_blocks_readers() { + let (db, _) = get_database(); + let conn1 = db.connect().unwrap(); + let conn2 = db.connect().unwrap(); + + // Start a read transaction + conn2 + .pager + .borrow_mut() + .wal + .borrow_mut() + .begin_read_tx() + .unwrap(); + + // checkpoint should fail + let result = { + let p = conn1.pager.borrow(); + let mut w = p.wal.borrow_mut(); + w.checkpoint(&p, Rc::new(RefCell::new(0)), CheckpointMode::Restart) + }; + assert!(matches!(result, Err(LimboError::Busy))); + } } From eaa6f99fa8d8e9238ac5c146abfd6c59a6ee28e4 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 15:30:26 -0400 Subject: [PATCH 26/62] Hold and ensure release of proper locks if we trunc the db file post-checkpoint --- core/storage/pager.rs | 5 +++-- core/storage/wal.rs | 34 +++++++++++++++++++++++++--------- core/vdbe/execute.rs | 1 + 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 7435ea98d..dcc70290d 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -351,7 +351,7 @@ pub struct Pager { free_page_state: RefCell, } -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Clone)] /// The status of the current cache flush. 
pub enum PagerCommitResult { /// The WAL was written to disk and fsynced. @@ -1297,7 +1297,7 @@ impl Pager { return Ok(CheckpointResult::default()); } - let checkpoint_result = self.io.block(|| { + let mut checkpoint_result = self.io.block(|| { self.wal .borrow_mut() .checkpoint(self, Rc::new(RefCell::new(0)), mode) @@ -1323,6 +1323,7 @@ impl Pager { tracing::trace!("Database file syncd after truncation"); }))?; } + checkpoint_result.release_guard(); } // TODO: only clear cache of things that are really invalidated diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 8f0879339..aaad7182e 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -39,12 +39,14 @@ pub const NO_LOCK: u32 = 0; pub const SHARED_LOCK: u32 = 1; pub const WRITE_LOCK: u32 = 2; -#[derive(Debug, Copy, Clone, Default)] +#[derive(Debug, Clone, Default)] pub struct CheckpointResult { /// number of frames in WAL pub num_wal_frames: u64, /// number of frames moved successfully from WAL to db file after checkpoint pub num_checkpointed_frames: u64, + /// will drop automatically and free any remaining locks we need to hold + maybe_guard: Option, } impl CheckpointResult { @@ -52,11 +54,16 @@ impl CheckpointResult { Self { num_wal_frames: n_frames, num_checkpointed_frames: n_ckpt, + maybe_guard: None, } } + pub const fn everything_backfilled(&self) -> bool { self.num_wal_frames > 0 && self.num_wal_frames == self.num_checkpointed_frames } + pub fn release_guard(&mut self) { + let _ = self.maybe_guard.take(); + } } #[derive(Debug, Copy, Clone, EnumString)] @@ -502,6 +509,7 @@ impl fmt::Debug for WalFileShared { } } +#[derive(Clone, Debug)] enum CheckpointGuard { Writer { ptr: Arc> }, Read0 { ptr: Arc> }, @@ -912,7 +920,8 @@ impl Wal for WalFile { if shared_max <= nbackfills { // if there's nothing to do and we are fully back-filled, to match sqlite // we return the previous number of backfilled pages from last checkpoint. 
- return Ok(IOResult::Done(self.prev_checkpoint)); + self.prev_checkpoint.release_guard(); // just in case we are still holding + return Ok(IOResult::Done(self.prev_checkpoint.clone())); } let shared = self.get_shared(); let busy = !shared.checkpoint_lock.write(); @@ -1026,9 +1035,10 @@ impl Wal for WalFile { let max_frame = shared.max_frame.load(Ordering::SeqCst); let nbackfills = shared.nbackfills.load(Ordering::SeqCst); - let (checkpoint_result, everything_backfilled) = { + let (mut checkpoint_result, everything_backfilled) = { // Record two num pages fields to return as checkpoint result to caller. // Ref: pnLog, pnCkpt on https://www.sqlite.org/c3ref/wal_checkpoint_v2.html + let frames_in_wal = max_frame.saturating_sub(nbackfills); let frames_checkpointed = self .ongoing_checkpoint @@ -1036,8 +1046,7 @@ impl Wal for WalFile { .saturating_sub(self.ongoing_checkpoint.min_frame - 1); let checkpoint_result = CheckpointResult::new(frames_in_wal, frames_checkpointed); - let everything_backfilled = shared.max_frame.load(Ordering::SeqCst) - == self.ongoing_checkpoint.max_frame; + let everything_backfilled = max_frame == self.ongoing_checkpoint.max_frame; (checkpoint_result, everything_backfilled) }; // we will just overwrite nbackfills with 0 if we are resetting @@ -1056,8 +1065,15 @@ impl Wal for WalFile { // a. the max frame == num wal frames // b. the db file size != num of db pages * page_size // and truncate the db file if necessary. 
+ if checkpoint_result.everything_backfilled() { + // temporarily hold the locks in the case that the db file must be truncated + checkpoint_result.maybe_guard = self.checkpoint_guard.take(); + } else { + // we can drop them now + let _ = self.checkpoint_guard.take(); + } + self.prev_checkpoint = checkpoint_result.clone(); self.ongoing_checkpoint.state = CheckpointState::Start; - let _ = self.checkpoint_guard.take(); return Ok(IOResult::Done(checkpoint_result)); } } @@ -1233,8 +1249,7 @@ impl WalFile { self.syncing.set(false); } - // coordinate among many readers what the maximum safe frame is for us - // to backfill when checkpointing. + // Coordinate what the maximum safe frame is for us to backfill when checkpointing. fn determine_max_safe_checkpoint_frame(&self) -> u64 { let shared = self.get_shared(); let mut max_safe_frame = shared.max_frame.load(Ordering::SeqCst); @@ -1307,7 +1322,8 @@ impl WalFile { .inspect_err(|e| { handle_err(e); })?; - // For TRUNCATE: physically shrink the WAL to 0 B + + // For TRUNCATE mode: shrink the WAL file to 0 B if matches!(mode, CheckpointMode::Truncate) { let c = Completion::new_trunc(|_| { tracing::trace!("WAL file truncated to 0 B"); diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 4f1a2b617..85ed17698 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -336,6 +336,7 @@ pub fn op_checkpoint( Ok(CheckpointResult { num_wal_frames: num_wal_pages, num_checkpointed_frames: num_checkpointed_pages, + .. }) => { // https://sqlite.org/pragma.html#pragma_wal_checkpoint // 1st col: 1 (checkpoint SQLITE_BUSY) or 0 (not busy). 
From b214c3dfc87f26612a238edb2fe2e53d15efe636 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 19:02:06 -0400 Subject: [PATCH 27/62] Add diff chkpt modes to sqlite3 api, finish checkpoint logic and add tests --- core/storage/wal.rs | 361 +++++++++++++++++++++++++++++++++++++++----- sqlite3/src/lib.rs | 26 +++- 2 files changed, 344 insertions(+), 43 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index aaad7182e..e152ad375 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -483,9 +483,6 @@ pub struct WalFileShared { /// value. There is a limited amount because and unbounded amount of connections could be /// fatal. Therefore, for now we copy how SQLite behaves with limited amounts of read max /// frames that is equal to 5. - /// read_locks[0] is the exclusive read lock that is always 0 except during a checkpoint where - /// the log is restarted. read_locks[1] is the 'default reader' slot that always contains the - /// current max_frame. pub read_locks: [LimboRwLock; 5], /// There is only one write allowed in WAL mode. This lock takes care of ensuring there is only /// one used. @@ -510,6 +507,8 @@ impl fmt::Debug for WalFileShared { } #[derive(Clone, Debug)] +/// An RAII guard to ensure that no locks are leaked during checkpointing in +/// the case of errors. 
enum CheckpointGuard { Writer { ptr: Arc> }, Read0 { ptr: Arc> }, @@ -520,7 +519,7 @@ impl CheckpointGuard { let shared = &mut unsafe { &mut *ptr.get() }; if !shared.checkpoint_lock.write() { tracing::trace!("CheckpointGuard::new: checkpoint lock failed, returning Busy"); - // exclusive lock on checkpoint lock + // we hold the exclusive checkpoint lock no matter which mode for the duration return Err(LimboError::Busy); } match mode { @@ -529,12 +528,13 @@ impl CheckpointGuard { if !read0.write() { shared.checkpoint_lock.unlock(); tracing::trace!("CheckpointGuard::new: read0 lock failed, returning Busy"); - // exclusive lock on slot‑0 + // for passive and full we need to hold the read0 lock return Err(LimboError::Busy); } Ok(Self::Read0 { ptr }) } CheckpointMode::Restart | CheckpointMode::Truncate => { + // if we are resetting the log we must hold the write lock for the duration. if !shared.write_lock.write() { shared.checkpoint_lock.unlock(); tracing::trace!("CheckpointGuard::new: read0 lock failed, returning Busy"); @@ -545,14 +545,13 @@ impl CheckpointGuard { } } } - -/// Small macro to remove the writer lock in the event that a checkpoint errors out and would +/// macro to remove the writer lock in the event that a checkpoint errors out and would /// otherwise leak the lock. Only used during checkpointing. macro_rules! 
ensure_unlock { ($self:ident, $fun:expr) => { $fun.inspect_err(|e| { - tracing::trace!( - "CheckpointGuard::ensure_unlock: error occurred, releaseing held locks: {e}" + tracing::error!( + "CheckpointGuard::ensure_unlock: error occurred, releasing held locks: {e}" ); let _ = $self.checkpoint_guard.take(); })?; @@ -579,9 +578,23 @@ impl Wal for WalFile { #[instrument(skip_all, level = Level::DEBUG)] fn begin_read_tx(&mut self) -> Result<(LimboResult, bool)> { let max_frame_in_wal = self.get_shared().max_frame.load(Ordering::SeqCst); + let nbackfills = self.get_shared().nbackfills.load(Ordering::SeqCst); let db_has_changed = max_frame_in_wal > self.max_frame; + // Check if we can use slot 0 (bypass WAL) + if max_frame_in_wal == nbackfills { + let shared = self.get_shared(); + let lock = &mut shared.read_locks[0]; + // if wal is fully checkpointed, we can use read mark 0 to read directly + // from the db file + if lock.read() { + self.max_frame_read_lock_index = 0; + self.max_frame = max_frame_in_wal; + self.min_frame = max_frame_in_wal + 1; // Ignore WAL; + return Ok((LimboResult::Ok, db_has_changed)); + } + } let mut max_read_mark = 0; let mut max_read_mark_index = -1; // Find the largest mark we can find, ignore frames that are impossible to be in range and @@ -654,9 +667,8 @@ impl Wal for WalFile { /// Begin a write transaction #[instrument(skip_all, level = Level::DEBUG)] fn begin_write_tx(&mut self) -> Result { - let busy = !self.get_shared().write_lock.write(); - tracing::debug!("begin_write_transaction(busy={})", busy); - if busy { + if !self.get_shared().write_lock.write() { + tracing::debug!("begin_write_transaction(busy=true)"); return Ok(LimboResult::Busy); } // If the max frame is not the same as the one in the shared state, it means another @@ -917,22 +929,17 @@ impl Wal for WalFile { shared.nbackfills.load(Ordering::SeqCst), ) }; - if shared_max <= nbackfills { - // if there's nothing to do and we are fully back-filled, to match sqlite - // we return the 
previous number of backfilled pages from last checkpoint. + let needs_backfill = shared_max > nbackfills; + if !needs_backfill + && matches!(mode, CheckpointMode::Passive | CheckpointMode::Full) + { + // there are no frames to copy and we don't need to reset the log so we can + // return early success. self.prev_checkpoint.release_guard(); // just in case we are still holding return Ok(IOResult::Done(self.prev_checkpoint.clone())); } - let shared = self.get_shared(); - let busy = !shared.checkpoint_lock.write(); - if busy { - return Err(LimboError::Busy); - } - // acquire either the read0 or write lock depending on the checkpoint mode - if self.checkpoint_guard.is_none() { - let guard = CheckpointGuard::new(self.shared.clone(), mode)?; - self.checkpoint_guard = Some(guard); - } + // acquire the appropriate locks depending on the checkpoint mode + self.acquire_proper_checkpoint_guard(mode)?; self.ongoing_checkpoint.max_frame = self.determine_max_safe_checkpoint_frame(); self.ongoing_checkpoint.min_frame = nbackfills + 1; self.ongoing_checkpoint.current_page = 0; @@ -1058,7 +1065,7 @@ impl Wal for WalFile { { ensure_unlock!(self, self.restart_log(mode)); } - self.prev_checkpoint = checkpoint_result; + self.prev_checkpoint = checkpoint_result.clone(); // we cannot truncate the db file here, because we are currently inside a // mut borrow of pager.wal, and accessing the header will attempt a borrow, // so the caller will determine if: @@ -1066,13 +1073,10 @@ impl Wal for WalFile { // b. the db file size != num of db pages * page_size // and truncate the db file if necessary. 
if checkpoint_result.everything_backfilled() { - // temporarily hold the locks in the case that the db file must be truncated checkpoint_result.maybe_guard = self.checkpoint_guard.take(); } else { - // we can drop them now let _ = self.checkpoint_guard.take(); } - self.prev_checkpoint = checkpoint_result.clone(); self.ongoing_checkpoint.state = CheckpointState::Start; return Ok(IOResult::Done(checkpoint_result)); } @@ -1285,7 +1289,8 @@ impl WalFile { ); turso_assert!( matches!(self.checkpoint_guard, Some(CheckpointGuard::Writer { .. })), - "We must hold writer and checkpoint locks to restart the log" + "We must hold writer and checkpoint locks to restart the log, found: {:?}", + self.checkpoint_guard ); tracing::info!("restart_log(mode={mode:?})"); { @@ -1350,10 +1355,33 @@ impl WalFile { } } + self.last_checksum = self.get_shared().last_checksum; self.max_frame = 0; self.min_frame = 0; Ok(()) } + + fn acquire_proper_checkpoint_guard(&mut self, mode: CheckpointMode) -> Result<()> { + let needs_new_guard = !matches!( + (&self.checkpoint_guard, mode), + ( + Some(CheckpointGuard::Read0 { .. }), + CheckpointMode::Passive | CheckpointMode::Full, + ) | ( + Some(CheckpointGuard::Writer { .. 
}), + CheckpointMode::Restart | CheckpointMode::Truncate, + ), + ); + if needs_new_guard { + // Drop any existing guard + if self.checkpoint_guard.is_some() { + let _ = self.checkpoint_guard.take(); + } + let guard = CheckpointGuard::new(self.shared.clone(), mode)?; + self.checkpoint_guard = Some(guard); + } + Ok(()) + } } impl WalFileShared { @@ -1523,7 +1551,9 @@ impl WalFileShared { #[cfg(test)] pub mod test { use crate::{ - result::LimboResult, storage::sqlite3_ondisk::WAL_HEADER_SIZE, types::IOResult, + result::LimboResult, + storage::{sqlite3_ondisk::WAL_HEADER_SIZE, wal::READMARK_NOT_USED}, + types::IOResult, CheckpointMode, CheckpointResult, Completion, Connection, Database, LimboError, PlatformIO, Wal, WalFileShared, IO, }; @@ -1928,12 +1958,269 @@ pub mod test { .begin_read_tx() .unwrap(); - // checkpoint should fail - let result = { - let p = conn1.pager.borrow(); - let mut w = p.wal.borrow_mut(); - w.checkpoint(&p, Rc::new(RefCell::new(0)), CheckpointMode::Restart) + // checkpoint should succeed here because the wal is fully checkpointed (empty) + // so the reader is using readmark0 to read directly from the db file. 
+ let p = conn1.pager.borrow(); + let mut w = p.wal.borrow_mut(); + loop { + match w.checkpoint(&p, Rc::new(RefCell::new(0)), CheckpointMode::Restart) { + Ok(IOResult::IO) => { + conn1.run_once().unwrap(); + } + Ok(IOResult::Done(result)) => { + assert_eq!( + result.num_checkpointed_frames, 0, + "Should not checkpoint any frames (empty)" + ); + break; + } + Err(e) => { + panic!("Checkpoint failed: {e}"); + } + } + } + drop(w); + conn2.pager.borrow_mut().end_read_tx().unwrap(); + + conn1 + .execute("create table test(id integer primary key, value text)") + .unwrap(); + for i in 0..10 { + conn1 + .execute(format!("insert into test(value) values ('value{i}')")) + .unwrap(); + } + // now that we have some frames to checkpoint, try again + conn2.pager.borrow_mut().begin_read_tx().unwrap(); + let p = conn1.pager.borrow(); + let mut w = p.wal.borrow_mut(); + loop { + match w.checkpoint(&p, Rc::new(RefCell::new(0)), CheckpointMode::Restart) { + Ok(IOResult::IO) => { + conn1.run_once().unwrap(); + } + Ok(IOResult::Done(_)) => { + panic!("Checkpoint should not have succeeded"); + } + Err(e) => { + assert!(matches!(e, LimboError::Busy), "should block readers"); + break; + } + } + } + } + + #[test] + fn test_wal_read_lock_slot_0_optimization() { + let (db, _path) = get_database(); + let conn = db.connect().unwrap(); + + conn.execute("create table test(id integer primary key, value text)") + .unwrap(); + conn.execute("insert into test(value) values ('initial')") + .unwrap(); + + // Checkpoint everything + { + let pager = conn.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + let result = run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Full); + assert!(result.everything_backfilled()); + } + + // Now start a read transaction - should use slot 0 + let conn2 = db.connect().unwrap(); + { + let pager = conn2.pager.borrow_mut(); + let mut wal = pager.wal.borrow_mut(); + assert!(matches!(wal.begin_read_tx().unwrap(), LimboResult::Ok)); + + // Verify it's using slot 
0 and max_frame is 0 + assert_eq!( + wal.get_max_frame(), + 0, + "Should ignore WAL when using slot 0" + ); + assert_eq!(wal.get_min_frame(), 0); + + // Verify the read lock index + let wal_file = &*wal as *const dyn Wal as *const crate::WalFile; + let wal_file = unsafe { &*wal_file }; + assert_eq!( + wal_file.max_frame_read_lock_index, 0, + "Should be using slot 0" + ); + } + + // Another reader should also be able to use slot 0 (shared lock) + let conn3 = db.connect().unwrap(); + { + let pager = conn3.pager.borrow_mut(); + let mut wal = pager.wal.borrow_mut(); + assert!(matches!(wal.begin_read_tx().unwrap(), LimboResult::Ok)); + } + } + + #[test] + fn test_wal_read_marks_after_restart() { + let (db, _path) = get_database(); + let wal_shared = db.maybe_shared_wal.read().as_ref().unwrap().clone(); + + let conn = db.connect().unwrap(); + conn.execute("create table test(id integer primary key, value text)") + .unwrap(); + bulk_inserts(&conn, 10, 5); + // Checkpoint with restart + { + let pager = conn.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + let result = run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Restart); + assert!(result.everything_backfilled()); + } + + // Verify read marks after restart + let read_marks_after: Vec<_> = unsafe { + let s = &*wal_shared.get(); + (0..5) + .map(|i| s.read_locks[i].value.load(Ordering::SeqCst)) + .collect() }; - assert!(matches!(result, Err(LimboError::Busy))); + + assert_eq!(read_marks_after[0], 0, "Slot 0 should remain 0"); + assert_eq!( + read_marks_after[1], 0, + "Slot 1 (default reader) should be reset to 0" + ); + for i in 2..5 { + assert_eq!( + read_marks_after[i], READMARK_NOT_USED, + "Slot {i} should be READMARK_NOT_USED after restart", + ); + } + } + + #[test] + fn test_wal_concurrent_readers_during_checkpoint() { + let (db, _path) = get_database(); + let conn_writer = db.connect().unwrap(); + + conn_writer + .execute("create table test(id integer primary key, value text)") + .unwrap(); + 
bulk_inserts(&conn_writer, 5, 10); + + // Start multiple readers at different points + let conn_r1 = db.connect().unwrap(); + let conn_r2 = db.connect().unwrap(); + + // R1 starts reading + { + let pager = conn_r1.pager.borrow_mut(); + let mut wal = pager.wal.borrow_mut(); + assert!(matches!(wal.begin_read_tx().unwrap(), LimboResult::Ok)); + } + bulk_inserts(&conn_writer, 5, 10); + + // R2 starts reading, sees more frames than R1 + let r2_max_frame = { + let pager = conn_r2.pager.borrow_mut(); + let mut wal = pager.wal.borrow_mut(); + assert!(matches!(wal.begin_read_tx().unwrap(), LimboResult::Ok)); + wal.get_max_frame() + }; + + // try passive checkpoint, should only checkpoint up to R1's position + let checkpoint_result = { + let pager = conn_writer.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Passive) + }; + + assert!( + checkpoint_result.num_checkpointed_frames < checkpoint_result.num_wal_frames, + "Should not checkpoint all frames when readers are active" + ); + + // Verify R2 still sees its frames + assert_eq!( + conn_r2.pager.borrow().wal.borrow().get_max_frame(), + r2_max_frame, + "Reader should maintain its snapshot" + ); + } + + #[test] + fn test_wal_checkpoint_updates_read_marks() { + let (db, _path) = get_database(); + let wal_shared = db.maybe_shared_wal.read().as_ref().unwrap().clone(); + + let conn = db.connect().unwrap(); + conn.execute("create table test(id integer primary key, value text)") + .unwrap(); + bulk_inserts(&conn, 10, 5); + + // get max frame before checkpoint + let max_frame_before = unsafe { (*wal_shared.get()).max_frame.load(Ordering::SeqCst) }; + + { + let pager = conn.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + let _result = run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Passive); + } + + // check that read mark 1 (default reader) was updated to max_frame + let read_mark_1 = unsafe { + (*wal_shared.get()).read_locks[1] + .value + 
.load(Ordering::SeqCst) + }; + + assert_eq!( + read_mark_1 as u64, max_frame_before, + "Read mark 1 should be updated to max frame during checkpoint" + ); + } + + #[test] + fn test_wal_writer_blocks_restart_checkpoint() { + let (db, _path) = get_database(); + let conn1 = db.connect().unwrap(); + let conn2 = db.connect().unwrap(); + + conn1 + .execute("create table test(id integer primary key, value text)") + .unwrap(); + bulk_inserts(&conn1, 5, 5); + + // start a write transaction + { + let pager = conn2.pager.borrow_mut(); + let mut wal = pager.wal.borrow_mut(); + assert!(matches!(wal.begin_write_tx().unwrap(), LimboResult::Ok)); + } + + // should fail because writer lock is held + let result = { + let pager = conn1.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + wal.checkpoint(&pager, Rc::new(RefCell::new(0)), CheckpointMode::Restart) + }; + + assert!( + matches!(result, Err(LimboError::Busy)), + "Restart checkpoint should fail when write lock is held" + ); + + // release write lock + conn2.pager.borrow().wal.borrow().end_write_tx().unwrap(); + + // now restart should succeed + let result = { + let pager = conn1.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Restart) + }; + + assert!(result.everything_backfilled()); } } diff --git a/sqlite3/src/lib.rs b/sqlite3/src/lib.rs index e216ccb5c..2fa5419b6 100644 --- a/sqlite3/src/lib.rs +++ b/sqlite3/src/lib.rs @@ -1109,15 +1109,14 @@ pub unsafe extern "C" fn sqlite3_wal_checkpoint_v2( db: *mut sqlite3, _db_name: *const ffi::c_char, mode: ffi::c_int, - _log_size: *mut ffi::c_int, - _checkpoint_count: *mut ffi::c_int, + log_size: *mut ffi::c_int, + checkpoint_count: *mut ffi::c_int, ) -> ffi::c_int { if db.is_null() { return SQLITE_MISUSE; } let db: &mut sqlite3 = &mut *db; let db = db.inner.lock().unwrap(); - // TODO: Checkpointing modes and reporting back log size and checkpoint count to caller. 
let chkptmode = match mode { SQLITE_CHECKPOINT_PASSIVE => CheckpointMode::Passive, SQLITE_CHECKPOINT_RESTART => CheckpointMode::Restart, @@ -1125,10 +1124,25 @@ pub unsafe extern "C" fn sqlite3_wal_checkpoint_v2( SQLITE_CHECKPOINT_FULL => CheckpointMode::Full, _ => return SQLITE_MISUSE, // Unsupported mode }; - if db.conn.checkpoint(chkptmode).is_err() { - return SQLITE_ERROR; + match db.conn.checkpoint(chkptmode) { + Ok(res) => { + if !log_size.is_null() { + (*log_size) = res.num_wal_frames as ffi::c_int; + } + if !checkpoint_count.is_null() { + (*checkpoint_count) = res.num_checkpointed_frames as ffi::c_int; + } + SQLITE_OK + } + Err(e) => { + println!("Checkpoint error: {e}"); + if matches!(e, turso_core::LimboError::Busy) { + SQLITE_BUSY + } else { + SQLITE_ERROR + } + } } - SQLITE_OK } /// Get the number of frames in the WAL. From 49f90980d470b92b0dd992ca46285bfa7981049d Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 19:36:23 -0400 Subject: [PATCH 28/62] Create new header after truncation chkpt --- core/storage/wal.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index e152ad375..30dc3b4ca 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1337,7 +1337,12 @@ impl WalFile { shared.file.truncate(0, c.clone()).inspect_err(|e| { handle_err(e); })?; - + let hdr = shared.wal_header.lock(); + // sqlite just lets the next writer create it when the first frame is written. + // we can write the new header here for simplicity. 
+ sqlite3_ondisk::begin_write_wal_header(&shared.file, &hdr).inspect_err(|e| { + handle_err(e); + })?; let c = Completion::new_sync(|_| { tracing::trace!("WAL file synced after truncation"); }); From 3db72cf1119fa21bf9f6b18f2bcc5ad4e35f0563 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 20:45:23 -0400 Subject: [PATCH 29/62] Just forget Full checkpoint mode for now, comment out compat test --- core/storage/wal.rs | 250 +++--------------- sqlite3/tests/compat/mod.rs | 21 +- .../query_processing/test_write_path.rs | 34 --- 3 files changed, 44 insertions(+), 261 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 30dc3b4ca..e9ad03cdb 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -482,7 +482,7 @@ pub struct WalFileShared { /// read_locks is a list of read locks that can coexist with the max_frame number stored in /// value. There is a limited amount because and unbounded amount of connections could be /// fatal. Therefore, for now we copy how SQLite behaves with limited amounts of read max - /// frames that is equal to 5. + /// frames that is equal to 5 pub read_locks: [LimboRwLock; 5], /// There is only one write allowed in WAL mode. This lock takes care of ensuring there is only /// one used. 
@@ -523,7 +523,7 @@ impl CheckpointGuard { return Err(LimboError::Busy); } match mode { - CheckpointMode::Passive | CheckpointMode::Full => { + CheckpointMode::Passive => { let read0 = &mut shared.read_locks[0]; if !read0.write() { shared.checkpoint_lock.unlock(); @@ -542,6 +542,7 @@ impl CheckpointGuard { } Ok(Self::Writer { ptr }) } + _ => todo!("not implemented yet"), } } } @@ -912,6 +913,11 @@ impl Wal for WalFile { write_counter: Rc>, mode: CheckpointMode, ) -> Result> { + if matches!(mode, CheckpointMode::Full) { + return Err(LimboError::InternalError( + "Full checkpoint mode is not implemented yet".into(), + )); + } 'checkpoint_loop: loop { let state = self.ongoing_checkpoint.state; tracing::debug!(?state); @@ -930,9 +936,7 @@ impl Wal for WalFile { ) }; let needs_backfill = shared_max > nbackfills; - if !needs_backfill - && matches!(mode, CheckpointMode::Passive | CheckpointMode::Full) - { + if !needs_backfill && matches!(mode, CheckpointMode::Passive) { // there are no frames to copy and we don't need to reset the log so we can // return early success. self.prev_checkpoint.release_guard(); // just in case we are still holding @@ -1029,9 +1033,9 @@ impl Wal for WalFile { self.ongoing_checkpoint.state = CheckpointState::Done; } } - // All eligible frames copied to the db file. - // Update nBackfills. - // In Reset or Truncate mode, we need to restart and possibly truncate the log. + // All eligible frames copied to the db file + // Update nBackfills + // In Restart or Truncate mode, we need to restart the log over and possibly truncate the file // Release all locks and return the current num of wal frames and the amount we backfilled CheckpointState::Done => { if *write_counter.borrow() > 0 { @@ -1369,13 +1373,11 @@ impl WalFile { fn acquire_proper_checkpoint_guard(&mut self, mode: CheckpointMode) -> Result<()> { let needs_new_guard = !matches!( (&self.checkpoint_guard, mode), - ( - Some(CheckpointGuard::Read0 { .. 
}), - CheckpointMode::Passive | CheckpointMode::Full, - ) | ( - Some(CheckpointGuard::Writer { .. }), - CheckpointMode::Restart | CheckpointMode::Truncate, - ), + (Some(CheckpointGuard::Read0 { .. }), CheckpointMode::Passive,) + | ( + Some(CheckpointGuard::Writer { .. }), + CheckpointMode::Restart | CheckpointMode::Truncate, + ), ); if needs_new_guard { // Drop any existing guard @@ -1564,7 +1566,6 @@ pub mod test { }; use std::{ cell::{Cell, RefCell, UnsafeCell}, - os::unix::fs::MetadataExt, rc::Rc, sync::{atomic::Ordering, Arc}, }; @@ -1626,10 +1627,7 @@ pub mod test { let stat = std::fs::metadata(&walpath).unwrap(); let meta_before = std::fs::metadata(&walpath).unwrap(); - #[cfg(not(unix))] let bytes_before = meta_before.len(); - #[cfg(unix)] - let blocks_before = meta_before.blocks(); run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Truncate); drop(wal); @@ -1637,32 +1635,15 @@ pub mod test { tracing::info!("wal filepath: {walpath:?}, size: {}", stat.len()); let meta_after = std::fs::metadata(&walpath).unwrap(); - - #[cfg(unix)] - { - let blocks_after = meta_after.blocks(); - assert_ne!( - blocks_before, blocks_after, - "WAL file should not have been empty before checkpoint" - ); - assert_eq!( - blocks_after, 0, - "WAL file should be truncated to 0 bytes, but is {blocks_after} blocks", - ); - } - #[cfg(not(unix))] - { - let bytes_after = meta_after.len(); - assert_ne!( - bytes_before, bytes_after, - "WAL file should not have been empty before checkpoint" - ); - // On Windows, we check the size in bytes - assert_eq!( - bytes_after, 0, - "WAL file should be truncated to 0 bytes, but is {bytes_after} bytes", - ); - } + let bytes_after = meta_after.len(); + assert_ne!( + bytes_before, bytes_after, + "WAL file should not have been empty before checkpoint" + ); + assert_eq!( + bytes_after, 32, + "WAL file should be truncated to 0 bytes, but is {bytes_after} bytes", + ); std::fs::remove_dir_all(path).unwrap(); } @@ -1693,105 +1674,6 @@ pub mod test { } } - 
#[test] - fn test_wal_full_checkpoint_mode() { - let (db, path) = get_database(); - let conn = db.connect().unwrap(); - conn.execute("create table test(id integer primary key, value text)") - .unwrap(); - let mut walpath = path.clone().into_os_string().into_string().unwrap(); - walpath.push_str("/test.db-wal"); - let walpath = std::path::PathBuf::from(walpath); - - // Produce multiple WAL frames. - bulk_inserts(&conn, 25, 2); - conn.pager.borrow_mut().cacheflush().unwrap(); - - let wal_shared = db.maybe_shared_wal.read().as_ref().unwrap().clone(); - let (before_max, before_backfills) = unsafe { - let s = &*wal_shared.get(); - ( - s.max_frame.load(Ordering::SeqCst), - s.nbackfills.load(Ordering::SeqCst), - ) - }; - assert!(before_max > 0); - assert_eq!(before_backfills, 0); - - let meta_before = std::fs::metadata(&walpath).unwrap(); - #[cfg(not(unix))] - let bytes_before = meta_before.len(); - #[cfg(unix)] - let blocks_before = meta_before.blocks(); - - // Run FULL checkpoint. - { - let pager_ref = conn.pager.borrow(); - let mut wal = pager_ref.wal.borrow_mut(); - let result = run_checkpoint_until_done(&mut *wal, &pager_ref, CheckpointMode::Full); - assert_eq!(result.num_wal_frames, before_max); - assert_eq!(result.num_checkpointed_frames, before_max); - } - - // Validate state after FULL: max_frame unchanged; nbackfills == max_frame; not restarted. 
- let (after_max, after_backfills, read_mark0) = unsafe { - let s = &*wal_shared.get(); - ( - s.max_frame.load(Ordering::SeqCst), - s.nbackfills.load(Ordering::SeqCst), - s.read_locks[0].value.load(Ordering::SeqCst), - ) - }; - assert_eq!(after_max, before_max, "Full should not reset WAL sequence"); - assert_eq!(after_backfills, after_max); - assert_eq!( - read_mark0, 0, - "Restart was NOT performed, read mark 0 stays 0" - ); - let meta_after = std::fs::metadata(&walpath).unwrap(); - #[cfg(unix)] - { - let blocks_after = meta_after.blocks(); - assert_eq!( - blocks_before, blocks_after, - "WAL file should not change size after full checkpoint" - ); - } - #[cfg(not(unix))] - { - let bytes_after = meta_after.len(); - assert_eq!( - bytes_before, bytes_after, - "WAL file should not change size after full checkpoint" - ); - } - - // Append another transaction -> frame id should become previous_max + 1 - let prev_max = after_max; - conn.execute("insert into test(value) values ('after_full')") - .unwrap(); - conn.pager - .borrow_mut() - .wal - .borrow_mut() - .finish_append_frames_commit() - .unwrap(); - - let (new_max, _) = unsafe { - let s = &*wal_shared.get(); - ( - s.max_frame.load(Ordering::SeqCst), - s.nbackfills.load(Ordering::SeqCst), - ) - }; - assert_eq!( - new_max, - prev_max + 1, - "WAL should continue appending not restart" - ); - std::fs::remove_dir_all(path).unwrap(); - } - fn wal_header_snapshot(shared: &Arc>) -> (u32, u32, u32, u32) { // (checkpoint_seq, salt1, salt2, page_size) unsafe { @@ -1828,11 +1710,7 @@ pub mod test { let meta_before = std::fs::metadata(&walpath).unwrap(); - #[cfg(not(unix))] let bytes_before = meta_before.len(); - #[cfg(unix)] - let blocks_before = meta_before.blocks(); - { let pager_ref = conn.pager.borrow(); let mut wal = pager_ref.wal.borrow_mut(); @@ -1869,22 +1747,11 @@ pub mod test { assert_eq!(new_max, 1, "Sequence restarted at 1"); let meta_after = std::fs::metadata(&walpath).unwrap(); - #[cfg(unix)] - { - let blocks_after 
= meta_after.blocks(); - assert_eq!( - blocks_before, blocks_after, - "WAL file should not change size after full checkpoint" - ); - } - #[cfg(not(unix))] - { - let bytes_after = meta_after.len(); - assert_eq!( - bytes_before, bytes_after, - "WAL file should not change size after full checkpoint" - ); - } + let bytes_after = meta_after.len(); + assert_eq!( + bytes_before, bytes_after, + "WAL file should not change size after full checkpoint" + ); std::fs::remove_dir_all(path).unwrap(); } @@ -2015,57 +1882,6 @@ pub mod test { } } - #[test] - fn test_wal_read_lock_slot_0_optimization() { - let (db, _path) = get_database(); - let conn = db.connect().unwrap(); - - conn.execute("create table test(id integer primary key, value text)") - .unwrap(); - conn.execute("insert into test(value) values ('initial')") - .unwrap(); - - // Checkpoint everything - { - let pager = conn.pager.borrow(); - let mut wal = pager.wal.borrow_mut(); - let result = run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Full); - assert!(result.everything_backfilled()); - } - - // Now start a read transaction - should use slot 0 - let conn2 = db.connect().unwrap(); - { - let pager = conn2.pager.borrow_mut(); - let mut wal = pager.wal.borrow_mut(); - assert!(matches!(wal.begin_read_tx().unwrap(), LimboResult::Ok)); - - // Verify it's using slot 0 and max_frame is 0 - assert_eq!( - wal.get_max_frame(), - 0, - "Should ignore WAL when using slot 0" - ); - assert_eq!(wal.get_min_frame(), 0); - - // Verify the read lock index - let wal_file = &*wal as *const dyn Wal as *const crate::WalFile; - let wal_file = unsafe { &*wal_file }; - assert_eq!( - wal_file.max_frame_read_lock_index, 0, - "Should be using slot 0" - ); - } - - // Another reader should also be able to use slot 0 (shared lock) - let conn3 = db.connect().unwrap(); - { - let pager = conn3.pager.borrow_mut(); - let mut wal = pager.wal.borrow_mut(); - assert!(matches!(wal.begin_read_tx().unwrap(), LimboResult::Ok)); - } - } - #[test] fn 
test_wal_read_marks_after_restart() { let (db, _path) = get_database(); @@ -2096,9 +1912,9 @@ pub mod test { read_marks_after[1], 0, "Slot 1 (default reader) should be reset to 0" ); - for i in 2..5 { + for (i, item) in read_marks_after.iter().take(5).skip(2).enumerate() { assert_eq!( - read_marks_after[i], READMARK_NOT_USED, + *item, READMARK_NOT_USED, "Slot {i} should be READMARK_NOT_USED after restart", ); } diff --git a/sqlite3/tests/compat/mod.rs b/sqlite3/tests/compat/mod.rs index 9b1b1b56d..700fa6910 100644 --- a/sqlite3/tests/compat/mod.rs +++ b/sqlite3/tests/compat/mod.rs @@ -167,16 +167,17 @@ mod tests { SQLITE_OK ); - assert_eq!( - sqlite3_wal_checkpoint_v2( - db, - ptr::null(), - SQLITE_CHECKPOINT_FULL, - &mut log_size, - &mut checkpoint_count - ), - SQLITE_OK - ); + // TODO: uncomment when SQLITE_CHECKPOINT_FULL is supported + // assert_eq!( + // sqlite3_wal_checkpoint_v2( + // db, + // ptr::null(), + // SQLITE_CHECKPOINT_FULL, + // &mut log_size, + // &mut checkpoint_count + // ), + // SQLITE_OK + // ); assert_eq!( sqlite3_wal_checkpoint_v2( diff --git a/tests/integration/query_processing/test_write_path.rs b/tests/integration/query_processing/test_write_path.rs index 310acb9ed..fe2e189dd 100644 --- a/tests/integration/query_processing/test_write_path.rs +++ b/tests/integration/query_processing/test_write_path.rs @@ -303,40 +303,6 @@ fn test_wal_checkpoint() -> anyhow::Result<()> { Ok(()) } -#[test] -#[ignore] -fn test_wal_checkpoint_truncate() -> anyhow::Result<()> { - let _ = env_logger::try_init(); - let tmp_db = - TempDatabase::new_with_rusqlite("CREATE TABLE test (x INTEGER PRIMARY KEY);", false); - let iterations = 100_usize; - let conn = tmp_db.connect_limbo(); - - for i in 0..iterations { - let insert_query = format!("INSERT INTO test VALUES ({i})"); - do_flush(&conn, &tmp_db)?; - run_query(&tmp_db, &conn, &insert_query)?; - } - conn.checkpoint(CheckpointMode::Truncate)?; - let file = PathBuf::from(format!("{}-wal", 
tmp_db.path.to_string_lossy())); - assert!(file.exists(), "WAL file should exist after checkpoint"); - let size = file.metadata()?.blocks(); - assert_eq!( - size, 0, - "WAL file should be truncated to a small size, found: {size}", - ); - // assert data is all in table - for i in 0..iterations { - let select_query = format!("SELECT * FROM test WHERE x = {i}"); - run_query_on_row(&tmp_db, &conn, &select_query, |row: &Row| { - let id = row.get::(0).unwrap(); - assert_eq!(i as i64, id); - })?; - } - do_flush(&conn, &tmp_db)?; - Ok(()) -} - #[test] fn test_wal_restart() -> anyhow::Result<()> { let _ = env_logger::try_init(); From 5c1dbd1a9ff692d5658f5af5a28c39f69e8d079d Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sat, 19 Jul 2025 21:48:10 -0400 Subject: [PATCH 30/62] Remove unused import --- tests/integration/query_processing/test_write_path.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration/query_processing/test_write_path.rs b/tests/integration/query_processing/test_write_path.rs index fe2e189dd..684627de5 100644 --- a/tests/integration/query_processing/test_write_path.rs +++ b/tests/integration/query_processing/test_write_path.rs @@ -2,8 +2,6 @@ use crate::common::{self, maybe_setup_tracing}; use crate::common::{compare_string, do_flush, TempDatabase}; use log::debug; use std::io::{Read, Seek, Write}; -use std::os::unix::fs::MetadataExt; -use std::path::PathBuf; use std::sync::Arc; use turso_core::{CheckpointMode, Connection, Database, Row, Statement, StepResult, Value}; From 3e7544438843712a01855dc9e0dbb4121fe69797 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Mon, 21 Jul 2025 21:39:42 -0400 Subject: [PATCH 31/62] Remove panic in cacheflush io.block in pager now that checkpoitns can return busy --- core/storage/pager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index dcc70290d..6f36f30be 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -1297,11 
+1297,11 @@ impl Pager { return Ok(CheckpointResult::default()); } + let counter = Rc::new(RefCell::new(0)); let mut checkpoint_result = self.io.block(|| { self.wal .borrow_mut() - .checkpoint(self, Rc::new(RefCell::new(0)), mode) - .map_err(|err| panic!("error while clearing cache {err}")) + .checkpoint(self, counter.clone(), mode) })?; if checkpoint_result.everything_backfilled() { From 1490a586b1797fcb9db6d856cfcab24065246933 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Mon, 21 Jul 2025 21:59:41 -0400 Subject: [PATCH 32/62] Apply suggestions/fixes and add extensive comments to wal chkpt --- core/storage/wal.rs | 559 +++++++++++++++++++++++++++----------------- 1 file changed, 340 insertions(+), 219 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index e9ad03cdb..ad62aa43d 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -70,6 +70,7 @@ impl CheckpointResult { #[strum(ascii_case_insensitive)] pub enum CheckpointMode { /// Checkpoint as many frames as possible without waiting for any database readers or writers to finish, then sync the database file if all frames in the log were checkpointed. + /// Passive never blocks readers or writers, only ensures (like all modes do) that there are no other checkpointers. Passive, /// This mode blocks until there is no database writer and all readers are reading from the most recent database snapshot. It then checkpoints all frames in the log file and syncs the database file. This mode blocks new database writers while it is pending, but new database readers are allowed to continue unimpeded. Full, @@ -459,6 +460,61 @@ impl fmt::Debug for WalFile { } } +/* +* sqlite3/src/wal.c +* +** nBackfill is the number of frames in the WAL that have been written +** back into the database. (We call the act of moving content from WAL to +** database "backfilling".) The nBackfill number is never greater than +** WalIndexHdr.mxFrame. 
nBackfill can only be increased by threads +** holding the WAL_CKPT_LOCK lock (which includes a recovery thread). +** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from +** mxFrame back to zero when the WAL is reset. +** +** nBackfillAttempted is the largest value of nBackfill that a checkpoint +** has attempted to achieve. Normally nBackfill==nBackfillAtempted, however +** the nBackfillAttempted is set before any backfilling is done and the +** nBackfill is only set after all backfilling completes. So if a checkpoint +** crashes, nBackfillAttempted might be larger than nBackfill. The +** WalIndexHdr.mxFrame must never be less than nBackfillAttempted. +** +** The aLock[] field is a set of bytes used for locking. These bytes should +** never be read or written. +** +** There is one entry in aReadMark[] for each reader lock. If a reader +** holds read-lock K, then the value in aReadMark[K] is no greater than +** the mxFrame for that reader. The value READMARK_NOT_USED (0xffffffff) +** for any aReadMark[] means that entry is unused. aReadMark[0] is +** a special case; its value is never used and it exists as a place-holder +** to avoid having to offset aReadMark[] indexes by one. Readers holding +** WAL_READ_LOCK(0) always ignore the entire WAL and read all content +** directly from the database. +** +** The value of aReadMark[K] may only be changed by a thread that +** is holding an exclusive lock on WAL_READ_LOCK(K). Thus, the value of +** aReadMark[K] cannot changed while there is a reader is using that mark +** since the reader will be holding a shared lock on WAL_READ_LOCK(K). +** +** The checkpointer may only transfer frames from WAL to database where +** the frame numbers are less than or equal to every aReadMark[] that is +** in use (that is, every aReadMark[j] for which there is a corresponding +** WAL_READ_LOCK(j)). 
New readers (usually) pick the aReadMark[] with the +** largest value and will increase an unused aReadMark[] to mxFrame if there +** is not already an aReadMark[] equal to mxFrame. The exception to the +** previous sentence is when nBackfill equals mxFrame (meaning that everything +** in the WAL has been backfilled into the database) then new readers +** will choose aReadMark[0] which has value 0 and hence such reader will +** get all their all content directly from the database file and ignore +** the WAL. +** +** Writers normally append new frames to the end of the WAL. However, +** if nBackfill equals mxFrame (meaning that all WAL content has been +** written back into the database) and if no readers are using the WAL +** (in other words, if there are no WAL_READ_LOCK(i) where i>0) then +** the writer will first "reset" the WAL back to the beginning and start +** writing new content beginning at frame 1. +*/ + // TODO(pere): lock only important parts + pin WalFileShared /// WalFileShared is the part of a WAL that will be shared between threads. A wal has information /// that needs to be communicated between threads so this struct does the job. @@ -479,14 +535,19 @@ pub struct WalFileShared { pub pages_in_frames: Arc>>, pub last_checksum: (u32, u32), // Check of last frame in WAL, this is a cumulative checksum over all frames in the WAL pub file: Arc, - /// read_locks is a list of read locks that can coexist with the max_frame number stored in - /// value. There is a limited amount because and unbounded amount of connections could be - /// fatal. Therefore, for now we copy how SQLite behaves with limited amounts of read max - /// frames that is equal to 5 + + /// Read locks advertise the maximum WAL frame a reader may access. + /// Slot 0 is special, when it is held (shared) the reader bypasses the WAL and uses the main DB file. 
+ /// When checkpointing, we must acquire the exclusive read lock 0 to ensure that no readers read + /// from a partially checkpointed db file. + /// Slots 1‑4 carry a frame‑number in value and may be shared by many readers. Slot 1 is the + /// default read lock and is to contain the max_frame in WAL. pub read_locks: [LimboRwLock; 5], /// There is only one write allowed in WAL mode. This lock takes care of ensuring there is only /// one used. pub write_lock: LimboRwLock, + + /// Serialises checkpointer threads, only one checkpoint can be in flight at any time. Blocking and exclusive only pub checkpoint_lock: LimboRwLock, pub loaded: AtomicBool, } @@ -514,6 +575,13 @@ enum CheckpointGuard { Read0 { ptr: Arc> }, } +/// Database checkpointers takes the following locks, in order: +/// The exclusive CHECKPOINTER lock. +/// The exclusive WRITER lock (FULL, RESTART and TRUNCATE only). +/// Exclusive lock on read-mark slots 1-N. These are immediately released after being taken. +/// Exclusive lock on read-mark 0. +/// Exclusive lock on read-mark slots 1-N again. These are immediately released after being taken (RESTART and TRUNCATE only). +/// All of the above use blocking locks. impl CheckpointGuard { fn new(ptr: Arc>, mode: CheckpointMode) -> Result { let shared = &mut unsafe { &mut *ptr.get() }; @@ -523,47 +591,52 @@ impl CheckpointGuard { return Err(LimboError::Busy); } match mode { + CheckpointMode::Full => Err(LimboError::InternalError( + "Full checkpoint mode is not yet supported".into(), + )), + // Passive mode is the only mode not requiring a write lock, as it doesn't block + // readers or writers. It acquires the checkpoint lock to ensure that no other + // concurrent checkpoint happens, and acquires the exclusive read lock 0 + // to ensure that no readers read from a partially checkpointed db file. 
CheckpointMode::Passive => { let read0 = &mut shared.read_locks[0]; if !read0.write() { shared.checkpoint_lock.unlock(); - tracing::trace!("CheckpointGuard::new: read0 lock failed, returning Busy"); + tracing::trace!("CheckpointGuard: read0 lock failed, returning Busy"); // for passive and full we need to hold the read0 lock return Err(LimboError::Busy); } Ok(Self::Read0 { ptr }) } CheckpointMode::Restart | CheckpointMode::Truncate => { + // like all modes, we must acquire an exclusive checkpoint lock and lock on read 0 + // to prevent a reader from reading a partially checkpointed db file. + let read0 = &mut shared.read_locks[0]; + if !read0.write() { + shared.checkpoint_lock.unlock(); + tracing::trace!("CheckpointGuard: read0 lock failed, returning Busy"); + return Err(LimboError::Busy); + } // if we are resetting the log we must hold the write lock for the duration. + // ensures no writer can append frames while we reset the log. if !shared.write_lock.write() { shared.checkpoint_lock.unlock(); - tracing::trace!("CheckpointGuard::new: read0 lock failed, returning Busy"); + read0.unlock(); + tracing::trace!("CheckpointGuard: write lock failed, returning Busy"); return Err(LimboError::Busy); } Ok(Self::Writer { ptr }) } - _ => todo!("not implemented yet"), } } } -/// macro to remove the writer lock in the event that a checkpoint errors out and would -/// otherwise leak the lock. Only used during checkpointing. -macro_rules! 
ensure_unlock { - ($self:ident, $fun:expr) => { - $fun.inspect_err(|e| { - tracing::error!( - "CheckpointGuard::ensure_unlock: error occurred, releasing held locks: {e}" - ); - let _ = $self.checkpoint_guard.take(); - })?; - }; -} impl Drop for CheckpointGuard { fn drop(&mut self) { match self { CheckpointGuard::Writer { ptr: shared } => unsafe { (*shared.get()).write_lock.unlock(); + (*shared.get()).read_locks[0].unlock(); (*shared.get()).checkpoint_lock.unlock(); }, CheckpointGuard::Read0 { ptr: shared } => unsafe { @@ -584,11 +657,11 @@ impl Wal for WalFile { let db_has_changed = max_frame_in_wal > self.max_frame; // Check if we can use slot 0 (bypass WAL) + // If we can obtain a shared lock on slot 0 and max_frame == nBackfills, we know every WAL frame has already + // been copied into the main database file, so we can skip mapping the WAL entirely. if max_frame_in_wal == nbackfills { let shared = self.get_shared(); let lock = &mut shared.read_locks[0]; - // if wal is fully checkpointed, we can use read mark 0 to read directly - // from the db file if lock.read() { self.max_frame_read_lock_index = 0; self.max_frame = max_frame_in_wal; @@ -668,8 +741,9 @@ impl Wal for WalFile { /// Begin a write transaction #[instrument(skip_all, level = Level::DEBUG)] fn begin_write_tx(&mut self) -> Result { - if !self.get_shared().write_lock.write() { - tracing::debug!("begin_write_transaction(busy=true)"); + let busy = !self.get_shared().write_lock.write(); + tracing::debug!("begin_write_transaction(busy={busy})"); + if busy { return Ok(LimboResult::Busy); } // If the max frame is not the same as the one in the shared state, it means another @@ -918,174 +992,10 @@ impl Wal for WalFile { "Full checkpoint mode is not implemented yet".into(), )); } - 'checkpoint_loop: loop { - let state = self.ongoing_checkpoint.state; - tracing::debug!(?state); - match state { - // Acquire the relevant exclusive lock (slot‑0 or WRITER) - // and checkpoint_lock so no other checkpointer can run. 
- // fsync WAL if there are unapplied frames. - // Decide the largest frame we are allowed to back‑fill. - CheckpointState::Start => { - // TODO(pere): check what frames are safe to checkpoint between many readers! - let (shared_max, nbackfills) = { - let shared = self.get_shared(); - ( - shared.max_frame.load(Ordering::SeqCst), - shared.nbackfills.load(Ordering::SeqCst), - ) - }; - let needs_backfill = shared_max > nbackfills; - if !needs_backfill && matches!(mode, CheckpointMode::Passive) { - // there are no frames to copy and we don't need to reset the log so we can - // return early success. - self.prev_checkpoint.release_guard(); // just in case we are still holding - return Ok(IOResult::Done(self.prev_checkpoint.clone())); - } - // acquire the appropriate locks depending on the checkpoint mode - self.acquire_proper_checkpoint_guard(mode)?; - self.ongoing_checkpoint.max_frame = self.determine_max_safe_checkpoint_frame(); - self.ongoing_checkpoint.min_frame = nbackfills + 1; - self.ongoing_checkpoint.current_page = 0; - self.ongoing_checkpoint.state = CheckpointState::ReadFrame; - tracing::trace!( - "checkpoint_start(min_frame={}, max_frame={})", - self.ongoing_checkpoint.min_frame, - self.ongoing_checkpoint.max_frame, - ); - } - // Find the next page that has a frame in the safe interval and - // schedule a read of that frame. 
- CheckpointState::ReadFrame => { - let shared = self.get_shared(); - let min_frame = self.ongoing_checkpoint.min_frame; - let max_frame = self.ongoing_checkpoint.max_frame; - let pages_in_frames = shared.pages_in_frames.clone(); - let pages_in_frames = pages_in_frames.lock(); - - let frame_cache = shared.frame_cache.clone(); - let frame_cache = frame_cache.lock(); - assert!(self.ongoing_checkpoint.current_page as usize <= pages_in_frames.len()); - if self.ongoing_checkpoint.current_page as usize == pages_in_frames.len() { - self.ongoing_checkpoint.state = CheckpointState::Done; - continue 'checkpoint_loop; - } - let page = pages_in_frames[self.ongoing_checkpoint.current_page as usize]; - let frames = frame_cache - .get(&page) - .expect("page must be in frame cache if it's in list"); - - for frame in frames.iter().rev() { - if *frame >= min_frame && *frame <= max_frame { - tracing::debug!( - "checkpoint page(state={:?}, page={}, frame={})", - state, - page, - *frame - ); - self.ongoing_checkpoint.page.get().id = page as usize; - ensure_unlock!( - self, - self.read_frame( - *frame, - self.ongoing_checkpoint.page.clone(), - self.buffer_pool.clone(), - ) - ); - self.ongoing_checkpoint.state = CheckpointState::WaitReadFrame; - continue 'checkpoint_loop; - } - } - self.ongoing_checkpoint.current_page += 1; - } - CheckpointState::WaitReadFrame => { - if self.ongoing_checkpoint.page.is_locked() { - return Ok(IOResult::IO); - } else { - self.ongoing_checkpoint.state = CheckpointState::WritePage; - } - } - CheckpointState::WritePage => { - self.ongoing_checkpoint.page.set_dirty(); - let c = begin_write_btree_page( - pager, - &self.ongoing_checkpoint.page, - write_counter.clone(), - )?; - self.ongoing_checkpoint.state = CheckpointState::WaitWritePage; - } - CheckpointState::WaitWritePage => { - if *write_counter.borrow() > 0 { - return Ok(IOResult::IO); - } - // If page was in cache clear it. 
- if let Some(page) = pager.cache_get(self.ongoing_checkpoint.page.get().id) { - page.clear_dirty(); - } - self.ongoing_checkpoint.page.clear_dirty(); - let shared = self.get_shared(); - if (self.ongoing_checkpoint.current_page as usize) - < shared.pages_in_frames.lock().len() - { - self.ongoing_checkpoint.current_page += 1; - self.ongoing_checkpoint.state = CheckpointState::ReadFrame; - } else { - self.ongoing_checkpoint.state = CheckpointState::Done; - } - } - // All eligible frames copied to the db file - // Update nBackfills - // In Restart or Truncate mode, we need to restart the log over and possibly truncate the file - // Release all locks and return the current num of wal frames and the amount we backfilled - CheckpointState::Done => { - if *write_counter.borrow() > 0 { - return Ok(IOResult::IO); - } - let shared = self.get_shared(); - shared.checkpoint_lock.unlock(); - let max_frame = shared.max_frame.load(Ordering::SeqCst); - let nbackfills = shared.nbackfills.load(Ordering::SeqCst); - - let (mut checkpoint_result, everything_backfilled) = { - // Record two num pages fields to return as checkpoint result to caller. 
- // Ref: pnLog, pnCkpt on https://www.sqlite.org/c3ref/wal_checkpoint_v2.html - - let frames_in_wal = max_frame.saturating_sub(nbackfills); - let frames_checkpointed = self - .ongoing_checkpoint - .max_frame - .saturating_sub(self.ongoing_checkpoint.min_frame - 1); - let checkpoint_result = - CheckpointResult::new(frames_in_wal, frames_checkpointed); - let everything_backfilled = max_frame == self.ongoing_checkpoint.max_frame; - (checkpoint_result, everything_backfilled) - }; - // we will just overwrite nbackfills with 0 if we are resetting - self.get_shared() - .nbackfills - .store(self.ongoing_checkpoint.max_frame, Ordering::SeqCst); - if everything_backfilled - && matches!(mode, CheckpointMode::Restart | CheckpointMode::Truncate) - { - ensure_unlock!(self, self.restart_log(mode)); - } - self.prev_checkpoint = checkpoint_result.clone(); - // we cannot truncate the db file here, because we are currently inside a - // mut borrow of pager.wal, and accessing the header will attempt a borrow, - // so the caller will determine if: - // a. the max frame == num wal frames - // b. the db file size != num of db pages * page_size - // and truncate the db file if necessary. - if checkpoint_result.everything_backfilled() { - checkpoint_result.maybe_guard = self.checkpoint_guard.take(); - } else { - let _ = self.checkpoint_guard.take(); - } - self.ongoing_checkpoint.state = CheckpointState::Start; - return Ok(IOResult::Done(checkpoint_result)); - } - } - } + self.checkpoint_inner(pager, write_counter, mode) + .inspect_err(|_| { + let _ = self.checkpoint_guard.take(); + }) } #[instrument(err, skip_all, level = Level::DEBUG)] @@ -1257,7 +1167,181 @@ impl WalFile { self.syncing.set(false); } - // Coordinate what the maximum safe frame is for us to backfill when checkpointing. 
+ fn checkpoint_inner( + &mut self, + pager: &Pager, + write_counter: Rc>, + mode: CheckpointMode, + ) -> Result> { + 'checkpoint_loop: loop { + let state = self.ongoing_checkpoint.state; + tracing::debug!(?state); + match state { + // Acquire the relevant exclusive locks and checkpoint_lock + // so no other checkpointer can run. fsync WAL if there are unapplied frames. + // Decide the largest frame we are allowed to back‑fill. + CheckpointState::Start => { + let (max_frame, nbackfills) = { + let shared = self.get_shared(); + let max_frame = shared.max_frame.load(Ordering::SeqCst); + let n_backfills = shared.nbackfills.load(Ordering::SeqCst); + (max_frame, n_backfills) + }; + let needs_backfill = max_frame > nbackfills; + + if !needs_backfill && matches!(mode, CheckpointMode::Passive) { + // there are no frames to copy over and we don't need to reset + // the log so we can return early success. + return Ok(IOResult::Done(self.prev_checkpoint.clone())); + } + // acquire either the read0 or write lock depending on the checkpoint mode + self.acquire_proper_checkpoint_guard(mode)?; + self.ongoing_checkpoint.max_frame = self.determine_max_safe_checkpoint_frame(); + self.ongoing_checkpoint.min_frame = nbackfills + 1; + self.ongoing_checkpoint.current_page = 0; + self.ongoing_checkpoint.state = CheckpointState::ReadFrame; + tracing::trace!( + "checkpoint_start(min_frame={}, max_frame={})", + self.ongoing_checkpoint.min_frame, + self.ongoing_checkpoint.max_frame, + ); + } + // Find the next page that has a frame in the safe interval and + // schedule a read of that frame. 
+ CheckpointState::ReadFrame => { + let shared = self.get_shared(); + let min_frame = self.ongoing_checkpoint.min_frame; + let max_frame = self.ongoing_checkpoint.max_frame; + let pages_in_frames = shared.pages_in_frames.clone(); + let pages_in_frames = pages_in_frames.lock(); + + let frame_cache = shared.frame_cache.clone(); + let frame_cache = frame_cache.lock(); + assert!(self.ongoing_checkpoint.current_page as usize <= pages_in_frames.len()); + if self.ongoing_checkpoint.current_page as usize == pages_in_frames.len() { + self.ongoing_checkpoint.state = CheckpointState::Done; + continue 'checkpoint_loop; + } + let page = pages_in_frames[self.ongoing_checkpoint.current_page as usize]; + let frames = frame_cache + .get(&page) + .expect("page must be in frame cache if it's in list"); + + for frame in frames.iter().rev() { + if *frame >= min_frame && *frame <= max_frame { + tracing::debug!( + "checkpoint page(state={:?}, page={}, frame={})", + state, + page, + *frame + ); + self.ongoing_checkpoint.page.get().id = page as usize; + self.read_frame( + *frame, + self.ongoing_checkpoint.page.clone(), + self.buffer_pool.clone(), + )?; + self.ongoing_checkpoint.state = CheckpointState::WaitReadFrame; + continue 'checkpoint_loop; + } + } + self.ongoing_checkpoint.current_page += 1; + } + CheckpointState::WaitReadFrame => { + if self.ongoing_checkpoint.page.is_locked() { + return Ok(IOResult::IO); + } else { + self.ongoing_checkpoint.state = CheckpointState::WritePage; + } + } + CheckpointState::WritePage => { + self.ongoing_checkpoint.page.set_dirty(); + begin_write_btree_page( + pager, + &self.ongoing_checkpoint.page, + write_counter.clone(), + )?; + self.ongoing_checkpoint.state = CheckpointState::WaitWritePage; + } + CheckpointState::WaitWritePage => { + if *write_counter.borrow() > 0 { + return Ok(IOResult::IO); + } + // If page was in cache clear it. 
+ if let Some(page) = pager.cache_get(self.ongoing_checkpoint.page.get().id) { + page.clear_dirty(); + } + self.ongoing_checkpoint.page.clear_dirty(); + let shared = self.get_shared(); + if (self.ongoing_checkpoint.current_page as usize) + < shared.pages_in_frames.lock().len() + { + self.ongoing_checkpoint.current_page += 1; + self.ongoing_checkpoint.state = CheckpointState::ReadFrame; + } else { + self.ongoing_checkpoint.state = CheckpointState::Done; + } + } + // All eligible frames copied to the db file + // Update nBackfills + // In Restart or Truncate mode, we need to restart the log over and possibly truncate the file + // Release all locks and return the current num of wal frames and the amount we backfilled + CheckpointState::Done => { + if *write_counter.borrow() > 0 { + return Ok(IOResult::IO); + } + let (mut checkpoint_result, everything_backfilled) = { + let shared = self.get_shared(); + let current_mx = shared.max_frame.load(Ordering::SeqCst); + let nbackfills = shared.nbackfills.load(Ordering::SeqCst); + + let frames_in_wal = current_mx.saturating_sub(nbackfills); + let frames_checkpointed = current_mx.saturating_sub(self.ongoing_checkpoint.min_frame - 1); + // Record two num pages fields to return as checkpoint result to caller. 
+ // Ref: pnLog, pnCkpt on https://www.sqlite.org/c3ref/wal_checkpoint_v2.html + ( + CheckpointResult::new(frames_in_wal, frames_checkpointed), + // if the current max frame that we read while holding checkpoint_lock is equal to the max frame of the + // ongoing checkpoint, it means that we have backfilled everything + current_mx == self.ongoing_checkpoint.max_frame, + ) + }; + // we will just overwrite nbackfills with 0 if we are resetting + self.get_shared() + .nbackfills + .store(self.ongoing_checkpoint.max_frame, Ordering::SeqCst); + + if everything_backfilled + && matches!(mode, CheckpointMode::Restart | CheckpointMode::Truncate) + { + self.restart_log(mode)?; + } + + // store a copy of the checkpoint result to return in the future if pragma + // wal_checkpoint is called and we haven't backfilled again since. + self.prev_checkpoint = checkpoint_result.clone(); + + // we cannot truncate the db file here because we are currently inside a + // mut borrow of pager.wal, and accessing the header will attempt a borrow + // during 'read_page', so the caller will use the result to determine if: + // a. the max frame == num wal frames (everything backfilled) + // b. the physical db file size differs from the expected pages * page_size + // and truncate + sync the db file if necessary. + if checkpoint_result.everything_backfilled() { + checkpoint_result.maybe_guard = self.checkpoint_guard.take(); + } else { + let _ = self.checkpoint_guard.take(); + } + self.ongoing_checkpoint.state = CheckpointState::Start; + return Ok(IOResult::Done(checkpoint_result)); + } + } + } + } + + /// Coordinate what the maximum safe frame is for us to backfill when checkpointing. + /// We can never backfill a frame with a higher number than any reader's max frame, + /// because we might overwrite content the reader is reading from the database file. 
fn determine_max_safe_checkpoint_frame(&self) -> u64 { let shared = self.get_shared(); let mut max_safe_frame = shared.max_frame.load(Ordering::SeqCst); @@ -1269,6 +1353,8 @@ impl WalFile { if this_mark < max_safe_frame as u32 { let busy = !read_lock.write(); if !busy { + // readmark 1 is the default reader, and should always contain the max safe + // frame for new readers so we bump it up to the current max frame instead of clearing it. let new_mark = if read_lock_idx == 1 { max_safe_frame as u32 } else { @@ -1310,6 +1396,7 @@ impl WalFile { // Reader is active, cannot proceed return Err(LimboError::Busy); } + // after the log is reset, we must set all secondary marks to READMARK_NOT_USED so the next reader selects a fresh slot lock.value.store(READMARK_NOT_USED, Ordering::SeqCst); } } @@ -1338,22 +1425,32 @@ impl WalFile { tracing::trace!("WAL file truncated to 0 B"); }); let shared = self.get_shared(); - shared.file.truncate(0, c.clone()).inspect_err(|e| { - handle_err(e); - })?; + // for now at least, let's do all this IO synchronously + let c = shared.file.truncate(0, c.clone()).inspect_err(handle_err)?; + self.io.wait_for_completion(c).inspect_err(handle_err)?; + let hdr = shared.wal_header.lock(); // sqlite just lets the next writer create it when the first frame is written. // we can write the new header here for simplicity. 
- sqlite3_ondisk::begin_write_wal_header(&shared.file, &hdr).inspect_err(|e| { - handle_err(e); - })?; - let c = Completion::new_sync(|_| { - tracing::trace!("WAL file synced after truncation"); - }); - // fsync after truncation - shared.file.sync(c).inspect_err(|e| { - handle_err(e); - })?; + self.io + .wait_for_completion( + sqlite3_ondisk::begin_write_wal_header(&shared.file, &hdr) + .inspect_err(handle_err)?, + ) + .inspect_err(handle_err)?; + self.io + .wait_for_completion( + shared + .file + .sync( + Completion::new_sync(|_| { + tracing::trace!("WAL file synced after reset/truncation"); + }) + .into(), + ) + .inspect_err(handle_err)?, + ) + .inspect_err(handle_err)?; } // release read‑locks 1..4 @@ -1463,7 +1560,10 @@ impl WalFileShared { nreads: AtomicU32::new(0), value: AtomicU32::new(READMARK_NOT_USED), }); - // slots 0 and 1 begin at 0 + + // slot zero is always zero as it signifies that reads can be done from the db file + // directly, and slot 1 is the default read mark containing the max frame. in this case + // our max frame is zero so both slots 0 and 1 begin at 0 read_locks[0].value.store(0, Ordering::SeqCst); read_locks[1].value.store(0, Ordering::SeqCst); @@ -1508,6 +1608,10 @@ impl WalFileShared { /// client to write to the database (which may be this one) does so by /// writing frames into the start of the log file. fn restart_wal_header(&mut self, io: &Arc, mode: CheckpointMode) -> Result<()> { + turso_assert!( + matches!(mode, CheckpointMode::Restart | CheckpointMode::Truncate), + "CheckpointMode must be Restart or Truncate" + ); // bump checkpoint sequence let mut hdr = self.wal_header.lock(); hdr.checkpoint_seq = hdr.checkpoint_seq.wrapping_add(1); @@ -1530,7 +1634,7 @@ impl WalFileShared { // for RESTART, we write a new header to the WAL file. 
truncate will simply // write it in memory and let the following writer append it to the empty WAL file - if !matches!(mode, CheckpointMode::Truncate) { + if matches!(mode, CheckpointMode::Restart) { // if we are truncating the WAL, we don't bother writing a new header let c = sqlite3_ondisk::begin_write_wal_header(&self.file, &hdr)?; io.wait_for_completion(c)?; @@ -1768,11 +1872,20 @@ pub mod test { conn1.pager.borrow_mut().cacheflush().unwrap(); // Force a read transaction that will freeze a lower read mark - { + let readmark = { let pager = conn2.pager.borrow_mut(); let mut wal2 = pager.wal.borrow_mut(); +<<<<<<< HEAD assert!(matches!(wal2.begin_read_tx().unwrap().0, LimboResult::Ok)); } +||||||| parent of 27000501 (Apply suggestions/fixes and add extensive comments to wal chkpt) + assert!(matches!(wal2.begin_read_tx().unwrap(), LimboResult::Ok)); + } +======= + assert!(matches!(wal2.begin_read_tx().unwrap(), LimboResult::Ok)); + wal2.get_max_frame() + }; +>>>>>>> 27000501 (Apply suggestions/fixes and add extensive comments to wal chkpt) // generate more frames that the reader will not see. 
bulk_inserts(&conn1.clone(), 15, 2); @@ -1797,7 +1910,10 @@ pub mod test { res1.num_checkpointed_frames, res1.num_wal_frames ); - + assert_eq!( + res1.num_checkpointed_frames, readmark, + "Checkpointed frames should match read mark" + ); // Release reader { let pager = conn2.pager.borrow_mut(); @@ -1839,16 +1955,13 @@ pub mod test { Ok(IOResult::IO) => { conn1.run_once().unwrap(); } - Ok(IOResult::Done(result)) => { - assert_eq!( - result.num_checkpointed_frames, 0, - "Should not checkpoint any frames (empty)" + e => { + assert!( + matches!(e, Err(LimboError::Busy)), + "reader is holding readmark0 we should return Busy" ); break; } - Err(e) => { - panic!("Checkpoint failed: {e}"); - } } } drop(w); @@ -1875,7 +1988,10 @@ pub mod test { panic!("Checkpoint should not have succeeded"); } Err(e) => { - assert!(matches!(e, LimboError::Busy), "should block readers"); + assert!( + matches!(e, LimboError::Busy), + "should return busy if we have readers" + ); break; } } @@ -1935,11 +2051,12 @@ pub mod test { let conn_r2 = db.connect().unwrap(); // R1 starts reading - { + let r1_max_frame = { let pager = conn_r1.pager.borrow_mut(); let mut wal = pager.wal.borrow_mut(); assert!(matches!(wal.begin_read_tx().unwrap(), LimboResult::Ok)); - } + wal.get_max_frame() + }; bulk_inserts(&conn_writer, 5, 10); // R2 starts reading, sees more frames than R1 @@ -1961,6 +2078,10 @@ pub mod test { checkpoint_result.num_checkpointed_frames < checkpoint_result.num_wal_frames, "Should not checkpoint all frames when readers are active" ); + assert_eq!( + checkpoint_result.num_checkpointed_frames, r1_max_frame, + "Should have checkpointed up to R1's max frame" + ); // Verify R2 still sees its frames assert_eq!( From 318bfa9590fc1feba8e21e2e98056e16cf4c5427 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Tue, 22 Jul 2025 12:59:50 -0400 Subject: [PATCH 33/62] Change incorrect comments and rename guard --- core/storage/wal.rs | 46 ++++++++++++++++++++++++++------------------- 1 file changed, 27 
insertions(+), 19 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index ad62aa43d..1fadd9cf4 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -45,8 +45,15 @@ pub struct CheckpointResult { pub num_wal_frames: u64, /// number of frames moved successfully from WAL to db file after checkpoint pub num_checkpointed_frames: u64, - /// will drop automatically and free any remaining locks we need to hold - maybe_guard: Option, + /// In the case of everything backfilled, we need to hold the locks until the db + /// file is truncated. + maybe_guard: Option, +} + +impl Drop for CheckpointResult { + fn drop(&mut self) { + let _ = self.maybe_guard.take(); + } } impl CheckpointResult { @@ -440,7 +447,8 @@ pub struct WalFile { /// Private copy of WalHeader pub header: WalHeader, - checkpoint_guard: Option, + /// Manages locks needed for checkpointing + checkpoint_guard: Option, } impl fmt::Debug for WalFile { @@ -568,9 +576,10 @@ impl fmt::Debug for WalFileShared { } #[derive(Clone, Debug)] -/// An RAII guard to ensure that no locks are leaked during checkpointing in -/// the case of errors. -enum CheckpointGuard { +/// To manage and ensure that no locks are leaked during checkpointing in +/// the case of errors. It is held by the WalFile while checkpoint is ongoing +/// then transferred to the CheckpointResult if necessary. +enum CheckpointLocks { Writer { ptr: Arc> }, Read0 { ptr: Arc> }, } @@ -582,7 +591,7 @@ enum CheckpointGuard { /// Exclusive lock on read-mark 0. /// Exclusive lock on read-mark slots 1-N again. These are immediately released after being taken (RESTART and TRUNCATE only). /// All of the above use blocking locks. 
-impl CheckpointGuard { +impl CheckpointLocks { fn new(ptr: Arc>, mode: CheckpointMode) -> Result { let shared = &mut unsafe { &mut *ptr.get() }; if !shared.checkpoint_lock.write() { @@ -631,15 +640,15 @@ impl CheckpointGuard { } } -impl Drop for CheckpointGuard { +impl Drop for CheckpointLocks { fn drop(&mut self) { match self { - CheckpointGuard::Writer { ptr: shared } => unsafe { + CheckpointLocks::Writer { ptr: shared } => unsafe { (*shared.get()).write_lock.unlock(); (*shared.get()).read_locks[0].unlock(); (*shared.get()).checkpoint_lock.unlock(); }, - CheckpointGuard::Read0 { ptr: shared } => unsafe { + CheckpointLocks::Read0 { ptr: shared } => unsafe { (*shared.get()).read_locks[0].unlock(); (*shared.get()).checkpoint_lock.unlock(); }, @@ -741,9 +750,7 @@ impl Wal for WalFile { /// Begin a write transaction #[instrument(skip_all, level = Level::DEBUG)] fn begin_write_tx(&mut self) -> Result { - let busy = !self.get_shared().write_lock.write(); - tracing::debug!("begin_write_transaction(busy={busy})"); - if busy { + if !self.get_shared().write_lock.write() { return Ok(LimboResult::Busy); } // If the max frame is not the same as the one in the shared state, it means another @@ -1194,7 +1201,7 @@ impl WalFile { // the log so we can return early success. 
return Ok(IOResult::Done(self.prev_checkpoint.clone())); } - // acquire either the read0 or write lock depending on the checkpoint mode + // acquire the appropriate exclusive locks depending on the checkpoint mode self.acquire_proper_checkpoint_guard(mode)?; self.ongoing_checkpoint.max_frame = self.determine_max_safe_checkpoint_frame(); self.ongoing_checkpoint.min_frame = nbackfills + 1; @@ -1296,7 +1303,8 @@ impl WalFile { let nbackfills = shared.nbackfills.load(Ordering::SeqCst); let frames_in_wal = current_mx.saturating_sub(nbackfills); - let frames_checkpointed = current_mx.saturating_sub(self.ongoing_checkpoint.min_frame - 1); + let frames_checkpointed = + current_mx.saturating_sub(self.ongoing_checkpoint.min_frame - 1); // Record two num pages fields to return as checkpoint result to caller. // Ref: pnLog, pnCkpt on https://www.sqlite.org/c3ref/wal_checkpoint_v2.html ( @@ -1378,7 +1386,7 @@ impl WalFile { "CheckpointMode must be Restart or Truncate" ); turso_assert!( - matches!(self.checkpoint_guard, Some(CheckpointGuard::Writer { .. })), + matches!(self.checkpoint_guard, Some(CheckpointLocks::Writer { .. })), "We must hold writer and checkpoint locks to restart the log, found: {:?}", self.checkpoint_guard ); @@ -1470,9 +1478,9 @@ impl WalFile { fn acquire_proper_checkpoint_guard(&mut self, mode: CheckpointMode) -> Result<()> { let needs_new_guard = !matches!( (&self.checkpoint_guard, mode), - (Some(CheckpointGuard::Read0 { .. }), CheckpointMode::Passive,) + (Some(CheckpointLocks::Read0 { .. }), CheckpointMode::Passive,) | ( - Some(CheckpointGuard::Writer { .. }), + Some(CheckpointLocks::Writer { .. 
}), CheckpointMode::Restart | CheckpointMode::Truncate, ), ); @@ -1481,7 +1489,7 @@ impl WalFile { if self.checkpoint_guard.is_some() { let _ = self.checkpoint_guard.take(); } - let guard = CheckpointGuard::new(self.shared.clone(), mode)?; + let guard = CheckpointLocks::new(self.shared.clone(), mode)?; self.checkpoint_guard = Some(guard); } Ok(()) From ff1987a45c1a0cd2a14f8e67a6fbaeefc2fe9dc7 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Tue, 22 Jul 2025 23:12:05 -0400 Subject: [PATCH 34/62] Temporarily remove optimization for new read tx to grab read mark 0 and skip db file --- core/storage/wal.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 1fadd9cf4..c8b5a8876 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -678,8 +678,8 @@ impl Wal for WalFile { return Ok((LimboResult::Ok, db_has_changed)); } } - let mut max_read_mark = 0; let mut max_read_mark_index = -1; + let mut max_read_mark = 0; // Find the largest mark we can find, ignore frames that are impossible to be in range and // that are not set for (index, lock) in self.get_shared().read_locks.iter().enumerate().skip(1) { @@ -1195,7 +1195,6 @@ impl WalFile { (max_frame, n_backfills) }; let needs_backfill = max_frame > nbackfills; - if !needs_backfill && matches!(mode, CheckpointMode::Passive) { // there are no frames to copy over and we don't need to reset // the log so we can return early success. 
From 7640535ba4968ad451b6d10046d8ebed5d3da771 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Wed, 23 Jul 2025 22:41:30 -0400 Subject: [PATCH 35/62] Fix transaction read0 shortcut in WAL and track whether we have snapshot --- core/result.rs | 1 + core/storage/wal.rs | 100 ++++++++++++++++++++++++++++++-------------- 2 files changed, 69 insertions(+), 32 deletions(-) diff --git a/core/result.rs b/core/result.rs index 3056528ce..a13754b32 100644 --- a/core/result.rs +++ b/core/result.rs @@ -1,4 +1,5 @@ /// Common results that different functions can return in limbo. +#[derive(Debug)] pub enum LimboResult { /// Couldn't acquire a lock Busy, diff --git a/core/storage/wal.rs b/core/storage/wal.rs index c8b5a8876..e8b87a49d 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -39,6 +39,8 @@ pub const NO_LOCK: u32 = 0; pub const SHARED_LOCK: u32 = 1; pub const WRITE_LOCK: u32 = 2; +const NO_LOCK_HELD: usize = usize::MAX; + #[derive(Debug, Clone, Default)] pub struct CheckpointResult { /// number of frames in WAL @@ -427,10 +429,11 @@ pub struct WalFile { shared: Arc>, ongoing_checkpoint: OngoingCheckpoint, checkpoint_threshold: usize, + has_snapshot: Cell, // min and max frames for this connection /// This is the index to the read_lock in WalFileShared that we are holding. This lock contains /// the max frame for this connection. 
- max_frame_read_lock_index: usize, + max_frame_read_lock_index: Cell, /// Max frame allowed to lookup range=(minframe..max_frame) max_frame: u64, /// Start of range to look for frames range=(minframe..max_frame) @@ -662,22 +665,31 @@ impl Wal for WalFile { fn begin_read_tx(&mut self) -> Result<(LimboResult, bool)> { let max_frame_in_wal = self.get_shared().max_frame.load(Ordering::SeqCst); let nbackfills = self.get_shared().nbackfills.load(Ordering::SeqCst); - let db_has_changed = max_frame_in_wal > self.max_frame; - // Check if we can use slot 0 (bypass WAL) - // If we can obtain a shared lock on slot 0 and max_frame == nBackfills, we know every WAL frame has already - // been copied into the main database file, so we can skip mapping the WAL entirely. + // WAL is already fully back‑filled into the main DB image + // (mxFrame == nBackfill). Readers can therefore ignore the + // WAL and fetch pages directly from the DB file. We do this + // by taking read‑lock 0. if max_frame_in_wal == nbackfills { - let shared = self.get_shared(); - let lock = &mut shared.read_locks[0]; - if lock.read() { - self.max_frame_read_lock_index = 0; - self.max_frame = max_frame_in_wal; - self.min_frame = max_frame_in_wal + 1; // Ignore WAL; - return Ok((LimboResult::Ok, db_has_changed)); + let lock0 = &mut self.get_shared().read_locks[0]; + if !lock0.read() { + return Ok((LimboResult::Busy, db_has_changed)); } + self.max_frame = max_frame_in_wal; + // we need to keep self.max_frame set to the appropriate + // max frame in the wal at the time this transaction starts. + // but here we set min_frame=max_frame + 1 to keep an empty snapshot window, + // to demonstrate that we do not care about any frames, + // while still capturing a snapshot that we may need if we ever want to upgrade + // to a write transaction. 
+ self.min_frame = max_frame_in_wal + 1; + self.max_frame_read_lock_index.set(0); + self.has_snapshot.set(true); + self.last_checksum = self.get_shared().last_checksum; + return Ok((LimboResult::Ok, db_has_changed)); } + let mut max_read_mark_index = -1; let mut max_read_mark = 0; // Find the largest mark we can find, ignore frames that are impossible to be in range and @@ -724,15 +736,17 @@ impl Wal for WalFile { ) }; self.min_frame = min_frame; - self.max_frame_read_lock_index = max_read_mark_index as usize; + self.max_frame_read_lock_index + .set(max_read_mark_index as usize); self.max_frame = max_read_mark as u64; self.last_checksum = last_checksum; + self.has_snapshot.set(true); self.start_pages_in_frames = start_pages_in_frames; tracing::debug!( "begin_read_tx(min_frame={}, max_frame={}, lock={}, max_frame_in_wal={})", self.min_frame, self.max_frame, - self.max_frame_read_lock_index, + max_read_mark_index, max_frame_in_wal ); Ok((LimboResult::Ok, db_has_changed)) @@ -742,9 +756,18 @@ impl Wal for WalFile { #[inline(always)] #[instrument(skip_all, level = Level::DEBUG)] fn end_read_tx(&self) { - tracing::debug!("end_read_tx(lock={})", self.max_frame_read_lock_index); - let read_lock = &mut self.get_shared().read_locks[self.max_frame_read_lock_index]; - read_lock.unlock(); + let held = self.max_frame_read_lock_index.get(); + turso_assert!( + held != NO_LOCK_HELD, + "We must have a read lock held to end a read transaction" + ); + tracing::debug!("end_read_tx(lock={})", held); + { + let read_lock = &mut self.get_shared().read_locks[held]; + read_lock.unlock(); + } + self.has_snapshot.set(false); + self.max_frame_read_lock_index.set(NO_LOCK_HELD); } /// Begin a write transaction @@ -753,16 +776,24 @@ impl Wal for WalFile { if !self.get_shared().write_lock.write() { return Ok(LimboResult::Busy); } - // If the max frame is not the same as the one in the shared state, it means another - // transaction wrote to the WAL after we started our read transaction. 
This means our - // snapshot is not consistent with the one in the shared state and we need to start another - // one. - let shared = self.get_shared(); - if self.max_frame != shared.max_frame.load(Ordering::SeqCst) { - shared.write_lock.unlock(); - return Ok(LimboResult::Busy); + let shared_max = self.get_shared().max_frame.load(Ordering::SeqCst); + + // If we have a snapshot and self.max_frame == shared.max_frame, + // then the snapshot is still valid and it's safe to promote to write tx. + // It is also valid if we do not yet have a snapshot. + if !self.has_snapshot.get() || self.max_frame == shared_max { + // Both cases mean we can safely use the shared state. + self.max_frame = shared_max; + self.last_checksum = self.get_shared().last_checksum; + self.min_frame = self.get_shared().nbackfills.load(Ordering::SeqCst) + 1; + self.has_snapshot.set(true); + return Ok(LimboResult::Ok); } - Ok(LimboResult::Ok) + // Otherwise, another transaction wrote to the WAL after we started our read transaction. + // This means our snapshot is not consistent with the one in the shared state and we need to start over. 
+ let shared = self.get_shared(); + shared.write_lock.unlock(); + return Ok(LimboResult::Busy); } /// End a write transaction @@ -1118,12 +1149,13 @@ impl WalFile { max_frame: 0, current_page: 0, }, + has_snapshot: false.into(), checkpoint_threshold: 1000, buffer_pool, syncing: Rc::new(Cell::new(false)), sync_state: Cell::new(SyncState::NotSyncing), min_frame: 0, - max_frame_read_lock_index: 0, + max_frame_read_lock_index: NO_LOCK_HELD.into(), last_checksum, prev_checkpoint: CheckpointResult::default(), checkpoint_guard: None, @@ -1313,15 +1345,18 @@ impl WalFile { current_mx == self.ongoing_checkpoint.max_frame, ) }; + // we will just overwrite nbackfills with 0 if we are resetting self.get_shared() .nbackfills .store(self.ongoing_checkpoint.max_frame, Ordering::SeqCst); - if everything_backfilled - && matches!(mode, CheckpointMode::Restart | CheckpointMode::Truncate) - { - self.restart_log(mode)?; + if matches!(mode, CheckpointMode::Restart | CheckpointMode::Truncate) { + if everything_backfilled { + self.restart_log(mode)?; + } else { + return Err(LimboError::Busy); + } } // store a copy of the checkpoint result to return in the future if pragma @@ -2145,7 +2180,8 @@ pub mod test { { let pager = conn2.pager.borrow_mut(); let mut wal = pager.wal.borrow_mut(); - assert!(matches!(wal.begin_write_tx().unwrap(), LimboResult::Ok)); + let res = wal.begin_write_tx().unwrap(); + assert!(matches!(res, LimboResult::Ok), "result: {res:?}"); } // should fail because writer lock is held From 529cc14e29b618aad91f1f1c3773397bb89bc6d2 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Wed, 23 Jul 2025 23:01:31 -0400 Subject: [PATCH 36/62] Fix wal tests remove unwrap from previous Result return val --- core/storage/wal.rs | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index e8b87a49d..f434e651e 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -808,17 +808,24 @@ impl Wal for 
WalFile { fn find_frame(&self, page_id: u64) -> Result> { let shared = self.get_shared(); let frames = shared.frame_cache.lock(); + // if we have read_lock 0, we are reading straight from the db file + if self.max_frame_read_lock_index.get() == 0 { + return Ok(None); + } let frames = frames.get(&page_id); if frames.is_none() { return Ok(None); } let frames = frames.unwrap(); - for frame in frames.iter().rev() { - if *frame <= self.max_frame { - return Ok(Some(*frame)); - } + if let Some(&f) = frames + .iter() + .rev() + .find(|&&f| f <= self.max_frame && f >= self.min_frame) + { + Ok(Some(f)) + } else { + Ok(None) } - Ok(None) } /// Read a frame from the WAL. @@ -1917,17 +1924,9 @@ pub mod test { let readmark = { let pager = conn2.pager.borrow_mut(); let mut wal2 = pager.wal.borrow_mut(); -<<<<<<< HEAD assert!(matches!(wal2.begin_read_tx().unwrap().0, LimboResult::Ok)); - } -||||||| parent of 27000501 (Apply suggestions/fixes and add extensive comments to wal chkpt) - assert!(matches!(wal2.begin_read_tx().unwrap(), LimboResult::Ok)); - } -======= - assert!(matches!(wal2.begin_read_tx().unwrap(), LimboResult::Ok)); wal2.get_max_frame() }; ->>>>>>> 27000501 (Apply suggestions/fixes and add extensive comments to wal chkpt) // generate more frames that the reader will not see. 
bulk_inserts(&conn1.clone(), 15, 2); @@ -2096,7 +2095,7 @@ pub mod test { let r1_max_frame = { let pager = conn_r1.pager.borrow_mut(); let mut wal = pager.wal.borrow_mut(); - assert!(matches!(wal.begin_read_tx().unwrap(), LimboResult::Ok)); + assert!(matches!(wal.begin_read_tx().unwrap().0, LimboResult::Ok)); wal.get_max_frame() }; bulk_inserts(&conn_writer, 5, 10); @@ -2105,7 +2104,7 @@ pub mod test { let r2_max_frame = { let pager = conn_r2.pager.borrow_mut(); let mut wal = pager.wal.borrow_mut(); - assert!(matches!(wal.begin_read_tx().unwrap(), LimboResult::Ok)); + assert!(matches!(wal.begin_read_tx().unwrap().0, LimboResult::Ok)); wal.get_max_frame() }; @@ -2197,7 +2196,7 @@ pub mod test { ); // release write lock - conn2.pager.borrow().wal.borrow().end_write_tx().unwrap(); + conn2.pager.borrow().wal.borrow().end_write_tx(); // now restart should succeed let result = { From 8ec99a914364f6d33d5c97879e501adb148920e0 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Thu, 24 Jul 2025 18:04:44 -0400 Subject: [PATCH 37/62] Remove assert for !NO_LOCK_HELD, properly handle writing header if reset --- core/io/io_uring.rs | 4 +- core/storage/wal.rs | 183 ++++++++++-------- .../query_processing/test_write_path.rs | 4 +- 3 files changed, 110 insertions(+), 81 deletions(-) diff --git a/core/io/io_uring.rs b/core/io/io_uring.rs index fe3e8d6dd..721313df4 100644 --- a/core/io/io_uring.rs +++ b/core/io/io_uring.rs @@ -359,9 +359,9 @@ impl File for UringFile { let truncate = with_fd!(self, |fd| { io_uring::opcode::Ftruncate::new(fd, len as u64) .build() - .user_data(io.ring.get_key()) + .user_data(get_key(c.clone())) }); - io.ring.submit_entry(&truncate, c.clone()); + io.ring.submit_entry(&truncate); Ok(c) } } diff --git a/core/storage/wal.rs b/core/storage/wal.rs index f434e651e..a7ab813d4 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -68,7 +68,7 @@ impl CheckpointResult { } pub const fn everything_backfilled(&self) -> bool { - self.num_wal_frames > 0 && 
self.num_wal_frames == self.num_checkpointed_frames + self.num_wal_frames == self.num_checkpointed_frames } pub fn release_guard(&mut self) { let _ = self.maybe_guard.take(); @@ -757,12 +757,8 @@ impl Wal for WalFile { #[instrument(skip_all, level = Level::DEBUG)] fn end_read_tx(&self) { let held = self.max_frame_read_lock_index.get(); - turso_assert!( - held != NO_LOCK_HELD, - "We must have a read lock held to end a read transaction" - ); tracing::debug!("end_read_tx(lock={})", held); - { + if held != NO_LOCK_HELD { let read_lock = &mut self.get_shared().read_locks[held]; read_lock.unlock(); } @@ -806,26 +802,19 @@ impl Wal for WalFile { /// Find the latest frame containing a page. #[instrument(skip_all, level = Level::DEBUG)] fn find_frame(&self, page_id: u64) -> Result> { - let shared = self.get_shared(); - let frames = shared.frame_cache.lock(); - // if we have read_lock 0, we are reading straight from the db file + // if we are holding read_lock 0, skip and read right from db file. if self.max_frame_read_lock_index.get() == 0 { return Ok(None); } - let frames = frames.get(&page_id); - if frames.is_none() { - return Ok(None); - } - let frames = frames.unwrap(); - if let Some(&f) = frames - .iter() - .rev() - .find(|&&f| f <= self.max_frame && f >= self.min_frame) - { - Ok(Some(f)) - } else { - Ok(None) + let shared = self.get_shared(); + // if we have read_lock 0, we are reading straight from the db file + let frames = shared.frame_cache.lock(); + if let Some(list) = frames.get(&page_id) { + if let Some(f) = list.iter().rev().find(|f| **f <= self.max_frame) { + return Ok(Some(*f)); + } } + Ok(None) } /// Read a frame from the WAL. 
@@ -971,6 +960,10 @@ impl Wal for WalFile { db_size: u32, write_counter: Rc>, ) -> Result { + let shared = self.get_shared(); + if shared.max_frame.load(Ordering::SeqCst).eq(&0) { + self.ensure_header_if_needed()?; + } let page_id = page.get().id; let frame_id = self.max_frame + 1; let offset = self.frame_offset(frame_id); @@ -1213,6 +1206,46 @@ impl WalFile { self.syncing.set(false); } + /// the WAL file has been truncated and we are writing the first + /// frame since then. We need to ensure that the header is initialized. + fn ensure_header_if_needed(&mut self) -> Result<()> { + self.last_checksum = { + let shared = self.get_shared(); + if shared.max_frame.load(Ordering::SeqCst) != 0 { + return Ok(()); + } + if shared.file.size()? >= WAL_HEADER_SIZE as u64 { + return Ok(()); + } + + let mut hdr = shared.wal_header.lock(); + if hdr.page_size == 0 { + hdr.page_size = self.page_size(); + } + + // recompute header checksum + let prefix = &hdr.as_bytes()[..WAL_HEADER_SIZE - 8]; + let use_native = (hdr.magic & 1) != 0; + let (c1, c2) = checksum_wal(prefix, &hdr, (0, 0), use_native); + hdr.checksum_1 = c1; + hdr.checksum_2 = c2; + + shared.last_checksum = (c1, c2); + (c1, c2) + }; + + self.max_frame = 0; + let shared = self.get_shared(); + self.io + .wait_for_completion(sqlite3_ondisk::begin_write_wal_header( + &shared.file, + &shared.wal_header.lock(), + )?)?; + self.io + .wait_for_completion(shared.file.sync(Completion::new_sync(|_| {}).into())?)?; + Ok(()) + } + fn checkpoint_inner( &mut self, pager: &Pager, @@ -1335,31 +1368,43 @@ impl WalFile { if *write_counter.borrow() > 0 { return Ok(IOResult::IO); } - let (mut checkpoint_result, everything_backfilled) = { + let mut checkpoint_result = { let shared = self.get_shared(); let current_mx = shared.max_frame.load(Ordering::SeqCst); let nbackfills = shared.nbackfills.load(Ordering::SeqCst); - - let frames_in_wal = current_mx.saturating_sub(nbackfills); - let frames_checkpointed = - 
current_mx.saturating_sub(self.ongoing_checkpoint.min_frame - 1); // Record two num pages fields to return as checkpoint result to caller. // Ref: pnLog, pnCkpt on https://www.sqlite.org/c3ref/wal_checkpoint_v2.html - ( - CheckpointResult::new(frames_in_wal, frames_checkpointed), - // if the current max frame that we read while holding checkpoint_lock is equal to the max frame of the - // ongoing checkpoint, it means that we have backfilled everything - current_mx == self.ongoing_checkpoint.max_frame, - ) + + // the total # of frames we could have possibly backfilled + let frames_possible = current_mx.saturating_sub(nbackfills); + + // the total # of frames we actually backfilled + let frames_checkpointed = + current_mx.saturating_sub(self.ongoing_checkpoint.min_frame - 1); + + if matches!(mode, CheckpointMode::Truncate) { + // sqlite always returns zeros for truncate mode + CheckpointResult::default() + } else if frames_checkpointed == 0 + && matches!(mode, CheckpointMode::Restart) + // if we restarted the log but didn't backfill pages we still have to + // return the last checkpoint result. + { + self.prev_checkpoint.clone() + } else { + // otherwise return the normal result of the total # of possible frames + // we could have backfilled, and the number we actually did. + CheckpointResult::new(frames_possible, frames_checkpointed) + } }; - // we will just overwrite nbackfills with 0 if we are resetting + // store the max frame we were able to successfully checkpoint. 
self.get_shared() .nbackfills .store(self.ongoing_checkpoint.max_frame, Ordering::SeqCst); if matches!(mode, CheckpointMode::Restart | CheckpointMode::Truncate) { - if everything_backfilled { + if checkpoint_result.everything_backfilled() { self.restart_log(mode)?; } else { return Err(LimboError::Busy); @@ -1399,6 +1444,10 @@ impl WalFile { // lower max_frame if the reader is busy and we cannot overtake for (read_lock_idx, read_lock) in shared.read_locks.iter_mut().enumerate().skip(1) { let this_mark = read_lock.value.load(Ordering::SeqCst); + if this_mark == READMARK_NOT_USED { + // this read lock is not used, skip it + continue; + } if this_mark < max_safe_frame as u32 { let busy = !read_lock.write(); if !busy { @@ -1475,18 +1524,9 @@ impl WalFile { }); let shared = self.get_shared(); // for now at least, lets do all this IO syncronously - let c = shared.file.truncate(0, c.clone()).inspect_err(handle_err)?; + let c = shared.file.truncate(0, c).inspect_err(handle_err)?; self.io.wait_for_completion(c).inspect_err(handle_err)?; - - let hdr = shared.wal_header.lock(); - // sqlite just lets the next writer create it when the first frame is written. - // we can write the new header here for simplicity. 
- self.io - .wait_for_completion( - sqlite3_ondisk::begin_write_wal_header(&shared.file, &hdr) - .inspect_err(handle_err)?, - ) - .inspect_err(handle_err)?; + // fsync after truncation self.io .wait_for_completion( shared @@ -1661,42 +1701,29 @@ impl WalFileShared { matches!(mode, CheckpointMode::Restart | CheckpointMode::Truncate), "CheckpointMode must be Restart or Truncate" ); - // bump checkpoint sequence - let mut hdr = self.wal_header.lock(); - hdr.checkpoint_seq = hdr.checkpoint_seq.wrapping_add(1); + { + let mut hdr = self.wal_header.lock(); + // bump checkpoint sequence + hdr.checkpoint_seq = hdr.checkpoint_seq.wrapping_add(1); + // keep hdr.magic, hdr.file_format, hdr.page_size as-is + hdr.checkpoint_seq = hdr.checkpoint_seq.wrapping_add(1); + hdr.salt_1 = hdr.salt_1.wrapping_add(1); + hdr.salt_2 = io.generate_random_number() as u32; - // reset frame counters - hdr.checksum_1 = 0; - hdr.checksum_2 = 0; - self.max_frame.store(0, Ordering::SeqCst); - self.nbackfills.store(0, Ordering::SeqCst); - - // update salts. increment the first and generate a new random one for the second - hdr.salt_1 = hdr.salt_1.wrapping_add(1); // aSalt[0]++ - hdr.salt_2 = io.generate_random_number() as u32; - let native = cfg!(target_endian = "big"); - let header_prefix = &hdr.as_bytes()[..WAL_HEADER_SIZE - 2 * 4]; - let (c1, c2) = checksum_wal(header_prefix, &hdr, (0, 0), native); - hdr.checksum_1 = c1; - hdr.checksum_2 = c2; - self.last_checksum = (c1, c2); - - // for RESTART, we write a new header to the WAL file. 
truncate will simply - // write it in memory and let the following writer append it to the empty WAL file - if matches!(mode, CheckpointMode::Restart) { - // if we are truncating the WAL, we don't bother writing a new header - let c = sqlite3_ondisk::begin_write_wal_header(&self.file, &hdr)?; - io.wait_for_completion(c)?; - self.file.sync( - Completion::new_sync(move |_| { - tracing::trace!("WAL header synced after restart"); - }) - .into(), - )?; + self.max_frame.store(0, Ordering::SeqCst); + self.nbackfills.store(0, Ordering::SeqCst); + self.last_checksum = (hdr.checksum_1, hdr.checksum_2); } + self.frame_cache.lock().clear(); self.pages_in_frames.lock().clear(); - self.last_checksum = (hdr.checksum_1, hdr.checksum_2); + + // read-marks + self.read_locks[0].value.store(0, Ordering::SeqCst); + self.read_locks[1].value.store(0, Ordering::SeqCst); + for l in &self.read_locks[2..] { + l.value.store(READMARK_NOT_USED, Ordering::SeqCst); + } // reset read‑marks self.read_locks[0].value.store(0, Ordering::SeqCst); diff --git a/tests/integration/query_processing/test_write_path.rs b/tests/integration/query_processing/test_write_path.rs index 684627de5..fc1ff8976 100644 --- a/tests/integration/query_processing/test_write_path.rs +++ b/tests/integration/query_processing/test_write_path.rs @@ -3,7 +3,9 @@ use crate::common::{compare_string, do_flush, TempDatabase}; use log::debug; use std::io::{Read, Seek, Write}; use std::sync::Arc; -use turso_core::{CheckpointMode, Connection, Database, Row, Statement, StepResult, Value}; +use turso_core::{ + CheckpointMode, Connection, Database, LimboError, Row, Statement, StepResult, Value, +}; const WAL_HEADER_SIZE: usize = 32; const WAL_FRAME_HEADER_SIZE: usize = 24; From d702e6a80cb0f4e7a702b41a3464dcd1a60d8bb4 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Sun, 27 Jul 2025 23:42:03 -0400 Subject: [PATCH 38/62] Polish checkpointing and fix tests, add documentation --- core/storage/wal.rs | 155 
+++++++++++++++++++++++++++++--------------- 1 file changed, 104 insertions(+), 51 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index a7ab813d4..eb9a5307a 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1274,7 +1274,8 @@ impl WalFile { } // acquire the appropriate exclusive locks depending on the checkpoint mode self.acquire_proper_checkpoint_guard(mode)?; - self.ongoing_checkpoint.max_frame = self.determine_max_safe_checkpoint_frame(); + self.ongoing_checkpoint.max_frame = + self.determine_max_safe_checkpoint_frame(mode); self.ongoing_checkpoint.min_frame = nbackfills + 1; self.ongoing_checkpoint.current_page = 0; self.ongoing_checkpoint.state = CheckpointState::ReadFrame; @@ -1379,8 +1380,10 @@ impl WalFile { let frames_possible = current_mx.saturating_sub(nbackfills); // the total # of frames we actually backfilled - let frames_checkpointed = - current_mx.saturating_sub(self.ongoing_checkpoint.min_frame - 1); + let frames_checkpointed = self + .ongoing_checkpoint + .max_frame + .saturating_sub(self.ongoing_checkpoint.min_frame - 1); if matches!(mode, CheckpointMode::Truncate) { // sqlite always returns zeros for truncate mode @@ -1436,32 +1439,61 @@ impl WalFile { /// Coordinate what the maximum safe frame is for us to backfill when checkpointing. /// We can never backfill a frame with a higher number than any reader's max frame, /// because we might overwrite content the reader is reading from the database file. - fn determine_max_safe_checkpoint_frame(&self) -> u64 { + /// + /// A checkpoint must never overwrite a page in the main DB file if some + /// active reader might still need to read that page from the WAL. + /// Concretely: the checkpoint may only copy frames `<= aReadMark[k]` for + /// every in‑use reader slot `k > 0`. + /// + /// `read_locks[0]` is special: readers holding slot 0 ignore the WAL entirely + /// (they read only the DB file). Its value is a placeholder and does not + /// constrain `mxSafeFrame`. 
+ /// + /// Slot 1 is the “default” reader slot. If it is free (we can take its + /// write-lock) we raise it to the global max so new readers see the most + /// recent snapshot. We do not clear it to `READMARK_NOT_USED` in ordinary + /// checkpoints (SQLite only clears nonzero slots during a log reset). + /// + /// Slots 2..N: If a reader is stuck at an older frame, that frame becomes the + /// limit. If we can’t atomically bump that slot (write-lock fails), we must + /// clamp `mxSafeFrame` down to that mark. In PASSIVE mode we stop trying + /// immediately (we are not allowed to block or spin). In the blocking modes + /// (FULL/RESTART/TRUNCATE) we can loop and retry, but for now we can + /// just respect the first busy slot and move on. + /// + /// Locking rules: + /// This routine **tries** to take an exclusive (write) lock on each slot to + /// update/clean it. If the try-lock fails: + /// PASSIVE: do not wait; just lower `mxSafeFrame` and break. + /// Others: lower `mxSafeFrame` and continue scanning. + /// + /// We never modify slot values while a reader holds that slot. + fn determine_max_safe_checkpoint_frame(&self, mode: CheckpointMode) -> u64 { let shared = self.get_shared(); let mut max_safe_frame = shared.max_frame.load(Ordering::SeqCst); - // If a reader is positioned before max_frame we either: - // bump its mark up if we can take the slot write-lock OR - // lower max_frame if the reader is busy and we cannot overtake + for (read_lock_idx, read_lock) in shared.read_locks.iter_mut().enumerate().skip(1) { let this_mark = read_lock.value.load(Ordering::SeqCst); if this_mark == READMARK_NOT_USED { - // this read lock is not used, skip it continue; } if this_mark < max_safe_frame as u32 { let busy = !read_lock.write(); if !busy { - // readmark 1 is the default reader, and should always contain the max safe - // frame for new readers so we bump it up to the current max frame instead of clearing it. 
- let new_mark = if read_lock_idx == 1 { - max_safe_frame as u32 - } else { - READMARK_NOT_USED - }; - read_lock.value.store(new_mark, Ordering::SeqCst); + // Only adjust, never clear, in ordinary checkpoints + if read_lock_idx == 1 { + // store the shared max_frame for the default read slot 1 + read_lock + .value + .store(max_safe_frame as u32, Ordering::SeqCst); + } read_lock.unlock(); } else { max_safe_frame = this_mark as u64; + if matches!(mode, CheckpointMode::Passive) { + // Don't keep poking, PASSIVE can't block or spin + break; + } } } } @@ -1703,10 +1735,8 @@ impl WalFileShared { ); { let mut hdr = self.wal_header.lock(); - // bump checkpoint sequence hdr.checkpoint_seq = hdr.checkpoint_seq.wrapping_add(1); // keep hdr.magic, hdr.file_format, hdr.page_size as-is - hdr.checkpoint_seq = hdr.checkpoint_seq.wrapping_add(1); hdr.salt_1 = hdr.salt_1.wrapping_add(1); hdr.salt_2 = io.generate_random_number() as u32; @@ -1744,6 +1774,8 @@ pub mod test { CheckpointMode, CheckpointResult, Completion, Connection, Database, LimboError, PlatformIO, Wal, WalFileShared, IO, }; + #[cfg(unix)] + use std::os::unix::fs::MetadataExt; use std::{ cell::{Cell, RefCell, UnsafeCell}, rc::Rc, @@ -1821,7 +1853,7 @@ pub mod test { "WAL file should not have been empty before checkpoint" ); assert_eq!( - bytes_after, 32, + bytes_after, 0, "WAL file should be truncated to 0 bytes, but is {bytes_after} bytes", ); std::fs::remove_dir_all(path).unwrap(); @@ -1863,58 +1895,85 @@ pub mod test { } #[test] - fn test_wal_restart_checkpoint_resets_sequence() { + fn restart_checkpoint_resets_wal_state_and_increments_ckpt_seq() { let (db, path) = get_database(); - let mut walpath = path.clone().into_os_string().into_string().unwrap(); - walpath.push_str("/test.db-wal"); - let walpath = std::path::PathBuf::from(walpath); + let walpath = { + let mut p = path.clone().into_os_string().into_string().unwrap(); + p.push_str("/test.db-wal"); + std::path::PathBuf::from(p) + }; let conn = 
db.connect().unwrap(); conn.execute("create table test(id integer primary key, value text)") .unwrap(); - bulk_inserts(&conn.clone(), 20, 3); + bulk_inserts(&conn, 20, 3); conn.pager.borrow_mut().cacheflush().unwrap(); + // Snapshot header & counters before the RESTART checkpoint. let wal_shared = db.maybe_shared_wal.read().as_ref().unwrap().clone(); - let (seq0, salt10, salt20, _ps) = wal_header_snapshot(&wal_shared); - let (before_max, before_backfills) = unsafe { + let (seq_before, salt1_before, salt2_before, _ps_before) = wal_header_snapshot(&wal_shared); + let (mx_before, backfill_before) = unsafe { let s = &*wal_shared.get(); ( s.max_frame.load(Ordering::SeqCst), s.nbackfills.load(Ordering::SeqCst), ) }; - assert!(before_max > 0); - assert_eq!(before_backfills, 0); + assert!(mx_before > 0); + assert_eq!(backfill_before, 0); let meta_before = std::fs::metadata(&walpath).unwrap(); - - let bytes_before = meta_before.len(); + #[cfg(unix)] + let size_before = meta_before.blocks(); + #[cfg(not(unix))] + let size_before = meta_before.len(); + // Run a RESTART checkpoint, should backfill everything and reset WAL counters, + // but NOT truncate the file. { - let pager_ref = conn.pager.borrow(); - let mut wal = pager_ref.wal.borrow_mut(); - let r = run_checkpoint_until_done(&mut *wal, &pager_ref, CheckpointMode::Restart); - assert_eq!(r.num_wal_frames, before_max); - assert_eq!(r.num_checkpointed_frames, before_max); + let pager = conn.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + let res = run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Restart); + assert_eq!(res.num_wal_frames, mx_before); + assert_eq!(res.num_checkpointed_frames, mx_before); } - // After restart: max_frame == 0, nbackfills == 0, salts changed, seq incremented. 
- let (seq1, salt11, salt21, _ps2) = wal_header_snapshot(&wal_shared); - assert_eq!(seq1, seq0.wrapping_add(1), "checkpoint_seq increments"); - assert_ne!(salt21, salt20); - assert_eq!(salt11, salt10.wrapping_add(1), "salt_1 should increment"); - let (after_max, after_backfills) = unsafe { + // Validate post‑RESTART header & counters. + let (seq_after, salt1_after, salt2_after, _ps_after) = wal_header_snapshot(&wal_shared); + assert_eq!( + seq_after, + seq_before.wrapping_add(1), + "checkpoint_seq must increment on RESTART" + ); + assert_eq!( + salt1_after, + salt1_before.wrapping_add(1), + "salt_1 is incremented" + ); + assert_ne!(salt2_after, salt2_before, "salt_2 is randomized"); + + let (mx_after, backfill_after) = unsafe { let s = &*wal_shared.get(); ( s.max_frame.load(Ordering::SeqCst), s.nbackfills.load(Ordering::SeqCst), ) }; - assert_eq!(after_max, 0); - assert_eq!(after_backfills, 0); + assert_eq!(mx_after, 0, "mxFrame reset to 0 after RESTART"); + assert_eq!(backfill_after, 0, "nBackfill reset to 0 after RESTART"); - // Next write should create frame_id = 1 again. + // File size should be unchanged for RESTART (no truncate). + let meta_after = std::fs::metadata(&walpath).unwrap(); + #[cfg(unix)] + let size_after = meta_after.blocks(); + #[cfg(not(unix))] + let size_after = meta_after.len(); + assert_eq!( + size_before, size_after, + "RESTART must not change WAL file size" + ); + + // Next write should start a new sequence at frame 1. 
conn.execute("insert into test(value) values ('post_restart')") .unwrap(); conn.pager @@ -1924,14 +1983,8 @@ pub mod test { .finish_append_frames_commit() .unwrap(); let new_max = unsafe { (&*wal_shared.get()).max_frame.load(Ordering::SeqCst) }; - assert_eq!(new_max, 1, "Sequence restarted at 1"); + assert_eq!(new_max, 1, "first append after RESTART starts at frame 1"); - let meta_after = std::fs::metadata(&walpath).unwrap(); - let bytes_after = meta_after.len(); - assert_eq!( - bytes_before, bytes_after, - "WAL file should not change size after full checkpoint" - ); std::fs::remove_dir_all(path).unwrap(); } From 8806b77d26218fa2b194dde1229cc0c5bca946e6 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Mon, 28 Jul 2025 00:15:02 -0400 Subject: [PATCH 39/62] Clear snapshot and readmark/lock index flags on failure --- core/io/unix.rs | 4 +- core/io/windows.rs | 2 +- core/storage/pager.rs | 4 +- core/storage/wal.rs | 160 +++++++++++++++++++++++------------------- 4 files changed, 94 insertions(+), 76 deletions(-) diff --git a/core/io/unix.rs b/core/io/unix.rs index cf7245b88..9cb50a3f8 100644 --- a/core/io/unix.rs +++ b/core/io/unix.rs @@ -453,7 +453,9 @@ impl File for UnixFile<'_> { #[instrument(err, skip_all, level = Level::INFO)] fn truncate(&self, len: usize, c: Completion) -> Result { - let file = self.file.lock().unwrap(); + let file = self.file.lock().map_err(|e| { + LimboError::LockingError(format!("Failed to lock file for truncation: {e}")) + })?; let result = file.set_len(len as u64); match result { Ok(()) => { diff --git a/core/io/windows.rs b/core/io/windows.rs index b7c5b06cc..26d80cfc7 100644 --- a/core/io/windows.rs +++ b/core/io/windows.rs @@ -124,7 +124,7 @@ impl File for WindowsFile { #[instrument(err, skip_all, level = Level::TRACE)] fn truncate(&self, len: usize, c: Completion) -> Result { - let mut file = self.file.borrow_mut(); + let file = self.file.write(); file.set_len(len as u64).map_err(LimboError::IOError)?; c.complete(0); Ok(c) diff --git 
a/core/storage/pager.rs b/core/storage/pager.rs index 6f36f30be..bc9a0e171 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -1304,7 +1304,9 @@ impl Pager { .checkpoint(self, counter.clone(), mode) })?; - if checkpoint_result.everything_backfilled() { + if checkpoint_result.everything_backfilled() + && checkpoint_result.num_checkpointed_frames != 0 + { let db_size = header_accessor::get_database_size(self)?; let page_size = self.page_size.get().unwrap_or(DEFAULT_PAGE_SIZE); let expected = (db_size * page_size) as u64; diff --git a/core/storage/wal.rs b/core/storage/wal.rs index eb9a5307a..d1a91fc09 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -663,133 +663,144 @@ impl Wal for WalFile { /// Begin a read transaction. #[instrument(skip_all, level = Level::DEBUG)] fn begin_read_tx(&mut self) -> Result<(LimboResult, bool)> { - let max_frame_in_wal = self.get_shared().max_frame.load(Ordering::SeqCst); - let nbackfills = self.get_shared().nbackfills.load(Ordering::SeqCst); - let db_has_changed = max_frame_in_wal > self.max_frame; + let (shared_max, nbackfills, last_checksum) = { + let shared = self.get_shared(); + let mx = shared.max_frame.load(Ordering::SeqCst); + let nb = shared.nbackfills.load(Ordering::SeqCst); + let ck = shared.last_checksum; + (mx, nb, ck) + }; + let db_changed = shared_max > self.max_frame; // WAL is already fully back‑filled into the main DB image // (mxFrame == nBackfill). Readers can therefore ignore the // WAL and fetch pages directly from the DB file. We do this // by taking read‑lock 0. 
- if max_frame_in_wal == nbackfills { - let lock0 = &mut self.get_shared().read_locks[0]; - if !lock0.read() { - return Ok((LimboResult::Busy, db_has_changed)); + if shared_max == nbackfills { + let l0 = &mut self.get_shared().read_locks[0]; + if !l0.read() { + return Ok((LimboResult::Busy, db_changed)); } - self.max_frame = max_frame_in_wal; + self.max_frame = shared_max; // we need to keep self.max_frame set to the appropriate // max frame in the wal at the time this transaction starts. // but here we set min_frame=max_frame + 1 to keep an empty snapshot window, // to demonstrate that we do not care about any frames, // while still capturing a snapshot that we may need if we ever want to upgrade // to a write transaction. - self.min_frame = max_frame_in_wal + 1; + self.min_frame = shared_max.saturating_add(1); self.max_frame_read_lock_index.set(0); self.has_snapshot.set(true); - self.last_checksum = self.get_shared().last_checksum; - return Ok((LimboResult::Ok, db_has_changed)); + self.last_checksum = last_checksum; + return Ok((LimboResult::Ok, db_changed)); } - let mut max_read_mark_index = -1; - let mut max_read_mark = 0; - // Find the largest mark we can find, ignore frames that are impossible to be in range and - // that are not set - for (index, lock) in self.get_shared().read_locks.iter().enumerate().skip(1) { - let this_mark = lock.value.load(Ordering::SeqCst); - if this_mark > max_read_mark && this_mark <= max_frame_in_wal as u32 { - max_read_mark = this_mark; - max_read_mark_index = index as i64; + // Find largest mark <= mx among slots 1..N + let mut best_idx: i64 = -1; + let mut best_mark: u32 = 0; + for (idx, lock) in self.get_shared().read_locks.iter().enumerate().skip(1) { + let m = lock.value.load(Ordering::SeqCst); + if m != READMARK_NOT_USED && m <= shared_max as u32 && m > best_mark { + best_mark = m; + best_idx = idx as i64; } } - // If we didn't find any mark or we can update, let's update them - if (max_read_mark as u64) < max_frame_in_wal 
|| max_read_mark_index == -1 { - for (index, lock) in self.get_shared().read_locks.iter_mut().enumerate().skip(1) { - let busy = !lock.write(); - if !busy { - // If this was busy then it must mean >1 threads tried to set this read lock - lock.value.store(max_frame_in_wal as u32, Ordering::SeqCst); - max_read_mark = max_frame_in_wal as u32; - max_read_mark_index = index as i64; - lock.unlock(); - break; + // If none found or lagging, try to claim/update a slot + if best_idx == -1 || (best_mark as u64) < shared_max { + for (idx, lock) in self.get_shared().read_locks.iter_mut().enumerate().skip(1) { + if !lock.write() { + continue; // busy slot } + // claim or bump this slot + lock.value.store(shared_max as u32, Ordering::SeqCst); + best_idx = idx as i64; + best_mark = shared_max as u32; + lock.unlock(); + break; } } - if max_read_mark_index == -1 { - return Ok((LimboResult::Busy, db_has_changed)); + if best_idx == -1 { + return Ok((LimboResult::Busy, db_changed)); } - let (min_frame, last_checksum, start_pages_in_frames) = { + // Now take a shared read on that slot + let (min_frame, start_pages) = { let shared = self.get_shared(); - let lock = &mut shared.read_locks[max_read_mark_index as usize]; - tracing::trace!("begin_read_tx_read_lock(lock={})", max_read_mark_index); - let busy = !lock.read(); - if busy { - return Ok((LimboResult::Busy, db_has_changed)); + let lock = &mut shared.read_locks[best_idx as usize]; + if !lock.read() { + return Ok((LimboResult::Busy, db_changed)); } ( shared.nbackfills.load(Ordering::SeqCst) + 1, - shared.last_checksum, shared.pages_in_frames.lock().len(), ) }; + self.min_frame = min_frame; - self.max_frame_read_lock_index - .set(max_read_mark_index as usize); - self.max_frame = max_read_mark as u64; - self.last_checksum = last_checksum; + self.max_frame = best_mark as u64; + self.start_pages_in_frames = start_pages; + self.max_frame_read_lock_index.set(best_idx as usize); self.has_snapshot.set(true); - self.start_pages_in_frames = 
start_pages_in_frames; + tracing::debug!( - "begin_read_tx(min_frame={}, max_frame={}, lock={}, max_frame_in_wal={})", + "begin_read_tx(min={}, max={}, slot={}, max_frame_in_wal={})", self.min_frame, self.max_frame, - max_read_mark_index, - max_frame_in_wal + best_idx, + shared_max ); - Ok((LimboResult::Ok, db_has_changed)) + + Ok((LimboResult::Ok, db_changed)) } /// End a read transaction. #[inline(always)] #[instrument(skip_all, level = Level::DEBUG)] fn end_read_tx(&self) { - let held = self.max_frame_read_lock_index.get(); - tracing::debug!("end_read_tx(lock={})", held); - if held != NO_LOCK_HELD { - let read_lock = &mut self.get_shared().read_locks[held]; - read_lock.unlock(); + let slot = self.max_frame_read_lock_index.get(); + if slot != NO_LOCK_HELD { + let rl = &mut self.get_shared().read_locks[slot]; + rl.unlock(); + self.max_frame_read_lock_index.set(NO_LOCK_HELD); + self.has_snapshot.set(false); + tracing::debug!("end_read_tx(slot={slot})"); + } else { + tracing::debug!("end_read_tx(no snapshot)"); } - self.has_snapshot.set(false); - self.max_frame_read_lock_index.set(NO_LOCK_HELD); } /// Begin a write transaction #[instrument(skip_all, level = Level::DEBUG)] fn begin_write_tx(&mut self) -> Result { - if !self.get_shared().write_lock.write() { + let shared = self.get_shared(); + if !shared.write_lock.write() { return Ok(LimboResult::Busy); } - let shared_max = self.get_shared().max_frame.load(Ordering::SeqCst); - - // If we have a snapshot and self.max_frame == shared.max_frame, - // then the snapshot is still valid and it's safe to promote to write tx. - // It is also valid if we do not yet have a snapshot. - if !self.has_snapshot.get() || self.max_frame == shared_max { - // Both cases mean we can safely use the shared state. 
- self.max_frame = shared_max; - self.last_checksum = self.get_shared().last_checksum; - self.min_frame = self.get_shared().nbackfills.load(Ordering::SeqCst) + 1; - self.has_snapshot.set(true); + if !self.has_snapshot.get() { + // In SQLite this cannot happen (assert). Either assert or handle like Busy. + shared.write_lock.unlock(); + return Ok(LimboResult::Busy); + } + let (shared_max, nbackfills, last_checksum) = { + let shared = self.get_shared(); + ( + shared.max_frame.load(Ordering::SeqCst), + shared.nbackfills.load(Ordering::SeqCst), + shared.last_checksum, + ) + }; + if self.max_frame == shared_max { + // Snapshot still valid; adopt counters + self.last_checksum = last_checksum; + self.min_frame = nbackfills + 1; return Ok(LimboResult::Ok); } - // Otherwise, another transaction wrote to the WAL after we started our read transaction. - // This means our snapshot is not consistent with the one in the shared state and we need to start over. - let shared = self.get_shared(); + + // Snapshot is stale, give up and let caller retry from scratch shared.write_lock.unlock(); - return Ok(LimboResult::Busy); + Ok(LimboResult::Busy) } /// End a write transaction @@ -1202,6 +1213,7 @@ impl WalFile { self.ongoing_checkpoint.min_frame = 0; self.ongoing_checkpoint.max_frame = 0; self.ongoing_checkpoint.current_page = 0; + self.max_frame_read_lock_index.set(NO_LOCK_HELD); self.sync_state.set(SyncState::NotSyncing); self.syncing.set(false); } @@ -1424,7 +1436,9 @@ impl WalFile { // a. the max frame == num wal frames (everything backfilled) // b. the physical db file size differs from the expected pages * page_size // and truncate + sync the db file if necessary. 
- if checkpoint_result.everything_backfilled() { + if checkpoint_result.everything_backfilled() + && checkpoint_result.num_checkpointed_frames > 0 + { checkpoint_result.maybe_guard = self.checkpoint_guard.take(); } else { let _ = self.checkpoint_guard.take(); From 2c3a9fe5efeb86d0b32a41f6ba22ea8c4a7d3d6e Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Tue, 29 Jul 2025 11:19:21 -0400 Subject: [PATCH 40/62] Finish wal transaction handling and add more wal and chkpt testing --- bindings/javascript/src/lib.rs | 3 +- bindings/wasm/node/src/vfs.cjs | 37 --- bindings/wasm/web/src/web-vfs.js | 33 --- core/io/io_uring.rs | 2 +- core/storage/database.rs | 6 +- core/storage/wal.rs | 433 ++++++++++++++++++++++++++----- core/vdbe/execute.rs | 3 + 7 files changed, 379 insertions(+), 138 deletions(-) delete mode 100644 bindings/wasm/node/src/vfs.cjs delete mode 100644 bindings/wasm/web/src/web-vfs.js diff --git a/bindings/javascript/src/lib.rs b/bindings/javascript/src/lib.rs index 470bdfbc0..e15cdaf7f 100644 --- a/bindings/javascript/src/lib.rs +++ b/bindings/javascript/src/lib.rs @@ -731,7 +731,8 @@ impl turso_core::DatabaseStorage for DatabaseFile { len: usize, c: turso_core::Completion, ) -> turso_core::Result { - self.file.truncate(len, c) + let c = self.file.truncate(len, c)?; + Ok(c) } } diff --git a/bindings/wasm/node/src/vfs.cjs b/bindings/wasm/node/src/vfs.cjs deleted file mode 100644 index 2501c512d..000000000 --- a/bindings/wasm/node/src/vfs.cjs +++ /dev/null @@ -1,37 +0,0 @@ -const fs = require('node:fs'); - -class VFS { - constructor() { - } - - open(path, flags) { - return fs.openSync(path, flags); - } - - close(fd) { - fs.closeSync(fd); - } - - pread(fd, buffer, offset) { - return fs.readSync(fd, buffer, 0, buffer.length, offset); - } - - pwrite(fd, buffer, offset) { - return fs.writeSync(fd, buffer, 0, buffer.length, offset); - } - - size(fd) { - let stats = fs.fstatSync(fd); - return BigInt(stats.size); - } - - sync(fd) { - fs.fsyncSync(fd); - } - - truncate(fd, 
size) { - fs.ftruncateSync(fs, size) - } -} - -module.exports = { VFS }; diff --git a/bindings/wasm/web/src/web-vfs.js b/bindings/wasm/web/src/web-vfs.js deleted file mode 100644 index 21bdf9c0a..000000000 --- a/bindings/wasm/web/src/web-vfs.js +++ /dev/null @@ -1,33 +0,0 @@ -export class VFS { - constructor() { - return self.vfs; - } - - open(path, flags) { - return self.vfs.open(path); - } - - close(fd) { - return self.vfs.close(fd); - } - - pread(fd, buffer, offset) { - return self.vfs.pread(fd, buffer, offset); - } - - pwrite(fd, buffer, offset) { - return self.vfs.pwrite(fd, buffer, offset); - } - - size(fd) { - return self.vfs.size(fd); - } - - sync(fd) { - return self.vfs.sync(fd); - } - - truncate(fd, size) { - return self.vfs.truncate(fd, size) - } -} diff --git a/core/io/io_uring.rs b/core/io/io_uring.rs index 721313df4..f33c04db3 100644 --- a/core/io/io_uring.rs +++ b/core/io/io_uring.rs @@ -226,7 +226,7 @@ impl Clock for UringIO { /// use the callback pointer as the user_data for the operation as is /// common practice for io_uring to prevent more indirection fn get_key(c: Completion) -> u64 { - Arc::into_raw(c.inner) as u64 + Arc::into_raw(c.inner.clone()) as u64 } #[inline(always)] diff --git a/core/storage/database.rs b/core/storage/database.rs index 1ff74c95e..fd2555b59 100644 --- a/core/storage/database.rs +++ b/core/storage/database.rs @@ -73,7 +73,8 @@ impl DatabaseStorage for DatabaseFile { #[instrument(skip_all, level = Level::INFO)] fn truncate(&self, len: usize, c: Completion) -> Result { - self.file.truncate(len, c) + let c = self.file.truncate(len, c)?; + Ok(c) } } @@ -131,7 +132,8 @@ impl DatabaseStorage for FileMemoryStorage { #[instrument(skip_all, level = Level::INFO)] fn truncate(&self, len: usize, c: Completion) -> Result { - self.file.truncate(len, c) + let c = self.file.truncate(len, c)?; + Ok(c) } } diff --git a/core/storage/wal.rs b/core/storage/wal.rs index d1a91fc09..853797470 100644 --- a/core/storage/wal.rs +++ 
b/core/storage/wal.rs @@ -275,6 +275,9 @@ pub trait Wal { fn get_max_frame(&self) -> u64; fn get_min_frame(&self) -> u64; fn rollback(&mut self) -> Result<()>; + + #[cfg(debug_assertions)] + fn as_any(&self) -> &dyn std::any::Any; } /// A dummy WAL implementation that does nothing. @@ -371,6 +374,10 @@ impl Wal for DummyWAL { fn rollback(&mut self) -> Result<()> { Ok(()) } + #[cfg(debug_assertions)] + fn as_any(&self) -> &dyn std::any::Any { + self + } } // Syncing requires a state machine because we need to schedule a sync and then wait until it is @@ -429,7 +436,6 @@ pub struct WalFile { shared: Arc>, ongoing_checkpoint: OngoingCheckpoint, checkpoint_threshold: usize, - has_snapshot: Cell, // min and max frames for this connection /// This is the index to the read_lock in WalFileShared that we are holding. This lock contains /// the max frame for this connection. @@ -660,41 +666,48 @@ impl Drop for CheckpointLocks { } impl Wal for WalFile { - /// Begin a read transaction. + /// Begin a read transaction. The caller must ensure that there is not already + /// an ongoing read transaction. + /// sqlite/src/wal.c 3023 + /// assert(pWal->readLock < 0); /* Not currently locked */ #[instrument(skip_all, level = Level::DEBUG)] fn begin_read_tx(&mut self) -> Result<(LimboResult, bool)> { - let (shared_max, nbackfills, last_checksum) = { + turso_assert!( + self.max_frame_read_lock_index.get().eq(&NO_LOCK_HELD), + "cannot start a new read tx without ending an existing one" + ); + let (shared_max, nbackfills, last_checksum, checkpoint_seq) = { let shared = self.get_shared(); let mx = shared.max_frame.load(Ordering::SeqCst); let nb = shared.nbackfills.load(Ordering::SeqCst); let ck = shared.last_checksum; - (mx, nb, ck) + let checkpoint_seq = shared.wal_header.lock().checkpoint_seq; + (mx, nb, ck, checkpoint_seq) }; let db_changed = shared_max > self.max_frame; // WAL is already fully back‑filled into the main DB image - // (mxFrame == nBackfill). 
Readers can therefore ignore the + // (mxFrame == nBackfill). Readers can therefore ignore the // WAL and fetch pages directly from the DB file. We do this - // by taking read‑lock 0. + // by taking read‑lock 0, and capturing the latest state. if shared_max == nbackfills { - let l0 = &mut self.get_shared().read_locks[0]; - if !l0.read() { + let lock_idx = 0; + if !self.get_shared().read_locks[lock_idx].read() { return Ok((LimboResult::Busy, db_changed)); } - self.max_frame = shared_max; // we need to keep self.max_frame set to the appropriate // max frame in the wal at the time this transaction starts. - // but here we set min_frame=max_frame + 1 to keep an empty snapshot window, - // to demonstrate that we do not care about any frames, - // while still capturing a snapshot that we may need if we ever want to upgrade - // to a write transaction. - self.min_frame = shared_max.saturating_add(1); - self.max_frame_read_lock_index.set(0); - self.has_snapshot.set(true); + self.max_frame = shared_max; + self.max_frame_read_lock_index.set(lock_idx); + self.min_frame = nbackfills + 1; self.last_checksum = last_checksum; return Ok((LimboResult::Ok, db_changed)); } + // If we get this far, it means that the reader will want to use + // the WAL to get at content from recent commits. The job now is + // to select one of the aReadMark[] entries that is closest to + // but not exceeding pWal->hdr.mxFrame and lock that entry. // Find largest mark <= mx among slots 1..N let mut best_idx: i64 = -1; let mut best_mark: u32 = 0; @@ -725,25 +738,54 @@ impl Wal for WalFile { return Ok((LimboResult::Busy, db_changed)); } - // Now take a shared read on that slot - let (min_frame, start_pages) = { + // Now take a shared read on that slot, and if we are successful, + // grab another snapshot of the shared state. 
+ let (mx2, nb2, cksm2, start_pages, ckpt_seq2) = { let shared = self.get_shared(); - let lock = &mut shared.read_locks[best_idx as usize]; - if !lock.read() { + if !shared.read_locks[best_idx as usize].read() { + // TODO: we should retry here instead of always returning Busy return Ok((LimboResult::Busy, db_changed)); } ( - shared.nbackfills.load(Ordering::SeqCst) + 1, + shared.max_frame.load(Ordering::SeqCst), + shared.nbackfills.load(Ordering::SeqCst), + shared.last_checksum, shared.pages_in_frames.lock().len(), + shared.wal_header.lock().checkpoint_seq, ) }; - self.min_frame = min_frame; + // sqlite/src/wal.c 3225 + // Now that the read-lock has been obtained, check that neither the + // value in the aReadMark[] array or the contents of the wal-index + // header have changed. + // + // It is necessary to check that the wal-index header did not change + // between the time it was read and when the shared-lock was obtained + // on WAL_READ_LOCK(mxI) was obtained to account for the possibility + // that the log file may have been wrapped by a writer, or that frames + // that occur later in the log than pWal->hdr.mxFrame may have been + // copied into the database by a checkpointer. If either of these things + // happened, then reading the database with the current value of + // pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry + // instead. + // + // Before checking that the live wal-index header has not changed + // since it was read, set Wal.minFrame to the first frame in the wal + // file that has not yet been checkpointed. This client will not need + // to read any frames earlier than minFrame from the wal file - they + // can be safely read directly from the database file. 
+ self.min_frame = nb2 + 1; + if mx2 != shared_max + || nb2 != nbackfills + || cksm2 != last_checksum + || ckpt_seq2 != checkpoint_seq + { + return Err(LimboError::Busy); + } self.max_frame = best_mark as u64; self.start_pages_in_frames = start_pages; self.max_frame_read_lock_index.set(best_idx as usize); - self.has_snapshot.set(true); - tracing::debug!( "begin_read_tx(min={}, max={}, slot={}, max_frame_in_wal={})", self.min_frame, @@ -751,7 +793,6 @@ impl Wal for WalFile { best_idx, shared_max ); - Ok((LimboResult::Ok, db_changed)) } @@ -764,10 +805,9 @@ impl Wal for WalFile { let rl = &mut self.get_shared().read_locks[slot]; rl.unlock(); self.max_frame_read_lock_index.set(NO_LOCK_HELD); - self.has_snapshot.set(false); tracing::debug!("end_read_tx(slot={slot})"); } else { - tracing::debug!("end_read_tx(no snapshot)"); + tracing::debug!("end_read_tx(slot=no_lock)"); } } @@ -775,14 +815,18 @@ impl Wal for WalFile { #[instrument(skip_all, level = Level::DEBUG)] fn begin_write_tx(&mut self) -> Result { let shared = self.get_shared(); + // sqlite/src/wal.c 3702 + // Cannot start a write transaction without first holding a read + // transaction. + // assert(pWal->readLock >= 0); + // assert(pWal->writeLock == 0 && pWal->iReCksum == 0); + turso_assert!( + self.max_frame_read_lock_index.get() != NO_LOCK_HELD, + "must have a read transaction to begin a write transaction" + ); if !shared.write_lock.write() { return Ok(LimboResult::Busy); } - if !self.has_snapshot.get() { - // In SQLite this cannot happen (assert). Either assert or handle like Busy. 
- shared.write_lock.unlock(); - return Ok(LimboResult::Busy); - } let (shared_max, nbackfills, last_checksum) = { let shared = self.get_shared(); ( @@ -818,10 +862,10 @@ impl Wal for WalFile { return Ok(None); } let shared = self.get_shared(); - // if we have read_lock 0, we are reading straight from the db file let frames = shared.frame_cache.lock(); + let range = self.min_frame..=self.max_frame; if let Some(list) = frames.get(&page_id) { - if let Some(f) = list.iter().rev().find(|f| **f <= self.max_frame) { + if let Some(f) = list.iter().rfind(|&&f| range.contains(&f)) { return Ok(Some(*f)); } } @@ -1125,6 +1169,10 @@ impl Wal for WalFile { shared.last_checksum = self.last_checksum; Ok(()) } + #[cfg(debug_assertions)] + fn as_any(&self) -> &dyn std::any::Any { + self + } } impl WalFile { @@ -1160,7 +1208,6 @@ impl WalFile { max_frame: 0, current_page: 0, }, - has_snapshot: false.into(), checkpoint_threshold: 1000, buffer_pool, syncing: Rc::new(Cell::new(false)), @@ -1221,6 +1268,7 @@ impl WalFile { /// the WAL file has been truncated and we are writing the first /// frame since then. We need to ensure that the header is initialized. 
fn ensure_header_if_needed(&mut self) -> Result<()> { + tracing::debug!("ensure_header_if_needed"); self.last_checksum = { let shared = self.get_shared(); if shared.max_frame.load(Ordering::SeqCst) != 0 { @@ -1286,8 +1334,7 @@ impl WalFile { } // acquire the appropriate exclusive locks depending on the checkpoint mode self.acquire_proper_checkpoint_guard(mode)?; - self.ongoing_checkpoint.max_frame = - self.determine_max_safe_checkpoint_frame(mode); + self.ongoing_checkpoint.max_frame = self.determine_max_safe_checkpoint_frame(); self.ongoing_checkpoint.min_frame = nbackfills + 1; self.ongoing_checkpoint.current_page = 0; self.ongoing_checkpoint.state = CheckpointState::ReadFrame; @@ -1297,8 +1344,7 @@ impl WalFile { self.ongoing_checkpoint.max_frame, ); } - // Find the next page that has a frame in the safe interval and - // schedule a read of that frame. + // Find the next page that has a frame in the safe interval and schedule a read of that frame. CheckpointState::ReadFrame => { let shared = self.get_shared(); let min_frame = self.ongoing_checkpoint.min_frame; @@ -1476,38 +1522,31 @@ impl WalFile { /// just respect the first busy slot and move on. /// /// Locking rules: - /// This routine **tries** to take an exclusive (write) lock on each slot to + /// This routine tries to take an exclusive (write) lock on each slot to /// update/clean it. If the try-lock fails: /// PASSIVE: do not wait; just lower `mxSafeFrame` and break. /// Others: lower `mxSafeFrame` and continue scanning. /// /// We never modify slot values while a reader holds that slot. 
- fn determine_max_safe_checkpoint_frame(&self, mode: CheckpointMode) -> u64 { + fn determine_max_safe_checkpoint_frame(&self) -> u64 { let shared = self.get_shared(); let mut max_safe_frame = shared.max_frame.load(Ordering::SeqCst); for (read_lock_idx, read_lock) in shared.read_locks.iter_mut().enumerate().skip(1) { let this_mark = read_lock.value.load(Ordering::SeqCst); - if this_mark == READMARK_NOT_USED { - continue; - } if this_mark < max_safe_frame as u32 { let busy = !read_lock.write(); if !busy { - // Only adjust, never clear, in ordinary checkpoints - if read_lock_idx == 1 { + let val = if read_lock_idx == 1 { // store the shared max_frame for the default read slot 1 - read_lock - .value - .store(max_safe_frame as u32, Ordering::SeqCst); - } + max_safe_frame as u32 + } else { + READMARK_NOT_USED + }; + read_lock.value.store(val, Ordering::SeqCst); read_lock.unlock(); } else { max_safe_frame = this_mark as u64; - if matches!(mode, CheckpointMode::Passive) { - // Don't keep poking, PASSIVE can't block or spin - break; - } } } } @@ -1768,13 +1807,6 @@ impl WalFileShared { for l in &self.read_locks[2..] { l.value.store(READMARK_NOT_USED, Ordering::SeqCst); } - - // reset read‑marks - self.read_locks[0].value.store(0, Ordering::SeqCst); - self.read_locks[1].value.store(0, Ordering::SeqCst); - for lock in &self.read_locks[2..] 
{ - lock.value.store(READMARK_NOT_USED, Ordering::SeqCst); - } Ok(()) } } @@ -1783,10 +1815,13 @@ impl WalFileShared { pub mod test { use crate::{ result::LimboResult, - storage::{sqlite3_ondisk::WAL_HEADER_SIZE, wal::READMARK_NOT_USED}, + storage::{ + sqlite3_ondisk::{self, WAL_HEADER_SIZE}, + wal::READMARK_NOT_USED, + }, types::IOResult, CheckpointMode, CheckpointResult, Completion, Connection, Database, LimboError, PlatformIO, - Wal, WalFileShared, IO, + StepResult, Wal, WalFile, WalFileShared, IO, }; #[cfg(unix)] use std::os::unix::fs::MetadataExt; @@ -1909,7 +1944,7 @@ pub mod test { } #[test] - fn restart_checkpoint_resets_wal_state_and_increments_ckpt_seq() { + fn restart_checkpoint_reset_wal_state_handling() { let (db, path) = get_database(); let walpath = { @@ -2273,6 +2308,7 @@ pub mod test { { let pager = conn2.pager.borrow_mut(); let mut wal = pager.wal.borrow_mut(); + let _ = wal.begin_read_tx().unwrap(); let res = wal.begin_write_tx().unwrap(); assert!(matches!(res, LimboResult::Ok), "result: {res:?}"); } @@ -2289,6 +2325,7 @@ pub mod test { "Restart checkpoint should fail when write lock is held" ); + conn2.pager.borrow().wal.borrow().end_read_tx(); // release write lock conn2.pager.borrow().wal.borrow().end_write_tx(); @@ -2301,4 +2338,272 @@ pub mod test { assert!(result.everything_backfilled()); } + + #[test] + #[should_panic(expected = "must have a read transaction to begin a write transaction")] + fn test_wal_read_transaction_required_before_write() { + let (db, _path) = get_database(); + let conn = db.connect().unwrap(); + + conn.execute("create table test(id integer primary key, value text)") + .unwrap(); + + // Attempt to start a write transaction without a read transaction + let pager = conn.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + let _ = wal.begin_write_tx(); + } + + fn check_read_lock_slot(conn: &Arc, expected_slot: usize) -> bool { + let pager = conn.pager.borrow(); + let wal = pager.wal.borrow(); + let wal_any = 
wal.as_any(); + if let Some(wal_file) = wal_any.downcast_ref::() { + return wal_file.max_frame_read_lock_index.get() == expected_slot; + } + false + } + + #[test] + fn test_wal_multiple_readers_at_different_frames() { + let (db, _path) = get_database(); + let conn_writer = db.connect().unwrap(); + + conn_writer + .execute("CREATE TABLE test(id INTEGER PRIMARY KEY, value TEXT)") + .unwrap(); + + fn start_reader(conn: &Arc) -> (u64, crate::Statement) { + conn.execute("BEGIN").unwrap(); + let mut stmt = conn.prepare("SELECT * FROM test").unwrap(); + stmt.step().unwrap(); + let frame = conn.pager.borrow().wal.borrow().get_max_frame(); + (frame, stmt) + } + + bulk_inserts(&conn_writer, 3, 5); + + let conn1 = &db.connect().unwrap(); + let (r1_frame, _stmt) = start_reader(conn1); // reader 1 + + bulk_inserts(&conn_writer, 3, 5); + + let conn_r2 = db.connect().unwrap(); + let (r2_frame, _stmt2) = start_reader(&conn_r2); // reader 2 + + bulk_inserts(&conn_writer, 3, 5); + + let conn_r3 = db.connect().unwrap(); + let (r3_frame, _stmt3) = start_reader(&conn_r3); // reader 3 + + assert!(r1_frame < r2_frame && r2_frame < r3_frame); + + // passive checkpoint #1 + let result1 = { + let pager = conn_writer.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Passive) + }; + assert_eq!(result1.num_checkpointed_frames, r1_frame); + + // finish reader‑1 + conn1.execute("COMMIT").unwrap(); + + // passive checkpoint #2 + let result2 = { + let pager = conn_writer.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Passive) + }; + assert_eq!( + result1.num_checkpointed_frames + result2.num_checkpointed_frames, + r2_frame + ); + + // verify visible rows + let mut stmt = conn_r2.query("SELECT COUNT(*) FROM test").unwrap().unwrap(); + while !matches!(stmt.step().unwrap(), StepResult::Row) { + stmt.run_once().unwrap(); + } + let r2_cnt: i64 = 
stmt.row().unwrap().get(0).unwrap(); + + let mut stmt2 = conn_r3.query("SELECT COUNT(*) FROM test").unwrap().unwrap(); + while !matches!(stmt2.step().unwrap(), StepResult::Row) { + stmt2.run_once().unwrap(); + } + let r3_cnt: i64 = stmt2.row().unwrap().get(0).unwrap(); + assert_eq!(r2_cnt, 30); + assert_eq!(r3_cnt, 45); + } + + #[test] + fn test_checkpoint_truncate_reset_handling() { + let (db, path) = get_database(); + let conn = db.connect().unwrap(); + + let walpath = { + let mut p = path.clone().into_os_string().into_string().unwrap(); + p.push_str("/test.db-wal"); + std::path::PathBuf::from(p) + }; + + conn.execute("create table test(id integer primary key, value text)") + .unwrap(); + bulk_inserts(&conn, 10, 10); + + // Get size before checkpoint + let size_before = std::fs::metadata(&walpath).unwrap().len(); + assert!(size_before > 0, "WAL file should have content"); + + // Do a TRUNCATE checkpoint + { + let pager = conn.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Truncate); + } + + // Check file size after truncate + let size_after = std::fs::metadata(&walpath).unwrap().len(); + assert_eq!(size_after, 0, "WAL file should be truncated to 0 bytes"); + + // Verify we can still write to the database + conn.execute("INSERT INTO test VALUES (1001, 'after-truncate')") + .unwrap(); + + // Check WAL has new content + let new_size = std::fs::metadata(&walpath).unwrap().len(); + assert!(new_size >= 32, "WAL file too small"); + let hdr = read_wal_header(&walpath); + assert!( + hdr.magic == sqlite3_ondisk::WAL_MAGIC_BE, + "bad WAL magic: {:#X}", + hdr.magic + ); + assert_eq!(hdr.file_format, 3007000); + assert_eq!(hdr.page_size, 4096, "invalid page size"); + assert_eq!(hdr.checkpoint_seq, 0, "invalid checkpoint_seq"); + std::fs::remove_dir_all(path).unwrap(); + } + + fn read_wal_header(path: &std::path::Path) -> sqlite3_ondisk::WalHeader { + use std::{fs::File, io::Read}; + let mut hdr = [0u8; 
32]; + File::open(path).unwrap().read_exact(&mut hdr).unwrap(); + let be = |i| u32::from_be_bytes(hdr[i..i + 4].try_into().unwrap()); + sqlite3_ondisk::WalHeader { + magic: be(0x00), + file_format: be(0x04), + page_size: be(0x08), + checkpoint_seq: be(0x0C), + salt_1: be(0x10), + salt_2: be(0x14), + checksum_1: be(0x18), + checksum_2: be(0x1C), + } + } + + #[test] + fn test_wal_stale_snapshot_in_write_transaction() { + let (db, _path) = get_database(); + let conn1 = db.connect().unwrap(); + let conn2 = db.connect().unwrap(); + + conn1 + .execute("create table test(id integer primary key, value text)") + .unwrap(); + // Start a read transaction on conn2 + { + let pager = conn2.pager.borrow_mut(); + let mut wal = pager.wal.borrow_mut(); + let (res, _) = wal.begin_read_tx().unwrap(); + assert!(matches!(res, LimboResult::Ok)); + } + // Make changes using conn1 + bulk_inserts(&conn1, 5, 5); + // Try to start a write transaction on conn2 with a stale snapshot + let result = { + let pager = conn2.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + wal.begin_write_tx() + }; + // Should get Busy due to stale snapshot + assert!(matches!(result.unwrap(), LimboResult::Busy)); + + // End read transaction and start a fresh one + { + let pager = conn2.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + wal.end_read_tx(); + let (res, _) = wal.begin_read_tx().unwrap(); + assert!(matches!(res, LimboResult::Ok)); + } + // Now write transaction should work + let result = { + let pager = conn2.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + wal.begin_write_tx() + }; + assert!(matches!(result.unwrap(), LimboResult::Ok)); + } + + #[test] + fn test_wal_readlock0_optimization_behavior() { + let (db, _path) = get_database(); + let conn1 = db.connect().unwrap(); + let conn2 = db.connect().unwrap(); + + conn1 + .execute("create table test(id integer primary key, value text)") + .unwrap(); + bulk_inserts(&conn1, 5, 5); + // Do a full checkpoint to move all data to DB file 
+ { + let pager = conn1.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Passive); + } + + // Start a read transaction on conn2 + { + let pager = conn2.pager.borrow_mut(); + let mut wal = pager.wal.borrow_mut(); + let (res, _) = wal.begin_read_tx().unwrap(); + assert!(matches!(res, LimboResult::Ok)); + } + // should use slot 0, as everything is backfilled + assert!(check_read_lock_slot(&conn2, 0)); + { + let pager = conn1.pager.borrow(); + let wal = pager.wal.borrow(); + let frame = wal.find_frame(5); + // since we hold readlock0, we should ignore the db file and find_frame should return none + assert!(frame.is_ok_and(|f| f.is_none())); + } + // Try checkpoint, should fail because reader has slot 0 + { + let pager = conn1.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + let result = wal.checkpoint(&pager, Rc::new(RefCell::new(0)), CheckpointMode::Restart); + + assert!( + matches!(result, Err(LimboError::Busy)), + "RESTART checkpoint should fail when a reader is using slot 0" + ); + } + // End the read transaction + { + let pager = conn2.pager.borrow(); + let wal = pager.wal.borrow(); + wal.end_read_tx(); + } + { + let pager = conn1.pager.borrow(); + let mut wal = pager.wal.borrow_mut(); + let result = run_checkpoint_until_done(&mut *wal, &pager, CheckpointMode::Restart); + assert!( + result.everything_backfilled(), + "RESTART checkpoint should succeed after reader releases slot 0" + ); + } + } } diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 85ed17698..8d0fbaea8 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -6317,6 +6317,9 @@ pub fn op_open_ephemeral( } OpOpenEphemeralState::StartingTxn { pager } => { tracing::trace!("StartingTxn"); + pager + .begin_read_tx() // we have to begin a read tx before beginning a write + .expect("Failed to start read transaction"); return_if_io!(pager.begin_write_tx()); state.op_open_ephemeral_state = 
OpOpenEphemeralState::CreateBtree { pager: pager.clone(), From 4dc15492d84dcb46654c657604505f84cbe80989 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Tue, 29 Jul 2025 22:55:00 -0400 Subject: [PATCH 41/62] Integrate changes from tx isolation commits from @jussisaurio --- core/lib.rs | 11 +++++++---- core/storage/pager.rs | 14 +++++++------- core/storage/wal.rs | 11 ++++------- core/vdbe/execute.rs | 37 +++++++++++++++++++++++++++++++++---- core/vdbe/mod.rs | 3 +++ 5 files changed, 54 insertions(+), 22 deletions(-) diff --git a/core/lib.rs b/core/lib.rs index ec0b7a56f..847244b7d 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -103,6 +103,7 @@ pub type Result = std::result::Result; enum TransactionState { Write { schema_did_change: bool }, Read, + PendingUpgrade, None, } @@ -1206,16 +1207,18 @@ impl Connection { match self.transaction_state.get() { TransactionState::Write { schema_did_change } => { - let _result = self.pager.borrow().end_tx( + while let IOResult::IO = self.pager.borrow().end_tx( true, // rollback = true for close schema_did_change, self, self.wal_checkpoint_disabled.get(), - ); + )? { + self.run_once()?; + } self.transaction_state.set(TransactionState::None); } - TransactionState::Read => { - let _result = self.pager.borrow().end_read_tx(); + TransactionState::PendingUpgrade | TransactionState::Read => { + self.pager.borrow().end_read_tx()?; self.transaction_state.set(TransactionState::None); } TransactionState::None => { diff --git a/core/storage/pager.rs b/core/storage/pager.rs index bc9a0e171..fbdabece3 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -1311,19 +1311,19 @@ impl Pager { let page_size = self.page_size.get().unwrap_or(DEFAULT_PAGE_SIZE); let expected = (db_size * page_size) as u64; if expected < self.db_file.size()? 
{ - self.db_file.truncate( + self.io.wait_for_completion(self.db_file.truncate( expected as usize, Completion::new_trunc(move |_| { tracing::trace!( "Database file truncated to expected size: {} bytes", expected ); - }) - .into(), - )?; - self.db_file.sync(Completion::new_sync(move |_| { - tracing::trace!("Database file syncd after truncation"); - }))?; + }), + )?)?; + self.io + .wait_for_completion(self.db_file.sync(Completion::new_sync(move |_| { + tracing::trace!("Database file syncd after truncation"); + }))?)?; } checkpoint_result.release_guard(); } diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 853797470..038b51c91 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1393,7 +1393,7 @@ impl WalFile { } CheckpointState::WritePage => { self.ongoing_checkpoint.page.set_dirty(); - begin_write_btree_page( + let _ = begin_write_btree_page( pager, &self.ongoing_checkpoint.page, write_counter.clone(), @@ -1616,12 +1616,9 @@ impl WalFile { .wait_for_completion( shared .file - .sync( - Completion::new_sync(|_| { - tracing::trace!("WAL file synced after reset/truncation"); - }) - .into(), - ) + .sync(Completion::new_sync(|_| { + tracing::trace!("WAL file synced after reset/truncation"); + })) .inspect_err(handle_err)?, ) .inspect_err(handle_err)?; diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 8d0fbaea8..9307bc524 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -61,7 +61,10 @@ use crate::{ vector::{vector32, vector64, vector_distance_cos, vector_distance_l2, vector_extract}, }; -use crate::{info, BufferPool, MvCursor, OpenFlags, RefValue, Row, StepResult, TransactionState}; +use crate::{ + info, turso_assert, BufferPool, MvCursor, OpenFlags, RefValue, Row, StepResult, + TransactionState, +}; use super::{ insn::{Cookie, RegisterOrLiteral}, @@ -1982,6 +1985,20 @@ pub fn op_transaction( } else { let current_state = conn.transaction_state.get(); let (new_transaction_state, updated) = match (current_state, write) { + // 
pending state means that we tried beginning a tx and the method returned IO. + // instead of ending the read tx, just update the state to pending. + (TransactionState::PendingUpgrade, write) => { + turso_assert!( + write, + "pending upgrade should only be set for write transactions" + ); + ( + TransactionState::Write { + schema_did_change: false, + }, + true, + ) + } (TransactionState::Write { schema_did_change }, true) => { (TransactionState::Write { schema_did_change }, false) } @@ -2003,7 +2020,6 @@ pub fn op_transaction( ), (TransactionState::None, false) => (TransactionState::Read, true), }; - if updated && matches!(current_state, TransactionState::None) { if let LimboResult::Busy = pager.begin_read_tx()? { return Ok(InsnFunctionStepResult::Busy); @@ -2014,12 +2030,24 @@ pub fn op_transaction( match pager.begin_write_tx()? { IOResult::Done(r) => { if let LimboResult::Busy = r { - pager.end_read_tx()?; + if matches!( + current_state, + TransactionState::Read | TransactionState::PendingUpgrade + ) { + pager.end_read_tx()?; + } + conn.transaction_state.replace(TransactionState::None); + conn.auto_commit.replace(true); return Ok(InsnFunctionStepResult::Busy); } } IOResult::IO => { - pager.end_read_tx()?; + // set the transaction state to pending so we don't have to + // end the read transaction. 
+ program + .connection + .transaction_state + .replace(TransactionState::PendingUpgrade); return Ok(InsnFunctionStepResult::IO); } } @@ -6093,6 +6121,7 @@ pub fn op_set_cookie( }, TransactionState::Read => unreachable!("invalid transaction state for SetCookie: TransactionState::Read, should be write"), TransactionState::None => unreachable!("invalid transaction state for SetCookie: TransactionState::None, should be write"), + TransactionState::PendingUpgrade => unreachable!("invalid transaction state for SetCookie: TransactionState::PendingUpgrade, should be write"), } } program diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 4f15705b5..1c4addfe5 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -481,6 +481,9 @@ impl Program { Ok(StepResult::Done) } TransactionState::None => Ok(StepResult::Done), + TransactionState::PendingUpgrade => { + panic!("Unexpected transaction state: {current_state:?} during auto-commit",) + } } } else { if self.change_cnt_on { From d5b0d284e6bed60db76fae8c95255db35c97fec2 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Tue, 29 Jul 2025 17:34:14 +0300 Subject: [PATCH 42/62] bindings/rust: add tx isolation fuzz test --- Cargo.lock | 2 + bindings/rust/Cargo.toml | 2 + .../rust/tests/transaction_isolation_fuzz.rs | 572 ++++++++++++++++++ 3 files changed, 576 insertions(+) create mode 100644 bindings/rust/tests/transaction_isolation_fuzz.rs diff --git a/Cargo.lock b/Cargo.lock index 5c9feb314..46fa68323 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3790,6 +3790,8 @@ dependencies = [ name = "turso" version = "0.1.3" dependencies = [ + "rand 0.8.5", + "rand_chacha 0.3.1", "tempfile", "thiserror 2.0.12", "tokio", diff --git a/bindings/rust/Cargo.toml b/bindings/rust/Cargo.toml index 557ad6e20..a1cdf873b 100644 --- a/bindings/rust/Cargo.toml +++ b/bindings/rust/Cargo.toml @@ -21,3 +21,5 @@ thiserror = "2.0.9" [dev-dependencies] tempfile = "3.20.0" tokio = { version = "1.29.1", features = ["full"] } +rand = "0.8.5" +rand_chacha = 
"0.3.1" diff --git a/bindings/rust/tests/transaction_isolation_fuzz.rs b/bindings/rust/tests/transaction_isolation_fuzz.rs new file mode 100644 index 000000000..a0d37b397 --- /dev/null +++ b/bindings/rust/tests/transaction_isolation_fuzz.rs @@ -0,0 +1,572 @@ +use rand::seq::SliceRandom; +use rand::Rng; +use rand_chacha::{rand_core::SeedableRng, ChaCha8Rng}; +use std::collections::{HashMap, HashSet}; +use turso::{Builder, Value}; + +// In-memory representation of the database state +#[derive(Debug, Clone, PartialEq, Eq)] +struct DbRow { + id: i64, + text: String, +} + +impl std::fmt::Display for DbRow { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "(id={}, text={})", self.id, self.text) + } +} + +#[derive(Debug, Clone)] +struct TransactionState { + // What this transaction can see (snapshot) + visible_rows: HashMap, + // Pending changes in this transaction + pending_changes: Vec, +} + +#[derive(Debug)] +struct ShadowDb { + // Committed state (what's actually in the database) + committed_rows: HashMap, + // Transaction states + transactions: HashMap>, +} + +impl ShadowDb { + fn new() -> Self { + Self { + committed_rows: HashMap::new(), + transactions: HashMap::new(), + } + } + + fn begin_transaction(&mut self, tx_id: usize, immediate: bool) { + self.transactions.insert( + tx_id, + if immediate { + Some(TransactionState { + visible_rows: self.committed_rows.clone(), + pending_changes: Vec::new(), + }) + } else { + None + }, + ); + } + + fn take_snapshot(&mut self, tx_id: usize) { + if let Some(tx_state) = self.transactions.get_mut(&tx_id) { + assert!(tx_state.is_none()); + tx_state.replace(TransactionState { + visible_rows: self.committed_rows.clone(), + pending_changes: Vec::new(), + }); + } + } + + fn commit_transaction(&mut self, tx_id: usize) { + if let Some(tx_state) = self.transactions.remove(&tx_id) { + let tx_state = tx_state.unwrap(); + // Apply pending changes to committed state + for op in tx_state.pending_changes { + 
match op { + Operation::Insert { id, text } => { + self.committed_rows.insert(id, DbRow { id, text }); + } + Operation::Update { id, text } => { + self.committed_rows.insert(id, DbRow { id, text }); + } + Operation::Delete { id } => { + self.committed_rows.remove(&id); + } + other => unreachable!("Unexpected operation: {other}"), + } + } + } + } + + fn rollback_transaction(&mut self, tx_id: usize) { + self.transactions.remove(&tx_id); + } + + fn insert(&mut self, tx_id: usize, id: i64, text: String) -> Result<(), String> { + if let Some(tx_state) = self.transactions.get_mut(&tx_id) { + // Check if row exists in visible state + if tx_state.as_ref().unwrap().visible_rows.contains_key(&id) { + return Err("UNIQUE constraint failed".to_string()); + } + let row = DbRow { + id, + text: text.clone(), + }; + tx_state + .as_mut() + .unwrap() + .pending_changes + .push(Operation::Insert { id, text }); + tx_state.as_mut().unwrap().visible_rows.insert(id, row); + Ok(()) + } else { + Err("No active transaction".to_string()) + } + } + + fn update(&mut self, tx_id: usize, id: i64, text: String) -> Result<(), String> { + if let Some(tx_state) = self.transactions.get_mut(&tx_id) { + // Check if row exists in visible state + if !tx_state.as_ref().unwrap().visible_rows.contains_key(&id) { + return Err("Row not found".to_string()); + } + let row = DbRow { + id, + text: text.clone(), + }; + tx_state + .as_mut() + .unwrap() + .pending_changes + .push(Operation::Update { id, text }); + tx_state.as_mut().unwrap().visible_rows.insert(id, row); + Ok(()) + } else { + Err("No active transaction".to_string()) + } + } + + fn delete(&mut self, tx_id: usize, id: i64) -> Result<(), String> { + if let Some(tx_state) = self.transactions.get_mut(&tx_id) { + // Check if row exists in visible state + if !tx_state.as_ref().unwrap().visible_rows.contains_key(&id) { + return Err("Row not found".to_string()); + } + tx_state + .as_mut() + .unwrap() + .pending_changes + .push(Operation::Delete { id }); + 
tx_state.as_mut().unwrap().visible_rows.remove(&id); + Ok(()) + } else { + Err("No active transaction".to_string()) + } + } + + fn get_visible_rows(&self, tx_id: Option) -> Vec { + let Some(tx_id) = tx_id else { + // No transaction - see committed state + return self.committed_rows.values().cloned().collect(); + }; + if let Some(tx_state) = self.transactions.get(&tx_id) { + let tx_state = tx_state.as_ref().unwrap(); + tx_state.visible_rows.values().cloned().collect() + } else { + // No transaction - see committed state + self.committed_rows.values().cloned().collect() + } + } +} + +#[derive(Debug, Clone)] +enum Operation { + Begin, + Commit, + Rollback, + Insert { id: i64, text: String }, + Update { id: i64, text: String }, + Delete { id: i64 }, + Select, +} + +impl std::fmt::Display for Operation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Operation::Begin => write!(f, "BEGIN"), + Operation::Commit => write!(f, "COMMIT"), + Operation::Rollback => write!(f, "ROLLBACK"), + Operation::Insert { id, text } => { + write!(f, "INSERT INTO test_table (id, text) VALUES ({id}, {text})") + } + Operation::Update { id, text } => { + write!(f, "UPDATE test_table SET text = {text} WHERE id = {id}") + } + Operation::Delete { id } => write!(f, "DELETE FROM test_table WHERE id = {id}"), + Operation::Select => write!(f, "SELECT * FROM test_table"), + } + } +} + +fn rng_from_time_or_env() -> (ChaCha8Rng, u64) { + let seed = std::env::var("SEED").map_or( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis(), + |v| { + v.parse() + .expect("Failed to parse SEED environment variable as u64") + }, + ); + let rng = ChaCha8Rng::seed_from_u64(seed as u64); + (rng, seed as u64) +} + +#[tokio::test] +#[ignore] +/// Verify translation isolation semantics with multiple concurrent connections. 
+/// This test is ignored because it still fails sometimes; unsure if it fails due to a bug in the test or a bug in the implementation. +async fn test_multiple_connections_fuzz() { + let (mut rng, seed) = rng_from_time_or_env(); + println!("Multiple connections fuzz test seed: {}", seed); + + const NUM_ITERATIONS: usize = 50; + const OPERATIONS_PER_CONNECTION: usize = 30; + const NUM_CONNECTIONS: usize = 2; + + for iteration in 0..NUM_ITERATIONS { + // Create a fresh database for each iteration + let tempfile = tempfile::NamedTempFile::new().unwrap(); + let db = Builder::new_local(tempfile.path().to_str().unwrap()) + .build() + .await + .unwrap(); + + // SHARED shadow database for all connections + let mut shared_shadow_db = ShadowDb::new(); + let mut next_tx_id = 0; + + // Create connections + let mut connections = Vec::new(); + for conn_id in 0..NUM_CONNECTIONS { + let conn = db.connect().unwrap(); + + // Create table if it doesn't exist + conn.execute( + "CREATE TABLE IF NOT EXISTS test_table (id INTEGER PRIMARY KEY, text TEXT)", + (), + ) + .await + .unwrap(); + + connections.push((conn, conn_id, None::)); // (connection, conn_id, current_tx_id) + } + + // Interleave operations between all connections + for op_num in 0..OPERATIONS_PER_CONNECTION { + for (conn, conn_id, current_tx_id) in &mut connections { + // Generate operation based on current transaction state + let visible_rows = if let Some(tx_id) = *current_tx_id { + // Take snapshot during first operation after a BEGIN, not immediately at BEGIN (the semantics is BEGIN DEFERRED) + let tx_state = shared_shadow_db.transactions.get(&tx_id).unwrap(); + if tx_state.is_none() { + shared_shadow_db.take_snapshot(tx_id); + } + shared_shadow_db.get_visible_rows(Some(tx_id)) + } else { + shared_shadow_db.get_visible_rows(None) // No transaction + }; + + let operation = + generate_operation(&mut rng, current_tx_id.is_some(), &visible_rows); + + println!("Connection {conn_id}(op={op_num}): {}", operation); + + match 
operation { + Operation::Begin => { + shared_shadow_db.begin_transaction(next_tx_id, false); + *current_tx_id = Some(next_tx_id); + next_tx_id += 1; + + conn.execute("BEGIN", ()).await.unwrap(); + } + Operation::Commit => { + let Some(tx_id) = *current_tx_id else { + panic!("Connection {conn_id}(op={op_num}) FAILED: No transaction"); + }; + // Try real DB commit first + let result = conn.execute("COMMIT", ()).await; + + match result { + Ok(_) => { + // Success - update shadow DB + shared_shadow_db.commit_transaction(tx_id); + *current_tx_id = None; + } + Err(e) => { + println!("Connection {conn_id}(op={op_num}) FAILED: {e}"); + if let Some(tx_id) = *current_tx_id { + shared_shadow_db.rollback_transaction(tx_id); + *current_tx_id = None; + } + + // Check if it's an acceptable error + if !e.to_string().contains("Busy") + && !e.to_string().contains("database is locked") + { + panic!("Unexpected error during commit: {}", e); + } + } + } + } + Operation::Rollback => { + if let Some(tx_id) = *current_tx_id { + // Try real DB rollback first + let result = conn.execute("ROLLBACK", ()).await; + + match result { + Ok(_) => { + // Success - update shadow DB + shared_shadow_db.rollback_transaction(tx_id); + *current_tx_id = None; + } + Err(e) => { + println!("Connection {conn_id}(op={op_num}) FAILED: {e}"); + shared_shadow_db.rollback_transaction(tx_id); + *current_tx_id = None; + + // Check if it's an acceptable error + if !e.to_string().contains("Busy") + && !e.to_string().contains("database is locked") + { + panic!("Unexpected error during rollback: {e}"); + } + } + } + } + } + Operation::Insert { id, text } => { + let result = conn + .execute( + "INSERT INTO test_table (id, text) VALUES (?, ?)", + vec![Value::Integer(id), Value::Text(text.clone())], + ) + .await; + + // Check if real DB operation succeeded + match result { + Ok(_) => { + // Success - update shadow DB + if let Some(tx_id) = *current_tx_id { + // In transaction - update transaction's view + 
shared_shadow_db.insert(tx_id, id, text.clone()).unwrap(); + } else { + // Auto-commit - update shadow DB committed state + shared_shadow_db.begin_transaction(next_tx_id, true); + shared_shadow_db + .insert(next_tx_id, id, text.clone()) + .unwrap(); + shared_shadow_db.commit_transaction(next_tx_id); + next_tx_id += 1; + } + } + Err(e) => { + println!("Connection {conn_id}(op={op_num}) FAILED: {e}"); + if let Some(tx_id) = *current_tx_id { + shared_shadow_db.rollback_transaction(tx_id); + *current_tx_id = None; + } + // Check if it's an acceptable error + if !e.to_string().contains("database is locked") { + panic!("Unexpected error during insert: {e}"); + } + } + } + } + Operation::Update { id, text } => { + let result = conn + .execute( + "UPDATE test_table SET text = ? WHERE id = ?", + vec![Value::Text(text.clone()), Value::Integer(id)], + ) + .await; + + // Check if real DB operation succeeded + match result { + Ok(_) => { + // Success - update shadow DB + if let Some(tx_id) = *current_tx_id { + // In transaction - update transaction's view + shared_shadow_db.update(tx_id, id, text.clone()).unwrap(); + } else { + // Auto-commit - update shadow DB committed state + shared_shadow_db.begin_transaction(next_tx_id, true); + shared_shadow_db + .update(next_tx_id, id, text.clone()) + .unwrap(); + shared_shadow_db.commit_transaction(next_tx_id); + next_tx_id += 1; + } + } + Err(e) => { + println!("Connection {conn_id}(op={op_num}) FAILED: {e}"); + if let Some(tx_id) = *current_tx_id { + shared_shadow_db.rollback_transaction(tx_id); + *current_tx_id = None; + } + // Check if it's an acceptable error + if !e.to_string().contains("database is locked") { + panic!("Unexpected error during update: {e}"); + } + } + } + } + Operation::Delete { id } => { + let result = conn + .execute( + "DELETE FROM test_table WHERE id = ?", + vec![Value::Integer(id)], + ) + .await; + + // Check if real DB operation succeeded + match result { + Ok(_) => { + // Success - update shadow DB + if let 
Some(tx_id) = *current_tx_id { + // In transaction - update transaction's view + shared_shadow_db.delete(tx_id, id).unwrap(); + } else { + // Auto-commit - update shadow DB committed state + shared_shadow_db.begin_transaction(next_tx_id, true); + shared_shadow_db.delete(next_tx_id, id).unwrap(); + shared_shadow_db.commit_transaction(next_tx_id); + next_tx_id += 1; + } + } + Err(e) => { + println!("Connection {conn_id}(op={op_num}) FAILED: {e}"); + if let Some(tx_id) = *current_tx_id { + shared_shadow_db.rollback_transaction(tx_id); + *current_tx_id = None; + } + // Check if it's an acceptable error + if !e.to_string().contains("database is locked") { + panic!("Unexpected error during delete: {e}"); + } + } + } + } + Operation::Select => { + let query_str = "SELECT id, text FROM test_table ORDER BY id"; + let mut rows = conn.query(query_str, ()).await.unwrap(); + + let mut real_rows = Vec::new(); + while let Some(row) = rows.next().await.unwrap() { + let id = row.get_value(0).unwrap(); + let text = row.get_value(1).unwrap(); + + if let (Value::Integer(id), Value::Text(text)) = (id, text) { + real_rows.push(DbRow { id, text }); + } + } + real_rows.sort_by_key(|r| r.id); + + let mut expected_rows = visible_rows.clone(); + expected_rows.sort_by_key(|r| r.id); + + if real_rows != expected_rows { + let diff = { + let mut diff = Vec::new(); + for row in expected_rows.iter() { + if !real_rows.contains(&row) { + diff.push(row); + } + } + for row in real_rows.iter() { + if !expected_rows.contains(&row) { + diff.push(row); + } + } + diff + }; + panic!( + "Row mismatch in iteration {iteration} Connection {conn_id}(op={op_num}). 
Query: {query_str}.\n\nExpected: {}\n\nGot: {}\n\nDiff: {}\n\nSeed: {seed}", + expected_rows.iter().map(|r| r.to_string()).collect::>().join(", "), + real_rows.iter().map(|r| r.to_string()).collect::>().join(", "), + diff.iter().map(|r| r.to_string()).collect::>().join(", "), + ); + } + } + } + } + } + } +} + +fn generate_operation( + rng: &mut ChaCha8Rng, + in_transaction: bool, + visible_rows: &[DbRow], +) -> Operation { + match rng.gen_range(0..100) { + // 10% chance to begin transaction + 0..=9 => { + if !in_transaction { + Operation::Begin + } else { + generate_data_operation(rng, visible_rows) + } + } + // 5% chance to commit + 10..=14 => { + if in_transaction { + Operation::Commit + } else { + generate_data_operation(rng, visible_rows) + } + } + // 5% chance to rollback + 15..=19 => { + if in_transaction { + Operation::Rollback + } else { + generate_data_operation(rng, visible_rows) + } + } + // 80% chance for data operations + _ => generate_data_operation(rng, visible_rows), + } +} + +fn generate_data_operation(rng: &mut ChaCha8Rng, visible_rows: &[DbRow]) -> Operation { + match rng.gen_range(0..4) { + 0 => { + // Insert - generate a new ID that doesn't exist + let id = if visible_rows.is_empty() { + rng.gen_range(1..1000) + } else { + let max_id = visible_rows.iter().map(|r| r.id).max().unwrap(); + rng.gen_range(max_id + 1..max_id + 100) + }; + let text = format!("text_{}", rng.gen_range(1..1000)); + Operation::Insert { id, text } + } + 1 => { + // Update - only if there are visible rows + if visible_rows.is_empty() { + // No rows to update, try insert instead + let id = rng.gen_range(1..1000); + let text = format!("text_{}", rng.gen_range(1..1000)); + Operation::Insert { id, text } + } else { + let id = visible_rows.choose(rng).unwrap().id; + let text = format!("updated_{}", rng.gen_range(1..1000)); + Operation::Update { id, text } + } + } + 2 => { + // Delete - only if there are visible rows + if visible_rows.is_empty() { + // No rows to delete, try 
insert instead + let id = rng.gen_range(1..1000); + let text = format!("text_{}", rng.gen_range(1..1000)); + Operation::Insert { id, text } + } else { + let id = visible_rows.choose(rng).unwrap().id; + Operation::Delete { id } + } + } + 3 => Operation::Select, + _ => unreachable!(), + } +} From 1562c1df10ba9eeba62ef3723b5ad4528521ae1d Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 13:46:33 +0300 Subject: [PATCH 43/62] begin_read_tx: better assertion failure message --- core/storage/wal.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 038b51c91..972ce1dd1 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -674,7 +674,9 @@ impl Wal for WalFile { fn begin_read_tx(&mut self) -> Result<(LimboResult, bool)> { turso_assert!( self.max_frame_read_lock_index.get().eq(&NO_LOCK_HELD), - "cannot start a new read tx without ending an existing one" + "cannot start a new read tx without ending an existing one, lock_value={}, expected={}", + self.max_frame_read_lock_index.get(), + NO_LOCK_HELD ); let (shared_max, nbackfills, last_checksum, checkpoint_seq) = { let shared = self.get_shared(); From 772b71963e8f2723f0bd08fb1bb6407d42c9c2be Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 13:46:52 +0300 Subject: [PATCH 44/62] finish_append_frames_commit: properly increase readmark on commit --- core/storage/wal.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 972ce1dd1..52ba07966 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1167,6 +1167,9 @@ impl Wal for WalFile { fn finish_append_frames_commit(&mut self) -> Result<()> { let shared = self.get_shared(); shared.max_frame.store(self.max_frame, Ordering::SeqCst); + shared.read_locks[self.max_frame_read_lock_index.get()] + .value + .store(self.max_frame as u32, Ordering::SeqCst); tracing::trace!(self.max_frame, ?self.last_checksum); 
shared.last_checksum = self.last_checksum; Ok(()) From 9a63425b43f29a9299e7c9b778c237f86e7a6e40 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 13:47:24 +0300 Subject: [PATCH 45/62] clippy --- core/storage/wal.rs | 14 ++++++++++---- core/vdbe/execute.rs | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 52ba07966..745d67c50 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1307,7 +1307,7 @@ impl WalFile { &shared.wal_header.lock(), )?)?; self.io - .wait_for_completion(shared.file.sync(Completion::new_sync(|_| {}).into())?)?; + .wait_for_completion(shared.file.sync(Completion::new_sync(|_| {}))?)?; Ok(()) } @@ -1959,7 +1959,9 @@ pub mod test { conn.execute("create table test(id integer primary key, value text)") .unwrap(); bulk_inserts(&conn, 20, 3); - conn.pager.borrow_mut().cacheflush().unwrap(); + while let IOResult::IO = conn.pager.borrow_mut().cacheflush().unwrap() { + conn.run_once().unwrap(); + } // Snapshot header & counters before the RESTART checkpoint. let wal_shared = db.maybe_shared_wal.read().as_ref().unwrap().clone(); @@ -2049,7 +2051,9 @@ pub mod test { .execute("create table test(id integer primary key, value text)") .unwrap(); bulk_inserts(&conn1.clone(), 15, 2); - conn1.pager.borrow_mut().cacheflush().unwrap(); + while let IOResult::IO = conn1.pager.borrow_mut().cacheflush().unwrap() { + conn1.run_once().unwrap(); + } // Force a read transaction that will freeze a lower read mark let readmark = { @@ -2061,7 +2065,9 @@ pub mod test { // generate more frames that the reader will not see. 
bulk_inserts(&conn1.clone(), 15, 2); - conn1.pager.borrow_mut().cacheflush().unwrap(); + while let IOResult::IO = conn1.pager.borrow_mut().cacheflush().unwrap() { + conn1.run_once().unwrap(); + } // Run passive checkpoint, expect partial let (res1, max_before) = { diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 9307bc524..d88351b46 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -1989,7 +1989,7 @@ pub fn op_transaction( // instead of ending the read tx, just update the state to pending. (TransactionState::PendingUpgrade, write) => { turso_assert!( - write, + *write, "pending upgrade should only be set for write transactions" ); ( From fd5e73f038a571ea1da7892168e5732e928d2ffa Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 13:47:52 +0300 Subject: [PATCH 46/62] op_transaction: read tx must be ended in all cases if begin_write_tx fails --- core/vdbe/execute.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index d88351b46..8f211c482 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -2030,12 +2030,7 @@ pub fn op_transaction( match pager.begin_write_tx()? 
{ IOResult::Done(r) => { if let LimboResult::Busy = r { - if matches!( - current_state, - TransactionState::Read | TransactionState::PendingUpgrade - ) { - pager.end_read_tx()?; - } + pager.end_read_tx()?; conn.transaction_state.replace(TransactionState::None); conn.auto_commit.replace(true); return Ok(InsnFunctionStepResult::Busy); From 338cab3f28e48aa0684dc621c20c6f4647018704 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 14:52:15 +0300 Subject: [PATCH 47/62] End read transaction when Schema::make_from_btree fails --- core/lib.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/lib.rs b/core/lib.rs index 847244b7d..737fecb98 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -312,9 +312,13 @@ impl Database { db.with_schema_mut(|schema| { schema.schema_version = get_schema_version(&conn)?; - if let Err(LimboError::ExtensionError(e)) = - schema.make_from_btree(None, pager, &syms) - { + let result = schema + .make_from_btree(None, pager.clone(), &syms) + .or_else(|e| { + pager.end_read_tx()?; + Err(e) + }); + if let Err(LimboError::ExtensionError(e)) = result { // this means that a vtab exists and we no longer have the module loaded. 
we print // a warning to the user to load the module eprintln!("Warning: {e}"); From 43d13210338f5f40bef93da79175d99c5ebf635e Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 14:53:21 +0300 Subject: [PATCH 48/62] ignore completion result of self.read_frame --- core/storage/wal.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 745d67c50..da7bd05e6 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1378,7 +1378,7 @@ impl WalFile { *frame ); self.ongoing_checkpoint.page.get().id = page as usize; - self.read_frame( + let _ = self.read_frame( *frame, self.ongoing_checkpoint.page.clone(), self.buffer_pool.clone(), From d4043595cd834f191099fdac3508edf1f6c429c7 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 15:02:16 +0300 Subject: [PATCH 49/62] test/fuzz/tx-isolation: clippy --- .../rust/tests/transaction_isolation_fuzz.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/bindings/rust/tests/transaction_isolation_fuzz.rs b/bindings/rust/tests/transaction_isolation_fuzz.rs index a0d37b397..445210ada 100644 --- a/bindings/rust/tests/transaction_isolation_fuzz.rs +++ b/bindings/rust/tests/transaction_isolation_fuzz.rs @@ -1,7 +1,7 @@ use rand::seq::SliceRandom; use rand::Rng; use rand_chacha::{rand_core::SeedableRng, ChaCha8Rng}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use turso::{Builder, Value}; // In-memory representation of the database state @@ -217,7 +217,7 @@ fn rng_from_time_or_env() -> (ChaCha8Rng, u64) { /// This test is ignored because it still fails sometimes; unsure if it fails due to a bug in the test or a bug in the implementation. 
async fn test_multiple_connections_fuzz() { let (mut rng, seed) = rng_from_time_or_env(); - println!("Multiple connections fuzz test seed: {}", seed); + println!("Multiple connections fuzz test seed: {seed}"); const NUM_ITERATIONS: usize = 50; const OPERATIONS_PER_CONNECTION: usize = 30; @@ -269,7 +269,7 @@ async fn test_multiple_connections_fuzz() { let operation = generate_operation(&mut rng, current_tx_id.is_some(), &visible_rows); - println!("Connection {conn_id}(op={op_num}): {}", operation); + println!("Connection {conn_id}(op={op_num}): {operation}"); match operation { Operation::Begin => { @@ -300,10 +300,8 @@ async fn test_multiple_connections_fuzz() { } // Check if it's an acceptable error - if !e.to_string().contains("Busy") - && !e.to_string().contains("database is locked") - { - panic!("Unexpected error during commit: {}", e); + if !e.to_string().contains("database is locked") { + panic!("Unexpected error during commit: {e}"); } } } @@ -468,12 +466,12 @@ async fn test_multiple_connections_fuzz() { let diff = { let mut diff = Vec::new(); for row in expected_rows.iter() { - if !real_rows.contains(&row) { + if !real_rows.contains(row) { diff.push(row); } } for row in real_rows.iter() { - if !expected_rows.contains(&row) { + if !expected_rows.contains(row) { diff.push(row); } } From af660326d894e3f59ca6d828c356427594342939 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 15:53:01 +0300 Subject: [PATCH 50/62] finish_append_frames_commit: revert bumping readmark incorrectly --- core/storage/wal.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index da7bd05e6..80c9789f0 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1167,9 +1167,6 @@ impl Wal for WalFile { fn finish_append_frames_commit(&mut self) -> Result<()> { let shared = self.get_shared(); shared.max_frame.store(self.max_frame, Ordering::SeqCst); - shared.read_locks[self.max_frame_read_lock_index.get()] - .value - 
.store(self.max_frame as u32, Ordering::SeqCst); tracing::trace!(self.max_frame, ?self.last_checksum); shared.last_checksum = self.last_checksum; Ok(()) From 975b7b5434e4f1438a01c23b8e0a99b78d722b9b Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 15:53:13 +0300 Subject: [PATCH 51/62] wal: fix test incorrect expectation --- core/storage/wal.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 80c9789f0..eb55e9dc2 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -2479,14 +2479,20 @@ pub mod test { let new_size = std::fs::metadata(&walpath).unwrap().len(); assert!(new_size >= 32, "WAL file too small"); let hdr = read_wal_header(&walpath); + let expected_magic = if cfg!(target_endian = "big") { + sqlite3_ondisk::WAL_MAGIC_BE + } else { + sqlite3_ondisk::WAL_MAGIC_LE + }; assert!( - hdr.magic == sqlite3_ondisk::WAL_MAGIC_BE, - "bad WAL magic: {:#X}", - hdr.magic + hdr.magic == expected_magic, + "bad WAL magic: {:#X}, expected: {:#X}", + hdr.magic, + sqlite3_ondisk::WAL_MAGIC_BE ); assert_eq!(hdr.file_format, 3007000); assert_eq!(hdr.page_size, 4096, "invalid page size"); - assert_eq!(hdr.checkpoint_seq, 0, "invalid checkpoint_seq"); + assert_eq!(hdr.checkpoint_seq, 1, "invalid checkpoint_seq"); std::fs::remove_dir_all(path).unwrap(); } From b1aa13375dff7818b8a8e26cab27032a579a53f3 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 16:39:38 +0300 Subject: [PATCH 52/62] call pager.end_tx() everywhere instead of pager.rollback() --- core/lib.rs | 14 ++++++-------- core/storage/pager.rs | 7 ++----- core/vdbe/execute.rs | 2 +- core/vdbe/mod.rs | 24 ++++++++++++++---------- 4 files changed, 23 insertions(+), 24 deletions(-) diff --git a/core/lib.rs b/core/lib.rs index 737fecb98..e95184982 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -1169,7 +1169,12 @@ impl Connection { let pager = self.pager.borrow(); // remove all non-commited changes in case if 
WAL session left some suffix without commit frame - pager.rollback(false, self).expect("rollback must succeed"); + while let IOResult::IO = pager + .end_tx(false, false, self, false) + .expect("rollback must succeed") + { + self.run_once()?; + } let wal = pager.wal.borrow_mut(); wal.end_write_tx(); @@ -1726,13 +1731,6 @@ impl Statement { if res.is_err() { let state = self.program.connection.transaction_state.get(); if let TransactionState::Write { schema_did_change } = state { - if let Err(e) = self - .pager - .rollback(schema_did_change, &self.program.connection) - { - // Let's panic for now as we don't want to leave state in a bad state. - panic!("rollback failed: {e:?}"); - } let end_tx_res = self.pager .end_tx(true, schema_did_change, &self.program.connection, true)?; diff --git a/core/storage/pager.rs b/core/storage/pager.rs index fbdabece3..24ffb9ff5 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -818,6 +818,7 @@ impl Pager { ) -> Result> { tracing::trace!("end_tx(rollback={})", rollback); if rollback { + self.rollback(schema_did_change, connection)?; self.wal.borrow().end_write_tx(); self.wal.borrow().end_read_tx(); return Ok(IOResult::Done(PagerCommitResult::Rollback)); @@ -1797,11 +1798,7 @@ impl Pager { } #[instrument(skip_all, level = Level::DEBUG)] - pub fn rollback( - &self, - schema_did_change: bool, - connection: &Connection, - ) -> Result<(), LimboError> { + fn rollback(&self, schema_did_change: bool, connection: &Connection) -> Result<(), LimboError> { tracing::debug!(schema_did_change); self.dirty_pages.borrow_mut().clear(); let mut cache = self.page_cache.write(); diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index 8f211c482..f25946835 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -2085,7 +2085,7 @@ pub fn op_auto_commit( if *auto_commit != conn.auto_commit.get() { if *rollback { // TODO(pere): add rollback I/O logic once we implement rollback journal - pager.rollback(schema_did_change, &conn)?; + 
return_if_io!(pager.end_tx(true, schema_did_change, &conn, false)); conn.auto_commit.replace(true); } else { conn.auto_commit.replace(*auto_commit); diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 1c4addfe5..6113c8e3c 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -398,7 +398,10 @@ impl Program { // Connection is closed for whatever reason, rollback the transaction. let state = self.connection.transaction_state.get(); if let TransactionState::Write { schema_did_change } = state { - pager.rollback(schema_did_change, &self.connection)? + match pager.end_tx(true, schema_did_change, &self.connection, false)? { + IOResult::IO => return Ok(StepResult::IO), + IOResult::Done(_) => {} + } } return Err(LimboError::InternalError("Connection closed".to_string())); } @@ -510,13 +513,10 @@ impl Program { connection.wal_checkpoint_disabled.get(), )?; match cacheflush_status { - IOResult::Done(status) => { + IOResult::Done(_) => { if self.change_cnt_on { self.connection.set_changes(self.n_change.get()); } - if matches!(status, pager::PagerCommitResult::Rollback) { - pager.rollback(schema_did_change, connection)?; - } connection.transaction_state.replace(TransactionState::None); *commit_state = CommitState::Ready; } @@ -761,11 +761,15 @@ pub fn handle_program_error( _ => { let state = connection.transaction_state.get(); if let TransactionState::Write { schema_did_change } = state { - if let Err(e) = pager.rollback(schema_did_change, connection) { - tracing::error!("rollback failed: {e}"); - } - if let Err(e) = pager.end_tx(false, schema_did_change, connection, false) { - tracing::error!("end_tx failed: {e}"); + loop { + match pager.end_tx(true, schema_did_change, connection, false) { + Ok(IOResult::IO) => connection.run_once()?, + Ok(IOResult::Done(_)) => break, + Err(e) => { + tracing::error!("end_tx failed: {e}"); + break; + } + } } } else if let Err(e) = pager.end_read_tx() { tracing::error!("end_read_tx failed: {e}"); From 
e7eda258028c1cacfdbfe03b45f596301911e0b5 Mon Sep 17 00:00:00 2001 From: PThorpe92 Date: Wed, 30 Jul 2025 09:44:29 -0400 Subject: [PATCH 53/62] Make sure to end read tx on error of wal insert begin API --- core/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/lib.rs b/core/lib.rs index e95184982..d4fb7e2a4 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -1151,7 +1151,9 @@ impl Connection { result::LimboResult::Busy => return Err(LimboError::Busy), result::LimboResult::Ok => {} } - match pager.io.block(|| pager.begin_write_tx())? { + match pager.io.block(|| pager.begin_write_tx()).inspect_err(|_| { + pager.end_read_tx().expect("read txn must be closed"); + })? { result::LimboResult::Busy => { pager.end_read_tx().expect("read txn must be closed"); return Err(LimboError::Busy); From 7b1f04dc5e42159f5b3372da1e51bc59eed7075d Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 17:00:38 +0300 Subject: [PATCH 54/62] pager: only ROLLBACK your own transaction, not if someone else is writing --- core/storage/pager.rs | 6 ++++-- core/vdbe/execute.rs | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 24ffb9ff5..a5801e594 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -9,7 +9,7 @@ use crate::storage::sqlite3_ondisk::{ use crate::storage::wal::{CheckpointResult, Wal}; use crate::types::{IOResult, WalInsertInfo}; use crate::util::IOExt as _; -use crate::{return_if_io, Completion}; +use crate::{return_if_io, Completion, TransactionState}; use crate::{turso_assert, Buffer, Connection, LimboError, Result}; use parking_lot::RwLock; use std::cell::{Cell, OnceCell, RefCell, UnsafeCell}; @@ -819,7 +819,9 @@ impl Pager { tracing::trace!("end_tx(rollback={})", rollback); if rollback { self.rollback(schema_did_change, connection)?; - self.wal.borrow().end_write_tx(); + if matches!(connection.transaction_state.get(), TransactionState::Write { .. 
}) { + self.wal.borrow().end_write_tx(); + } self.wal.borrow().end_read_tx(); return Ok(IOResult::Done(PagerCommitResult::Rollback)); } diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index f25946835..f827e26e2 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -2086,6 +2086,7 @@ pub fn op_auto_commit( if *rollback { // TODO(pere): add rollback I/O logic once we implement rollback journal return_if_io!(pager.end_tx(true, schema_did_change, &conn, false)); + conn.transaction_state.replace(TransactionState::None); conn.auto_commit.replace(true); } else { conn.auto_commit.replace(*auto_commit); From 66c4b44c555c97dad6edbb875c2ae7ff43da5adf Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 17:21:19 +0300 Subject: [PATCH 55/62] pager: call rollback() after ending txn so that read lock info is not lost when ending txn --- core/storage/pager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index a5801e594..2b386922e 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -818,11 +818,11 @@ impl Pager { ) -> Result> { tracing::trace!("end_tx(rollback={})", rollback); if rollback { - self.rollback(schema_did_change, connection)?; if matches!(connection.transaction_state.get(), TransactionState::Write { .. 
}) { self.wal.borrow().end_write_tx(); } self.wal.borrow().end_read_tx(); + self.rollback(schema_did_change, connection)?; return Ok(IOResult::Done(PagerCommitResult::Rollback)); } let commit_status = self.commit_dirty_pages(wal_checkpoint_disabled)?; From c00d1fcfc0051093355b87b2143a6d5761ae0037 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 17:21:29 +0300 Subject: [PATCH 56/62] fmt --- core/storage/pager.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 2b386922e..7846e492c 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -818,7 +818,10 @@ impl Pager { ) -> Result> { tracing::trace!("end_tx(rollback={})", rollback); if rollback { - if matches!(connection.transaction_state.get(), TransactionState::Write { .. }) { + if matches!( + connection.transaction_state.get(), + TransactionState::Write { .. } + ) { self.wal.borrow().end_write_tx(); } self.wal.borrow().end_read_tx(); From 2813a7a5debf5b022bbfbafa6020d6a341d2f050 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 17:25:30 +0300 Subject: [PATCH 57/62] clippy --- core/vdbe/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 6113c8e3c..5d9d1fb67 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -27,7 +27,7 @@ pub mod sorter; use crate::{ error::LimboError, function::{AggFunc, FuncCtx}, - storage::{pager, sqlite3_ondisk::SmallVec}, + storage::sqlite3_ondisk::SmallVec, translate::plan::TableReferences, types::{IOResult, RawSlice, TextRef}, vdbe::execute::{OpIdxInsertState, OpInsertState, OpNewRowidState, OpSeekState}, From 73b56f3917e0732adb9c59b6cbb5bdaaa9f7a998 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 17:25:49 +0300 Subject: [PATCH 58/62] test/fuzz/tx-isolation: unignore test --- bindings/rust/tests/transaction_isolation_fuzz.rs | 1 - 1 file changed, 1 deletion(-) diff --git 
a/bindings/rust/tests/transaction_isolation_fuzz.rs b/bindings/rust/tests/transaction_isolation_fuzz.rs index 445210ada..84190d7d4 100644 --- a/bindings/rust/tests/transaction_isolation_fuzz.rs +++ b/bindings/rust/tests/transaction_isolation_fuzz.rs @@ -212,7 +212,6 @@ fn rng_from_time_or_env() -> (ChaCha8Rng, u64) { } #[tokio::test] -#[ignore] /// Verify translation isolation semantics with multiple concurrent connections. /// This test is ignored because it still fails sometimes; unsure if it fails due to a bug in the test or a bug in the implementation. async fn test_multiple_connections_fuzz() { From 7bc11fe2f9d91da2811a362b9a79d1fd063b3069 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 18:16:23 +0300 Subject: [PATCH 59/62] wal_insert_end: revert unintentional changes --- core/lib.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/core/lib.rs b/core/lib.rs index d4fb7e2a4..798817506 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -1171,12 +1171,7 @@ impl Connection { let pager = self.pager.borrow(); // remove all non-commited changes in case if WAL session left some suffix without commit frame - while let IOResult::IO = pager - .end_tx(false, false, self, false) - .expect("rollback must succeed") - { - self.run_once()?; - } + pager.rollback(false, self)?; let wal = pager.wal.borrow_mut(); wal.end_write_tx(); From 438cbf287291607b54554d05d3caaa2a96956b15 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 18:17:07 +0300 Subject: [PATCH 60/62] test/wal api: add comment about purpose of rollback test --- tests/integration/functions/test_wal_api.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/functions/test_wal_api.rs b/tests/integration/functions/test_wal_api.rs index 9b7b710a3..614eb795b 100644 --- a/tests/integration/functions/test_wal_api.rs +++ b/tests/integration/functions/test_wal_api.rs @@ -175,6 +175,7 @@ fn test_wal_frame_transfer_no_schema_changes_rollback() { 
assert_eq!(conn1.wal_frame_count().unwrap(), 14); let mut frame = [0u8; 24 + 4096]; conn2.wal_insert_begin().unwrap(); + // Intentionally leave out the final commit frame, so the big randomblob is not committed and should not be visible to transactions. for frame_id in 1..=(conn1.wal_frame_count().unwrap() as u32 - 1) { conn1.wal_get_frame(frame_id, &mut frame).unwrap(); conn2.wal_insert_frame(frame_id, &frame).unwrap(); From 7240d7903c6bc43dd08067a2c18f591d47fe35ba Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 18:22:17 +0300 Subject: [PATCH 61/62] fmt --- core/storage/pager.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 7846e492c..a31094f19 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -1803,7 +1803,11 @@ impl Pager { } #[instrument(skip_all, level = Level::DEBUG)] - fn rollback(&self, schema_did_change: bool, connection: &Connection) -> Result<(), LimboError> { + pub fn rollback( + &self, + schema_did_change: bool, + connection: &Connection, + ) -> Result<(), LimboError> { tracing::debug!(schema_did_change); self.dirty_pages.borrow_mut().clear(); let mut cache = self.page_cache.write(); From ff1c1b6b8c42629513ed5c05ecdf00a637a31d28 Mon Sep 17 00:00:00 2001 From: Jussi Saurio Date: Wed, 30 Jul 2025 18:22:40 +0300 Subject: [PATCH 62/62] wal_insert_end: call pager.rollback() after tx ends so that lock index is preserved when ending tx --- core/lib.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/core/lib.rs b/core/lib.rs index 798817506..6176e9811 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -1170,12 +1170,13 @@ impl Connection { { let pager = self.pager.borrow(); + { + let wal = pager.wal.borrow_mut(); + wal.end_write_tx(); + wal.end_read_tx(); + } // remove all non-commited changes in case if WAL session left some suffix without commit frame pager.rollback(false, self)?; - - let wal = pager.wal.borrow_mut(); - 
wal.end_write_tx(); - wal.end_read_tx(); } // let's re-parse schema from scratch if schema cookie changed compared to the our in-memory view of schema