diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index 4f8241351..89ce26603 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -4,6 +4,7 @@ use limbo_core::IO; use pyo3::prelude::*; use pyo3::types::PyList; use pyo3::types::PyTuple; +use std::rc::Rc; use std::sync::{Arc, Mutex}; mod errors; @@ -198,7 +199,7 @@ fn stmt_is_dml(sql: &str) -> bool { #[pyclass] #[derive(Clone)] pub struct Connection { - conn: Arc>, + conn: Arc>>, io: Arc, } @@ -238,7 +239,7 @@ pub fn connect(path: &str) -> Result { })?); let db = limbo_core::Database::open_file(io.clone(), path) .map_err(|e| PyErr::new::(format!("Failed to open database: {:?}", e)))?; - let conn: limbo_core::Connection = db.connect(); + let conn: Rc = db.connect(); Ok(Connection { conn: Arc::new(Mutex::new(conn)), io, diff --git a/bindings/wasm/lib.rs b/bindings/wasm/lib.rs index 8c0fb8cd4..1bb912874 100644 --- a/bindings/wasm/lib.rs +++ b/bindings/wasm/lib.rs @@ -1,11 +1,12 @@ -use limbo_core::{Result, IO}; +use limbo_core::{OpenFlags, Page, Result, IO}; +use std::cell::RefCell; use std::rc::Rc; use std::sync::Arc; use wasm_bindgen::prelude::*; #[wasm_bindgen] pub struct Database { - _inner: limbo_core::Database, + _inner: Rc, } #[allow(clippy::arc_with_non_send_sync)] @@ -14,9 +15,9 @@ impl Database { #[wasm_bindgen(constructor)] pub fn new(path: &str) -> Database { let io = Arc::new(PlatformIO { vfs: VFS::new() }); - let file = io.open_file(path).unwrap(); + let file = io.open_file(path, limbo_core::OpenFlags::None).unwrap(); let page_io = Rc::new(DatabaseStorage::new(file)); - let wal = Rc::new(Wal {}); + let wal = Rc::new(RefCell::new(Wal {})); let inner = limbo_core::Database::open(io, page_io, wal).unwrap(); Database { _inner: inner } } @@ -51,7 +52,7 @@ impl limbo_core::File for File { fn pread(&self, pos: usize, c: Rc) -> Result<()> { let r = match &*c { limbo_core::Completion::Read(r) => r, - limbo_core::Completion::Write(_) => unreachable!(), + _ => unreachable!(), 
}; { let mut buf = r.buf_mut(); @@ -71,6 +72,14 @@ impl limbo_core::File for File { ) -> Result<()> { todo!() } + + fn sync(&self, _c: Rc) -> Result<()> { + todo!() + } + + fn size(&self) -> Result { + todo!() + } } pub struct PlatformIO { @@ -78,7 +87,7 @@ pub struct PlatformIO { } impl limbo_core::IO for PlatformIO { - fn open_file(&self, path: &str) -> Result> { + fn open_file(&self, path: &str, _flags: OpenFlags) -> Result> { let fd = self.vfs.open(path); Ok(Rc::new(File { vfs: VFS::new(), @@ -127,11 +136,13 @@ impl DatabaseStorage { } } +struct BufferPool {} + impl limbo_core::DatabaseStorage for DatabaseStorage { fn read_page(&self, page_idx: usize, c: Rc) -> Result<()> { let r = match &(*c) { limbo_core::Completion::Read(r) => r, - limbo_core::Completion::Write(_) => unreachable!(), + _ => unreachable!(), }; let size = r.buf().len(); assert!(page_idx > 0); @@ -151,6 +162,10 @@ impl limbo_core::DatabaseStorage for DatabaseStorage { ) -> Result<()> { todo!() } + + fn sync(&self, _c: Rc) -> Result<()> { + todo!() + } } pub struct Wal {} @@ -168,13 +183,48 @@ impl limbo_core::Wal for Wal { Ok(None) } + fn begin_write_tx(&self) -> Result<()> { + todo!() + } + + fn end_write_tx(&self) -> Result<()> { + todo!() + } + fn read_frame( &self, _frame_id: u64, _page: Rc>, + _buffer_pool: Rc, ) -> Result<()> { todo!() } + + fn should_checkpoint(&self) -> bool { + todo!() + } + + fn append_frame( + &mut self, + _page: Rc>, + _db_size: u32, + _pager: &limbo_core::Pager, + _write_counter: Rc>, + ) -> Result<()> { + todo!() + } + + fn checkpoint( + &mut self, + _pager: &limbo_core::Pager, + _write_counter: Rc>, + ) -> Result { + todo!() + } + + fn sync(&mut self) -> Result { + todo!() + } } #[wasm_bindgen(module = "/vfs.js")] diff --git a/cli/main.rs b/cli/main.rs index 6f887902f..fcb339530 100644 --- a/cli/main.rs +++ b/cli/main.rs @@ -6,6 +6,7 @@ use limbo_core::{Database, RowResult, Value}; use opcodes_dictionary::OPCODE_DESCRIPTIONS; use rustyline::{error::ReadlineError, 
DefaultEditor}; use std::path::PathBuf; +use std::rc::Rc; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -147,7 +148,7 @@ Note: fn handle_dot_command( io: Arc, - conn: &limbo_core::Connection, + conn: &Rc, line: &str, ) -> anyhow::Result<()> { let args: Vec<&str> = line.split_whitespace().collect(); @@ -196,7 +197,7 @@ fn handle_dot_command( fn display_schema( io: Arc, - conn: &limbo_core::Connection, + conn: &Rc, table: Option<&str>, ) -> anyhow::Result<()> { let sql = match table { @@ -251,7 +252,7 @@ fn display_schema( fn query( io: Arc, - conn: &limbo_core::Connection, + conn: &Rc, sql: &str, output_mode: &OutputMode, interrupt_count: &Arc, @@ -264,8 +265,8 @@ fn query( return Ok(()); } - match rows.next_row()? { - RowResult::Row(row) => { + match rows.next_row() { + Ok(RowResult::Row(row)) => { for (i, value) in row.values.iter().enumerate() { if i > 0 { print!("|"); @@ -282,10 +283,14 @@ fn query( } println!(); } - RowResult::IO => { + Ok(RowResult::IO) => { io.run_once()?; } - RowResult::Done => { + Ok(RowResult::Done) => { + break; + } + Err(err) => { + eprintln!("{}", err); break; } } @@ -297,8 +302,8 @@ fn query( } let mut table_rows: Vec> = vec![]; loop { - match rows.next_row()? 
{ - RowResult::Row(row) => { + match rows.next_row() { + Ok(RowResult::Row(row)) => { table_rows.push( row.values .iter() @@ -314,10 +319,14 @@ fn query( .collect(), ); } - RowResult::IO => { + Ok(RowResult::IO) => { io.run_once()?; } - RowResult::Done => break, + Ok(RowResult::Done) => break, + Err(err) => { + eprintln!("{}", err); + break; + } } } let table = table_rows.table(); diff --git a/core/io/common.rs b/core/io/common.rs index a29c6b7ab..6627e2076 100644 --- a/core/io/common.rs +++ b/core/io/common.rs @@ -13,7 +13,7 @@ pub mod tests { // Parent process opens the file let io1 = create_io().expect("Failed to create IO"); let _file1 = io1 - .open_file(&path) + .open_file(&path, crate::io::OpenFlags::None) .expect("Failed to open file in parent process"); let current_exe = std::env::current_exe().expect("Failed to get current executable path"); @@ -38,7 +38,7 @@ pub mod tests { if std::env::var("RUST_TEST_CHILD_PROCESS").is_ok() { let path = std::env::var("RUST_TEST_FILE_PATH")?; let io = create_io()?; - match io.open_file(&path) { + match io.open_file(&path, crate::io::OpenFlags::None) { Ok(_) => std::process::exit(0), Err(_) => std::process::exit(1), } diff --git a/core/io/darwin.rs b/core/io/darwin.rs index 1016e9954..3775d1b99 100644 --- a/core/io/darwin.rs +++ b/core/io/darwin.rs @@ -2,7 +2,7 @@ use crate::error::LimboError; use crate::io::common; use crate::Result; -use super::{Completion, File, IO}; +use super::{Completion, File, OpenFlags, IO}; use libc::{c_short, fcntl, flock, F_SETLK}; use log::trace; use polling::{Event, Events, Poller}; @@ -12,6 +12,7 @@ use rustix::io::Errno; use std::cell::RefCell; use std::collections::HashMap; use std::io::{Read, Seek, Write}; +use std::os::unix::fs::MetadataExt; use std::rc::Rc; pub struct DarwinIO { @@ -31,12 +32,13 @@ impl DarwinIO { } impl IO for DarwinIO { - fn open_file(&self, path: &str) -> Result> { + fn open_file(&self, path: &str, flags: OpenFlags) -> Result> { trace!("open_file(path = {})", path); 
let file = std::fs::File::options() .read(true) .custom_flags(libc::O_NONBLOCK) .write(true) + .create(matches!(flags, OpenFlags::Create)) .open(path)?; let darwin_file = Rc::new(DarwinFile { @@ -70,7 +72,7 @@ impl IO for DarwinIO { let c: &Completion = c; let r = match c { Completion::Read(r) => r, - Completion::Write(_) => unreachable!(), + _ => unreachable!(), }; let mut buf = r.buf_mut(); file.seek(std::io::SeekFrom::Start(pos as u64))?; @@ -153,7 +155,9 @@ impl File for DarwinFile { if lock_result == -1 { let err = std::io::Error::last_os_error(); if err.kind() == std::io::ErrorKind::WouldBlock { - return Err(LimboError::LockingError("Failed locking file. File is locked by another process".to_string())); + return Err(LimboError::LockingError( + "Failed locking file. File is locked by another process".to_string(), + )); } else { return Err(LimboError::LockingError(format!( "Failed locking file, {}", @@ -184,12 +188,12 @@ impl File for DarwinFile { Ok(()) } - fn pread(&self, pos: usize, c: Rc) -> Result<()> { - let file = self.file.borrow(); + fn pread(&self, pos: usize, c: Rc) -> Result<()> { + let file = self.file.borrow(); let result = { let r = match &(*c) { Completion::Read(r) => r, - Completion::Write(_) => unreachable!(), + _ => unreachable!(), }; let mut buf = r.buf_mut(); rustix::io::pread(file.as_fd(), buf.as_mut_slice(), pos as u64) @@ -256,6 +260,24 @@ impl File for DarwinFile { Err(e) => Err(e.into()), } } + + fn sync(&self, c: Rc) -> Result<()> { + let file = self.file.borrow(); + let result = rustix::fs::fsync(file.as_fd()); + match result { + std::result::Result::Ok(()) => { + trace!("fsync"); + c.complete(0); + Ok(()) + } + Err(e) => Err(e.into()), + } + } + + fn size(&self) -> Result { + let file = self.file.borrow(); + Ok(file.metadata().unwrap().len()) + } } impl Drop for DarwinFile { diff --git a/core/io/generic.rs b/core/io/generic.rs index f3655ad31..c703ff78e 100644 --- a/core/io/generic.rs +++ b/core/io/generic.rs @@ -1,4 +1,4 @@ -use 
crate::{Completion, File, Result, IO}; +use crate::{Completion, File, LimboError, OpenFlags, Result, IO}; use log::trace; use std::cell::RefCell; use std::io::{Read, Seek, Write}; @@ -13,7 +13,7 @@ impl GenericIO { } impl IO for GenericIO { - fn open_file(&self, path: &str) -> Result> { + fn open_file(&self, path: &str, flags: OpenFlags) -> Result> { trace!("open_file(path = {})", path); let file = std::fs::File::open(path)?; Ok(Rc::new(GenericFile { @@ -57,7 +57,7 @@ impl File for GenericFile { { let r = match &(*c) { Completion::Read(r) => r, - Completion::Write(_) => unreachable!(), + _ => unreachable!(), }; let mut buf = r.buf_mut(); let buf = buf.as_mut_slice(); @@ -78,8 +78,21 @@ impl File for GenericFile { let buf = buffer.borrow(); let buf = buf.as_slice(); file.write_all(buf)?; + c.complete(buf.len() as i32); Ok(()) } + + fn sync(&self, c: Rc) -> Result<()> { + let mut file = self.file.borrow_mut(); + file.sync_all().map_err(|err| LimboError::IOError(err))?; + c.complete(0); + Ok(()) + } + + fn size(&self) -> Result { + let file = self.file.borrow(); + Ok(file.metadata().unwrap().len()) + } } impl Drop for GenericFile { diff --git a/core/io/linux.rs b/core/io/linux.rs index 3542e2870..76d20809d 100644 --- a/core/io/linux.rs +++ b/core/io/linux.rs @@ -1,10 +1,11 @@ -use super::{common, Completion, File, IO}; +use super::{common, Completion, File, OpenFlags, IO}; use crate::{LimboError, Result}; use libc::{c_short, fcntl, flock, iovec, F_SETLK}; use log::{debug, trace}; use nix::fcntl::{FcntlArg, OFlag}; use std::cell::RefCell; use std::fmt; +use std::os::unix::fs::MetadataExt; use std::os::unix::io::AsRawFd; use std::rc::Rc; use thiserror::Error; @@ -48,7 +49,10 @@ impl LinuxIO { pub fn new() -> Result { let ring = io_uring::IoUring::new(MAX_IOVECS as u32)?; let inner = InnerLinuxIO { - ring: WrappedIOUring{ring, pending_ops: 0}, + ring: WrappedIOUring { + ring, + pending_ops: 0, + }, iovecs: [iovec { iov_base: std::ptr::null_mut(), iov_len: 0, @@ -74,7 
+78,8 @@ impl InnerLinuxIO { impl WrappedIOUring { fn submit_entry(&mut self, entry: &io_uring::squeue::Entry) { unsafe { - self.ring.submission() + self.ring + .submission() .push(entry) .expect("submission queue is full"); } @@ -102,9 +107,13 @@ impl WrappedIOUring { } impl IO for LinuxIO { - fn open_file(&self, path: &str) -> Result> { + fn open_file(&self, path: &str, flags: OpenFlags) -> Result> { trace!("open_file(path = {})", path); - let file = std::fs::File::options().read(true).write(true).open(path)?; + let file = std::fs::File::options() + .read(true) + .write(true) + .create(matches!(flags, OpenFlags::Create)) + .open(path)?; // Let's attempt to enable direct I/O. Not all filesystems support it // so ignore any errors. let fd = file.as_raw_fd(); @@ -128,7 +137,7 @@ impl IO for LinuxIO { let ring = &mut inner.ring; if ring.empty() { - return Ok(()) + return Ok(()); } ring.wait_for_completion()?; @@ -216,7 +225,7 @@ impl File for LinuxFile { fn pread(&self, pos: usize, c: Rc) -> Result<()> { let r = match &(*c) { Completion::Read(r) => r, - Completion::Write(_) => unreachable!(), + _ => unreachable!(), }; trace!("pread(pos = {}, length = {})", pos, r.buf().len()); let fd = io_uring::types::Fd(self.file.as_raw_fd()); @@ -256,6 +265,21 @@ impl File for LinuxFile { io.ring.submit_entry(&write); Ok(()) } + + fn sync(&self, c: Rc) -> Result<()> { + let fd = io_uring::types::Fd(self.file.as_raw_fd()); + let ptr = Rc::into_raw(c.clone()); + let sync = io_uring::opcode::Fsync::new(fd) + .build() + .user_data(ptr as u64); + let mut io = self.io.borrow_mut(); + io.ring.submit_entry(&sync); + Ok(()) + } + + fn size(&self) -> Result { + Ok(self.file.metadata().unwrap().len()) + } } impl Drop for LinuxFile { diff --git a/core/io/mod.rs b/core/io/mod.rs index ea4006f92..5d87be948 100644 --- a/core/io/mod.rs +++ b/core/io/mod.rs @@ -14,10 +14,17 @@ pub trait File { fn unlock_file(&self) -> Result<()>; fn pread(&self, pos: usize, c: Rc) -> Result<()>; fn pwrite(&self, 
pos: usize, buffer: Rc>, c: Rc) -> Result<()>; + fn sync(&self, c: Rc) -> Result<()>; + fn size(&self) -> Result; +} + +pub enum OpenFlags { + None, + Create, } pub trait IO { - fn open_file(&self, path: &str) -> Result>; + fn open_file(&self, path: &str, flags: OpenFlags) -> Result>; fn run_once(&self) -> Result<()>; @@ -28,10 +35,12 @@ pub trait IO { pub type Complete = dyn Fn(Rc>); pub type WriteComplete = dyn Fn(i32); +pub type SyncComplete = dyn Fn(i32); pub enum Completion { Read(ReadCompletion), Write(WriteCompletion), + Sync(SyncCompletion), } pub struct ReadCompletion { @@ -43,7 +52,8 @@ impl Completion { pub fn complete(&self, result: i32) { match self { Completion::Read(r) => r.complete(), - Completion::Write(w) => w.complete(result), // fix + Completion::Write(w) => w.complete(result), + Completion::Sync(s) => s.complete(result), // fix } } } @@ -52,6 +62,10 @@ pub struct WriteCompletion { pub complete: Box, } +pub struct SyncCompletion { + pub complete: Box, +} + impl ReadCompletion { pub fn new(buf: Rc>, complete: Box) -> Self { Self { buf, complete } @@ -74,11 +88,22 @@ impl WriteCompletion { pub fn new(complete: Box) -> Self { Self { complete } } + pub fn complete(&self, bytes_written: i32) { (self.complete)(bytes_written); } } +impl SyncCompletion { + pub fn new(complete: Box) -> Self { + Self { complete } + } + + pub fn complete(&self, res: i32) { + (self.complete)(res); + } +} + pub type BufferData = Pin>; pub type BufferDropFn = Rc; diff --git a/core/io/windows.rs b/core/io/windows.rs index 2916f6719..1ef18665d 100644 --- a/core/io/windows.rs +++ b/core/io/windows.rs @@ -1,4 +1,4 @@ -use crate::{Completion, File, Result, WriteCompletion, IO}; +use crate::{Completion, File, LimboError, OpenFlags, Result, WriteCompletion, IO}; use log::trace; use std::cell::RefCell; use std::io::{Read, Seek, Write}; @@ -13,9 +13,13 @@ impl WindowsIO { } impl IO for WindowsIO { - fn open_file(&self, path: &str) -> Result> { + fn open_file(&self, path: &str, flags: 
OpenFlags) -> Result> { trace!("open_file(path = {})", path); - let file = std::fs::File::open(path)?; + let file = std::fs::File::options() + .read(true) + .write(true) + .create(matches!(flags, OpenFlags::Create)) + .open(path)?; Ok(Rc::new(WindowsFile { file: RefCell::new(file), })) @@ -55,7 +59,7 @@ impl File for WindowsFile { { let r = match &(*c) { Completion::Read(r) => r, - Completion::Write(_) => unreachable!(), + _ => unreachable!(), }; let mut buf = r.buf_mut(); let buf = buf.as_mut_slice(); @@ -76,6 +80,19 @@ impl File for WindowsFile { let buf = buffer.borrow(); let buf = buf.as_slice(); file.write_all(buf)?; + c.complete(buffer.borrow().len() as i32); Ok(()) } + + fn sync(&self, c: Rc) -> Result<()> { + let mut file = self.file.borrow_mut(); + file.sync_all().map_err(|err| LimboError::IOError(err))?; + c.complete(0); + Ok(()) + } + + fn size(&self) -> Result { + let file = self.file.borrow(); + Ok(file.metadata().unwrap().len()) + } } diff --git a/core/lib.rs b/core/lib.rs index 2b35f07a6..36f5739d0 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -19,11 +19,11 @@ use log::trace; use schema::Schema; use sqlite3_parser::ast; use sqlite3_parser::{ast::Cmd, lexer::sql::Parser}; +use std::rc::Weak; use std::sync::{Arc, OnceLock}; use std::{cell::RefCell, rc::Rc}; #[cfg(feature = "fs")] use storage::database::FileStorage; -use storage::pager::Pager; use storage::sqlite3_ondisk::DatabaseHeader; #[cfg(feature = "fs")] use storage::wal::WalFile; @@ -34,43 +34,61 @@ use translate::planner::prepare_select_plan; pub use error::LimboError; pub type Result = std::result::Result; +pub use io::OpenFlags; #[cfg(feature = "fs")] pub use io::PlatformIO; pub use io::{Buffer, Completion, File, WriteCompletion, IO}; +pub use storage::buffer_pool::BufferPool; pub use storage::database::DatabaseStorage; pub use storage::pager::Page; +pub use storage::pager::Pager; +pub use storage::wal::CheckpointStatus; pub use storage::wal::Wal; pub use types::Value; pub static 
DATABASE_VERSION: OnceLock = OnceLock::new(); +#[derive(Clone)] +enum TransactionState { + Write, + Read, + None, +} + pub struct Database { pager: Rc, schema: Rc, header: Rc>, + transaction_state: RefCell, } impl Database { #[cfg(feature = "fs")] - pub fn open_file(io: Arc, path: &str) -> Result { - let file = io.open_file(path)?; + pub fn open_file(io: Arc, path: &str) -> Result> { + let file = io.open_file(path, io::OpenFlags::None)?; let page_io = Rc::new(FileStorage::new(file)); let wal_path = format!("{}-wal", path); - let wal = Rc::new(WalFile::new(io.clone(), wal_path)); + let db_header = Pager::begin_open(page_io.clone())?; + io.run_once()?; + let wal = Rc::new(RefCell::new(WalFile::new( + io.clone(), + wal_path, + db_header.borrow().page_size as usize, + ))); Self::open(io, page_io, wal) } pub fn open( io: Arc, page_io: Rc, - wal: Rc, - ) -> Result { + wal: Rc>, + ) -> Result> { let db_header = Pager::begin_open(page_io.clone())?; + io.run_once()?; DATABASE_VERSION.get_or_init(|| { let version = db_header.borrow().version_number; version.to_string() }); - io.run_once()?; let pager = Rc::new(Pager::finish_open( db_header.clone(), page_io, @@ -78,11 +96,12 @@ impl Database { io.clone(), )?); let bootstrap_schema = Rc::new(Schema::new()); - let conn = Connection { + let conn = Rc::new(Connection { pager: pager.clone(), schema: bootstrap_schema.clone(), header: db_header.clone(), - }; + db: Weak::new(), + }); let mut schema = Schema::new(); let rows = conn.query("SELECT * FROM sqlite_schema")?; if let Some(mut rows) = rows { @@ -126,19 +145,21 @@ impl Database { } let schema = Rc::new(schema); let header = db_header; - Ok(Database { + Ok(Rc::new(Database { pager, schema, header, - }) + transaction_state: RefCell::new(TransactionState::None), + })) } - pub fn connect(&self) -> Connection { - Connection { + pub fn connect(self: &Rc) -> Rc { + Rc::new(Connection { pager: self.pager.clone(), schema: self.schema.clone(), header: self.header.clone(), - } + db: 
Rc::downgrade(self), + }) } } @@ -146,10 +167,11 @@ pub struct Connection { pager: Rc, schema: Rc, header: Rc>, + db: Weak, // backpointer to the database holding this connection } impl Connection { - pub fn prepare(&self, sql: impl Into) -> Result { + pub fn prepare(self: &Rc, sql: impl Into) -> Result { let sql = sql.into(); trace!("Preparing: {}", sql); let mut parser = Parser::new(sql.as_bytes()); @@ -162,6 +184,7 @@ impl Connection { stmt, self.header.clone(), self.pager.clone(), + Rc::downgrade(self), )?); Ok(Statement::new(program, self.pager.clone())) } @@ -173,7 +196,7 @@ impl Connection { } } - pub fn query(&self, sql: impl Into) -> Result> { + pub fn query(self: &Rc, sql: impl Into) -> Result> { let sql = sql.into(); trace!("Querying: {}", sql); let mut parser = Parser::new(sql.as_bytes()); @@ -186,6 +209,7 @@ impl Connection { stmt, self.header.clone(), self.pager.clone(), + Rc::downgrade(&self), )?); let stmt = Statement::new(program, self.pager.clone()); Ok(Some(Rows { stmt })) @@ -196,6 +220,7 @@ impl Connection { stmt, self.header.clone(), self.pager.clone(), + Rc::downgrade(self), )?; program.explain(); Ok(None) @@ -217,7 +242,7 @@ impl Connection { } } - pub fn execute(&self, sql: impl Into) -> Result<()> { + pub fn execute(self: &Rc, sql: impl Into) -> Result<()> { let sql = sql.into(); let mut parser = Parser::new(sql.as_bytes()); let cmd = parser.next()?; @@ -229,6 +254,7 @@ impl Connection { stmt, self.header.clone(), self.pager.clone(), + Rc::downgrade(self), )?; program.explain(); } @@ -239,6 +265,7 @@ impl Connection { stmt, self.header.clone(), self.pager.clone(), + Rc::downgrade(self), )?; let mut state = vdbe::ProgramState::new(program.max_registers); program.step(&mut state, self.pager.clone())?; @@ -248,8 +275,12 @@ impl Connection { Ok(()) } - pub fn cacheflush(&self) -> Result<()> { - self.pager.cacheflush()?; + pub fn cacheflush(&self) -> Result { + self.pager.cacheflush() + } + + pub fn clear_page_cache(&self) -> Result<()> { + 
self.pager.clear_page_cache(); Ok(()) } } diff --git a/core/storage/database.rs b/core/storage/database.rs index 16da1ed3f..75d835734 100644 --- a/core/storage/database.rs +++ b/core/storage/database.rs @@ -14,6 +14,7 @@ pub trait DatabaseStorage { buffer: Rc>, c: Rc, ) -> Result<()>; + fn sync(&self, c: Rc) -> Result<()>; } #[cfg(feature = "fs")] @@ -26,7 +27,7 @@ impl DatabaseStorage for FileStorage { fn read_page(&self, page_idx: usize, c: Rc) -> Result<()> { let r = match &(*c) { Completion::Read(r) => r, - Completion::Write(_) => unreachable!(), + _ => unreachable!(), }; let size = r.buf().len(); assert!(page_idx > 0); @@ -52,6 +53,10 @@ impl DatabaseStorage for FileStorage { self.file.pwrite(pos, buffer, c)?; Ok(()) } + + fn sync(&self, c: Rc) -> Result<()> { + self.file.sync(c) + } } #[cfg(feature = "fs")] diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 2a463514f..bbc902ea1 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -13,6 +13,8 @@ use std::rc::Rc; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, RwLock}; +use super::wal::CheckpointStatus; + pub struct Page { pub flags: AtomicUsize, pub contents: RwLock>, @@ -230,6 +232,13 @@ impl DumbLruPageCache { } self.detach(tail); } + + fn clear(&mut self) { + let to_remove: Vec = self.map.borrow().iter().map(|v| *v.0).collect(); + for key in to_remove { + self.delete(key); + } + } } #[allow(dead_code)] @@ -256,6 +265,23 @@ impl PageCache { } } +#[derive(Clone)] +enum FlushState { + Start, + SyncWal, + Checkpoint, + CheckpointDone, + SyncDbFile, + WaitSyncDbFile, +} + +/// This will keep track of the state of current cache flush in order to not repeat work +struct FlushInfo { + state: FlushState, + /// Number of writes taking place. When in_flight gets to 0 we can schedule a fsync. 
+ in_flight_writes: Rc>, +} + /// The pager interface implements the persistence layer by providing access /// to pages of the database file, including caching, concurrency control, and /// transaction management. @@ -263,7 +289,7 @@ pub struct Pager { /// Source of the database pages. pub page_io: Rc, /// The write-ahead log (WAL) for the database. - wal: Rc, + wal: Rc>, /// A page cache for the database. page_cache: RefCell, /// Buffer pool for temporary data storage. @@ -272,6 +298,9 @@ pub struct Pager { pub io: Arc, dirty_pages: Rc>>, db_header: Rc>, + + flush_info: RefCell, + syncing: Rc>, } impl Pager { @@ -284,7 +313,7 @@ impl Pager { pub fn finish_open( db_header_ref: Rc>, page_io: Rc, - wal: Rc, + wal: Rc>, io: Arc, ) -> Result { let db_header = RefCell::borrow(&db_header_ref); @@ -299,19 +328,30 @@ impl Pager { io, dirty_pages: Rc::new(RefCell::new(HashSet::new())), db_header: db_header_ref.clone(), + flush_info: RefCell::new(FlushInfo { + state: FlushState::Start, + in_flight_writes: Rc::new(RefCell::new(0)), + }), + syncing: Rc::new(RefCell::new(false)), }) } pub fn begin_read_tx(&self) -> Result<()> { - self.wal.begin_read_tx()?; + self.wal.borrow().begin_read_tx()?; Ok(()) } - pub fn end_read_tx(&self) -> Result<()> { - self.wal.end_read_tx()?; + pub fn begin_write_tx(&self) -> Result<()> { + self.wal.borrow().begin_read_tx()?; Ok(()) } + pub fn end_tx(&self) -> Result { + self.cacheflush()?; + self.wal.borrow().end_read_tx()?; + Ok(CheckpointStatus::Done) + } + /// Reads a page from the database. pub fn read_page(&self, page_idx: usize) -> crate::Result>> { trace!("read_page(page_idx = {})", page_idx); @@ -321,8 +361,10 @@ impl Pager { } let page = Rc::new(RefCell::new(Page::new(page_idx))); RefCell::borrow(&page).set_locked(); - if let Some(frame_id) = self.wal.find_frame(page_idx as u64)? { - self.wal.read_frame(frame_id, page.clone())?; + if let Some(frame_id) = self.wal.borrow().find_frame(page_idx as u64)? 
{ + self.wal + .borrow() + .read_frame(frame_id, page.clone(), self.buffer_pool.clone())?; { let page = page.borrow_mut(); page.set_uptodate(); @@ -356,19 +398,91 @@ impl Pager { dirty_pages.insert(page_id); } - pub fn cacheflush(&self) -> Result<()> { - let mut dirty_pages = RefCell::borrow_mut(&self.dirty_pages); - if dirty_pages.len() == 0 { - return Ok(()); + pub fn cacheflush(&self) -> Result { + loop { + let state = self.flush_info.borrow().state.clone(); + match state { + FlushState::Start => { + let db_size = self.db_header.borrow().database_size; + for page_id in self.dirty_pages.borrow().iter() { + let mut cache = self.page_cache.borrow_mut(); + let page = cache.get(page_id).expect("we somehow added a page to dirty list but we didn't mark it as dirty, causing cache to drop it."); + self.wal.borrow_mut().append_frame( + page.clone(), + db_size, + self, + self.flush_info.borrow().in_flight_writes.clone(), + )?; + } + self.dirty_pages.borrow_mut().clear(); + self.flush_info.borrow_mut().state = FlushState::SyncWal; + return Ok(CheckpointStatus::IO); + } + FlushState::Checkpoint => { + let in_flight = self.flush_info.borrow().in_flight_writes.clone(); + match self.wal.borrow_mut().checkpoint(self, in_flight)? 
{ + CheckpointStatus::IO => return Ok(CheckpointStatus::IO), + CheckpointStatus::Done => { + self.flush_info.borrow_mut().state = FlushState::CheckpointDone; + } + }; + } + FlushState::CheckpointDone => { + let in_flight = *self.flush_info.borrow().in_flight_writes.borrow(); + if in_flight == 0 { + self.flush_info.borrow_mut().state = FlushState::SyncDbFile; + } else { + return Ok(CheckpointStatus::IO); + } + } + FlushState::SyncWal => { + match self.wal.borrow_mut().sync() { + Ok(CheckpointStatus::IO) => return Ok(CheckpointStatus::IO), + Ok(CheckpointStatus::Done) => {} + Err(e) => return Err(e), + } + + let should_checkpoint = self.wal.borrow().should_checkpoint(); + if should_checkpoint { + self.flush_info.borrow_mut().state = FlushState::Checkpoint; + } else { + self.flush_info.borrow_mut().state = FlushState::Start; + break; + } + } + FlushState::SyncDbFile => { + sqlite3_ondisk::begin_sync(self.page_io.clone(), self.syncing.clone())?; + self.flush_info.borrow_mut().state = FlushState::WaitSyncDbFile; + } + FlushState::WaitSyncDbFile => { + if *self.syncing.borrow() { + return Ok(CheckpointStatus::IO); + } else { + self.flush_info.borrow_mut().state = FlushState::Start; + break; + } + } + } } - for page_id in dirty_pages.iter() { - let mut cache = self.page_cache.borrow_mut(); - let page = cache.get(page_id).expect("we somehow added a page to dirty list but we didn't mark it as dirty, causing cache to drop it."); - sqlite3_ondisk::begin_write_btree_page(self, &page)?; + Ok(CheckpointStatus::Done) + } + + // WARN: used for testing purposes + pub fn clear_page_cache(&self) { + loop { + match self + .wal + .borrow_mut() + .checkpoint(self, Rc::new(RefCell::new(0))) + { + Ok(CheckpointStatus::IO) => {} + Ok(CheckpointStatus::Done) => { + break; + } + Err(err) => panic!("error while clearing cache {}", err), + } } - dirty_pages.clear(); - self.io.run_once()?; - Ok(()) + self.page_cache.borrow_mut().clear(); } /* @@ -382,14 +496,23 @@ impl Pager { 
header.database_size += 1; { // update database size - let first_page_ref = self.read_page(1).unwrap(); - let first_page = RefCell::borrow_mut(&first_page_ref); - first_page.set_dirty(); - self.add_dirty(1); + // read sync for now + loop { + let first_page_ref = self.read_page(1)?; + let first_page = RefCell::borrow_mut(&first_page_ref); + if first_page.is_locked() { + drop(first_page); + self.io.run_once()?; + continue; + } + first_page.set_dirty(); + self.add_dirty(1); - let contents = first_page.contents.write().unwrap(); - let contents = contents.as_ref().unwrap(); - contents.write_database_header(&header); + let contents = first_page.contents.write().unwrap(); + let contents = contents.as_ref().unwrap(); + contents.write_database_header(&header); + break; + } } let page_ref = Rc::new(RefCell::new(Page::new(0))); diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index 9e0e695a3..c3f2630fa 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -42,7 +42,7 @@ //! 
https://www.sqlite.org/fileformat.html use crate::error::LimboError; -use crate::io::{Buffer, Completion, ReadCompletion, WriteCompletion}; +use crate::io::{Buffer, Completion, ReadCompletion, SyncCompletion, WriteCompletion}; use crate::storage::buffer_pool::BufferPool; use crate::storage::database::DatabaseStorage; use crate::storage::pager::{Page, Pager}; @@ -87,16 +87,19 @@ pub struct DatabaseHeader { pub version_number: u32, } +pub const WAL_HEADER_SIZE: usize = 32; +pub const WAL_FRAME_HEADER_SIZE: usize = 24; + #[derive(Debug, Default)] pub struct WalHeader { - magic: [u8; 4], - file_format: u32, - page_size: u32, - checkpoint_seq: u32, - salt_1: u32, - salt_2: u32, - checksum_1: u32, - checksum_2: u32, + pub magic: [u8; 4], + pub file_format: u32, + pub page_size: u32, + pub checkpoint_seq: u32, + pub salt_1: u32, + pub salt_2: u32, + pub checksum_1: u32, + pub checksum_2: u32, } #[allow(dead_code)] @@ -525,7 +528,11 @@ fn finish_read_page( Ok(()) } -pub fn begin_write_btree_page(pager: &Pager, page: &Rc>) -> Result<()> { +pub fn begin_write_btree_page( + pager: &Pager, + page: &Rc>, + write_counter: Rc>, +) -> Result<()> { let page_source = &pager.page_io; let page_finish = page.clone(); @@ -537,11 +544,13 @@ pub fn begin_write_btree_page(pager: &Pager, page: &Rc>) -> Result contents.buffer.clone() }; + *write_counter.borrow_mut() += 1; let write_complete = { let buf_copy = buffer.clone(); Box::new(move |bytes_written: i32| { let buf_copy = buf_copy.clone(); let buf_len = buf_copy.borrow().len(); + *write_counter.borrow_mut() -= 1; page_finish.borrow_mut().clear_dirty(); if bytes_written < buf_len as i32 { @@ -554,6 +563,18 @@ pub fn begin_write_btree_page(pager: &Pager, page: &Rc>) -> Result Ok(()) } +pub fn begin_sync(page_io: Rc, syncing: Rc>) -> Result<()> { + assert!(!*syncing.borrow()); + *syncing.borrow_mut() = true; + let completion = Completion::Sync(SyncCompletion { + complete: Box::new(move |_| { + *syncing.borrow_mut() = false; + }), + }); + 
page_io.sync(Rc::new(completion))?; + Ok(()) +} + #[allow(clippy::enum_variant_names)] #[derive(Debug, Clone)] pub enum BTreeCell { @@ -935,9 +956,9 @@ pub fn write_varint_to_vec(value: u64, payload: &mut Vec) { payload.extend_from_slice(&varint); } -pub fn begin_read_wal_header(io: Rc) -> Result>> { +pub fn begin_read_wal_header(io: &Rc) -> Result>> { let drop_fn = Rc::new(|_buf| {}); - let buf = Rc::new(RefCell::new(Buffer::allocate(32, drop_fn))); + let buf = Rc::new(RefCell::new(Buffer::allocate(WAL_HEADER_SIZE, drop_fn))); let result = Rc::new(RefCell::new(WalHeader::default())); let header = result.clone(); let complete = Box::new(move |buf: Rc>| { @@ -964,26 +985,122 @@ fn finish_read_wal_header(buf: Rc>, header: Rc, offset: usize, -) -> Result>> { - let drop_fn = Rc::new(|_buf| {}); - let buf = Rc::new(RefCell::new(Buffer::allocate(32, drop_fn))); - let result = Rc::new(RefCell::new(WalFrameHeader::default())); - let frame = result.clone(); + buffer_pool: Rc, + page: Rc>, +) -> Result<()> { + let buf = buffer_pool.get(); + let drop_fn = Rc::new(move |buf| { + let buffer_pool = buffer_pool.clone(); + buffer_pool.put(buf); + }); + let buf = Rc::new(RefCell::new(Buffer::new(buf, drop_fn))); + let frame = page.clone(); let complete = Box::new(move |buf: Rc>| { let frame = frame.clone(); - finish_read_wal_frame_header(buf, frame).unwrap(); + finish_read_page(2, buf, frame).unwrap(); }); let c = Rc::new(Completion::Read(ReadCompletion::new(buf, complete))); io.pread(offset, c)?; - Ok(result) + Ok(()) } -#[allow(dead_code)] -fn finish_read_wal_frame_header( +pub fn begin_write_wal_frame( + io: &Rc, + offset: usize, + page: &Rc>, + db_size: u32, + write_counter: Rc>, +) -> Result<()> { + let page_finish = page.clone(); + let page_id = page.borrow().id; + + let header = WalFrameHeader { + page_number: page_id as u32, + db_size, + salt_1: 0, + salt_2: 0, + checksum_1: 0, + checksum_2: 0, + }; + let buffer = { + let page = page.borrow(); + let contents = 
page.contents.read().unwrap();
+        let drop_fn = Rc::new(|_buf| {});
+        let contents = contents.as_ref().unwrap();
+
+        let mut buffer = Buffer::allocate(
+            contents.buffer.borrow().len() + WAL_FRAME_HEADER_SIZE,
+            drop_fn,
+        );
+        let buf = buffer.as_mut_slice();
+
+        buf[0..4].copy_from_slice(&header.page_number.to_be_bytes());
+        buf[4..8].copy_from_slice(&header.db_size.to_be_bytes());
+        buf[8..12].copy_from_slice(&header.salt_1.to_be_bytes());
+        buf[12..16].copy_from_slice(&header.salt_2.to_be_bytes());
+        buf[16..20].copy_from_slice(&header.checksum_1.to_be_bytes());
+        buf[20..24].copy_from_slice(&header.checksum_2.to_be_bytes());
+        buf[WAL_FRAME_HEADER_SIZE..].copy_from_slice(&contents.as_ptr());
+
+        Rc::new(RefCell::new(buffer))
+    };
+
+    *write_counter.borrow_mut() += 1;
+    let write_complete = {
+        let buf_copy = buffer.clone();
+        Box::new(move |bytes_written: i32| {
+            let buf_copy = buf_copy.clone();
+            let buf_len = buf_copy.borrow().len();
+            *write_counter.borrow_mut() -= 1;
+
+            page_finish.borrow_mut().clear_dirty();
+            if bytes_written < buf_len as i32 {
+                log::error!("wrote({bytes_written}) less than expected({buf_len})");
+            }
+        })
+    };
+    let c = Rc::new(Completion::Write(WriteCompletion::new(write_complete)));
+    io.pwrite(offset, buffer.clone(), c)?;
+    Ok(())
+}
+
+pub fn begin_write_wal_header(io: &Rc<dyn File>, header: &WalHeader) -> Result<()> {
+    let buffer = {
+        let drop_fn = Rc::new(|_buf| {});
+
+        let mut buffer = Buffer::allocate(WAL_HEADER_SIZE, drop_fn);
+        let buf = buffer.as_mut_slice();
+
+        buf[0..4].copy_from_slice(&header.magic);
+        buf[4..8].copy_from_slice(&header.file_format.to_be_bytes());
+        buf[8..12].copy_from_slice(&header.page_size.to_be_bytes());
+        buf[12..16].copy_from_slice(&header.checkpoint_seq.to_be_bytes());
+        buf[16..20].copy_from_slice(&header.salt_1.to_be_bytes());
+        buf[20..24].copy_from_slice(&header.salt_2.to_be_bytes());
+        buf[24..28].copy_from_slice(&header.checksum_1.to_be_bytes());
+        
buf[28..32].copy_from_slice(&header.checksum_2.to_be_bytes()); + + Rc::new(RefCell::new(buffer)) + }; + + let write_complete = { + Box::new(move |bytes_written: i32| { + if bytes_written < WAL_HEADER_SIZE as i32 { + log::error!( + "wal header wrote({bytes_written}) less than expected({WAL_HEADER_SIZE})" + ); + } + }) + }; + let c = Rc::new(Completion::Write(WriteCompletion::new(write_complete))); + io.pwrite(0, buffer.clone(), c)?; + Ok(()) +} + +fn finish_read_wal_frame( buf: Rc>, frame: Rc>, ) -> Result<()> { diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 10583e26b..8bc55f910 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1,23 +1,58 @@ +use std::collections::{HashMap, HashSet}; use std::{cell::RefCell, rc::Rc, sync::Arc}; -use crate::io::{File, IO}; +use crate::io::{File, SyncCompletion, IO}; +use crate::storage::sqlite3_ondisk::{ + begin_read_wal_frame, begin_write_wal_frame, WAL_FRAME_HEADER_SIZE, WAL_HEADER_SIZE, +}; +use crate::Completion; use crate::{storage::pager::Page, Result}; -use super::sqlite3_ondisk; +use super::buffer_pool::BufferPool; +use super::pager::Pager; +use super::sqlite3_ondisk::{self, begin_write_btree_page, WalHeader}; /// Write-ahead log (WAL). pub trait Wal { - /// Begin a write transaction. + /// Begin a read transaction. fn begin_read_tx(&self) -> Result<()>; - /// End a write transaction. + /// Begin a write transaction. + fn begin_write_tx(&self) -> Result<()>; + + /// End a read transaction. fn end_read_tx(&self) -> Result<()>; + /// End a write transaction. + fn end_write_tx(&self) -> Result<()>; + /// Find the latest frame containing a page. fn find_frame(&self, page_id: u64) -> Result>; /// Read a frame from the WAL. - fn read_frame(&self, frame_id: u64, page: Rc>) -> Result<()>; + fn read_frame( + &self, + frame_id: u64, + page: Rc>, + buffer_pool: Rc, + ) -> Result<()>; + + /// Write a frame to the WAL. 
+ fn append_frame( + &mut self, + page: Rc>, + db_size: u32, + pager: &Pager, + write_counter: Rc>, + ) -> Result<()>; + + fn should_checkpoint(&self) -> bool; + fn checkpoint( + &mut self, + pager: &Pager, + write_counter: Rc>, + ) -> Result; + fn sync(&mut self) -> Result; } #[cfg(feature = "fs")] @@ -26,53 +61,231 @@ pub struct WalFile { wal_path: String, file: RefCell>>, wal_header: RefCell>>>, + min_frame: RefCell, + max_frame: RefCell, + nbackfills: RefCell, + // Maps pgno to frame id and offset in wal file + frame_cache: RefCell>>, // FIXME: for now let's use a simple hashmap instead of a shm file + checkpoint_threshold: usize, + ongoing_checkpoint: HashSet, + + syncing: Rc>, + page_size: usize, +} + +pub enum CheckpointStatus { + Done, + IO, } #[cfg(feature = "fs")] impl Wal for WalFile { - /// Begin a write transaction. + /// Begin a read transaction. fn begin_read_tx(&self) -> Result<()> { + self.min_frame.replace(*self.nbackfills.borrow() + 1); Ok(()) } - /// End a write transaction. + /// End a read transaction. fn end_read_tx(&self) -> Result<()> { Ok(()) } /// Find the latest frame containing a page. - fn find_frame(&self, _page_id: u64) -> Result> { + fn find_frame(&self, page_id: u64) -> Result> { + let frame_cache = self.frame_cache.borrow(); + let frames = frame_cache.get(&page_id); + if frames.is_none() { + return Ok(None); + } self.ensure_init()?; + let frames = frames.unwrap(); + for frame in frames.iter().rev() { + if *frame <= *self.max_frame.borrow() { + return Ok(Some(*frame)); + } + } Ok(None) } /// Read a frame from the WAL. - fn read_frame(&self, _frame_id: u64, _page: Rc>) -> Result<()> { - todo!(); + fn read_frame( + &self, + frame_id: u64, + page: Rc>, + buffer_pool: Rc, + ) -> Result<()> { + let offset = self.frame_offset(frame_id); + begin_read_wal_frame( + self.file.borrow().as_ref().unwrap(), + offset, + buffer_pool, + page, + )?; + Ok(()) + } + + /// Write a frame to the WAL. 
+ fn append_frame( + &mut self, + page: Rc>, + db_size: u32, + _pager: &Pager, + write_counter: Rc>, + ) -> Result<()> { + self.ensure_init()?; + let page_id = page.borrow().id; + let frame_id = *self.max_frame.borrow(); + let offset = self.frame_offset(frame_id); + begin_write_wal_frame( + self.file.borrow().as_ref().unwrap(), + offset, + &page, + db_size, + write_counter, + )?; + self.max_frame.replace(frame_id + 1); + { + let mut frame_cache = self.frame_cache.borrow_mut(); + let frames = frame_cache.get_mut(&(page_id as u64)); + match frames { + Some(frames) => frames.push(frame_id), + None => { + frame_cache.insert(page_id as u64, vec![frame_id]); + } + } + } + Ok(()) + } + + /// Begin a write transaction + fn begin_write_tx(&self) -> Result<()> { + Ok(()) + } + + /// End a write transaction + fn end_write_tx(&self) -> Result<()> { + Ok(()) + } + + fn should_checkpoint(&self) -> bool { + let frame_id = *self.max_frame.borrow() as usize; + frame_id >= self.checkpoint_threshold + } + + fn checkpoint( + &mut self, + pager: &Pager, + write_counter: Rc>, + ) -> Result { + for (page_id, _frames) in self.frame_cache.borrow().iter() { + // move page from WAL to database file + // TODO(Pere): use splice syscall in linux to do zero-copy file page movements to improve perf + let page_id = *page_id as usize; + if self.ongoing_checkpoint.contains(&page_id) { + continue; + } + + let page = pager.read_page(page_id)?; + if page.borrow().is_locked() { + return Ok(CheckpointStatus::IO); + } + + begin_write_btree_page(pager, &page, write_counter.clone())?; + self.ongoing_checkpoint.insert(page_id); + } + + self.frame_cache.borrow_mut().clear(); + *self.max_frame.borrow_mut() = 0; + self.ongoing_checkpoint.clear(); + Ok(CheckpointStatus::Done) + } + + fn sync(&mut self) -> Result { + self.ensure_init()?; + let file = self.file.borrow(); + let file = file.as_ref().unwrap(); + { + let syncing = self.syncing.clone(); + let completion = Completion::Sync(SyncCompletion { + complete: 
Box::new(move |_| { + *syncing.borrow_mut() = false; + }), + }); + file.sync(Rc::new(completion))?; + } + + if *self.syncing.borrow() { + return Ok(CheckpointStatus::IO); + } else { + return Ok(CheckpointStatus::Done); + } } } #[cfg(feature = "fs")] impl WalFile { - pub fn new(io: Arc, wal_path: String) -> Self { + pub fn new(io: Arc, wal_path: String, page_size: usize) -> Self { Self { io, wal_path, file: RefCell::new(None), wal_header: RefCell::new(None), + frame_cache: RefCell::new(HashMap::new()), + min_frame: RefCell::new(0), + max_frame: RefCell::new(0), + nbackfills: RefCell::new(0), + checkpoint_threshold: 1000, + ongoing_checkpoint: HashSet::new(), + syncing: Rc::new(RefCell::new(false)), + page_size, } } fn ensure_init(&self) -> Result<()> { if self.file.borrow().is_none() { - if let Ok(file) = self.io.open_file(&self.wal_path) { - *self.file.borrow_mut() = Some(file.clone()); - let wal_header = sqlite3_ondisk::begin_read_wal_header(file)?; - // TODO: Return a completion instead. - self.io.run_once()?; - self.wal_header.replace(Some(wal_header)); - } + match self + .io + .open_file(&self.wal_path, crate::io::OpenFlags::Create) + { + Ok(file) => { + if file.size()? > 0 { + let wal_header = match sqlite3_ondisk::begin_read_wal_header(&file) { + Ok(header) => header, + Err(err) => panic!("Couldn't read header page: {:?}", err), + }; + // TODO: Return a completion instead. 
+ self.io.run_once()?; + self.wal_header.replace(Some(wal_header)); + } else { + let wal_header = WalHeader { + magic: (0x377f0682_u32).to_be_bytes(), + file_format: 3007000, + page_size: self.page_size as u32, + checkpoint_seq: 0, // TODO implement sequence number + salt_1: 0, // TODO implement salt + salt_2: 0, + checksum_1: 0, + checksum_2: 0, // TODO implement checksum header + }; + sqlite3_ondisk::begin_write_wal_header(&file, &wal_header)?; + self.wal_header + .replace(Some(Rc::new(RefCell::new(wal_header)))); + } + *self.file.borrow_mut() = Some(file); + } + Err(err) => panic!("{:?} {}", err, &self.wal_path), + }; } Ok(()) } + + fn frame_offset(&self, frame_id: u64) -> usize { + let header = self.wal_header.borrow(); + let header = header.as_ref().unwrap().borrow(); + let page_size = header.page_size; + let page_offset = frame_id * (page_size as u64 + WAL_FRAME_HEADER_SIZE as u64); + let offset = WAL_HEADER_SIZE as u64 + WAL_FRAME_HEADER_SIZE as u64 + page_offset; + offset as usize + } } diff --git a/core/translate/emitter.rs b/core/translate/emitter.rs index 9c18e8cde..17c4b90fe 100644 --- a/core/translate/emitter.rs +++ b/core/translate/emitter.rs @@ -1,6 +1,7 @@ use std::cell::RefCell; use std::collections::HashMap; -use std::rc::Rc; +use std::rc::{Rc, Weak}; +use std::usize; use sqlite3_parser::ast; @@ -11,7 +12,7 @@ use crate::translate::plan::Search; use crate::types::{OwnedRecord, OwnedValue}; use crate::vdbe::builder::ProgramBuilder; use crate::vdbe::{BranchOffset, Insn, Program}; -use crate::Result; +use crate::{Connection, Result}; use super::expr::{ translate_aggregation, translate_condition_expr, translate_expr, translate_table_columns, @@ -1683,7 +1684,7 @@ fn epilogue( }); program.resolve_label(init_label, program.offset()); - program.emit_insn(Insn::Transaction); + program.emit_insn(Insn::Transaction { write: false }); program.emit_constant_insns(); program.emit_insn(Insn::Goto { @@ -1699,6 +1700,7 @@ pub fn emit_program( database_header: Rc>, 
mut plan: Plan, cache: ExpressionResultCache, + connection: Weak, ) -> Result { let (mut program, mut metadata, init_label, start_offset) = prologue(cache)?; loop { @@ -1717,7 +1719,7 @@ pub fn emit_program( } OpStepResult::Done => { epilogue(&mut program, &mut metadata, init_label, start_offset)?; - return Ok(program.build(database_header)); + return Ok(program.build(database_header, connection)); } } } diff --git a/core/translate/insert.rs b/core/translate/insert.rs index 7f764ae87..ea890e994 100644 --- a/core/translate/insert.rs +++ b/core/translate/insert.rs @@ -1,3 +1,4 @@ +use std::rc::Weak; use std::{cell::RefCell, ops::Deref, rc::Rc}; use sqlite3_parser::ast::{ @@ -5,13 +6,13 @@ use sqlite3_parser::ast::{ }; use crate::error::SQLITE_CONSTRAINT_PRIMARYKEY; -use crate::Result; use crate::{ schema::{Schema, Table}, storage::sqlite3_ondisk::DatabaseHeader, translate::expr::translate_expr, vdbe::{builder::ProgramBuilder, Insn, Program}, }; +use crate::{Connection, Result}; #[allow(clippy::too_many_arguments)] pub fn translate_insert( @@ -23,6 +24,7 @@ pub fn translate_insert( body: &InsertBody, _returning: &Option>, database_header: Rc>, + connection: Weak, ) -> Result { assert!(with.is_none()); assert!(or_conflict.is_none()); @@ -203,11 +205,11 @@ pub fn translate_insert( description: String::new(), }); program.resolve_label(init_label, program.offset()); - program.emit_insn(Insn::Transaction); + program.emit_insn(Insn::Transaction { write: true }); program.emit_constant_insns(); program.emit_insn(Insn::Goto { target_pc: start_offset, }); program.resolve_deferred_labels(); - Ok(program.build(database_header)) + Ok(program.build(database_header, connection)) } diff --git a/core/translate/mod.rs b/core/translate/mod.rs index 86219c833..dc77a00b3 100644 --- a/core/translate/mod.rs +++ b/core/translate/mod.rs @@ -16,13 +16,13 @@ pub(crate) mod planner; pub(crate) mod select; use std::cell::RefCell; -use std::rc::Rc; +use std::rc::{Rc, Weak}; use 
crate::schema::Schema; use crate::storage::pager::Pager; use crate::storage::sqlite3_ondisk::{DatabaseHeader, MIN_PAGE_CACHE_SIZE}; use crate::vdbe::{builder::ProgramBuilder, Insn, Program}; -use crate::{bail_parse_error, Result}; +use crate::{bail_parse_error, Connection, Result}; use insert::translate_insert; use select::translate_select; use sqlite3_parser::ast; @@ -33,6 +33,7 @@ pub fn translate( stmt: ast::Stmt, database_header: Rc>, pager: Rc, + connection: Weak, ) -> Result { match stmt { ast::Stmt::AlterTable(_, _) => bail_parse_error!("ALTER TABLE not supported yet"), @@ -53,12 +54,14 @@ pub fn translate( ast::Stmt::DropTable { .. } => bail_parse_error!("DROP TABLE not supported yet"), ast::Stmt::DropTrigger { .. } => bail_parse_error!("DROP TRIGGER not supported yet"), ast::Stmt::DropView { .. } => bail_parse_error!("DROP VIEW not supported yet"), - ast::Stmt::Pragma(name, body) => translate_pragma(&name, body, database_header, pager), + ast::Stmt::Pragma(name, body) => { + translate_pragma(&name, body, database_header, pager, connection) + } ast::Stmt::Reindex { .. } => bail_parse_error!("REINDEX not supported yet"), ast::Stmt::Release(_) => bail_parse_error!("RELEASE not supported yet"), ast::Stmt::Rollback { .. } => bail_parse_error!("ROLLBACK not supported yet"), ast::Stmt::Savepoint(_) => bail_parse_error!("SAVEPOINT not supported yet"), - ast::Stmt::Select(select) => translate_select(schema, select, database_header), + ast::Stmt::Select(select) => translate_select(schema, select, database_header, connection), ast::Stmt::Update { .. 
} => bail_parse_error!("UPDATE not supported yet"), ast::Stmt::Vacuum(_, _) => bail_parse_error!("VACUUM not supported yet"), ast::Stmt::Insert { @@ -77,6 +80,7 @@ pub fn translate( &body, &returning, database_header, + connection, ), } } @@ -86,6 +90,7 @@ fn translate_pragma( body: Option, database_header: Rc>, pager: Rc, + connection: Weak, ) -> Result { let mut program = ProgramBuilder::new(); let init_label = program.allocate_label(); @@ -96,6 +101,7 @@ fn translate_pragma( init_label, ); let start_offset = program.offset(); + let mut write = false; match body { None => { let pragma_result = program.alloc_register(); @@ -124,6 +130,7 @@ fn translate_pragma( }, _ => 0, }; + write = true; update_pragma( &name.name.0, value_to_update, @@ -140,13 +147,13 @@ fn translate_pragma( description: String::new(), }); program.resolve_label(init_label, program.offset()); - program.emit_insn(Insn::Transaction); + program.emit_insn(Insn::Transaction { write }); program.emit_constant_insns(); program.emit_insn(Insn::Goto { target_pc: start_offset, }); program.resolve_deferred_labels(); - Ok(program.build(database_header)) + Ok(program.build(database_header, connection)) } fn update_pragma(name: &str, value: i64, header: Rc>, pager: Rc) { diff --git a/core/translate/select.rs b/core/translate/select.rs index 07ea7d8f2..d486f6c23 100644 --- a/core/translate/select.rs +++ b/core/translate/select.rs @@ -1,6 +1,8 @@ +use std::rc::Weak; use std::{cell::RefCell, rc::Rc}; use crate::storage::sqlite3_ondisk::DatabaseHeader; +use crate::Connection; use crate::{schema::Schema, vdbe::Program, Result}; use sqlite3_parser::ast; @@ -12,8 +14,14 @@ pub fn translate_select( schema: &Schema, select: ast::Select, database_header: Rc>, + connection: Weak, ) -> Result { let select_plan = prepare_select_plan(schema, select)?; let (optimized_plan, expr_result_cache) = optimize_plan(select_plan)?; - emit_program(database_header, optimized_plan, expr_result_cache) + emit_program( + database_header, + 
optimized_plan, + expr_result_cache, + connection, + ) } diff --git a/core/vdbe/builder.rs b/core/vdbe/builder.rs index 313967c5d..ee2bdb613 100644 --- a/core/vdbe/builder.rs +++ b/core/vdbe/builder.rs @@ -1,6 +1,10 @@ -use std::{cell::RefCell, collections::HashMap, rc::Rc}; +use std::{ + cell::RefCell, + collections::HashMap, + rc::{Rc, Weak}, +}; -use crate::storage::sqlite3_ondisk::DatabaseHeader; +use crate::{storage::sqlite3_ondisk::DatabaseHeader, Connection}; use super::{BranchOffset, CursorID, Insn, InsnReference, Program, Table}; @@ -354,7 +358,11 @@ impl ProgramBuilder { self.deferred_label_resolutions.clear(); } - pub fn build(self, database_header: Rc>) -> Program { + pub fn build( + self, + database_header: Rc>, + connection: Weak, + ) -> Program { assert!( self.deferred_label_resolutions.is_empty(), "deferred_label_resolutions is not empty when build() is called, did you forget to call resolve_deferred_labels()?" @@ -369,6 +377,8 @@ impl ProgramBuilder { cursor_ref: self.cursor_ref, database_header, comments: self.comments, + connection, + auto_commit: true, } } } diff --git a/core/vdbe/explain.rs b/core/vdbe/explain.rs index de918398a..b437b9d5d 100644 --- a/core/vdbe/explain.rs +++ b/core/vdbe/explain.rs @@ -395,10 +395,10 @@ pub fn insn_to_str( 0, "".to_string(), ), - Insn::Transaction => ( + Insn::Transaction { write } => ( "Transaction", 0, - 0, + *write as i32, 0, OwnedValue::Text(Rc::new("".to_string())), 0, diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index f3ee80fa4..0c292ff85 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -33,7 +33,8 @@ use crate::storage::{btree::BTreeCursor, pager::Pager}; use crate::types::{ AggContext, Cursor, CursorResult, OwnedRecord, OwnedValue, Record, SeekKey, SeekOp, }; -use crate::{Result, DATABASE_VERSION}; +use crate::DATABASE_VERSION; +use crate::{Connection, Result, TransactionState}; use datetime::{exec_date, exec_time, exec_unixepoch}; @@ -44,7 +45,7 @@ use std::borrow::BorrowMut; use 
std::cell::RefCell; use std::collections::{BTreeMap, HashMap}; use std::fmt::Display; -use std::rc::Rc; +use std::rc::{Rc, Weak}; pub type BranchOffset = i64; @@ -240,7 +241,9 @@ pub enum Insn { }, // Start a transaction. - Transaction, + Transaction { + write: bool, + }, // Branch to the given PC. Goto { @@ -529,6 +532,8 @@ pub struct Program { pub cursor_ref: Vec<(Option, Option)>, pub database_header: Rc>, pub comments: HashMap, + pub connection: Weak, + pub auto_commit: bool, } impl Program { @@ -1093,11 +1098,41 @@ impl Program { ))); } } - pager.end_read_tx()?; - return Ok(StepResult::Done); + if self.auto_commit { + return match pager.end_tx() { + Ok(crate::storage::wal::CheckpointStatus::IO) => Ok(StepResult::IO), + Ok(crate::storage::wal::CheckpointStatus::Done) => Ok(StepResult::Done), + Err(e) => Err(e), + }; + } else { + return Ok(StepResult::Done); + } } - Insn::Transaction => { - pager.begin_read_tx()?; + Insn::Transaction { write } => { + let connection = self.connection.upgrade().unwrap(); + if let Some(db) = connection.db.upgrade() { + // TODO(pere): are backpointers good ?? this looks ugly af + // upgrade transaction if needed + let new_transaction_state = + match (db.transaction_state.borrow().clone(), write) { + (crate::TransactionState::Write, true) => TransactionState::Write, + (crate::TransactionState::Write, false) => TransactionState::Write, + (crate::TransactionState::Read, true) => TransactionState::Write, + (crate::TransactionState::Read, false) => TransactionState::Read, + (crate::TransactionState::None, true) => TransactionState::Read, + (crate::TransactionState::None, false) => TransactionState::Read, + }; + // TODO(Pere): + // 1. lock wal + // 2. lock shared + // 3. 
lock write db if write
+                        db.transaction_state.replace(new_transaction_state.clone());
+                        if matches!(new_transaction_state, TransactionState::Write) {
+                            pager.begin_write_tx()?;
+                        } else {
+                            pager.begin_read_tx()?;
+                        }
+                    }
                 state.pc += 1;
             }
             Insn::Goto { target_pc } => {
diff --git a/simulator/main.rs b/simulator/main.rs
index 386c9c0dc..fb80dc3b1 100644
--- a/simulator/main.rs
+++ b/simulator/main.rs
@@ -1,4 +1,4 @@
-use limbo_core::{Database, File, PlatformIO, Result, IO};
+use limbo_core::{Database, File, OpenFlags, PlatformIO, Result, IO};
 use rand::prelude::*;
 use rand_chacha::ChaCha8Rng;
 use std::cell::RefCell;
@@ -14,9 +14,12 @@ fn main() {
     println!("Seed: {}", seed);
     let mut rng = ChaCha8Rng::seed_from_u64(seed);
     let io = Arc::new(SimulatorIO::new(seed).unwrap());
-    let db = match Database::open_file(io.clone(), "./testing/testing.db") {
+    let test_path = "./testing/testing.db";
+    let db = match Database::open_file(io.clone(), test_path) {
         Ok(db) => db,
-        Err(_) => todo!(),
+        Err(e) => {
+            panic!("error opening database test file {}: {:?}", test_path, e);
+        }
     };
     for _ in 0..100 {
         let conn = db.connect();
@@ -91,8 +94,8 @@ impl SimulatorIO {
 }
 
 impl IO for SimulatorIO {
-    fn open_file(&self, path: &str) -> Result<Rc<dyn File>> {
-        let inner = self.inner.open_file(path)?;
+    fn open_file(&self, path: &str, flags: OpenFlags) -> Result<Rc<dyn File>> {
+        let inner = self.inner.open_file(path, flags)?;
         let file = Rc::new(SimulatorFile {
             inner,
             fault: RefCell::new(false),
@@ -187,6 +190,14 @@ impl limbo_core::File for SimulatorFile {
         }
         self.inner.pwrite(pos, buffer, c)
     }
+
+    fn sync(&self, c: Rc<Completion>) -> Result<()> {
+        self.inner.sync(c)
+    }
+
+    fn size(&self) -> Result<u64> {
+        Ok(self.inner.size()?)
+    }
 }
 
 impl Drop for SimulatorFile {
diff --git a/sqlite3/src/lib.rs b/sqlite3/src/lib.rs
index 9990a3320..67c975fb6 100644
--- a/sqlite3/src/lib.rs
+++ b/sqlite3/src/lib.rs
@@ -5,6 +5,7 @@ use log::trace;
 
 use std::cell::RefCell;
 use std::ffi;
+use std::rc::Rc;
 use std::sync::Arc;
 
 macro_rules! 
stub { @@ -32,8 +33,8 @@ pub mod util; use util::sqlite3_safety_check_sick_or_ok; pub struct sqlite3 { - pub(crate) _db: limbo_core::Database, - pub(crate) conn: limbo_core::Connection, + pub(crate) _db: Rc, + pub(crate) conn: Rc, pub(crate) err_code: ffi::c_int, pub(crate) err_mask: ffi::c_int, pub(crate) malloc_failed: bool, @@ -42,7 +43,7 @@ pub struct sqlite3 { } impl sqlite3 { - pub fn new(db: limbo_core::Database, conn: limbo_core::Connection) -> Self { + pub fn new(db: Rc, conn: Rc) -> Self { Self { _db: db, conn, diff --git a/test/src/lib.rs b/test/src/lib.rs index 46ea9f704..7e8b31443 100644 --- a/test/src/lib.rs +++ b/test/src/lib.rs @@ -1,5 +1,6 @@ use limbo_core::Database; use std::path::PathBuf; +use std::rc::Rc; use std::sync::Arc; use tempfile::TempDir; @@ -16,6 +17,9 @@ impl TempDatabase { path.push("test.db"); { let connection = rusqlite::Connection::open(&path).unwrap(); + connection + .pragma_update(None, "journal_mode", "wal") + .unwrap(); connection.execute(table_sql, ()).unwrap(); } let io: Arc = Arc::new(limbo_core::PlatformIO::new().unwrap()); @@ -23,7 +27,7 @@ impl TempDatabase { Self { path, io } } - pub fn connect_limbo(&self) -> limbo_core::Connection { + pub fn connect_limbo(&self) -> Rc { let db = Database::open_file(self.io.clone(), self.path.to_str().unwrap()).unwrap(); db.connect() @@ -33,7 +37,8 @@ impl TempDatabase { #[cfg(test)] mod tests { use super::*; - use limbo_core::{RowResult, Value}; + use limbo_core::{CheckpointStatus, Connection, RowResult, Value}; + use log::debug; #[ignore] #[test] @@ -92,7 +97,7 @@ mod tests { eprintln!("{}", err); } } - conn.cacheflush()?; + do_flush(&conn, &tmp_db)?; } Ok(()) } @@ -129,7 +134,7 @@ mod tests { }; // this flush helped to review hex of test.db - conn.cacheflush()?; + do_flush(&conn, &tmp_db)?; match conn.query(list_query) { Ok(Some(ref mut rows)) => loop { @@ -160,7 +165,7 @@ mod tests { eprintln!("{}", err); } } - conn.cacheflush()?; + do_flush(&conn, &tmp_db)?; Ok(()) } @@ -234,7 
+239,69 @@ mod tests { eprintln!("{}", err); } } - conn.cacheflush()?; + do_flush(&conn, &tmp_db)?; + Ok(()) + } + + #[test] + #[ignore] + fn test_wal_checkpoint() -> anyhow::Result<()> { + let _ = env_logger::try_init(); + let tmp_db = TempDatabase::new("CREATE TABLE test (x INTEGER PRIMARY KEY);"); + // threshold is 1000 by default + let iterations = 1001_usize; + let conn = tmp_db.connect_limbo(); + + for i in 0..iterations { + let insert_query = format!("INSERT INTO test VALUES ({})", i); + do_flush(&conn, &tmp_db)?; + conn.clear_page_cache().unwrap(); + match conn.query(insert_query) { + Ok(Some(ref mut rows)) => loop { + match rows.next_row()? { + RowResult::IO => { + tmp_db.io.run_once()?; + } + RowResult::Done => break, + _ => unreachable!(), + } + }, + Ok(None) => {} + Err(err) => { + eprintln!("{}", err); + } + }; + } + + do_flush(&conn, &tmp_db)?; + conn.clear_page_cache().unwrap(); + let list_query = "SELECT * FROM test LIMIT 1"; + let mut current_index = 0; + match conn.query(list_query) { + Ok(Some(ref mut rows)) => loop { + match rows.next_row()? { + RowResult::Row(row) => { + let first_value = &row.values[0]; + let id = match first_value { + Value::Integer(i) => *i as i32, + Value::Float(f) => *f as i32, + _ => unreachable!(), + }; + assert_eq!(current_index, id as usize); + current_index += 1; + } + RowResult::IO => { + tmp_db.io.run_once()?; + } + RowResult::Done => break, + } + }, + Ok(None) => {} + Err(err) => { + eprintln!("{}", err); + } + } + do_flush(&conn, &tmp_db)?; Ok(()) } @@ -254,4 +321,18 @@ mod tests { } } } + + fn do_flush(conn: &Rc, tmp_db: &TempDatabase) -> anyhow::Result<()> { + loop { + match conn.cacheflush()? { + CheckpointStatus::Done => { + break; + } + CheckpointStatus::IO => { + tmp_db.io.run_once()?; + } + } + } + Ok(()) + } }