From 6c14c50d4133b5280dad8e5e715053ec6e77a4be Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 20 Nov 2024 20:09:28 +0100 Subject: [PATCH 1/2] wal: checksums Implemented checksums so that sqlite3 is able to read our WAL. This also helps with future work on proper recovery of WAL. Create some frames with CREATE TABLE and kill the process so that there is no checkpoint. ``` Limbo v0.0.6 Enter ".help" for usage hints. limbo> create table x(x); limbo> [1] 15910 killed cargo run xlimbo.db ``` Now sqlite3 is able to recover from this WAL created in limbo: ``` sqlite3 xlimbo.db SQLite version 3.43.2 2023-10-10 13:08:14 Enter ".help" for usage hints. sqlite> .schema CREATE TABLE x (x); ``` --- core/storage/sqlite3_ondisk.rs | 75 ++++++++++++++++++++++++++++++---- core/storage/wal.rs | 38 +++++++++++++++-- 2 files changed, 100 insertions(+), 13 deletions(-) diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index be15ce011..4763f1c77 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -48,6 +48,7 @@ use crate::storage::database::DatabaseStorage; use crate::storage::pager::{Page, Pager}; use crate::types::{OwnedRecord, OwnedValue}; use crate::{File, Result}; +use cfg_block::cfg_block; use log::trace; use std::cell::RefCell; use std::pin::Pin; @@ -90,10 +91,15 @@ pub struct DatabaseHeader { pub const WAL_HEADER_SIZE: usize = 32; pub const WAL_FRAME_HEADER_SIZE: usize = 24; +pub const WAL_MAGIC_BE: u32 = 0x377f0683; +#[allow(dead_code)] +pub const WAL_MAGIC_LE: u32 = 0x377f0682; #[derive(Debug, Default)] +#[repr(C)] // This helps with encoding because rust does not respect the order in structs, so in + // this case we want to keep the order pub struct WalHeader { - pub magic: [u8; 4], + pub magic: u32, pub file_format: u32, pub page_size: u32, pub checkpoint_seq: u32, @@ -1018,7 +1024,7 @@ fn finish_read_wal_header(buf: Rc>, header: Rc>, db_size: u32, write_counter: Rc>, -) -> Result<()> { + wal_header: 
&WalHeader, + checksums: (u32, u32), +) -> Result<(u32, u32)> { let page_finish = page.clone(); let page_id = page.borrow().id; trace!("begin_write_wal_frame(offset={}, page={})", offset, page_id); - let header = WalFrameHeader { + let mut header = WalFrameHeader { page_number: page_id as u32, db_size, salt_1: 0, @@ -1070,7 +1078,7 @@ pub fn begin_write_wal_frame( checksum_1: 0, checksum_2: 0, }; - let buffer = { + let (buffer, checksums) = { let page = page.borrow(); let contents = page.contents.as_ref().unwrap(); let drop_fn = Rc::new(|_buf| {}); @@ -1080,16 +1088,29 @@ pub fn begin_write_wal_frame( drop_fn, ); let buf = buffer.as_mut_slice(); - buf[0..4].copy_from_slice(&header.page_number.to_be_bytes()); buf[4..8].copy_from_slice(&header.db_size.to_be_bytes()); + + { + let contents_buf = contents.as_ptr(); + let native = wal_header.magic & 1; // LSB is set on big endian checksums + let native = cfg!(target_endian = "big") as u32 == native; // check if checksum + // type and native type is the same so that we know when to swap bytes + let checksums = checksum_wal(&buf[0..8], wal_header, checksums, native); + let checksums = checksum_wal(contents_buf, wal_header, checksums, native); + header.checksum_1 = checksums.0; + header.checksum_2 = checksums.1; + header.salt_1 = wal_header.salt_1; + header.salt_2 = wal_header.salt_2; + } + buf[8..12].copy_from_slice(&header.salt_1.to_be_bytes()); buf[12..16].copy_from_slice(&header.salt_2.to_be_bytes()); buf[16..20].copy_from_slice(&header.checksum_1.to_be_bytes()); buf[20..24].copy_from_slice(&header.checksum_2.to_be_bytes()); buf[WAL_FRAME_HEADER_SIZE..].copy_from_slice(contents.as_ptr()); - Rc::new(RefCell::new(buffer)) + (Rc::new(RefCell::new(buffer)), checksums) }; *write_counter.borrow_mut() += 1; @@ -1109,7 +1130,7 @@ pub fn begin_write_wal_frame( }; let c = Rc::new(Completion::Write(WriteCompletion::new(write_complete))); io.pwrite(offset, buffer.clone(), c)?; - Ok(()) + Ok(checksums) } pub fn 
begin_write_wal_header(io: &Rc, header: &WalHeader) -> Result<()> { @@ -1119,7 +1140,7 @@ pub fn begin_write_wal_header(io: &Rc, header: &WalHeader) -> Result<( let mut buffer = Buffer::allocate(512, drop_fn); let buf = buffer.as_mut_slice(); - buf[0..4].copy_from_slice(&header.magic); + buf[0..4].copy_from_slice(&header.magic.to_be_bytes()); buf[4..8].copy_from_slice(&header.file_format.to_be_bytes()); buf[8..12].copy_from_slice(&header.page_size.to_be_bytes()); buf[12..16].copy_from_slice(&header.checkpoint_seq.to_be_bytes()); @@ -1167,6 +1188,42 @@ pub fn payload_overflows( (true, space_left + 4) } +pub fn checksum_wal( + buf: &[u8], + wal_header: &WalHeader, + input: (u32, u32), + native_endian: bool, // Sqlite interprets big endian as "native" +) -> (u32, u32) { + assert!(buf.len() % 8 == 0, "buffer must be a multiple of 8"); + let mut s0: u32 = input.0; + let mut s1: u32 = input.1; + let mut i = 0; + if native_endian { + while i < buf.len() { + let v0 = u32::from_ne_bytes(buf[i..i + 4].try_into().unwrap()); + let v1 = u32::from_ne_bytes(buf[i + 4..i + 8].try_into().unwrap()); + s0 = s0.wrapping_add(v0.wrapping_add(s1)); + s1 = s1.wrapping_add(v1.wrapping_add(s0)); + i += 8; + } + } else { + while i < buf.len() { + let v0 = u32::from_ne_bytes(buf[i..i + 4].try_into().unwrap()).swap_bytes(); + let v1 = u32::from_ne_bytes(buf[i + 4..i + 8].try_into().unwrap()).swap_bytes(); + s0 = s0.wrapping_add(v0.wrapping_add(s1)); + s1 = s1.wrapping_add(v1.wrapping_add(s0)); + i += 8; + } + } + (s0, s1) +} + +impl WalHeader { + pub fn as_bytes(&self) -> &[u8] { + unsafe { std::mem::transmute::<&WalHeader, &[u8; std::mem::size_of::()]>(self) } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 713e2c838..a918f9bf6 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -10,6 +10,8 @@ use crate::storage::sqlite3_ondisk::{ use crate::Completion; use crate::{storage::pager::Page, Result}; +use 
self::sqlite3_ondisk::{checksum_wal, WAL_MAGIC_BE, WAL_MAGIC_LE}; + use super::buffer_pool::BufferPool; use super::pager::Pager; use super::sqlite3_ondisk::{self, begin_write_btree_page, WalHeader}; @@ -72,6 +74,9 @@ pub struct WalFile { syncing: Rc>, page_size: usize, + + last_checksum: RefCell<(u32, u32)>, // Check of last frame in WAL, this is a cumulative checksum + // over all frames in the WAL } pub enum CheckpointStatus { @@ -144,13 +149,20 @@ impl Wal for WalFile { offset, page_id ); - begin_write_wal_frame( + let header = self.wal_header.borrow(); + let header = header.as_ref().unwrap(); + let header = header.borrow(); + let checksums = *self.last_checksum.borrow(); + let checksums = begin_write_wal_frame( self.file.borrow().as_ref().unwrap(), offset, &page, db_size, write_counter, + &*header, + checksums, )?; + self.last_checksum.replace(checksums); self.max_frame.replace(frame_id + 1); { let mut frame_cache = self.frame_cache.borrow_mut(); @@ -245,6 +257,7 @@ impl WalFile { ongoing_checkpoint: HashSet::new(), syncing: Rc::new(RefCell::new(false)), page_size, + last_checksum: RefCell::new((0, 0)), } } @@ -264,16 +277,33 @@ impl WalFile { self.io.run_once()?; self.wal_header.replace(Some(wal_header)); } else { - let wal_header = WalHeader { - magic: (0x377f0682_u32).to_be_bytes(), + // magic is a single number represented as WAL_MAGIC_LE but the big endian + // counterpart is just the same number with LSB set to 1. 
+ let magic = WAL_MAGIC_LE | cfg!(target_endian = "big") as u32; + let mut wal_header = WalHeader { + magic, file_format: 3007000, page_size: self.page_size as u32, checkpoint_seq: 0, // TODO implement sequence number salt_1: 0, // TODO implement salt salt_2: 0, checksum_1: 0, - checksum_2: 0, // TODO implement checksum header + checksum_2: 0, }; + let native = cfg!(target_endian = "big"); // if target_endian is + // already big then we don't care but if isn't, header hasn't yet been + // encoded to big endian, therefore we want to swap bytes to compute this + // checksum. + let checksums = *self.last_checksum.borrow_mut(); + let checksums = checksum_wal( + &wal_header.as_bytes()[..WAL_HEADER_SIZE - 2 * 4], // first 24 bytes + &wal_header, + checksums, + native, // this is false because we haven't encoded the wal header yet + ); + wal_header.checksum_1 = checksums.0; + wal_header.checksum_2 = checksums.1; + self.last_checksum.replace(checksums); sqlite3_ondisk::begin_write_wal_header(&file, &wal_header)?; self.wal_header .replace(Some(Rc::new(RefCell::new(wal_header)))); From 05a05cfc63b9e8531b731679106ed214b3a49e68 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 21 Nov 2024 11:55:36 +0100 Subject: [PATCH 2/2] endian naming changes --- core/storage/sqlite3_ondisk.rs | 15 ++++++++------- core/storage/wal.rs | 6 +++++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index 4763f1c77..590f94a86 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -91,9 +91,10 @@ pub struct DatabaseHeader { pub const WAL_HEADER_SIZE: usize = 32; pub const WAL_FRAME_HEADER_SIZE: usize = 24; -pub const WAL_MAGIC_BE: u32 = 0x377f0683; -#[allow(dead_code)] +// magic is a single number represented as WAL_MAGIC_LE but the big endian +// counterpart is just the same number with LSB set to 1. 
pub const WAL_MAGIC_LE: u32 = 0x377f0682; +pub const WAL_MAGIC_BE: u32 = 0x377f0683; #[derive(Debug, Default)] #[repr(C)] // This helps with encoding because rust does not respect the order in structs, so in @@ -1093,11 +1094,11 @@ pub fn begin_write_wal_frame( { let contents_buf = contents.as_ptr(); - let native = wal_header.magic & 1; // LSB is set on big endian checksums - let native = cfg!(target_endian = "big") as u32 == native; // check if checksum - // type and native type is the same so that we know when to swap bytes - let checksums = checksum_wal(&buf[0..8], wal_header, checksums, native); - let checksums = checksum_wal(contents_buf, wal_header, checksums, native); + let expects_be = wal_header.magic & 1; // LSB is set on big endian checksums + let use_native_endian = cfg!(target_endian = "big") as u32 == expects_be; // check if checksum + // type and native type is the same so that we know when to swap bytes + let checksums = checksum_wal(&buf[0..8], wal_header, checksums, use_native_endian); + let checksums = checksum_wal(contents_buf, wal_header, checksums, use_native_endian); header.checksum_1 = checksums.0; header.checksum_2 = checksums.1; header.salt_1 = wal_header.salt_1; diff --git a/core/storage/wal.rs b/core/storage/wal.rs index a918f9bf6..b86e06efb 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -279,7 +279,11 @@ impl WalFile { } else { // magic is a single number represented as WAL_MAGIC_LE but the big endian // counterpart is just the same number with LSB set to 1. - let magic = WAL_MAGIC_LE | cfg!(target_endian = "big") as u32; + let magic = if cfg!(target_endian = "big") { + WAL_MAGIC_BE + } else { + WAL_MAGIC_LE + }; let mut wal_header = WalHeader { magic, file_format: 3007000,