From 7e898eb8cac192d9a2af445f30a260aed12a7845 Mon Sep 17 00:00:00 2001 From: Alecco Date: Tue, 15 Apr 2025 14:40:55 +0200 Subject: [PATCH 01/22] page_cache: tests: move helper function up --- core/storage/page_cache.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index bd82e5849..3a0521aed 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -224,6 +224,14 @@ mod tests { use super::PageCacheKey; + fn insert_page(cache: &mut DumbLruPageCache, id: usize) -> PageCacheKey { + let key = PageCacheKey::new(id, None); + #[allow(clippy::arc_with_non_send_sync)] + let page = Arc::new(Page::new(id)); + cache.insert(key.clone(), page.clone()); + key + } + #[test] fn test_page_cache_evict() { let mut cache = DumbLruPageCache::new(1); @@ -327,14 +335,6 @@ mod tests { assert!(cache.get(&key2).is_none()); } - fn insert_page(cache: &mut DumbLruPageCache, id: usize) -> PageCacheKey { - let key = PageCacheKey::new(id, None); - #[allow(clippy::arc_with_non_send_sync)] - let page = Arc::new(Page::new(id)); - cache.insert(key.clone(), page.clone()); - key - } - #[test] fn test_page_cache_insert_sequential() { let mut cache = DumbLruPageCache::new(2); From 6763aa0cd506d842fac018f38bd2b2fe9bc1b169 Mon Sep 17 00:00:00 2001 From: Alecco Date: Tue, 15 Apr 2025 20:04:07 +0200 Subject: [PATCH 02/22] page_cache: tests: helper functions and more tests test_detach_via_insert fails as it repros insert not removing duplicate page entries with same cache key (id, frame) issue #1348 --- core/storage/page_cache.rs | 403 ++++++++++++++++++++++++++++++++++++- 1 file changed, 396 insertions(+), 7 deletions(-) diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 3a0521aed..8cb650ac9 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -208,11 +208,141 @@ impl DumbLruPageCache { pub fn len(&self) -> usize { self.map.borrow().len() } + + #[cfg(test)] + fn get_entry_ptr(&self, key: &PageCacheKey) -> Option> { + self.map.borrow().get(key).copied() + } + + #[cfg(test)] + fn verify_list_integrity(&self) { + let map_len = self.map.borrow().len(); + let head_ptr = *self.head.borrow(); + let tail_ptr = *self.tail.borrow(); + + if map_len == 0 { + assert!(head_ptr.is_none(), "Head should be None when map is empty"); + assert!(tail_ptr.is_none(), "Tail should be None when map is empty"); + return; + } + + assert!( + head_ptr.is_some(), + "Head should be Some when map is not empty" + ); + assert!( + tail_ptr.is_some(), + "Tail should be Some when map is not empty" + ); + + unsafe { + assert!( + head_ptr.unwrap().as_ref().prev.is_none(), + "Head's prev pointer mismatch" + ); + } + + unsafe { + assert!( + tail_ptr.unwrap().as_ref().next.is_none(), + "Tail's next pointer mismatch" + ); + } + + // Forward traversal + let mut forward_count = 0; + let mut current = head_ptr; + let mut last_ptr: Option> = None; + while let Some(node) = current { + forward_count += 1; + unsafe { + let node_ref = node.as_ref(); + assert_eq!( + node_ref.prev, last_ptr, + "Backward pointer mismatch during forward traversal for key {:?}", + node_ref.key + ); + assert!( + self.map.borrow().contains_key(&node_ref.key), + "Node key {:?} not found in map during forward traversal", + node_ref.key + ); + assert_eq!( + self.map.borrow().get(&node_ref.key).copied(), + Some(node), + "Map pointer mismatch for key {:?}", + node_ref.key + ); + + last_ptr = Some(node); + current = node_ref.next; + } + + if forward_count > 
map_len + 5 { + panic!( + "Infinite loop suspected in forward integrity check. Size {}, count {}", + map_len, forward_count + ); + } + } + assert_eq!( + forward_count, map_len, + "Forward count mismatch (counted {}, map has {})", + forward_count, map_len + ); + assert_eq!( + tail_ptr, last_ptr, + "Tail pointer mismatch after forward traversal" + ); + + // Backward traversal + let mut backward_count = 0; + current = tail_ptr; + last_ptr = None; + while let Some(node) = current { + backward_count += 1; + unsafe { + let node_ref = node.as_ref(); + assert_eq!( + node_ref.next, last_ptr, + "Forward pointer mismatch during backward traversal for key {:?}", + node_ref.key + ); + assert!( + self.map.borrow().contains_key(&node_ref.key), + "Node key {:?} not found in map during backward traversal", + node_ref.key + ); + + last_ptr = Some(node); + current = node_ref.prev; + } + if backward_count > map_len + 5 { + panic!( + "Infinite loop suspected in backward integrity check. Size {}, count {}", + map_len, backward_count + ); + } + } + assert_eq!( + backward_count, map_len, + "Backward count mismatch (counted {}, map has {})", + backward_count, map_len + ); + assert_eq!( + head_ptr, last_ptr, + "Head pointer mismatch after backward traversal" + ); + } } #[cfg(test)] mod tests { - use std::{num::NonZeroUsize, sync::Arc}; + use super::*; + use crate::io::{Buffer, BufferData}; + use crate::storage::pager::{Page, PageRef}; + use crate::storage::sqlite3_ondisk::PageContent; + use std::{cell::RefCell, num::NonZeroUsize, pin::Pin, rc::Rc, sync::Arc}; use lru::LruCache; use rand_chacha::{ @@ -220,18 +350,277 @@ mod tests { ChaCha8Rng, }; - use crate::{storage::page_cache::DumbLruPageCache, Page}; + fn create_key(id: usize) -> PageCacheKey { + PageCacheKey::new(id, Some(id as u64)) + } - use super::PageCacheKey; + pub fn page_with_content(page_id: usize) -> PageRef { + let page = Arc::new(Page::new(page_id)); + { + let buffer_drop_fn = Rc::new(|_data: BufferData| {}); + let buffer = Buffer::new(Pin::new(vec![0; 4096]), buffer_drop_fn); + let page_content = PageContent { + offset: 0, + buffer: Arc::new(RefCell::new(buffer)), + overflow_cells: Vec::new(), + }; + page.get().contents = Some(page_content); + page.set_loaded(); + } + page + } fn insert_page(cache: &mut DumbLruPageCache, id: usize) -> PageCacheKey { - let key = PageCacheKey::new(id, None); - #[allow(clippy::arc_with_non_send_sync)] - let page = Arc::new(Page::new(id)); - cache.insert(key.clone(), page.clone()); + let key = create_key(id); + let page = page_with_content(id); + cache.insert(key.clone(), page); key } + fn page_has_content(page: &PageRef) -> bool { + page.is_loaded() && page.get().contents.is_some() + } + + #[test] + fn test_detach_only_element() { + let mut cache = DumbLruPageCache::new(5); + let key1 = insert_page(&mut cache, 1); + cache.verify_list_integrity(); + assert_eq!(cache.len(), 1); + assert!(cache.head.borrow().is_some()); + assert!(cache.tail.borrow().is_some()); + assert_eq!(*cache.head.borrow(), *cache.tail.borrow()); + + cache.delete(key1.clone()); + + assert_eq!( + cache.len(), + 0, + "Length should be 0 after deleting only element" + ); + assert!( + cache.map.borrow().get(&key1).is_none(), + "Map should not contain key after delete" + ); + assert!(cache.head.borrow().is_none(), "Head should be None"); + assert!(cache.tail.borrow().is_none(), "Tail should be None"); + cache.verify_list_integrity(); + } + + #[test] + fn test_detach_head() { + let mut cache = DumbLruPageCache::new(5); + let _key1 = insert_page(&mut 
cache, 1); // Tail + let key2 = insert_page(&mut cache, 2); // Middle + let key3 = insert_page(&mut cache, 3); // Head + cache.verify_list_integrity(); + assert_eq!(cache.len(), 3); + + let head_ptr_before = cache.head.borrow().unwrap(); + assert_eq!( + unsafe { &head_ptr_before.as_ref().key }, + &key3, + "Initial head check" + ); + + cache.delete(key3.clone()); + + assert_eq!(cache.len(), 2, "Length should be 2 after deleting head"); + assert!( + cache.map.borrow().get(&key3).is_none(), + "Map should not contain deleted head key" + ); + cache.verify_list_integrity(); + + let new_head_ptr = cache.head.borrow().unwrap(); + assert_eq!( + unsafe { &new_head_ptr.as_ref().key }, + &key2, + "New head should be key2" + ); + assert!( + unsafe { new_head_ptr.as_ref().prev.is_none() }, + "New head's prev should be None" + ); + + let tail_ptr = cache.tail.borrow().unwrap(); + assert_eq!( + unsafe { new_head_ptr.as_ref().next }, + Some(tail_ptr), + "New head's next should point to tail (key1)" + ); + } + + #[test] + fn test_detach_tail() { + let mut cache = DumbLruPageCache::new(5); + let key1 = insert_page(&mut cache, 1); // Tail + let key2 = insert_page(&mut cache, 2); // Middle + let _key3 = insert_page(&mut cache, 3); // Head + cache.verify_list_integrity(); + assert_eq!(cache.len(), 3); + + let tail_ptr_before = cache.tail.borrow().unwrap(); + assert_eq!( + unsafe { &tail_ptr_before.as_ref().key }, + &key1, + "Initial tail check" + ); + + cache.delete(key1.clone()); // Delete tail + + assert_eq!(cache.len(), 2, "Length should be 2 after deleting tail"); + assert!( + cache.map.borrow().get(&key1).is_none(), + "Map should not contain deleted tail key" + ); + cache.verify_list_integrity(); + + let new_tail_ptr = cache.tail.borrow().unwrap(); + assert_eq!( + unsafe { &new_tail_ptr.as_ref().key }, + &key2, + "New tail should be key2" + ); + assert!( + unsafe { new_tail_ptr.as_ref().next.is_none() }, + "New tail's next should be None" + ); + + let head_ptr = cache.head.borrow().unwrap(); + assert_eq!( + unsafe { head_ptr.as_ref().prev }, + None, + "Head's prev should point to new tail (key2)" + ); + assert_eq!( + unsafe { head_ptr.as_ref().next }, + Some(new_tail_ptr), + "Head's next should point to new tail (key2)" + ); + assert_eq!( + unsafe { new_tail_ptr.as_ref().next }, + None, + "Double check new tail's next is None" + ); + } + + #[test] + fn test_detach_middle() { + let mut cache = DumbLruPageCache::new(5); + let key1 = insert_page(&mut cache, 1); // Tail + let key2 = insert_page(&mut cache, 2); // Middle + let key3 = insert_page(&mut cache, 3); // Middle + let _key4 = insert_page(&mut cache, 4); // Head + cache.verify_list_integrity(); + assert_eq!(cache.len(), 4); + + let head_ptr_before = cache.head.borrow().unwrap(); + let tail_ptr_before = cache.tail.borrow().unwrap(); + + cache.delete(key2.clone()); // Detach a middle element (key2) + + assert_eq!(cache.len(), 3, "Length should be 3 after deleting middle"); + assert!( + cache.map.borrow().get(&key2).is_none(), + "Map should not contain deleted middle key2" + ); + cache.verify_list_integrity(); + + // Check neighbors + let key1_ptr = cache.get_entry_ptr(&key1).expect("Key1 should still exist"); + let key3_ptr = cache.get_entry_ptr(&key3).expect("Key3 should still exist"); + assert_eq!( + unsafe { key3_ptr.as_ref().next }, + Some(key1_ptr), + "Key3's next should point to key1" + ); + assert_eq!( + unsafe { key1_ptr.as_ref().prev }, + Some(key3_ptr), + "Key1's prev should point to key2" + ); + + assert_eq!( + cache.head.borrow().unwrap(), + 
head_ptr_before, + "Head should remain key4" + ); + assert_eq!( + cache.tail.borrow().unwrap(), + tail_ptr_before, + "Tail should remain key1" + ); + } + + #[test] + fn test_detach_via_delete() { + let mut cache = DumbLruPageCache::new(5); + let key1 = create_key(1); + let page1 = page_with_content(1); + cache.insert(key1.clone(), page1.clone()); + assert!( + page_has_content(&page1), + "Page content should exist before delete" + ); + cache.verify_list_integrity(); + + cache.delete(key1.clone()); + assert_eq!(cache.len(), 0); + assert!( + !page_has_content(&page1), + "Page content should be removed after delete" + ); + cache.verify_list_integrity(); + } + + #[test] + fn test_detach_via_insert() { + let mut cache = DumbLruPageCache::new(5); + let key1 = create_key(1); + let page1_v1 = page_with_content(1); + let page1_v2 = page_with_content(1); + + cache.insert(key1.clone(), page1_v1.clone()); + assert!( + page_has_content(&page1_v1), + "Page1 V1 content should exist initially" + ); + cache.verify_list_integrity(); + + cache.insert(key1.clone(), page1_v2.clone()); // Trigger delete page1_v1 + + assert_eq!( + cache.len(), + 1, + "Cache length should still be 1 after replace" + ); + assert!( + !page_has_content(&page1_v1), + "Page1 V1 content should be cleaned after being replaced in cache" + ); + assert!( + page_has_content(&page1_v2), + "Page1 V2 content should exist after insert" + ); + + let current_page = cache.get(&key1).unwrap(); + assert!( + Arc::ptr_eq(¤t_page, &page1_v2), + "Cache should now hold page1 V2" + ); + + cache.verify_list_integrity(); + } + + #[test] + fn test_detach_nonexistent_key() { + let mut cache = DumbLruPageCache::new(5); + let key_nonexist = create_key(99); + + cache.delete(key_nonexist.clone()); // no-op + } + #[test] fn test_page_cache_evict() { let mut cache = DumbLruPageCache::new(1); From c8beddab093e0e3210e27568ad958805dbf1fc73 Mon Sep 17 00:00:00 2001 From: Alecco Date: Tue, 15 Apr 2025 12:44:34 +0200 Subject: [PATCH 03/22] page_cache: split unlink() out of detach() The unlink function removes an entry from the LRU. The detach function removes an entry in the cache and clears page contents. --- core/storage/page_cache.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 8cb650ac9..23afa13a8 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -109,7 +109,7 @@ impl DumbLruPageCache { let mut ptr = self.get_ptr(key)?; let page = unsafe { ptr.as_mut().page.clone() }; if touch { - self.detach(ptr, false); + self.unlink(ptr); self.touch(ptr); } Some(page) @@ -128,7 +128,10 @@ impl DumbLruPageCache { debug!("cleaning up page {}", page.get().id); let _ = page.get().contents.take(); } + self.unlink(entry); + } + fn unlink(&mut self, mut entry: NonNull) { let (next, prev) = unsafe { let c = entry.as_mut(); let next = c.next; @@ -138,7 +141,6 @@ impl DumbLruPageCache { (next, prev) }; - // detach match (prev, next) { (None, None) => { self.head.replace(None); From bdf427c3296aff409d2b52a6562728a5c233761d Mon Sep 17 00:00:00 2001 From: Alecco Date: Mon, 14 Apr 2025 22:06:06 +0200 Subject: [PATCH 04/22] page_cache: proper error handling for deletions Add error handling and results for insert(), delete(), _delete(), _detach(), pop_if_not_dirty(), and clear. Now these functions fail if a page is dirty, locked, or has other references. 
insert() makes room with pop_if_not_dirty() beforehand to handle cache full and un-evictable, else it would evict this page silently. _delete() returns Ok when key is not present in cache and it tries first to detach the cache entry and clean its page *before* removing the entry from the map. detach() checks firstt if it's possible to evict the page and if there are no other references to the page before taking its contents. test_detach_via_delete() and test_detach_via_insert() fixed by properly checking before and after dropping the page reference. test_page_cache_fuzz() fixed by reordering and moving reference to the page into insert. Other page cache tests fixed to check new function results. All page cache tests pass. Error handling and test fixes for Pager and BTree will be added in a subsequent commit. --- core/storage/page_cache.rs | 291 ++++++++++++++++++++++++++++--------- 1 file changed, 222 insertions(+), 69 deletions(-) diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 23afa13a8..f5f105340 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -1,5 +1,6 @@ use std::{cell::RefCell, collections::HashMap, ptr::NonNull}; +use std::sync::Arc; use tracing::{debug, trace}; use super::pager::PageRef; @@ -40,6 +41,13 @@ pub struct DumbLruPageCache { unsafe impl Send for DumbLruPageCache {} unsafe impl Sync for DumbLruPageCache {} +#[derive(Debug, PartialEq)] +pub enum CacheError { + Locked, + Dirty, + ActiveRefs, +} + impl PageCacheKey { pub fn new(pgno: usize, max_frame: Option) -> Self { Self { pgno, max_frame } @@ -59,9 +67,13 @@ impl DumbLruPageCache { self.map.borrow().contains_key(key) } - pub fn insert(&mut self, key: PageCacheKey, value: PageRef) { - self._delete(key.clone(), false); + pub fn insert(&mut self, key: PageCacheKey, value: PageRef) -> Result<(), CacheError> { trace!("cache_insert(key={:?})", key); + self._delete(key.clone(), false)?; + if self.len() >= self.capacity { + // Make room before trying to insert + self.pop_if_not_dirty()?; + } let entry = Box::new(PageCacheEntry { key: key.clone(), next: None, @@ -73,24 +85,26 @@ impl DumbLruPageCache { self.touch(ptr); self.map.borrow_mut().insert(key, ptr); - if self.len() > self.capacity { - self.pop_if_not_dirty(); - } + Ok(()) } - pub fn delete(&mut self, key: PageCacheKey) { + pub fn delete(&mut self, key: PageCacheKey) -> Result<(), CacheError> { trace!("cache_delete(key={:?})", key); self._delete(key, true) } - pub fn _delete(&mut self, key: PageCacheKey, clean_page: bool) { - let ptr = self.map.borrow_mut().remove(&key); - if ptr.is_none() { - return; + // Returns Ok if key is not found + pub fn _delete(&mut self, key: PageCacheKey, clean_page: bool) -> Result<(), CacheError> { + if !self.contains_key(&key) { + return Ok(()); } - let ptr = ptr.unwrap(); - self.detach(ptr, clean_page); + + let ptr = *self.map.borrow().get(&key).unwrap(); + // Try to detach from LRU list first, can fail + self.detach(ptr, clean_page)?; + let ptr = self.map.borrow_mut().remove(&key).unwrap(); unsafe { std::ptr::drop_in_place(ptr.as_ptr()) }; + Ok(()) } fn get_ptr(&mut self, key: &PageCacheKey) -> Option> { @@ -120,15 +134,35 @@ impl DumbLruPageCache { todo!(); } - fn detach(&mut self, mut entry: NonNull, clean_page: bool) { + fn detach( + &mut self, + mut entry: NonNull, + clean_page: bool, + ) -> Result<(), CacheError> { + let entry_mut = unsafe { entry.as_mut() }; + if entry_mut.page.is_locked() { + return Err(CacheError::Locked); + } + if entry_mut.page.is_dirty() { + return 
Err(CacheError::Dirty); + } + if clean_page { - // evict buffer - let page = unsafe { &entry.as_mut().page }; - page.clear_loaded(); - debug!("cleaning up page {}", page.get().id); - let _ = page.get().contents.take(); + if let Some(page_mut) = Arc::get_mut(&mut entry_mut.page) { + page_mut.clear_loaded(); + debug!("cleaning up page {}", page_mut.get().id); + let _ = page_mut.get().contents.take(); + } else { + let page_id = unsafe { &entry.as_mut().page.get().id }; + debug!( + "detach page {}: can't clean, there are other references", + page_id + ); + return Err(CacheError::ActiveRefs); + } } self.unlink(entry); + Ok(()) } fn unlink(&mut self, mut entry: NonNull) { @@ -179,27 +213,33 @@ impl DumbLruPageCache { self.head.borrow_mut().replace(entry); } - fn pop_if_not_dirty(&mut self) { + fn pop_if_not_dirty(&mut self) -> Result<(), CacheError> { let tail = *self.tail.borrow(); if tail.is_none() { - return; + return Ok(()); } + let mut tail = tail.unwrap(); let tail_entry = unsafe { tail.as_mut() }; - if tail_entry.page.is_dirty() { - // TODO: drop from another clean entry? - return; - } tracing::debug!("pop_if_not_dirty(key={:?})", tail_entry.key); - self.detach(tail, true); - assert!(self.map.borrow_mut().remove(&tail_entry.key).is_some()); + let key = tail_entry.key.clone(); + + // TODO: drop from another clean entry? + self.detach(tail, true)?; + + assert!(self.map.borrow_mut().remove(&key).is_some()); + Ok(()) } - pub fn clear(&mut self) { - let to_remove: Vec = self.map.borrow().iter().map(|v| v.0.clone()).collect(); - for key in to_remove { - self.delete(key); + pub fn clear(&mut self) -> Result<(), CacheError> { + let keys_to_remove: Vec = self.map.borrow().keys().cloned().collect(); + for key in keys_to_remove { + self.delete(key)?; } + assert!(self.head.borrow().is_none()); + assert!(self.tail.borrow().is_none()); + assert!(self.map.borrow().is_empty()); + Ok(()) } pub fn print(&mut self) { @@ -342,6 +382,7 @@ impl DumbLruPageCache { mod tests { use super::*; use crate::io::{Buffer, BufferData}; + use crate::storage::page_cache::CacheError; use crate::storage::pager::{Page, PageRef}; use crate::storage::sqlite3_ondisk::PageContent; use std::{cell::RefCell, num::NonZeroUsize, pin::Pin, rc::Rc, sync::Arc}; @@ -375,7 +416,7 @@ mod tests { fn insert_page(cache: &mut DumbLruPageCache, id: usize) -> PageCacheKey { let key = create_key(id); let page = page_with_content(id); - cache.insert(key.clone(), page); + assert!(cache.insert(key.clone(), page).is_ok()); key } @@ -383,6 +424,17 @@ mod tests { page.is_loaded() && page.get().contents.is_some() } + fn insert_and_get_entry( + cache: &mut DumbLruPageCache, + id: usize, + ) -> (PageCacheKey, NonNull) { + let key = create_key(id); + let page = page_with_content(id); + assert!(cache.insert(key.clone(), page).is_ok()); + let entry = cache.get_ptr(&key).expect("Entry should exist"); + (key, entry) + } + #[test] fn test_detach_only_element() { let mut cache = DumbLruPageCache::new(5); @@ -393,7 +445,7 @@ mod tests { assert!(cache.tail.borrow().is_some()); assert_eq!(*cache.head.borrow(), *cache.tail.borrow()); - cache.delete(key1.clone()); + assert!(cache.delete(key1.clone()).is_ok()); assert_eq!( cache.len(), @@ -425,7 +477,7 @@ mod tests { "Initial head check" ); - cache.delete(key3.clone()); + assert!(cache.delete(key3.clone()).is_ok()); assert_eq!(cache.len(), 2, "Length should be 2 after deleting head"); assert!( @@ -469,7 +521,7 @@ mod tests { "Initial tail check" ); - cache.delete(key1.clone()); // Delete tail + 
assert!(cache.delete(key1.clone()).is_ok()); // Delete tail assert_eq!(cache.len(), 2, "Length should be 2 after deleting tail"); assert!( @@ -520,7 +572,7 @@ mod tests { let head_ptr_before = cache.head.borrow().unwrap(); let tail_ptr_before = cache.tail.borrow().unwrap(); - cache.delete(key2.clone()); // Detach a middle element (key2) + assert!(cache.delete(key2.clone()).is_ok()); // Detach a middle element (key2) assert_eq!(cache.len(), 3, "Length should be 3 after deleting middle"); assert!( @@ -560,19 +612,19 @@ mod tests { let mut cache = DumbLruPageCache::new(5); let key1 = create_key(1); let page1 = page_with_content(1); - cache.insert(key1.clone(), page1.clone()); - assert!( - page_has_content(&page1), - "Page content should exist before delete" - ); + assert!(cache.insert(key1.clone(), page1.clone()).is_ok()); + assert!(page_has_content(&page1)); cache.verify_list_integrity(); - cache.delete(key1.clone()); + let result = cache.delete(key1.clone()); + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), CacheError::ActiveRefs); + assert_eq!(cache.len(), 1); + + drop(page1); + + assert!(cache.delete(key1).is_ok()); assert_eq!(cache.len(), 0); - assert!( - !page_has_content(&page1), - "Page content should be removed after delete" - ); cache.verify_list_integrity(); } @@ -583,28 +635,21 @@ mod tests { let page1_v1 = page_with_content(1); let page1_v2 = page_with_content(1); - cache.insert(key1.clone(), page1_v1.clone()); - assert!( - page_has_content(&page1_v1), - "Page1 V1 content should exist initially" - ); + assert!(cache.insert(key1.clone(), page1_v1.clone()).is_ok()); + assert_eq!(cache.len(), 1); cache.verify_list_integrity(); - cache.insert(key1.clone(), page1_v2.clone()); // Trigger delete page1_v1 + // Fail to insert v2 as v1 is still referenced + let result = cache.insert(key1.clone(), page1_v2.clone()); + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), CacheError::ActiveRefs); + assert_eq!(cache.len(), 1); // Page v1 should remain in cache + assert!(page_has_content(&page1_v1)); - assert_eq!( - cache.len(), - 1, - "Cache length should still be 1 after replace" - ); - assert!( - !page_has_content(&page1_v1), - "Page1 V1 content should be cleaned after being replaced in cache" - ); - assert!( - page_has_content(&page1_v2), - "Page1 V2 content should exist after insert" - ); + drop(page1_v1); + assert!(cache.insert(key1.clone(), page1_v2.clone()).is_ok()); + assert_eq!(cache.len(), 1); + assert!(page_has_content(&page1_v2)); let current_page = cache.get(&key1).unwrap(); assert!( @@ -620,7 +665,7 @@ mod tests { let mut cache = DumbLruPageCache::new(5); let key_nonexist = create_key(99); - cache.delete(key_nonexist.clone()); // no-op + assert!(cache.delete(key_nonexist.clone()).is_ok()); // no-op } #[test] @@ -632,6 +677,114 @@ mod tests { assert!(cache.get(&key1).is_none()); } + #[test] + fn test_detach_locked_page() { + let mut cache = DumbLruPageCache::new(5); + let (_, mut entry) = insert_and_get_entry(&mut cache, 1); + unsafe { entry.as_mut().page.set_locked() }; + assert_eq!(cache.detach(entry, false), Err(CacheError::Locked)); + cache.verify_list_integrity(); + } + + #[test] + fn test_detach_dirty_page() { + let mut cache = DumbLruPageCache::new(5); + let (key, mut entry) = insert_and_get_entry(&mut cache, 1); + cache.get(&key).expect("Page should exist"); + unsafe { entry.as_mut().page.set_dirty() }; + assert_eq!(cache.detach(entry, false), Err(CacheError::Dirty)); + cache.verify_list_integrity(); + } + + #[test] + fn 
test_detach_with_active_reference_clean() { + let mut cache = DumbLruPageCache::new(5); + let (key, entry) = insert_and_get_entry(&mut cache, 1); + let page_ref = cache.get(&key); + assert_eq!(cache.detach(entry, true), Err(CacheError::ActiveRefs)); + drop(page_ref); + cache.verify_list_integrity(); + } + + #[test] + fn test_detach_with_active_reference_no_clean() { + let mut cache = DumbLruPageCache::new(5); + let (key, entry) = insert_and_get_entry(&mut cache, 1); + cache.get(&key).expect("Page should exist"); + assert!(cache.detach(entry, false).is_ok()); + assert!(cache.map.borrow_mut().remove(&key).is_some()); + cache.verify_list_integrity(); + } + + #[test] + fn test_detach_without_cleaning() { + let mut cache = DumbLruPageCache::new(5); + let (key, entry) = insert_and_get_entry(&mut cache, 1); + assert!(cache.detach(entry, false).is_ok()); + assert!(cache.map.borrow_mut().remove(&key).is_some()); + cache.verify_list_integrity(); + assert_eq!(cache.len(), 0); + } + + #[test] + fn test_detach_with_cleaning() { + let mut cache = DumbLruPageCache::new(5); + let (key, entry) = insert_and_get_entry(&mut cache, 1); + let page = cache.get(&key).expect("Page should exist"); + assert!(page_has_content(&page)); + drop(page); + assert!(cache.detach(entry, true).is_ok()); + // Internal testing: the page is still in map, so we use it to check content + let page = cache.peek(&key, false).expect("Page should exist in map"); + assert!(!page_has_content(&page)); + assert!(cache.map.borrow_mut().remove(&key).is_some()); + cache.verify_list_integrity(); + } + + #[test] + fn test_detach_only_element_preserves_integrity() { + let mut cache = DumbLruPageCache::new(5); + let (_, entry) = insert_and_get_entry(&mut cache, 1); + assert!(cache.detach(entry, false).is_ok()); + assert!( + cache.head.borrow().is_none(), + "Head should be None after detaching only element" + ); + assert!( + cache.tail.borrow().is_none(), + "Tail should be None after detaching only element" + ); + } + + #[test] + fn test_detach_with_multiple_pages() { + let mut cache = DumbLruPageCache::new(5); + let (key1, _) = insert_and_get_entry(&mut cache, 1); + let (key2, entry2) = insert_and_get_entry(&mut cache, 2); + let (key3, _) = insert_and_get_entry(&mut cache, 3); + let head_key = unsafe { cache.head.borrow().unwrap().as_ref().key.clone() }; + let tail_key = unsafe { cache.tail.borrow().unwrap().as_ref().key.clone() }; + assert_eq!(head_key, key3, "Head should be key3"); + assert_eq!(tail_key, key1, "Tail should be key1"); + assert!(cache.detach(entry2, false).is_ok()); + let head_entry = unsafe { cache.head.borrow().unwrap().as_ref() }; + let tail_entry = unsafe { cache.tail.borrow().unwrap().as_ref() }; + assert_eq!(head_entry.key, key3, "Head should still be key3"); + assert_eq!(tail_entry.key, key1, "Tail should still be key1"); + assert_eq!( + unsafe { head_entry.next.unwrap().as_ref().key.clone() }, + key1, + "Head's next should point to tail after middle element detached" + ); + assert_eq!( + unsafe { tail_entry.prev.unwrap().as_ref().key.clone() }, + key3, + "Tail's prev should point to head after middle element detached" + ); + assert!(cache.map.borrow_mut().remove(&key2).is_some()); + cache.verify_list_integrity(); + } + #[test] fn test_page_cache_fuzz() { let seed = std::time::SystemTime::now() @@ -653,9 +806,9 @@ mod tests { let key = PageCacheKey::new(id_page as usize, Some(id_frame)); #[allow(clippy::arc_with_non_send_sync)] let page = Arc::new(Page::new(id_page as usize)); + lru.push(key.clone(), page.clone()); // 
println!("inserting page {:?}", key); - cache.insert(key.clone(), page.clone()); - lru.push(key, page); + cache.insert(key.clone(), page); // move page so there's no reference left here assert!(cache.len() <= 10); } 1 => { @@ -712,7 +865,7 @@ mod tests { fn test_page_cache_delete() { let mut cache = DumbLruPageCache::new(2); let key1 = insert_page(&mut cache, 1); - cache.delete(key1.clone()); + assert!(cache.delete(key1.clone()).is_ok()); assert!(cache.get(&key1).is_none()); } @@ -721,7 +874,7 @@ mod tests { let mut cache = DumbLruPageCache::new(2); let key1 = insert_page(&mut cache, 1); let key2 = insert_page(&mut cache, 2); - cache.clear(); + assert!(cache.clear().is_ok()); assert!(cache.get(&key1).is_none()); assert!(cache.get(&key2).is_none()); } From 4ef3c1d04d9f023166e392247b0f52339e21e42c Mon Sep 17 00:00:00 2001 From: Alecco Date: Fri, 18 Apr 2025 21:03:48 +0200 Subject: [PATCH 05/22] page_cache: fix insert and evict logic insert() fails if key exists (there shouldn't be two) and panics if it's different pages, and also fails if it can't make room for the page. Replaced the limited pop_if_not_dirty() function with make_room_for(). It tries to evict many pages as requested spare capacity. It should come handy later by resize() and Pager. make_room_for() tries to make room or fails if it can't evict enough entries. For make_room_for() I also tried with an all-or-nothing approach, so if say a query requests a lot more than possible to make room for, it doesn't evict a bunch of pages from the cache that might be useful. But implementing this approach got very complicated since it needs to keep exclusive PageRefs and collecting this caused segfaults. Might be worth trying again in the future. But beware the rabbit hole. Updated page cache test logic for new insert rules. Updated Pager.allocate_page() to handle failure logic but needs further work. This is to show new cache insert handling. There are many places to update. Left comments on callers of pager and page cache needing to update error handling, for now. 
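For illustration, a caller such as the pager is expected to map these cache errors
into LimboError along these lines (a sketch only; variable names are illustrative,
and the real pager/btree updates follow in later commits of this series):

    match cache.insert(page_key, page.clone()) {
        Ok(_) => {}
        // No evictable entry could make room: surface as a database-level error.
        Err(CacheError::Full) => return Err(LimboError::CacheFull),
        // The same key with the same page is not expected on this path.
        Err(CacheError::KeyExists) => unreachable!("page already cached"),
        // Any other cache error is reported as an internal error.
        Err(e) => {
            return Err(LimboError::InternalError(format!(
                "failed to insert page into cache: {:?}",
                e
            )))
        }
    }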
--- core/error.rs | 2 + core/storage/btree.rs | 6 ++ core/storage/page_cache.rs | 120 ++++++++++++++++++++++++------------- core/storage/pager.rs | 22 +++++-- core/vdbe/execute.rs | 2 + 5 files changed, 106 insertions(+), 46 deletions(-) diff --git a/core/error.rs b/core/error.rs index 695b52ee7..a56dfbb10 100644 --- a/core/error.rs +++ b/core/error.rs @@ -8,6 +8,8 @@ pub enum LimboError { NotADB, #[error("Internal error: {0}")] InternalError(String), + #[error("Page cache is full")] + CacheFull, #[error("Parse error: {0}")] ParseError(String), #[error(transparent)] diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 6172b6a08..1178ee1cf 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2785,6 +2785,7 @@ impl BTreeCursor { page.set_dirty(); pages_to_balance_new[i].replace(page.clone()); } else { + // FIXME: handle page cache is full let page = self.pager.do_allocate_page(page_type, 0); pages_to_balance_new[i].replace(page); // Since this page didn't exist before, we can set it to cells length as it @@ -3667,6 +3668,7 @@ impl BTreeCursor { let root = self.stack.top(); let root_contents = root.get_contents(); + // FIXME: handle page cache is full let child = self.pager.do_allocate_page(root_contents.page_type(), 0); tracing::debug!( @@ -5742,6 +5744,7 @@ fn fill_cell_payload( } // we still have bytes to add, we will need to allocate new overflow page + // FIXME: handle page cache is full let overflow_page = pager.allocate_overflow_page(); overflow_pages.push(overflow_page.clone()); { @@ -6173,6 +6176,7 @@ mod tests { Pager::finish_open(db_header, db_file, Some(wal), io, page_cache, buffer_pool).unwrap() }; let pager = Rc::new(pager); + // FIXME: handle page cache is full let page1 = pager.allocate_page().unwrap(); btree_init_page(&page1, PageType::TableLeaf, 0, 4096); (pager, page1.get().id) @@ -6792,9 +6796,11 @@ mod tests { } // Allocate two leaf pages + // FIXME: handle page cache is full let page3 = cursor.pager.allocate_page()?; btree_init_page(&page3, PageType::TableLeaf, 0, 512); + // FIXME: handle page cache is full let page4 = cursor.pager.allocate_page()?; btree_init_page(&page4, PageType::TableLeaf, 0, 512); diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index f5f105340..5aeadb8ed 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -43,9 +43,12 @@ unsafe impl Sync for DumbLruPageCache {} #[derive(Debug, PartialEq)] pub enum CacheError { + InternalError(String), Locked, Dirty, ActiveRefs, + Full, + KeyExists, } impl PageCacheKey { @@ -68,12 +71,16 @@ impl DumbLruPageCache { } pub fn insert(&mut self, key: PageCacheKey, value: PageRef) -> Result<(), CacheError> { - trace!("cache_insert(key={:?})", key); - self._delete(key.clone(), false)?; - if self.len() >= self.capacity { - // Make room before trying to insert - self.pop_if_not_dirty()?; + trace!("insert(key={:?})", key); + // Check first if page already exists in cache + if let Some(existing_page_ref) = self.get(&key) { + assert!( + Arc::ptr_eq(&value, &existing_page_ref), + "Attempted to insert different page with same key" + ); + return Err(CacheError::KeyExists); } + self.make_room_for(1)?; let entry = Box::new(PageCacheEntry { key: key.clone(), next: None, @@ -81,7 +88,7 @@ impl DumbLruPageCache { page: value, }); let ptr_raw = Box::into_raw(entry); - let ptr = unsafe { ptr_raw.as_mut().unwrap().as_non_null() }; + let ptr = unsafe { NonNull::new_unchecked(ptr_raw) }; self.touch(ptr); self.map.borrow_mut().insert(key, ptr); @@ -213,22 +220,44 @@ impl 
DumbLruPageCache { self.head.borrow_mut().replace(entry); } - fn pop_if_not_dirty(&mut self) -> Result<(), CacheError> { - let tail = *self.tail.borrow(); - if tail.is_none() { + pub fn make_room_for(&mut self, n: usize) -> Result<(), CacheError> { + if n > self.capacity { + return Err(CacheError::Full); + } + + let len = self.len(); + let available = self.capacity.saturating_sub(len); + if n <= available && len <= self.capacity { return Ok(()); } - let mut tail = tail.unwrap(); - let tail_entry = unsafe { tail.as_mut() }; - tracing::debug!("pop_if_not_dirty(key={:?})", tail_entry.key); - let key = tail_entry.key.clone(); + let tail = self.tail.borrow().ok_or_else(|| { + CacheError::InternalError(format!( + "Page cache of len {} expected to have a tail pointer", + self.len() + )) + })?; - // TODO: drop from another clean entry? - self.detach(tail, true)?; + // Handle len > capacity, too + let available = self.capacity.saturating_sub(len); + let x = n.saturating_sub(available); + let mut need_to_evict = x.saturating_add(len.saturating_sub(self.capacity)); - assert!(self.map.borrow_mut().remove(&key).is_some()); - Ok(()) + let mut current_opt = Some(tail); + while need_to_evict > 0 && current_opt.is_some() { + let current = current_opt.unwrap(); + let entry = unsafe { current.as_ref() }; + current_opt = entry.prev; // Pick prev before modifying entry + match self.delete(entry.key.clone()) { + Err(_) => {} + Ok(_) => need_to_evict -= 1, + } + } + + match need_to_evict > 0 { + true => Err(CacheError::Full), + false => Ok(()), + } } pub fn clear(&mut self) -> Result<(), CacheError> { @@ -629,35 +658,30 @@ mod tests { } #[test] - fn test_detach_via_insert() { + fn test_insert_same_id_different_frame() { + let mut cache = DumbLruPageCache::new(5); + let key1_1 = PageCacheKey::new(1, Some(1 as u64)); + let key1_2 = PageCacheKey::new(1, Some(2 as u64)); + let page1_1 = page_with_content(1); + let page1_2 = page_with_content(1); + + assert!(cache.insert(key1_1.clone(), page1_1.clone()).is_ok()); + assert!(cache.insert(key1_2.clone(), page1_2.clone()).is_ok()); + assert_eq!(cache.len(), 2); + cache.verify_list_integrity(); + } + + #[test] + #[should_panic(expected = "Attempted to insert different page with same key")] + fn test_insert_existing_key_fail() { let mut cache = DumbLruPageCache::new(5); let key1 = create_key(1); let page1_v1 = page_with_content(1); let page1_v2 = page_with_content(1); - assert!(cache.insert(key1.clone(), page1_v1.clone()).is_ok()); assert_eq!(cache.len(), 1); cache.verify_list_integrity(); - - // Fail to insert v2 as v1 is still referenced - let result = cache.insert(key1.clone(), page1_v2.clone()); - assert!(result.is_err()); - assert_eq!(result.unwrap_err(), CacheError::ActiveRefs); - assert_eq!(cache.len(), 1); // Page v1 should remain in cache - assert!(page_has_content(&page1_v1)); - - drop(page1_v1); - assert!(cache.insert(key1.clone(), page1_v2.clone()).is_ok()); - assert_eq!(cache.len(), 1); - assert!(page_has_content(&page1_v2)); - - let current_page = cache.get(&key1).unwrap(); - assert!( - Arc::ptr_eq(¤t_page, &page1_v2), - "Cache should now hold page1 V2" - ); - - cache.verify_list_integrity(); + let _ = cache.insert(key1.clone(), page1_v2.clone()); // Panic } #[test] @@ -806,9 +830,20 @@ mod tests { let key = PageCacheKey::new(id_page as usize, Some(id_frame)); #[allow(clippy::arc_with_non_send_sync)] let page = Arc::new(Page::new(id_page as usize)); - lru.push(key.clone(), page.clone()); + if let Some(_) = cache.get(&key) { + continue; // skip duplicate page 
ids + } // println!("inserting page {:?}", key); - cache.insert(key.clone(), page); // move page so there's no reference left here + match cache.insert(key.clone(), page.clone()) { + Err(CacheError::Full | CacheError::ActiveRefs) => {} // Ignore + Err(err) => { + // Any other error should fail the test + panic!("Cache insertion failed: {:?}", err); + } + Ok(_) => { + lru.push(key, page); + } + } assert!(cache.len() <= 10); } 1 => { @@ -826,7 +861,7 @@ mod tests { }; // println!("removing page {:?}", key); lru.pop(&key); - cache.delete(key); + assert!(cache.delete(key).is_ok()); } 2 => { // test contents @@ -839,6 +874,7 @@ mod tests { _ => unreachable!(), } } + assert!(lru.len() > 0); // Check it inserted some } #[test] diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 6dbd1c961..0ee2fd116 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -13,7 +13,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use tracing::trace; -use super::page_cache::{DumbLruPageCache, PageCacheKey}; +use super::page_cache::{CacheError, DumbLruPageCache, PageCacheKey}; use super::wal::{CheckpointMode, CheckpointStatus}; pub struct PageInner { @@ -22,6 +22,7 @@ pub struct PageInner { pub id: usize, } +#[derive(Debug)] pub struct Page { pub inner: UnsafeCell, } @@ -213,6 +214,7 @@ impl Pager { }) } + // FIXME: handle no room in page cache pub fn btree_create(&self, flags: &CreateBTreeFlags) -> u32 { let page_type = match flags { _ if flags.is_table() => PageType::TableLeaf, @@ -226,6 +228,7 @@ impl Pager { /// Allocate a new overflow page. /// This is done when a cell overflows and new space is needed. + // FIXME: handle no room in page cache pub fn allocate_overflow_page(&self) -> PageRef { let page = self.allocate_page().unwrap(); tracing::debug!("Pager::allocate_overflow_page(id={})", page.get().id); @@ -240,6 +243,7 @@ impl Pager { /// Allocate a new page to the btree via the pager. /// This marks the page as dirty and writes the page header. + // FIXME: handle no room in page cache pub fn do_allocate_page(&self, page_type: PageType, offset: usize) -> PageRef { let page = self.allocate_page().unwrap(); crate::btree_init_page(&page, page_type, offset, self.usable_space() as u16); @@ -387,6 +391,7 @@ impl Pager { } /// Changes the size of the page cache. + // FIXME: handle no room in page cache pub fn change_page_cache_size(&self, capacity: usize) { let mut page_cache = self.page_cache.write(); page_cache.resize(capacity); @@ -640,6 +645,7 @@ impl Pager { Gets a new page that increasing the size of the page or uses a free page. Currently free list pages are not yet supported. 
*/ + // FIXME: handle no room in page cache #[allow(clippy::readonly_write_lock)] pub fn allocate_page(&self) -> Result { let header = &self.db_header; @@ -663,21 +669,29 @@ impl Pager { } } + // FIXME: should reserve page cache entry before modifying the database let page = allocate_page(header.database_size as usize, &self.buffer_pool, 0); { // setup page and add to cache page.set_dirty(); self.add_dirty(page.get().id); - let mut cache = self.page_cache.write(); let max_frame = match &self.wal { Some(wal) => wal.borrow().get_max_frame(), None => 0, }; let page_key = PageCacheKey::new(page.get().id, Some(max_frame)); - cache.insert(page_key, page.clone()); + let mut cache = self.page_cache.write(); + match cache.insert(page_key, page.clone()) { + Err(CacheError::Full) => return Err(LimboError::CacheFull), + Err(_) => { + return Err(LimboError::InternalError( + "Unknown error inserting page to cache".into(), + )) + } + Ok(_) => return Ok(page), + } } - Ok(page) } pub fn put_loaded_page(&self, id: usize, page: PageRef) { diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index a6b375651..e1dd45db9 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -4323,6 +4323,7 @@ pub fn op_create_btree( // TODO: implement temp databases todo!("temp databases not implemented yet"); } + // FIXME: handle page cache is full let root_page = pager.btree_create(flags); state.registers[*root] = Register::Value(Value::Integer(root_page as i64)); state.pc += 1; @@ -4708,6 +4709,7 @@ pub fn op_open_ephemeral( &CreateBTreeFlags::new_index() }; + // FIXME: handle page cache is full let root_page = pager.btree_create(flag); let (_, cursor_type) = program.cursor_ref.get(cursor_id).unwrap(); From e808a28c9826b7a1a2172f3df448a8ca9c3d7f30 Mon Sep 17 00:00:00 2001 From: Alecco Date: Sun, 20 Apr 2025 18:17:51 +0200 Subject: [PATCH 06/22] WIP (squash) adapt pager and btree to page cache error handling --- core/storage/btree.rs | 1 + core/storage/pager.rs | 74 +++++++++++++++++++++++++++++++------------ 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 1178ee1cf..6332fc06e 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2814,6 +2814,7 @@ impl BTreeCursor { let page = page.as_ref().unwrap(); if *new_id != page.get().id { page.get().id = *new_id; + // FIXME: why would there be another page at this id/frame? self.pager.put_loaded_page(*new_id, page.clone()); } } diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 0ee2fd116..7b52100dd 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -306,7 +306,7 @@ impl Pager { } /// Reads a page from the database. 
- pub fn read_page(&self, page_idx: usize) -> Result { + pub fn read_page(&self, page_idx: usize) -> Result { tracing::trace!("read_page(page_idx = {})", page_idx); let mut page_cache = self.page_cache.write(); let max_frame = match &self.wal { @@ -328,9 +328,14 @@ impl Pager { { page.set_uptodate(); } - // TODO(pere) ensure page is inserted, we should probably first insert to page cache - // and if successful, read frame or page - page_cache.insert(page_key, page.clone()); + // TODO(pere) should probably first insert to page cache, and if successful, + // read frame or page + match page_cache.insert(page_key, page.clone()) { + Ok(_) => {} + Err(CacheError::Full) => return Err(LimboError::CacheFull), + Err(CacheError::KeyExists) => unreachable!("Page should not exist in cache after get() miss"), + Err(e) => return Err(LimboError::InternalError(format!("Failed to insert page into cache: {:?}", e))), + } return Ok(page); } } @@ -340,8 +345,12 @@ impl Pager { page.clone(), page_idx, )?; - // TODO(pere) ensure page is inserted - page_cache.insert(page_key, page.clone()); + match page_cache.insert(page_key, page.clone()) { + Ok(_) => {} + Err(CacheError::Full) => return Err(LimboError::CacheFull), + Err(CacheError::KeyExists) => unreachable!("Page should not exist in cache after get() miss"), + Err(e) => return Err(LimboError::InternalError(format!("Failed to insert page into cache: {:?}", e))), + } Ok(page) } @@ -363,18 +372,23 @@ impl Pager { { page.set_uptodate(); } - // TODO(pere) ensure page is inserted - if !page_cache.contains_key(&page_key) { - page_cache.insert(page_key, page.clone()); + match page_cache.insert(page_key, page.clone()) { + Err(CacheError::KeyExists) => {}, // Exists but same page, not error + Err(CacheError::Full) => return Err(LimboError::CacheFull), + Err(e) => return Err(LimboError::InternalError(format!("Failed to insert page into cache during load: {:?}", e))), + Ok(_) => {} } return Ok(()); } } - // TODO(pere) ensure page is inserted - if !page_cache.contains_key(&page_key) { - page_cache.insert(page_key, page.clone()); - } + match page_cache.insert(page_key, page.clone()) { + Err(CacheError::KeyExists) => {}, // Ensures same page + Err(CacheError::Full) => return Err(LimboError::CacheFull), + Err(e) => return Err(LimboError::InternalError(format!("Failed to insert page into cache during load: {:?}", e))), + Ok(_) => {} + }; + sqlite3_ondisk::begin_read_page( self.db_file.clone(), self.buffer_pool.clone(), @@ -394,7 +408,7 @@ impl Pager { // FIXME: handle no room in page cache pub fn change_page_cache_size(&self, capacity: usize) { let mut page_cache = self.page_cache.write(); - page_cache.resize(capacity); + page_cache.resize(capacity) } pub fn add_dirty(&self, page_id: usize) { @@ -439,8 +453,9 @@ impl Pager { } // This page is no longer valid. // For example: - // We took page with key (page_num, max_frame) -- this page is no longer valid for that max_frame so it must be invalidated. - cache.delete(page_key); + // We took page with key (page_num, max_frame) -- this page is no longer valid for that max_frame + // so it must be invalidated. There shouldn't be any active refs. + cache.delete(page_key).map_err(|e| {LimboError::InternalError(format!("Failed to delete page {:?} from cache during flush: {:?}. 
Might be actively referenced.", page_id, e))})?; } self.dirty_pages.borrow_mut().clear(); self.flush_info.borrow_mut().state = FlushState::WaitAppendFrames; @@ -565,7 +580,7 @@ impl Pager { } } // TODO: only clear cache of things that are really invalidated - self.page_cache.write().clear(); + self.page_cache.write().clear().expect("Failed to clear page cache"); checkpoint_result } @@ -694,16 +709,35 @@ impl Pager { } } - pub fn put_loaded_page(&self, id: usize, page: PageRef) { + pub fn put_loaded_page(&self, id: usize, page: PageRef) -> Result<(), LimboError> { let mut cache = self.page_cache.write(); - // cache insert invalidates previous page let max_frame = match &self.wal { Some(wal) => wal.borrow().get_max_frame(), None => 0, }; let page_key = PageCacheKey::new(id, Some(max_frame)); - cache.insert(page_key, page.clone()); + + // FIXME: is this correct? Why would there be another version of the page? + // Check if there's an existing page at this key and remove it + // This can happen when a page is being updated during a transaction + // or when WAL frames are being managed + if let Some(_existing_page_ref) = cache.get(&page_key) { + cache.delete(page_key.clone()).map_err(|e| { + LimboError::InternalError(format!( + "put_loaded_page failed to remove old version of page {:?}: {:?}.", + page_key, e + )) + })?; + } + + cache.insert(page_key, page.clone()).map_err(|e| { + LimboError::InternalError(format!( + "Failed to insert loaded page {} into cache: {:?}", id, e + )) + })?; + page.set_loaded(); + Ok(()) } pub fn usable_size(&self) -> usize { From e2f99a1ad26a12dbd39792de97e698d0c5873dbe Mon Sep 17 00:00:00 2001 From: Alecco Date: Sat, 19 Apr 2025 18:48:19 +0200 Subject: [PATCH 07/22] page_cache: implement resize --- core/storage/page_cache.rs | 102 +++++++++++++++++++++++++++++++++++-- core/storage/pager.rs | 7 ++- 2 files changed, 102 insertions(+), 7 deletions(-) diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 5aeadb8ed..0d3047d9e 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -51,6 +51,12 @@ pub enum CacheError { KeyExists, } +#[derive(Debug, PartialEq)] +pub enum CacheResizeResult { + Done, + PendingEvictions, +} + impl PageCacheKey { pub fn new(pgno: usize, max_frame: Option) -> Self { Self { pgno, max_frame } @@ -136,9 +142,14 @@ impl DumbLruPageCache { Some(page) } - pub fn resize(&mut self, capacity: usize) { - let _ = capacity; - todo!(); + // To match SQLite behavior, just set capacity and try to shrink as much as possible. + // In case of failure, the caller should request further evictions (e.g. after I/O). 
+ pub fn resize(&mut self, capacity: usize) -> CacheResizeResult { + self.capacity = capacity; + match self.make_room_for(0) { + Ok(_) => CacheResizeResult::Done, + Err(_) => CacheResizeResult::PendingEvictions, + } } fn detach( @@ -923,4 +934,89 @@ mod tests { assert_eq!(cache.peek(&key, false).unwrap().get().id, i); } } + + #[test] + fn test_resize_smaller_success() { + let mut cache = DumbLruPageCache::new(5); + for i in 1..=5 { + let _ = insert_page(&mut cache, i); + } + assert_eq!(cache.len(), 5); + let result = cache.resize(3); + assert_eq!(result, CacheResizeResult::Done); + assert_eq!(cache.len(), 3); + assert_eq!(cache.capacity, 3); + assert!(cache.insert(create_key(6), page_with_content(6)).is_ok()); + } + + #[test] + fn test_resize_larger() { + let mut cache = DumbLruPageCache::new(2); + let _ = insert_page(&mut cache, 1); + let _ = insert_page(&mut cache, 2); + assert_eq!(cache.len(), 2); + let result = cache.resize(5); + assert_eq!(result, CacheResizeResult::Done); + assert_eq!(cache.len(), 2); + assert_eq!(cache.capacity, 5); + assert!(cache.get(&create_key(1)).is_some()); + assert!(cache.get(&create_key(2)).is_some()); + for i in 3..=5 { + let _ = insert_page(&mut cache, i); + } + assert_eq!(cache.len(), 5); + assert!(cache.insert(create_key(4), page_with_content(4)).is_err()); + cache.verify_list_integrity(); + } + + #[test] + fn test_resize_with_active_references() { + let mut cache = DumbLruPageCache::new(5); + let page1 = page_with_content(1); + let page2 = page_with_content(2); + let page3 = page_with_content(3); + assert!(cache.insert(create_key(1), page1.clone()).is_ok()); + assert!(cache.insert(create_key(2), page2.clone()).is_ok()); + assert!(cache.insert(create_key(3), page3.clone()).is_ok()); + assert_eq!(cache.len(), 3); + cache.verify_list_integrity(); + assert_eq!(cache.resize(2), CacheResizeResult::PendingEvictions); + assert_eq!(cache.capacity, 2); + assert_eq!(cache.len(), 3); + drop(page2); + drop(page3); + assert_eq!(cache.resize(1), CacheResizeResult::Done); // Evicted 2 and 3 + assert_eq!(cache.len(), 1); + assert!(cache.insert(create_key(4), page_with_content(4)).is_err()); + cache.verify_list_integrity(); + } + + #[test] + fn test_resize_to_zero() { + let mut cache = DumbLruPageCache::new(5); + for i in 1..=3 { + let _ = insert_page(&mut cache, i); + } + assert_eq!(cache.len(), 3); + let result = cache.resize(0); + assert_eq!(result, CacheResizeResult::Done); // removed all 3 + assert_eq!(cache.len(), 0); + assert_eq!(cache.capacity, 0); + cache.verify_list_integrity(); + assert!(cache.insert(create_key(4), page_with_content(4)).is_err()); + } + + #[test] + fn test_resize_same_capacity() { + let mut cache = DumbLruPageCache::new(3); + for i in 1..=3 { + let _ = insert_page(&mut cache, i); + } + let result = cache.resize(3); + assert_eq!(result, CacheResizeResult::Done); // no-op + assert_eq!(cache.len(), 3); + assert_eq!(cache.capacity, 3); + cache.verify_list_integrity(); + assert!(cache.insert(create_key(4), page_with_content(4)).is_ok()); + } } diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 7b52100dd..96e552b55 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -13,7 +13,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use tracing::trace; -use super::page_cache::{CacheError, DumbLruPageCache, PageCacheKey}; +use super::page_cache::{CacheError, DumbLruPageCache, PageCacheKey, CacheResizeResult}; use super::wal::{CheckpointMode, CheckpointStatus}; pub struct PageInner { @@ -405,10 +405,9 @@ impl 
Pager { } /// Changes the size of the page cache. - // FIXME: handle no room in page cache - pub fn change_page_cache_size(&self, capacity: usize) { + pub fn change_page_cache_size(&self, capacity: usize) -> Result { let mut page_cache = self.page_cache.write(); - page_cache.resize(capacity) + Ok(page_cache.resize(capacity)) } pub fn add_dirty(&self, page_id: usize) { From 67e260ff71fe3fabb15d58106a6fcc92ab4bbbaf Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 19 May 2025 11:07:30 +0200 Subject: [PATCH 08/22] allow delete of dirty page in cacheflush Dirty pages can be deleted in `cacheflush`. Furthermore, there could be multiple live references in the stack of a cursor so let's allow them to exist while deleting. --- core/storage/btree.rs | 2 +- core/storage/page_cache.rs | 23 ++++----------- core/storage/pager.rs | 59 ++++++++++++++++++++++++++++---------- core/translate/pragma.rs | 4 ++- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 6332fc06e..88897f4c2 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2815,7 +2815,7 @@ impl BTreeCursor { if *new_id != page.get().id { page.get().id = *new_id; // FIXME: why would there be another page at this id/frame? - self.pager.put_loaded_page(*new_id, page.clone()); + self.pager.put_loaded_page(*new_id, page.clone())?; } } diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 0d3047d9e..651637c70 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -26,12 +26,6 @@ struct PageCacheEntry { next: Option>, } -impl PageCacheEntry { - fn as_non_null(&mut self) -> NonNull { - NonNull::new(&mut *self).unwrap() - } -} - pub struct DumbLruPageCache { capacity: usize, map: RefCell>>, @@ -166,18 +160,9 @@ impl DumbLruPageCache { } if clean_page { - if let Some(page_mut) = Arc::get_mut(&mut entry_mut.page) { - page_mut.clear_loaded(); - debug!("cleaning up page {}", page_mut.get().id); - let _ = page_mut.get().contents.take(); - } else { - let page_id = unsafe { &entry.as_mut().page.get().id }; - debug!( - "detach page {}: can't clean, there are other references", - page_id - ); - return Err(CacheError::ActiveRefs); - } + entry_mut.page.clear_loaded(); + debug!("cleaning up page {}", entry_mut.page.get().id); + let _ = entry_mut.page.get().contents.take(); } self.unlink(entry); Ok(()) @@ -950,6 +935,7 @@ mod tests { } #[test] + #[should_panic(expected = "Attempted to insert different page with same key")] fn test_resize_larger() { let mut cache = DumbLruPageCache::new(2); let _ = insert_page(&mut cache, 1); @@ -965,6 +951,7 @@ mod tests { let _ = insert_page(&mut cache, i); } assert_eq!(cache.len(), 5); + // FIXME: For now this will assert because we cannot insert a page with same id but different contents of page. 
assert!(cache.insert(create_key(4), page_with_content(4)).is_err()); cache.verify_list_integrity(); } diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 96e552b55..7d1acde7d 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -13,7 +13,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use tracing::trace; -use super::page_cache::{CacheError, DumbLruPageCache, PageCacheKey, CacheResizeResult}; +use super::page_cache::{CacheError, CacheResizeResult, DumbLruPageCache, PageCacheKey}; use super::wal::{CheckpointMode, CheckpointStatus}; pub struct PageInner { @@ -333,8 +333,15 @@ impl Pager { match page_cache.insert(page_key, page.clone()) { Ok(_) => {} Err(CacheError::Full) => return Err(LimboError::CacheFull), - Err(CacheError::KeyExists) => unreachable!("Page should not exist in cache after get() miss"), - Err(e) => return Err(LimboError::InternalError(format!("Failed to insert page into cache: {:?}", e))), + Err(CacheError::KeyExists) => { + unreachable!("Page should not exist in cache after get() miss") + } + Err(e) => { + return Err(LimboError::InternalError(format!( + "Failed to insert page into cache: {:?}", + e + ))) + } } return Ok(page); } @@ -348,8 +355,15 @@ impl Pager { match page_cache.insert(page_key, page.clone()) { Ok(_) => {} Err(CacheError::Full) => return Err(LimboError::CacheFull), - Err(CacheError::KeyExists) => unreachable!("Page should not exist in cache after get() miss"), - Err(e) => return Err(LimboError::InternalError(format!("Failed to insert page into cache: {:?}", e))), + Err(CacheError::KeyExists) => { + unreachable!("Page should not exist in cache after get() miss") + } + Err(e) => { + return Err(LimboError::InternalError(format!( + "Failed to insert page into cache: {:?}", + e + ))) + } } Ok(page) } @@ -373,9 +387,14 @@ impl Pager { page.set_uptodate(); } match page_cache.insert(page_key, page.clone()) { - Err(CacheError::KeyExists) => {}, // Exists but same page, not error + Err(CacheError::KeyExists) => {} // Exists but same page, not error Err(CacheError::Full) => return Err(LimboError::CacheFull), - Err(e) => return Err(LimboError::InternalError(format!("Failed to insert page into cache during load: {:?}", e))), + Err(e) => { + return Err(LimboError::InternalError(format!( + "Failed to insert page into cache during load: {:?}", + e + ))) + } Ok(_) => {} } return Ok(()); @@ -383,10 +402,15 @@ impl Pager { } match page_cache.insert(page_key, page.clone()) { - Err(CacheError::KeyExists) => {}, // Ensures same page - Err(CacheError::Full) => return Err(LimboError::CacheFull), - Err(e) => return Err(LimboError::InternalError(format!("Failed to insert page into cache during load: {:?}", e))), - Ok(_) => {} + Err(CacheError::KeyExists) => {} // Ensures same page + Err(CacheError::Full) => return Err(LimboError::CacheFull), + Err(e) => { + return Err(LimboError::InternalError(format!( + "Failed to insert page into cache during load: {:?}", + e + ))) + } + Ok(_) => {} }; sqlite3_ondisk::begin_read_page( @@ -440,8 +464,8 @@ impl Pager { for page_id in self.dirty_pages.borrow().iter() { let mut cache = self.page_cache.write(); let page_key = PageCacheKey::new(*page_id, Some(max_frame)); + let page = cache.get(&page_key).expect("we somehow added a page to dirty list but we didn't mark it as dirty, causing cache to drop it."); if let Some(wal) = &self.wal { - let page = cache.get(&page_key).expect("we somehow added a page to dirty list but we didn't mark it as dirty, causing cache to drop it."); let page_type = 
page.get().contents.as_ref().unwrap().maybe_page_type(); trace!("cacheflush(page={}, page_type={:?}", page_id, page_type); wal.borrow_mut().append_frame( @@ -450,6 +474,7 @@ impl Pager { self.flush_info.borrow().in_flight_writes.clone(), )?; } + page.clear_dirty(); // This page is no longer valid. // For example: // We took page with key (page_num, max_frame) -- this page is no longer valid for that max_frame @@ -579,7 +604,10 @@ impl Pager { } } // TODO: only clear cache of things that are really invalidated - self.page_cache.write().clear().expect("Failed to clear page cache"); + self.page_cache + .write() + .clear() + .expect("Failed to clear page cache"); checkpoint_result } @@ -731,7 +759,8 @@ impl Pager { cache.insert(page_key, page.clone()).map_err(|e| { LimboError::InternalError(format!( - "Failed to insert loaded page {} into cache: {:?}", id, e + "Failed to insert loaded page {} into cache: {:?}", + id, e )) })?; @@ -807,7 +836,7 @@ mod tests { std::thread::spawn(move || { let mut cache = cache.write(); let page_key = PageCacheKey::new(1, None); - cache.insert(page_key, Arc::new(Page::new(1))); + cache.insert(page_key, Arc::new(Page::new(1))).unwrap(); }) }; let _ = thread.join(); diff --git a/core/translate/pragma.rs b/core/translate/pragma.rs index 4e49a55d1..2de4ce996 100644 --- a/core/translate/pragma.rs +++ b/core/translate/pragma.rs @@ -305,5 +305,7 @@ fn update_cache_size(value: i64, header: Arc>, pager: R pager.write_database_header(&header_copy); // update cache size - pager.change_page_cache_size(cache_size); + pager + .change_page_cache_size(cache_size) + .expect("couldn't update page cache size"); } From 04323f95a57d034830725f9671407fe8e2407080 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 19 May 2025 12:03:53 +0200 Subject: [PATCH 09/22] increase cache size in empty_btree --- core/storage/btree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 88897f4c2..07fb65495 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -6171,7 +6171,7 @@ mod tests { let wal_file = WalFile::new(io.clone(), page_size, wal_shared, buffer_pool.clone()); let wal = Rc::new(RefCell::new(wal_file)); - let page_cache = Arc::new(parking_lot::RwLock::new(DumbLruPageCache::new(10))); + let page_cache = Arc::new(parking_lot::RwLock::new(DumbLruPageCache::new(100))); let pager = { let db_header = Arc::new(SpinLock::new(db_header.clone())); Pager::finish_open(db_header, db_file, Some(wal), io, page_cache, buffer_pool).unwrap() From 9677997c63dba58a11440dc6a8c0f76762351960 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 19 May 2025 12:04:30 +0200 Subject: [PATCH 10/22] fix page cache fuzz to test whether a key is in the cache, we must use peek without touching the value in order to not promote and change the order of values in lru cache --- core/storage/page_cache.rs | 90 +++++++++++++++++++++++++++++++------- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 651637c70..566c8063a 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -268,8 +268,35 @@ impl DumbLruPageCache { } pub fn print(&mut self) { - println!("page_cache={}", self.map.borrow().len()); - println!("page_cache={:?}", self.map.borrow()) + println!("page_cache_len={}", self.map.borrow().len()); + let head_ptr = *self.head.borrow(); + let mut current = head_ptr; + let mut last_ptr: Option> = None; + while let Some(node) = current { + unsafe { 
+ println!("page={:?}", node.as_ref().key); + let node_ref = node.as_ref(); + last_ptr = Some(node); + current = node_ref.next; + } + } + } + + #[cfg(test)] + pub fn keys(&mut self) -> Vec { + let mut this_keys = Vec::new(); + let head_ptr = *self.head.borrow(); + let mut current = head_ptr; + let mut last_ptr: Option> = None; + while let Some(node) = current { + unsafe { + this_keys.push(node.as_ref().key.clone()); + let node_ref = node.as_ref(); + last_ptr = Some(node); + current = node_ref.next; + } + } + this_keys } pub fn len(&self) -> usize { @@ -410,6 +437,7 @@ mod tests { use crate::storage::page_cache::CacheError; use crate::storage::pager::{Page, PageRef}; use crate::storage::sqlite3_ondisk::PageContent; + use std::ptr::NonNull; use std::{cell::RefCell, num::NonZeroUsize, pin::Pin, rc::Rc, sync::Arc}; use lru::LruCache; @@ -633,6 +661,7 @@ mod tests { } #[test] + #[ignore = "for now let's not track active refs"] fn test_detach_via_delete() { let mut cache = DumbLruPageCache::new(5); let key1 = create_key(1); @@ -717,6 +746,8 @@ mod tests { } #[test] + #[ignore = "for now let's not track active refs"] + fn test_detach_with_active_reference_clean() { let mut cache = DumbLruPageCache::new(5); let (key, entry) = insert_and_get_entry(&mut cache, 1); @@ -727,6 +758,7 @@ mod tests { } #[test] + #[ignore = "for now let's not track active refs"] fn test_detach_with_active_reference_no_clean() { let mut cache = DumbLruPageCache::new(5); let (key, entry) = insert_and_get_entry(&mut cache, 1); @@ -818,7 +850,12 @@ mod tests { let mut lru = LruCache::new(NonZeroUsize::new(10).unwrap()); for _ in 0..10000 { - match rng.next_u64() % 3 { + dbg!(&lru); + cache.print(); + for (key, page) in &lru { + println!("lru_page={:?}", key); + } + match rng.next_u64() % 2 { 0 => { // add let id_page = rng.next_u64() % max_pages; @@ -826,10 +863,10 @@ mod tests { let key = PageCacheKey::new(id_page as usize, Some(id_frame)); #[allow(clippy::arc_with_non_send_sync)] let page = Arc::new(Page::new(id_page as usize)); - if let Some(_) = cache.get(&key) { + if let Some(_) = cache.peek(&key, false) { continue; // skip duplicate page ids } - // println!("inserting page {:?}", key); + println!("inserting page {:?}", key); match cache.insert(key.clone(), page.clone()) { Err(CacheError::Full | CacheError::ActiveRefs) => {} // Ignore Err(err) => { @@ -846,33 +883,53 @@ mod tests { // remove let random = rng.next_u64() % 2 == 0; let key = if random || lru.is_empty() { - let id_page = rng.next_u64() % max_pages; + let id_page: u64 = rng.next_u64() % max_pages; let id_frame = rng.next_u64() % max_pages; let key = PageCacheKey::new(id_page as usize, Some(id_frame)); key } else { let i = rng.next_u64() as usize % lru.len(); - let key = lru.iter().skip(i).next().unwrap().0.clone(); + let key: PageCacheKey = lru.iter().skip(i).next().unwrap().0.clone(); key }; - // println!("removing page {:?}", key); + println!("removing page {:?}", key); lru.pop(&key); assert!(cache.delete(key).is_ok()); } - 2 => { - // test contents - for (key, page) in &lru { - // println!("getting page {:?}", key); - cache.peek(&key, false).unwrap(); - assert_eq!(page.get().id, key.pgno); - } - } _ => unreachable!(), } + compare_to_lru(&mut cache, &lru); + cache.print(); + for (key, page) in &lru { + println!("lru_page={:?}", key); + } + cache.verify_list_integrity(); + for (key, page) in &lru { + println!("getting page {:?}", key); + cache.peek(&key, false).unwrap(); + assert_eq!(page.get().id, key.pgno); + } } assert!(lru.len() > 0); // Check it 
inserted some } + pub fn compare_to_lru(cache: &mut DumbLruPageCache, lru: &LruCache) { + use lru::LruCache; + + let this_keys = cache.keys(); + let mut lru_keys = Vec::new(); + for (lru_key, _) in lru { + lru_keys.push(lru_key.clone()); + } + if this_keys != lru_keys { + cache.print(); + for (lru_key, _) in lru { + println!("lru_page={:?}", lru_key); + } + assert_eq!(&this_keys, &lru_keys) + } + } + #[test] fn test_page_cache_insert_and_get() { let mut cache = DumbLruPageCache::new(2); @@ -957,6 +1014,7 @@ mod tests { } #[test] + #[ignore = "for now let's not track active refs"] fn test_resize_with_active_references() { let mut cache = DumbLruPageCache::new(5); let page1 = page_with_content(1); From 35e2088b7ea9e7c299bcb29b6fa6ae98fd2ba566 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 19 May 2025 15:00:32 +0200 Subject: [PATCH 11/22] cacheflush move dirty page to new snapshot After inserting a page into the wal, we dispose of the modified page. This is unnecessary as we can simply move new page to the newest snapshot where this page can be read. --- core/storage/pager.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 7d1acde7d..f369c9d57 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -461,6 +461,9 @@ impl Pager { Some(wal) => wal.borrow().get_max_frame(), None => 0, }; + let max_frame_after_append = self.wal.as_ref().map(|wal| { + wal.borrow().get_max_frame() + self.dirty_pages.borrow().len() as u64 + }); for page_id in self.dirty_pages.borrow().iter() { let mut cache = self.page_cache.write(); let page_key = PageCacheKey::new(*page_id, Some(max_frame)); @@ -473,6 +476,17 @@ impl Pager { db_size, self.flush_info.borrow().in_flight_writes.clone(), )?; + // Assuming writer will always end append frames at frameid == this_transaction.max_frame + dirty_pages, + // we can insert this page with a new key into that new "snapshot" that has the newest max frame. We don't + // simply clone the page and insert with new key because next cache.delete will invalidate the contents of the page, + // therefore we need to clone the contents itself and place them on the new page. Cloning contents should be fast because + // buffer is wrapped around an Arc. + let new_page = Page::new(*page_id); + new_page.get().contents = Some(page.get_contents().clone()); + new_page.set_loaded(); + let new_page: Arc = Arc::new(new_page); + let new_page_key = PageCacheKey::new(*page_id, max_frame_after_append); + cache.insert(new_page_key, new_page).map_err(|e| {LimboError::InternalError(format!("Failed to delete page {:?} from cache during flush: {:?}. Might be actively referenced.", page_id, e))})?; } page.clear_dirty(); // This page is no longer valid. From adf72f2bf8475a7eef9cff5191fdfc4bbf9d65d2 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 19 May 2025 15:02:38 +0200 Subject: [PATCH 12/22] allow updating a page id in page cache --- core/storage/btree.rs | 4 +-- core/storage/page_cache.rs | 67 +++++++++++++++++++++++++------------- core/storage/pager.rs | 30 +++++++---------- 3 files changed, 59 insertions(+), 42 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 07fb65495..5958b1539 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2814,8 +2814,8 @@ impl BTreeCursor { let page = page.as_ref().unwrap(); if *new_id != page.get().id { page.get().id = *new_id; - // FIXME: why would there be another page at this id/frame? 
- self.pager.put_loaded_page(*new_id, page.clone())?; + self.pager + .update_dirty_loaded_page_in_cache(*new_id, page.clone())?; } } diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 566c8063a..1626119a0 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -71,14 +71,33 @@ impl DumbLruPageCache { } pub fn insert(&mut self, key: PageCacheKey, value: PageRef) -> Result<(), CacheError> { + self._insert(key, value, false) + } + + pub fn insert_ignore_existing( + &mut self, + key: PageCacheKey, + value: PageRef, + ) -> Result<(), CacheError> { + self._insert(key, value, true) + } + + pub fn _insert( + &mut self, + key: PageCacheKey, + value: PageRef, + ignore_exists: bool, + ) -> Result<(), CacheError> { trace!("insert(key={:?})", key); // Check first if page already exists in cache - if let Some(existing_page_ref) = self.get(&key) { - assert!( - Arc::ptr_eq(&value, &existing_page_ref), - "Attempted to insert different page with same key" - ); - return Err(CacheError::KeyExists); + if !ignore_exists { + if let Some(existing_page_ref) = self.get(&key) { + assert!( + Arc::ptr_eq(&value, &existing_page_ref), + "Attempted to insert different page with same key" + ); + return Err(CacheError::KeyExists); + } } self.make_room_for(1)?; let entry = Box::new(PageCacheEntry { @@ -268,15 +287,13 @@ impl DumbLruPageCache { } pub fn print(&mut self) { - println!("page_cache_len={}", self.map.borrow().len()); + tracing::debug!("page_cache_len={}", self.map.borrow().len()); let head_ptr = *self.head.borrow(); let mut current = head_ptr; - let mut last_ptr: Option> = None; while let Some(node) = current { unsafe { - println!("page={:?}", node.as_ref().key); + tracing::debug!("page={:?}", node.as_ref().key); let node_ref = node.as_ref(); - last_ptr = Some(node); current = node_ref.next; } } @@ -287,12 +304,10 @@ impl DumbLruPageCache { let mut this_keys = Vec::new(); let head_ptr = *self.head.borrow(); let mut current = head_ptr; - let mut last_ptr: Option> = None; while let Some(node) = current { unsafe { this_keys.push(node.as_ref().key.clone()); let node_ref = node.as_ref(); - last_ptr = Some(node); current = node_ref.next; } } @@ -428,6 +443,17 @@ impl DumbLruPageCache { "Head pointer mismatch after backward traversal" ); } + + #[cfg(test)] + /// For testing purposes, in case we use cursor api directly, we might want to unmark pages as dirty because we bypass the WAL transaction layer + pub fn unset_dirty_all_pages(&mut self) { + for (_, page) in self.map.borrow_mut().iter_mut() { + unsafe { + let entry = page.as_mut(); + entry.page.clear_dirty() + }; + } + } } #[cfg(test)] @@ -850,10 +876,9 @@ mod tests { let mut lru = LruCache::new(NonZeroUsize::new(10).unwrap()); for _ in 0..10000 { - dbg!(&lru); cache.print(); - for (key, page) in &lru { - println!("lru_page={:?}", key); + for (key, _) in &lru { + tracing::debug!("lru_page={:?}", key); } match rng.next_u64() % 2 { 0 => { @@ -866,7 +891,7 @@ mod tests { if let Some(_) = cache.peek(&key, false) { continue; // skip duplicate page ids } - println!("inserting page {:?}", key); + tracing::debug!("inserting page {:?}", key); match cache.insert(key.clone(), page.clone()) { Err(CacheError::Full | CacheError::ActiveRefs) => {} // Ignore Err(err) => { @@ -892,7 +917,7 @@ mod tests { let key: PageCacheKey = lru.iter().skip(i).next().unwrap().0.clone(); key }; - println!("removing page {:?}", key); + tracing::debug!("removing page {:?}", key); lru.pop(&key); assert!(cache.delete(key).is_ok()); } @@ -900,8 +925,8 @@ mod 
tests { } compare_to_lru(&mut cache, &lru); cache.print(); - for (key, page) in &lru { - println!("lru_page={:?}", key); + for (key, _) in &lru { + tracing::debug!("lru_page={:?}", key); } cache.verify_list_integrity(); for (key, page) in &lru { @@ -914,8 +939,6 @@ mod tests { } pub fn compare_to_lru(cache: &mut DumbLruPageCache, lru: &LruCache) { - use lru::LruCache; - let this_keys = cache.keys(); let mut lru_keys = Vec::new(); for (lru_key, _) in lru { @@ -924,7 +947,7 @@ mod tests { if this_keys != lru_keys { cache.print(); for (lru_key, _) in lru { - println!("lru_page={:?}", lru_key); + tracing::debug!("lru_page={:?}", lru_key); } assert_eq!(&this_keys, &lru_keys) } diff --git a/core/storage/pager.rs b/core/storage/pager.rs index f369c9d57..0e6f718da 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -750,7 +750,11 @@ impl Pager { } } - pub fn put_loaded_page(&self, id: usize, page: PageRef) -> Result<(), LimboError> { + pub fn update_dirty_loaded_page_in_cache( + &self, + id: usize, + page: PageRef, + ) -> Result<(), LimboError> { let mut cache = self.page_cache.write(); let max_frame = match &self.wal { Some(wal) => wal.borrow().get_max_frame(), @@ -758,26 +762,16 @@ impl Pager { }; let page_key = PageCacheKey::new(id, Some(max_frame)); - // FIXME: is this correct? Why would there be another version of the page? - // Check if there's an existing page at this key and remove it - // This can happen when a page is being updated during a transaction - // or when WAL frames are being managed - if let Some(_existing_page_ref) = cache.get(&page_key) { - cache.delete(page_key.clone()).map_err(|e| { + // FIXME: use specific page key for writer instead of max frame, this will make readers not conflict + assert!(page.is_dirty()); + cache + .insert_ignore_existing(page_key, page.clone()) + .map_err(|e| { LimboError::InternalError(format!( - "put_loaded_page failed to remove old version of page {:?}: {:?}.", - page_key, e + "Failed to insert loaded page {} into cache: {:?}", + id, e )) })?; - } - - cache.insert(page_key, page.clone()).map_err(|e| { - LimboError::InternalError(format!( - "Failed to insert loaded page {} into cache: {:?}", - id, e - )) - })?; - page.set_loaded(); Ok(()) } From 15d24bd818c803412e2d7b77fd887ec65fd65fc0 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 19 May 2025 15:05:17 +0200 Subject: [PATCH 13/22] Start transactions in fuzz tests to flush pages Previously, fuzz tests increase the size of page cache indefinitely, therefore the was no problem of reaching the capacity of a page cache. By adding transactions to fuzz tests we allow pages to remove dirty flags once insert is finished. 
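For context, the transaction wrapping this patch adds to the fuzz tests looks roughly like the sketch below. It is not part of the patch itself; it only restates the pattern the diff applies, assuming the pager API visible in the diff (begin_write_tx/end_tx returning a CheckpointStatus, and io.run_once to drive pending I/O). run_insert is a hypothetical stand-in for the existing insert call in the test.

    // Sketch: each fuzz insert runs inside a write transaction so that ending
    // the transaction flushes the page to the WAL, clears its dirty flag, and
    // lets the page cache evict it instead of growing without bound.
    pager.begin_write_tx().unwrap();
    run_insert(&mut cursor, key, &record); // hypothetical helper for the insert itself
    loop {
        match pager.end_tx().unwrap() {
            crate::CheckpointStatus::Done(_) => break, // frames appended, dirty flags cleared
            crate::CheckpointStatus::IO => pager.io.run_once().unwrap(),
        }
    }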
--- core/storage/btree.rs | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5958b1539..8429192bf 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -6171,7 +6171,7 @@ mod tests { let wal_file = WalFile::new(io.clone(), page_size, wal_shared, buffer_pool.clone()); let wal = Rc::new(RefCell::new(wal_file)); - let page_cache = Arc::new(parking_lot::RwLock::new(DumbLruPageCache::new(100))); + let page_cache = Arc::new(parking_lot::RwLock::new(DumbLruPageCache::new(2000))); let pager = { let db_header = Arc::new(SpinLock::new(db_header.clone())); Pager::finish_open(db_header, db_file, Some(wal), io, page_cache, buffer_pool).unwrap() @@ -6303,6 +6303,7 @@ mod tests { tracing::info!("seed: {}", seed); for insert_id in 0..inserts { let do_validate = do_validate_btree || (insert_id % VALIDATE_INTERVAL == 0); + pager.begin_write_tx().unwrap(); let size = size(&mut rng); let key = { let result; @@ -6345,6 +6346,17 @@ mod tests { pager.deref(), ) .unwrap(); + loop { + match pager.end_tx().unwrap() { + crate::CheckpointStatus::Done(_) => break, + crate::CheckpointStatus::IO => { + pager.io.run_once().unwrap(); + } + } + } + pager.begin_read_tx().unwrap(); + // FIXME: add sorted vector instead, should be okay for small amounts of keys for now :P, too lazy to fix right now + cursor.move_to_root(); let mut valid = true; if do_validate { cursor.move_to_root(); @@ -6368,7 +6380,9 @@ mod tests { println!("btree after:\n{}", btree_after); panic!("invalid btree"); } + pager.end_read_tx().unwrap(); } + pager.begin_read_tx().unwrap(); tracing::info!( "=========== btree ===========\n{}\n\n", format_btree(pager.clone(), root_page, 0) @@ -6387,6 +6401,7 @@ mod tests { key, cursor_rowid ); } + pager.end_read_tx().unwrap(); } } @@ -6408,7 +6423,10 @@ mod tests { let mut cursor = BTreeCursor::new_table(None, pager.clone(), index_root_page); let mut keys = SortedVec::new(); tracing::info!("seed: {}", seed); - for _ in 0..inserts { + for i in 0..inserts { + tracing::info!("insert {}", i); + pager.begin_read_tx().unwrap(); + pager.begin_write_tx().unwrap(); let key = { let result; loop { @@ -6442,7 +6460,16 @@ mod tests { ) .unwrap(); cursor.move_to_root(); + loop { + match pager.end_tx().unwrap() { + crate::CheckpointStatus::Done(_) => break, + crate::CheckpointStatus::IO => { + pager.io.run_once().unwrap(); + } + } + } } + pager.begin_read_tx().unwrap(); cursor.move_to_root(); for key in keys.iter() { tracing::trace!("seeking key: {:?}", key); @@ -6459,6 +6486,7 @@ mod tests { key ); } + pager.end_read_tx().unwrap(); } } @@ -6516,7 +6544,7 @@ mod tests { #[test] pub fn btree_index_insert_fuzz_run_equal_size() { - btree_index_insert_fuzz_run(2, 1024 * 32); + btree_index_insert_fuzz_run(2, 1024); } #[test] From 35f7317724a4d1b6e8d7aaeacd7535c84b1c248b Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 19 May 2025 15:40:32 +0200 Subject: [PATCH 14/22] add default page cache --- core/lib.rs | 6 +----- core/storage/page_cache.rs | 10 +++++++++- core/vdbe/execute.rs | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/core/lib.rs b/core/lib.rs index 4e4164a35..002c0919f 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -79,8 +79,6 @@ use vdbe::{builder::QueryMode, VTabOpaqueCursor}; pub type Result = std::result::Result; pub static DATABASE_VERSION: OnceLock = OnceLock::new(); -const DEFAULT_PAGE_CACHE_SIZE_IN_PAGES: usize = 2000; - #[derive(Clone, Copy, PartialEq, Eq)] enum 
TransactionState { Write, @@ -169,9 +167,7 @@ impl Database { None }; - let shared_page_cache = Arc::new(RwLock::new(DumbLruPageCache::new( - DEFAULT_PAGE_CACHE_SIZE_IN_PAGES, - ))); + let shared_page_cache = Arc::new(RwLock::new(DumbLruPageCache::default())); let schema = Arc::new(RwLock::new(Schema::new())); let db = Database { mv_store, diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 1626119a0..197fcdfd2 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -5,6 +5,8 @@ use tracing::{debug, trace}; use super::pager::PageRef; +const DEFAULT_PAGE_CACHE_SIZE_IN_PAGES: usize = 2000; + // In limbo, page cache is shared by default, meaning that multiple frames from WAL can reside in // the cache, meaning, we need a way to differentiate between pages cached in different // connections. For this we include the max_frame that a connection will read from so that if two @@ -327,7 +329,7 @@ impl DumbLruPageCache { fn verify_list_integrity(&self) { let map_len = self.map.borrow().len(); let head_ptr = *self.head.borrow(); - let tail_ptr = *self.tail.borrow(); + let tail_ptr: Option> = *self.tail.borrow(); if map_len == 0 { assert!(head_ptr.is_none(), "Head should be None when map is empty"); @@ -456,6 +458,12 @@ impl DumbLruPageCache { } } +impl Default for DumbLruPageCache { + fn default() -> Self { + DumbLruPageCache::new(DEFAULT_PAGE_CACHE_SIZE_IN_PAGES) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/core/vdbe/execute.rs b/core/vdbe/execute.rs index e1dd45db9..ad0f9714a 100644 --- a/core/vdbe/execute.rs +++ b/core/vdbe/execute.rs @@ -4692,7 +4692,7 @@ pub fn op_open_ephemeral( let db_header = Pager::begin_open(db_file.clone())?; let buffer_pool = Rc::new(BufferPool::new(db_header.lock().get_page_size() as usize)); - let page_cache = Arc::new(RwLock::new(DumbLruPageCache::new(10))); + let page_cache = Arc::new(RwLock::new(DumbLruPageCache::default())); let pager = Rc::new(Pager::finish_open( db_header, From 591c674e86bfb2275303b9541199aa8fc127cae7 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 20 May 2025 15:44:18 +0200 Subject: [PATCH 15/22] Introduce PageRef wrapper `BTreePage`. One problem we have with PageRef, is that this Page reference can be unloaded, this means if we read the page again instead of loading the page onto the same reference, we will have split brain of references. To solve this we wrap PageRef in `BTreePage` so that if a page is seen as unloaded, we will replace BTreePage::page with the newest version of the page. 
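The shape of the wrapper is roughly as follows; this is a sketch distilled from the diff that follows, not the authoritative definition, and the get() accessor shown here is an assumption about how the inner reference is handed out (a clone of the currently held PageRef).

    // Sketch: the inner PageRef lives behind a RefCell so callers can swap in a
    // freshly read page when the old reference turns out to be unloaded,
    // avoiding a "split brain" between stale and fresh references.
    pub struct BTreePageInner {
        pub page: RefCell<PageRef>,
    }
    pub type BTreePage = Arc<BTreePageInner>;

    impl BTreePageInner {
        // Assumed accessor: hand out whatever PageRef is currently held.
        pub fn get(&self) -> PageRef {
            self.page.borrow().clone()
        }
    }

    // Usage, as in return_if_locked_maybe_load!: if the page is no longer
    // loaded, read it again and replace the stale reference in place.
    //
    //     if !btree_page.get().is_loaded() {
    //         let fresh = pager.read_page(btree_page.get().get().id)?;
    //         btree_page.page.replace(fresh);
    //         return Ok(CursorResult::IO);
    //     }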
--- core/lib.rs | 6 +- core/storage/btree.rs | 497 +++++++++++++++++++++++++++--------------- core/storage/pager.rs | 70 +----- 3 files changed, 333 insertions(+), 240 deletions(-) diff --git a/core/lib.rs b/core/lib.rs index 002c0919f..37a6a5020 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -56,7 +56,7 @@ use std::{ rc::Rc, sync::{Arc, OnceLock}, }; -use storage::btree::btree_init_page; +use storage::btree::{btree_init_page, BTreePageInner}; #[cfg(feature = "fs")] use storage::database::DatabaseFile; pub use storage::{ @@ -271,6 +271,9 @@ pub fn maybe_init_database_file(file: &Arc, io: &Arc) -> Resul &Rc::new(BufferPool::new(db_header.get_page_size() as usize)), DATABASE_HEADER_SIZE, ); + let page1 = Arc::new(BTreePageInner { + page: RefCell::new(page1), + }); { // Create the sqlite_schema table, for this we just need to create the btree page // for the first page of the database which is basically like any other btree page @@ -283,6 +286,7 @@ pub fn maybe_init_database_file(file: &Arc, io: &Arc) -> Resul (db_header.get_page_size() - db_header.reserved_space as u32) as u16, ); + let page1 = page1.get(); let contents = page1.get().contents.as_mut().unwrap(); contents.write_database_header(&db_header); // write the first page to disk synchronously diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 8429192bf..9b8c2879d 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -25,6 +25,7 @@ use std::{ cmp::Ordering, pin::Pin, rc::Rc, + sync::Arc, }; use super::{ @@ -117,17 +118,25 @@ macro_rules! debug_validate_cells { } /// Check if the page is unlocked, if not return IO. If the page is not locked but not loaded, then try to load it. macro_rules! return_if_locked_maybe_load { - ($pager:expr, $expr:expr) => {{ - if $expr.is_locked() { + ($pager:expr, $btree_page:expr) => {{ + if $btree_page.get().is_locked() { return Ok(CursorResult::IO); } - if !$expr.is_loaded() { - $pager.load_page($expr.clone())?; + if !$btree_page.get().is_loaded() { + let page = $pager.read_page($btree_page.get().get().id)?; + $btree_page.page.replace(page); return Ok(CursorResult::IO); } }}; } +/// Wrapper around a page reference used in order to update the reference in case page was unloaded +/// and we need to update the reference. +pub struct BTreePageInner { + pub page: RefCell, +} + +pub type BTreePage = Arc; /// State machine of destroy operations /// Keep track of traversal so that it can be resumed when IO is encountered #[derive(Debug, Clone)] @@ -194,7 +203,7 @@ enum ReadPayloadOverflow { payload: Vec, next_page: u32, remaining_to_read: usize, - page: PageRef, + page: BTreePage, }, } @@ -210,7 +219,7 @@ enum PayloadOverflowWithOffset { ProcessPage { next_page: u32, remaining_to_read: u32, - page: PageRef, + page: BTreePage, current_offset: usize, buffer_offset: usize, is_write: bool, @@ -271,7 +280,7 @@ impl BTreeKey<'_> { #[derive(Clone)] struct BalanceInfo { /// Old pages being balanced. We can have maximum 3 pages being balanced at the same time. - pages_to_balance: [Option; 3], + pages_to_balance: [Option; 3], /// Bookkeeping of the rightmost pointer so the offset::BTREE_RIGHTMOST_PTR can be updated. rightmost_pointer: *mut u8, /// Divider cells of old pages. We can have maximum 2 divider cells because of 3 pages. 
@@ -542,11 +551,8 @@ impl BTreeCursor { } let cell_idx = cell_idx as usize; - return_if_locked!(page); - if !page.is_loaded() { - self.pager.load_page(page.clone())?; - return Ok(CursorResult::IO); - } + return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); @@ -557,7 +563,7 @@ impl BTreeCursor { let rightmost_pointer = contents.rightmost_pointer(); if let Some(rightmost_pointer) = rightmost_pointer { self.stack - .push_backwards(self.pager.read_page(rightmost_pointer as usize)?); + .push_backwards(self.read_page(rightmost_pointer as usize)?); continue; } } @@ -580,7 +586,7 @@ impl BTreeCursor { _left_child_page, _rowid, }) => { - let mem_page = self.pager.read_page(_left_child_page as usize)?; + let mem_page = self.read_page(_left_child_page as usize)?; self.stack.push_backwards(mem_page); continue; } @@ -618,7 +624,7 @@ impl BTreeCursor { // left child has: key 663, key 664, key 665 // we need to move to the previous parent (with e.g. key 662) when iterating backwards. self.stack.retreat(); - let mem_page = self.pager.read_page(left_child_page as usize)?; + let mem_page = self.read_page(left_child_page as usize)?; self.stack.push(mem_page); // use cell_index = i32::MAX to tell next loop to go to the end of the current page self.stack.set_cell_index(i32::MAX); @@ -743,7 +749,7 @@ impl BTreeCursor { let res = match &mut self.state { CursorState::None => { tracing::debug!("start reading overflow page payload_size={}", payload_size); - let page = self.pager.read_page(start_next_page as usize)?; + let page = self.read_page(start_next_page as usize)?; self.state = CursorState::Read(ReadPayloadOverflow::ProcessPage { payload: payload.to_vec(), next_page: start_next_page, @@ -756,12 +762,13 @@ impl BTreeCursor { payload, next_page, remaining_to_read, - page, + page: page_btree, }) => { - if page.is_locked() { + if page_btree.get().is_locked() { return Ok(CursorResult::IO); } tracing::debug!("reading overflow page {} {}", next_page, remaining_to_read); + let page = page_btree.get(); let contents = page.get_contents(); // The first four bytes of each overflow page are a big-endian integer which is the page number of the next page in the chain, or zero for the final page in the chain. 
let next = contents.read_u32_no_offset(0); @@ -779,8 +786,12 @@ impl BTreeCursor { std::mem::swap(payload, &mut payload_swap); CursorResult::Ok(payload_swap) } else { - let new_page = self.pager.read_page(next as usize)?; - *page = new_page; + let new_page = self.pager.read_page(next as usize).map(|page| { + Arc::new(BTreePageInner { + page: RefCell::new(page), + }) + })?; + *page_btree = new_page; *next_page = next; CursorResult::IO } @@ -871,9 +882,10 @@ impl BTreeCursor { return self.continue_payload_overflow_with_offset(buffer, self.usable_space()); } - let page = self.stack.top(); - return_if_locked_maybe_load!(self.pager, page); + let page_btree = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page_btree); + let page = page_btree.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_idx = self.stack.current_cell_index() as usize - 1; @@ -918,7 +930,13 @@ impl BTreeCursor { local_amount = local_size as u32 - offset; } if is_write { - self.write_payload_to_page(offset, local_amount, payload, buffer, page.clone()); + self.write_payload_to_page( + offset, + local_amount, + payload, + buffer, + page_btree.clone(), + ); } else { self.read_payload_from_page(offset, local_amount, payload, buffer); } @@ -973,7 +991,7 @@ impl BTreeCursor { is_write, }) => { if *pages_left_to_skip == 0 { - let page = self.pager.read_page(*next_page as usize)?; + let page = self.read_page(*next_page as usize)?; return_if_locked_maybe_load!(self.pager, page); self.state = CursorState::ReadWritePayload(PayloadOverflowWithOffset::ProcessPage { @@ -988,8 +1006,9 @@ impl BTreeCursor { continue; } - let page = self.pager.read_page(*next_page as usize)?; + let page = self.read_page(*next_page as usize)?; return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); let contents = page.get_contents(); let next = contents.read_u32_no_offset(0); @@ -1018,17 +1037,17 @@ impl BTreeCursor { CursorState::ReadWritePayload(PayloadOverflowWithOffset::ProcessPage { next_page, remaining_to_read, - page, + page: page_btree, current_offset, buffer_offset, is_write, }) => { - if page.is_locked() { + if page_btree.get().is_locked() { self.state = CursorState::ReadWritePayload(PayloadOverflowWithOffset::ProcessPage { next_page: *next_page, remaining_to_read: *remaining_to_read, - page: page.clone(), + page: page_btree.clone(), current_offset: *current_offset, buffer_offset: *buffer_offset, is_write: *is_write, @@ -1037,6 +1056,7 @@ impl BTreeCursor { return Ok(CursorResult::IO); } + let page = page_btree.get(); let contents = page.get_contents(); let overflow_size = usable_space - 4; @@ -1054,7 +1074,7 @@ impl BTreeCursor { bytes_to_process, page_payload, buffer, - page.clone(), + page_btree.clone(), ); } else { self.read_payload_from_page( @@ -1081,7 +1101,7 @@ impl BTreeCursor { // Load next page *next_page = next; *current_offset = 0; // Reset offset for new page - *page = self.pager.read_page(next as usize)?; + *page_btree = self.read_page(next as usize)?; // Return IO to allow other operations return Ok(CursorResult::IO); @@ -1116,10 +1136,10 @@ impl BTreeCursor { num_bytes: u32, payload: &[u8], buffer: &mut Vec, - page: PageRef, + page: BTreePage, ) { - page.set_dirty(); - self.pager.add_dirty(page.get().id); + page.get().set_dirty(); + self.pager.add_dirty(page.get().get().id); // SAFETY: This is safe as long as the page is not evicted from the cache. 
let payload_mut = unsafe { std::slice::from_raw_parts_mut(payload.as_ptr() as *mut u8, payload.len()) }; @@ -1155,15 +1175,15 @@ impl BTreeCursor { let mem_page_rc = self.stack.top(); let cell_idx = self.stack.current_cell_index() as usize; - tracing::trace!("current id={} cell={}", mem_page_rc.get().id, cell_idx); - return_if_locked!(mem_page_rc); - if !mem_page_rc.is_loaded() { - self.pager.load_page(mem_page_rc.clone())?; - return Ok(CursorResult::IO); - } + tracing::trace!( + "current id={} cell={}", + mem_page_rc.get().get().id, + cell_idx + ); + return_if_locked_maybe_load!(self.pager, mem_page_rc); let mem_page = mem_page_rc.get(); - let contents = mem_page.contents.as_ref().unwrap(); + let contents = mem_page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); if cell_count == 0 || cell_idx == cell_count { @@ -1172,7 +1192,7 @@ impl BTreeCursor { match contents.rightmost_pointer() { Some(right_most_pointer) => { self.stack.advance(); - let mem_page = self.pager.read_page(right_most_pointer as usize)?; + let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); continue; } @@ -1216,7 +1236,7 @@ impl BTreeCursor { }) => { assert!(predicate.is_none()); self.stack.advance(); - let mem_page = self.pager.read_page(*_left_child_page as usize)?; + let mem_page = self.read_page(*_left_child_page as usize)?; self.stack.push(mem_page); continue; } @@ -1251,7 +1271,7 @@ impl BTreeCursor { payload_size, }) => { if !self.going_upwards { - let mem_page = self.pager.read_page(*left_child_page as usize)?; + let mem_page = self.read_page(*left_child_page as usize)?; self.stack.push(mem_page); continue; } @@ -1387,7 +1407,7 @@ impl BTreeCursor { /// Move the cursor to the root page of the btree. fn move_to_root(&mut self) { tracing::trace!("move_to_root({})", self.root_page); - let mem_page = self.pager.read_page(self.root_page).unwrap(); + let mem_page = self.read_page(self.root_page).unwrap(); self.stack.clear(); self.stack.push(mem_page); } @@ -1398,9 +1418,10 @@ impl BTreeCursor { loop { let mem_page = self.stack.top(); - let page_idx = mem_page.get().id; - let page = self.pager.read_page(page_idx)?; - return_if_locked!(page); + let page_idx = mem_page.get().get().id; + let page = self.read_page(page_idx)?; + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { if contents.cell_count() > 0 { @@ -1412,7 +1433,7 @@ impl BTreeCursor { match contents.rightmost_pointer() { Some(right_most_pointer) => { self.stack.set_cell_index(contents.cell_count() as i32 + 1); - let mem_page = self.pager.read_page(right_most_pointer as usize)?; + let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); continue; } @@ -1429,7 +1450,8 @@ impl BTreeCursor { let iter_dir = seek_op.iteration_direction(); 'outer: loop { let page = self.stack.top(); - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { return Ok(CursorResult::Ok(())); @@ -1457,14 +1479,14 @@ impl BTreeCursor { -1 + (iter_dir == IterationDirection::Forwards) as i32 * 2; self.stack .set_cell_index(leftmost_matching_cell as i32 + index_change); - let mem_page = self.pager.read_page(left_child_page as usize)?; + let mem_page = self.read_page(left_child_page as usize)?; self.stack.push(mem_page); continue 'outer; } self.stack.set_cell_index(cell_count as i32 + 1); match 
contents.rightmost_pointer() { Some(right_most_pointer) => { - let mem_page = self.pager.read_page(right_most_pointer as usize)?; + let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); continue 'outer; } @@ -1524,7 +1546,8 @@ impl BTreeCursor { let iter_dir = cmp.iteration_direction(); 'outer: loop { let page = self.stack.top(); - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { return Ok(CursorResult::Ok(())); @@ -1540,7 +1563,7 @@ impl BTreeCursor { self.stack.set_cell_index(contents.cell_count() as i32 + 1); match contents.rightmost_pointer() { Some(right_most_pointer) => { - let mem_page = self.pager.read_page(right_most_pointer as usize)?; + let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); continue 'outer; } @@ -1580,7 +1603,7 @@ impl BTreeCursor { unreachable!("unexpected cell type: {:?}", matching_cell); }; - let mem_page = self.pager.read_page(*left_child_page as usize)?; + let mem_page = self.read_page(*left_child_page as usize)?; self.stack.push(mem_page); continue 'outer; } @@ -1686,7 +1709,8 @@ impl BTreeCursor { self.move_to_root(); return_if_io!(self.tablebtree_move_to(rowid, seek_op)); let page = self.stack.top(); - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); assert!( contents.is_leaf(), @@ -1837,8 +1861,9 @@ impl BTreeCursor { return_if_io!(self.indexbtree_move_to(key, seek_op)); let page = self.stack.top(); - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); @@ -2070,7 +2095,8 @@ impl BTreeCursor { // get page and find cell let (cell_idx, page_type) = { - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); page.set_dirty(); self.pager.add_dirty(page.get().id); @@ -2088,8 +2114,8 @@ impl BTreeCursor { // if the cell index is less than the total cells, check: if its an existing // rowid, we are going to update / overwrite the cell - if cell_idx < page.get_contents().cell_count() { - match page.get_contents().cell_get( + if cell_idx < page.get().get_contents().cell_count() { + match page.get().get_contents().cell_get( cell_idx, payload_overflow_threshold_max(page_type, self.usable_space() as u16), payload_overflow_threshold_min(page_type, self.usable_space() as u16), @@ -2147,6 +2173,7 @@ impl BTreeCursor { // insert let overflow = { + let page = page.get(); let contents = page.get().contents.as_mut().unwrap(); tracing::debug!( "insert_into_page(overflow, cell_count={})", @@ -2220,6 +2247,7 @@ impl BTreeCursor { // is less than 2/3rds of the total usable space on the page // // https://github.com/sqlite/sqlite/blob/0aa95099f5003dc99f599ab77ac0004950b281ef/src/btree.c#L9064-L9071 + let current_page = current_page.get(); let page = current_page.get().contents.as_mut().unwrap(); let usable_space = self.usable_space(); let free_space = compute_free_space(page, usable_space as u16); @@ -2265,6 +2293,7 @@ impl BTreeCursor { WriteState::BalanceNonRoot => { let parent_page = self.stack.top(); return_if_locked_maybe_load!(self.pager, parent_page); + let parent_page = parent_page.get(); // If `move_to` moved to rightmost page, cell index will be out of bounds. Meaning cell_count+1. // In any other case, `move_to` will stay in the correct index. 
if self.stack.current_cell_index() as usize @@ -2287,7 +2316,7 @@ impl BTreeCursor { PageType::IndexInterior | PageType::TableInterior )); // Part 1: Find the sibling pages to balance - let mut pages_to_balance: [Option; 3] = [const { None }; 3]; + let mut pages_to_balance: [Option; 3] = [const { None }; 3]; let number_of_cells_in_parent = parent_contents.cell_count() + parent_contents.overflow_cells.len(); @@ -2354,11 +2383,14 @@ impl BTreeCursor { let mut pgno: u32 = unsafe { right_pointer.cast::().read().swap_bytes() }; let current_sibling = sibling_pointer; for i in (0..=current_sibling).rev() { - let page = self.pager.read_page(pgno as usize)?; + let page = self.read_page(pgno as usize)?; #[cfg(debug_assertions)] { - return_if_locked!(page); - debug_validate_cells!(&page.get_contents(), self.usable_space() as u16); + return_if_locked!(page.get()); + debug_validate_cells!( + &page.get().get_contents(), + self.usable_space() as u16 + ); } pages_to_balance[i].replace(page); assert_eq!( @@ -2399,10 +2431,13 @@ impl BTreeCursor { let page_type_of_siblings = pages_to_balance[0] .as_ref() .unwrap() + .get() .get_contents() .page_type(); for page in pages_to_balance.iter().take(sibling_count) { - let contents = page.as_ref().unwrap().get_contents(); + return_if_locked_maybe_load!(self.pager, page.as_ref().unwrap()); + let page = page.as_ref().unwrap().get(); + let contents = page.get_contents(); debug_validate_cells!(&contents, self.usable_space() as u16); assert_eq!(contents.page_type(), page_type_of_siblings); } @@ -2431,12 +2466,14 @@ impl BTreeCursor { .pages_to_balance .iter() .take(balance_info.sibling_count) - .all(|page| !page.as_ref().unwrap().is_locked()); + .all(|page| !page.as_ref().unwrap().get().is_locked()); if !all_loaded { return Ok(CursorResult::IO); } // Now do real balancing - let parent_page = self.stack.top(); + let parent_page_btree = self.stack.top(); + let parent_page = parent_page_btree.get(); + let parent_contents = parent_page.get_contents(); let parent_is_root = !self.stack.has_parent(); @@ -2448,9 +2485,10 @@ impl BTreeCursor { /* 1. 
Get divider cells and max_cells */ let mut max_cells = 0; // we only need maximum 5 pages to balance 3 pages - let mut pages_to_balance_new: [Option; 5] = [const { None }; 5]; + let mut pages_to_balance_new: [Option; 5] = [const { None }; 5]; for i in (0..balance_info.sibling_count).rev() { let sibling_page = balance_info.pages_to_balance[i].as_ref().unwrap(); + let sibling_page = sibling_page.get(); let sibling_contents = sibling_page.get_contents(); sibling_page.set_dirty(); self.pager.add_dirty(sibling_page.get().id); @@ -2511,6 +2549,7 @@ impl BTreeCursor { let page_type = balance_info.pages_to_balance[0] .as_ref() .unwrap() + .get() .get_contents() .page_type(); tracing::debug!("balance_non_root(page_type={:?})", page_type); @@ -2522,7 +2561,8 @@ impl BTreeCursor { .take(balance_info.sibling_count) .enumerate() { - let old_page_contents = old_page.as_ref().unwrap().get_contents(); + let old_page = old_page.as_ref().unwrap().get(); + let old_page_contents = old_page.get_contents(); debug_validate_cells!(&old_page_contents, self.usable_space() as u16); for cell_idx in 0..old_page_contents.cell_count() { let (cell_start, cell_len) = old_page_contents.cell_get_raw_region( @@ -2612,6 +2652,7 @@ impl BTreeCursor { for i in 0..balance_info.sibling_count { cell_array.number_of_cells_per_page[i] = count_cells_in_old_pages[i]; let page = &balance_info.pages_to_balance[i].as_ref().unwrap(); + let page = page.get(); let page_contents = page.get_contents(); let free_space = compute_free_space(page_contents, self.usable_space() as u16); @@ -2782,11 +2823,11 @@ impl BTreeCursor { for i in 0..sibling_count_new { if i < balance_info.sibling_count { let page = balance_info.pages_to_balance[i].as_ref().unwrap(); - page.set_dirty(); + page.get().set_dirty(); pages_to_balance_new[i].replace(page.clone()); } else { // FIXME: handle page cache is full - let page = self.pager.do_allocate_page(page_type, 0); + let page = self.allocate_page(page_type, 0); pages_to_balance_new[i].replace(page); // Since this page didn't exist before, we can set it to cells length as it // marks them as empty since it is a prefix sum of cells. 
@@ -2802,7 +2843,7 @@ impl BTreeCursor { .take(sibling_count_new) .enumerate() { - page_numbers[i] = page.as_ref().unwrap().get().id; + page_numbers[i] = page.as_ref().unwrap().get().get().id; } page_numbers.sort(); for (page, new_id) in pages_to_balance_new @@ -2812,10 +2853,10 @@ impl BTreeCursor { .zip(page_numbers.iter().rev().take(sibling_count_new)) { let page = page.as_ref().unwrap(); - if *new_id != page.get().id { - page.get().id = *new_id; + if *new_id != page.get().get().id { + page.get().get().id = *new_id; self.pager - .update_dirty_loaded_page_in_cache(*new_id, page.clone())?; + .update_dirty_loaded_page_in_cache(*new_id, page.get())?; } } @@ -2828,7 +2869,7 @@ impl BTreeCursor { for page in pages_to_balance_new.iter().take(sibling_count_new) { tracing::debug!( "balance_non_root(new_sibling page_id={})", - page.as_ref().unwrap().get().id + page.as_ref().unwrap().get().get().id ); } } @@ -2846,6 +2887,7 @@ impl BTreeCursor { .as_ref() .unwrap() .get() + .get() .id as u32; let rightmost_pointer = balance_info.rightmost_pointer; let rightmost_pointer = @@ -2867,11 +2909,12 @@ impl BTreeCursor { let last_page = balance_info.pages_to_balance[balance_info.sibling_count - 1] .as_ref() .unwrap(); - let right_pointer = last_page.get_contents().rightmost_pointer().unwrap(); + let right_pointer = last_page.get().get_contents().rightmost_pointer().unwrap(); let new_last_page = pages_to_balance_new[sibling_count_new - 1] .as_ref() .unwrap(); new_last_page + .get() .get_contents() .write_u32(offset::BTREE_RIGHTMOST_PTR, right_pointer); } @@ -2892,10 +2935,12 @@ impl BTreeCursor { // Interior // Make this page's rightmost pointer point to pointer of divider cell before modification let previous_pointer_divider = read_u32(÷r_cell, 0); - page.get_contents() + page.get() + .get_contents() .write_u32(offset::BTREE_RIGHTMOST_PTR, previous_pointer_divider); // divider cell now points to this page - new_divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes()); + new_divider_cell + .extend_from_slice(&(page.get().get().id as u32).to_be_bytes()); // now copy the rest of the divider cell: // Table Interior page: // * varint rowid @@ -2912,11 +2957,13 @@ impl BTreeCursor { divider_cell = &mut cell_array.cells[divider_cell_idx - 1]; let (_, n_bytes_payload) = read_varint(divider_cell)?; let (rowid, _) = read_varint(÷r_cell[n_bytes_payload..])?; - new_divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes()); + new_divider_cell + .extend_from_slice(&(page.get().get().id as u32).to_be_bytes()); write_varint_to_vec(rowid, &mut new_divider_cell); } else { // Leaf index - new_divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes()); + new_divider_cell + .extend_from_slice(&(page.get().get().id as u32).to_be_bytes()); new_divider_cell.extend_from_slice(divider_cell); } @@ -2930,7 +2977,7 @@ impl BTreeCursor { i, left_pointer ); - assert_eq!(left_pointer, page.get().id as u32); + assert_eq!(left_pointer, page.get().get().id as u32); // FIXME: remove this lock assert!( left_pointer <= self.pager.db_header.lock().database_size, @@ -2951,7 +2998,7 @@ impl BTreeCursor { balance_info, parent_contents, i, - page, + &page.get(), ); } tracing::debug!( @@ -2965,9 +3012,9 @@ impl BTreeCursor { for page in pages_to_balance_new.iter().take(sibling_count_new) { let page = page.as_ref().unwrap(); assert!( - pages_pointed_to.contains(&(page.get().id as u32)), + pages_pointed_to.contains(&(page.get().get().id as u32)), "page {} not pointed to by divider cell or rightmost pointer", - 
page.get().id + page.get().get().id ); } } @@ -3025,6 +3072,7 @@ impl BTreeCursor { ) }; let page = pages_to_balance_new[page_idx].as_ref().unwrap(); + let page = page.get(); tracing::debug!("pre_edit_page(page={})", page.get().id); let page_contents = page.get_contents(); edit_page( @@ -3049,6 +3097,7 @@ impl BTreeCursor { // TODO: vacuum support let first_child_page = pages_to_balance_new[0].as_ref().unwrap(); + let first_child_page = first_child_page.get(); let first_child_contents = first_child_page.get_contents(); if parent_is_root && parent_contents.cell_count() == 0 @@ -3105,7 +3154,7 @@ impl BTreeCursor { #[cfg(debug_assertions)] self.post_balance_non_root_validation( - &parent_page, + &parent_page_btree, balance_info, parent_contents, pages_to_balance_new, @@ -3119,7 +3168,8 @@ impl BTreeCursor { // We have to free pages that are not used anymore for i in sibling_count_new..balance_info.sibling_count { let page = balance_info.pages_to_balance[i].as_ref().unwrap(); - self.pager.free_page(Some(page.clone()), page.get().id)?; + self.pager + .free_page(Some(page.get().clone()), page.get().get().id)?; } (WriteState::BalanceStart, Ok(CursorResult::Ok(()))) } @@ -3184,10 +3234,10 @@ impl BTreeCursor { #[cfg(debug_assertions)] fn post_balance_non_root_validation( &self, - parent_page: &PageRef, + parent_page: &BTreePage, balance_info: &mut BalanceInfo, parent_contents: &mut PageContent, - pages_to_balance_new: [Option>; 5], + pages_to_balance_new: [Option; 5], page_type: PageType, leaf_data: bool, mut cells_debug: Vec>, @@ -3214,9 +3264,9 @@ impl BTreeCursor { match cell { BTreeCell::TableInteriorCell(table_interior_cell) => { let left_child_page = table_interior_cell._left_child_page; - if left_child_page == parent_page.get().id as u32 { + if left_child_page == parent_page.get().get().id as u32 { tracing::error!("balance_non_root(parent_divider_points_to_same_page, page_id={}, cell_left_child_page={})", - parent_page.get().id, + parent_page.get().get().id, left_child_page, ); valid = false; @@ -3224,9 +3274,9 @@ impl BTreeCursor { } BTreeCell::IndexInteriorCell(index_interior_cell) => { let left_child_page = index_interior_cell.left_child_page; - if left_child_page == parent_page.get().id as u32 { + if left_child_page == parent_page.get().get().id as u32 { tracing::error!("balance_non_root(parent_divider_points_to_same_page, page_id={}, cell_left_child_page={})", - parent_page.get().id, + parent_page.get().get().id, left_child_page, ); valid = false; @@ -3242,6 +3292,7 @@ impl BTreeCursor { .enumerate() { let page = page.as_ref().unwrap(); + let page = page.get(); let contents = page.get_contents(); debug_validate_cells!(contents, self.usable_space() as u16); // Cells are distributed in order @@ -3295,7 +3346,7 @@ impl BTreeCursor { ); valid = false; } - if left_child_page == parent_page.get().id as u32 { + if left_child_page == parent_page.get().get().id as u32 { tracing::error!("balance_non_root(child_page_points_parent_of_child, page_id={}, cell_left_child_page={}, page_idx={})", page.get().id, left_child_page, @@ -3314,7 +3365,7 @@ impl BTreeCursor { ); valid = false; } - if left_child_page == parent_page.get().id as u32 { + if left_child_page == parent_page.get().get().id as u32 { tracing::error!("balance_non_root(child_page_points_parent_of_child, page_id={}, cell_left_child_page={}, page_idx={})", page.get().id, left_child_page, @@ -3367,10 +3418,12 @@ impl BTreeCursor { valid = false; } - if rightmost == page.get().id as u32 || rightmost == parent_page.get().id as u32 { + if 
rightmost == page.get().id as u32 + || rightmost == parent_page.get().get().id as u32 + { tracing::error!("balance_non_root(balance_shallower_rightmost_pointer, page_id={}, parent_page_id={}, rightmost={})", page.get().id, - parent_page.get().id, + parent_page.get().get().id, rightmost, ); valid = false; @@ -3662,27 +3715,29 @@ impl BTreeCursor { let is_page_1 = { let current_root = self.stack.top(); - current_root.get().id == 1 + current_root.get().get().id == 1 }; let offset = if is_page_1 { DATABASE_HEADER_SIZE } else { 0 }; - let root = self.stack.top(); + let root_btree = self.stack.top(); + let root = root_btree.get(); let root_contents = root.get_contents(); // FIXME: handle page cache is full - let child = self.pager.do_allocate_page(root_contents.page_type(), 0); + let child_btree = self.pager.do_allocate_page(root_contents.page_type(), 0); tracing::debug!( "balance_root(root={}, rightmost={}, page_type={:?})", root.get().id, - child.get().id, + child_btree.get().get().id, root.get_contents().page_type() ); self.pager.add_dirty(root.get().id); - self.pager.add_dirty(child.get().id); + self.pager.add_dirty(child_btree.get().get().id); let root_buf = root_contents.as_ptr(); + let child = child_btree.get(); let child_contents = child.get_contents(); let child_buf = child_contents.as_ptr(); let (root_pointer_start, root_pointer_len) = @@ -3724,8 +3779,8 @@ impl BTreeCursor { root_contents.overflow_cells.clear(); self.root_page = root.get().id; self.stack.clear(); - self.stack.push(root.clone()); - self.stack.push(child.clone()); + self.stack.push(root_btree.clone()); + self.stack.push(child_btree.clone()); } fn usable_space(&self) -> usize { @@ -3790,10 +3845,11 @@ impl BTreeCursor { self.move_to_root(); loop { let mem_page = self.stack.top(); - let page_id = mem_page.get().id; - let page = self.pager.read_page(page_id)?; - return_if_locked!(page); + let page_id = mem_page.get().get().id; + let page = self.read_page(page_id)?; + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { // set cursor just past the last cell to append @@ -3804,7 +3860,7 @@ impl BTreeCursor { match contents.rightmost_pointer() { Some(right_most_pointer) => { self.stack.set_cell_index(contents.cell_count() as i32 + 1); // invalid on interior - let child = self.pager.read_page(right_most_pointer as usize)?; + let child = self.read_page(right_most_pointer as usize)?; self.stack.push(child); } None => unreachable!("interior page must have rightmost pointer"), @@ -3971,10 +4027,10 @@ impl BTreeCursor { match delete_state { DeleteState::Start => { let page = self.stack.top(); - page.set_dirty(); - self.pager.add_dirty(page.get().id); + page.get().set_dirty(); + self.pager.add_dirty(page.get().get().id); if matches!( - page.get_contents().page_type(), + page.get().get_contents().page_type(), PageType::TableLeaf | PageType::TableInterior ) { let _target_rowid = match self.has_record.get() { @@ -4010,6 +4066,7 @@ impl BTreeCursor { cell_idx -= 1; } + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if cell_idx >= contents.cell_count() { return_corrupt!(format!( @@ -4054,7 +4111,8 @@ impl BTreeCursor { return_if_io!(self.clear_overflow_pages(&cell)); let page = self.stack.top(); - let contents = page.get().contents.as_ref().unwrap(); + let page = page.get(); + let contents = page.get_contents(); let delete_info = self.state.mut_delete_info().unwrap(); if !contents.is_leaf() { @@ -4088,15 +4146,16 @@ impl 
BTreeCursor { return_if_locked_maybe_load!(self.pager, leaf_page); assert!( matches!( - leaf_page.get_contents().page_type(), + leaf_page.get().get_contents().page_type(), PageType::TableLeaf | PageType::IndexLeaf ), "self.prev should have returned a leaf page" ); let parent_page = self.stack.parent_page().unwrap(); - assert!(parent_page.is_loaded(), "parent page"); + assert!(parent_page.get().is_loaded(), "parent page"); + let leaf_page = leaf_page.get(); let leaf_contents = leaf_page.get().contents.as_ref().unwrap(); // The index of the cell to removed must be the last one. let leaf_cell_idx = leaf_contents.cell_count() - 1; @@ -4113,9 +4172,10 @@ impl BTreeCursor { self.usable_space(), )?; - parent_page.set_dirty(); - self.pager.add_dirty(parent_page.get().id); + parent_page.get().set_dirty(); + self.pager.add_dirty(parent_page.get().get().id); + let parent_page = parent_page.get(); let parent_contents = parent_page.get().contents.as_mut().unwrap(); // Create an interior cell from a predecessor @@ -4149,6 +4209,7 @@ impl BTreeCursor { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let free_space = compute_free_space(contents, self.usable_space() as u16); let needs_balancing = free_space as usize * 3 > self.usable_space() * 2; @@ -4288,8 +4349,9 @@ impl BTreeCursor { let _ = return_if_io!(self.move_to(SeekKey::TableRowId(*int_key as u64), SeekOp::EQ)); let page = self.stack.top(); // TODO(pere): request load - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); // find cell @@ -4346,9 +4408,10 @@ impl BTreeCursor { self.overflow_state = None; return Err(LimboError::Corrupt("Invalid overflow page number".into())); } - let page = self.pager.read_page(next_page as usize)?; - return_if_locked!(page); + let page = self.read_page(next_page as usize)?; + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let next = contents.read_u32(0); @@ -4420,8 +4483,9 @@ impl BTreeCursor { } DestroyState::ProcessPage => { let page = self.stack.top(); - assert!(page.is_loaded()); // page should be loaded at this time + assert!(page.get().is_loaded()); // page should be loaded at this time + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_idx = self.stack.current_cell_index(); @@ -4439,8 +4503,7 @@ impl BTreeCursor { // Non-leaf page which has processed all children but not it's potential right child (false, n) if n == contents.cell_count() as i32 => { if let Some(rightmost) = contents.rightmost_pointer() { - let rightmost_page = - self.pager.read_page(rightmost as usize)?; + let rightmost_page = self.read_page(rightmost as usize)?; self.stack.advance(); self.stack.push(rightmost_page); let destroy_info = self.state.mut_destroy_info().expect( @@ -4509,7 +4572,7 @@ impl BTreeCursor { BTreeCell::IndexInteriorCell(cell) => cell.left_child_page, _ => panic!("expected interior cell"), }; - let child_page = self.pager.read_page(child_page_id as usize)?; + let child_page = self.read_page(child_page_id as usize)?; self.stack.advance(); self.stack.push(child_page); let destroy_info = self.state.mut_destroy_info().expect( @@ -4526,9 +4589,8 @@ impl BTreeCursor { CursorResult::Ok(_) => match cell { // For an index interior cell, clear the left child page now that overflow pages have been cleared 
BTreeCell::IndexInteriorCell(index_int_cell) => { - let child_page = self - .pager - .read_page(index_int_cell.left_child_page as usize)?; + let child_page = + self.read_page(index_int_cell.left_child_page as usize)?; self.stack.advance(); self.stack.push(child_page); let destroy_info = self.state.mut_destroy_info().expect( @@ -4552,9 +4614,9 @@ impl BTreeCursor { } DestroyState::FreePage => { let page = self.stack.top(); - let page_id = page.get().id; + let page_id = page.get().get().id; - self.pager.free_page(Some(page), page_id)?; + self.pager.free_page(Some(page.get()), page_id)?; if self.stack.has_parent() { self.stack.pop(); @@ -4578,12 +4640,12 @@ impl BTreeCursor { pub fn overwrite_cell( &mut self, - page_ref: PageRef, + page_ref: BTreePage, cell_idx: usize, record: &ImmutableRecord, ) -> Result> { // build the new payload - let page_type = page_ref.get().contents.as_ref().unwrap().page_type(); + let page_type = page_ref.get().get().contents.as_ref().unwrap().page_type(); let mut new_payload = Vec::with_capacity(record.len()); let CursorHasRecord::Yes { rowid } = self.has_record.get() else { panic!("cursor should be pointing to a record"); @@ -4599,6 +4661,7 @@ impl BTreeCursor { // figure out old cell offset & size let (old_offset, old_local_size) = { + let page_ref = page_ref.get(); let page = page_ref.get().contents.as_ref().unwrap(); page.cell_get_raw_region( cell_idx, @@ -4615,12 +4678,12 @@ impl BTreeCursor { } else { // doesn't fit, drop it and insert a new one drop_cell( - page_ref.get_contents(), + page_ref.get().get_contents(), cell_idx, self.usable_space() as u16, )?; insert_into_cell( - page_ref.get_contents(), + page_ref.get().get_contents(), &new_payload, cell_idx, self.usable_space() as u16, @@ -4631,11 +4694,12 @@ impl BTreeCursor { pub fn overwrite_content( &mut self, - page_ref: PageRef, + page_ref: BTreePage, dest_offset: usize, new_payload: &[u8], ) -> Result> { - return_if_locked!(page_ref); + return_if_locked!(page_ref.get()); + let page_ref = page_ref.get(); let buf = page_ref.get().contents.as_mut().unwrap().as_ptr(); buf[dest_offset..dest_offset + new_payload.len()].copy_from_slice(&new_payload); @@ -4681,7 +4745,7 @@ impl BTreeCursor { mem_page_rc = self.stack.top(); return_if_locked_maybe_load!(self.pager, mem_page_rc); mem_page = mem_page_rc.get(); - contents = mem_page.contents.as_ref().unwrap(); + contents = mem_page.get().contents.as_ref().unwrap(); /* If this is a leaf page or the tree is not an int-key tree, then ** this page contains countable entries. Increment the entry counter @@ -4710,7 +4774,7 @@ impl BTreeCursor { mem_page_rc = self.stack.top(); return_if_locked_maybe_load!(self.pager, mem_page_rc); mem_page = mem_page_rc.get(); - contents = mem_page.contents.as_ref().unwrap(); + contents = mem_page.get().contents.as_ref().unwrap(); let cell_idx = self.stack.current_cell_index() as usize; @@ -4730,7 +4794,7 @@ impl BTreeCursor { // should be safe as contents is not a leaf page let right_most_pointer = contents.rightmost_pointer().unwrap(); self.stack.advance(); - let mem_page = self.pager.read_page(right_most_pointer as usize)?; + let mem_page = self.read_page(right_most_pointer as usize)?; self.going_upwards = false; self.stack.push(mem_page); } else { @@ -4757,7 +4821,7 @@ impl BTreeCursor { left_child_page, .. 
}) => { self.stack.advance(); - let mem_page = self.pager.read_page(left_child_page as usize)?; + let mem_page = self.read_page(left_child_page as usize)?; self.going_upwards = false; self.stack.push(mem_page); } @@ -4771,7 +4835,7 @@ impl BTreeCursor { pub fn save_context(&mut self) { if let CursorHasRecord::Yes { rowid } = self.has_record.get() { self.valid_state = CursorValidState::RequireSeek; - match self.stack.top().get_contents().page_type() { + match self.stack.top().get().get_contents().page_type() { PageType::TableInterior | PageType::TableLeaf => { self.context = Some(CursorContext::TableRowId(rowid.expect( "table cells should have a rowid since we don't support WITHOUT ROWID tables", @@ -4816,6 +4880,18 @@ impl BTreeCursor { pub fn collations(&self) -> &[CollationSeq] { &self.collations } + + pub fn read_page(&self, page_idx: usize) -> Result { + self.pager.read_page(page_idx).map(|page| { + Arc::new(BTreePageInner { + page: RefCell::new(page), + }) + }) + } + + pub fn allocate_page(&self, page_type: PageType, offset: usize) -> BTreePage { + self.pager.do_allocate_page(page_type, offset) + } } #[cfg(debug_assertions)] @@ -4836,7 +4912,7 @@ struct PageStack { /// Pointer to the current page being consumed current_page: Cell, /// List of pages in the stack. Root page will be in index 0 - stack: RefCell<[Option; BTCURSOR_MAX_DEPTH + 1]>, + stack: RefCell<[Option; BTCURSOR_MAX_DEPTH + 1]>, /// List of cell indices in the stack. /// cell_indices[current_page] is the current cell index being consumed. Similarly /// cell_indices[current_page-1] is the cell index of the parent of the current page @@ -4857,11 +4933,11 @@ impl PageStack { } /// Push a new page onto the stack. /// This effectively means traversing to a child page. - fn _push(&self, page: PageRef, starting_cell_idx: i32) { + fn _push(&self, page: BTreePage, starting_cell_idx: i32) { tracing::trace!( "pagestack::push(current={}, new_page_id={})", self.current_page.get(), - page.get().id + page.get().get().id ); self.increment_current(); let current = self.current_page.get(); @@ -4874,11 +4950,11 @@ impl PageStack { self.cell_indices.borrow_mut()[current as usize] = starting_cell_idx; } - fn push(&self, page: PageRef) { + fn push(&self, page: BTreePage) { self._push(page, 0); } - fn push_backwards(&self, page: PageRef) { + fn push_backwards(&self, page: BTreePage) { self._push(page, i32::MAX); } @@ -4895,7 +4971,7 @@ impl PageStack { /// Get the top page on the stack. /// This is the page that is currently being traversed. 
- fn top(&self) -> PageRef { + fn top(&self) -> BTreePage { let page = self.stack.borrow()[self.current()] .as_ref() .unwrap() @@ -4903,7 +4979,7 @@ impl PageStack { tracing::trace!( "pagestack::top(current={}, page_id={})", self.current(), - page.get().id + page.get().get().id ); page } @@ -4974,7 +5050,7 @@ impl PageStack { fn clear(&self) { self.current_page.set(-1); } - pub fn parent_page(&self) -> Option { + pub fn parent_page(&self) -> Option { if self.current_page.get() > 0 { Some( self.stack.borrow()[self.current() - 1] @@ -5005,6 +5081,12 @@ impl CellArray { } } +impl BTreePageInner { + pub fn get(&self) -> PageRef { + self.page.borrow().clone() + } +} + /// Try to find a free block available and allocate it if found fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> Result { // NOTE: freelist is in ascending order of keys and pc @@ -5060,11 +5142,15 @@ fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> R Ok(0) } -pub fn btree_init_page(page: &PageRef, page_type: PageType, offset: usize, usable_space: u16) { +pub fn btree_init_page(page: &BTreePage, page_type: PageType, offset: usize, usable_space: u16) { // setup btree page let contents = page.get(); - tracing::debug!("btree_init_page(id={}, offset={})", contents.id, offset); - let contents = contents.contents.as_mut().unwrap(); + tracing::debug!( + "btree_init_page(id={}, offset={})", + contents.get().id, + offset + ); + let contents = contents.get().contents.as_mut().unwrap(); contents.offset = offset; let id = page_type as u8; contents.write_u8(offset::BTREE_PAGE_TYPE, id); @@ -5877,7 +5963,6 @@ mod tests { compute_free_space, fill_cell_payload, payload_overflow_threshold_max, payload_overflow_threshold_min, }, - pager::PageRef, sqlite3_ondisk::{BTreeCell, PageContent, PageType}, }, types::Value, @@ -5887,7 +5972,7 @@ mod tests { use super::{btree_init_page, defragment_page, drop_cell, insert_into_cell}; #[allow(clippy::arc_with_non_send_sync)] - fn get_page(id: usize) -> PageRef { + fn get_page(id: usize) -> BTreePage { let page = Arc::new(Page::new(id)); let drop_fn = Rc::new(|_| {}); @@ -5899,6 +5984,9 @@ mod tests { ))), ); page.get().contents.replace(inner); + let page = Arc::new(BTreePageInner { + page: RefCell::new(page), + }); btree_init_page(&page, PageType::TableLeaf, 0, 4096); page @@ -5958,6 +6046,7 @@ mod tests { let db = get_database(); let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; let record = ImmutableRecord::from_registers(&[Register::Value(Value::Integer(1))]); @@ -5981,6 +6070,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -6011,9 +6101,12 @@ mod tests { fn validate_btree(pager: Rc, page_idx: usize) -> (usize, bool) { let cursor = BTreeCursor::new_table(None, pager.clone(), page_idx); - let page = pager.read_page(page_idx).unwrap(); + let page = cursor.read_page(page_idx).unwrap(); + while page.get().is_locked() { + pager.io.run_once().unwrap(); + } let page = page.get(); - let contents = page.contents.as_ref().unwrap(); + let contents = page.get().contents.as_ref().unwrap(); let page_type = contents.page_type(); let mut previous_key = None; let mut valid = true; @@ -6034,8 +6127,12 @@ mod tests { BTreeCell::TableInteriorCell(TableInteriorCell { _left_child_page, .. 
}) => { - child_pages.push(pager.read_page(_left_child_page as usize).unwrap()); - if _left_child_page == page.id as u32 { + let child_page = cursor.read_page(_left_child_page as usize).unwrap(); + while child_page.get().is_locked() { + pager.io.run_once().unwrap(); + } + child_pages.push(child_page); + if _left_child_page == page.get().id as u32 { valid = false; tracing::error!( "left child page is the same as parent {}", @@ -6084,10 +6181,26 @@ mod tests { valid = false; } } - let first_page_type = child_pages.first().map(|p| p.get_contents().page_type()); + let first_page_type = child_pages.first().map(|p| { + if !p.get().is_loaded() { + let new_page = pager.read_page(p.get().get().id).unwrap(); + p.page.replace(new_page); + } + while p.get().is_locked() { + pager.io.run_once().unwrap(); + } + p.get().get_contents().page_type() + }); if let Some(child_type) = first_page_type { for page in child_pages.iter().skip(1) { - if page.get_contents().page_type() != child_type { + if !page.get().is_loaded() { + let new_page = pager.read_page(page.get().get().id).unwrap(); + page.page.replace(new_page); + } + while page.get().is_locked() { + pager.io.run_once().unwrap(); + } + if page.get().get_contents().page_type() != child_type { tracing::error!("child pages have different types"); valid = false; } @@ -6101,9 +6214,12 @@ mod tests { fn format_btree(pager: Rc, page_idx: usize, depth: usize) -> String { let cursor = BTreeCursor::new_table(None, pager.clone(), page_idx); - let page = pager.read_page(page_idx).unwrap(); + let page = cursor.read_page(page_idx).unwrap(); + while page.get().is_locked() { + pager.io.run_once().unwrap(); + } let page = page.get(); - let contents = page.contents.as_ref().unwrap(); + let contents = page.get().contents.as_ref().unwrap(); let page_type = contents.page_type(); let mut current = Vec::new(); let mut child = Vec::new(); @@ -6179,8 +6295,11 @@ mod tests { let pager = Rc::new(pager); // FIXME: handle page cache is full let page1 = pager.allocate_page().unwrap(); + let page1 = Arc::new(BTreePageInner { + page: RefCell::new(page1), + }); btree_init_page(&page1, PageType::TableLeaf, 0, 4096); - (pager, page1.get().id) + (pager, page1.get().get().id) } #[test] @@ -6496,6 +6615,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -6692,13 +6812,14 @@ mod tests { .write_page(current_page as usize, buf.clone(), c)?; pager.io.run_once()?; - let page = cursor.pager.read_page(current_page as usize)?; - while page.is_locked() { + let page = cursor.read_page(current_page as usize)?; + while page.get().is_locked() { cursor.pager.io.run_once()?; } { - let contents = page.get().contents.as_mut().unwrap(); + let page = page.get(); + let contents = page.get_contents(); let next_page = if current_page < 4 { current_page + 1 @@ -6739,8 +6860,8 @@ mod tests { let trunk_page_id = db_header.lock().freelist_trunk_page; if trunk_page_id > 0 { // Verify trunk page structure - let trunk_page = cursor.pager.read_page(trunk_page_id as usize)?; - if let Some(contents) = trunk_page.get().contents.as_ref() { + let trunk_page = cursor.read_page(trunk_page_id as usize)?; + if let Some(contents) = trunk_page.get().get().contents.as_ref() { // Read number of leaf pages in trunk let n_leaf = contents.read_u32(4); assert!(n_leaf > 0, "Trunk page should have leaf entries"); @@ -6819,34 +6940,33 @@ mod tests { ); // Initialize page 2 as a root page (interior) - let root_page = 
cursor.pager.read_page(2)?; + let root_page = cursor.read_page(2)?; { btree_init_page(&root_page, PageType::TableInterior, 0, 512); // Use proper page size } // Allocate two leaf pages // FIXME: handle page cache is full - let page3 = cursor.pager.allocate_page()?; - btree_init_page(&page3, PageType::TableLeaf, 0, 512); + let page3 = cursor.allocate_page(PageType::TableLeaf, 0); // FIXME: handle page cache is full - let page4 = cursor.pager.allocate_page()?; - btree_init_page(&page4, PageType::TableLeaf, 0, 512); + let page4 = cursor.allocate_page(PageType::TableLeaf, 0); // Configure the root page to point to the two leaf pages { + let root_page = root_page.get(); let contents = root_page.get().contents.as_mut().unwrap(); // Set rightmost pointer to page4 - contents.write_u32(offset::BTREE_RIGHTMOST_PTR, page4.get().id as u32); + contents.write_u32(offset::BTREE_RIGHTMOST_PTR, page4.get().get().id as u32); // Create a cell with pointer to page3 let cell_content = vec![ // First 4 bytes: left child pointer (page3) - (page3.get().id >> 24) as u8, - (page3.get().id >> 16) as u8, - (page3.get().id >> 8) as u8, - page3.get().id as u8, + (page3.get().get().id >> 24) as u8, + (page3.get().get().id >> 16) as u8, + (page3.get().get().id >> 8) as u8, + page3.get().get().id as u8, // Next byte: rowid as varint (simple value 100) 100, ]; @@ -6857,6 +6977,7 @@ mod tests { // Add a simple record to each leaf page for page in [&page3, &page4] { + let page = page.get(); let contents = page.get().contents.as_mut().unwrap(); // Simple record with just a rowid and payload @@ -6898,6 +7019,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -6938,6 +7060,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -6984,6 +7107,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -7062,6 +7186,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -7231,6 +7356,7 @@ mod tests { let db = get_database(); let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; let usable_space = 4096; @@ -7247,6 +7373,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let usable_space = 4096; @@ -7272,6 +7399,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let usable_space = 4096; @@ -7305,6 +7433,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let usable_space = 4096; @@ -7340,6 +7469,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let usable_space = 4096; @@ -7362,6 +7492,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let usable_space = 4096; @@ -7400,6 +7531,7 @@ mod tests { let defragment = |page| { defragment_page(page, usable_space); }; + let page = page.get(); defragment(page.get_contents()); defragment(page.get_contents()); insert(0, 
page.get_contents()); @@ -7442,13 +7574,14 @@ mod tests { let record = ImmutableRecord::from_registers(&[Register::Value(Value::Integer(0))]); let mut payload: Vec = Vec::new(); fill_cell_payload( - page.get_contents().page_type(), + page.get().get_contents().page_type(), Some(0), &mut payload, &record, 4096, conn.pager.clone(), ); + let page = page.get(); insert(0, page.get_contents()); defragment(page.get_contents()); insert(0, page.get_contents()); @@ -7515,19 +7648,19 @@ mod tests { ImmutableRecord::from_registers(&[Register::Value(Value::Blob(vec![0; 3600]))]); let mut payload: Vec = Vec::new(); fill_cell_payload( - page.get_contents().page_type(), + page.get().get_contents().page_type(), Some(0), &mut payload, &record, 4096, conn.pager.clone(), ); - insert_into_cell(page.get_contents(), &payload, 0, 4096).unwrap(); - let free = compute_free_space(page.get_contents(), usable_space); + insert_into_cell(page.get().get_contents(), &payload, 0, 4096).unwrap(); + let free = compute_free_space(page.get().get_contents(), usable_space); let total_size = payload.len() + 2; assert_eq!( free, - usable_space - page.get_contents().header_size() as u16 - total_size as u16 + usable_space - page.get().get_contents().header_size() as u16 - total_size as u16 ); dbg!(free); } @@ -7851,7 +7984,11 @@ mod tests { let (pager, _) = empty_btree(); let page_type = PageType::TableLeaf; let page = pager.allocate_page().unwrap(); + let page = Arc::new(BTreePageInner { + page: RefCell::new(page), + }); btree_init_page(&page, page_type, 0, pager.usable_space() as u16); + let page = page.get(); let mut size = (rng.next_u64() % 100) as u16; let mut i = 0; // add a bunch of cells diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 0e6f718da..2d6fd5637 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -1,5 +1,6 @@ use crate::fast_lock::SpinLock; use crate::result::LimboResult; +use crate::storage::btree::BTreePageInner; use crate::storage::buffer_pool::BufferPool; use crate::storage::database::DatabaseStorage; use crate::storage::sqlite3_ondisk::{self, DatabaseHeader, PageContent, PageType}; @@ -13,6 +14,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use tracing::trace; +use super::btree::BTreePage; use super::page_cache::{CacheError, CacheResizeResult, DumbLruPageCache, PageCacheKey}; use super::wal::{CheckpointMode, CheckpointStatus}; @@ -222,7 +224,7 @@ impl Pager { _ => unreachable!("Invalid flags state"), }; let page = self.do_allocate_page(page_type, 0); - let id = page.get().id; + let id = page.get().get().id; id as u32 } @@ -244,13 +246,16 @@ impl Pager { /// Allocate a new page to the btree via the pager. /// This marks the page as dirty and writes the page header. 
// FIXME: handle no room in page cache - pub fn do_allocate_page(&self, page_type: PageType, offset: usize) -> PageRef { + pub fn do_allocate_page(&self, page_type: PageType, offset: usize) -> BTreePage { let page = self.allocate_page().unwrap(); + let page = Arc::new(BTreePageInner { + page: RefCell::new(page), + }); crate::btree_init_page(&page, page_type, offset, self.usable_space() as u16); tracing::debug!( "do_allocate_page(id={}, page_type={:?})", - page.get().id, - page.get_contents().page_type() + page.get().get().id, + page.get().get_contents().page_type() ); page } @@ -368,61 +373,6 @@ impl Pager { Ok(page) } - /// Loads pages if not loaded - pub fn load_page(&self, page: PageRef) -> Result<()> { - let id = page.get().id; - trace!("load_page(page_idx = {})", id); - let mut page_cache = self.page_cache.write(); - page.set_locked(); - let max_frame = match &self.wal { - Some(wal) => wal.borrow().get_max_frame(), - None => 0, - }; - let page_key = PageCacheKey::new(id, Some(max_frame)); - if let Some(wal) = &self.wal { - if let Some(frame_id) = wal.borrow().find_frame(id as u64)? { - wal.borrow() - .read_frame(frame_id, page.clone(), self.buffer_pool.clone())?; - { - page.set_uptodate(); - } - match page_cache.insert(page_key, page.clone()) { - Err(CacheError::KeyExists) => {} // Exists but same page, not error - Err(CacheError::Full) => return Err(LimboError::CacheFull), - Err(e) => { - return Err(LimboError::InternalError(format!( - "Failed to insert page into cache during load: {:?}", - e - ))) - } - Ok(_) => {} - } - return Ok(()); - } - } - - match page_cache.insert(page_key, page.clone()) { - Err(CacheError::KeyExists) => {} // Ensures same page - Err(CacheError::Full) => return Err(LimboError::CacheFull), - Err(e) => { - return Err(LimboError::InternalError(format!( - "Failed to insert page into cache during load: {:?}", - e - ))) - } - Ok(_) => {} - }; - - sqlite3_ondisk::begin_read_page( - self.db_file.clone(), - self.buffer_pool.clone(), - page.clone(), - id, - )?; - - Ok(()) - } - /// Writes the database header. 
pub fn write_database_header(&self, header: &DatabaseHeader) { sqlite3_ondisk::begin_write_database_header(header, self).expect("failed to write header"); @@ -464,6 +414,8 @@ impl Pager { let max_frame_after_append = self.wal.as_ref().map(|wal| { wal.borrow().get_max_frame() + self.dirty_pages.borrow().len() as u64 }); + tracing::error!("start flush"); + tracing::error!("pages={:?}", self.dirty_pages.borrow()); for page_id in self.dirty_pages.borrow().iter() { let mut cache = self.page_cache.write(); let page_key = PageCacheKey::new(*page_id, Some(max_frame)); From b76961ce35faef78833301e46d0857e357b20f9a Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 21 May 2025 11:02:10 +0200 Subject: [PATCH 16/22] balance mark dirty from start --- core/storage/btree.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 9b8c2879d..988d652f1 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -2384,6 +2384,12 @@ impl BTreeCursor { let current_sibling = sibling_pointer; for i in (0..=current_sibling).rev() { let page = self.read_page(pgno as usize)?; + { + // mark as dirty + let sibling_page = page.get(); + sibling_page.set_dirty(); + self.pager.add_dirty(sibling_page.get().id); + } #[cfg(debug_assertions)] { return_if_locked!(page.get()); @@ -2462,13 +2468,13 @@ impl BTreeCursor { let write_info = self.state.write_info().unwrap(); let mut balance_info = write_info.balance_info.borrow_mut(); let balance_info = balance_info.as_mut().unwrap(); - let all_loaded = balance_info + for page in balance_info .pages_to_balance .iter() .take(balance_info.sibling_count) - .all(|page| !page.as_ref().unwrap().get().is_locked()); - if !all_loaded { - return Ok(CursorResult::IO); + { + let page = page.as_ref().unwrap(); + return_if_locked_maybe_load!(self.pager, page); } // Now do real balancing let parent_page_btree = self.stack.top(); @@ -2489,9 +2495,8 @@ impl BTreeCursor { for i in (0..balance_info.sibling_count).rev() { let sibling_page = balance_info.pages_to_balance[i].as_ref().unwrap(); let sibling_page = sibling_page.get(); + assert!(sibling_page.is_loaded()); let sibling_contents = sibling_page.get_contents(); - sibling_page.set_dirty(); - self.pager.add_dirty(sibling_page.get().id); max_cells += sibling_contents.cell_count(); max_cells += sibling_contents.overflow_cells.len(); @@ -6422,6 +6427,7 @@ mod tests { tracing::info!("seed: {}", seed); for insert_id in 0..inserts { let do_validate = do_validate_btree || (insert_id % VALIDATE_INTERVAL == 0); + pager.begin_read_tx().unwrap(); pager.begin_write_tx().unwrap(); let size = size(&mut rng); let key = { From c365d79cb128811b9923704c1a0e4ba7cd755231 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 21 May 2025 11:02:36 +0200 Subject: [PATCH 17/22] minimum capacity 10 in page cache --- core/storage/page_cache.rs | 55 +++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 197fcdfd2..09a620248 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -60,6 +60,7 @@ impl PageCacheKey { } impl DumbLruPageCache { pub fn new(capacity: usize) -> Self { + assert!(capacity >= 10, "capacity of cache should be at least 10"); Self { capacity, map: RefCell::new(HashMap::new()), @@ -288,7 +289,7 @@ impl DumbLruPageCache { Ok(()) } - pub fn print(&mut self) { + pub fn print(&self) { tracing::debug!("page_cache_len={}", 
self.map.borrow().len()); let head_ptr = *self.head.borrow(); let mut current = head_ptr; @@ -524,7 +525,7 @@ mod tests { #[test] fn test_detach_only_element() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let key1 = insert_page(&mut cache, 1); cache.verify_list_integrity(); assert_eq!(cache.len(), 1); @@ -550,7 +551,7 @@ mod tests { #[test] fn test_detach_head() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let _key1 = insert_page(&mut cache, 1); // Tail let key2 = insert_page(&mut cache, 2); // Middle let key3 = insert_page(&mut cache, 3); // Head @@ -594,7 +595,7 @@ mod tests { #[test] fn test_detach_tail() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let key1 = insert_page(&mut cache, 1); // Tail let key2 = insert_page(&mut cache, 2); // Middle let _key3 = insert_page(&mut cache, 3); // Head @@ -648,7 +649,7 @@ mod tests { #[test] fn test_detach_middle() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let key1 = insert_page(&mut cache, 1); // Tail let key2 = insert_page(&mut cache, 2); // Middle let key3 = insert_page(&mut cache, 3); // Middle @@ -697,7 +698,7 @@ mod tests { #[test] #[ignore = "for now let's not track active refs"] fn test_detach_via_delete() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let key1 = create_key(1); let page1 = page_with_content(1); assert!(cache.insert(key1.clone(), page1.clone()).is_ok()); @@ -718,7 +719,7 @@ mod tests { #[test] fn test_insert_same_id_different_frame() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let key1_1 = PageCacheKey::new(1, Some(1 as u64)); let key1_2 = PageCacheKey::new(1, Some(2 as u64)); let page1_1 = page_with_content(1); @@ -733,7 +734,7 @@ mod tests { #[test] #[should_panic(expected = "Attempted to insert different page with same key")] fn test_insert_existing_key_fail() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let key1 = create_key(1); let page1_v1 = page_with_content(1); let page1_v2 = page_with_content(1); @@ -745,7 +746,7 @@ mod tests { #[test] fn test_detach_nonexistent_key() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let key_nonexist = create_key(99); assert!(cache.delete(key_nonexist.clone()).is_ok()); // no-op @@ -762,7 +763,7 @@ mod tests { #[test] fn test_detach_locked_page() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let (_, mut entry) = insert_and_get_entry(&mut cache, 1); unsafe { entry.as_mut().page.set_locked() }; assert_eq!(cache.detach(entry, false), Err(CacheError::Locked)); @@ -771,7 +772,7 @@ mod tests { #[test] fn test_detach_dirty_page() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let (key, mut entry) = insert_and_get_entry(&mut cache, 1); cache.get(&key).expect("Page should exist"); unsafe { entry.as_mut().page.set_dirty() }; @@ -783,7 +784,7 @@ mod tests { #[ignore = "for now let's not track active refs"] fn test_detach_with_active_reference_clean() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let (key, entry) = insert_and_get_entry(&mut cache, 1); let page_ref = cache.get(&key); assert_eq!(cache.detach(entry, true), Err(CacheError::ActiveRefs)); @@ -794,7 +795,7 @@ mod tests { 
#[test] #[ignore = "for now let's not track active refs"] fn test_detach_with_active_reference_no_clean() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let (key, entry) = insert_and_get_entry(&mut cache, 1); cache.get(&key).expect("Page should exist"); assert!(cache.detach(entry, false).is_ok()); @@ -804,7 +805,7 @@ mod tests { #[test] fn test_detach_without_cleaning() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let (key, entry) = insert_and_get_entry(&mut cache, 1); assert!(cache.detach(entry, false).is_ok()); assert!(cache.map.borrow_mut().remove(&key).is_some()); @@ -814,7 +815,7 @@ mod tests { #[test] fn test_detach_with_cleaning() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let (key, entry) = insert_and_get_entry(&mut cache, 1); let page = cache.get(&key).expect("Page should exist"); assert!(page_has_content(&page)); @@ -829,7 +830,7 @@ mod tests { #[test] fn test_detach_only_element_preserves_integrity() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let (_, entry) = insert_and_get_entry(&mut cache, 1); assert!(cache.detach(entry, false).is_ok()); assert!( @@ -844,7 +845,7 @@ mod tests { #[test] fn test_detach_with_multiple_pages() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let (key1, _) = insert_and_get_entry(&mut cache, 1); let (key2, entry2) = insert_and_get_entry(&mut cache, 2); let (key3, _) = insert_and_get_entry(&mut cache, 3); @@ -880,7 +881,7 @@ mod tests { let mut rng = ChaCha8Rng::seed_from_u64(seed); tracing::info!("super seed: {}", seed); let max_pages = 10; - let mut cache = DumbLruPageCache::new(10); + let mut cache = DumbLruPageCache::default(); let mut lru = LruCache::new(NonZeroUsize::new(10).unwrap()); for _ in 0..10000 { @@ -963,7 +964,7 @@ mod tests { #[test] fn test_page_cache_insert_and_get() { - let mut cache = DumbLruPageCache::new(2); + let mut cache = DumbLruPageCache::default(); let key1 = insert_page(&mut cache, 1); let key2 = insert_page(&mut cache, 2); assert_eq!(cache.get(&key1).unwrap().get().id, 1); @@ -972,7 +973,7 @@ mod tests { #[test] fn test_page_cache_over_capacity() { - let mut cache = DumbLruPageCache::new(2); + let mut cache = DumbLruPageCache::default(); let key1 = insert_page(&mut cache, 1); let key2 = insert_page(&mut cache, 2); let key3 = insert_page(&mut cache, 3); @@ -983,7 +984,7 @@ mod tests { #[test] fn test_page_cache_delete() { - let mut cache = DumbLruPageCache::new(2); + let mut cache = DumbLruPageCache::default(); let key1 = insert_page(&mut cache, 1); assert!(cache.delete(key1.clone()).is_ok()); assert!(cache.get(&key1).is_none()); @@ -991,7 +992,7 @@ mod tests { #[test] fn test_page_cache_clear() { - let mut cache = DumbLruPageCache::new(2); + let mut cache = DumbLruPageCache::default(); let key1 = insert_page(&mut cache, 1); let key2 = insert_page(&mut cache, 2); assert!(cache.clear().is_ok()); @@ -1001,7 +1002,7 @@ mod tests { #[test] fn test_page_cache_insert_sequential() { - let mut cache = DumbLruPageCache::new(2); + let mut cache = DumbLruPageCache::default(); for i in 0..10000 { let key = insert_page(&mut cache, i); assert_eq!(cache.peek(&key, false).unwrap().get().id, i); @@ -1010,7 +1011,7 @@ mod tests { #[test] fn test_resize_smaller_success() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); for i in 1..=5 { let _ = insert_page(&mut 
cache, i); } @@ -1025,7 +1026,7 @@ mod tests { #[test] #[should_panic(expected = "Attempted to insert different page with same key")] fn test_resize_larger() { - let mut cache = DumbLruPageCache::new(2); + let mut cache = DumbLruPageCache::default(); let _ = insert_page(&mut cache, 1); let _ = insert_page(&mut cache, 2); assert_eq!(cache.len(), 2); @@ -1047,7 +1048,7 @@ mod tests { #[test] #[ignore = "for now let's not track active refs"] fn test_resize_with_active_references() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); let page1 = page_with_content(1); let page2 = page_with_content(2); let page3 = page_with_content(3); @@ -1069,7 +1070,7 @@ mod tests { #[test] fn test_resize_to_zero() { - let mut cache = DumbLruPageCache::new(5); + let mut cache = DumbLruPageCache::default(); for i in 1..=3 { let _ = insert_page(&mut cache, i); } From ddb166f0f0f6fbc31755da4b98ec88ffc9996c5f Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 21 May 2025 12:05:47 +0200 Subject: [PATCH 18/22] custom hashmap for page cache --- core/storage/page_cache.rs | 153 +++++++++++++++++++++++++++++++------ 1 file changed, 130 insertions(+), 23 deletions(-) diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 09a620248..f68182d9d 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -30,13 +30,26 @@ struct PageCacheEntry { pub struct DumbLruPageCache { capacity: usize, - map: RefCell>>, + map: RefCell, head: RefCell>>, tail: RefCell>>, } unsafe impl Send for DumbLruPageCache {} unsafe impl Sync for DumbLruPageCache {} +struct PageHashMap { + // FIXME: do we prefer array buckets or list? Deletes will be slower here which I guess happens often. I will do this for now to test how well it does. + buckets: Vec>, + capacity: usize, + size: usize, +} + +#[derive(Clone)] +struct HashMapNode { + key: PageCacheKey, + value: NonNull, +} + #[derive(Debug, PartialEq)] pub enum CacheError { InternalError(String), @@ -60,10 +73,10 @@ impl PageCacheKey { } impl DumbLruPageCache { pub fn new(capacity: usize) -> Self { - assert!(capacity >= 10, "capacity of cache should be at least 10"); + assert!(capacity > 0, "capacity of cache should be at least 1"); Self { capacity, - map: RefCell::new(HashMap::new()), + map: RefCell::new(PageHashMap::new(capacity)), head: RefCell::new(None), tail: RefCell::new(None), } @@ -161,6 +174,8 @@ impl DumbLruPageCache { // To match SQLite behavior, just set capacity and try to shrink as much as possible. // In case of failure, the caller should request further evictions (e.g. after I/O). 
pub fn resize(&mut self, capacity: usize) -> CacheResizeResult { + let new_map = self.map.borrow().rehash(capacity); + self.map.replace(new_map); self.capacity = capacity; match self.make_room_for(0) { Ok(_) => CacheResizeResult::Done, @@ -279,7 +294,7 @@ impl DumbLruPageCache { } pub fn clear(&mut self) -> Result<(), CacheError> { - let keys_to_remove: Vec = self.map.borrow().keys().cloned().collect(); + let keys_to_remove: Vec = self.map.borrow().keys_cloned(); for key in keys_to_remove { self.delete(key)?; } @@ -450,9 +465,9 @@ impl DumbLruPageCache { #[cfg(test)] /// For testing purposes, in case we use cursor api directly, we might want to unmark pages as dirty because we bypass the WAL transaction layer pub fn unset_dirty_all_pages(&mut self) { - for (_, page) in self.map.borrow_mut().iter_mut() { + for node in self.map.borrow_mut().iter_mut() { unsafe { - let entry = page.as_mut(); + let entry = node.value.as_mut(); entry.page.clear_dirty() }; } @@ -465,6 +480,113 @@ impl Default for DumbLruPageCache { } } +impl PageHashMap { + pub fn new(capacity: usize) -> PageHashMap { + PageHashMap { + buckets: vec![vec![]; capacity], + capacity, + size: 0, + } + } + + /// Insert page into hashmap. If a key was already in the hashmap, then update it and return the previous value. + pub fn insert( + &mut self, + key: PageCacheKey, + value: NonNull, + ) -> Option> { + let bucket = self.hash(&key); + let bucket = &mut self.buckets[bucket]; + let mut idx = 0; + while let Some(node) = bucket.get_mut(idx) { + if node.key == key { + let prev = node.value; + node.value = value; + return Some(prev); + } + idx += 1; + } + bucket.push(HashMapNode { key, value }); + self.size += 1; + None + } + + pub fn contains_key(&self, key: &PageCacheKey) -> bool { + let bucket = self.hash(&key); + self.buckets[bucket].iter().any(|node| node.key == *key) + } + + pub fn get(&self, key: &PageCacheKey) -> Option<&NonNull> { + let bucket = self.hash(&key); + let bucket = &self.buckets[bucket]; + let mut idx = 0; + while let Some(node) = bucket.get(idx) { + if node.key == *key { + return Some(&node.value); + } + idx += 1; + } + None + } + + pub fn remove(&mut self, key: &PageCacheKey) -> Option> { + let bucket = self.hash(&key); + let bucket = &mut self.buckets[bucket]; + let mut idx = 0; + while let Some(node) = bucket.get(idx) { + if node.key == *key { + break; + } + idx += 1; + } + if idx == bucket.len() { + None + } else { + let v = bucket.remove(idx); + self.size -= 1; + Some(v.value) + } + } + + pub fn is_empty(&self) -> bool { + self.size == 0 + } + + pub fn len(&self) -> usize { + self.size + } + + pub fn keys_cloned(&self) -> Vec { + let mut all_keys = Vec::new(); + for bucket in &self.buckets { + for node in bucket { + all_keys.push(node.key.clone()); + } + } + all_keys + } + + pub fn iter(&self) -> impl Iterator { + self.buckets.iter().flat_map(|bucket| bucket.iter()) + } + + pub fn iter_mut(&mut self) -> impl Iterator { + self.buckets.iter_mut().flat_map(|bucket| bucket.iter_mut()) + } + + fn hash(&self, key: &PageCacheKey) -> usize { + key.pgno % self.capacity + } + + pub fn rehash(&self, new_capacity: usize) -> PageHashMap { + let mut new_hash_map = PageHashMap::new(new_capacity); + for node in self.iter() { + new_hash_map.insert(node.key.clone(), node.value); + } + new_hash_map + } +} + #[cfg(test)] mod tests { use super::*; @@ -881,7 +1003,7 @@ mod tests { let mut rng = ChaCha8Rng::seed_from_u64(seed); tracing::info!("super seed: {}", seed); let max_pages = 10; - let mut cache = DumbLruPageCache::default(); + 
let mut cache = DumbLruPageCache::new(10); let mut lru = LruCache::new(NonZeroUsize::new(10).unwrap()); for _ in 0..10000 { @@ -973,7 +1095,7 @@ mod tests { #[test] fn test_page_cache_over_capacity() { - let mut cache = DumbLruPageCache::default(); + let mut cache = DumbLruPageCache::new(2); let key1 = insert_page(&mut cache, 1); let key2 = insert_page(&mut cache, 2); let key3 = insert_page(&mut cache, 3); @@ -1068,21 +1190,6 @@ mod tests { cache.verify_list_integrity(); } - #[test] - fn test_resize_to_zero() { - let mut cache = DumbLruPageCache::default(); - for i in 1..=3 { - let _ = insert_page(&mut cache, i); - } - assert_eq!(cache.len(), 3); - let result = cache.resize(0); - assert_eq!(result, CacheResizeResult::Done); // removed all 3 - assert_eq!(cache.len(), 0); - assert_eq!(cache.capacity, 0); - cache.verify_list_integrity(); - assert!(cache.insert(create_key(4), page_with_content(4)).is_err()); - } - #[test] fn test_resize_same_capacity() { let mut cache = DumbLruPageCache::new(3); From 4704cdd24f2317e9f11b727ac06a9608588d2944 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 21 May 2025 13:56:21 +0200 Subject: [PATCH 19/22] validate_btree pin pages --- core/storage/btree.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 988d652f1..0004bb0d7 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -6111,6 +6111,8 @@ mod tests { pager.io.run_once().unwrap(); } let page = page.get(); + // Pin page in order to not drop it in between + page.set_dirty(); let contents = page.get().contents.as_ref().unwrap(); let page_type = contents.page_type(); let mut previous_key = None; @@ -6154,6 +6156,7 @@ mod tests { }; if current_depth >= 100 { tracing::error!("depth is too big"); + page.clear_dirty(); return (100, false); } depth = Some(depth.unwrap_or(current_depth + 1)); @@ -6214,6 +6217,7 @@ mod tests { if contents.rightmost_pointer().is_none() && contents.cell_count() == 0 { valid = false; } + page.clear_dirty(); (depth.unwrap(), valid) } @@ -6224,6 +6228,8 @@ mod tests { pager.io.run_once().unwrap(); } let page = page.get(); + // Pin page in order to not drop it in between loading of different pages. If not contents will be a dangling reference. 
+ page.set_dirty(); let contents = page.get().contents.as_ref().unwrap(); let page_type = contents.page_type(); let mut current = Vec::new(); @@ -6271,6 +6277,7 @@ mod tests { " ".repeat(depth), current.join(", ") ); + page.clear_dirty(); if child.is_empty() { current } else { @@ -6711,19 +6718,19 @@ mod tests { #[test] #[ignore] pub fn fuzz_long_btree_insert_fuzz_run_random() { - btree_insert_fuzz_run(128, 2_000, |rng| (rng.next_u32() % 4096) as usize); + btree_insert_fuzz_run(2, 10_000, |rng| (rng.next_u32() % 4096) as usize); } #[test] #[ignore] pub fn fuzz_long_btree_insert_fuzz_run_small() { - btree_insert_fuzz_run(1, 10_000, |rng| (rng.next_u32() % 128) as usize); + btree_insert_fuzz_run(2, 10_000, |rng| (rng.next_u32() % 128) as usize); } #[test] #[ignore] pub fn fuzz_long_btree_insert_fuzz_run_big() { - btree_insert_fuzz_run(64, 2_000, |rng| 3 * 1024 + (rng.next_u32() % 1024) as usize); + btree_insert_fuzz_run(2, 10_000, |rng| 3 * 1024 + (rng.next_u32() % 1024) as usize); } #[test] From a69f85be847966bf004cfaacc7a43ee083a0c194 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 21 May 2025 13:56:40 +0200 Subject: [PATCH 20/22] cacheflush clear cache --- core/storage/page_cache.rs | 25 ++++++++++++++++++++++--- core/storage/pager.rs | 26 +++++--------------------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index f68182d9d..982118a36 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -294,10 +294,22 @@ impl DumbLruPageCache { } pub fn clear(&mut self) -> Result<(), CacheError> { - let keys_to_remove: Vec = self.map.borrow().keys_cloned(); - for key in keys_to_remove { - self.delete(key)?; + let mut current = *self.head.borrow(); + while let Some(current_entry) = current { + unsafe { + self.map.borrow_mut().remove(¤t_entry.as_ref().key); + } + let next = unsafe { current_entry.as_ref().next }; + self.detach(current_entry, true)?; + unsafe { + assert!(!current_entry.as_ref().page.is_dirty()); + } + unsafe { std::ptr::drop_in_place(current_entry.as_ptr()) }; + current = next; } + let _ = self.head.take(); + let _ = self.tail.take(); + assert!(self.head.borrow().is_none()); assert!(self.tail.borrow().is_none()); assert!(self.map.borrow().is_empty()); @@ -585,6 +597,13 @@ impl PageHashMap { } new_hash_map } + + pub fn clear(&mut self) { + for bucket in &mut self.buckets { + bucket.clear(); + } + self.size = 0; + } } #[cfg(test)] diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 2d6fd5637..e289e8281 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -411,11 +411,6 @@ impl Pager { Some(wal) => wal.borrow().get_max_frame(), None => 0, }; - let max_frame_after_append = self.wal.as_ref().map(|wal| { - wal.borrow().get_max_frame() + self.dirty_pages.borrow().len() as u64 - }); - tracing::error!("start flush"); - tracing::error!("pages={:?}", self.dirty_pages.borrow()); for page_id in self.dirty_pages.borrow().iter() { let mut cache = self.page_cache.write(); let page_key = PageCacheKey::new(*page_id, Some(max_frame)); @@ -428,24 +423,13 @@ impl Pager { db_size, self.flush_info.borrow().in_flight_writes.clone(), )?; - // Assuming writer will always end append frames at frameid == this_transaction.max_frame + dirty_pages, - // we can insert this page with a new key into that new "snapshot" that has the newest max frame. 
We don't - // simply clone the page and insert with new key because next cache.delete will invalidate the contents of the page, - // therefore we need to clone the contents itself and place them on the new page. Cloning contents should be fast because - // buffer is wrapped around an Arc. - let new_page = Page::new(*page_id); - new_page.get().contents = Some(page.get_contents().clone()); - new_page.set_loaded(); - let new_page: Arc = Arc::new(new_page); - let new_page_key = PageCacheKey::new(*page_id, max_frame_after_append); - cache.insert(new_page_key, new_page).map_err(|e| {LimboError::InternalError(format!("Failed to delete page {:?} from cache during flush: {:?}. Might be actively referenced.", page_id, e))})?; } page.clear_dirty(); - // This page is no longer valid. - // For example: - // We took page with key (page_num, max_frame) -- this page is no longer valid for that max_frame - // so it must be invalidated. There shouldn't be any active refs. - cache.delete(page_key).map_err(|e| {LimboError::InternalError(format!("Failed to delete page {:?} from cache during flush: {:?}. Might be actively referenced.", page_id, e))})?; + } + // This is okay assuming we use shared cache by default. + { + let mut cache = self.page_cache.write(); + cache.clear().unwrap(); } self.dirty_pages.borrow_mut().clear(); self.flush_info.borrow_mut().state = FlushState::WaitAppendFrames; From 7143e43dd4f23f5af7f9d765e61a3e74cdba7465 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 21 May 2025 14:33:20 +0200 Subject: [PATCH 21/22] clippy --- core/storage/btree.rs | 2 ++ core/storage/page_cache.rs | 22 +++------------------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 0004bb0d7..5c8a94865 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -137,6 +137,8 @@ pub struct BTreePageInner { } pub type BTreePage = Arc; +unsafe impl Send for BTreePageInner {} +unsafe impl Sync for BTreePageInner {} /// State machine of destroy operations /// Keep track of traversal so that it can be resumed when IO is encountered #[derive(Debug, Clone)] diff --git a/core/storage/page_cache.rs b/core/storage/page_cache.rs index 982118a36..3dffd27bc 100644 --- a/core/storage/page_cache.rs +++ b/core/storage/page_cache.rs @@ -1,4 +1,4 @@ -use std::{cell::RefCell, collections::HashMap, ptr::NonNull}; +use std::{cell::RefCell, ptr::NonNull}; use std::sync::Arc; use tracing::{debug, trace}; @@ -568,20 +568,11 @@ impl PageHashMap { self.size } - pub fn keys_cloned(&self) -> Vec { - let mut all_keys = Vec::new(); - for bucket in &self.buckets { - for node in bucket { - all_keys.push(node.key.clone()); - } - } - all_keys - } - pub fn iter(&self) -> impl Iterator { self.buckets.iter().flat_map(|bucket| bucket.iter()) } + #[cfg(test)] pub fn iter_mut(&mut self) -> impl Iterator { self.buckets.iter_mut().flat_map(|bucket| bucket.iter_mut()) } @@ -597,13 +588,6 @@ impl PageHashMap { } new_hash_map } - - pub fn clear(&mut self) { - for bucket in &mut self.buckets { - bucket.clear(); - } - self.size = 0; - } } #[cfg(test)] @@ -626,6 +610,7 @@ mod tests { PageCacheKey::new(id, Some(id as u64)) } + #[allow(clippy::arc_with_non_send_sync)] pub fn page_with_content(page_id: usize) -> PageRef { let page = Arc::new(Page::new(page_id)); { @@ -923,7 +908,6 @@ mod tests { #[test] #[ignore = "for now let's not track active refs"] - fn test_detach_with_active_reference_clean() { let mut cache = DumbLruPageCache::default(); let (key, entry) = insert_and_get_entry(&mut 
cache, 1); From b135bf449f884d06cb041c644d38b8f6d4dd7581 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 21 May 2025 15:40:42 +0200 Subject: [PATCH 22/22] reduce attempts for fuzz_long overflow --- core/storage/btree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5c8a94865..017ce21ee 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -6738,7 +6738,7 @@ mod tests { #[test] #[ignore] pub fn fuzz_long_btree_insert_fuzz_run_overflow() { - btree_insert_fuzz_run(64, 5_000, |rng| (rng.next_u32() % 32 * 1024) as usize); + btree_insert_fuzz_run(2, 5_000, |rng| (rng.next_u32() % 32 * 1024) as usize); } #[allow(clippy::arc_with_non_send_sync)]
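
Note on the recurring page.get().get() pattern introduced across these patches: BTreePage is an Arc<BTreePageInner> whose inner PageRef sits behind a RefCell, so a stale or evicted page can be re-read and swapped in place (as validate_btree does with p.page.replace(new_page)) without invalidating handles already held on the cursor's page stack. The following is an illustrative sketch of that idea only, not part of the patch series; Page and PageRef are simplified stand-ins, and only BTreePageInner::get() mirrors the real code.

use std::cell::RefCell;
use std::sync::Arc;

struct Page {
    id: usize,
    loaded: bool,
}
type PageRef = Arc<Page>;

struct BTreePageInner {
    page: RefCell<PageRef>,
}
type BTreePage = Arc<BTreePageInner>;

impl BTreePageInner {
    // Mirrors the real BTreePageInner::get(): clone the current inner PageRef.
    fn get(&self) -> PageRef {
        self.page.borrow().clone()
    }
}

fn main() {
    // A handle as it would be held on the cursor's page stack.
    let handle: BTreePage = Arc::new(BTreePageInner {
        page: RefCell::new(Arc::new(Page { id: 3, loaded: false })),
    });
    let alias = handle.clone();

    // Simulate a reload after the cached page was dropped: swap the inner
    // PageRef without touching any outstanding BTreePage handles.
    handle.page.replace(Arc::new(Page { id: 3, loaded: true }));

    // Every clone of the handle now observes the freshly loaded page.
    assert!(alias.get().loaded);
    println!("page {} reloaded through a pre-existing handle", alias.get().id);
}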