From 591c674e86bfb2275303b9541199aa8fc127cae7 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 20 May 2025 15:44:18 +0200 Subject: [PATCH] Introduce PageRef wrapper `BTreePage`. One problem we have with PageRef is that the referenced page can be unloaded. This means that if we read the page again rather than loading it onto the same reference, we end up with a split brain of references. To solve this, we wrap PageRef in `BTreePage` so that when a page is seen as unloaded, we replace BTreePage::page with the newest version of the page. --- core/lib.rs | 6 +- core/storage/btree.rs | 497 +++++++++++++++++++++++++++--------------- core/storage/pager.rs | 70 +----- 3 files changed, 333 insertions(+), 240 deletions(-) diff --git a/core/lib.rs b/core/lib.rs index 002c0919f..37a6a5020 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -56,7 +56,7 @@ use std::{ rc::Rc, sync::{Arc, OnceLock}, }; -use storage::btree::btree_init_page; +use storage::btree::{btree_init_page, BTreePageInner}; #[cfg(feature = "fs")] use storage::database::DatabaseFile; pub use storage::{ @@ -271,6 +271,9 @@ pub fn maybe_init_database_file(file: &Arc, io: &Arc) -> Resul &Rc::new(BufferPool::new(db_header.get_page_size() as usize)), DATABASE_HEADER_SIZE, ); + let page1 = Arc::new(BTreePageInner { + page: RefCell::new(page1), + }); { // Create the sqlite_schema table, for this we just need to create the btree page // for the first page of the database which is basically like any other btree page @@ -283,6 +286,7 @@ pub fn maybe_init_database_file(file: &Arc, io: &Arc) -> Resul (db_header.get_page_size() - db_header.reserved_space as u32) as u16, ); + let page1 = page1.get(); let contents = page1.get().contents.as_mut().unwrap(); contents.write_database_header(&db_header); // write the first page to disk synchronously diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 8429192bf..9b8c2879d 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -25,6 +25,7 @@ use std::{ 
cmp::Ordering, pin::Pin, rc::Rc, + sync::Arc, }; use super::{ @@ -117,17 +118,25 @@ macro_rules! debug_validate_cells { } /// Check if the page is unlocked, if not return IO. If the page is not locked but not loaded, then try to load it. macro_rules! return_if_locked_maybe_load { - ($pager:expr, $expr:expr) => {{ - if $expr.is_locked() { + ($pager:expr, $btree_page:expr) => {{ + if $btree_page.get().is_locked() { return Ok(CursorResult::IO); } - if !$expr.is_loaded() { - $pager.load_page($expr.clone())?; + if !$btree_page.get().is_loaded() { + let page = $pager.read_page($btree_page.get().get().id)?; + $btree_page.page.replace(page); return Ok(CursorResult::IO); } }}; } +/// Wrapper around a page reference used in order to update the reference in case page was unloaded +/// and we need to update the reference. +pub struct BTreePageInner { + pub page: RefCell<PageRef>, +} + +pub type BTreePage = Arc<BTreePageInner>; /// State machine of destroy operations /// Keep track of traversal so that it can be resumed when IO is encountered #[derive(Debug, Clone)] @@ -194,7 +203,7 @@ enum ReadPayloadOverflow { payload: Vec, next_page: u32, remaining_to_read: usize, - page: PageRef, + page: BTreePage, }, } @@ -210,7 +219,7 @@ enum PayloadOverflowWithOffset { ProcessPage { next_page: u32, remaining_to_read: u32, - page: PageRef, + page: BTreePage, current_offset: usize, buffer_offset: usize, is_write: bool, @@ -271,7 +280,7 @@ impl BTreeKey<'_> { #[derive(Clone)] struct BalanceInfo { /// Old pages being balanced. We can have maximum 3 pages being balanced at the same time. - pages_to_balance: [Option<PageRef>; 3], + pages_to_balance: [Option<BTreePage>; 3], /// Bookkeeping of the rightmost pointer so the offset::BTREE_RIGHTMOST_PTR can be updated. rightmost_pointer: *mut u8, /// Divider cells of old pages. We can have maximum 2 divider cells because of 3 pages. 
@@ -542,11 +551,8 @@ impl BTreeCursor { } let cell_idx = cell_idx as usize; - return_if_locked!(page); - if !page.is_loaded() { - self.pager.load_page(page.clone())?; - return Ok(CursorResult::IO); - } + return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); @@ -557,7 +563,7 @@ impl BTreeCursor { let rightmost_pointer = contents.rightmost_pointer(); if let Some(rightmost_pointer) = rightmost_pointer { self.stack - .push_backwards(self.pager.read_page(rightmost_pointer as usize)?); + .push_backwards(self.read_page(rightmost_pointer as usize)?); continue; } } @@ -580,7 +586,7 @@ impl BTreeCursor { _left_child_page, _rowid, }) => { - let mem_page = self.pager.read_page(_left_child_page as usize)?; + let mem_page = self.read_page(_left_child_page as usize)?; self.stack.push_backwards(mem_page); continue; } @@ -618,7 +624,7 @@ impl BTreeCursor { // left child has: key 663, key 664, key 665 // we need to move to the previous parent (with e.g. key 662) when iterating backwards. 
self.stack.retreat(); - let mem_page = self.pager.read_page(left_child_page as usize)?; + let mem_page = self.read_page(left_child_page as usize)?; self.stack.push(mem_page); // use cell_index = i32::MAX to tell next loop to go to the end of the current page self.stack.set_cell_index(i32::MAX); @@ -743,7 +749,7 @@ impl BTreeCursor { let res = match &mut self.state { CursorState::None => { tracing::debug!("start reading overflow page payload_size={}", payload_size); - let page = self.pager.read_page(start_next_page as usize)?; + let page = self.read_page(start_next_page as usize)?; self.state = CursorState::Read(ReadPayloadOverflow::ProcessPage { payload: payload.to_vec(), next_page: start_next_page, @@ -756,12 +762,13 @@ impl BTreeCursor { payload, next_page, remaining_to_read, - page, + page: page_btree, }) => { - if page.is_locked() { + if page_btree.get().is_locked() { return Ok(CursorResult::IO); } tracing::debug!("reading overflow page {} {}", next_page, remaining_to_read); + let page = page_btree.get(); let contents = page.get_contents(); // The first four bytes of each overflow page are a big-endian integer which is the page number of the next page in the chain, or zero for the final page in the chain. 
let next = contents.read_u32_no_offset(0); @@ -779,8 +786,12 @@ impl BTreeCursor { std::mem::swap(payload, &mut payload_swap); CursorResult::Ok(payload_swap) } else { - let new_page = self.pager.read_page(next as usize)?; - *page = new_page; + let new_page = self.pager.read_page(next as usize).map(|page| { + Arc::new(BTreePageInner { + page: RefCell::new(page), + }) + })?; + *page_btree = new_page; *next_page = next; CursorResult::IO } @@ -871,9 +882,10 @@ impl BTreeCursor { return self.continue_payload_overflow_with_offset(buffer, self.usable_space()); } - let page = self.stack.top(); - return_if_locked_maybe_load!(self.pager, page); + let page_btree = self.stack.top(); + return_if_locked_maybe_load!(self.pager, page_btree); + let page = page_btree.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_idx = self.stack.current_cell_index() as usize - 1; @@ -918,7 +930,13 @@ impl BTreeCursor { local_amount = local_size as u32 - offset; } if is_write { - self.write_payload_to_page(offset, local_amount, payload, buffer, page.clone()); + self.write_payload_to_page( + offset, + local_amount, + payload, + buffer, + page_btree.clone(), + ); } else { self.read_payload_from_page(offset, local_amount, payload, buffer); } @@ -973,7 +991,7 @@ impl BTreeCursor { is_write, }) => { if *pages_left_to_skip == 0 { - let page = self.pager.read_page(*next_page as usize)?; + let page = self.read_page(*next_page as usize)?; return_if_locked_maybe_load!(self.pager, page); self.state = CursorState::ReadWritePayload(PayloadOverflowWithOffset::ProcessPage { @@ -988,8 +1006,9 @@ impl BTreeCursor { continue; } - let page = self.pager.read_page(*next_page as usize)?; + let page = self.read_page(*next_page as usize)?; return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); let contents = page.get_contents(); let next = contents.read_u32_no_offset(0); @@ -1018,17 +1037,17 @@ impl BTreeCursor { CursorState::ReadWritePayload(PayloadOverflowWithOffset::ProcessPage { 
next_page, remaining_to_read, - page, + page: page_btree, current_offset, buffer_offset, is_write, }) => { - if page.is_locked() { + if page_btree.get().is_locked() { self.state = CursorState::ReadWritePayload(PayloadOverflowWithOffset::ProcessPage { next_page: *next_page, remaining_to_read: *remaining_to_read, - page: page.clone(), + page: page_btree.clone(), current_offset: *current_offset, buffer_offset: *buffer_offset, is_write: *is_write, @@ -1037,6 +1056,7 @@ impl BTreeCursor { return Ok(CursorResult::IO); } + let page = page_btree.get(); let contents = page.get_contents(); let overflow_size = usable_space - 4; @@ -1054,7 +1074,7 @@ impl BTreeCursor { bytes_to_process, page_payload, buffer, - page.clone(), + page_btree.clone(), ); } else { self.read_payload_from_page( @@ -1081,7 +1101,7 @@ impl BTreeCursor { // Load next page *next_page = next; *current_offset = 0; // Reset offset for new page - *page = self.pager.read_page(next as usize)?; + *page_btree = self.read_page(next as usize)?; // Return IO to allow other operations return Ok(CursorResult::IO); @@ -1116,10 +1136,10 @@ impl BTreeCursor { num_bytes: u32, payload: &[u8], buffer: &mut Vec, - page: PageRef, + page: BTreePage, ) { - page.set_dirty(); - self.pager.add_dirty(page.get().id); + page.get().set_dirty(); + self.pager.add_dirty(page.get().get().id); // SAFETY: This is safe as long as the page is not evicted from the cache. 
let payload_mut = unsafe { std::slice::from_raw_parts_mut(payload.as_ptr() as *mut u8, payload.len()) }; @@ -1155,15 +1175,15 @@ impl BTreeCursor { let mem_page_rc = self.stack.top(); let cell_idx = self.stack.current_cell_index() as usize; - tracing::trace!("current id={} cell={}", mem_page_rc.get().id, cell_idx); - return_if_locked!(mem_page_rc); - if !mem_page_rc.is_loaded() { - self.pager.load_page(mem_page_rc.clone())?; - return Ok(CursorResult::IO); - } + tracing::trace!( + "current id={} cell={}", + mem_page_rc.get().get().id, + cell_idx + ); + return_if_locked_maybe_load!(self.pager, mem_page_rc); let mem_page = mem_page_rc.get(); - let contents = mem_page.contents.as_ref().unwrap(); + let contents = mem_page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); if cell_count == 0 || cell_idx == cell_count { @@ -1172,7 +1192,7 @@ impl BTreeCursor { match contents.rightmost_pointer() { Some(right_most_pointer) => { self.stack.advance(); - let mem_page = self.pager.read_page(right_most_pointer as usize)?; + let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); continue; } @@ -1216,7 +1236,7 @@ impl BTreeCursor { }) => { assert!(predicate.is_none()); self.stack.advance(); - let mem_page = self.pager.read_page(*_left_child_page as usize)?; + let mem_page = self.read_page(*_left_child_page as usize)?; self.stack.push(mem_page); continue; } @@ -1251,7 +1271,7 @@ impl BTreeCursor { payload_size, }) => { if !self.going_upwards { - let mem_page = self.pager.read_page(*left_child_page as usize)?; + let mem_page = self.read_page(*left_child_page as usize)?; self.stack.push(mem_page); continue; } @@ -1387,7 +1407,7 @@ impl BTreeCursor { /// Move the cursor to the root page of the btree. 
fn move_to_root(&mut self) { tracing::trace!("move_to_root({})", self.root_page); - let mem_page = self.pager.read_page(self.root_page).unwrap(); + let mem_page = self.read_page(self.root_page).unwrap(); self.stack.clear(); self.stack.push(mem_page); } @@ -1398,9 +1418,10 @@ impl BTreeCursor { loop { let mem_page = self.stack.top(); - let page_idx = mem_page.get().id; - let page = self.pager.read_page(page_idx)?; - return_if_locked!(page); + let page_idx = mem_page.get().get().id; + let page = self.read_page(page_idx)?; + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { if contents.cell_count() > 0 { @@ -1412,7 +1433,7 @@ impl BTreeCursor { match contents.rightmost_pointer() { Some(right_most_pointer) => { self.stack.set_cell_index(contents.cell_count() as i32 + 1); - let mem_page = self.pager.read_page(right_most_pointer as usize)?; + let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); continue; } @@ -1429,7 +1450,8 @@ impl BTreeCursor { let iter_dir = seek_op.iteration_direction(); 'outer: loop { let page = self.stack.top(); - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { return Ok(CursorResult::Ok(())); @@ -1457,14 +1479,14 @@ impl BTreeCursor { -1 + (iter_dir == IterationDirection::Forwards) as i32 * 2; self.stack .set_cell_index(leftmost_matching_cell as i32 + index_change); - let mem_page = self.pager.read_page(left_child_page as usize)?; + let mem_page = self.read_page(left_child_page as usize)?; self.stack.push(mem_page); continue 'outer; } self.stack.set_cell_index(cell_count as i32 + 1); match contents.rightmost_pointer() { Some(right_most_pointer) => { - let mem_page = self.pager.read_page(right_most_pointer as usize)?; + let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); continue 
'outer; } @@ -1524,7 +1546,8 @@ impl BTreeCursor { let iter_dir = cmp.iteration_direction(); 'outer: loop { let page = self.stack.top(); - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { return Ok(CursorResult::Ok(())); @@ -1540,7 +1563,7 @@ impl BTreeCursor { self.stack.set_cell_index(contents.cell_count() as i32 + 1); match contents.rightmost_pointer() { Some(right_most_pointer) => { - let mem_page = self.pager.read_page(right_most_pointer as usize)?; + let mem_page = self.read_page(right_most_pointer as usize)?; self.stack.push(mem_page); continue 'outer; } @@ -1580,7 +1603,7 @@ impl BTreeCursor { unreachable!("unexpected cell type: {:?}", matching_cell); }; - let mem_page = self.pager.read_page(*left_child_page as usize)?; + let mem_page = self.read_page(*left_child_page as usize)?; self.stack.push(mem_page); continue 'outer; } @@ -1686,7 +1709,8 @@ impl BTreeCursor { self.move_to_root(); return_if_io!(self.tablebtree_move_to(rowid, seek_op)); let page = self.stack.top(); - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); assert!( contents.is_leaf(), @@ -1837,8 +1861,9 @@ impl BTreeCursor { return_if_io!(self.indexbtree_move_to(key, seek_op)); let page = self.stack.top(); - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_count = contents.cell_count(); @@ -2070,7 +2095,8 @@ impl BTreeCursor { // get page and find cell let (cell_idx, page_type) = { - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); page.set_dirty(); self.pager.add_dirty(page.get().id); @@ -2088,8 +2114,8 @@ impl BTreeCursor { // if the cell index is less than the total cells, check: if its an existing // rowid, we are going to update / overwrite the cell - if cell_idx < 
page.get_contents().cell_count() { - match page.get_contents().cell_get( + if cell_idx < page.get().get_contents().cell_count() { + match page.get().get_contents().cell_get( cell_idx, payload_overflow_threshold_max(page_type, self.usable_space() as u16), payload_overflow_threshold_min(page_type, self.usable_space() as u16), @@ -2147,6 +2173,7 @@ impl BTreeCursor { // insert let overflow = { + let page = page.get(); let contents = page.get().contents.as_mut().unwrap(); tracing::debug!( "insert_into_page(overflow, cell_count={})", @@ -2220,6 +2247,7 @@ impl BTreeCursor { // is less than 2/3rds of the total usable space on the page // // https://github.com/sqlite/sqlite/blob/0aa95099f5003dc99f599ab77ac0004950b281ef/src/btree.c#L9064-L9071 + let current_page = current_page.get(); let page = current_page.get().contents.as_mut().unwrap(); let usable_space = self.usable_space(); let free_space = compute_free_space(page, usable_space as u16); @@ -2265,6 +2293,7 @@ impl BTreeCursor { WriteState::BalanceNonRoot => { let parent_page = self.stack.top(); return_if_locked_maybe_load!(self.pager, parent_page); + let parent_page = parent_page.get(); // If `move_to` moved to rightmost page, cell index will be out of bounds. Meaning cell_count+1. // In any other case, `move_to` will stay in the correct index. 
if self.stack.current_cell_index() as usize @@ -2287,7 +2316,7 @@ impl BTreeCursor { PageType::IndexInterior | PageType::TableInterior )); // Part 1: Find the sibling pages to balance - let mut pages_to_balance: [Option<PageRef>; 3] = [const { None }; 3]; + let mut pages_to_balance: [Option<BTreePage>; 3] = [const { None }; 3]; let number_of_cells_in_parent = parent_contents.cell_count() + parent_contents.overflow_cells.len(); @@ -2354,11 +2383,14 @@ impl BTreeCursor { let mut pgno: u32 = unsafe { right_pointer.cast::<u32>().read().swap_bytes() }; let current_sibling = sibling_pointer; for i in (0..=current_sibling).rev() { - let page = self.pager.read_page(pgno as usize)?; + let page = self.read_page(pgno as usize)?; #[cfg(debug_assertions)] { - return_if_locked!(page); - debug_validate_cells!(&page.get_contents(), self.usable_space() as u16); + return_if_locked!(page.get()); + debug_validate_cells!( + &page.get().get_contents(), + self.usable_space() as u16 + ); } pages_to_balance[i].replace(page); assert_eq!( @@ -2399,10 +2431,13 @@ impl BTreeCursor { let page_type_of_siblings = pages_to_balance[0] .as_ref() .unwrap() + .get() .get_contents() .page_type(); for page in pages_to_balance.iter().take(sibling_count) { - let contents = page.as_ref().unwrap().get_contents(); + return_if_locked_maybe_load!(self.pager, page.as_ref().unwrap()); + let page = page.as_ref().unwrap().get(); + let contents = page.get_contents(); debug_validate_cells!(&contents, self.usable_space() as u16); assert_eq!(contents.page_type(), page_type_of_siblings); } @@ -2431,12 +2466,14 @@ impl BTreeCursor { .pages_to_balance .iter() .take(balance_info.sibling_count) - .all(|page| !page.as_ref().unwrap().is_locked()); + .all(|page| !page.as_ref().unwrap().get().is_locked()); if !all_loaded { return Ok(CursorResult::IO); } // Now do real balancing - let parent_page = self.stack.top(); + let parent_page_btree = self.stack.top(); + let parent_page = parent_page_btree.get(); + let parent_contents = 
parent_page.get_contents(); let parent_is_root = !self.stack.has_parent(); @@ -2448,9 +2485,10 @@ impl BTreeCursor { /* 1. Get divider cells and max_cells */ let mut max_cells = 0; // we only need maximum 5 pages to balance 3 pages - let mut pages_to_balance_new: [Option<PageRef>; 5] = [const { None }; 5]; + let mut pages_to_balance_new: [Option<BTreePage>; 5] = [const { None }; 5]; for i in (0..balance_info.sibling_count).rev() { let sibling_page = balance_info.pages_to_balance[i].as_ref().unwrap(); + let sibling_page = sibling_page.get(); let sibling_contents = sibling_page.get_contents(); sibling_page.set_dirty(); self.pager.add_dirty(sibling_page.get().id); @@ -2511,6 +2549,7 @@ impl BTreeCursor { let page_type = balance_info.pages_to_balance[0] .as_ref() .unwrap() + .get() .get_contents() .page_type(); tracing::debug!("balance_non_root(page_type={:?})", page_type); @@ -2522,7 +2561,8 @@ impl BTreeCursor { .take(balance_info.sibling_count) .enumerate() { - let old_page_contents = old_page.as_ref().unwrap().get_contents(); + let old_page = old_page.as_ref().unwrap().get(); + let old_page_contents = old_page.get_contents(); debug_validate_cells!(&old_page_contents, self.usable_space() as u16); for cell_idx in 0..old_page_contents.cell_count() { let (cell_start, cell_len) = old_page_contents.cell_get_raw_region( @@ -2612,6 +2652,7 @@ impl BTreeCursor { for i in 0..balance_info.sibling_count { cell_array.number_of_cells_per_page[i] = count_cells_in_old_pages[i]; let page = &balance_info.pages_to_balance[i].as_ref().unwrap(); + let page = page.get(); let page_contents = page.get_contents(); let free_space = compute_free_space(page_contents, self.usable_space() as u16); @@ -2782,11 +2823,11 @@ impl BTreeCursor { for i in 0..sibling_count_new { if i < balance_info.sibling_count { let page = balance_info.pages_to_balance[i].as_ref().unwrap(); - page.set_dirty(); + page.get().set_dirty(); pages_to_balance_new[i].replace(page.clone()); } else { // FIXME: handle page cache is full - let page 
= self.pager.do_allocate_page(page_type, 0); + let page = self.allocate_page(page_type, 0); pages_to_balance_new[i].replace(page); // Since this page didn't exist before, we can set it to cells length as it // marks them as empty since it is a prefix sum of cells. @@ -2802,7 +2843,7 @@ impl BTreeCursor { .take(sibling_count_new) .enumerate() { - page_numbers[i] = page.as_ref().unwrap().get().id; + page_numbers[i] = page.as_ref().unwrap().get().get().id; } page_numbers.sort(); for (page, new_id) in pages_to_balance_new @@ -2812,10 +2853,10 @@ impl BTreeCursor { .zip(page_numbers.iter().rev().take(sibling_count_new)) { let page = page.as_ref().unwrap(); - if *new_id != page.get().id { - page.get().id = *new_id; + if *new_id != page.get().get().id { + page.get().get().id = *new_id; self.pager - .update_dirty_loaded_page_in_cache(*new_id, page.clone())?; + .update_dirty_loaded_page_in_cache(*new_id, page.get())?; } } @@ -2828,7 +2869,7 @@ impl BTreeCursor { for page in pages_to_balance_new.iter().take(sibling_count_new) { tracing::debug!( "balance_non_root(new_sibling page_id={})", - page.as_ref().unwrap().get().id + page.as_ref().unwrap().get().get().id ); } } @@ -2846,6 +2887,7 @@ impl BTreeCursor { .as_ref() .unwrap() .get() + .get() .id as u32; let rightmost_pointer = balance_info.rightmost_pointer; let rightmost_pointer = @@ -2867,11 +2909,12 @@ impl BTreeCursor { let last_page = balance_info.pages_to_balance[balance_info.sibling_count - 1] .as_ref() .unwrap(); - let right_pointer = last_page.get_contents().rightmost_pointer().unwrap(); + let right_pointer = last_page.get().get_contents().rightmost_pointer().unwrap(); let new_last_page = pages_to_balance_new[sibling_count_new - 1] .as_ref() .unwrap(); new_last_page + .get() .get_contents() .write_u32(offset::BTREE_RIGHTMOST_PTR, right_pointer); } @@ -2892,10 +2935,12 @@ impl BTreeCursor { // Interior // Make this page's rightmost pointer point to pointer of divider cell before modification let 
previous_pointer_divider = read_u32(&divider_cell, 0); - page.get_contents() + page.get() + .get_contents() .write_u32(offset::BTREE_RIGHTMOST_PTR, previous_pointer_divider); // divider cell now points to this page - new_divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes()); + new_divider_cell + .extend_from_slice(&(page.get().get().id as u32).to_be_bytes()); // now copy the rest of the divider cell: // Table Interior page: // * varint rowid @@ -2912,11 +2957,13 @@ impl BTreeCursor { divider_cell = &mut cell_array.cells[divider_cell_idx - 1]; let (_, n_bytes_payload) = read_varint(divider_cell)?; let (rowid, _) = read_varint(&divider_cell[n_bytes_payload..])?; - new_divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes()); + new_divider_cell + .extend_from_slice(&(page.get().get().id as u32).to_be_bytes()); write_varint_to_vec(rowid, &mut new_divider_cell); } else { // Leaf index - new_divider_cell.extend_from_slice(&(page.get().id as u32).to_be_bytes()); + new_divider_cell + .extend_from_slice(&(page.get().get().id as u32).to_be_bytes()); new_divider_cell.extend_from_slice(divider_cell); } @@ -2930,7 +2977,7 @@ impl BTreeCursor { i, left_pointer ); - assert_eq!(left_pointer, page.get().id as u32); + assert_eq!(left_pointer, page.get().get().id as u32); // FIXME: remove this lock assert!( left_pointer <= self.pager.db_header.lock().database_size, @@ -2951,7 +2998,7 @@ impl BTreeCursor { balance_info, parent_contents, i, - page, + &page.get(), ); } tracing::debug!( @@ -2965,9 +3012,9 @@ impl BTreeCursor { for page in pages_to_balance_new.iter().take(sibling_count_new) { let page = page.as_ref().unwrap(); assert!( - pages_pointed_to.contains(&(page.get().id as u32)), + pages_pointed_to.contains(&(page.get().get().id as u32)), "page {} not pointed to by divider cell or rightmost pointer", - page.get().id + page.get().get().id ); } } @@ -3025,6 +3072,7 @@ impl BTreeCursor { ) }; let page = pages_to_balance_new[page_idx].as_ref().unwrap(); + let page = 
page.get(); tracing::debug!("pre_edit_page(page={})", page.get().id); let page_contents = page.get_contents(); edit_page( @@ -3049,6 +3097,7 @@ impl BTreeCursor { // TODO: vacuum support let first_child_page = pages_to_balance_new[0].as_ref().unwrap(); + let first_child_page = first_child_page.get(); let first_child_contents = first_child_page.get_contents(); if parent_is_root && parent_contents.cell_count() == 0 @@ -3105,7 +3154,7 @@ impl BTreeCursor { #[cfg(debug_assertions)] self.post_balance_non_root_validation( - &parent_page, + &parent_page_btree, balance_info, parent_contents, pages_to_balance_new, @@ -3119,7 +3168,8 @@ impl BTreeCursor { // We have to free pages that are not used anymore for i in sibling_count_new..balance_info.sibling_count { let page = balance_info.pages_to_balance[i].as_ref().unwrap(); - self.pager.free_page(Some(page.clone()), page.get().id)?; + self.pager + .free_page(Some(page.get().clone()), page.get().get().id)?; } (WriteState::BalanceStart, Ok(CursorResult::Ok(()))) } @@ -3184,10 +3234,10 @@ impl BTreeCursor { #[cfg(debug_assertions)] fn post_balance_non_root_validation( &self, - parent_page: &PageRef, + parent_page: &BTreePage, balance_info: &mut BalanceInfo, parent_contents: &mut PageContent, - pages_to_balance_new: [Option>; 5], + pages_to_balance_new: [Option; 5], page_type: PageType, leaf_data: bool, mut cells_debug: Vec>, @@ -3214,9 +3264,9 @@ impl BTreeCursor { match cell { BTreeCell::TableInteriorCell(table_interior_cell) => { let left_child_page = table_interior_cell._left_child_page; - if left_child_page == parent_page.get().id as u32 { + if left_child_page == parent_page.get().get().id as u32 { tracing::error!("balance_non_root(parent_divider_points_to_same_page, page_id={}, cell_left_child_page={})", - parent_page.get().id, + parent_page.get().get().id, left_child_page, ); valid = false; @@ -3224,9 +3274,9 @@ impl BTreeCursor { } BTreeCell::IndexInteriorCell(index_interior_cell) => { let left_child_page = 
index_interior_cell.left_child_page; - if left_child_page == parent_page.get().id as u32 { + if left_child_page == parent_page.get().get().id as u32 { tracing::error!("balance_non_root(parent_divider_points_to_same_page, page_id={}, cell_left_child_page={})", - parent_page.get().id, + parent_page.get().get().id, left_child_page, ); valid = false; @@ -3242,6 +3292,7 @@ impl BTreeCursor { .enumerate() { let page = page.as_ref().unwrap(); + let page = page.get(); let contents = page.get_contents(); debug_validate_cells!(contents, self.usable_space() as u16); // Cells are distributed in order @@ -3295,7 +3346,7 @@ impl BTreeCursor { ); valid = false; } - if left_child_page == parent_page.get().id as u32 { + if left_child_page == parent_page.get().get().id as u32 { tracing::error!("balance_non_root(child_page_points_parent_of_child, page_id={}, cell_left_child_page={}, page_idx={})", page.get().id, left_child_page, @@ -3314,7 +3365,7 @@ impl BTreeCursor { ); valid = false; } - if left_child_page == parent_page.get().id as u32 { + if left_child_page == parent_page.get().get().id as u32 { tracing::error!("balance_non_root(child_page_points_parent_of_child, page_id={}, cell_left_child_page={}, page_idx={})", page.get().id, left_child_page, @@ -3367,10 +3418,12 @@ impl BTreeCursor { valid = false; } - if rightmost == page.get().id as u32 || rightmost == parent_page.get().id as u32 { + if rightmost == page.get().id as u32 + || rightmost == parent_page.get().get().id as u32 + { tracing::error!("balance_non_root(balance_shallower_rightmost_pointer, page_id={}, parent_page_id={}, rightmost={})", page.get().id, - parent_page.get().id, + parent_page.get().get().id, rightmost, ); valid = false; @@ -3662,27 +3715,29 @@ impl BTreeCursor { let is_page_1 = { let current_root = self.stack.top(); - current_root.get().id == 1 + current_root.get().get().id == 1 }; let offset = if is_page_1 { DATABASE_HEADER_SIZE } else { 0 }; - let root = self.stack.top(); + let root_btree = 
self.stack.top(); + let root = root_btree.get(); let root_contents = root.get_contents(); // FIXME: handle page cache is full - let child = self.pager.do_allocate_page(root_contents.page_type(), 0); + let child_btree = self.pager.do_allocate_page(root_contents.page_type(), 0); tracing::debug!( "balance_root(root={}, rightmost={}, page_type={:?})", root.get().id, - child.get().id, + child_btree.get().get().id, root.get_contents().page_type() ); self.pager.add_dirty(root.get().id); - self.pager.add_dirty(child.get().id); + self.pager.add_dirty(child_btree.get().get().id); let root_buf = root_contents.as_ptr(); + let child = child_btree.get(); let child_contents = child.get_contents(); let child_buf = child_contents.as_ptr(); let (root_pointer_start, root_pointer_len) = @@ -3724,8 +3779,8 @@ impl BTreeCursor { root_contents.overflow_cells.clear(); self.root_page = root.get().id; self.stack.clear(); - self.stack.push(root.clone()); - self.stack.push(child.clone()); + self.stack.push(root_btree.clone()); + self.stack.push(child_btree.clone()); } fn usable_space(&self) -> usize { @@ -3790,10 +3845,11 @@ impl BTreeCursor { self.move_to_root(); loop { let mem_page = self.stack.top(); - let page_id = mem_page.get().id; - let page = self.pager.read_page(page_id)?; - return_if_locked!(page); + let page_id = mem_page.get().get().id; + let page = self.read_page(page_id)?; + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if contents.is_leaf() { // set cursor just past the last cell to append @@ -3804,7 +3860,7 @@ impl BTreeCursor { match contents.rightmost_pointer() { Some(right_most_pointer) => { self.stack.set_cell_index(contents.cell_count() as i32 + 1); // invalid on interior - let child = self.pager.read_page(right_most_pointer as usize)?; + let child = self.read_page(right_most_pointer as usize)?; self.stack.push(child); } None => unreachable!("interior page must have rightmost pointer"), @@ -3971,10 +4027,10 
@@ impl BTreeCursor { match delete_state { DeleteState::Start => { let page = self.stack.top(); - page.set_dirty(); - self.pager.add_dirty(page.get().id); + page.get().set_dirty(); + self.pager.add_dirty(page.get().get().id); if matches!( - page.get_contents().page_type(), + page.get().get_contents().page_type(), PageType::TableLeaf | PageType::TableInterior ) { let _target_rowid = match self.has_record.get() { @@ -4010,6 +4066,7 @@ impl BTreeCursor { cell_idx -= 1; } + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); if cell_idx >= contents.cell_count() { return_corrupt!(format!( @@ -4054,7 +4111,8 @@ impl BTreeCursor { return_if_io!(self.clear_overflow_pages(&cell)); let page = self.stack.top(); - let contents = page.get().contents.as_ref().unwrap(); + let page = page.get(); + let contents = page.get_contents(); let delete_info = self.state.mut_delete_info().unwrap(); if !contents.is_leaf() { @@ -4088,15 +4146,16 @@ impl BTreeCursor { return_if_locked_maybe_load!(self.pager, leaf_page); assert!( matches!( - leaf_page.get_contents().page_type(), + leaf_page.get().get_contents().page_type(), PageType::TableLeaf | PageType::IndexLeaf ), "self.prev should have returned a leaf page" ); let parent_page = self.stack.parent_page().unwrap(); - assert!(parent_page.is_loaded(), "parent page"); + assert!(parent_page.get().is_loaded(), "parent page"); + let leaf_page = leaf_page.get(); let leaf_contents = leaf_page.get().contents.as_ref().unwrap(); // The index of the cell to removed must be the last one. 
let leaf_cell_idx = leaf_contents.cell_count() - 1; @@ -4113,9 +4172,10 @@ impl BTreeCursor { self.usable_space(), )?; - parent_page.set_dirty(); - self.pager.add_dirty(parent_page.get().id); + parent_page.get().set_dirty(); + self.pager.add_dirty(parent_page.get().get().id); + let parent_page = parent_page.get(); let parent_contents = parent_page.get().contents.as_mut().unwrap(); // Create an interior cell from a predecessor @@ -4149,6 +4209,7 @@ impl BTreeCursor { let page = self.stack.top(); return_if_locked_maybe_load!(self.pager, page); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let free_space = compute_free_space(contents, self.usable_space() as u16); let needs_balancing = free_space as usize * 3 > self.usable_space() * 2; @@ -4288,8 +4349,9 @@ impl BTreeCursor { let _ = return_if_io!(self.move_to(SeekKey::TableRowId(*int_key as u64), SeekOp::EQ)); let page = self.stack.top(); // TODO(pere): request load - return_if_locked!(page); + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); // find cell @@ -4346,9 +4408,10 @@ impl BTreeCursor { self.overflow_state = None; return Err(LimboError::Corrupt("Invalid overflow page number".into())); } - let page = self.pager.read_page(next_page as usize)?; - return_if_locked!(page); + let page = self.read_page(next_page as usize)?; + return_if_locked!(page.get()); + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let next = contents.read_u32(0); @@ -4420,8 +4483,9 @@ impl BTreeCursor { } DestroyState::ProcessPage => { let page = self.stack.top(); - assert!(page.is_loaded()); // page should be loaded at this time + assert!(page.get().is_loaded()); // page should be loaded at this time + let page = page.get(); let contents = page.get().contents.as_ref().unwrap(); let cell_idx = self.stack.current_cell_index(); @@ -4439,8 +4503,7 @@ impl BTreeCursor { // Non-leaf page which has processed all children but 
not it's potential right child (false, n) if n == contents.cell_count() as i32 => { if let Some(rightmost) = contents.rightmost_pointer() { - let rightmost_page = - self.pager.read_page(rightmost as usize)?; + let rightmost_page = self.read_page(rightmost as usize)?; self.stack.advance(); self.stack.push(rightmost_page); let destroy_info = self.state.mut_destroy_info().expect( @@ -4509,7 +4572,7 @@ impl BTreeCursor { BTreeCell::IndexInteriorCell(cell) => cell.left_child_page, _ => panic!("expected interior cell"), }; - let child_page = self.pager.read_page(child_page_id as usize)?; + let child_page = self.read_page(child_page_id as usize)?; self.stack.advance(); self.stack.push(child_page); let destroy_info = self.state.mut_destroy_info().expect( @@ -4526,9 +4589,8 @@ impl BTreeCursor { CursorResult::Ok(_) => match cell { // For an index interior cell, clear the left child page now that overflow pages have been cleared BTreeCell::IndexInteriorCell(index_int_cell) => { - let child_page = self - .pager - .read_page(index_int_cell.left_child_page as usize)?; + let child_page = + self.read_page(index_int_cell.left_child_page as usize)?; self.stack.advance(); self.stack.push(child_page); let destroy_info = self.state.mut_destroy_info().expect( @@ -4552,9 +4614,9 @@ impl BTreeCursor { } DestroyState::FreePage => { let page = self.stack.top(); - let page_id = page.get().id; + let page_id = page.get().get().id; - self.pager.free_page(Some(page), page_id)?; + self.pager.free_page(Some(page.get()), page_id)?; if self.stack.has_parent() { self.stack.pop(); @@ -4578,12 +4640,12 @@ impl BTreeCursor { pub fn overwrite_cell( &mut self, - page_ref: PageRef, + page_ref: BTreePage, cell_idx: usize, record: &ImmutableRecord, ) -> Result> { // build the new payload - let page_type = page_ref.get().contents.as_ref().unwrap().page_type(); + let page_type = page_ref.get().get().contents.as_ref().unwrap().page_type(); let mut new_payload = Vec::with_capacity(record.len()); let 
CursorHasRecord::Yes { rowid } = self.has_record.get() else { panic!("cursor should be pointing to a record"); @@ -4599,6 +4661,7 @@ impl BTreeCursor { // figure out old cell offset & size let (old_offset, old_local_size) = { + let page_ref = page_ref.get(); let page = page_ref.get().contents.as_ref().unwrap(); page.cell_get_raw_region( cell_idx, @@ -4615,12 +4678,12 @@ impl BTreeCursor { } else { // doesn't fit, drop it and insert a new one drop_cell( - page_ref.get_contents(), + page_ref.get().get_contents(), cell_idx, self.usable_space() as u16, )?; insert_into_cell( - page_ref.get_contents(), + page_ref.get().get_contents(), &new_payload, cell_idx, self.usable_space() as u16, @@ -4631,11 +4694,12 @@ impl BTreeCursor { pub fn overwrite_content( &mut self, - page_ref: PageRef, + page_ref: BTreePage, dest_offset: usize, new_payload: &[u8], ) -> Result> { - return_if_locked!(page_ref); + return_if_locked!(page_ref.get()); + let page_ref = page_ref.get(); let buf = page_ref.get().contents.as_mut().unwrap().as_ptr(); buf[dest_offset..dest_offset + new_payload.len()].copy_from_slice(&new_payload); @@ -4681,7 +4745,7 @@ impl BTreeCursor { mem_page_rc = self.stack.top(); return_if_locked_maybe_load!(self.pager, mem_page_rc); mem_page = mem_page_rc.get(); - contents = mem_page.contents.as_ref().unwrap(); + contents = mem_page.get().contents.as_ref().unwrap(); /* If this is a leaf page or the tree is not an int-key tree, then ** this page contains countable entries. 
Increment the entry counter @@ -4710,7 +4774,7 @@ impl BTreeCursor { mem_page_rc = self.stack.top(); return_if_locked_maybe_load!(self.pager, mem_page_rc); mem_page = mem_page_rc.get(); - contents = mem_page.contents.as_ref().unwrap(); + contents = mem_page.get().contents.as_ref().unwrap(); let cell_idx = self.stack.current_cell_index() as usize; @@ -4730,7 +4794,7 @@ impl BTreeCursor { // should be safe as contents is not a leaf page let right_most_pointer = contents.rightmost_pointer().unwrap(); self.stack.advance(); - let mem_page = self.pager.read_page(right_most_pointer as usize)?; + let mem_page = self.read_page(right_most_pointer as usize)?; self.going_upwards = false; self.stack.push(mem_page); } else { @@ -4757,7 +4821,7 @@ impl BTreeCursor { left_child_page, .. }) => { self.stack.advance(); - let mem_page = self.pager.read_page(left_child_page as usize)?; + let mem_page = self.read_page(left_child_page as usize)?; self.going_upwards = false; self.stack.push(mem_page); } @@ -4771,7 +4835,7 @@ impl BTreeCursor { pub fn save_context(&mut self) { if let CursorHasRecord::Yes { rowid } = self.has_record.get() { self.valid_state = CursorValidState::RequireSeek; - match self.stack.top().get_contents().page_type() { + match self.stack.top().get().get_contents().page_type() { PageType::TableInterior | PageType::TableLeaf => { self.context = Some(CursorContext::TableRowId(rowid.expect( "table cells should have a rowid since we don't support WITHOUT ROWID tables", @@ -4816,6 +4880,18 @@ impl BTreeCursor { pub fn collations(&self) -> &[CollationSeq] { &self.collations } + + pub fn read_page(&self, page_idx: usize) -> Result { + self.pager.read_page(page_idx).map(|page| { + Arc::new(BTreePageInner { + page: RefCell::new(page), + }) + }) + } + + pub fn allocate_page(&self, page_type: PageType, offset: usize) -> BTreePage { + self.pager.do_allocate_page(page_type, offset) + } } #[cfg(debug_assertions)] @@ -4836,7 +4912,7 @@ struct PageStack { /// Pointer to the current 
page being consumed current_page: Cell, /// List of pages in the stack. Root page will be in index 0 - stack: RefCell<[Option; BTCURSOR_MAX_DEPTH + 1]>, + stack: RefCell<[Option; BTCURSOR_MAX_DEPTH + 1]>, /// List of cell indices in the stack. /// cell_indices[current_page] is the current cell index being consumed. Similarly /// cell_indices[current_page-1] is the cell index of the parent of the current page @@ -4857,11 +4933,11 @@ impl PageStack { } /// Push a new page onto the stack. /// This effectively means traversing to a child page. - fn _push(&self, page: PageRef, starting_cell_idx: i32) { + fn _push(&self, page: BTreePage, starting_cell_idx: i32) { tracing::trace!( "pagestack::push(current={}, new_page_id={})", self.current_page.get(), - page.get().id + page.get().get().id ); self.increment_current(); let current = self.current_page.get(); @@ -4874,11 +4950,11 @@ impl PageStack { self.cell_indices.borrow_mut()[current as usize] = starting_cell_idx; } - fn push(&self, page: PageRef) { + fn push(&self, page: BTreePage) { self._push(page, 0); } - fn push_backwards(&self, page: PageRef) { + fn push_backwards(&self, page: BTreePage) { self._push(page, i32::MAX); } @@ -4895,7 +4971,7 @@ impl PageStack { /// Get the top page on the stack. /// This is the page that is currently being traversed. 
- fn top(&self) -> PageRef { + fn top(&self) -> BTreePage { let page = self.stack.borrow()[self.current()] .as_ref() .unwrap() @@ -4903,7 +4979,7 @@ impl PageStack { tracing::trace!( "pagestack::top(current={}, page_id={})", self.current(), - page.get().id + page.get().get().id ); page } @@ -4974,7 +5050,7 @@ impl PageStack { fn clear(&self) { self.current_page.set(-1); } - pub fn parent_page(&self) -> Option { + pub fn parent_page(&self) -> Option { if self.current_page.get() > 0 { Some( self.stack.borrow()[self.current() - 1] @@ -5005,6 +5081,12 @@ impl CellArray { } } +impl BTreePageInner { + pub fn get(&self) -> PageRef { + self.page.borrow().clone() + } +} + /// Try to find a free block available and allocate it if found fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> Result { // NOTE: freelist is in ascending order of keys and pc @@ -5060,11 +5142,15 @@ fn find_free_cell(page_ref: &PageContent, usable_space: u16, amount: usize) -> R Ok(0) } -pub fn btree_init_page(page: &PageRef, page_type: PageType, offset: usize, usable_space: u16) { +pub fn btree_init_page(page: &BTreePage, page_type: PageType, offset: usize, usable_space: u16) { // setup btree page let contents = page.get(); - tracing::debug!("btree_init_page(id={}, offset={})", contents.id, offset); - let contents = contents.contents.as_mut().unwrap(); + tracing::debug!( + "btree_init_page(id={}, offset={})", + contents.get().id, + offset + ); + let contents = contents.get().contents.as_mut().unwrap(); contents.offset = offset; let id = page_type as u8; contents.write_u8(offset::BTREE_PAGE_TYPE, id); @@ -5877,7 +5963,6 @@ mod tests { compute_free_space, fill_cell_payload, payload_overflow_threshold_max, payload_overflow_threshold_min, }, - pager::PageRef, sqlite3_ondisk::{BTreeCell, PageContent, PageType}, }, types::Value, @@ -5887,7 +5972,7 @@ mod tests { use super::{btree_init_page, defragment_page, drop_cell, insert_into_cell}; #[allow(clippy::arc_with_non_send_sync)] - 
fn get_page(id: usize) -> PageRef { + fn get_page(id: usize) -> BTreePage { let page = Arc::new(Page::new(id)); let drop_fn = Rc::new(|_| {}); @@ -5899,6 +5984,9 @@ mod tests { ))), ); page.get().contents.replace(inner); + let page = Arc::new(BTreePageInner { + page: RefCell::new(page), + }); btree_init_page(&page, PageType::TableLeaf, 0, 4096); page @@ -5958,6 +6046,7 @@ mod tests { let db = get_database(); let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; let record = ImmutableRecord::from_registers(&[Register::Value(Value::Integer(1))]); @@ -5981,6 +6070,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -6011,9 +6101,12 @@ mod tests { fn validate_btree(pager: Rc, page_idx: usize) -> (usize, bool) { let cursor = BTreeCursor::new_table(None, pager.clone(), page_idx); - let page = pager.read_page(page_idx).unwrap(); + let page = cursor.read_page(page_idx).unwrap(); + while page.get().is_locked() { + pager.io.run_once().unwrap(); + } let page = page.get(); - let contents = page.contents.as_ref().unwrap(); + let contents = page.get().contents.as_ref().unwrap(); let page_type = contents.page_type(); let mut previous_key = None; let mut valid = true; @@ -6034,8 +6127,12 @@ mod tests { BTreeCell::TableInteriorCell(TableInteriorCell { _left_child_page, .. 
}) => { - child_pages.push(pager.read_page(_left_child_page as usize).unwrap()); - if _left_child_page == page.id as u32 { + let child_page = cursor.read_page(_left_child_page as usize).unwrap(); + while child_page.get().is_locked() { + pager.io.run_once().unwrap(); + } + child_pages.push(child_page); + if _left_child_page == page.get().id as u32 { valid = false; tracing::error!( "left child page is the same as parent {}", @@ -6084,10 +6181,26 @@ mod tests { valid = false; } } - let first_page_type = child_pages.first().map(|p| p.get_contents().page_type()); + let first_page_type = child_pages.first().map(|p| { + if !p.get().is_loaded() { + let new_page = pager.read_page(p.get().get().id).unwrap(); + p.page.replace(new_page); + } + while p.get().is_locked() { + pager.io.run_once().unwrap(); + } + p.get().get_contents().page_type() + }); if let Some(child_type) = first_page_type { for page in child_pages.iter().skip(1) { - if page.get_contents().page_type() != child_type { + if !page.get().is_loaded() { + let new_page = pager.read_page(page.get().get().id).unwrap(); + page.page.replace(new_page); + } + while page.get().is_locked() { + pager.io.run_once().unwrap(); + } + if page.get().get_contents().page_type() != child_type { tracing::error!("child pages have different types"); valid = false; } @@ -6101,9 +6214,12 @@ mod tests { fn format_btree(pager: Rc, page_idx: usize, depth: usize) -> String { let cursor = BTreeCursor::new_table(None, pager.clone(), page_idx); - let page = pager.read_page(page_idx).unwrap(); + let page = cursor.read_page(page_idx).unwrap(); + while page.get().is_locked() { + pager.io.run_once().unwrap(); + } let page = page.get(); - let contents = page.contents.as_ref().unwrap(); + let contents = page.get().contents.as_ref().unwrap(); let page_type = contents.page_type(); let mut current = Vec::new(); let mut child = Vec::new(); @@ -6179,8 +6295,11 @@ mod tests { let pager = Rc::new(pager); // FIXME: handle page cache is full let page1 = 
pager.allocate_page().unwrap(); + let page1 = Arc::new(BTreePageInner { + page: RefCell::new(page1), + }); btree_init_page(&page1, PageType::TableLeaf, 0, 4096); - (pager, page1.get().id) + (pager, page1.get().get().id) } #[test] @@ -6496,6 +6615,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -6692,13 +6812,14 @@ mod tests { .write_page(current_page as usize, buf.clone(), c)?; pager.io.run_once()?; - let page = cursor.pager.read_page(current_page as usize)?; - while page.is_locked() { + let page = cursor.read_page(current_page as usize)?; + while page.get().is_locked() { cursor.pager.io.run_once()?; } { - let contents = page.get().contents.as_mut().unwrap(); + let page = page.get(); + let contents = page.get_contents(); let next_page = if current_page < 4 { current_page + 1 @@ -6739,8 +6860,8 @@ mod tests { let trunk_page_id = db_header.lock().freelist_trunk_page; if trunk_page_id > 0 { // Verify trunk page structure - let trunk_page = cursor.pager.read_page(trunk_page_id as usize)?; - if let Some(contents) = trunk_page.get().contents.as_ref() { + let trunk_page = cursor.read_page(trunk_page_id as usize)?; + if let Some(contents) = trunk_page.get().get().contents.as_ref() { // Read number of leaf pages in trunk let n_leaf = contents.read_u32(4); assert!(n_leaf > 0, "Trunk page should have leaf entries"); @@ -6819,34 +6940,33 @@ mod tests { ); // Initialize page 2 as a root page (interior) - let root_page = cursor.pager.read_page(2)?; + let root_page = cursor.read_page(2)?; { btree_init_page(&root_page, PageType::TableInterior, 0, 512); // Use proper page size } // Allocate two leaf pages // FIXME: handle page cache is full - let page3 = cursor.pager.allocate_page()?; - btree_init_page(&page3, PageType::TableLeaf, 0, 512); + let page3 = cursor.allocate_page(PageType::TableLeaf, 0); // FIXME: handle page cache is full - let page4 = 
cursor.pager.allocate_page()?; - btree_init_page(&page4, PageType::TableLeaf, 0, 512); + let page4 = cursor.allocate_page(PageType::TableLeaf, 0); // Configure the root page to point to the two leaf pages { + let root_page = root_page.get(); let contents = root_page.get().contents.as_mut().unwrap(); // Set rightmost pointer to page4 - contents.write_u32(offset::BTREE_RIGHTMOST_PTR, page4.get().id as u32); + contents.write_u32(offset::BTREE_RIGHTMOST_PTR, page4.get().get().id as u32); // Create a cell with pointer to page3 let cell_content = vec![ // First 4 bytes: left child pointer (page3) - (page3.get().id >> 24) as u8, - (page3.get().id >> 16) as u8, - (page3.get().id >> 8) as u8, - page3.get().id as u8, + (page3.get().get().id >> 24) as u8, + (page3.get().get().id >> 16) as u8, + (page3.get().get().id >> 8) as u8, + page3.get().get().id as u8, // Next byte: rowid as varint (simple value 100) 100, ]; @@ -6857,6 +6977,7 @@ mod tests { // Add a simple record to each leaf page for page in [&page3, &page4] { + let page = page.get(); let contents = page.get().contents.as_mut().unwrap(); // Simple record with just a rowid and payload @@ -6898,6 +7019,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -6938,6 +7060,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -6984,6 +7107,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -7062,6 +7186,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let header_size = 8; @@ -7231,6 +7356,7 @@ mod tests { let db = get_database(); let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = 
page.get_contents(); let header_size = 8; let usable_space = 4096; @@ -7247,6 +7373,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let usable_space = 4096; @@ -7272,6 +7399,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let usable_space = 4096; @@ -7305,6 +7433,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let usable_space = 4096; @@ -7340,6 +7469,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let usable_space = 4096; @@ -7362,6 +7492,7 @@ mod tests { let conn = db.connect().unwrap(); let page = get_page(2); + let page = page.get(); let page = page.get_contents(); let usable_space = 4096; @@ -7400,6 +7531,7 @@ mod tests { let defragment = |page| { defragment_page(page, usable_space); }; + let page = page.get(); defragment(page.get_contents()); defragment(page.get_contents()); insert(0, page.get_contents()); @@ -7442,13 +7574,14 @@ mod tests { let record = ImmutableRecord::from_registers(&[Register::Value(Value::Integer(0))]); let mut payload: Vec = Vec::new(); fill_cell_payload( - page.get_contents().page_type(), + page.get().get_contents().page_type(), Some(0), &mut payload, &record, 4096, conn.pager.clone(), ); + let page = page.get(); insert(0, page.get_contents()); defragment(page.get_contents()); insert(0, page.get_contents()); @@ -7515,19 +7648,19 @@ mod tests { ImmutableRecord::from_registers(&[Register::Value(Value::Blob(vec![0; 3600]))]); let mut payload: Vec = Vec::new(); fill_cell_payload( - page.get_contents().page_type(), + page.get().get_contents().page_type(), Some(0), &mut payload, &record, 4096, conn.pager.clone(), ); - insert_into_cell(page.get_contents(), &payload, 0, 4096).unwrap(); - let free = 
compute_free_space(page.get_contents(), usable_space); + insert_into_cell(page.get().get_contents(), &payload, 0, 4096).unwrap(); + let free = compute_free_space(page.get().get_contents(), usable_space); let total_size = payload.len() + 2; assert_eq!( free, - usable_space - page.get_contents().header_size() as u16 - total_size as u16 + usable_space - page.get().get_contents().header_size() as u16 - total_size as u16 ); dbg!(free); } @@ -7851,7 +7984,11 @@ mod tests { let (pager, _) = empty_btree(); let page_type = PageType::TableLeaf; let page = pager.allocate_page().unwrap(); + let page = Arc::new(BTreePageInner { + page: RefCell::new(page), + }); btree_init_page(&page, page_type, 0, pager.usable_space() as u16); + let page = page.get(); let mut size = (rng.next_u64() % 100) as u16; let mut i = 0; // add a bunch of cells diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 0e6f718da..2d6fd5637 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -1,5 +1,6 @@ use crate::fast_lock::SpinLock; use crate::result::LimboResult; +use crate::storage::btree::BTreePageInner; use crate::storage::buffer_pool::BufferPool; use crate::storage::database::DatabaseStorage; use crate::storage::sqlite3_ondisk::{self, DatabaseHeader, PageContent, PageType}; @@ -13,6 +14,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use tracing::trace; +use super::btree::BTreePage; use super::page_cache::{CacheError, CacheResizeResult, DumbLruPageCache, PageCacheKey}; use super::wal::{CheckpointMode, CheckpointStatus}; @@ -222,7 +224,7 @@ impl Pager { _ => unreachable!("Invalid flags state"), }; let page = self.do_allocate_page(page_type, 0); - let id = page.get().id; + let id = page.get().get().id; id as u32 } @@ -244,13 +246,16 @@ impl Pager { /// Allocate a new page to the btree via the pager. /// This marks the page as dirty and writes the page header. 
// FIXME: handle no room in page cache - pub fn do_allocate_page(&self, page_type: PageType, offset: usize) -> PageRef { + pub fn do_allocate_page(&self, page_type: PageType, offset: usize) -> BTreePage { let page = self.allocate_page().unwrap(); + let page = Arc::new(BTreePageInner { + page: RefCell::new(page), + }); crate::btree_init_page(&page, page_type, offset, self.usable_space() as u16); tracing::debug!( "do_allocate_page(id={}, page_type={:?})", - page.get().id, - page.get_contents().page_type() + page.get().get().id, + page.get().get_contents().page_type() ); page } @@ -368,61 +373,6 @@ impl Pager { Ok(page) } - /// Loads pages if not loaded - pub fn load_page(&self, page: PageRef) -> Result<()> { - let id = page.get().id; - trace!("load_page(page_idx = {})", id); - let mut page_cache = self.page_cache.write(); - page.set_locked(); - let max_frame = match &self.wal { - Some(wal) => wal.borrow().get_max_frame(), - None => 0, - }; - let page_key = PageCacheKey::new(id, Some(max_frame)); - if let Some(wal) = &self.wal { - if let Some(frame_id) = wal.borrow().find_frame(id as u64)? 
{ - wal.borrow() - .read_frame(frame_id, page.clone(), self.buffer_pool.clone())?; - { - page.set_uptodate(); - } - match page_cache.insert(page_key, page.clone()) { - Err(CacheError::KeyExists) => {} // Exists but same page, not error - Err(CacheError::Full) => return Err(LimboError::CacheFull), - Err(e) => { - return Err(LimboError::InternalError(format!( - "Failed to insert page into cache during load: {:?}", - e - ))) - } - Ok(_) => {} - } - return Ok(()); - } - } - - match page_cache.insert(page_key, page.clone()) { - Err(CacheError::KeyExists) => {} // Ensures same page - Err(CacheError::Full) => return Err(LimboError::CacheFull), - Err(e) => { - return Err(LimboError::InternalError(format!( - "Failed to insert page into cache during load: {:?}", - e - ))) - } - Ok(_) => {} - }; - - sqlite3_ondisk::begin_read_page( - self.db_file.clone(), - self.buffer_pool.clone(), - page.clone(), - id, - )?; - - Ok(()) - } - /// Writes the database header. pub fn write_database_header(&self, header: &DatabaseHeader) { sqlite3_ondisk::begin_write_database_header(header, self).expect("failed to write header"); @@ -464,6 +414,8 @@ impl Pager { let max_frame_after_append = self.wal.as_ref().map(|wal| { wal.borrow().get_max_frame() + self.dirty_pages.borrow().len() as u64 }); + tracing::error!("start flush"); + tracing::error!("pages={:?}", self.dirty_pages.borrow()); for page_id in self.dirty_pages.borrow().iter() { let mut cache = self.page_cache.write(); let page_key = PageCacheKey::new(*page_id, Some(max_frame));