From cd2b61d838af62a836d758d584bfd764b90d3cf7 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 11 Nov 2024 10:27:29 +0100 Subject: [PATCH 01/22] btree: cursor with lineal stack structure Removed MemPage from the code in favor of an array to encode the stack of the cursor. This is both simpler and better in terms of memory access. --- core/storage/btree.rs | 514 ++++++++++++++++++++++-------------------- core/storage/pager.rs | 15 +- test/src/lib.rs | 1 + 3 files changed, 273 insertions(+), 257 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 2dfb656d9..79d198968 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -24,32 +24,6 @@ const BTREE_HEADER_OFFSET_CELL_CONTENT: usize = 5; /* pointer to first byte of c const BTREE_HEADER_OFFSET_FRAGMENTED: usize = 7; /* number of fragmented bytes -> u8 */ const BTREE_HEADER_OFFSET_RIGHTMOST: usize = 8; /* if internalnode, pointer right most pointer (saved separately from cells) -> u32 */ -#[derive(Debug)] -pub struct MemPage { - parent: Option>, - page_idx: usize, - cell_idx: RefCell, -} - -impl MemPage { - pub fn new(parent: Option>, page_idx: usize, cell_idx: usize) -> Self { - Self { - parent, - page_idx, - cell_idx: RefCell::new(cell_idx), - } - } - - pub fn cell_idx(&self) -> usize { - *self.cell_idx.borrow() - } - - pub fn advance(&self) { - let mut cell_idx = self.cell_idx.borrow_mut(); - *cell_idx += 1; - } -} - #[derive(Debug)] enum WriteState { Start, @@ -61,24 +35,37 @@ enum WriteState { struct WriteInfo { state: WriteState, - current_page: RefCell, Rc>)>>, - parent_page: RefCell, Rc>)>>, - new_pages: RefCell, Rc>)>>, + new_pages: RefCell>>>, scratch_cells: RefCell>, rightmost_pointer: RefCell>, page_copy: RefCell>, // this holds the copy a of a page needed for buffer references } +/* TODO(Pere) +** Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than +** this will be declared corrupt. This value is calculated based on a +** maximum database size of 2^31 pages a minimum fanout of 2 for a +** root-node and 3 for all other internal nodes. +** +** If a tree that appears to be taller than this is encountered, it is +** assumed that the database is corrupt. +*/ +pub const BTCURSOR_MAX_DEPTH: usize = 20; + pub struct BTreeCursor { pager: Rc, root_page: usize, - page: RefCell>>, rowid: RefCell>, record: RefCell>, null_flag: bool, database_header: Rc>, going_upwards: bool, write_info: WriteInfo, + + current_page: RefCell, + cell_indices: RefCell<[usize; BTCURSOR_MAX_DEPTH + 1]>, + stack: RefCell<[Option>>; BTCURSOR_MAX_DEPTH + 1]>, // TODO(pere) stack of cell idx + // TODO(pere) stack of pages } impl BTreeCursor { @@ -90,7 +77,6 @@ impl BTreeCursor { Self { pager, root_page, - page: RefCell::new(None), rowid: RefCell::new(None), record: RefCell::new(None), null_flag: false, @@ -98,13 +84,14 @@ impl BTreeCursor { going_upwards: false, write_info: WriteInfo { state: WriteState::Start, - current_page: RefCell::new(None), - parent_page: RefCell::new(None), new_pages: RefCell::new(Vec::with_capacity(4)), scratch_cells: RefCell::new(Vec::new()), rightmost_pointer: RefCell::new(None), page_copy: RefCell::new(None), }, + current_page: RefCell::new(-1), + cell_indices: RefCell::new([0; BTCURSOR_MAX_DEPTH + 1]), + stack: RefCell::new([const { None }; BTCURSOR_MAX_DEPTH + 1]), } } @@ -125,38 +112,57 @@ impl BTreeCursor { predicate: Option<(SeekKey<'_>, SeekOp)>, ) -> Result, Option)>> { loop { - let mem_page = self.get_mem_page(); - let page_idx = mem_page.page_idx; - let page = self.pager.read_page(page_idx)?; - let page = RefCell::borrow(&page); - if page.is_locked() { + let mem_page_rc = self.top_from_stack(); + let cell_idx = self.current_index(); + + let mem_page = RefCell::borrow(&mem_page_rc); + debug!("current id={} cell={}", mem_page.id, cell_idx); + if mem_page.is_locked() { + // TODO(pere): request load page return Ok(CursorResult::IO); } - let page = page.contents.read().unwrap(); + let page = mem_page.contents.read().unwrap(); let page = page.as_ref().unwrap(); - if mem_page.cell_idx() >= page.cell_count() { - let parent = mem_page.parent.clone(); + if cell_idx == page.cell_count() { + // do rightmost + let has_parent = *self.current_page.borrow() > 0; + self.advance(); match page.rightmost_pointer() { Some(right_most_pointer) => { - let mem_page = MemPage::new(parent.clone(), right_most_pointer as usize, 0); - self.page.replace(Some(Rc::new(mem_page))); + let mem_page = self.pager.read_page(right_most_pointer as usize)?; + self.push_to_stack(mem_page); continue; } - None => match parent { - Some(ref parent) => { + None => { + if has_parent { + debug!("moving simple upwards"); self.going_upwards = true; - self.page.replace(Some(parent.clone())); + self.pop_from_stack(); continue; - } - None => { + } else { return Ok(CursorResult::Ok((None, None))); } - }, + } } } + + if cell_idx == page.cell_count() + 1 { + // end + let has_parent = *self.current_page.borrow() > 0; + if has_parent { + debug!("moving upwards"); + self.going_upwards = true; + self.pop_from_stack(); + continue; + } else { + return Ok(CursorResult::Ok((None, None))); + } + } + assert!(cell_idx < page.cell_count()); + let cell = page.cell_get( - mem_page.cell_idx(), + cell_idx, self.pager.clone(), self.max_local(page.page_type()), self.min_local(page.page_type()), @@ -168,10 +174,9 @@ impl BTreeCursor { _rowid, }) => { assert!(predicate.is_none()); - mem_page.advance(); - let mem_page = - MemPage::new(Some(mem_page.clone()), *_left_child_page as usize, 0); - self.page.replace(Some(Rc::new(mem_page))); + self.advance(); + let mem_page = self.pager.read_page(*_left_child_page as usize)?; + self.push_to_stack(mem_page); continue; } BTreeCell::TableLeafCell(TableLeafCell { @@ -180,7 +185,7 @@ impl BTreeCursor { first_overflow_page: _, }) => { assert!(predicate.is_none()); - mem_page.advance(); + self.advance(); let record = crate::storage::sqlite3_ondisk::read_record(_payload)?; return Ok(CursorResult::Ok((Some(*_rowid), Some(record)))); } @@ -190,14 +195,13 @@ impl BTreeCursor { .. }) => { if !self.going_upwards { - let mem_page = - MemPage::new(Some(mem_page.clone()), *left_child_page as usize, 0); - self.page.replace(Some(Rc::new(mem_page))); + let mem_page = self.pager.read_page(*left_child_page as usize)?; + self.push_to_stack(mem_page); continue; } self.going_upwards = false; - mem_page.advance(); + self.advance(); let record = crate::storage::sqlite3_ondisk::read_record(payload)?; if predicate.is_none() { @@ -228,7 +232,7 @@ impl BTreeCursor { } } BTreeCell::IndexLeafCell(IndexLeafCell { payload, .. }) => { - mem_page.advance(); + self.advance(); let record = crate::storage::sqlite3_ondisk::read_record(payload)?; if predicate.is_none() { let rowid = match record.values.last() { @@ -270,66 +274,66 @@ impl BTreeCursor { CursorResult::IO => return Ok(CursorResult::IO), }; - let mem_page = self.get_mem_page(); - let page_idx = mem_page.page_idx; - let page = self.pager.read_page(page_idx)?; - let page = RefCell::borrow(&page); - if page.is_locked() { - return Ok(CursorResult::IO); - } + { + let page_rc = self.top_from_stack(); + let page = page_rc.borrow(); + if page.is_locked() { + return Ok(CursorResult::IO); + } - let page = page.contents.read().unwrap(); - let page = page.as_ref().unwrap(); + let contents = page.contents.read().unwrap(); + let contents = contents.as_ref().unwrap(); - for cell_idx in 0..page.cell_count() { - let cell = page.cell_get( - cell_idx, - self.pager.clone(), - self.max_local(page.page_type()), - self.min_local(page.page_type()), - self.usable_space(), - )?; - match &cell { - BTreeCell::TableLeafCell(TableLeafCell { - _rowid: cell_rowid, - _payload: payload, - first_overflow_page: _, - }) => { - let SeekKey::TableRowId(rowid_key) = key else { - unreachable!("table seek key should be a rowid"); - }; - mem_page.advance(); - let found = match op { - SeekOp::GT => *cell_rowid > rowid_key, - SeekOp::GE => *cell_rowid >= rowid_key, - SeekOp::EQ => *cell_rowid == rowid_key, - }; - if found { - let record = crate::storage::sqlite3_ondisk::read_record(payload)?; - return Ok(CursorResult::Ok((Some(*cell_rowid), Some(record)))); - } - } - BTreeCell::IndexLeafCell(IndexLeafCell { payload, .. }) => { - let SeekKey::IndexKey(index_key) = key else { - unreachable!("index seek key should be a record"); - }; - mem_page.advance(); - let record = crate::storage::sqlite3_ondisk::read_record(payload)?; - let found = match op { - SeekOp::GT => record > *index_key, - SeekOp::GE => record >= *index_key, - SeekOp::EQ => record == *index_key, - }; - if found { - let rowid = match record.values.last() { - Some(OwnedValue::Integer(rowid)) => *rowid as u64, - _ => unreachable!("index cells should have an integer rowid"), + for cell_idx in 0..contents.cell_count() { + let cell = contents.cell_get( + cell_idx, + self.pager.clone(), + self.max_local(contents.page_type()), + self.min_local(contents.page_type()), + self.usable_space(), + )?; + match &cell { + BTreeCell::TableLeafCell(TableLeafCell { + _rowid: cell_rowid, + _payload: payload, + first_overflow_page: _, + }) => { + let SeekKey::TableRowId(rowid_key) = key else { + unreachable!("table seek key should be a rowid"); }; - return Ok(CursorResult::Ok((Some(rowid), Some(record)))); + self.advance(); + let found = match op { + SeekOp::GT => *cell_rowid > rowid_key, + SeekOp::GE => *cell_rowid >= rowid_key, + SeekOp::EQ => *cell_rowid == rowid_key, + }; + if found { + let record = crate::storage::sqlite3_ondisk::read_record(payload)?; + return Ok(CursorResult::Ok((Some(*cell_rowid), Some(record)))); + } + } + BTreeCell::IndexLeafCell(IndexLeafCell { payload, .. }) => { + let SeekKey::IndexKey(index_key) = key else { + unreachable!("index seek key should be a record"); + }; + self.advance(); + let record = crate::storage::sqlite3_ondisk::read_record(payload)?; + let found = match op { + SeekOp::GT => record > *index_key, + SeekOp::GE => record >= *index_key, + SeekOp::EQ => record == *index_key, + }; + if found { + let rowid = match record.values.last() { + Some(OwnedValue::Integer(rowid)) => *rowid as u64, + _ => unreachable!("index cells should have an integer rowid"), + }; + return Ok(CursorResult::Ok((Some(rowid), Some(record)))); + } + } + cell_type => { + unreachable!("unexpected cell type: {:?}", cell_type); } - } - cell_type => { - unreachable!("unexpected cell type: {:?}", cell_type); } } } @@ -358,16 +362,69 @@ impl BTreeCursor { } fn move_to_root(&mut self) { - self.page - .replace(Some(Rc::new(MemPage::new(None, self.root_page, 0)))); + let mem_page = self.pager.read_page(self.root_page).unwrap(); + self.stack.borrow_mut()[0] = Some(mem_page); + self.cell_indices.borrow_mut()[0] = 0; + *self.current_page.borrow_mut() = 0; + } + + fn push_to_stack(&self, page: Rc>) { + debug!("push to stack {} {}", self.current_page.borrow(), page.borrow().id); + *self.current_page.borrow_mut() += 1; + let current = *self.current_page.borrow(); + self.stack.borrow_mut()[current as usize] = Some(page); + self.cell_indices.borrow_mut()[current as usize] = 0; + } + + fn pop_from_stack(&self) { + let current = *self.current_page.borrow(); + debug!("pop_from_stack(current={})", current); + self.cell_indices.borrow_mut()[current as usize] = 0; + self.stack.borrow_mut()[current as usize] = None; + *self.current_page.borrow_mut() -= 1; + } + + fn top_from_stack(&self) -> Rc> { + let current = *self.current_page.borrow(); + debug!("top_from_stack(current={})", current); + self.stack.borrow()[current as usize] + .as_ref() + .unwrap() + .clone() + } + + fn parent(&self) -> Rc> { + let current = *self.current_page.borrow(); + self.stack.borrow()[current as usize - 1] + .as_ref() + .unwrap() + .clone() + } + + fn current(&self) -> usize { + *self.current_page.borrow() as usize + } + + fn current_index(&self) -> usize { + let current = self.current(); + self.cell_indices.borrow()[current] + } + + fn advance(&self) { + let current = self.current(); + self.cell_indices.borrow_mut()[current] += 1; + } + + fn has_parent(&self) -> bool { + *self.current_page.borrow() > 0 } fn move_to_rightmost(&mut self) -> Result> { self.move_to_root(); loop { - let mem_page = self.page.borrow().as_ref().unwrap().clone(); - let page_idx = mem_page.page_idx; + let mem_page = self.top_from_stack(); + let page_idx = mem_page.borrow().id; let page = self.pager.read_page(page_idx)?; let page = RefCell::borrow(&page); if page.is_locked() { @@ -377,17 +434,18 @@ impl BTreeCursor { let page = page.as_ref().unwrap(); if page.is_leaf() { if page.cell_count() > 0 { - mem_page.cell_idx.replace(page.cell_count() - 1); + self.cell_indices.borrow_mut()[*self.current_page.borrow() as usize] = + page.cell_count() - 1; } return Ok(CursorResult::Ok(())); } match page.rightmost_pointer() { Some(right_most_pointer) => { - mem_page.cell_idx.replace(page.cell_count()); - let mem_page = - MemPage::new(Some(mem_page.clone()), right_most_pointer as usize, 0); - self.page.replace(Some(Rc::new(mem_page))); + self.cell_indices.borrow_mut()[*self.current_page.borrow() as usize] = + page.cell_count(); + let mem_page = self.pager.read_page(right_most_pointer as usize).unwrap(); + self.push_to_stack(mem_page); continue; } @@ -425,27 +483,25 @@ impl BTreeCursor { self.move_to_root(); loop { - let mem_page = self.get_mem_page(); - let page_idx = mem_page.page_idx; - let page = self.pager.read_page(page_idx)?; - let page = RefCell::borrow(&page); + let page_rc = self.top_from_stack(); + let page = RefCell::borrow(&page_rc); if page.is_locked() { return Ok(CursorResult::IO); } - let page = page.contents.read().unwrap(); - let page = page.as_ref().unwrap(); - if page.is_leaf() { + let contents = page.contents.read().unwrap(); + let contents = contents.as_ref().unwrap(); + if contents.is_leaf() { return Ok(CursorResult::Ok(())); } let mut found_cell = false; - for cell_idx in 0..page.cell_count() { - match &page.cell_get( + for cell_idx in 0..contents.cell_count() { + match &contents.cell_get( cell_idx, self.pager.clone(), - self.max_local(page.page_type()), - self.min_local(page.page_type()), + self.max_local(contents.page_type()), + self.min_local(contents.page_type()), self.usable_space(), )? { BTreeCell::TableInteriorCell(TableInteriorCell { @@ -455,16 +511,16 @@ impl BTreeCursor { let SeekKey::TableRowId(rowid_key) = key else { unreachable!("table seek key should be a rowid"); }; - mem_page.advance(); + self.advance(); let target_leaf_page_is_in_left_subtree = match cmp { SeekOp::GT => rowid_key < *_rowid, SeekOp::GE => rowid_key <= *_rowid, SeekOp::EQ => rowid_key <= *_rowid, }; if target_leaf_page_is_in_left_subtree { - let mem_page = - MemPage::new(Some(mem_page.clone()), *_left_child_page as usize, 0); - self.page.replace(Some(Rc::new(mem_page))); + let mem_page = self.pager.read_page(*_left_child_page as usize)?; + self.push_to_stack(mem_page); + found_cell = true; break; } @@ -493,13 +549,12 @@ impl BTreeCursor { SeekOp::EQ => index_key <= &record, }; if target_leaf_page_is_in_the_left_subtree { - let mem_page = - MemPage::new(Some(mem_page.clone()), *left_child_page as usize, 0); - self.page.replace(Some(Rc::new(mem_page))); + let mem_page = self.pager.read_page(*left_child_page as usize).unwrap(); + self.push_to_stack(mem_page); found_cell = true; break; } else { - mem_page.advance(); + self.advance(); } } BTreeCell::IndexLeafCell(_) => { @@ -511,11 +566,10 @@ impl BTreeCursor { } if !found_cell { - let parent = mem_page.parent.clone(); - match page.rightmost_pointer() { + match contents.rightmost_pointer() { Some(right_most_pointer) => { - let mem_page = MemPage::new(parent, right_most_pointer as usize, 0); - self.page.replace(Some(Rc::new(mem_page))); + let mem_page = self.pager.read_page(right_most_pointer as usize).unwrap(); + self.push_to_stack(mem_page); continue; } None => { @@ -526,7 +580,7 @@ impl BTreeCursor { } } - fn insert_to_page( + fn insert_into_page( &mut self, key: &OwnedValue, record: &OwnedRecord, @@ -535,7 +589,7 @@ impl BTreeCursor { let state = &self.write_info.state; match state { WriteState::Start => { - let page_ref = self.get_current_page()?; + let page_ref = self.top_from_stack(); let int_key = match key { OwnedValue::Integer(i) => *i as u64, _ => unreachable!("btree tables are indexed by integers!"), @@ -577,10 +631,6 @@ impl BTreeCursor { }; if overflow > 0 { self.write_info.state = WriteState::BalanceStart; - self.write_info.current_page.borrow_mut().replace(( - self.page.borrow().as_ref().unwrap().clone(), - page_ref.clone(), - )); } else { self.write_info.state = WriteState::Finish; } @@ -711,17 +761,6 @@ impl BTreeCursor { page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1); } - fn get_current_page(&mut self) -> crate::Result>> { - let mem_page = { - let mem_page = self.page.borrow(); - let mem_page = mem_page.as_ref().unwrap(); - mem_page.clone() - }; - let page_idx = mem_page.page_idx; - let page_ref = self.pager.read_page(page_idx)?; - Ok(page_ref) - } - /// This is a naive algorithm that doesn't try to distribute cells evenly by content. /// It will try to split the page in half by keys not by content. /// Sqlite tries to have a page at least 40% full. @@ -729,12 +768,16 @@ impl BTreeCursor { let state = &self.write_info.state; match state { WriteState::BalanceStart => { - let current_page = self.write_info.current_page.borrow(); - let mem_page = ¤t_page.as_ref().unwrap().0; + // drop divider cells and find right pointer + // NOTE: since we are doing a simple split we only finding the pointer we want to update (right pointer). + // Right pointer means cell that points to the last page, as we don't really want to drop this one. This one + // can be a "rightmost pointer" or a "cell". + // TODO(pere): simplify locking... + // we always asumme there is a parent + let current_page = self.top_from_stack(); + let page_rc = RefCell::borrow(¤t_page); { // check if we don't need to balance - let page_ref = ¤t_page.as_ref().unwrap().1; - let page_rc = RefCell::borrow(&page_ref); { // don't continue if there are no overflow cells @@ -747,15 +790,13 @@ impl BTreeCursor { } } - if mem_page.parent.is_none() { + if !self.has_parent() { + drop(page_rc); drop(current_page); self.balance_root(); return Ok(CursorResult::Ok(())); } - debug!("Balancing leaf. leaf={}", mem_page.page_idx); - - let page_ref = ¤t_page.as_ref().unwrap().1; - let page_rc = RefCell::borrow(&page_ref); + debug!("Balancing leaf. leaf={}", page_rc.id); // Copy of page used to reference cell bytes. let page_copy = { @@ -808,59 +849,43 @@ impl BTreeCursor { self.write_info .new_pages .borrow_mut() - .push((mem_page.clone(), page_ref.clone())); - self.write_info.new_pages.borrow_mut().push(( - Rc::new(MemPage::new(mem_page.parent.clone(), right_page_id, 0)), - right_page_ref.clone(), - )); + .push(current_page.clone()); + self.write_info + .new_pages + .borrow_mut() + .push(right_page_ref.clone()); - let new_pages_ids = [mem_page.page_idx, right_page_id]; debug!( "splitting left={} right={}", - new_pages_ids[0], new_pages_ids[1] + *self.current_page.borrow(), + right_page_id ); - // drop divider cells and find right pointer - // NOTE: since we are doing a simple split we only finding the pointer we want to update (right pointer). - // Right pointer means cell that points to the last page, as we don't really want to drop this one. This one - // can be a "rightmost pointer" or a "cell". - // TODO(pere): simplify locking... - // we always asumme there is a parent self.write_info.state = WriteState::BalanceGetParentPage; return Ok(CursorResult::Ok(())); } WriteState::BalanceGetParentPage => { - let current_page = self.write_info.current_page.borrow(); - let mem_page = ¤t_page.as_ref().unwrap().0; - let parent_rc = mem_page.parent.as_ref().unwrap(); - let parent_ref = self.pager.read_page(parent_rc.page_idx)?; - if !RefCell::borrow(&parent_ref).is_locked() { + let parent_rc = self.parent(); + if !&parent_rc.borrow().is_locked() { + parent_rc.borrow_mut().set_dirty(); self.write_info.state = WriteState::BalanceMoveUp; - self.write_info - .parent_page - .borrow_mut() - .replace((parent_rc.clone(), parent_ref.clone())); Ok(CursorResult::Ok(())) } else { + // TODO(pere): maybe request load, it might be that parent was already + // requested Ok(CursorResult::IO) } } WriteState::BalanceMoveUp => { - let parent = self.write_info.parent_page.borrow(); - let parent_entry = parent.as_ref().unwrap(); - let parent_ref = &parent_entry.1; + let parent_ref = self.parent(); let parent = RefCell::borrow_mut(&parent_ref); let (page_type, current_idx) = { - let current_page = self.write_info.current_page.borrow(); - let pagerc = current_page.as_ref().unwrap(); - let page = RefCell::borrow(&pagerc.1); - let page = page.contents.read().unwrap(); - ( - page.as_ref().unwrap().page_type().clone(), - pagerc.0.page_idx, - ) + let current_page = self.top_from_stack(); + let page_ref = current_page.borrow(); + let page = page_ref.contents.read().unwrap(); + (page.as_ref().unwrap().page_type().clone(), page_ref.id) }; parent.set_dirty(); @@ -904,7 +929,7 @@ impl BTreeCursor { let scratch_cells = self.write_info.scratch_cells.borrow(); // reset pages - for (_, page) in new_pages.iter() { + for page in new_pages.iter() { let page = page.borrow_mut(); let mut page = page.contents.write().unwrap(); let page = page.as_mut().unwrap(); @@ -927,7 +952,7 @@ impl BTreeCursor { let mut current_cell_index = 0_usize; let mut divider_cells_index = Vec::new(); /* index to scratch cells that will be used as dividers in order */ - for (i, (_, page)) in new_pages.iter_mut().enumerate() { + for (i, page) in new_pages.iter_mut().enumerate() { let page = page.borrow_mut(); let mut page = page.contents.write().unwrap(); let page = page.as_mut().unwrap(); @@ -949,15 +974,15 @@ impl BTreeCursor { current_cell_index += cells_to_copy; } let is_leaf = { - let page = self.write_info.current_page.borrow(); - let page = RefCell::borrow(&page.as_ref().unwrap().1); + let page = self.top_from_stack(); + let page = page.borrow(); let page = page.contents.read().unwrap(); page.as_ref().unwrap().is_leaf() }; // update rightmost pointer for each page if we are in interior page if !is_leaf { - for (_, page) in new_pages.iter_mut().take(new_pages_len - 1) { + for page in new_pages.iter_mut().take(new_pages_len - 1) { let page = page.borrow_mut(); let mut page = page.contents.write().unwrap(); let page = page.as_mut().unwrap(); @@ -981,7 +1006,6 @@ impl BTreeCursor { } // last page right most pointer points to previous right most pointer before splitting let last_page = new_pages.last().unwrap(); - let last_page = &last_page.1; let last_page = RefCell::borrow(&last_page); let mut last_page = last_page.contents.write().unwrap(); let last_page = last_page.as_mut().unwrap(); @@ -993,22 +1017,22 @@ impl BTreeCursor { // insert dividers in parent // we can consider dividers the first cell of each page starting from the second page - for (page_id_index, (mem_page, page)) in + for (page_id_index, page) in new_pages.iter_mut().take(new_pages_len - 1).enumerate() { let page = page.borrow_mut(); - let mut page = page.contents.write().unwrap(); - let page = page.as_mut().unwrap(); - assert!(page.cell_count() > 1); + let mut contents = page.contents.write().unwrap(); + let contents = contents.as_mut().unwrap(); + assert!(contents.cell_count() > 1); let divider_cell_index = divider_cells_index[page_id_index]; let cell_payload = scratch_cells[divider_cell_index]; let cell = read_btree_cell( cell_payload, - &page.page_type(), + &contents.page_type(), 0, self.pager.clone(), - self.max_local(page.page_type()), - self.min_local(page.page_type()), + self.max_local(contents.page_type()), + self.min_local(contents.page_type()), self.usable_space(), ) .unwrap(); @@ -1020,7 +1044,7 @@ impl BTreeCursor { _ => unreachable!(), }; let mut divider_cell = Vec::new(); - divider_cell.extend_from_slice(&(mem_page.page_idx as u32).to_be_bytes()); + divider_cell.extend_from_slice(&(page.id as u32).to_be_bytes()); divider_cell.extend(std::iter::repeat(0).take(9)); let n = write_varint(&mut divider_cell.as_mut_slice()[4..], key); divider_cell.truncate(4 + n); @@ -1036,7 +1060,7 @@ impl BTreeCursor { BTreeCell::TableInteriorCell(interior) => interior._rowid, _ => unreachable!(), }; - let parent_cell_idx = self.find_cell(page, key); + let parent_cell_idx = self.find_cell(contents, key); self.insert_into_cell(parent_contents, cell_payload, parent_cell_idx); // self.drop_cell(*page, 0); } @@ -1044,15 +1068,12 @@ impl BTreeCursor { { // copy last page id to right pointer - let last_pointer = new_pages.last().unwrap().0.page_idx as u32; + let last_pointer = new_pages.last().unwrap().borrow().id as u32; parent_contents.write_u32(right_pointer, last_pointer); } - self.page = RefCell::new(Some(parent_entry.0.clone())); - self.write_info - .current_page - .replace(Some(parent_entry.clone())); + self.pop_from_stack(); self.write_info.state = WriteState::BalanceStart; - self.write_info.page_copy.replace(None); + let _ = self.write_info.page_copy.take(); Ok(CursorResult::Ok(())) } @@ -1079,11 +1100,10 @@ impl BTreeCursor { /* swap splitted page buffer with new root buffer so we don't have to update page idx */ { let (root_id, child_id, child) = { - let page = self.write_info.current_page.borrow(); - let page_ref = &page.as_ref().unwrap().1; + let page_ref = self.top_from_stack(); let child = page_ref.clone(); - let mut page_rc = RefCell::borrow_mut(page_ref); - let mut new_root_page = RefCell::borrow_mut(&new_root_page_ref); + let mut page_rc = page_ref.borrow_mut(); + let mut new_root_page = new_root_page_ref.borrow_mut(); // Swap the entire Page structs std::mem::swap(&mut page_rc.id, &mut new_root_page.id); @@ -1093,19 +1113,17 @@ impl BTreeCursor { (new_root_page.id, page_rc.id, child) }; + debug!("Balancing root. root={}, rightmost={}", root_id, child_id); let root = new_root_page_ref.clone(); - let parent = Some(Rc::new(MemPage::new(None, root_id, 0))); - let current_mem_page = Rc::new(MemPage::new(parent, child_id, 0)); - self.page = RefCell::new(Some(current_mem_page.clone())); + self.root_page = root_id; + *self.current_page.borrow_mut() = -1; + self.push_to_stack(root.clone()); + self.push_to_stack(child.clone()); - self.write_info - .current_page - .replace(Some((current_mem_page, child.clone()))); - - debug!("Balancing root. root={}, rightmost={}", root_id, child_id); self.pager.put_page(root_id, root); self.pager.put_page(child_id, child); + } } @@ -1115,6 +1133,7 @@ impl BTreeCursor { { // setup btree page let contents = RefCell::borrow(&page); + debug!("allocating page {}", contents.id); let mut contents = contents.contents.write().unwrap(); let contents = contents.as_mut().unwrap(); let id = page_type as u8; @@ -1336,12 +1355,6 @@ impl BTreeCursor { nfree as u16 } - fn get_mem_page(&self) -> Rc { - let mem_page = self.page.borrow(); - let mem_page = mem_page.as_ref().unwrap(); - mem_page.clone() - } - fn fill_cell_payload( &self, page_type: PageType, @@ -1534,8 +1547,8 @@ impl Cursor for BTreeCursor { } fn rewind(&mut self) -> Result> { - let mem_page = MemPage::new(None, self.root_page, 0); - self.page.replace(Some(Rc::new(mem_page))); + self.move_to_root(); + match self.get_next_record(None)? { CursorResult::Ok((rowid, next)) => { self.rowid.replace(rowid); @@ -1598,7 +1611,7 @@ impl Cursor for BTreeCursor { }; } - match self.insert_to_page(key, _record)? { + match self.insert_into_page(key, _record)? { CursorResult::Ok(_) => Ok(CursorResult::Ok(())), CursorResult::IO => Ok(CursorResult::IO), } @@ -1621,9 +1634,10 @@ impl Cursor for BTreeCursor { CursorResult::Ok(_) => {} CursorResult::IO => return Ok(CursorResult::IO), }; - let page_ref = self.get_current_page()?; + let page_ref = self.top_from_stack(); let page = RefCell::borrow(&page_ref); if page.is_locked() { + // TODO(pere); request load return Ok(CursorResult::IO); } diff --git a/core/storage/pager.rs b/core/storage/pager.rs index bbc902ea1..5d4b90004 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -30,12 +30,6 @@ const PAGE_ERROR: usize = 0b100; /// Page is dirty. Flush needed. const PAGE_DIRTY: usize = 0b1000; -impl Default for Page { - fn default() -> Self { - Self::new(0) - } -} - impl Page { pub fn new(id: usize) -> Page { Page { @@ -167,7 +161,7 @@ impl DumbLruPageCache { ptr?; let ptr = unsafe { ptr.unwrap().as_mut() }; let page = ptr.page.clone(); - self.detach(ptr); + //self.detach(ptr); self.touch(ptr); Some(page) } @@ -180,6 +174,13 @@ impl DumbLruPageCache { fn detach(&mut self, entry: &mut PageCacheEntry) { let mut current = entry.as_non_null(); + { + // evict buffer + let page = entry.page.borrow_mut(); + let mut contents = page.contents.write().unwrap(); + let _ = contents.as_mut().take(); + } + let (next, prev) = unsafe { let c = current.as_mut(); let next = c.next; diff --git a/test/src/lib.rs b/test/src/lib.rs index 7e8b31443..b1393102a 100644 --- a/test/src/lib.rs +++ b/test/src/lib.rs @@ -51,6 +51,7 @@ mod tests { let list_query = "SELECT * FROM test"; let max_iterations = 10000; for i in 0..max_iterations { + debug!("inserting {} ", i); if (i % 100) == 0 { let progress = (i as f64 / max_iterations as f64) * 100.0; println!("progress {:.1}%", progress); From 807496a1469343340f6a2de5f91e686afe064493 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 12 Nov 2024 14:44:25 +0100 Subject: [PATCH 02/22] core: load pages if not loaded Since pages are now tracked with a single centralized structure, it is possible for a page to have been unloaded a be kept in memory for metadata purposes like going back in the stack. Using the example of going to a parent page which was unloaded for whatever reason: in this case we need to check if it's loaded, if not, we load it. Locked still means the same thing, loaded just means the contents of a page are present in memory and if they are present, they must be in cache. --- core/storage/btree.rs | 63 ++++++++++++++++-------- core/storage/pager.rs | 90 +++++++++++++++++++++++++++++++--- core/storage/sqlite3_ondisk.rs | 1 + core/storage/wal.rs | 7 ++- 4 files changed, 130 insertions(+), 31 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 79d198968..2c76667c7 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -41,7 +41,7 @@ struct WriteInfo { page_copy: RefCell>, // this holds the copy a of a page needed for buffer references } -/* TODO(Pere) +/* ** Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than ** this will be declared corrupt. This value is calculated based on a ** maximum database size of 2^31 pages a minimum fanout of 2 for a @@ -64,8 +64,7 @@ pub struct BTreeCursor { current_page: RefCell, cell_indices: RefCell<[usize; BTCURSOR_MAX_DEPTH + 1]>, - stack: RefCell<[Option>>; BTCURSOR_MAX_DEPTH + 1]>, // TODO(pere) stack of cell idx - // TODO(pere) stack of pages + stack: RefCell<[Option>>; BTCURSOR_MAX_DEPTH + 1]>, } impl BTreeCursor { @@ -115,12 +114,16 @@ impl BTreeCursor { let mem_page_rc = self.top_from_stack(); let cell_idx = self.current_index(); - let mem_page = RefCell::borrow(&mem_page_rc); - debug!("current id={} cell={}", mem_page.id, cell_idx); - if mem_page.is_locked() { - // TODO(pere): request load page + debug!("current id={} cell={}", mem_page_rc.borrow().id, cell_idx); + if mem_page_rc.borrow().is_locked() { return Ok(CursorResult::IO); } + if !mem_page_rc.borrow().is_loaded() { + self.pager.load_page(mem_page_rc.clone())?; + return Ok(CursorResult::IO); + } + let mem_page = mem_page_rc.borrow(); + let page = mem_page.contents.read().unwrap(); let page = page.as_ref().unwrap(); @@ -369,9 +372,17 @@ impl BTreeCursor { } fn push_to_stack(&self, page: Rc>) { - debug!("push to stack {} {}", self.current_page.borrow(), page.borrow().id); + debug!( + "push to stack {} {}", + self.current_page.borrow(), + page.borrow().id + ); *self.current_page.borrow_mut() += 1; let current = *self.current_page.borrow(); + assert!( + current < BTCURSOR_MAX_DEPTH as i32, + "corrupted database, stack is bigger than expected" + ); self.stack.borrow_mut()[current as usize] = Some(page); self.cell_indices.borrow_mut()[current as usize] = 0; } @@ -386,11 +397,16 @@ impl BTreeCursor { fn top_from_stack(&self) -> Rc> { let current = *self.current_page.borrow(); - debug!("top_from_stack(current={})", current); - self.stack.borrow()[current as usize] + let page = self.stack.borrow()[current as usize] .as_ref() .unwrap() - .clone() + .clone(); + debug!( + "top_from_stack(current={}, page_id={})", + current, + page.borrow().id + ); + page } fn parent(&self) -> Rc> { @@ -862,19 +878,24 @@ impl BTreeCursor { ); self.write_info.state = WriteState::BalanceGetParentPage; - return Ok(CursorResult::Ok(())); + Ok(CursorResult::Ok(())) } WriteState::BalanceGetParentPage => { - let parent_rc = self.parent(); - if !&parent_rc.borrow().is_locked() { + let loaded = parent_rc.borrow().is_loaded(); + let locked = parent_rc.borrow().is_locked(); + + if locked { + Ok(CursorResult::IO) + } else { + if !loaded { + debug!("balance_leaf(loading page {} {})", locked, loaded); + self.pager.load_page(parent_rc.clone())?; + return Ok(CursorResult::IO); + } parent_rc.borrow_mut().set_dirty(); self.write_info.state = WriteState::BalanceMoveUp; Ok(CursorResult::Ok(())) - } else { - // TODO(pere): maybe request load, it might be that parent was already - // requested - Ok(CursorResult::IO) } } WriteState::BalanceMoveUp => { @@ -931,6 +952,7 @@ impl BTreeCursor { // reset pages for page in new_pages.iter() { let page = page.borrow_mut(); + assert!(page.is_dirty()); let mut page = page.contents.write().unwrap(); let page = page.as_mut().unwrap(); @@ -1121,9 +1143,8 @@ impl BTreeCursor { self.push_to_stack(root.clone()); self.push_to_stack(child.clone()); - self.pager.put_page(root_id, root); - self.pager.put_page(child_id, child); - + self.pager.put_loaded_page(root_id, root); + self.pager.put_loaded_page(child_id, child); } } diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 5d4b90004..501d3ef35 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -3,7 +3,7 @@ use crate::storage::database::DatabaseStorage; use crate::storage::sqlite3_ondisk::{self, DatabaseHeader, PageContent}; use crate::storage::wal::Wal; use crate::{Buffer, Result}; -use log::trace; +use log::{debug, trace}; use sieve_cache::SieveCache; use std::cell::RefCell; use std::collections::{HashMap, HashSet}; @@ -29,6 +29,8 @@ const PAGE_LOCKED: usize = 0b010; const PAGE_ERROR: usize = 0b100; /// Page is dirty. Flush needed. const PAGE_DIRTY: usize = 0b1000; +/// Page's contents are loaded in memory. +const PAGE_LOADED: usize = 0b10000; impl Page { pub fn new(id: usize) -> Page { @@ -86,6 +88,19 @@ impl Page { pub fn clear_dirty(&self) { self.flags.fetch_and(!PAGE_DIRTY, Ordering::SeqCst); } + + pub fn is_loaded(&self) -> bool { + self.flags.load(Ordering::SeqCst) & PAGE_LOADED != 0 + } + + pub fn set_loaded(&self) { + self.flags.fetch_or(PAGE_LOADED, Ordering::SeqCst); + } + + pub fn clear_loaded(&self) { + log::debug!("clear loaded {}", self.id); + self.flags.fetch_and(!PAGE_LOADED, Ordering::SeqCst); + } } #[allow(dead_code)] @@ -119,8 +134,13 @@ impl DumbLruPageCache { } } + pub fn contains_key(&mut self, key: usize) -> bool { + self.map.borrow().contains_key(&key) + } + pub fn insert(&mut self, key: usize, value: Rc>) { - self.delete(key); + debug!("cache_insert(key={})", key); + self._delete(key, false); let mut entry = Box::new(PageCacheEntry { key, next: None, @@ -138,6 +158,11 @@ impl DumbLruPageCache { } pub fn delete(&mut self, key: usize) { + self._delete(key, true) + } + + pub fn _delete(&mut self, key: usize, clean_page: bool) { + debug!("cache_delete(key={}, clean={})", key, clean_page); let ptr = self.map.borrow_mut().remove(&key); if ptr.is_none() { return; @@ -145,7 +170,7 @@ impl DumbLruPageCache { let mut ptr = ptr.unwrap(); { let ptr = unsafe { ptr.as_mut() }; - self.detach(ptr); + self.detach(ptr, clean_page); } unsafe { drop_in_place(ptr.as_ptr()) }; } @@ -157,6 +182,7 @@ impl DumbLruPageCache { } pub fn get(&mut self, key: &usize) -> Option>> { + debug!("cache_get(key={})", key); let ptr = self.get_ptr(*key); ptr?; let ptr = unsafe { ptr.unwrap().as_mut() }; @@ -171,12 +197,14 @@ impl DumbLruPageCache { todo!(); } - fn detach(&mut self, entry: &mut PageCacheEntry) { + fn detach(&mut self, entry: &mut PageCacheEntry, clean_page: bool) { let mut current = entry.as_non_null(); - { + if clean_page { // evict buffer let page = entry.page.borrow_mut(); + page.clear_loaded(); + debug!("cleaning up page {}", page.id); let mut contents = page.contents.write().unwrap(); let _ = contents.as_mut().take(); } @@ -231,7 +259,7 @@ impl DumbLruPageCache { // TODO: drop from another clean entry? return; } - self.detach(tail); + self.detach(tail, true); } fn clear(&mut self) { @@ -358,6 +386,7 @@ impl Pager { trace!("read_page(page_idx = {})", page_idx); let mut page_cache = self.page_cache.borrow_mut(); if let Some(page) = page_cache.get(&page_idx) { + trace!("read_page(page_idx = {}) = cached", page_idx); return Ok(page.clone()); } let page = Rc::new(RefCell::new(Page::new(page_idx))); @@ -370,6 +399,8 @@ impl Pager { let page = page.borrow_mut(); page.set_uptodate(); } + // TODO(pere) ensure page is inserted, we should probably first insert to page cache + // and if successful, read frame or page page_cache.insert(page_idx, page.clone()); return Ok(page); } @@ -379,10 +410,44 @@ impl Pager { page.clone(), page_idx, )?; + // TODO(pere) ensure page is inserted page_cache.insert(page_idx, page.clone()); Ok(page) } + /// Loads pages if not loaded + pub fn load_page(&self, page: Rc>) -> Result<()> { + let id = page.borrow().id; + trace!("load_page(page_idx = {})", id); + let mut page_cache = self.page_cache.borrow_mut(); + page.borrow_mut().set_locked(); + if let Some(frame_id) = self.wal.borrow().find_frame(id as u64)? { + self.wal + .borrow() + .read_frame(frame_id, page.clone(), self.buffer_pool.clone())?; + { + let page = page.borrow_mut(); + page.set_uptodate(); + } + // TODO(pere) ensure page is inserted + if !page_cache.contains_key(id) { + page_cache.insert(id, page.clone()); + } + return Ok(()); + } + sqlite3_ondisk::begin_read_page( + self.page_io.clone(), + self.buffer_pool.clone(), + page.clone(), + id, + )?; + // TODO(pere) ensure page is inserted + if !page_cache.contains_key(id) { + page_cache.insert(id, page.clone()); + } + Ok(()) + } + /// Writes the database header. pub fn write_database_header(&self, header: &DatabaseHeader) { sqlite3_ondisk::begin_write_database_header(header, self).expect("failed to write header"); @@ -406,8 +471,15 @@ impl Pager { FlushState::Start => { let db_size = self.db_header.borrow().database_size; for page_id in self.dirty_pages.borrow().iter() { + debug!("appending frame {}", page_id); let mut cache = self.page_cache.borrow_mut(); let page = cache.get(page_id).expect("we somehow added a page to dirty list but we didn't mark it as dirty, causing cache to drop it."); + { + let contents = page.borrow(); + let contents = contents.contents.read().unwrap(); + let contents = contents.as_ref().unwrap(); + debug!("appending frame {} {:?}", page_id, contents.page_type()); + } self.wal.borrow_mut().append_frame( page.clone(), db_size, @@ -540,9 +612,11 @@ impl Pager { Ok(page_ref) } - pub fn put_page(&self, id: usize, page: Rc>) { + pub fn put_loaded_page(&self, id: usize, page: Rc>) { let mut cache = RefCell::borrow_mut(&self.page_cache); - cache.insert(id, page); + // cache insert invalidates previous page + cache.insert(id, page.clone()); + page.borrow_mut().set_loaded(); } pub fn usable_size(&self) -> usize { diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index 6c0b4b376..56de8cfa5 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -525,6 +525,7 @@ fn finish_read_page( page.contents.write().unwrap().replace(inner); page.set_uptodate(); page.clear_locked(); + page.set_loaded(); } Ok(()) } diff --git a/core/storage/wal.rs b/core/storage/wal.rs index 8bc55f910..a5b98203c 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1,6 +1,8 @@ use std::collections::{HashMap, HashSet}; use std::{cell::RefCell, rc::Rc, sync::Arc}; +use log::debug; + use crate::io::{File, SyncCompletion, IO}; use crate::storage::sqlite3_ondisk::{ begin_read_wal_frame, begin_write_wal_frame, WAL_FRAME_HEADER_SIZE, WAL_HEADER_SIZE, @@ -115,10 +117,11 @@ impl Wal for WalFile { page: Rc>, buffer_pool: Rc, ) -> Result<()> { + debug!("read_frame({})", frame_id); let offset = self.frame_offset(frame_id); begin_read_wal_frame( self.file.borrow().as_ref().unwrap(), - offset, + offset + WAL_FRAME_HEADER_SIZE, buffer_pool, page, )?; @@ -285,7 +288,7 @@ impl WalFile { let header = header.as_ref().unwrap().borrow(); let page_size = header.page_size; let page_offset = frame_id * (page_size as u64 + WAL_FRAME_HEADER_SIZE as u64); - let offset = WAL_HEADER_SIZE as u64 + WAL_FRAME_HEADER_SIZE as u64 + page_offset; + let offset = WAL_HEADER_SIZE as u64 + page_offset; offset as usize } } From 48f0e72e14e5dffcda327edd034e0e3eeaf4cdea Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 12 Nov 2024 17:03:30 +0100 Subject: [PATCH 03/22] checkpoint on drop connection --- core/lib.rs | 21 +++++++++++ core/storage/pager.rs | 47 ++++++++++++++++++++--- core/storage/sqlite3_ondisk.rs | 15 ++++---- core/storage/wal.rs | 8 +++- core/vdbe/mod.rs | 1 + test/src/lib.rs | 68 ++++++++++++++++++++++++++++++++++ 6 files changed, 147 insertions(+), 13 deletions(-) diff --git a/core/lib.rs b/core/lib.rs index 36f5739d0..88609e055 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -283,6 +283,27 @@ impl Connection { self.pager.clear_page_cache(); Ok(()) } + + pub fn checkpoint(&self) -> Result<()> { + self.pager.clear_page_cache(); + Ok(()) + } +} + +impl Drop for Connection { + fn drop(&mut self) { + loop { + // TODO: make this async? + match self.pager.checkpoint().unwrap() { + CheckpointStatus::Done => { + return; + } + CheckpointStatus::IO => { + self.pager.io.run_once().unwrap(); + } + }; + } + } } pub struct Statement { diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 501d3ef35..a0f29233e 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -304,6 +304,12 @@ enum FlushState { WaitSyncDbFile, } +#[derive(Clone)] +enum CheckpointState { + Checkpoint, + CheckpointDone, +} + /// This will keep track of the state of current cache flush in order to not repeat work struct FlushInfo { state: FlushState, @@ -329,6 +335,7 @@ pub struct Pager { db_header: Rc>, flush_info: RefCell, + checkpoint_state: RefCell, syncing: Rc>, } @@ -362,6 +369,7 @@ impl Pager { in_flight_writes: Rc::new(RefCell::new(0)), }), syncing: Rc::new(RefCell::new(false)), + checkpoint_state: RefCell::new(CheckpointState::Checkpoint), }) } @@ -376,7 +384,10 @@ impl Pager { } pub fn end_tx(&self) -> Result { - self.cacheflush()?; + match self.cacheflush()? { + CheckpointStatus::Done => {} + CheckpointStatus::IO => return Ok(CheckpointStatus::IO), + }; self.wal.borrow().end_read_tx()?; Ok(CheckpointStatus::Done) } @@ -492,12 +503,11 @@ impl Pager { return Ok(CheckpointStatus::IO); } FlushState::Checkpoint => { - let in_flight = self.flush_info.borrow().in_flight_writes.clone(); - match self.wal.borrow_mut().checkpoint(self, in_flight)? { - CheckpointStatus::IO => return Ok(CheckpointStatus::IO), + match self.checkpoint()? { CheckpointStatus::Done => { - self.flush_info.borrow_mut().state = FlushState::CheckpointDone; + self.flush_info.borrow_mut().state = FlushState::SyncDbFile; } + CheckpointStatus::IO => return Ok(CheckpointStatus::IO), }; } FlushState::CheckpointDone => { @@ -540,6 +550,33 @@ impl Pager { Ok(CheckpointStatus::Done) } + pub fn checkpoint(&self) -> Result { + loop { + let state = self.checkpoint_state.borrow().clone(); + match state { + CheckpointState::Checkpoint => { + let in_flight = self.flush_info.borrow().in_flight_writes.clone(); + match self.wal.borrow_mut().checkpoint(self, in_flight)? { + CheckpointStatus::IO => return Ok(CheckpointStatus::IO), + CheckpointStatus::Done => { + self.checkpoint_state + .replace(CheckpointState::CheckpointDone); + } + }; + } + CheckpointState::CheckpointDone => { + let in_flight = self.flush_info.borrow().in_flight_writes.clone(); + if *in_flight.borrow() > 0 { + return Ok(CheckpointStatus::IO); + } else { + self.checkpoint_state.replace(CheckpointState::Checkpoint); + return Ok(CheckpointStatus::Done); + } + } + } + } + } + // WARN: used for testing purposes pub fn clear_page_cache(&self) { loop { diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index 56de8cfa5..ce2136a01 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -48,7 +48,7 @@ use crate::storage::database::DatabaseStorage; use crate::storage::pager::{Page, Pager}; use crate::types::{OwnedRecord, OwnedValue}; use crate::{File, Result}; -use log::trace; +use log::{debug, trace}; use std::cell::RefCell; use std::pin::Pin; use std::rc::Rc; @@ -1018,6 +1018,7 @@ pub fn begin_write_wal_frame( ) -> Result<()> { let page_finish = page.clone(); let page_id = page.borrow().id; + trace!("begin_write_wal_frame(offset={}, page={})", offset, page_id); let header = WalFrameHeader { page_number: page_id as u32, @@ -1039,12 +1040,12 @@ pub fn begin_write_wal_frame( ); let buf = buffer.as_mut_slice(); - buf[0..4].copy_from_slice(&header.page_number.to_ne_bytes()); - buf[4..8].copy_from_slice(&header.db_size.to_ne_bytes()); - buf[8..12].copy_from_slice(&header.salt_1.to_ne_bytes()); - buf[12..16].copy_from_slice(&header.salt_2.to_ne_bytes()); - buf[16..20].copy_from_slice(&header.checksum_1.to_ne_bytes()); - buf[20..24].copy_from_slice(&header.checksum_2.to_ne_bytes()); + buf[0..4].copy_from_slice(&header.page_number.to_be_bytes()); + buf[4..8].copy_from_slice(&header.db_size.to_be_bytes()); + buf[8..12].copy_from_slice(&header.salt_1.to_be_bytes()); + buf[12..16].copy_from_slice(&header.salt_2.to_be_bytes()); + buf[16..20].copy_from_slice(&header.checksum_1.to_be_bytes()); + buf[20..24].copy_from_slice(&header.checksum_2.to_be_bytes()); buf[WAL_FRAME_HEADER_SIZE..].copy_from_slice(&contents.as_ptr()); Rc::new(RefCell::new(buffer)) diff --git a/core/storage/wal.rs b/core/storage/wal.rs index a5b98203c..c670786d9 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -1,7 +1,7 @@ use std::collections::{HashMap, HashSet}; use std::{cell::RefCell, rc::Rc, sync::Arc}; -use log::debug; +use log::{debug, trace}; use crate::io::{File, SyncCompletion, IO}; use crate::storage::sqlite3_ondisk::{ @@ -140,6 +140,12 @@ impl Wal for WalFile { let page_id = page.borrow().id; let frame_id = *self.max_frame.borrow(); let offset = self.frame_offset(frame_id); + trace!( + "append_frame(frame={}, offset={}, page_id={})", + frame_id, + offset, + page_id + ); begin_write_wal_frame( self.file.borrow().as_ref().unwrap(), offset, diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 0c292ff85..609b8bfb3 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -1098,6 +1098,7 @@ impl Program { ))); } } + log::trace!("Halt auto_commit {}", self.auto_commit); if self.auto_commit { return match pager.end_tx() { Ok(crate::storage::wal::CheckpointStatus::IO) => Ok(StepResult::IO), diff --git a/test/src/lib.rs b/test/src/lib.rs index b1393102a..0e9c2cbdd 100644 --- a/test/src/lib.rs +++ b/test/src/lib.rs @@ -306,6 +306,74 @@ mod tests { Ok(()) } + #[test] + fn test_wal_restart() -> anyhow::Result<()> { + let _ = env_logger::try_init(); + let tmp_db = TempDatabase::new("CREATE TABLE test (x INTEGER PRIMARY KEY);"); + // threshold is 1000 by default + + fn insert(i: usize, conn: &Rc, tmp_db: &TempDatabase) -> anyhow::Result<()> { + let insert_query = format!("INSERT INTO test VALUES ({})", i); + match conn.query(insert_query) { + Ok(Some(ref mut rows)) => loop { + match rows.next_row()? { + RowResult::IO => { + tmp_db.io.run_once()?; + } + RowResult::Done => break, + _ => unreachable!(), + } + }, + Ok(None) => {} + Err(err) => { + eprintln!("{}", err); + } + }; + tmp_db.io.run_once()?; + Ok(()) + } + + fn count(conn: &Rc, tmp_db: &TempDatabase) -> anyhow::Result { + let list_query = "SELECT count(x) FROM test"; + loop { + match conn.query(list_query).unwrap() { + Some(ref mut rows) => loop { + match rows.next_row()? { + RowResult::Row(row) => { + let first_value = &row.values[0]; + let count = match first_value { + Value::Integer(i) => *i as i32, + _ => unreachable!(), + }; + return Ok(count as usize); + } + RowResult::IO => { + tmp_db.io.run_once()?; + } + RowResult::Done => break, + } + }, + None => {} + } + } + } + + { + let conn = tmp_db.connect_limbo(); + insert(1, &conn, &tmp_db).unwrap(); + assert_eq!(count(&conn, &tmp_db).unwrap(), 1); + } + { + let conn = tmp_db.connect_limbo(); + assert_eq!( + count(&conn, &tmp_db).unwrap(), + 1, + "failed to read from wal from another connection" + ); + } + Ok(()) + } + fn compare_string(a: &String, b: &String) { assert_eq!(a.len(), b.len(), "Strings are not equal in size!"); let a = a.as_bytes(); From 19f1a00ca492694e0211cfe893ebfdd5096b5a1c Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 12 Nov 2024 17:09:41 +0100 Subject: [PATCH 04/22] bindings/python: update bindings requirements-dev --- bindings/python/requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/requirements-dev.txt b/bindings/python/requirements-dev.txt index ea68f9ec1..c3a985985 100644 --- a/bindings/python/requirements-dev.txt +++ b/bindings/python/requirements-dev.txt @@ -8,7 +8,7 @@ mypy==1.11.0 # via limbo (pyproject.toml) mypy-extensions==1.0.0 # via mypy -packaging==24.1 +packaging==24.2 # via pytest pluggy==1.5.0 # via pytest From 8d1d2d36cc7e79726bb7df6aec4cd338e9eebb35 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 12 Nov 2024 17:53:59 +0000 Subject: [PATCH 05/22] core/linux: aligned wal header read --- core/io/linux.rs | 42 ++++++++++++++++++++++------------ core/storage/sqlite3_ondisk.rs | 4 ++-- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/core/io/linux.rs b/core/io/linux.rs index 76d20809d..42b14b2a4 100644 --- a/core/io/linux.rs +++ b/core/io/linux.rs @@ -5,7 +5,7 @@ use log::{debug, trace}; use nix::fcntl::{FcntlArg, OFlag}; use std::cell::RefCell; use std::fmt; -use std::os::unix::fs::MetadataExt; +use std::collections::HashMap; use std::os::unix::io::AsRawFd; use std::rc::Rc; use thiserror::Error; @@ -37,6 +37,8 @@ pub struct LinuxIO { struct WrappedIOUring { ring: io_uring::IoUring, pending_ops: usize, + pub pending: HashMap>, + key: u64 } struct InnerLinuxIO { @@ -52,6 +54,8 @@ impl LinuxIO { ring: WrappedIOUring { ring, pending_ops: 0, + pending: HashMap::new(), + key: 0 }, iovecs: [iovec { iov_base: std::ptr::null_mut(), @@ -76,7 +80,8 @@ impl InnerLinuxIO { } impl WrappedIOUring { - fn submit_entry(&mut self, entry: &io_uring::squeue::Entry) { + fn submit_entry(&mut self, entry: &io_uring::squeue::Entry, c: Rc) { + self.pending.insert(entry.get_user_data(), c); unsafe { self.ring .submission() @@ -104,6 +109,11 @@ impl WrappedIOUring { fn empty(&self) -> bool { self.pending_ops == 0 } + + fn get_key(&mut self) -> u64 { + self.key +=1; + self.key + } } impl IO for LinuxIO { @@ -149,8 +159,11 @@ impl IO for LinuxIO { LinuxIOError::IOUringCQError(result) ))); } - let c = unsafe { Rc::from_raw(cqe.user_data() as *const Completion) }; - c.complete(cqe.result()); + { + let c = ring.pending.get(&cqe.user_data()).unwrap().clone(); + c.complete(cqe.result()); + } + ring.pending.remove(&cqe.user_data()); } Ok(()) } @@ -171,6 +184,7 @@ pub struct LinuxFile { file: std::fs::File, } + impl File for LinuxFile { fn lock_file(&self, exclusive: bool) -> Result<()> { let fd = self.file.as_raw_fd(); @@ -234,17 +248,17 @@ impl File for LinuxFile { let mut buf = r.buf_mut(); let len = buf.len(); let buf = buf.as_mut_ptr(); - let ptr = Rc::into_raw(c.clone()); let iovec = io.get_iovec(buf, len); io_uring::opcode::Readv::new(fd, iovec, 1) .offset(pos as u64) .build() - .user_data(ptr as u64) + .user_data(io.ring.get_key()) }; - io.ring.submit_entry(&read_e); + io.ring.submit_entry(&read_e, c); Ok(()) } + fn pwrite( &self, pos: usize, @@ -255,25 +269,25 @@ impl File for LinuxFile { let fd = io_uring::types::Fd(self.file.as_raw_fd()); let write = { let buf = buffer.borrow(); - let ptr = Rc::into_raw(c.clone()); + trace!("pwrite(pos = {}, length = {})", pos, buf.len()); let iovec = io.get_iovec(buf.as_ptr(), buf.len()); io_uring::opcode::Writev::new(fd, iovec, 1) .offset(pos as u64) .build() - .user_data(ptr as u64) + .user_data(io.ring.get_key()) }; - io.ring.submit_entry(&write); + io.ring.submit_entry(&write, c); Ok(()) } fn sync(&self, c: Rc) -> Result<()> { let fd = io_uring::types::Fd(self.file.as_raw_fd()); - let ptr = Rc::into_raw(c.clone()); + let mut io = self.io.borrow_mut(); + trace!("sync()"); let sync = io_uring::opcode::Fsync::new(fd) .build() - .user_data(ptr as u64); - let mut io = self.io.borrow_mut(); - io.ring.submit_entry(&sync); + .user_data(io.ring.get_key()); + io.ring.submit_entry(&sync, c); Ok(()) } diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index ce2136a01..ca4cf555b 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -960,7 +960,7 @@ pub fn write_varint_to_vec(value: u64, payload: &mut Vec) { pub fn begin_read_wal_header(io: &Rc) -> Result>> { let drop_fn = Rc::new(|_buf| {}); - let buf = Rc::new(RefCell::new(Buffer::allocate(WAL_HEADER_SIZE, drop_fn))); + let buf = Rc::new(RefCell::new(Buffer::allocate(512, drop_fn))); let result = Rc::new(RefCell::new(WalHeader::default())); let header = result.clone(); let complete = Box::new(move |buf: Rc>| { @@ -1074,7 +1074,7 @@ pub fn begin_write_wal_header(io: &Rc, header: &WalHeader) -> Result<( let buffer = { let drop_fn = Rc::new(|_buf| {}); - let mut buffer = Buffer::allocate(WAL_HEADER_SIZE, drop_fn); + let mut buffer = Buffer::allocate(512, drop_fn); let buf = buffer.as_mut_slice(); buf[0..4].copy_from_slice(&header.magic); From eb4105282b39aef1fbc14f8b46bb38d3cba40edc Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 12 Nov 2024 23:10:38 +0000 Subject: [PATCH 06/22] btree: fix btree traversal --- core/storage/btree.rs | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 2c76667c7..732c8f1da 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -129,10 +129,10 @@ impl BTreeCursor { if cell_idx == page.cell_count() { // do rightmost - let has_parent = *self.current_page.borrow() > 0; - self.advance(); + let has_parent = self.has_parent(); match page.rightmost_pointer() { Some(right_most_pointer) => { + self.advance(); let mem_page = self.pager.read_page(right_most_pointer as usize)?; self.push_to_stack(mem_page); continue; @@ -150,7 +150,7 @@ impl BTreeCursor { } } - if cell_idx == page.cell_count() + 1 { + if cell_idx >= page.cell_count() + 1 { // end let has_parent = *self.current_page.borrow() > 0; if has_parent { @@ -304,34 +304,37 @@ impl BTreeCursor { let SeekKey::TableRowId(rowid_key) = key else { unreachable!("table seek key should be a rowid"); }; - self.advance(); let found = match op { SeekOp::GT => *cell_rowid > rowid_key, SeekOp::GE => *cell_rowid >= rowid_key, SeekOp::EQ => *cell_rowid == rowid_key, }; + self.advance(); if found { let record = crate::storage::sqlite3_ondisk::read_record(payload)?; return Ok(CursorResult::Ok((Some(*cell_rowid), Some(record)))); + } else { } } BTreeCell::IndexLeafCell(IndexLeafCell { payload, .. }) => { let SeekKey::IndexKey(index_key) = key else { unreachable!("index seek key should be a record"); }; - self.advance(); let record = crate::storage::sqlite3_ondisk::read_record(payload)?; let found = match op { SeekOp::GT => record > *index_key, SeekOp::GE => record >= *index_key, SeekOp::EQ => record == *index_key, }; + self.advance(); if found { let rowid = match record.values.last() { Some(OwnedValue::Integer(rowid)) => *rowid as u64, _ => unreachable!("index cells should have an integer rowid"), }; return Ok(CursorResult::Ok((Some(rowid), Some(record)))); + } else { + } } cell_type => { @@ -431,6 +434,11 @@ impl BTreeCursor { self.cell_indices.borrow_mut()[current] += 1; } + fn set_cell_index(&self, idx: usize) { + let current = self.current(); + self.cell_indices.borrow_mut()[current] = idx; + } + fn has_parent(&self) -> bool { *self.current_page.borrow() > 0 } @@ -459,7 +467,7 @@ impl BTreeCursor { match page.rightmost_pointer() { Some(right_most_pointer) => { self.cell_indices.borrow_mut()[*self.current_page.borrow() as usize] = - page.cell_count(); + page.cell_count() + 1; let mem_page = self.pager.read_page(right_most_pointer as usize).unwrap(); self.push_to_stack(mem_page); continue; @@ -527,18 +535,20 @@ impl BTreeCursor { let SeekKey::TableRowId(rowid_key) = key else { unreachable!("table seek key should be a rowid"); }; - self.advance(); let target_leaf_page_is_in_left_subtree = match cmp { SeekOp::GT => rowid_key < *_rowid, SeekOp::GE => rowid_key <= *_rowid, SeekOp::EQ => rowid_key <= *_rowid, }; + self.advance(); if target_leaf_page_is_in_left_subtree { let mem_page = self.pager.read_page(*_left_child_page as usize)?; self.push_to_stack(mem_page); found_cell = true; break; + } else { + } } BTreeCell::TableLeafCell(TableLeafCell { @@ -565,6 +575,7 @@ impl BTreeCursor { SeekOp::EQ => index_key <= &record, }; if target_leaf_page_is_in_the_left_subtree { + // we don't advance in case of index tree internal nodes because we will visit this node going up let mem_page = self.pager.read_page(*left_child_page as usize).unwrap(); self.push_to_stack(mem_page); found_cell = true; @@ -584,6 +595,7 @@ impl BTreeCursor { if !found_cell { match contents.rightmost_pointer() { Some(right_most_pointer) => { + self.advance(); let mem_page = self.pager.read_page(right_most_pointer as usize).unwrap(); self.push_to_stack(mem_page); continue; From 800354144a8334ba7cca5e8dd80c7a36e297fddc Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 12 Nov 2024 23:11:04 +0000 Subject: [PATCH 07/22] remove extra join test --- testing/all.test | 1 - 1 file changed, 1 deletion(-) diff --git a/testing/all.test b/testing/all.test index 171144665..966fbce69 100755 --- a/testing/all.test +++ b/testing/all.test @@ -8,7 +8,6 @@ source $testdir/coalesce.test source $testdir/glob.test source $testdir/join.test source $testdir/insert.test -source $testdir/join.test source $testdir/json.test source $testdir/like.test source $testdir/orderby.test From 78118bae81fc1bbd36dbec26b1b3bca853c3c446 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 09:34:36 +0000 Subject: [PATCH 08/22] fmt --- core/io/linux.rs | 15 +++++++-------- core/storage/btree.rs | 2 -- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/core/io/linux.rs b/core/io/linux.rs index 42b14b2a4..8e166f2c7 100644 --- a/core/io/linux.rs +++ b/core/io/linux.rs @@ -4,8 +4,8 @@ use libc::{c_short, fcntl, flock, iovec, F_SETLK}; use log::{debug, trace}; use nix::fcntl::{FcntlArg, OFlag}; use std::cell::RefCell; -use std::fmt; use std::collections::HashMap; +use std::fmt; use std::os::unix::io::AsRawFd; use std::rc::Rc; use thiserror::Error; @@ -38,7 +38,7 @@ struct WrappedIOUring { ring: io_uring::IoUring, pending_ops: usize, pub pending: HashMap>, - key: u64 + key: u64, } struct InnerLinuxIO { @@ -55,7 +55,7 @@ impl LinuxIO { ring, pending_ops: 0, pending: HashMap::new(), - key: 0 + key: 0, }, iovecs: [iovec { iov_base: std::ptr::null_mut(), @@ -111,7 +111,7 @@ impl WrappedIOUring { } fn get_key(&mut self) -> u64 { - self.key +=1; + self.key += 1; self.key } } @@ -155,8 +155,9 @@ impl IO for LinuxIO { let result = cqe.result(); if result < 0 { return Err(LimboError::LinuxIOError(format!( - "{}", - LinuxIOError::IOUringCQError(result) + "{} cqe: {:?}", + LinuxIOError::IOUringCQError(result), + cqe ))); } { @@ -184,7 +185,6 @@ pub struct LinuxFile { file: std::fs::File, } - impl File for LinuxFile { fn lock_file(&self, exclusive: bool) -> Result<()> { let fd = self.file.as_raw_fd(); @@ -258,7 +258,6 @@ impl File for LinuxFile { Ok(()) } - fn pwrite( &self, pos: usize, diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 732c8f1da..39dfed357 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -334,7 +334,6 @@ impl BTreeCursor { }; return Ok(CursorResult::Ok((Some(rowid), Some(record)))); } else { - } } cell_type => { @@ -548,7 +547,6 @@ impl BTreeCursor { found_cell = true; break; } else { - } } BTreeCell::TableLeafCell(TableLeafCell { From 6834f11e283c32d83bb0ac0bea78a1f3a6334d20 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 10:01:06 +0000 Subject: [PATCH 09/22] add dbug option rust.yml --- .github/workflows/rust.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e40845728..75ae04cc5 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -25,11 +25,13 @@ jobs: runs-on: ${{ matrix.os }} steps: - - uses: Swatinem/rust-cache@v2 - uses: actions/checkout@v3 + - uses: Swatinem/rust-cache@v2 - name: Build run: cargo build --verbose - name: Test + env: + RUST_LOG: ${{ runner.debug && 'limbo_core::storage=debug' || '' }} run: cargo test --verbose build-wasm: From cfb60467171da381b08573e83079484cc2e70ea9 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 10:14:12 +0000 Subject: [PATCH 10/22] trace instead of debug on debug --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 75ae04cc5..8501a12fb 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -31,7 +31,7 @@ jobs: run: cargo build --verbose - name: Test env: - RUST_LOG: ${{ runner.debug && 'limbo_core::storage=debug' || '' }} + RUST_LOG: ${{ runner.debug && 'limbo_core::storage=trace || '' }} run: cargo test --verbose build-wasm: From 97ec01a2203eac35a41b96a8b913256ad80dcc9b Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 10:20:16 +0000 Subject: [PATCH 11/22] fix missing ' --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8501a12fb..eecbacf69 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -31,7 +31,7 @@ jobs: run: cargo build --verbose - name: Test env: - RUST_LOG: ${{ runner.debug && 'limbo_core::storage=trace || '' }} + RUST_LOG: ${{ runner.debug && 'limbo_core::storage=trace' || '' }} run: cargo test --verbose build-wasm: From 13a95b583fe6d4c9a73ed6c9788bac90ad848cf7 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 11:03:38 +0000 Subject: [PATCH 12/22] debug logs --- test/src/lib.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/src/lib.rs b/test/src/lib.rs index 0e9c2cbdd..19f130cc6 100644 --- a/test/src/lib.rs +++ b/test/src/lib.rs @@ -28,9 +28,12 @@ impl TempDatabase { } pub fn connect_limbo(&self) -> Rc { + log::debug!("conneting to limbo"); let db = Database::open_file(self.io.clone(), self.path.to_str().unwrap()).unwrap(); - db.connect() + let conn = db.connect(); + log::debug!("connected to limbo"); + conn } } @@ -313,6 +316,7 @@ mod tests { // threshold is 1000 by default fn insert(i: usize, conn: &Rc, tmp_db: &TempDatabase) -> anyhow::Result<()> { + log::debug!("inserting {}", i); let insert_query = format!("INSERT INTO test VALUES ({})", i); match conn.query(insert_query) { Ok(Some(ref mut rows)) => loop { @@ -329,11 +333,13 @@ mod tests { eprintln!("{}", err); } }; + log::debug!("inserted {}", i); tmp_db.io.run_once()?; Ok(()) } fn count(conn: &Rc, tmp_db: &TempDatabase) -> anyhow::Result { + log::debug!("counting"); let list_query = "SELECT count(x) FROM test"; loop { match conn.query(list_query).unwrap() { @@ -345,6 +351,7 @@ mod tests { Value::Integer(i) => *i as i32, _ => unreachable!(), }; + log::debug!("counted {}", count); return Ok(count as usize); } RowResult::IO => { From 2d03e90840836ab9a577ed7f569bb052946d0267 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 11:13:02 +0000 Subject: [PATCH 13/22] remove test wal --- .github/workflows/rust.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index eecbacf69..a8129dca2 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -17,6 +17,23 @@ jobs: - name: Check formatting run: cargo fmt --check + test-wal-restart: + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v3 + - uses: Swatinem/rust-cache@v2 + - name: Build + run: cargo build --verbose + - name: Test + env: + RUST_LOG: 'trace' + run: cargo test --verbose + timeout-minutes: 5 build-native: strategy: matrix: @@ -33,6 +50,7 @@ jobs: env: RUST_LOG: ${{ runner.debug && 'limbo_core::storage=trace' || '' }} run: cargo test --verbose + timeout-minutes: 5 build-wasm: runs-on: ubuntu-latest From 87957bf4f2ceea4eed610e0ffbbbafa39a31e675 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 11:24:07 +0000 Subject: [PATCH 14/22] checkpoint log state --- core/storage/pager.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index a0f29233e..6dec78000 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -304,7 +304,7 @@ enum FlushState { WaitSyncDbFile, } -#[derive(Clone)] +#[derive(Clone, Debug)] enum CheckpointState { Checkpoint, CheckpointDone, @@ -553,6 +553,7 @@ impl Pager { pub fn checkpoint(&self) -> Result { loop { let state = self.checkpoint_state.borrow().clone(); + log::trace!("checkpoint(state={:?})", state); match state { CheckpointState::Checkpoint => { let in_flight = self.flush_info.borrow().in_flight_writes.clone(); From cbfb45e55b4d96e46a07f2e7d0cc8270c6c3c781 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 11:31:22 +0000 Subject: [PATCH 15/22] more trace logs --- core/io/linux.rs | 2 ++ core/storage/sqlite3_ondisk.rs | 3 +++ 2 files changed, 5 insertions(+) diff --git a/core/io/linux.rs b/core/io/linux.rs index 8e166f2c7..4bc41ff31 100644 --- a/core/io/linux.rs +++ b/core/io/linux.rs @@ -81,6 +81,7 @@ impl InnerLinuxIO { impl WrappedIOUring { fn submit_entry(&mut self, entry: &io_uring::squeue::Entry, c: Rc) { + log::trace!("submit_entry({:?})", entry); self.pending.insert(entry.get_user_data(), c); unsafe { self.ring @@ -100,6 +101,7 @@ impl WrappedIOUring { // NOTE: This works because CompletionQueue's next function pops the head of the queue. This is not normal behaviour of iterators let entry = self.ring.completion().next(); if entry.is_some() { + log::trace!("get_completion({:?})", entry); // consumed an entry from completion queue, update pending_ops self.pending_ops -= 1; } diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index ca4cf555b..46be15b57 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -539,6 +539,7 @@ pub fn begin_write_btree_page( let page_finish = page.clone(); let page_id = page.borrow().id; + log::trace!("begin_write_btree_page(page_id={})", page_id); let buffer = { let page = page.borrow(); let contents = page.contents.read().unwrap(); @@ -549,6 +550,7 @@ pub fn begin_write_btree_page( *write_counter.borrow_mut() += 1; let write_complete = { let buf_copy = buffer.clone(); + log::trace!("finish_write_btree_page"); Box::new(move |bytes_written: i32| { let buf_copy = buf_copy.clone(); let buf_len = buf_copy.borrow().len(); @@ -1054,6 +1056,7 @@ pub fn begin_write_wal_frame( *write_counter.borrow_mut() += 1; let write_complete = { let buf_copy = buffer.clone(); + log::info!("finished"); Box::new(move |bytes_written: i32| { let buf_copy = buf_copy.clone(); let buf_len = buf_copy.borrow().len(); From 94a45eab9ea5fb2139a40998d647dee3fff43e73 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 12:04:54 +0000 Subject: [PATCH 16/22] checkpoint inflight --- core/storage/pager.rs | 6 ++++-- core/storage/sqlite3_ondisk.rs | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 6dec78000..c8926c147 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -336,6 +336,7 @@ pub struct Pager { flush_info: RefCell, checkpoint_state: RefCell, + checkpoint_inflight: Rc>, syncing: Rc>, } @@ -370,6 +371,7 @@ impl Pager { }), syncing: Rc::new(RefCell::new(false)), checkpoint_state: RefCell::new(CheckpointState::Checkpoint), + checkpoint_inflight: Rc::new(RefCell::new(0)), }) } @@ -556,7 +558,7 @@ impl Pager { log::trace!("checkpoint(state={:?})", state); match state { CheckpointState::Checkpoint => { - let in_flight = self.flush_info.borrow().in_flight_writes.clone(); + let in_flight = self.checkpoint_inflight.clone(); match self.wal.borrow_mut().checkpoint(self, in_flight)? { CheckpointStatus::IO => return Ok(CheckpointStatus::IO), CheckpointStatus::Done => { @@ -566,7 +568,7 @@ impl Pager { }; } CheckpointState::CheckpointDone => { - let in_flight = self.flush_info.borrow().in_flight_writes.clone(); + let in_flight = self.checkpoint_inflight.clone(); if *in_flight.borrow() > 0 { return Ok(CheckpointStatus::IO); } else { diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index 46be15b57..a581b84bc 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -550,8 +550,8 @@ pub fn begin_write_btree_page( *write_counter.borrow_mut() += 1; let write_complete = { let buf_copy = buffer.clone(); - log::trace!("finish_write_btree_page"); Box::new(move |bytes_written: i32| { + log::trace!("finish_write_btree_page"); let buf_copy = buf_copy.clone(); let buf_len = buf_copy.borrow().len(); *write_counter.borrow_mut() -= 1; From e2276c2e9d55a7b756d401211c810960b857db6f Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 13:45:42 +0000 Subject: [PATCH 17/22] O_DIRECT disable on WAL --- bindings/wasm/lib.rs | 11 +++++++++-- core/io/common.rs | 4 ++-- core/io/darwin.rs | 2 +- core/io/generic.rs | 2 +- core/io/linux.rs | 12 +++++++----- core/io/mod.rs | 2 +- core/io/windows.rs | 2 +- core/lib.rs | 2 +- core/storage/wal.rs | 2 +- simulator/main.rs | 9 +++++++-- 10 files changed, 31 insertions(+), 17 deletions(-) diff --git a/bindings/wasm/lib.rs b/bindings/wasm/lib.rs index aff3efa85..889bb81f4 100644 --- a/bindings/wasm/lib.rs +++ b/bindings/wasm/lib.rs @@ -15,7 +15,9 @@ impl Database { #[wasm_bindgen(constructor)] pub fn new(path: &str) -> Database { let io = Arc::new(PlatformIO { vfs: VFS::new() }); - let file = io.open_file(path, limbo_core::OpenFlags::None).unwrap(); + let file = io + .open_file(path, limbo_core::OpenFlags::None, false) + .unwrap(); let page_io = Rc::new(DatabaseStorage::new(file)); let wal = Rc::new(RefCell::new(Wal {})); let inner = limbo_core::Database::open(io, page_io, wal).unwrap(); @@ -87,7 +89,12 @@ pub struct PlatformIO { } impl limbo_core::IO for PlatformIO { - fn open_file(&self, path: &str, _flags: OpenFlags) -> Result> { + fn open_file( + &self, + path: &str, + _flags: OpenFlags, + _direct: bool, + ) -> Result> { let fd = self.vfs.open(path); Ok(Rc::new(File { vfs: VFS::new(), diff --git a/core/io/common.rs b/core/io/common.rs index 6627e2076..9608c48ab 100644 --- a/core/io/common.rs +++ b/core/io/common.rs @@ -13,7 +13,7 @@ pub mod tests { // Parent process opens the file let io1 = create_io().expect("Failed to create IO"); let _file1 = io1 - .open_file(&path, crate::io::OpenFlags::None) + .open_file(&path, crate::io::OpenFlags::None, false) .expect("Failed to open file in parent process"); let current_exe = std::env::current_exe().expect("Failed to get current executable path"); @@ -38,7 +38,7 @@ pub mod tests { if std::env::var("RUST_TEST_CHILD_PROCESS").is_ok() { let path = std::env::var("RUST_TEST_FILE_PATH")?; let io = create_io()?; - match io.open_file(&path, crate::io::OpenFlags::None) { + match io.open_file(&path, crate::io::OpenFlags::None, false) { Ok(_) => std::process::exit(0), Err(_) => std::process::exit(1), } diff --git a/core/io/darwin.rs b/core/io/darwin.rs index cf527a3cb..bdab24af7 100644 --- a/core/io/darwin.rs +++ b/core/io/darwin.rs @@ -31,7 +31,7 @@ impl DarwinIO { } impl IO for DarwinIO { - fn open_file(&self, path: &str, flags: OpenFlags) -> Result> { + fn open_file(&self, path: &str, flags: OpenFlags, _direct: bool) -> Result> { trace!("open_file(path = {})", path); let file = std::fs::File::options() .read(true) diff --git a/core/io/generic.rs b/core/io/generic.rs index c703ff78e..c8c5c45b8 100644 --- a/core/io/generic.rs +++ b/core/io/generic.rs @@ -13,7 +13,7 @@ impl GenericIO { } impl IO for GenericIO { - fn open_file(&self, path: &str, flags: OpenFlags) -> Result> { + fn open_file(&self, path: &str, flags: OpenFlags, _direct: bool) -> Result> { trace!("open_file(path = {})", path); let file = std::fs::File::open(path)?; Ok(Rc::new(GenericFile { diff --git a/core/io/linux.rs b/core/io/linux.rs index 4bc41ff31..3a2de5f42 100644 --- a/core/io/linux.rs +++ b/core/io/linux.rs @@ -119,7 +119,7 @@ impl WrappedIOUring { } impl IO for LinuxIO { - fn open_file(&self, path: &str, flags: OpenFlags) -> Result> { + fn open_file(&self, path: &str, flags: OpenFlags, direct: bool) -> Result> { trace!("open_file(path = {})", path); let file = std::fs::File::options() .read(true) @@ -129,10 +129,12 @@ impl IO for LinuxIO { // Let's attempt to enable direct I/O. Not all filesystems support it // so ignore any errors. let fd = file.as_raw_fd(); - match nix::fcntl::fcntl(fd, FcntlArg::F_SETFL(OFlag::O_DIRECT)) { - Ok(_) => {}, - Err(error) => debug!("Error {error:?} returned when setting O_DIRECT flag to read file. The performance of the system may be affected"), - }; + if direct { + match nix::fcntl::fcntl(fd, FcntlArg::F_SETFL(OFlag::O_DIRECT)) { + Ok(_) => {}, + Err(error) => debug!("Error {error:?} returned when setting O_DIRECT flag to read file. The performance of the system may be affected"), + }; + } let linux_file = Rc::new(LinuxFile { io: self.inner.clone(), file, diff --git a/core/io/mod.rs b/core/io/mod.rs index 5d87be948..05b418206 100644 --- a/core/io/mod.rs +++ b/core/io/mod.rs @@ -24,7 +24,7 @@ pub enum OpenFlags { } pub trait IO { - fn open_file(&self, path: &str, flags: OpenFlags) -> Result>; + fn open_file(&self, path: &str, flags: OpenFlags, direct: bool) -> Result>; fn run_once(&self) -> Result<()>; diff --git a/core/io/windows.rs b/core/io/windows.rs index 1ef18665d..db7f9da31 100644 --- a/core/io/windows.rs +++ b/core/io/windows.rs @@ -13,7 +13,7 @@ impl WindowsIO { } impl IO for WindowsIO { - fn open_file(&self, path: &str, flags: OpenFlags) -> Result> { + fn open_file(&self, path: &str, flags: OpenFlags, direct: bool) -> Result> { trace!("open_file(path = {})", path); let file = std::fs::File::options() .read(true) diff --git a/core/lib.rs b/core/lib.rs index 88609e055..fab59ed31 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -65,7 +65,7 @@ pub struct Database { impl Database { #[cfg(feature = "fs")] pub fn open_file(io: Arc, path: &str) -> Result> { - let file = io.open_file(path, io::OpenFlags::None)?; + let file = io.open_file(path, io::OpenFlags::None, true)?; let page_io = Rc::new(FileStorage::new(file)); let wal_path = format!("{}-wal", path); let db_header = Pager::begin_open(page_io.clone())?; diff --git a/core/storage/wal.rs b/core/storage/wal.rs index c670786d9..1e4e6f313 100644 --- a/core/storage/wal.rs +++ b/core/storage/wal.rs @@ -255,7 +255,7 @@ impl WalFile { if self.file.borrow().is_none() { match self .io - .open_file(&self.wal_path, crate::io::OpenFlags::Create) + .open_file(&self.wal_path, crate::io::OpenFlags::Create, false) { Ok(file) => { if file.size()? > 0 { diff --git a/simulator/main.rs b/simulator/main.rs index fb80dc3b1..ceea26ee3 100644 --- a/simulator/main.rs +++ b/simulator/main.rs @@ -94,8 +94,13 @@ impl SimulatorIO { } impl IO for SimulatorIO { - fn open_file(&self, path: &str, flags: OpenFlags) -> Result> { - let inner = self.inner.open_file(path, flags)?; + fn open_file( + &self, + path: &str, + flags: OpenFlags, + _direct: bool, + ) -> Result> { + let inner = self.inner.open_file(path, flags, false)?; let file = Rc::new(SimulatorFile { inner, fault: RefCell::new(false), From cf86da0ecf8cf5b7d23ccd117658fc108d418395 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 13:57:15 +0000 Subject: [PATCH 18/22] remove extra workflow --- .github/workflows/rust.yml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index a8129dca2..d6cd40313 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -17,23 +17,6 @@ jobs: - name: Check formatting run: cargo fmt --check - test-wal-restart: - strategy: - matrix: - os: [ubuntu-latest, macos-latest, windows-latest] - - runs-on: ${{ matrix.os }} - - steps: - - uses: actions/checkout@v3 - - uses: Swatinem/rust-cache@v2 - - name: Build - run: cargo build --verbose - - name: Test - env: - RUST_LOG: 'trace' - run: cargo test --verbose - timeout-minutes: 5 build-native: strategy: matrix: From dfdd8083e2575640296d3114e20d49059dd0cc62 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 13:58:05 +0000 Subject: [PATCH 19/22] remove ignore from wal tests --- test/src/lib.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/src/lib.rs b/test/src/lib.rs index 19f130cc6..aa3ee5967 100644 --- a/test/src/lib.rs +++ b/test/src/lib.rs @@ -43,7 +43,6 @@ mod tests { use limbo_core::{CheckpointStatus, Connection, RowResult, Value}; use log::debug; - #[ignore] #[test] fn test_sequential_write() -> anyhow::Result<()> { let _ = env_logger::try_init(); @@ -107,7 +106,6 @@ mod tests { } #[test] - #[ignore] fn test_simple_overflow_page() -> anyhow::Result<()> { let _ = env_logger::try_init(); let tmp_db = TempDatabase::new("CREATE TABLE test (x INTEGER PRIMARY KEY, t TEXT);"); @@ -173,7 +171,6 @@ mod tests { Ok(()) } - #[ignore] #[test] fn test_sequential_overflow_page() -> anyhow::Result<()> { let _ = env_logger::try_init(); @@ -248,7 +245,6 @@ mod tests { } #[test] - #[ignore] fn test_wal_checkpoint() -> anyhow::Result<()> { let _ = env_logger::try_init(); let tmp_db = TempDatabase::new("CREATE TABLE test (x INTEGER PRIMARY KEY);"); From 9d7a3e8e60522774b076716aa039fb5867f89277 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 14:04:28 +0000 Subject: [PATCH 20/22] ignore sequential test --- test/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/test/src/lib.rs b/test/src/lib.rs index aa3ee5967..b11aa3ab6 100644 --- a/test/src/lib.rs +++ b/test/src/lib.rs @@ -43,6 +43,7 @@ mod tests { use limbo_core::{CheckpointStatus, Connection, RowResult, Value}; use log::debug; + #[ignore] #[test] fn test_sequential_write() -> anyhow::Result<()> { let _ = env_logger::try_init(); From 23923e417527a2528fff3303ed9c7381bc3aece7 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 14:09:38 +0000 Subject: [PATCH 21/22] ignore wal checkpoint test --- test/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/test/src/lib.rs b/test/src/lib.rs index b11aa3ab6..5e0367206 100644 --- a/test/src/lib.rs +++ b/test/src/lib.rs @@ -246,6 +246,7 @@ mod tests { } #[test] + #[ignore] fn test_wal_checkpoint() -> anyhow::Result<()> { let _ = env_logger::try_init(); let tmp_db = TempDatabase::new("CREATE TABLE test (x INTEGER PRIMARY KEY);"); From 2a787aedb5338997d6f0292d8f27dd1ecc12ff6e Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 13 Nov 2024 17:13:30 +0100 Subject: [PATCH 22/22] core: extract page stack and document a bit --- core/storage/btree.rs | 295 +++++++++++++++++++++++------------------- 1 file changed, 160 insertions(+), 135 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 39dfed357..bb5afd851 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -24,6 +24,17 @@ const BTREE_HEADER_OFFSET_CELL_CONTENT: usize = 5; /* pointer to first byte of c const BTREE_HEADER_OFFSET_FRAGMENTED: usize = 7; /* number of fragmented bytes -> u8 */ const BTREE_HEADER_OFFSET_RIGHTMOST: usize = 8; /* if internalnode, pointer right most pointer (saved separately from cells) -> u32 */ +/* +** Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than +** this will be declared corrupt. This value is calculated based on a +** maximum database size of 2^31 pages a minimum fanout of 2 for a +** root-node and 3 for all other internal nodes. +** +** If a tree that appears to be taller than this is encountered, it is +** assumed that the database is corrupt. +*/ +pub const BTCURSOR_MAX_DEPTH: usize = 20; + #[derive(Debug)] enum WriteState { Start, @@ -41,30 +52,40 @@ struct WriteInfo { page_copy: RefCell>, // this holds the copy a of a page needed for buffer references } -/* -** Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than -** this will be declared corrupt. This value is calculated based on a -** maximum database size of 2^31 pages a minimum fanout of 2 for a -** root-node and 3 for all other internal nodes. -** -** If a tree that appears to be taller than this is encountered, it is -** assumed that the database is corrupt. -*/ -pub const BTCURSOR_MAX_DEPTH: usize = 20; - pub struct BTreeCursor { pager: Rc, + /// Page id of the root page used to go back up fast. root_page: usize, + /// Rowid and record are stored before being consumed. rowid: RefCell>, record: RefCell>, null_flag: bool, database_header: Rc>, + /// Index internal pages are consumed on the way up, so we store going upwards flag in case + /// we just moved to a parent page and the parent page is an internal index page which requires + /// to be consumed. going_upwards: bool, + /// Write information kept in case of write yields due to I/O. Needs to be stored somewhere + /// right :). write_info: WriteInfo, + /// Page stack used to traverse the btree. + /// Each cursor has a stack because each cursor traverses the btree independently. + stack: PageStack, +} +/// Stack of pages representing the tree traversal order. +/// current_page represents the current page being used in the tree and current_page - 1 would be +/// the parent. Using current_page + 1 or higher is undefined behaviour. +struct PageStack { + /// Pointer to the currenet page being consumed current_page: RefCell, - cell_indices: RefCell<[usize; BTCURSOR_MAX_DEPTH + 1]>, + /// List of pages in the stack. Root page will be in index 0 stack: RefCell<[Option>>; BTCURSOR_MAX_DEPTH + 1]>, + /// List of cell indices in the stack. + /// cell_indices[current_page] is the current cell index being consumed. Similarly + /// cell_indices[current_page-1] is the cell index of the parent of the current page + /// that we save in case of going back up. + cell_indices: RefCell<[usize; BTCURSOR_MAX_DEPTH + 1]>, } impl BTreeCursor { @@ -88,9 +109,11 @@ impl BTreeCursor { rightmost_pointer: RefCell::new(None), page_copy: RefCell::new(None), }, - current_page: RefCell::new(-1), - cell_indices: RefCell::new([0; BTCURSOR_MAX_DEPTH + 1]), - stack: RefCell::new([const { None }; BTCURSOR_MAX_DEPTH + 1]), + stack: PageStack { + current_page: RefCell::new(-1), + cell_indices: RefCell::new([0; BTCURSOR_MAX_DEPTH + 1]), + stack: RefCell::new([const { None }; BTCURSOR_MAX_DEPTH + 1]), + }, } } @@ -111,8 +134,8 @@ impl BTreeCursor { predicate: Option<(SeekKey<'_>, SeekOp)>, ) -> Result, Option)>> { loop { - let mem_page_rc = self.top_from_stack(); - let cell_idx = self.current_index(); + let mem_page_rc = self.stack.top(); + let cell_idx = self.stack.current_index(); debug!("current id={} cell={}", mem_page_rc.borrow().id, cell_idx); if mem_page_rc.borrow().is_locked() { @@ -129,19 +152,19 @@ impl BTreeCursor { if cell_idx == page.cell_count() { // do rightmost - let has_parent = self.has_parent(); + let has_parent = self.stack.has_parent(); match page.rightmost_pointer() { Some(right_most_pointer) => { - self.advance(); + self.stack.advance(); let mem_page = self.pager.read_page(right_most_pointer as usize)?; - self.push_to_stack(mem_page); + self.stack.push(mem_page); continue; } None => { if has_parent { debug!("moving simple upwards"); self.going_upwards = true; - self.pop_from_stack(); + self.stack.pop(); continue; } else { return Ok(CursorResult::Ok((None, None))); @@ -152,11 +175,11 @@ impl BTreeCursor { if cell_idx >= page.cell_count() + 1 { // end - let has_parent = *self.current_page.borrow() > 0; + let has_parent = self.stack.current() > 0; if has_parent { debug!("moving upwards"); self.going_upwards = true; - self.pop_from_stack(); + self.stack.pop(); continue; } else { return Ok(CursorResult::Ok((None, None))); @@ -177,9 +200,9 @@ impl BTreeCursor { _rowid, }) => { assert!(predicate.is_none()); - self.advance(); + self.stack.advance(); let mem_page = self.pager.read_page(*_left_child_page as usize)?; - self.push_to_stack(mem_page); + self.stack.push(mem_page); continue; } BTreeCell::TableLeafCell(TableLeafCell { @@ -188,7 +211,7 @@ impl BTreeCursor { first_overflow_page: _, }) => { assert!(predicate.is_none()); - self.advance(); + self.stack.advance(); let record = crate::storage::sqlite3_ondisk::read_record(_payload)?; return Ok(CursorResult::Ok((Some(*_rowid), Some(record)))); } @@ -199,12 +222,12 @@ impl BTreeCursor { }) => { if !self.going_upwards { let mem_page = self.pager.read_page(*left_child_page as usize)?; - self.push_to_stack(mem_page); + self.stack.push(mem_page); continue; } self.going_upwards = false; - self.advance(); + self.stack.advance(); let record = crate::storage::sqlite3_ondisk::read_record(payload)?; if predicate.is_none() { @@ -235,7 +258,7 @@ impl BTreeCursor { } } BTreeCell::IndexLeafCell(IndexLeafCell { payload, .. }) => { - self.advance(); + self.stack.advance(); let record = crate::storage::sqlite3_ondisk::read_record(payload)?; if predicate.is_none() { let rowid = match record.values.last() { @@ -278,7 +301,7 @@ impl BTreeCursor { }; { - let page_rc = self.top_from_stack(); + let page_rc = self.stack.top(); let page = page_rc.borrow(); if page.is_locked() { return Ok(CursorResult::IO); @@ -309,11 +332,10 @@ impl BTreeCursor { SeekOp::GE => *cell_rowid >= rowid_key, SeekOp::EQ => *cell_rowid == rowid_key, }; - self.advance(); + self.stack.advance(); if found { let record = crate::storage::sqlite3_ondisk::read_record(payload)?; return Ok(CursorResult::Ok((Some(*cell_rowid), Some(record)))); - } else { } } BTreeCell::IndexLeafCell(IndexLeafCell { payload, .. }) => { @@ -326,14 +348,13 @@ impl BTreeCursor { SeekOp::GE => record >= *index_key, SeekOp::EQ => record == *index_key, }; - self.advance(); + self.stack.advance(); if found { let rowid = match record.values.last() { Some(OwnedValue::Integer(rowid)) => *rowid as u64, _ => unreachable!("index cells should have an integer rowid"), }; return Ok(CursorResult::Ok((Some(rowid), Some(record)))); - } else { } } cell_type => { @@ -368,85 +389,15 @@ impl BTreeCursor { fn move_to_root(&mut self) { let mem_page = self.pager.read_page(self.root_page).unwrap(); - self.stack.borrow_mut()[0] = Some(mem_page); - self.cell_indices.borrow_mut()[0] = 0; - *self.current_page.borrow_mut() = 0; - } - - fn push_to_stack(&self, page: Rc>) { - debug!( - "push to stack {} {}", - self.current_page.borrow(), - page.borrow().id - ); - *self.current_page.borrow_mut() += 1; - let current = *self.current_page.borrow(); - assert!( - current < BTCURSOR_MAX_DEPTH as i32, - "corrupted database, stack is bigger than expected" - ); - self.stack.borrow_mut()[current as usize] = Some(page); - self.cell_indices.borrow_mut()[current as usize] = 0; - } - - fn pop_from_stack(&self) { - let current = *self.current_page.borrow(); - debug!("pop_from_stack(current={})", current); - self.cell_indices.borrow_mut()[current as usize] = 0; - self.stack.borrow_mut()[current as usize] = None; - *self.current_page.borrow_mut() -= 1; - } - - fn top_from_stack(&self) -> Rc> { - let current = *self.current_page.borrow(); - let page = self.stack.borrow()[current as usize] - .as_ref() - .unwrap() - .clone(); - debug!( - "top_from_stack(current={}, page_id={})", - current, - page.borrow().id - ); - page - } - - fn parent(&self) -> Rc> { - let current = *self.current_page.borrow(); - self.stack.borrow()[current as usize - 1] - .as_ref() - .unwrap() - .clone() - } - - fn current(&self) -> usize { - *self.current_page.borrow() as usize - } - - fn current_index(&self) -> usize { - let current = self.current(); - self.cell_indices.borrow()[current] - } - - fn advance(&self) { - let current = self.current(); - self.cell_indices.borrow_mut()[current] += 1; - } - - fn set_cell_index(&self, idx: usize) { - let current = self.current(); - self.cell_indices.borrow_mut()[current] = idx; - } - - fn has_parent(&self) -> bool { - *self.current_page.borrow() > 0 + self.stack.clear(); + self.stack.push(mem_page); } fn move_to_rightmost(&mut self) -> Result> { self.move_to_root(); loop { - let mem_page = self.top_from_stack(); + let mem_page = self.stack.top(); let page_idx = mem_page.borrow().id; let page = self.pager.read_page(page_idx)?; let page = RefCell::borrow(&page); @@ -457,18 +408,16 @@ impl BTreeCursor { let page = page.as_ref().unwrap(); if page.is_leaf() { if page.cell_count() > 0 { - self.cell_indices.borrow_mut()[*self.current_page.borrow() as usize] = - page.cell_count() - 1; + self.stack.set_cell_index(page.cell_count() - 1); } return Ok(CursorResult::Ok(())); } match page.rightmost_pointer() { Some(right_most_pointer) => { - self.cell_indices.borrow_mut()[*self.current_page.borrow() as usize] = - page.cell_count() + 1; + self.stack.set_cell_index(page.cell_count() + 1); let mem_page = self.pager.read_page(right_most_pointer as usize).unwrap(); - self.push_to_stack(mem_page); + self.stack.push(mem_page); continue; } @@ -506,7 +455,7 @@ impl BTreeCursor { self.move_to_root(); loop { - let page_rc = self.top_from_stack(); + let page_rc = self.stack.top(); let page = RefCell::borrow(&page_rc); if page.is_locked() { return Ok(CursorResult::IO); @@ -539,14 +488,12 @@ impl BTreeCursor { SeekOp::GE => rowid_key <= *_rowid, SeekOp::EQ => rowid_key <= *_rowid, }; - self.advance(); + self.stack.advance(); if target_leaf_page_is_in_left_subtree { let mem_page = self.pager.read_page(*_left_child_page as usize)?; - self.push_to_stack(mem_page); - + self.stack.push(mem_page); found_cell = true; break; - } else { } } BTreeCell::TableLeafCell(TableLeafCell { @@ -575,11 +522,11 @@ impl BTreeCursor { if target_leaf_page_is_in_the_left_subtree { // we don't advance in case of index tree internal nodes because we will visit this node going up let mem_page = self.pager.read_page(*left_child_page as usize).unwrap(); - self.push_to_stack(mem_page); + self.stack.push(mem_page); found_cell = true; break; } else { - self.advance(); + self.stack.advance(); } } BTreeCell::IndexLeafCell(_) => { @@ -593,9 +540,9 @@ impl BTreeCursor { if !found_cell { match contents.rightmost_pointer() { Some(right_most_pointer) => { - self.advance(); + self.stack.advance(); let mem_page = self.pager.read_page(right_most_pointer as usize).unwrap(); - self.push_to_stack(mem_page); + self.stack.push(mem_page); continue; } None => { @@ -615,7 +562,7 @@ impl BTreeCursor { let state = &self.write_info.state; match state { WriteState::Start => { - let page_ref = self.top_from_stack(); + let page_ref = self.stack.top(); let int_key = match key { OwnedValue::Integer(i) => *i as u64, _ => unreachable!("btree tables are indexed by integers!"), @@ -800,7 +747,7 @@ impl BTreeCursor { // can be a "rightmost pointer" or a "cell". // TODO(pere): simplify locking... // we always asumme there is a parent - let current_page = self.top_from_stack(); + let current_page = self.stack.top(); let page_rc = RefCell::borrow(¤t_page); { // check if we don't need to balance @@ -816,7 +763,7 @@ impl BTreeCursor { } } - if !self.has_parent() { + if !self.stack.has_parent() { drop(page_rc); drop(current_page); self.balance_root(); @@ -883,7 +830,7 @@ impl BTreeCursor { debug!( "splitting left={} right={}", - *self.current_page.borrow(), + self.stack.current(), right_page_id ); @@ -891,7 +838,7 @@ impl BTreeCursor { Ok(CursorResult::Ok(())) } WriteState::BalanceGetParentPage => { - let parent_rc = self.parent(); + let parent_rc = self.stack.parent(); let loaded = parent_rc.borrow().is_loaded(); let locked = parent_rc.borrow().is_locked(); @@ -909,11 +856,11 @@ impl BTreeCursor { } } WriteState::BalanceMoveUp => { - let parent_ref = self.parent(); + let parent_ref = self.stack.parent(); let parent = RefCell::borrow_mut(&parent_ref); let (page_type, current_idx) = { - let current_page = self.top_from_stack(); + let current_page = self.stack.top(); let page_ref = current_page.borrow(); let page = page_ref.contents.read().unwrap(); (page.as_ref().unwrap().page_type().clone(), page_ref.id) @@ -1006,7 +953,7 @@ impl BTreeCursor { current_cell_index += cells_to_copy; } let is_leaf = { - let page = self.top_from_stack(); + let page = self.stack.top(); let page = page.borrow(); let page = page.contents.read().unwrap(); page.as_ref().unwrap().is_leaf() @@ -1103,7 +1050,7 @@ impl BTreeCursor { let last_pointer = new_pages.last().unwrap().borrow().id as u32; parent_contents.write_u32(right_pointer, last_pointer); } - self.pop_from_stack(); + self.stack.pop(); self.write_info.state = WriteState::BalanceStart; let _ = self.write_info.page_copy.take(); Ok(CursorResult::Ok(())) @@ -1132,7 +1079,7 @@ impl BTreeCursor { /* swap splitted page buffer with new root buffer so we don't have to update page idx */ { let (root_id, child_id, child) = { - let page_ref = self.top_from_stack(); + let page_ref = self.stack.top(); let child = page_ref.clone(); let mut page_rc = page_ref.borrow_mut(); let mut new_root_page = new_root_page_ref.borrow_mut(); @@ -1149,9 +1096,9 @@ impl BTreeCursor { let root = new_root_page_ref.clone(); self.root_page = root_id; - *self.current_page.borrow_mut() = -1; - self.push_to_stack(root.clone()); - self.push_to_stack(child.clone()); + self.stack.clear(); + self.stack.push(root.clone()); + self.stack.push(child.clone()); self.pager.put_loaded_page(root_id, root); self.pager.put_loaded_page(child_id, child); @@ -1526,6 +1473,84 @@ impl BTreeCursor { } } +impl PageStack { + fn push(&self, page: Rc>) { + debug!( + "pagestack::push(current={}, new_page_id={})", + self.current_page.borrow(), + page.borrow().id + ); + *self.current_page.borrow_mut() += 1; + let current = *self.current_page.borrow(); + assert!( + current < BTCURSOR_MAX_DEPTH as i32, + "corrupted database, stack is bigger than expected" + ); + self.stack.borrow_mut()[current as usize] = Some(page); + self.cell_indices.borrow_mut()[current as usize] = 0; + } + + fn pop(&self) { + let current = *self.current_page.borrow(); + debug!("pagestack::pop(current={})", current); + self.cell_indices.borrow_mut()[current as usize] = 0; + self.stack.borrow_mut()[current as usize] = None; + *self.current_page.borrow_mut() -= 1; + } + + fn top(&self) -> Rc> { + let current = *self.current_page.borrow(); + let page = self.stack.borrow()[current as usize] + .as_ref() + .unwrap() + .clone(); + debug!( + "pagestack::top(current={}, page_id={})", + current, + page.borrow().id + ); + page + } + + fn parent(&self) -> Rc> { + let current = *self.current_page.borrow(); + self.stack.borrow()[current as usize - 1] + .as_ref() + .unwrap() + .clone() + } + + /// Current page pointer being used + fn current(&self) -> usize { + *self.current_page.borrow() as usize + } + + /// Cell index of the current page + fn current_index(&self) -> usize { + let current = self.current(); + self.cell_indices.borrow()[current] + } + + /// Advance the current cell index of the current page to the next cell. + fn advance(&self) { + let current = self.current(); + self.cell_indices.borrow_mut()[current] += 1; + } + + fn set_cell_index(&self, idx: usize) { + let current = self.current(); + self.cell_indices.borrow_mut()[current] = idx; + } + + fn has_parent(&self) -> bool { + *self.current_page.borrow() > 0 + } + + fn clear(&self) { + *self.current_page.borrow_mut() = -1; + } +} + fn find_free_cell(page_ref: &PageContent, db_header: Ref, amount: usize) -> usize { // NOTE: freelist is in ascending order of keys and pc // unuse_space is reserved bytes at the end of page, therefore we must substract from maxpc @@ -1665,7 +1690,7 @@ impl Cursor for BTreeCursor { CursorResult::Ok(_) => {} CursorResult::IO => return Ok(CursorResult::IO), }; - let page_ref = self.top_from_stack(); + let page_ref = self.stack.top(); let page = RefCell::borrow(&page_ref); if page.is_locked() { // TODO(pere); request load