Merge 'core/btree: improve documentation' from Jussi Saurio
This PR should have no functional changes, just variable renaming and comments.
Using `///` comment format for better IDE support.

Reviewed-by: Pere Diaz Bou <penberg@iki.fi>
Closes #539
@@ -20,22 +20,37 @@ use super::sqlite3_ondisk::{
/*
These are offsets of fields in the header of a b-tree page.
*/
const BTREE_HEADER_OFFSET_TYPE: usize = 0; /* type of btree page -> u8 */
const BTREE_HEADER_OFFSET_FREEBLOCK: usize = 1; /* pointer to first freeblock -> u16 */
const BTREE_HEADER_OFFSET_CELL_COUNT: usize = 3; /* number of cells in the page -> u16 */
const BTREE_HEADER_OFFSET_CELL_CONTENT: usize = 5; /* pointer to first byte of cell allocated content from top -> u16 */
const BTREE_HEADER_OFFSET_FRAGMENTED: usize = 7; /* number of fragmented bytes -> u8 */
const BTREE_HEADER_OFFSET_RIGHTMOST: usize = 8; /* if internalnode, pointer right most pointer (saved separately from cells) -> u32 */

/*
** Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than
** this will be declared corrupt. This value is calculated based on a
** maximum database size of 2^31 pages a minimum fanout of 2 for a
** root-node and 3 for all other internal nodes.
**
** If a tree that appears to be taller than this is encountered, it is
** assumed that the database is corrupt.
*/
/// type of btree page -> u8
const PAGE_HEADER_OFFSET_PAGE_TYPE: usize = 0;
/// pointer to first freeblock -> u16
/// The second field of the b-tree page header is the offset of the first freeblock, or zero if there are no freeblocks on the page.
/// A freeblock is a structure used to identify unallocated space within a b-tree page.
/// Freeblocks are organized as a chain.
///
/// To be clear, freeblocks do not mean the regular unallocated free space to the left of the cell content area pointer, but instead
/// blocks of at least 4 bytes WITHIN the cell content area that are not in use due to e.g. deletions.
const PAGE_HEADER_OFFSET_FIRST_FREEBLOCK: usize = 1;
/// number of cells in the page -> u16
const PAGE_HEADER_OFFSET_CELL_COUNT: usize = 3;
/// pointer to first byte of cell allocated content from top -> u16
/// SQLite strives to place cells as far toward the end of the b-tree page as it can,
/// in order to leave space for future growth of the cell pointer array.
/// = the cell content area pointer moves leftward as cells are added to the page
const PAGE_HEADER_OFFSET_CELL_CONTENT_AREA: usize = 5;
/// number of fragmented bytes -> u8
/// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area.
const PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT: usize = 7;
/// if internalnode, pointer right most pointer (saved separately from cells) -> u32
const PAGE_HEADER_OFFSET_RIGHTMOST_PTR: usize = 8;

/// Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than
/// this will be declared corrupt. This value is calculated based on a
/// maximum database size of 2^31 pages a minimum fanout of 2 for a
/// root-node and 3 for all other internal nodes.
///
/// If a tree that appears to be taller than this is encountered, it is
/// assumed that the database is corrupt.
pub const BTCURSOR_MAX_DEPTH: usize = 20;

/// Evaluate a Result<CursorResult<T>>, if IO return IO.
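To make the offsets above concrete, here is a minimal standalone sketch (not the crate's actual API) of decoding a b-tree page header from a raw buffer; the helper name and the tuple return type are illustrative only:

```rust
/// Hypothetical helper: decode the b-tree page header fields at the offsets
/// documented above. `hdr` must point at the start of the page header
/// (offset 0 on most pages, offset 100 on page 1, which follows the 100-byte
/// database header).
fn read_btree_page_header(hdr: &[u8]) -> (u8, u16, u16, u16, u8, Option<u32>) {
    let u16_at = |o: usize| u16::from_be_bytes([hdr[o], hdr[o + 1]]);
    let page_type = hdr[0];            // PAGE_HEADER_OFFSET_PAGE_TYPE
    let first_freeblock = u16_at(1);   // PAGE_HEADER_OFFSET_FIRST_FREEBLOCK
    let cell_count = u16_at(3);        // PAGE_HEADER_OFFSET_CELL_COUNT
    let cell_content_area = u16_at(5); // PAGE_HEADER_OFFSET_CELL_CONTENT_AREA
    let fragmented_bytes = hdr[7];     // PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT
    // Only interior pages (types 0x02 and 0x05) carry the 4-byte rightmost
    // pointer at offset 8; leaf pages have an 8-byte header.
    let rightmost = if page_type == 0x02 || page_type == 0x05 {
        Some(u32::from_be_bytes([hdr[8], hdr[9], hdr[10], hdr[11]]))
    } else {
        None
    };
    (page_type, first_freeblock, cell_count, cell_content_area, fragmented_bytes, rightmost)
}
```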
@@ -57,6 +72,8 @@ macro_rules! return_if_locked {
}};
}

/// State machine of a write operation.
/// May involve balancing due to overflow.
#[derive(Debug)]
enum WriteState {
Start,
@@ -67,11 +84,16 @@ enum WriteState {
}

struct WriteInfo {
/// State of the write operation state machine.
state: WriteState,
/// Pages allocated during the write operation due to balancing.
new_pages: RefCell<Vec<PageRef>>,
/// Scratch space used during balancing.
scratch_cells: RefCell<Vec<&'static [u8]>>,
/// Bookkeeping of the rightmost pointer so the PAGE_HEADER_OFFSET_RIGHTMOST_PTR can be updated.
rightmost_pointer: RefCell<Option<u32>>,
page_copy: RefCell<Option<PageContent>>, // this holds the copy a of a page needed for buffer references
/// Copy of the current page needed for buffer references.
page_copy: RefCell<Option<PageContent>>,
}

pub struct BTreeCursor {
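The `WriteState` enum drives a resumable state machine: work is re-entered after pending IO completes rather than blocking. The sketch below only illustrates that pattern; every variant name other than `Start`, and the shape of `CursorResult`, are hypothetical placeholders, not the crate's real definitions.

```rust
// Illustrative only: variant names beyond `Start` are hypothetical.
enum WriteState {
    Start,
    BalanceStart,
    Finish,
}

enum CursorResult<T> {
    Ok(T),
    IO, // caller retries this step once the pending IO completes
}

fn insert_step(state: &mut WriteState) -> CursorResult<()> {
    loop {
        match state {
            WriteState::Start => {
                // ... try to insert the cell; if the target page is still
                // locked for IO, return CursorResult::IO and resume here ...
                *state = WriteState::BalanceStart;
            }
            WriteState::BalanceStart => {
                // ... split/balance overflowing pages ...
                *state = WriteState::Finish;
            }
            WriteState::Finish => return CursorResult::Ok(()),
        }
    }
}
```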
@@ -142,6 +164,8 @@ impl BTreeCursor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the table is empty.
|
||||
/// This is done by checking if the root page has no cells.
|
||||
fn is_empty_table(&mut self) -> Result<CursorResult<bool>> {
|
||||
let page = self.pager.read_page(self.root_page)?;
|
||||
return_if_locked!(page);
|
||||
@@ -150,16 +174,18 @@ impl BTreeCursor {
|
||||
Ok(CursorResult::Ok(cell_count == 0))
|
||||
}
|
||||
|
||||
/// Move the cursor to the previous record and return it.
|
||||
/// Used in backwards iteration.
|
||||
fn get_prev_record(&mut self) -> Result<CursorResult<(Option<u64>, Option<OwnedRecord>)>> {
|
||||
loop {
|
||||
let page = self.stack.top();
|
||||
let cell_idx = self.stack.current_index();
|
||||
let cell_idx = self.stack.current_cell_index();
|
||||
|
||||
// moved to current page begin
|
||||
// moved to beginning of current page
|
||||
// todo: find a better way to flag moved to end or begin of page
|
||||
if self.stack.curr_idx_out_of_begin() {
|
||||
if self.stack.current_cell_index_less_than_min() {
|
||||
loop {
|
||||
if self.stack.current_index() > 0 {
|
||||
if self.stack.current_cell_index() > 0 {
|
||||
self.stack.retreat();
|
||||
break;
|
||||
}
|
||||
@@ -198,8 +224,8 @@ impl BTreeCursor {
|
||||
let cell = contents.cell_get(
|
||||
cell_idx,
|
||||
self.pager.clone(),
|
||||
self.max_local(contents.page_type()),
|
||||
self.min_local(contents.page_type()),
|
||||
self.payload_overflow_threshold_max(contents.page_type()),
|
||||
self.payload_overflow_threshold_min(contents.page_type()),
|
||||
self.usable_space(),
|
||||
)?;
|
||||
|
||||
@@ -228,13 +254,15 @@ impl BTreeCursor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Move the cursor to the next record and return it.
|
||||
/// Used in forwards iteration, which is the default.
|
||||
fn get_next_record(
|
||||
&mut self,
|
||||
predicate: Option<(SeekKey<'_>, SeekOp)>,
|
||||
) -> Result<CursorResult<(Option<u64>, Option<OwnedRecord>)>> {
|
||||
loop {
|
||||
let mem_page_rc = self.stack.top();
|
||||
let cell_idx = self.stack.current_index() as usize;
|
||||
let cell_idx = self.stack.current_cell_index() as usize;
|
||||
|
||||
debug!("current id={} cell={}", mem_page_rc.get().id, cell_idx);
|
||||
return_if_locked!(mem_page_rc);
|
||||
@@ -286,8 +314,8 @@ impl BTreeCursor {
|
||||
let cell = contents.cell_get(
|
||||
cell_idx,
|
||||
self.pager.clone(),
|
||||
self.max_local(contents.page_type()),
|
||||
self.min_local(contents.page_type()),
|
||||
self.payload_overflow_threshold_max(contents.page_type()),
|
||||
self.payload_overflow_threshold_min(contents.page_type()),
|
||||
self.usable_space(),
|
||||
)?;
|
||||
match &cell {
|
||||
@@ -386,6 +414,9 @@ impl BTreeCursor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Move the cursor to the record that matches the seek key and seek operation.
|
||||
/// This may be used to seek to a specific record in a point query (e.g. SELECT * FROM table WHERE col = 10)
|
||||
/// or e.g. find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10).
|
||||
fn seek(
|
||||
&mut self,
|
||||
key: SeekKey<'_>,
|
||||
@@ -403,8 +434,8 @@ impl BTreeCursor {
|
||||
let cell = contents.cell_get(
|
||||
cell_idx,
|
||||
self.pager.clone(),
|
||||
self.max_local(contents.page_type()),
|
||||
self.min_local(contents.page_type()),
|
||||
self.payload_overflow_threshold_max(contents.page_type()),
|
||||
self.payload_overflow_threshold_min(contents.page_type()),
|
||||
self.usable_space(),
|
||||
)?;
|
||||
match &cell {
|
||||
@@ -476,12 +507,14 @@ impl BTreeCursor {
|
||||
Ok(CursorResult::Ok((None, None)))
|
||||
}
|
||||
|
||||
/// Move the cursor to the root page of the btree.
|
||||
fn move_to_root(&mut self) {
|
||||
let mem_page = self.pager.read_page(self.root_page).unwrap();
|
||||
self.stack.clear();
|
||||
self.stack.push(mem_page);
|
||||
}
|
||||
|
||||
/// Move the cursor to the rightmost record in the btree.
|
||||
fn move_to_rightmost(&mut self) -> Result<CursorResult<()>> {
|
||||
self.move_to_root();
|
||||
|
||||
@@ -553,8 +586,8 @@ impl BTreeCursor {
|
||||
match &contents.cell_get(
|
||||
cell_idx,
|
||||
self.pager.clone(),
|
||||
self.max_local(contents.page_type()),
|
||||
self.min_local(contents.page_type()),
|
||||
self.payload_overflow_threshold_max(contents.page_type()),
|
||||
self.payload_overflow_threshold_min(contents.page_type()),
|
||||
self.usable_space(),
|
||||
)? {
|
||||
BTreeCell::TableInteriorCell(TableInteriorCell {
|
||||
@@ -634,6 +667,8 @@ impl BTreeCursor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert a record into the btree.
|
||||
/// If the insert operation overflows the page, it will be split and the btree will be balanced.
|
||||
fn insert_into_page(
|
||||
&mut self,
|
||||
key: &OwnedValue,
|
||||
@@ -700,7 +735,11 @@ impl BTreeCursor {
|
||||
}
|
||||
}
|
||||
|
||||
/* insert to position and shift other pointers */
/// Insert a record into a cell.
/// If the cell overflows, an overflow cell is created.
/// insert_into_cell() is called from insert_into_page(),
/// and the overflow cell count is used to determine if the page overflows,
/// i.e. whether we need to balance the btree after the insert.
fn insert_into_cell(&self, page: &mut PageContent, payload: &[u8], cell_idx: usize) {
let free = self.compute_free_space(page, RefCell::borrow(&self.database_header));
let enough_space = payload.len() + 2 <= free as usize;
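"Shift other pointers" here means opening a 2-byte gap in the cell pointer array at `cell_idx` before writing the new cell's offset into it. A standalone sketch over a plain byte buffer (not the `PageContent` API) could look like this:

```rust
/// Illustrative sketch: insert `cell_offset` into the cell pointer array at
/// position `cell_idx`, shifting later pointers to the right.
/// `ptr_array_start` is the byte offset where the pointer array begins
/// (8 or 12 bytes after the page header start) and `cell_count` is the number
/// of pointers currently in the array.
fn insert_cell_pointer(
    buf: &mut [u8],
    ptr_array_start: usize,
    cell_count: usize,
    cell_idx: usize,
    cell_offset: u16,
) {
    let insert_at = ptr_array_start + cell_idx * 2;
    let array_end = ptr_array_start + cell_count * 2;
    // Open a 2-byte gap: move everything from `insert_at` up by two bytes.
    buf.copy_within(insert_at..array_end, insert_at + 2);
    // Write the new pointer (big-endian, like all multibyte page fields).
    buf[insert_at..insert_at + 2].copy_from_slice(&cell_offset.to_be_bytes());
}
```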
@@ -734,41 +773,54 @@ impl BTreeCursor {
|
||||
page.write_u16(pointer_area_pc_by_idx - page.offset, pc);
|
||||
|
||||
// update first byte of content area
|
||||
page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, pc);
|
||||
page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, pc);
|
||||
|
||||
// update cell count
|
||||
let new_n_cells = (page.cell_count() + 1) as u16;
|
||||
page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, new_n_cells);
|
||||
page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, new_n_cells);
|
||||
}
|
||||
|
||||
/// Free the range of bytes that a cell occupies.
/// This function also updates the freeblock list in the page.
/// Freeblocks are used to keep track of free space in the page,
/// and are organized as a linked list.
fn free_cell_range(&self, page: &mut PageContent, offset: u16, len: u16) {
// if the freeblock list is empty, we set this block as the first freeblock in the page header.
if page.first_freeblock() == 0 {
// insert into empty list
page.write_u16(offset as usize, 0);
page.write_u16(offset as usize + 2, len);
page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, offset);
page.write_u16(offset as usize, 0); // next freeblock = null
page.write_u16(offset as usize + 2, len); // size of this freeblock
page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, offset); // first freeblock in page = this block
return;
}
let first_block = page.first_freeblock();

// if the freeblock list is not empty, and the offset is less than the first freeblock,
// we insert this block at the head of the list
if offset < first_block {
// insert into head of list
page.write_u16(offset as usize, first_block);
page.write_u16(offset as usize + 2, len);
page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, offset);
page.write_u16(offset as usize, first_block); // next freeblock = previous first freeblock
page.write_u16(offset as usize + 2, len); // size of this freeblock
page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, offset); // first freeblock in page = this block
return;
}

// if we clear space that is at the start of the cell content area,
// we need to update the cell content area pointer forward to account for the removed space
// FIXME: is offset ever < cell_content_area? cell content area grows leftwards and the pointer
// is to the start of the last allocated cell. should we assert!(offset >= page.cell_content_area())
// and change this to if offset == page.cell_content_area()?
if offset <= page.cell_content_area() {
// extend boundary of content area
page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, page.first_freeblock());
page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, offset + len);
// FIXME: remove the line directly below this, it does not change anything.
page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, page.first_freeblock());
page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, offset + len);
return;
}

// if the freeblock list is not empty, and the offset is greater than the first freeblock,
// then we need to do some more calculation to figure out where to insert the freeblock
// in the freeblock linked list.
let maxpc = {
let db_header = self.database_header.borrow();
let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize;
let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
usable_space as u16
};

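Because each freeblock begins with a 2-byte pointer to the next freeblock followed by a 2-byte size, the whole chain can be walked to total the reusable bytes inside the cell content area. A minimal sketch over a raw page buffer, assuming big-endian reads (the function name is illustrative, not part of the crate):

```rust
/// Illustrative sketch: sum the sizes of all freeblocks on a page by following
/// the chain that starts at the first-freeblock field of the page header.
fn total_freeblock_bytes(buf: &[u8], page_header_offset: usize) -> usize {
    let u16_at = |o: usize| u16::from_be_bytes([buf[o], buf[o + 1]]) as usize;
    // Offset 1 in the page header: first freeblock, or 0 if there are none.
    let mut pc = u16_at(page_header_offset + 1);
    let mut total = 0;
    while pc != 0 {
        let next = u16_at(pc);     // bytes 0..2 of a freeblock: next freeblock
        let size = u16_at(pc + 2); // bytes 2..4: size of this freeblock (incl. 4-byte header)
        total += size;
        pc = next;
        // a real implementation would also guard against cycles on corrupt pages
    }
    total
}
```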
@@ -799,17 +851,23 @@ impl BTreeCursor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop a cell from a page.
|
||||
/// This is done by freeing the range of bytes that the cell occupies.
|
||||
fn drop_cell(&self, page: &mut PageContent, cell_idx: usize) {
|
||||
let (cell_start, cell_len) = page.cell_get_raw_region(
|
||||
cell_idx,
|
||||
self.max_local(page.page_type()),
|
||||
self.min_local(page.page_type()),
|
||||
self.payload_overflow_threshold_max(page.page_type()),
|
||||
self.payload_overflow_threshold_min(page.page_type()),
|
||||
self.usable_space(),
|
||||
);
|
||||
self.free_cell_range(page, cell_start as u16, cell_len as u16);
|
||||
page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1);
|
||||
page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1);
|
||||
}
|
||||
|
||||
/// Balance a leaf page.
|
||||
/// Balancing is done when a page overflows.
|
||||
/// see e.g. https://en.wikipedia.org/wiki/B-tree
|
||||
///
|
||||
/// This is a naive algorithm that doesn't try to distribute cells evenly by content.
|
||||
/// It will try to split the page in half by keys not by content.
|
||||
/// Sqlite tries to have a page at least 40% full.
|
||||
@@ -852,8 +910,8 @@ impl BTreeCursor {
|
||||
for cell_idx in 0..page_copy.cell_count() {
|
||||
let (start, len) = page_copy.cell_get_raw_region(
|
||||
cell_idx,
|
||||
self.max_local(page_copy.page_type()),
|
||||
self.min_local(page_copy.page_type()),
|
||||
self.payload_overflow_threshold_max(page_copy.page_type()),
|
||||
self.payload_overflow_threshold_min(page_copy.page_type()),
|
||||
self.usable_space(),
|
||||
);
|
||||
let buf = page_copy.as_ptr();
|
||||
@@ -930,14 +988,14 @@ impl BTreeCursor {
|
||||
assert_eq!(parent_contents.overflow_cells.len(), 0);
|
||||
|
||||
// Right page pointer is u32 in right most pointer, and in cell is u32 too, so we can use a *u32 to hold where we want to change this value
|
||||
let mut right_pointer = BTREE_HEADER_OFFSET_RIGHTMOST;
|
||||
let mut right_pointer = PAGE_HEADER_OFFSET_RIGHTMOST_PTR;
|
||||
for cell_idx in 0..parent_contents.cell_count() {
|
||||
let cell = parent_contents
|
||||
.cell_get(
|
||||
cell_idx,
|
||||
self.pager.clone(),
|
||||
self.max_local(page_type.clone()),
|
||||
self.min_local(page_type.clone()),
|
||||
self.payload_overflow_threshold_max(page_type.clone()),
|
||||
self.payload_overflow_threshold_min(page_type.clone()),
|
||||
self.usable_space(),
|
||||
)
|
||||
.unwrap();
|
||||
@@ -950,8 +1008,8 @@ impl BTreeCursor {
|
||||
if found {
|
||||
let (start, _len) = parent_contents.cell_get_raw_region(
|
||||
cell_idx,
|
||||
self.max_local(page_type.clone()),
|
||||
self.min_local(page_type.clone()),
|
||||
self.payload_overflow_threshold_max(page_type.clone()),
|
||||
self.payload_overflow_threshold_min(page_type.clone()),
|
||||
self.usable_space(),
|
||||
);
|
||||
right_pointer = start;
|
||||
@@ -967,17 +1025,20 @@ impl BTreeCursor {
|
||||
assert!(page.is_dirty());
|
||||
let contents = page.get().contents.as_mut().unwrap();
|
||||
|
||||
contents.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, 0);
|
||||
contents.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, 0);
|
||||
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
|
||||
contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
|
||||
|
||||
let db_header = RefCell::borrow(&self.database_header);
|
||||
let cell_content_area_start =
|
||||
db_header.page_size - db_header.unused_space as u16;
|
||||
contents.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, cell_content_area_start);
|
||||
db_header.page_size - db_header.reserved_space as u16;
|
||||
contents.write_u16(
|
||||
PAGE_HEADER_OFFSET_CELL_CONTENT_AREA,
|
||||
cell_content_area_start,
|
||||
);
|
||||
|
||||
contents.write_u8(BTREE_HEADER_OFFSET_FRAGMENTED, 0);
|
||||
contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0);
|
||||
if !contents.is_leaf() {
|
||||
contents.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, 0);
|
||||
contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1035,8 +1096,8 @@ impl BTreeCursor {
|
||||
.cell_get(
|
||||
contents.cell_count() - 1,
|
||||
self.pager.clone(),
|
||||
self.max_local(contents.page_type()),
|
||||
self.min_local(contents.page_type()),
|
||||
self.payload_overflow_threshold_max(contents.page_type()),
|
||||
self.payload_overflow_threshold_min(contents.page_type()),
|
||||
self.usable_space(),
|
||||
)
|
||||
.unwrap();
|
||||
@@ -1045,13 +1106,13 @@ impl BTreeCursor {
|
||||
_ => unreachable!(),
|
||||
};
|
||||
self.drop_cell(contents, contents.cell_count() - 1);
|
||||
contents.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, last_cell_pointer);
|
||||
contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, last_cell_pointer);
|
||||
}
|
||||
// last page right most pointer points to previous right most pointer before splitting
|
||||
let last_page = new_pages.last().unwrap();
|
||||
let last_page_contents = last_page.get().contents.as_mut().unwrap();
|
||||
last_page_contents.write_u32(
|
||||
BTREE_HEADER_OFFSET_RIGHTMOST,
|
||||
PAGE_HEADER_OFFSET_RIGHTMOST_PTR,
|
||||
self.write_info.rightmost_pointer.borrow().unwrap(),
|
||||
);
|
||||
}
|
||||
@@ -1069,8 +1130,8 @@ impl BTreeCursor {
|
||||
&contents.page_type(),
|
||||
0,
|
||||
self.pager.clone(),
|
||||
self.max_local(contents.page_type()),
|
||||
self.min_local(contents.page_type()),
|
||||
self.payload_overflow_threshold_max(contents.page_type()),
|
||||
self.payload_overflow_threshold_min(contents.page_type()),
|
||||
self.usable_space(),
|
||||
)
|
||||
.unwrap();
|
||||
@@ -1119,6 +1180,9 @@ impl BTreeCursor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Balance the root page.
|
||||
/// This is done when the root page overflows, and we need to create a new root page.
|
||||
/// See e.g. https://en.wikipedia.org/wiki/B-tree
|
||||
fn balance_root(&mut self) {
|
||||
/* todo: balance deeper, create child and copy contents of root there. Then split root */
|
||||
/* if we are in root page then we just need to create a new root and push key there */
|
||||
@@ -1145,8 +1209,8 @@ impl BTreeCursor {
|
||||
}
|
||||
// point new root right child to previous root
|
||||
new_root_page_contents
|
||||
.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, new_root_page_id as u32);
|
||||
new_root_page_contents.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, 0);
|
||||
.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, new_root_page_id as u32);
|
||||
new_root_page_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
|
||||
}
|
||||
|
||||
/* swap splitted page buffer with new root buffer so we don't have to update page idx */
|
||||
@@ -1195,12 +1259,16 @@ impl BTreeCursor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Allocate a new page to the btree via the pager.
|
||||
/// This marks the page as dirty and writes the page header.
|
||||
fn allocate_page(&self, page_type: PageType, offset: usize) -> PageRef {
|
||||
let page = self.pager.allocate_page().unwrap();
|
||||
btree_init_page(&page, page_type, &self.database_header.borrow(), offset);
|
||||
page
|
||||
}
|
||||
|
||||
/// Allocate a new overflow page.
|
||||
/// This is done when a cell overflows and new space is needed.
|
||||
fn allocate_overflow_page(&self) -> PageRef {
|
||||
let page = self.pager.allocate_page().unwrap();
|
||||
|
||||
@@ -1212,9 +1280,7 @@ impl BTreeCursor {
|
||||
page
|
||||
}
|
||||
|
||||
/*
|
||||
Allocate space for a cell on a page.
|
||||
*/
|
||||
/// Allocate space for a cell on a page.
|
||||
fn allocate_cell_space(&self, page_ref: &PageContent, amount: u16) -> u16 {
|
||||
let amount = amount as usize;
|
||||
|
||||
@@ -1236,24 +1302,25 @@ impl BTreeCursor {
|
||||
if gap + 2 + amount > top {
|
||||
// defragment
|
||||
self.defragment_page(page_ref, RefCell::borrow(&self.database_header));
|
||||
top = page_ref.read_u16(BTREE_HEADER_OFFSET_CELL_CONTENT) as usize;
|
||||
top = page_ref.read_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA) as usize;
|
||||
}
|
||||
|
||||
let db_header = RefCell::borrow(&self.database_header);
|
||||
top -= amount;
|
||||
|
||||
page_ref.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, top as u16);
|
||||
page_ref.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, top as u16);
|
||||
|
||||
let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize;
|
||||
let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
|
||||
assert!(top + amount <= usable_space);
|
||||
top as u16
|
||||
}
|
||||
|
||||
/// Defragment a page. This means packing all the cells to the end of the page.
|
||||
fn defragment_page(&self, page: &PageContent, db_header: Ref<DatabaseHeader>) {
|
||||
log::debug!("defragment_page");
|
||||
let cloned_page = page.clone();
|
||||
// TODO(pere): usable space should include offset probably
|
||||
let usable_space = (db_header.page_size - db_header.unused_space as u16) as u64;
|
||||
let usable_space = (db_header.page_size - db_header.reserved_space as u16) as u64;
|
||||
let mut cbrk = usable_space;
|
||||
|
||||
// TODO: implement fast algorithm
|
||||
@@ -1330,24 +1397,33 @@ impl BTreeCursor {
|
||||
let write_buf = page.as_ptr();
|
||||
|
||||
// set new first byte of cell content
|
||||
page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, cbrk as u16);
|
||||
page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cbrk as u16);
|
||||
// set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start
|
||||
page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, 0);
|
||||
page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
|
||||
// set unused space to 0
|
||||
let first_cell = cloned_page.cell_content_area() as u64;
|
||||
assert!(first_cell <= cbrk);
|
||||
write_buf[first_cell as usize..cbrk as usize].fill(0);
|
||||
}
|
||||
|
||||
// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte
|
||||
// and end of cell pointer area.
|
||||
/// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte
|
||||
/// and end of cell pointer area.
|
||||
#[allow(unused_assignments)]
|
||||
fn compute_free_space(&self, page: &PageContent, db_header: Ref<DatabaseHeader>) -> u16 {
|
||||
// TODO(pere): maybe free space is not calculated correctly with offset
|
||||
let buf = page.as_ptr();
|
||||
|
||||
let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize;
|
||||
let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
|
||||
let mut first_byte_in_cell_content = page.cell_content_area();
|
||||
// A zero value for the cell content area pointer is interpreted as 65536.
|
||||
// See https://www.sqlite.org/fileformat.html
|
||||
// The max page size for a sqlite database is 64kiB i.e. 65536 bytes.
|
||||
// 65536 is u16::MAX + 1, and since cell content grows from right to left, this means
|
||||
// the cell content area pointer is at the end of the page,
|
||||
// i.e.
|
||||
// 1. the page size is 64kiB
|
||||
// 2. there are no cells on the page
|
||||
// 3. there is no reserved space at the end of the page
|
||||
if first_byte_in_cell_content == 0 {
|
||||
first_byte_in_cell_content = u16::MAX;
|
||||
}
|
||||
@@ -1360,12 +1436,16 @@ impl BTreeCursor {
|
||||
let child_pointer_size = if page.is_leaf() { 0 } else { 4 };
|
||||
let first_cell = (page.offset + 8 + child_pointer_size + (2 * ncell)) as u16;
|
||||
|
||||
// The amount of free space is the sum of:
|
||||
// 1. 0..first_byte_in_cell_content (everything to the left of the cell content area pointer is unused free space)
|
||||
// 2. fragmented_free_bytes.
|
||||
let mut nfree = fragmented_free_bytes as usize + first_byte_in_cell_content as usize;
|
||||
|
||||
let mut pc = free_block_pointer as usize;
|
||||
if pc > 0 {
|
||||
if pc < first_byte_in_cell_content as usize {
|
||||
// corrupt
|
||||
// Freeblocks exist in the cell content area e.g. after deletions
|
||||
// They should never exist in the unused area of the page.
|
||||
todo!("corrupted page");
|
||||
}
|
||||
|
||||
@@ -1399,6 +1479,8 @@ impl BTreeCursor {
|
||||
nfree as u16
|
||||
}
|
||||
|
||||
/// Fill in the cell payload with the record.
|
||||
/// If the record is too large to fit in the cell, it will spill onto overflow pages.
|
||||
fn fill_cell_payload(
|
||||
&self,
|
||||
page_type: PageType,
|
||||
@@ -1423,13 +1505,13 @@ impl BTreeCursor {
|
||||
write_varint_to_vec(record_buf.len() as u64, cell_payload);
|
||||
}
|
||||
|
||||
let max_local = self.max_local(page_type.clone());
|
||||
let payload_overflow_threshold_max = self.payload_overflow_threshold_max(page_type.clone());
|
||||
log::debug!(
|
||||
"fill_cell_payload(record_size={}, max_local={})",
|
||||
"fill_cell_payload(record_size={}, payload_overflow_threshold_max={})",
|
||||
record_buf.len(),
|
||||
max_local
|
||||
payload_overflow_threshold_max
|
||||
);
|
||||
if record_buf.len() <= max_local {
|
||||
if record_buf.len() <= payload_overflow_threshold_max {
|
||||
// enough allowed space to fit inside a btree page
|
||||
cell_payload.extend_from_slice(record_buf.as_slice());
|
||||
cell_payload.resize(cell_payload.len() + 4, 0);
|
||||
@@ -1437,11 +1519,13 @@ impl BTreeCursor {
|
||||
}
|
||||
log::debug!("fill_cell_payload(overflow)");
|
||||
|
||||
let min_local = self.min_local(page_type);
|
||||
let mut space_left = min_local + (record_buf.len() - min_local) % (self.usable_space() - 4);
|
||||
let payload_overflow_threshold_min = self.payload_overflow_threshold_min(page_type);
|
||||
// see e.g. https://github.com/sqlite/sqlite/blob/9591d3fe93936533c8c3b0dc4d025ac999539e11/src/dbstat.c#L371
|
||||
let mut space_left = payload_overflow_threshold_min
|
||||
+ (record_buf.len() - payload_overflow_threshold_min) % (self.usable_space() - 4);
|
||||
|
||||
if space_left > max_local {
|
||||
space_left = min_local;
|
||||
if space_left > payload_overflow_threshold_max {
|
||||
space_left = payload_overflow_threshold_min;
|
||||
}
|
||||
|
||||
// cell_size must be equal to first value of space_left as this will be the bytes copied to non-overflow page.
|
||||
@@ -1487,31 +1571,54 @@ impl BTreeCursor {
|
||||
assert_eq!(cell_size, cell_payload.len());
|
||||
}
|
||||
|
||||
fn max_local(&self, page_type: PageType) -> usize {
let usable_space = self.usable_space();
/// Returns the maximum payload size (X) that can be stored directly on a b-tree page without spilling to overflow pages.
///
/// For table leaf pages: X = usable_size - 35
/// For index pages: X = ((usable_size - 12) * 64/255) - 23
///
/// The usable size is the total page size less the reserved space at the end of each page.
/// These thresholds are designed to:
/// - Give a minimum fanout of 4 for index b-trees
/// - Ensure enough payload is on the b-tree page that the record header can usually be accessed
/// without consulting an overflow page
fn payload_overflow_threshold_max(&self, page_type: PageType) -> usize {
let usable_size = self.usable_space();
match page_type {
PageType::IndexInterior | PageType::TableInterior => {
(usable_space - 12) * 64 / 255 - 23
PageType::IndexInterior | PageType::IndexLeaf => {
((usable_size - 12) * 64 / 255) - 23 // Index page formula
}
PageType::TableInterior | PageType::TableLeaf => {
usable_size - 35 // Table leaf page formula
}
PageType::IndexLeaf | PageType::TableLeaf => usable_space - 35,
}
}

fn min_local(&self, page_type: PageType) -> usize {
let usable_space = self.usable_space();
match page_type {
PageType::IndexInterior | PageType::TableInterior => {
(usable_space - 12) * 32 / 255 - 23
}
PageType::IndexLeaf | PageType::TableLeaf => (usable_space - 12) * 32 / 255 - 23,
}
/// Returns the minimum payload size (M) that must be stored on the b-tree page before spilling to overflow pages is allowed.
///
/// For all page types: M = ((usable_size - 12) * 32/255) - 23
///
/// When payload size P exceeds max_local():
/// - If K = M + ((P-M) % (usable_size-4)) <= max_local(): store K bytes on page
/// - Otherwise: store M bytes on page
///
/// The remaining bytes are stored on overflow pages in both cases.
fn payload_overflow_threshold_min(&self, _page_type: PageType) -> usize {
let usable_size = self.usable_space();
// Same formula for all page types
((usable_size - 12) * 32 / 255) - 23
}

/// The "usable size" of a database page is the page size specified by the 2-byte integer at offset 16
/// in the header, minus the "reserved" space size recorded in the 1-byte integer at offset 20 in the header.
/// The usable size of a page might be an odd number. However, the usable size is not allowed to be less than 480.
/// In other words, if the page size is 512, then the reserved space size cannot exceed 32.
fn usable_space(&self) -> usize {
let db_header = RefCell::borrow(&self.database_header);
(db_header.page_size - db_header.unused_space as u16) as usize
(db_header.page_size - db_header.reserved_space as u16) as usize
}

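For intuition, the X/M/K arithmetic documented above can be reproduced in a few lines. The numbers below assume a 4096-byte page with no reserved space; the helper names are local to this example:

```rust
fn threshold_max_table_leaf(usable_size: usize) -> usize {
    usable_size - 35
}
fn threshold_max_index(usable_size: usize) -> usize {
    ((usable_size - 12) * 64 / 255) - 23
}
fn threshold_min(usable_size: usize) -> usize {
    ((usable_size - 12) * 32 / 255) - 23
}

fn main() {
    let usable_size = 4096; // page_size 4096, reserved_space 0
    let x_table = threshold_max_table_leaf(usable_size); // 4061
    let x_index = threshold_max_index(usable_size);      // 1002
    let m = threshold_min(usable_size);                  // 489
    assert_eq!((x_table, x_index, m), (4061, 1002, 489));

    // A 5000-byte table-leaf payload exceeds X, so it spills to overflow pages.
    let p = 5000usize;
    let k = m + (p - m) % (usable_size - 4); // 489 + 4511 % 4092 = 908
    assert_eq!(k, 908);
    // K <= X, so 908 bytes stay on the b-tree page; the remaining 4092 go to overflow.
    assert!(k <= x_table);
}
```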
/// Find the index of the cell in the page that contains the given rowid.
|
||||
/// BTree tables only.
|
||||
fn find_cell(&self, page: &PageContent, int_key: u64) -> usize {
|
||||
let mut cell_idx = 0;
|
||||
let cell_count = page.cell_count();
|
||||
@@ -1520,8 +1627,8 @@ impl BTreeCursor {
|
||||
.cell_get(
|
||||
cell_idx,
|
||||
self.pager.clone(),
|
||||
self.max_local(page.page_type()),
|
||||
self.min_local(page.page_type()),
|
||||
self.payload_overflow_threshold_max(page.page_type()),
|
||||
self.payload_overflow_threshold_min(page.page_type()),
|
||||
self.usable_space(),
|
||||
)
|
||||
.unwrap()
|
||||
@@ -1545,6 +1652,8 @@ impl BTreeCursor {
|
||||
}
|
||||
|
||||
impl PageStack {
|
||||
/// Push a new page onto the stack.
|
||||
/// This effectively means traversing to a child page.
|
||||
fn push(&self, page: PageRef) {
|
||||
debug!(
|
||||
"pagestack::push(current={}, new_page_id={})",
|
||||
@@ -1561,6 +1670,8 @@ impl PageStack {
|
||||
self.cell_indices.borrow_mut()[current as usize] = 0;
|
||||
}
|
||||
|
||||
/// Pop a page off the stack.
|
||||
/// This effectively means traversing back up to a parent page.
|
||||
fn pop(&self) {
|
||||
let current = *self.current_page.borrow();
|
||||
debug!("pagestack::pop(current={})", current);
|
||||
@@ -1569,6 +1680,8 @@ impl PageStack {
|
||||
*self.current_page.borrow_mut() -= 1;
|
||||
}
|
||||
|
||||
/// Get the top page on the stack.
|
||||
/// This is the page that is currently being traversed.
|
||||
fn top(&self) -> PageRef {
|
||||
let current = *self.current_page.borrow();
|
||||
let page = self.stack.borrow()[current as usize]
|
||||
@@ -1583,6 +1696,7 @@ impl PageStack {
|
||||
page
|
||||
}
|
||||
|
||||
/// Get the parent page of the current page.
|
||||
fn parent(&self) -> PageRef {
|
||||
let current = *self.current_page.borrow();
|
||||
self.stack.borrow()[current as usize - 1]
|
||||
@@ -1597,13 +1711,15 @@ impl PageStack {
|
||||
}
|
||||
|
||||
/// Cell index of the current page
|
||||
fn current_index(&self) -> i32 {
|
||||
fn current_cell_index(&self) -> i32 {
|
||||
let current = self.current();
|
||||
self.cell_indices.borrow()[current]
|
||||
}
|
||||
|
||||
fn curr_idx_out_of_begin(&self) -> bool {
|
||||
let cell_idx = self.current_index();
|
||||
/// Check if the current cell index is less than 0.
|
||||
/// This means we have been iterating backwards and have reached the start of the page.
|
||||
fn current_cell_index_less_than_min(&self) -> bool {
|
||||
let cell_idx = self.current_cell_index();
|
||||
cell_idx < 0
|
||||
}
|
||||
|
||||
@@ -1639,7 +1755,7 @@ fn find_free_cell(page_ref: &PageContent, db_header: Ref<DatabaseHeader>, amount
|
||||
|
||||
let buf = page_ref.as_ptr();
|
||||
|
||||
let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize;
|
||||
let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
|
||||
let maxpc = usable_space - amount;
|
||||
let mut found = false;
|
||||
while pc <= maxpc {
|
||||
@@ -1785,8 +1901,8 @@ impl Cursor for BTreeCursor {
|
||||
let equals = match &contents.cell_get(
|
||||
cell_idx,
|
||||
self.pager.clone(),
|
||||
self.max_local(contents.page_type()),
|
||||
self.min_local(contents.page_type()),
|
||||
self.payload_overflow_threshold_max(contents.page_type()),
|
||||
self.payload_overflow_threshold_min(contents.page_type()),
|
||||
self.usable_space(),
|
||||
)? {
|
||||
BTreeCell::TableLeafCell(l) => l._rowid == int_key,
|
||||
@@ -1823,15 +1939,18 @@ pub fn btree_init_page(
|
||||
let contents = contents.contents.as_mut().unwrap();
|
||||
contents.offset = offset;
|
||||
let id = page_type as u8;
|
||||
contents.write_u8(BTREE_HEADER_OFFSET_TYPE, id);
|
||||
contents.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, 0);
|
||||
contents.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, 0);
|
||||
contents.write_u8(PAGE_HEADER_OFFSET_PAGE_TYPE, id);
|
||||
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
|
||||
contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
|
||||
|
||||
let cell_content_area_start = db_header.page_size - db_header.unused_space as u16;
|
||||
contents.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, cell_content_area_start);
|
||||
let cell_content_area_start = db_header.page_size - db_header.reserved_space as u16;
|
||||
contents.write_u16(
|
||||
PAGE_HEADER_OFFSET_CELL_CONTENT_AREA,
|
||||
cell_content_area_start,
|
||||
);
|
||||
|
||||
contents.write_u8(BTREE_HEADER_OFFSET_FRAGMENTED, 0);
|
||||
contents.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, 0);
|
||||
contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0);
|
||||
contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, 0);
|
||||
}
|
||||
|
||||
fn to_static_buf(buf: &[u8]) -> &'static [u8] {
|
||||
|
||||
@@ -482,7 +482,7 @@ impl Pager {
|
||||
|
||||
pub fn usable_size(&self) -> usize {
|
||||
let db_header = self.db_header.borrow();
|
||||
(db_header.page_size - db_header.unused_space as u16) as usize
|
||||
(db_header.page_size - db_header.reserved_space as u16) as usize
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -64,30 +64,84 @@ const DEFAULT_CACHE_SIZE: i32 = -2000;
|
||||
// Minimum number of pages that cache can hold.
|
||||
pub const MIN_PAGE_CACHE_SIZE: usize = 10;
|
||||
|
||||
/// The database header.
|
||||
/// The first 100 bytes of the database file comprise the database file header.
|
||||
/// The database file header is divided into fields as shown by the table below.
|
||||
/// All multibyte fields in the database file header are stored with the most significant byte first (big-endian).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DatabaseHeader {
|
||||
/// The header string: "SQLite format 3\0"
|
||||
magic: [u8; 16],
|
||||
|
||||
/// The database page size in bytes. Must be a power of two between 512 and 32768 inclusive,
|
||||
/// or the value 1 representing a page size of 65536.
|
||||
pub page_size: u16,
|
||||
|
||||
/// File format write version. 1 for legacy; 2 for WAL.
|
||||
write_version: u8,
|
||||
|
||||
/// File format read version. 1 for legacy; 2 for WAL.
|
||||
read_version: u8,
|
||||
pub unused_space: u8,
|
||||
|
||||
/// Bytes of unused "reserved" space at the end of each page. Usually 0.
|
||||
/// SQLite has the ability to set aside a small number of extra bytes at the end of every page for use by extensions.
|
||||
/// These extra bytes are used, for example, by the SQLite Encryption Extension to store a nonce and/or
|
||||
/// cryptographic checksum associated with each page.
|
||||
pub reserved_space: u8,
|
||||
|
||||
/// Maximum embedded payload fraction. Must be 64.
|
||||
max_embed_frac: u8,
|
||||
|
||||
/// Minimum embedded payload fraction. Must be 32.
|
||||
min_embed_frac: u8,
|
||||
|
||||
/// Leaf payload fraction. Must be 32.
|
||||
min_leaf_frac: u8,
|
||||
|
||||
/// File change counter, incremented when database is modified.
|
||||
change_counter: u32,
|
||||
|
||||
/// Size of the database file in pages. The "in-header database size".
|
||||
pub database_size: u32,
|
||||
|
||||
/// Page number of the first freelist trunk page.
|
||||
freelist_trunk_page: u32,
|
||||
|
||||
/// Total number of freelist pages.
|
||||
freelist_pages: u32,
|
||||
|
||||
/// The schema cookie. Incremented when the database schema changes.
|
||||
schema_cookie: u32,
|
||||
|
||||
/// The schema format number. Supported formats are 1, 2, 3, and 4.
|
||||
schema_format: u32,
|
||||
pub default_cache_size: i32,
|
||||
vacuum: u32,
|
||||
|
||||
/// Default page cache size.
|
||||
pub default_page_cache_size: i32,
|
||||
|
||||
/// The page number of the largest root b-tree page when in auto-vacuum or
|
||||
/// incremental-vacuum modes, or zero otherwise.
|
||||
vacuum_mode_largest_root_page: u32,
|
||||
|
||||
/// The database text encoding. 1=UTF-8, 2=UTF-16le, 3=UTF-16be.
|
||||
text_encoding: u32,
|
||||
|
||||
/// The "user version" as read and set by the user_version pragma.
|
||||
user_version: u32,
|
||||
incremental_vacuum: u32,
|
||||
|
||||
/// True (non-zero) for incremental-vacuum mode. False (zero) otherwise.
|
||||
incremental_vacuum_enabled: u32,
|
||||
|
||||
/// The "Application ID" set by PRAGMA application_id.
|
||||
application_id: u32,
|
||||
reserved: [u8; 20],
|
||||
|
||||
/// Reserved for expansion. Must be zero.
|
||||
reserved_for_expansion: [u8; 20],
|
||||
|
||||
/// The version-valid-for number.
|
||||
version_valid_for: u32,
|
||||
|
||||
/// SQLITE_VERSION_NUMBER
|
||||
pub version_number: u32,
|
||||
}
|
||||
|
||||
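Since all multibyte header fields are big-endian, decoding the fields listed in this struct from the first 100 bytes of the file is mechanical. An illustrative sketch covering just a few of them (not the crate's reader, and ignoring the special page_size value 1 that means 65536):

```rust
/// Illustrative sketch: decode a few database-header fields from the first
/// 100 bytes of a database file. All multibyte fields are big-endian.
fn peek_header(buf: &[u8; 100]) -> (u16, u8, usize) {
    let page_size = u16::from_be_bytes([buf[16], buf[17]]); // offset 16..18
    let reserved_space = buf[20];                           // offset 20
    // The usable size of every page is page_size minus the reserved bytes.
    // (The special page_size value 1, meaning 65536, is not handled here.)
    let usable_size = (page_size - reserved_space as u16) as usize;
    (page_size, reserved_space, usable_size)
}
```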
@@ -98,28 +152,62 @@ pub const WAL_FRAME_HEADER_SIZE: usize = 24;
|
||||
pub const WAL_MAGIC_LE: u32 = 0x377f0682;
|
||||
pub const WAL_MAGIC_BE: u32 = 0x377f0683;
|
||||
|
||||
/// The Write-Ahead Log (WAL) header.
|
||||
/// The first 32 bytes of a WAL file comprise the WAL header.
|
||||
/// The WAL header is divided into the following fields stored in big-endian order.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
#[repr(C)] // This helps with encoding because rust does not respect the order in structs, so in
|
||||
// this case we want to keep the order
|
||||
pub struct WalHeader {
|
||||
/// Magic number. 0x377f0682 or 0x377f0683
|
||||
/// If the LSB is 0, checksums are native byte order, else checksums are serialized
|
||||
pub magic: u32,
|
||||
|
||||
/// WAL format version. Currently 3007000
|
||||
pub file_format: u32,
|
||||
|
||||
/// Database page size in bytes. Power of two between 512 and 32768 inclusive
|
||||
pub page_size: u32,
|
||||
|
||||
/// Checkpoint sequence number. Increases with each checkpoint
|
||||
pub checkpoint_seq: u32,
|
||||
|
||||
/// Random value used for the first salt in checksum calculations
|
||||
pub salt_1: u32,
|
||||
|
||||
/// Random value used for the second salt in checksum calculations
|
||||
pub salt_2: u32,
|
||||
|
||||
/// First checksum value in the wal-header
|
||||
pub checksum_1: u32,
|
||||
|
||||
/// Second checksum value in the wal-header
|
||||
pub checksum_2: u32,
|
||||
}
|
||||
|
||||
/// Immediately following the wal-header are zero or more frames.
|
||||
/// Each frame consists of a 24-byte frame-header followed by <page-size> bytes of page data.
|
||||
/// The frame-header is six big-endian 32-bit unsigned integer values, as follows:
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default)]
|
||||
pub struct WalFrameHeader {
|
||||
/// Page number
|
||||
page_number: u32,
|
||||
|
||||
/// For commit records, the size of the database file in pages after the commit.
|
||||
/// For all other records, zero.
|
||||
db_size: u32,
|
||||
|
||||
/// Salt-1 copied from the WAL header
|
||||
salt_1: u32,
|
||||
|
||||
/// Salt-2 copied from the WAL header
|
||||
salt_2: u32,
|
||||
|
||||
/// Checksum-1: Cumulative checksum up through and including this page
|
||||
checksum_1: u32,
|
||||
|
||||
/// Checksum-2: Second half of the cumulative checksum
|
||||
checksum_2: u32,
|
||||
}
|
||||
|
||||
@@ -130,7 +218,7 @@ impl Default for DatabaseHeader {
|
||||
page_size: 4096,
|
||||
write_version: 2,
|
||||
read_version: 2,
|
||||
unused_space: 0,
|
||||
reserved_space: 0,
|
||||
max_embed_frac: 64,
|
||||
min_embed_frac: 32,
|
||||
min_leaf_frac: 32,
|
||||
@@ -140,13 +228,13 @@ impl Default for DatabaseHeader {
|
||||
freelist_pages: 0,
|
||||
schema_cookie: 0,
|
||||
schema_format: 4, // latest format, new sqlite3 databases use this format
|
||||
default_cache_size: 500, // pages
|
||||
vacuum: 0,
|
||||
default_page_cache_size: 500, // pages
|
||||
vacuum_mode_largest_root_page: 0,
|
||||
text_encoding: 1, // utf-8
|
||||
user_version: 1,
|
||||
incremental_vacuum: 0,
|
||||
incremental_vacuum_enabled: 0,
|
||||
application_id: 0,
|
||||
reserved: [0; 20],
|
||||
reserved_for_expansion: [0; 20],
|
||||
version_valid_for: 3047000,
|
||||
version_number: 3047000,
|
||||
}
|
||||
@@ -180,7 +268,7 @@ fn finish_read_database_header(
|
||||
header.page_size = u16::from_be_bytes([buf[16], buf[17]]);
|
||||
header.write_version = buf[18];
|
||||
header.read_version = buf[19];
|
||||
header.unused_space = buf[20];
|
||||
header.reserved_space = buf[20];
|
||||
header.max_embed_frac = buf[21];
|
||||
header.min_embed_frac = buf[22];
|
||||
header.min_leaf_frac = buf[23];
|
||||
@@ -190,16 +278,16 @@ fn finish_read_database_header(
|
||||
header.freelist_pages = u32::from_be_bytes([buf[36], buf[37], buf[38], buf[39]]);
|
||||
header.schema_cookie = u32::from_be_bytes([buf[40], buf[41], buf[42], buf[43]]);
|
||||
header.schema_format = u32::from_be_bytes([buf[44], buf[45], buf[46], buf[47]]);
|
||||
header.default_cache_size = i32::from_be_bytes([buf[48], buf[49], buf[50], buf[51]]);
|
||||
if header.default_cache_size == 0 {
|
||||
header.default_cache_size = DEFAULT_CACHE_SIZE;
|
||||
header.default_page_cache_size = i32::from_be_bytes([buf[48], buf[49], buf[50], buf[51]]);
|
||||
if header.default_page_cache_size == 0 {
|
||||
header.default_page_cache_size = DEFAULT_CACHE_SIZE;
|
||||
}
|
||||
header.vacuum = u32::from_be_bytes([buf[52], buf[53], buf[54], buf[55]]);
|
||||
header.vacuum_mode_largest_root_page = u32::from_be_bytes([buf[52], buf[53], buf[54], buf[55]]);
|
||||
header.text_encoding = u32::from_be_bytes([buf[56], buf[57], buf[58], buf[59]]);
|
||||
header.user_version = u32::from_be_bytes([buf[60], buf[61], buf[62], buf[63]]);
|
||||
header.incremental_vacuum = u32::from_be_bytes([buf[64], buf[65], buf[66], buf[67]]);
|
||||
header.incremental_vacuum_enabled = u32::from_be_bytes([buf[64], buf[65], buf[66], buf[67]]);
|
||||
header.application_id = u32::from_be_bytes([buf[68], buf[69], buf[70], buf[71]]);
|
||||
header.reserved.copy_from_slice(&buf[72..92]);
|
||||
header.reserved_for_expansion.copy_from_slice(&buf[72..92]);
|
||||
header.version_valid_for = u32::from_be_bytes([buf[92], buf[93], buf[94], buf[95]]);
|
||||
header.version_number = u32::from_be_bytes([buf[96], buf[97], buf[98], buf[99]]);
|
||||
Ok(())
|
||||
@@ -258,7 +346,7 @@ fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) {
|
||||
buf[16..18].copy_from_slice(&header.page_size.to_be_bytes());
|
||||
buf[18] = header.write_version;
|
||||
buf[19] = header.read_version;
|
||||
buf[20] = header.unused_space;
|
||||
buf[20] = header.reserved_space;
|
||||
buf[21] = header.max_embed_frac;
|
||||
buf[22] = header.min_embed_frac;
|
||||
buf[23] = header.min_leaf_frac;
|
||||
@@ -268,15 +356,15 @@ fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) {
|
||||
buf[36..40].copy_from_slice(&header.freelist_pages.to_be_bytes());
|
||||
buf[40..44].copy_from_slice(&header.schema_cookie.to_be_bytes());
|
||||
buf[44..48].copy_from_slice(&header.schema_format.to_be_bytes());
|
||||
buf[48..52].copy_from_slice(&header.default_cache_size.to_be_bytes());
|
||||
buf[48..52].copy_from_slice(&header.default_page_cache_size.to_be_bytes());
|
||||
|
||||
buf[52..56].copy_from_slice(&header.vacuum.to_be_bytes());
|
||||
buf[52..56].copy_from_slice(&header.vacuum_mode_largest_root_page.to_be_bytes());
|
||||
buf[56..60].copy_from_slice(&header.text_encoding.to_be_bytes());
|
||||
buf[60..64].copy_from_slice(&header.user_version.to_be_bytes());
|
||||
buf[64..68].copy_from_slice(&header.incremental_vacuum.to_be_bytes());
|
||||
buf[64..68].copy_from_slice(&header.incremental_vacuum_enabled.to_be_bytes());
|
||||
|
||||
buf[68..72].copy_from_slice(&header.application_id.to_be_bytes());
|
||||
buf[72..92].copy_from_slice(&header.reserved);
|
||||
buf[72..92].copy_from_slice(&header.reserved_for_expansion);
|
||||
buf[92..96].copy_from_slice(&header.version_valid_for.to_be_bytes());
|
||||
buf[96..100].copy_from_slice(&header.version_number.to_be_bytes());
|
||||
}
|
||||
@@ -387,6 +475,12 @@ impl PageContent {
|
||||
buf[self.offset + pos..self.offset + pos + 4].copy_from_slice(&value.to_be_bytes());
|
||||
}
|
||||
|
||||
/// The second field of the b-tree page header is the offset of the first freeblock, or zero if there are no freeblocks on the page.
|
||||
/// A freeblock is a structure used to identify unallocated space within a b-tree page.
|
||||
/// Freeblocks are organized as a chain.
|
||||
///
|
||||
/// To be clear, freeblocks do not mean the regular unallocated free space to the left of the cell content area pointer, but instead
|
||||
/// blocks of at least 4 bytes WITHIN the cell content area that are not in use due to e.g. deletions.
|
||||
pub fn first_freeblock(&self) -> u16 {
|
||||
self.read_u16(1)
|
||||
}
|
||||
@@ -395,10 +489,16 @@ impl PageContent {
|
||||
self.read_u16(3) as usize
|
||||
}
|
||||
|
||||
/// The start of the cell content area.
|
||||
/// SQLite strives to place cells as far toward the end of the b-tree page as it can,
|
||||
/// in order to leave space for future growth of the cell pointer array.
|
||||
/// = the cell content area pointer moves leftward as cells are added to the page
|
||||
pub fn cell_content_area(&self) -> u16 {
|
||||
self.read_u16(5)
|
||||
}
|
||||
|
||||
/// The total number of bytes in all fragments is stored in the fifth field of the b-tree page header.
|
||||
/// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area.
|
||||
pub fn num_frag_free_bytes(&self) -> u8 {
|
||||
self.read_u8(7)
|
||||
}
|
||||
@@ -416,22 +516,24 @@ impl PageContent {
|
||||
&self,
|
||||
idx: usize,
|
||||
pager: Rc<Pager>,
|
||||
max_local: usize,
|
||||
min_local: usize,
|
||||
payload_overflow_threshold_max: usize,
|
||||
payload_overflow_threshold_min: usize,
|
||||
usable_size: usize,
|
||||
) -> Result<BTreeCell> {
|
||||
log::debug!("cell_get(idx={})", idx);
|
||||
let buf = self.as_ptr();
|
||||
|
||||
let ncells = self.cell_count();
|
||||
let cell_start = match self.page_type() {
|
||||
// the page header is 12 bytes for interior pages, 8 bytes for leaf pages
|
||||
// this is because the 4 last bytes in the interior page's header are used for the rightmost pointer.
|
||||
let cell_pointer_array_start = match self.page_type() {
|
||||
PageType::IndexInterior => 12,
|
||||
PageType::TableInterior => 12,
|
||||
PageType::IndexLeaf => 8,
|
||||
PageType::TableLeaf => 8,
|
||||
};
|
||||
assert!(idx < ncells, "cell_get: idx out of bounds");
|
||||
let cell_pointer = cell_start + (idx * 2);
|
||||
let cell_pointer = cell_pointer_array_start + (idx * 2);
|
||||
let cell_pointer = self.read_u16(cell_pointer) as usize;
|
||||
|
||||
read_btree_cell(
|
||||
@@ -439,13 +541,17 @@ impl PageContent {
|
||||
&self.page_type(),
|
||||
cell_pointer,
|
||||
pager,
|
||||
max_local,
|
||||
min_local,
|
||||
payload_overflow_threshold_max,
|
||||
payload_overflow_threshold_min,
|
||||
usable_size,
|
||||
)
|
||||
}
|
||||
|
||||
/// When using this fu
|
||||
/// The cell pointer array of a b-tree page immediately follows the b-tree page header.
|
||||
/// Let K be the number of cells on the btree.
|
||||
/// The cell pointer array consists of K 2-byte integer offsets to the cell contents.
|
||||
/// The cell pointers are arranged in key order with:
|
||||
/// - left-most cell (the cell with the smallest key) first and
|
||||
/// - the right-most cell (the cell with the largest key) last.
|
||||
pub fn cell_get_raw_pointer_region(&self) -> (usize, usize) {
|
||||
let cell_start = match self.page_type() {
|
||||
PageType::IndexInterior => 12,
|
||||
@@ -460,27 +566,31 @@ impl PageContent {
|
||||
pub fn cell_get_raw_region(
|
||||
&self,
|
||||
idx: usize,
|
||||
max_local: usize,
|
||||
min_local: usize,
|
||||
payload_overflow_threshold_max: usize,
|
||||
payload_overflow_threshold_min: usize,
|
||||
usable_size: usize,
|
||||
) -> (usize, usize) {
|
||||
let buf = self.as_ptr();
|
||||
let ncells = self.cell_count();
|
||||
let cell_start = match self.page_type() {
|
||||
let cell_pointer_array_start = match self.page_type() {
|
||||
PageType::IndexInterior => 12,
|
||||
PageType::TableInterior => 12,
|
||||
PageType::IndexLeaf => 8,
|
||||
PageType::TableLeaf => 8,
|
||||
};
|
||||
assert!(idx < ncells, "cell_get: idx out of bounds");
|
||||
let cell_pointer = cell_start + (idx * 2);
|
||||
let cell_pointer = cell_pointer_array_start + (idx * 2); // pointers are 2 bytes each
|
||||
let cell_pointer = self.read_u16(cell_pointer) as usize;
|
||||
let start = cell_pointer;
|
||||
let len = match self.page_type() {
|
||||
PageType::IndexInterior => {
|
||||
let (len_payload, n_payload) = read_varint(&buf[cell_pointer + 4..]).unwrap();
|
||||
let (overflows, to_read) =
|
||||
payload_overflows(len_payload as usize, max_local, min_local, usable_size);
|
||||
let (overflows, to_read) = payload_overflows(
|
||||
len_payload as usize,
|
||||
payload_overflow_threshold_max,
|
||||
payload_overflow_threshold_min,
|
||||
usable_size,
|
||||
);
|
||||
if overflows {
|
||||
4 + to_read + n_payload + 4
|
||||
} else {
|
||||
@@ -493,8 +603,12 @@ impl PageContent {
|
||||
}
|
||||
PageType::IndexLeaf => {
|
||||
let (len_payload, n_payload) = read_varint(&buf[cell_pointer..]).unwrap();
|
||||
let (overflows, to_read) =
|
||||
payload_overflows(len_payload as usize, max_local, min_local, usable_size);
|
||||
let (overflows, to_read) = payload_overflows(
|
||||
len_payload as usize,
|
||||
payload_overflow_threshold_max,
|
||||
payload_overflow_threshold_min,
|
||||
usable_size,
|
||||
);
|
||||
if overflows {
|
||||
to_read + n_payload + 4
|
||||
} else {
|
||||
@@ -504,8 +618,12 @@ impl PageContent {
|
||||
PageType::TableLeaf => {
|
||||
let (len_payload, n_payload) = read_varint(&buf[cell_pointer..]).unwrap();
|
||||
let (_, n_rowid) = read_varint(&buf[cell_pointer + n_payload..]).unwrap();
|
||||
let (overflows, to_read) =
|
||||
payload_overflows(len_payload as usize, max_local, min_local, usable_size);
|
||||
let (overflows, to_read) = payload_overflows(
|
||||
len_payload as usize,
|
||||
payload_overflow_threshold_max,
|
||||
payload_overflow_threshold_min,
|
||||
usable_size,
|
||||
);
|
||||
if overflows {
|
||||
to_read + n_payload + n_rowid
|
||||
} else {
|
||||
@@ -1170,28 +1288,46 @@ pub fn begin_write_wal_header(io: &Rc<dyn File>, header: &WalHeader) -> Result<(
Ok(())
}

/*
Checks if payload will overflow a cell based on max local and
it will return the min size that will be stored in that case,
including overflow pointer
*/
/// Checks if payload will overflow a cell based on the maximum allowed size.
/// It will return the min size that will be stored in that case,
/// including overflow pointer
/// see e.g. https://github.com/sqlite/sqlite/blob/9591d3fe93936533c8c3b0dc4d025ac999539e11/src/dbstat.c#L371
pub fn payload_overflows(
payload_size: usize,
max_local: usize,
min_local: usize,
payload_overflow_threshold_max: usize,
payload_overflow_threshold_min: usize,
usable_size: usize,
) -> (bool, usize) {
if payload_size <= max_local {
if payload_size <= payload_overflow_threshold_max {
return (false, 0);
}

let mut space_left = min_local + (payload_size - min_local) % (usable_size - 4);
if space_left > max_local {
space_left = min_local;
let mut space_left = payload_overflow_threshold_min
+ (payload_size - payload_overflow_threshold_min) % (usable_size - 4);
if space_left > payload_overflow_threshold_max {
space_left = payload_overflow_threshold_min;
}
(true, space_left + 4)
}

/// The checksum is computed by interpreting the input as an even number of unsigned 32-bit integers: x(0) through x(N).
/// The 32-bit integers are big-endian if the magic number in the first 4 bytes of the WAL header is 0x377f0683
/// and the integers are little-endian if the magic number is 0x377f0682.
/// The checksum values are always stored in the frame header in a big-endian format regardless of which byte order is used to compute the checksum.

/// The checksum algorithm only works for content which is a multiple of 8 bytes in length.
/// In other words, if the inputs are x(0) through x(N) then N must be odd.
/// The checksum algorithm is as follows:
///
/// s0 = s1 = 0
/// for i from 0 to n-1 step 2:
/// s0 += x(i) + s1;
/// s1 += x(i+1) + s0;
/// endfor
///
/// The outputs s0 and s1 are both weighted checksums using Fibonacci weights in reverse order.
/// (The largest Fibonacci weight occurs on the first element of the sequence being summed.)
/// The s1 value spans all 32-bit integer terms of the sequence whereas s0 omits the final term.
pub fn checksum_wal(
buf: &[u8],
_wal_header: &WalHeader,

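The pseudocode above translates directly into Rust. A self-contained sketch, assuming native little-endian word order and wrapping arithmetic (the real function selects the byte order from the WAL magic number):

```rust
/// Illustrative sketch of the WAL checksum described above: interpret `buf` as
/// pairs of 32-bit words and fold them into the running (s0, s1) pair.
/// `buf.len()` must be a multiple of 8.
fn wal_checksum(buf: &[u8], init: (u32, u32)) -> (u32, u32) {
    assert!(buf.len() % 8 == 0);
    let (mut s0, mut s1) = init;
    for pair in buf.chunks_exact(8) {
        let x0 = u32::from_le_bytes(pair[0..4].try_into().unwrap());
        let x1 = u32::from_le_bytes(pair[4..8].try_into().unwrap());
        s0 = s0.wrapping_add(x0).wrapping_add(s1); // s0 += x(i) + s1
        s1 = s1.wrapping_add(x1).wrapping_add(s0); // s1 += x(i+1) + s0
    }
    (s0, s1)
}
```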
@@ -386,7 +386,7 @@ fn query_pragma(
|
||||
match pragma {
|
||||
PragmaName::CacheSize => {
|
||||
program.emit_insn(Insn::Integer {
|
||||
value: database_header.borrow().default_cache_size.into(),
|
||||
value: database_header.borrow().default_page_cache_size.into(),
|
||||
dest: register,
|
||||
});
|
||||
}
|
||||
@@ -424,7 +424,7 @@ fn update_cache_size(value: i64, header: Rc<RefCell<DatabaseHeader>>, pager: Rc<
|
||||
}
|
||||
|
||||
// update in-memory header
|
||||
header.borrow_mut().default_cache_size = cache_size_unformatted
|
||||
header.borrow_mut().default_page_cache_size = cache_size_unformatted
|
||||
.try_into()
|
||||
.unwrap_or_else(|_| panic!("invalid value, too big for a i32 {}", value));
|
||||
|
||||
|
||||