Merge 'core/btree: improve documentation' from Jussi Saurio

This PR should have no functional changes, just variable renaming and
comments. It uses the `///` comment format for better IDE support.

Reviewed-by: Pere Diaz Bou <penberg@iki.fi>

Closes #539
jussisaurio
2024-12-24 09:44:15 +02:00
4 changed files with 427 additions and 172 deletions

View File

@@ -20,22 +20,37 @@ use super::sqlite3_ondisk::{
/*
These are offsets of fields in the header of a b-tree page.
*/
const BTREE_HEADER_OFFSET_TYPE: usize = 0; /* type of btree page -> u8 */
const BTREE_HEADER_OFFSET_FREEBLOCK: usize = 1; /* pointer to first freeblock -> u16 */
const BTREE_HEADER_OFFSET_CELL_COUNT: usize = 3; /* number of cells in the page -> u16 */
const BTREE_HEADER_OFFSET_CELL_CONTENT: usize = 5; /* pointer to first byte of cell allocated content from top -> u16 */
const BTREE_HEADER_OFFSET_FRAGMENTED: usize = 7; /* number of fragmented bytes -> u8 */
const BTREE_HEADER_OFFSET_RIGHTMOST: usize = 8; /* if internalnode, pointer right most pointer (saved separately from cells) -> u32 */
/*
** Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than
** this will be declared corrupt. This value is calculated based on a
** maximum database size of 2^31 pages a minimum fanout of 2 for a
** root-node and 3 for all other internal nodes.
**
** If a tree that appears to be taller than this is encountered, it is
** assumed that the database is corrupt.
*/
/// type of btree page -> u8
const PAGE_HEADER_OFFSET_PAGE_TYPE: usize = 0;
/// pointer to first freeblock -> u16
/// The second field of the b-tree page header is the offset of the first freeblock, or zero if there are no freeblocks on the page.
/// A freeblock is a structure used to identify unallocated space within a b-tree page.
/// Freeblocks are organized as a chain.
///
/// To be clear, freeblocks do not mean the regular unallocated free space to the left of the cell content area pointer, but instead
/// blocks of at least 4 bytes WITHIN the cell content area that are not in use due to e.g. deletions.
const PAGE_HEADER_OFFSET_FIRST_FREEBLOCK: usize = 1;
/// number of cells in the page -> u16
const PAGE_HEADER_OFFSET_CELL_COUNT: usize = 3;
/// pointer to first byte of cell allocated content from top -> u16
/// SQLite strives to place cells as far toward the end of the b-tree page as it can,
/// in order to leave space for future growth of the cell pointer array.
/// i.e. the cell content area pointer moves leftward as cells are added to the page
const PAGE_HEADER_OFFSET_CELL_CONTENT_AREA: usize = 5;
/// number of fragmented bytes -> u8
/// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area.
const PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT: usize = 7;
/// if interior node, the rightmost pointer (stored separately from cells) -> u32
const PAGE_HEADER_OFFSET_RIGHTMOST_PTR: usize = 8;
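For illustration, a minimal sketch (editor's addition, not part of the patch) of how these big-endian header fields decode from a raw page buffer; `page_header_summary` is a hypothetical helper, and the offsets are relative to the start of the page header (which sits at byte 100 on page 1, after the database header):

// Sketch: decoding the b-tree page header fields at the offsets above.
// Assumes `buf` starts at the page header. All fields are big-endian.
fn read_u16(buf: &[u8], offset: usize) -> u16 {
    u16::from_be_bytes([buf[offset], buf[offset + 1]])
}

fn page_header_summary(buf: &[u8]) -> (u8, u16, u16, u16, u8) {
    let page_type = buf[PAGE_HEADER_OFFSET_PAGE_TYPE];
    let first_freeblock = read_u16(buf, PAGE_HEADER_OFFSET_FIRST_FREEBLOCK);
    let cell_count = read_u16(buf, PAGE_HEADER_OFFSET_CELL_COUNT);
    let cell_content_area = read_u16(buf, PAGE_HEADER_OFFSET_CELL_CONTENT_AREA);
    let fragmented_bytes = buf[PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT];
    (page_type, first_freeblock, cell_count, cell_content_area, fragmented_bytes)
}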
/// Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than
/// this will be declared corrupt. This value is calculated based on a
/// maximum database size of 2^31 pages and a minimum fanout of 2 for a
/// root-node and 3 for all other internal nodes.
///
/// If a tree that appears to be taller than this is encountered, it is
/// assumed that the database is corrupt.
pub const BTCURSOR_MAX_DEPTH: usize = 20;
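A quick sanity check of that bound (editor's sketch, assuming the stated fanouts): a tree of depth d has at least 2 * 3^(d-2) leaf pages, so depth 21 would already require more pages than the 2^31 maximum:

// Minimum number of leaf pages in a tree of the given depth, assuming
// fanout 2 at the root and 3 at every other internal node.
fn min_leaf_pages(depth: u32) -> u64 {
    2 * 3u64.pow(depth - 2)
}

#[test]
fn btcursor_max_depth_bound() {
    assert!(min_leaf_pages(20) <= 1u64 << 31); //   774_840_978: depth 20 is attainable
    assert!(min_leaf_pages(21) > 1u64 << 31); // 2_324_522_934: depth 21 implies corruption
}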
/// Evaluate a Result<CursorResult<T>>; if it is IO, return IO.
@@ -57,6 +72,8 @@ macro_rules! return_if_locked {
}};
}
/// State machine of a write operation.
/// May involve balancing due to overflow.
#[derive(Debug)]
enum WriteState {
Start,
@@ -67,11 +84,16 @@ enum WriteState {
}
struct WriteInfo {
/// State of the write operation state machine.
state: WriteState,
/// Pages allocated during the write operation due to balancing.
new_pages: RefCell<Vec<PageRef>>,
/// Scratch space used during balancing.
scratch_cells: RefCell<Vec<&'static [u8]>>,
/// Bookkeeping of the rightmost pointer so that the value at PAGE_HEADER_OFFSET_RIGHTMOST_PTR can be updated.
rightmost_pointer: RefCell<Option<u32>>,
page_copy: RefCell<Option<PageContent>>, // this holds the copy a of a page needed for buffer references
/// Copy of the current page needed for buffer references.
page_copy: RefCell<Option<PageContent>>,
}
pub struct BTreeCursor {
@@ -142,6 +164,8 @@ impl BTreeCursor {
}
}
/// Check if the table is empty.
/// This is done by checking if the root page has no cells.
fn is_empty_table(&mut self) -> Result<CursorResult<bool>> {
let page = self.pager.read_page(self.root_page)?;
return_if_locked!(page);
@@ -150,16 +174,18 @@ impl BTreeCursor {
Ok(CursorResult::Ok(cell_count == 0))
}
/// Move the cursor to the previous record and return it.
/// Used in backwards iteration.
fn get_prev_record(&mut self) -> Result<CursorResult<(Option<u64>, Option<OwnedRecord>)>> {
loop {
let page = self.stack.top();
let cell_idx = self.stack.current_index();
let cell_idx = self.stack.current_cell_index();
// moved to current page begin
// moved to beginning of current page
// todo: find a better way to flag that we moved to the end or beginning of the page
if self.stack.curr_idx_out_of_begin() {
if self.stack.current_cell_index_less_than_min() {
loop {
if self.stack.current_index() > 0 {
if self.stack.current_cell_index() > 0 {
self.stack.retreat();
break;
}
@@ -198,8 +224,8 @@ impl BTreeCursor {
let cell = contents.cell_get(
cell_idx,
self.pager.clone(),
self.max_local(contents.page_type()),
self.min_local(contents.page_type()),
self.payload_overflow_threshold_max(contents.page_type()),
self.payload_overflow_threshold_min(contents.page_type()),
self.usable_space(),
)?;
@@ -228,13 +254,15 @@ impl BTreeCursor {
}
}
/// Move the cursor to the next record and return it.
/// Used in forwards iteration, which is the default.
fn get_next_record(
&mut self,
predicate: Option<(SeekKey<'_>, SeekOp)>,
) -> Result<CursorResult<(Option<u64>, Option<OwnedRecord>)>> {
loop {
let mem_page_rc = self.stack.top();
let cell_idx = self.stack.current_index() as usize;
let cell_idx = self.stack.current_cell_index() as usize;
debug!("current id={} cell={}", mem_page_rc.get().id, cell_idx);
return_if_locked!(mem_page_rc);
@@ -286,8 +314,8 @@ impl BTreeCursor {
let cell = contents.cell_get(
cell_idx,
self.pager.clone(),
self.max_local(contents.page_type()),
self.min_local(contents.page_type()),
self.payload_overflow_threshold_max(contents.page_type()),
self.payload_overflow_threshold_min(contents.page_type()),
self.usable_space(),
)?;
match &cell {
@@ -386,6 +414,9 @@ impl BTreeCursor {
}
}
/// Move the cursor to the record that matches the seek key and seek operation.
/// This may be used to seek to a specific record in a point query (e.g. SELECT * FROM table WHERE col = 10)
/// or, e.g., to find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10).
fn seek(
&mut self,
key: SeekKey<'_>,
@@ -403,8 +434,8 @@ impl BTreeCursor {
let cell = contents.cell_get(
cell_idx,
self.pager.clone(),
self.max_local(contents.page_type()),
self.min_local(contents.page_type()),
self.payload_overflow_threshold_max(contents.page_type()),
self.payload_overflow_threshold_min(contents.page_type()),
self.usable_space(),
)?;
match &cell {
@@ -476,12 +507,14 @@ impl BTreeCursor {
Ok(CursorResult::Ok((None, None)))
}
/// Move the cursor to the root page of the btree.
fn move_to_root(&mut self) {
let mem_page = self.pager.read_page(self.root_page).unwrap();
self.stack.clear();
self.stack.push(mem_page);
}
/// Move the cursor to the rightmost record in the btree.
fn move_to_rightmost(&mut self) -> Result<CursorResult<()>> {
self.move_to_root();
@@ -553,8 +586,8 @@ impl BTreeCursor {
match &contents.cell_get(
cell_idx,
self.pager.clone(),
self.max_local(contents.page_type()),
self.min_local(contents.page_type()),
self.payload_overflow_threshold_max(contents.page_type()),
self.payload_overflow_threshold_min(contents.page_type()),
self.usable_space(),
)? {
BTreeCell::TableInteriorCell(TableInteriorCell {
@@ -634,6 +667,8 @@ impl BTreeCursor {
}
}
/// Insert a record into the btree.
/// If the insert operation overflows the page, it will be split and the btree will be balanced.
fn insert_into_page(
&mut self,
key: &OwnedValue,
@@ -700,7 +735,11 @@ impl BTreeCursor {
}
}
/* insert to position and shift other pointers */
/// Insert a record into a cell.
/// If the cell overflows, an overflow cell is created.
/// insert_into_cell() is called from insert_into_page(),
/// and the overflow cell count is used to determine if the page overflows,
/// i.e. whether we need to balance the btree after the insert.
fn insert_into_cell(&self, page: &mut PageContent, payload: &[u8], cell_idx: usize) {
let free = self.compute_free_space(page, RefCell::borrow(&self.database_header));
let enough_space = payload.len() + 2 <= free as usize;
@@ -734,41 +773,54 @@ impl BTreeCursor {
page.write_u16(pointer_area_pc_by_idx - page.offset, pc);
// update first byte of content area
page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, pc);
page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, pc);
// update cell count
let new_n_cells = (page.cell_count() + 1) as u16;
page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, new_n_cells);
page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, new_n_cells);
}
/// Free the range of bytes that a cell occupies.
/// This function also updates the freeblock list in the page.
/// Freeblocks are used to keep track of free space in the page,
/// and are organized as a linked list.
fn free_cell_range(&self, page: &mut PageContent, offset: u16, len: u16) {
// if the freeblock list is empty, we set this block as the first freeblock in the page header.
if page.first_freeblock() == 0 {
// insert into empty list
page.write_u16(offset as usize, 0);
page.write_u16(offset as usize + 2, len);
page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, offset);
page.write_u16(offset as usize, 0); // next freeblock = null
page.write_u16(offset as usize + 2, len); // size of this freeblock
page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, offset); // first freeblock in page = this block
return;
}
let first_block = page.first_freeblock();
// if the freeblock list is not empty, and the offset is less than the first freeblock,
// we insert this block at the head of the list
if offset < first_block {
// insert into head of list
page.write_u16(offset as usize, first_block);
page.write_u16(offset as usize + 2, len);
page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, offset);
page.write_u16(offset as usize, first_block); // next freeblock = previous first freeblock
page.write_u16(offset as usize + 2, len); // size of this freeblock
page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, offset); // first freeblock in page = this block
return;
}
// if we clear space that is at the start of the cell content area,
// we need to update the cell content area pointer forward to account for the removed space
// FIXME: is offset ever < cell_content_area? cell content area grows leftwards and the pointer
// is to the start of the last allocated cell. should we assert!(offset >= page.cell_content_area())
// and change this to if offset == page.cell_content_area()?
if offset <= page.cell_content_area() {
// extend boundary of content area
page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, page.first_freeblock());
page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, offset + len);
// FIXME: remove the line directly below this, it does not change anything.
page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, page.first_freeblock());
page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, offset + len);
return;
}
// if the freeblock list is not empty, and the offset is greater than the first freeblock,
// then we need to do some more calculation to figure out where to insert the freeblock
// in the freeblock linked list.
let maxpc = {
let db_header = self.database_header.borrow();
let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize;
let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
usable_space as u16
};
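For context, a standalone sketch (editor's addition, not part of the patch) of how the freeblock chain is traversed: each freeblock's first two bytes hold the offset of the next freeblock (zero terminates the chain) and the next two bytes hold its size:

// Sum the bytes held in a page's freeblock chain.
// Each freeblock stores: next-freeblock offset (u16), then its own size (u16).
fn freeblock_bytes(buf: &[u8], first_freeblock: u16) -> usize {
    let mut total = 0;
    let mut pc = first_freeblock as usize;
    while pc != 0 {
        let next = u16::from_be_bytes([buf[pc], buf[pc + 1]]) as usize;
        let size = u16::from_be_bytes([buf[pc + 2], buf[pc + 3]]) as usize;
        total += size;
        pc = next; // offsets are ascending; zero marks the end of the chain
    }
    total
}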
@@ -799,17 +851,23 @@ impl BTreeCursor {
}
}
/// Drop a cell from a page.
/// This is done by freeing the range of bytes that the cell occupies.
fn drop_cell(&self, page: &mut PageContent, cell_idx: usize) {
let (cell_start, cell_len) = page.cell_get_raw_region(
cell_idx,
self.max_local(page.page_type()),
self.min_local(page.page_type()),
self.payload_overflow_threshold_max(page.page_type()),
self.payload_overflow_threshold_min(page.page_type()),
self.usable_space(),
);
self.free_cell_range(page, cell_start as u16, cell_len as u16);
page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1);
page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1);
}
/// Balance a leaf page.
/// Balancing is done when a page overflows.
/// see e.g. https://en.wikipedia.org/wiki/B-tree
///
/// This is a naive algorithm that doesn't try to distribute cells evenly by content.
/// It will try to split the page in half by keys not by content.
/// SQLite tries to keep each page at least 40% full.
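To make the count-vs-content distinction concrete, two hypothetical split helpers (editor's illustration, not from the patch); the first matches the naive approach described above, the second is closer to a space-balanced split:

// Split point by cell count: what a naive key-based split does.
fn split_by_count(cells: &[Vec<u8>]) -> usize {
    cells.len() / 2
}

// Split point by accumulated bytes: stops once roughly half of the
// used space is on the left page.
fn split_by_bytes(cells: &[Vec<u8>]) -> usize {
    let total: usize = cells.iter().map(|c| c.len()).sum();
    let mut acc = 0;
    for (i, cell) in cells.iter().enumerate() {
        acc += cell.len();
        if acc * 2 >= total {
            return i + 1;
        }
    }
    cells.len()
}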
@@ -852,8 +910,8 @@ impl BTreeCursor {
for cell_idx in 0..page_copy.cell_count() {
let (start, len) = page_copy.cell_get_raw_region(
cell_idx,
self.max_local(page_copy.page_type()),
self.min_local(page_copy.page_type()),
self.payload_overflow_threshold_max(page_copy.page_type()),
self.payload_overflow_threshold_min(page_copy.page_type()),
self.usable_space(),
);
let buf = page_copy.as_ptr();
@@ -930,14 +988,14 @@ impl BTreeCursor {
assert_eq!(parent_contents.overflow_cells.len(), 0);
// The right page pointer is a u32 in the rightmost-pointer field, and a u32 inside an interior cell too, so we can track a single offset for where this value needs to be changed
let mut right_pointer = BTREE_HEADER_OFFSET_RIGHTMOST;
let mut right_pointer = PAGE_HEADER_OFFSET_RIGHTMOST_PTR;
for cell_idx in 0..parent_contents.cell_count() {
let cell = parent_contents
.cell_get(
cell_idx,
self.pager.clone(),
self.max_local(page_type.clone()),
self.min_local(page_type.clone()),
self.payload_overflow_threshold_max(page_type.clone()),
self.payload_overflow_threshold_min(page_type.clone()),
self.usable_space(),
)
.unwrap();
@@ -950,8 +1008,8 @@ impl BTreeCursor {
if found {
let (start, _len) = parent_contents.cell_get_raw_region(
cell_idx,
self.max_local(page_type.clone()),
self.min_local(page_type.clone()),
self.payload_overflow_threshold_max(page_type.clone()),
self.payload_overflow_threshold_min(page_type.clone()),
self.usable_space(),
);
right_pointer = start;
@@ -967,17 +1025,20 @@ impl BTreeCursor {
assert!(page.is_dirty());
let contents = page.get().contents.as_mut().unwrap();
contents.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, 0);
contents.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, 0);
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
let db_header = RefCell::borrow(&self.database_header);
let cell_content_area_start =
db_header.page_size - db_header.unused_space as u16;
contents.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, cell_content_area_start);
db_header.page_size - db_header.reserved_space as u16;
contents.write_u16(
PAGE_HEADER_OFFSET_CELL_CONTENT_AREA,
cell_content_area_start,
);
contents.write_u8(BTREE_HEADER_OFFSET_FRAGMENTED, 0);
contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0);
if !contents.is_leaf() {
contents.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, 0);
contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, 0);
}
}
@@ -1035,8 +1096,8 @@ impl BTreeCursor {
.cell_get(
contents.cell_count() - 1,
self.pager.clone(),
self.max_local(contents.page_type()),
self.min_local(contents.page_type()),
self.payload_overflow_threshold_max(contents.page_type()),
self.payload_overflow_threshold_min(contents.page_type()),
self.usable_space(),
)
.unwrap();
@@ -1045,13 +1106,13 @@ impl BTreeCursor {
_ => unreachable!(),
};
self.drop_cell(contents, contents.cell_count() - 1);
contents.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, last_cell_pointer);
contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, last_cell_pointer);
}
// the last page's rightmost pointer points to what was the rightmost pointer before the split
let last_page = new_pages.last().unwrap();
let last_page_contents = last_page.get().contents.as_mut().unwrap();
last_page_contents.write_u32(
BTREE_HEADER_OFFSET_RIGHTMOST,
PAGE_HEADER_OFFSET_RIGHTMOST_PTR,
self.write_info.rightmost_pointer.borrow().unwrap(),
);
}
@@ -1069,8 +1130,8 @@ impl BTreeCursor {
&contents.page_type(),
0,
self.pager.clone(),
self.max_local(contents.page_type()),
self.min_local(contents.page_type()),
self.payload_overflow_threshold_max(contents.page_type()),
self.payload_overflow_threshold_min(contents.page_type()),
self.usable_space(),
)
.unwrap();
@@ -1119,6 +1180,9 @@ impl BTreeCursor {
}
}
/// Balance the root page.
/// This is done when the root page overflows, and we need to create a new root page.
/// See e.g. https://en.wikipedia.org/wiki/B-tree
fn balance_root(&mut self) {
/* todo: balance deeper, create child and copy contents of root there. Then split root */
/* if we are in root page then we just need to create a new root and push key there */
@@ -1145,8 +1209,8 @@ impl BTreeCursor {
}
// point new root right child to previous root
new_root_page_contents
.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, new_root_page_id as u32);
new_root_page_contents.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, 0);
.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, new_root_page_id as u32);
new_root_page_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
}
/* swap the split page's buffer with the new root's buffer so we don't have to update page idx */
@@ -1195,12 +1259,16 @@ impl BTreeCursor {
}
}
/// Allocate a new page to the btree via the pager.
/// This marks the page as dirty and writes the page header.
fn allocate_page(&self, page_type: PageType, offset: usize) -> PageRef {
let page = self.pager.allocate_page().unwrap();
btree_init_page(&page, page_type, &self.database_header.borrow(), offset);
page
}
/// Allocate a new overflow page.
/// This is done when a cell overflows and new space is needed.
fn allocate_overflow_page(&self) -> PageRef {
let page = self.pager.allocate_page().unwrap();
@@ -1212,9 +1280,7 @@ impl BTreeCursor {
page
}
/*
Allocate space for a cell on a page.
*/
/// Allocate space for a cell on a page.
fn allocate_cell_space(&self, page_ref: &PageContent, amount: u16) -> u16 {
let amount = amount as usize;
@@ -1236,24 +1302,25 @@ impl BTreeCursor {
if gap + 2 + amount > top {
// defragment
self.defragment_page(page_ref, RefCell::borrow(&self.database_header));
top = page_ref.read_u16(BTREE_HEADER_OFFSET_CELL_CONTENT) as usize;
top = page_ref.read_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA) as usize;
}
let db_header = RefCell::borrow(&self.database_header);
top -= amount;
page_ref.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, top as u16);
page_ref.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, top as u16);
let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize;
let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
assert!(top + amount <= usable_space);
top as u16
}
/// Defragment a page. This means packing all the cells to the end of the page.
fn defragment_page(&self, page: &PageContent, db_header: Ref<DatabaseHeader>) {
log::debug!("defragment_page");
let cloned_page = page.clone();
// TODO(pere): usable space should include offset probably
let usable_space = (db_header.page_size - db_header.unused_space as u16) as u64;
let usable_space = (db_header.page_size - db_header.reserved_space as u16) as u64;
let mut cbrk = usable_space;
// TODO: implement fast algorithm
@@ -1330,24 +1397,33 @@ impl BTreeCursor {
let write_buf = page.as_ptr();
// set new first byte of cell content
page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, cbrk as u16);
page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cbrk as u16);
// set freeblock pointer to 0; unused space can be recovered from the gap between the end of the cell pointer area and the content start
page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, 0);
page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
// set unused space to 0
let first_cell = cloned_page.cell_content_area() as u64;
assert!(first_cell <= cbrk);
write_buf[first_cell as usize..cbrk as usize].fill(0);
}
// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte
// and end of cell pointer area.
/// Freeblocks can be absent (pointer is zero), meaning the "real free space" available for allocation is expected to lie between the first cell byte
/// and the end of the cell pointer area.
#[allow(unused_assignments)]
fn compute_free_space(&self, page: &PageContent, db_header: Ref<DatabaseHeader>) -> u16 {
// TODO(pere): maybe free space is not calculated correctly with offset
let buf = page.as_ptr();
let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize;
let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
let mut first_byte_in_cell_content = page.cell_content_area();
// A zero value for the cell content area pointer is interpreted as 65536.
// See https://www.sqlite.org/fileformat.html
// The max page size for a SQLite database is 64 KiB, i.e. 65536 bytes.
// 65536 is u16::MAX + 1, and since cell content grows from right to left, this means
// the cell content area pointer is at the end of the page,
// i.e.
// 1. the page size is 64kiB
// 2. there are no cells on the page
// 3. there is no reserved space at the end of the page
if first_byte_in_cell_content == 0 {
first_byte_in_cell_content = u16::MAX;
}
@@ -1360,12 +1436,16 @@ impl BTreeCursor {
let child_pointer_size = if page.is_leaf() { 0 } else { 4 };
let first_cell = (page.offset + 8 + child_pointer_size + (2 * ncell)) as u16;
// The amount of free space is the sum of:
// 1. 0..first_byte_in_cell_content (everything to the left of the cell content area pointer is unused free space)
// 2. fragmented_free_bytes.
let mut nfree = fragmented_free_bytes as usize + first_byte_in_cell_content as usize;
let mut pc = free_block_pointer as usize;
if pc > 0 {
if pc < first_byte_in_cell_content as usize {
// corrupt
// Freeblocks exist in the cell content area, e.g. after deletions.
// They should never exist in the unused area of the page.
todo!("corrupted page");
}
@@ -1399,6 +1479,8 @@ impl BTreeCursor {
nfree as u16
}
/// Fill in the cell payload with the record.
/// If the record is too large to fit in the cell, it will spill onto overflow pages.
fn fill_cell_payload(
&self,
page_type: PageType,
@@ -1423,13 +1505,13 @@ impl BTreeCursor {
write_varint_to_vec(record_buf.len() as u64, cell_payload);
}
let max_local = self.max_local(page_type.clone());
let payload_overflow_threshold_max = self.payload_overflow_threshold_max(page_type.clone());
log::debug!(
"fill_cell_payload(record_size={}, max_local={})",
"fill_cell_payload(record_size={}, payload_overflow_threshold_max={})",
record_buf.len(),
max_local
payload_overflow_threshold_max
);
if record_buf.len() <= max_local {
if record_buf.len() <= payload_overflow_threshold_max {
// enough allowed space to fit inside a btree page
cell_payload.extend_from_slice(record_buf.as_slice());
cell_payload.resize(cell_payload.len() + 4, 0);
@@ -1437,11 +1519,13 @@ impl BTreeCursor {
}
log::debug!("fill_cell_payload(overflow)");
let min_local = self.min_local(page_type);
let mut space_left = min_local + (record_buf.len() - min_local) % (self.usable_space() - 4);
let payload_overflow_threshold_min = self.payload_overflow_threshold_min(page_type);
// see e.g. https://github.com/sqlite/sqlite/blob/9591d3fe93936533c8c3b0dc4d025ac999539e11/src/dbstat.c#L371
let mut space_left = payload_overflow_threshold_min
+ (record_buf.len() - payload_overflow_threshold_min) % (self.usable_space() - 4);
if space_left > max_local {
space_left = min_local;
if space_left > payload_overflow_threshold_max {
space_left = payload_overflow_threshold_min;
}
// cell_size must equal the first value of space_left, as this is the number of bytes copied to the non-overflow page.
@@ -1487,31 +1571,54 @@ impl BTreeCursor {
assert_eq!(cell_size, cell_payload.len());
}
fn max_local(&self, page_type: PageType) -> usize {
let usable_space = self.usable_space();
/// Returns the maximum payload size (X) that can be stored directly on a b-tree page without spilling to overflow pages.
///
/// For table leaf pages: X = usable_size - 35
/// For index pages: X = ((usable_size - 12) * 64/255) - 23
///
/// The usable size is the total page size less the reserved space at the end of each page.
/// These thresholds are designed to:
/// - Give a minimum fanout of 4 for index b-trees
/// - Ensure enough payload is on the b-tree page that the record header can usually be accessed
/// without consulting an overflow page
fn payload_overflow_threshold_max(&self, page_type: PageType) -> usize {
let usable_size = self.usable_space();
match page_type {
PageType::IndexInterior | PageType::TableInterior => {
(usable_space - 12) * 64 / 255 - 23
PageType::IndexInterior | PageType::IndexLeaf => {
((usable_size - 12) * 64 / 255) - 23 // Index page formula
}
PageType::TableInterior | PageType::TableLeaf => {
usable_size - 35 // Table leaf page formula
}
PageType::IndexLeaf | PageType::TableLeaf => usable_space - 35,
}
}
fn min_local(&self, page_type: PageType) -> usize {
let usable_space = self.usable_space();
match page_type {
PageType::IndexInterior | PageType::TableInterior => {
(usable_space - 12) * 32 / 255 - 23
}
PageType::IndexLeaf | PageType::TableLeaf => (usable_space - 12) * 32 / 255 - 23,
}
/// Returns the minimum payload size (M) that must be stored on the b-tree page before spilling to overflow pages is allowed.
///
/// For all page types: M = ((usable_size - 12) * 32/255) - 23
///
/// When payload size P exceeds payload_overflow_threshold_max():
/// - If K = M + ((P-M) % (usable_size-4)) <= payload_overflow_threshold_max(): store K bytes on page
/// - Otherwise: store M bytes on page
///
/// The remaining bytes are stored on overflow pages in both cases.
fn payload_overflow_threshold_min(&self, _page_type: PageType) -> usize {
let usable_size = self.usable_space();
// Same formula for all page types
((usable_size - 12) * 32 / 255) - 23
}
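A worked example of these formulas (editor's sketch, not part of the patch), using a 4096-byte page with no reserved space; `thresholds` is a hypothetical helper mirroring the two functions above:

fn thresholds(usable_size: usize) -> (usize, usize, usize) {
    let max_table_leaf = usable_size - 35; // X for table leaf pages
    let max_index = (usable_size - 12) * 64 / 255 - 23; // X for index pages
    let min = (usable_size - 12) * 32 / 255 - 23; // M for all page types
    (max_table_leaf, max_index, min)
}

#[test]
fn overflow_thresholds_4096() {
    let (x_table, x_index, m) = thresholds(4096);
    assert_eq!((x_table, x_index, m), (4061, 1002, 489));
    // A 5000-byte table-leaf payload overflows (5000 > 4061).
    // K = M + (P - M) % (usable_size - 4):
    let k = m + (5000 - m) % (4096 - 4);
    assert_eq!(k, 908); // 908 <= 4061, so 908 bytes stay local, 4092 spill to overflow
}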
/// The "usable size" of a database page is the page size specified by the 2-byte integer at offset 16
/// in the header, minus the "reserved" space size recorded in the 1-byte integer at offset 20 in the header.
/// The usable size of a page might be an odd number. However, the usable size is not allowed to be less than 480.
/// In other words, if the page size is 512, then the reserved space size cannot exceed 32.
fn usable_space(&self) -> usize {
let db_header = RefCell::borrow(&self.database_header);
(db_header.page_size - db_header.unused_space as u16) as usize
(db_header.page_size - db_header.reserved_space as u16) as usize
}
/// Find the index of the cell in the page that contains the given rowid.
/// BTree tables only.
fn find_cell(&self, page: &PageContent, int_key: u64) -> usize {
let mut cell_idx = 0;
let cell_count = page.cell_count();
@@ -1520,8 +1627,8 @@ impl BTreeCursor {
.cell_get(
cell_idx,
self.pager.clone(),
self.max_local(page.page_type()),
self.min_local(page.page_type()),
self.payload_overflow_threshold_max(page.page_type()),
self.payload_overflow_threshold_min(page.page_type()),
self.usable_space(),
)
.unwrap()
@@ -1545,6 +1652,8 @@ impl BTreeCursor {
}
impl PageStack {
/// Push a new page onto the stack.
/// This effectively means traversing to a child page.
fn push(&self, page: PageRef) {
debug!(
"pagestack::push(current={}, new_page_id={})",
@@ -1561,6 +1670,8 @@ impl PageStack {
self.cell_indices.borrow_mut()[current as usize] = 0;
}
/// Pop a page off the stack.
/// This effectively means traversing back up to a parent page.
fn pop(&self) {
let current = *self.current_page.borrow();
debug!("pagestack::pop(current={})", current);
@@ -1569,6 +1680,8 @@ impl PageStack {
*self.current_page.borrow_mut() -= 1;
}
/// Get the top page on the stack.
/// This is the page that is currently being traversed.
fn top(&self) -> PageRef {
let current = *self.current_page.borrow();
let page = self.stack.borrow()[current as usize]
@@ -1583,6 +1696,7 @@ impl PageStack {
page
}
/// Get the parent page of the current page.
fn parent(&self) -> PageRef {
let current = *self.current_page.borrow();
self.stack.borrow()[current as usize - 1]
@@ -1597,13 +1711,15 @@ impl PageStack {
}
/// Cell index of the current page
fn current_index(&self) -> i32 {
fn current_cell_index(&self) -> i32 {
let current = self.current();
self.cell_indices.borrow()[current]
}
fn curr_idx_out_of_begin(&self) -> bool {
let cell_idx = self.current_index();
/// Check if the current cell index is less than 0.
/// This means we have been iterating backwards and have reached the start of the page.
fn current_cell_index_less_than_min(&self) -> bool {
let cell_idx = self.current_cell_index();
cell_idx < 0
}
@@ -1639,7 +1755,7 @@ fn find_free_cell(page_ref: &PageContent, db_header: Ref<DatabaseHeader>, amount
let buf = page_ref.as_ptr();
let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize;
let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize;
let maxpc = usable_space - amount;
let mut found = false;
while pc <= maxpc {
@@ -1785,8 +1901,8 @@ impl Cursor for BTreeCursor {
let equals = match &contents.cell_get(
cell_idx,
self.pager.clone(),
self.max_local(contents.page_type()),
self.min_local(contents.page_type()),
self.payload_overflow_threshold_max(contents.page_type()),
self.payload_overflow_threshold_min(contents.page_type()),
self.usable_space(),
)? {
BTreeCell::TableLeafCell(l) => l._rowid == int_key,
@@ -1823,15 +1939,18 @@ pub fn btree_init_page(
let contents = contents.contents.as_mut().unwrap();
contents.offset = offset;
let id = page_type as u8;
contents.write_u8(BTREE_HEADER_OFFSET_TYPE, id);
contents.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, 0);
contents.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, 0);
contents.write_u8(PAGE_HEADER_OFFSET_PAGE_TYPE, id);
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
let cell_content_area_start = db_header.page_size - db_header.unused_space as u16;
contents.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, cell_content_area_start);
let cell_content_area_start = db_header.page_size - db_header.reserved_space as u16;
contents.write_u16(
PAGE_HEADER_OFFSET_CELL_CONTENT_AREA,
cell_content_area_start,
);
contents.write_u8(BTREE_HEADER_OFFSET_FRAGMENTED, 0);
contents.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, 0);
contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0);
contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, 0);
}
fn to_static_buf(buf: &[u8]) -> &'static [u8] {

View File

@@ -482,7 +482,7 @@ impl Pager {
pub fn usable_size(&self) -> usize {
let db_header = self.db_header.borrow();
(db_header.page_size - db_header.unused_space as u16) as usize
(db_header.page_size - db_header.reserved_space as u16) as usize
}
}

View File

@@ -64,30 +64,84 @@ const DEFAULT_CACHE_SIZE: i32 = -2000;
// Minimum number of pages that cache can hold.
pub const MIN_PAGE_CACHE_SIZE: usize = 10;
/// The database header.
/// The first 100 bytes of the database file comprise the database file header.
/// The database file header is divided into fields as shown by the table below.
/// All multibyte fields in the database file header are stored with the most significant byte first (big-endian).
#[derive(Debug, Clone)]
pub struct DatabaseHeader {
/// The header string: "SQLite format 3\0"
magic: [u8; 16],
/// The database page size in bytes. Must be a power of two between 512 and 32768 inclusive,
/// or the value 1 representing a page size of 65536.
pub page_size: u16,
/// File format write version. 1 for legacy; 2 for WAL.
write_version: u8,
/// File format read version. 1 for legacy; 2 for WAL.
read_version: u8,
pub unused_space: u8,
/// Bytes of unused "reserved" space at the end of each page. Usually 0.
/// SQLite has the ability to set aside a small number of extra bytes at the end of every page for use by extensions.
/// These extra bytes are used, for example, by the SQLite Encryption Extension to store a nonce and/or
/// cryptographic checksum associated with each page.
pub reserved_space: u8,
/// Maximum embedded payload fraction. Must be 64.
max_embed_frac: u8,
/// Minimum embedded payload fraction. Must be 32.
min_embed_frac: u8,
/// Leaf payload fraction. Must be 32.
min_leaf_frac: u8,
/// File change counter, incremented when database is modified.
change_counter: u32,
/// Size of the database file in pages. The "in-header database size".
pub database_size: u32,
/// Page number of the first freelist trunk page.
freelist_trunk_page: u32,
/// Total number of freelist pages.
freelist_pages: u32,
/// The schema cookie. Incremented when the database schema changes.
schema_cookie: u32,
/// The schema format number. Supported formats are 1, 2, 3, and 4.
schema_format: u32,
pub default_cache_size: i32,
vacuum: u32,
/// Default page cache size.
pub default_page_cache_size: i32,
/// The page number of the largest root b-tree page when in auto-vacuum or
/// incremental-vacuum modes, or zero otherwise.
vacuum_mode_largest_root_page: u32,
/// The database text encoding. 1=UTF-8, 2=UTF-16le, 3=UTF-16be.
text_encoding: u32,
/// The "user version" as read and set by the user_version pragma.
user_version: u32,
incremental_vacuum: u32,
/// True (non-zero) for incremental-vacuum mode. False (zero) otherwise.
incremental_vacuum_enabled: u32,
/// The "Application ID" set by PRAGMA application_id.
application_id: u32,
reserved: [u8; 20],
/// Reserved for expansion. Must be zero.
reserved_for_expansion: [u8; 20],
/// The version-valid-for number.
version_valid_for: u32,
/// SQLITE_VERSION_NUMBER
pub version_number: u32,
}
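As a concrete illustration of the big-endian layout (editor's sketch, not part of the patch), decoding the magic string and page size from the first 100 bytes, including the special value 1 that represents a 65536-byte page:

// Parse the page size from a raw database header.
fn parse_page_size(header: &[u8; 100]) -> u32 {
    assert_eq!(&header[0..16], b"SQLite format 3\0"); // magic string
    let raw = u16::from_be_bytes([header[16], header[17]]);
    if raw == 1 {
        65536 // a stored value of 1 means a page size of 65536
    } else {
        raw as u32
    }
}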
@@ -98,28 +152,62 @@ pub const WAL_FRAME_HEADER_SIZE: usize = 24;
pub const WAL_MAGIC_LE: u32 = 0x377f0682;
pub const WAL_MAGIC_BE: u32 = 0x377f0683;
/// The Write-Ahead Log (WAL) header.
/// The first 32 bytes of a WAL file comprise the WAL header.
/// The WAL header is divided into the following fields stored in big-endian order.
#[derive(Debug, Default, Clone)]
#[repr(C)] // This helps with encoding because Rust does not guarantee field order in structs, so in
// this case we want to keep the declared order
pub struct WalHeader {
/// Magic number. 0x377f0682 or 0x377f0683
/// If the LSB is 0, checksums are native byte order, else checksums are serialized
pub magic: u32,
/// WAL format version. Currently 3007000
pub file_format: u32,
/// Database page size in bytes. Power of two between 512 and 32768 inclusive
pub page_size: u32,
/// Checkpoint sequence number. Increases with each checkpoint
pub checkpoint_seq: u32,
/// Random value used for the first salt in checksum calculations
pub salt_1: u32,
/// Random value used for the second salt in checksum calculations
pub salt_2: u32,
/// First checksum value in the wal-header
pub checksum_1: u32,
/// Second checksum value in the wal-header
pub checksum_2: u32,
}
/// Immediately following the wal-header are zero or more frames.
/// Each frame consists of a 24-byte frame-header followed by <page-size> bytes of page data.
/// The frame-header is six big-endian 32-bit unsigned integer values, as follows:
#[allow(dead_code)]
#[derive(Debug, Default)]
pub struct WalFrameHeader {
/// Page number
page_number: u32,
/// For commit records, the size of the database file in pages after the commit.
/// For all other records, zero.
db_size: u32,
/// Salt-1 copied from the WAL header
salt_1: u32,
/// Salt-2 copied from the WAL header
salt_2: u32,
/// Checksum-1: Cumulative checksum up through and including this page
checksum_1: u32,
/// Checksum-2: Second half of the cumulative checksum
checksum_2: u32,
}
@@ -130,7 +218,7 @@ impl Default for DatabaseHeader {
page_size: 4096,
write_version: 2,
read_version: 2,
unused_space: 0,
reserved_space: 0,
max_embed_frac: 64,
min_embed_frac: 32,
min_leaf_frac: 32,
@@ -140,13 +228,13 @@ impl Default for DatabaseHeader {
freelist_pages: 0,
schema_cookie: 0,
schema_format: 4, // latest format, new sqlite3 databases use this format
default_cache_size: 500, // pages
vacuum: 0,
default_page_cache_size: 500, // pages
vacuum_mode_largest_root_page: 0,
text_encoding: 1, // utf-8
user_version: 1,
incremental_vacuum: 0,
incremental_vacuum_enabled: 0,
application_id: 0,
reserved: [0; 20],
reserved_for_expansion: [0; 20],
version_valid_for: 3047000,
version_number: 3047000,
}
@@ -180,7 +268,7 @@ fn finish_read_database_header(
header.page_size = u16::from_be_bytes([buf[16], buf[17]]);
header.write_version = buf[18];
header.read_version = buf[19];
header.unused_space = buf[20];
header.reserved_space = buf[20];
header.max_embed_frac = buf[21];
header.min_embed_frac = buf[22];
header.min_leaf_frac = buf[23];
@@ -190,16 +278,16 @@ fn finish_read_database_header(
header.freelist_pages = u32::from_be_bytes([buf[36], buf[37], buf[38], buf[39]]);
header.schema_cookie = u32::from_be_bytes([buf[40], buf[41], buf[42], buf[43]]);
header.schema_format = u32::from_be_bytes([buf[44], buf[45], buf[46], buf[47]]);
header.default_cache_size = i32::from_be_bytes([buf[48], buf[49], buf[50], buf[51]]);
if header.default_cache_size == 0 {
header.default_cache_size = DEFAULT_CACHE_SIZE;
header.default_page_cache_size = i32::from_be_bytes([buf[48], buf[49], buf[50], buf[51]]);
if header.default_page_cache_size == 0 {
header.default_page_cache_size = DEFAULT_CACHE_SIZE;
}
header.vacuum = u32::from_be_bytes([buf[52], buf[53], buf[54], buf[55]]);
header.vacuum_mode_largest_root_page = u32::from_be_bytes([buf[52], buf[53], buf[54], buf[55]]);
header.text_encoding = u32::from_be_bytes([buf[56], buf[57], buf[58], buf[59]]);
header.user_version = u32::from_be_bytes([buf[60], buf[61], buf[62], buf[63]]);
header.incremental_vacuum = u32::from_be_bytes([buf[64], buf[65], buf[66], buf[67]]);
header.incremental_vacuum_enabled = u32::from_be_bytes([buf[64], buf[65], buf[66], buf[67]]);
header.application_id = u32::from_be_bytes([buf[68], buf[69], buf[70], buf[71]]);
header.reserved.copy_from_slice(&buf[72..92]);
header.reserved_for_expansion.copy_from_slice(&buf[72..92]);
header.version_valid_for = u32::from_be_bytes([buf[92], buf[93], buf[94], buf[95]]);
header.version_number = u32::from_be_bytes([buf[96], buf[97], buf[98], buf[99]]);
Ok(())
@@ -258,7 +346,7 @@ fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) {
buf[16..18].copy_from_slice(&header.page_size.to_be_bytes());
buf[18] = header.write_version;
buf[19] = header.read_version;
buf[20] = header.unused_space;
buf[20] = header.reserved_space;
buf[21] = header.max_embed_frac;
buf[22] = header.min_embed_frac;
buf[23] = header.min_leaf_frac;
@@ -268,15 +356,15 @@ fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) {
buf[36..40].copy_from_slice(&header.freelist_pages.to_be_bytes());
buf[40..44].copy_from_slice(&header.schema_cookie.to_be_bytes());
buf[44..48].copy_from_slice(&header.schema_format.to_be_bytes());
buf[48..52].copy_from_slice(&header.default_cache_size.to_be_bytes());
buf[48..52].copy_from_slice(&header.default_page_cache_size.to_be_bytes());
buf[52..56].copy_from_slice(&header.vacuum.to_be_bytes());
buf[52..56].copy_from_slice(&header.vacuum_mode_largest_root_page.to_be_bytes());
buf[56..60].copy_from_slice(&header.text_encoding.to_be_bytes());
buf[60..64].copy_from_slice(&header.user_version.to_be_bytes());
buf[64..68].copy_from_slice(&header.incremental_vacuum.to_be_bytes());
buf[64..68].copy_from_slice(&header.incremental_vacuum_enabled.to_be_bytes());
buf[68..72].copy_from_slice(&header.application_id.to_be_bytes());
buf[72..92].copy_from_slice(&header.reserved);
buf[72..92].copy_from_slice(&header.reserved_for_expansion);
buf[92..96].copy_from_slice(&header.version_valid_for.to_be_bytes());
buf[96..100].copy_from_slice(&header.version_number.to_be_bytes());
}
@@ -387,6 +475,12 @@ impl PageContent {
buf[self.offset + pos..self.offset + pos + 4].copy_from_slice(&value.to_be_bytes());
}
/// The second field of the b-tree page header is the offset of the first freeblock, or zero if there are no freeblocks on the page.
/// A freeblock is a structure used to identify unallocated space within a b-tree page.
/// Freeblocks are organized as a chain.
///
/// To be clear, freeblocks do not mean the regular unallocated free space to the left of the cell content area pointer, but instead
/// blocks of at least 4 bytes WITHIN the cell content area that are not in use due to e.g. deletions.
pub fn first_freeblock(&self) -> u16 {
self.read_u16(1)
}
@@ -395,10 +489,16 @@ impl PageContent {
self.read_u16(3) as usize
}
/// The start of the cell content area.
/// SQLite strives to place cells as far toward the end of the b-tree page as it can,
/// in order to leave space for future growth of the cell pointer array.
/// i.e. the cell content area pointer moves leftward as cells are added to the page
pub fn cell_content_area(&self) -> u16 {
self.read_u16(5)
}
/// The total number of bytes in all fragments is stored in the fifth field of the b-tree page header.
/// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area.
pub fn num_frag_free_bytes(&self) -> u8 {
self.read_u8(7)
}
@@ -416,22 +516,24 @@ impl PageContent {
&self,
idx: usize,
pager: Rc<Pager>,
max_local: usize,
min_local: usize,
payload_overflow_threshold_max: usize,
payload_overflow_threshold_min: usize,
usable_size: usize,
) -> Result<BTreeCell> {
log::debug!("cell_get(idx={})", idx);
let buf = self.as_ptr();
let ncells = self.cell_count();
let cell_start = match self.page_type() {
// the page header is 12 bytes for interior pages, 8 bytes for leaf pages
// this is because the last 4 bytes in an interior page's header are used for the rightmost pointer.
let cell_pointer_array_start = match self.page_type() {
PageType::IndexInterior => 12,
PageType::TableInterior => 12,
PageType::IndexLeaf => 8,
PageType::TableLeaf => 8,
};
assert!(idx < ncells, "cell_get: idx out of bounds");
let cell_pointer = cell_start + (idx * 2);
let cell_pointer = cell_pointer_array_start + (idx * 2);
let cell_pointer = self.read_u16(cell_pointer) as usize;
read_btree_cell(
@@ -439,13 +541,17 @@ impl PageContent {
&self.page_type(),
cell_pointer,
pager,
max_local,
min_local,
payload_overflow_threshold_max,
payload_overflow_threshold_min,
usable_size,
)
}
/// The cell pointer array of a b-tree page immediately follows the b-tree page header.
/// Let K be the number of cells on the btree page.
/// The cell pointer array consists of K 2-byte integer offsets to the cell contents.
/// The cell pointers are arranged in key order with:
/// - left-most cell (the cell with the smallest key) first and
/// - the right-most cell (the cell with the largest key) last.
pub fn cell_get_raw_pointer_region(&self) -> (usize, usize) {
let cell_start = match self.page_type() {
PageType::IndexInterior => 12,
@@ -460,27 +566,31 @@ impl PageContent {
pub fn cell_get_raw_region(
&self,
idx: usize,
max_local: usize,
min_local: usize,
payload_overflow_threshold_max: usize,
payload_overflow_threshold_min: usize,
usable_size: usize,
) -> (usize, usize) {
let buf = self.as_ptr();
let ncells = self.cell_count();
let cell_start = match self.page_type() {
let cell_pointer_array_start = match self.page_type() {
PageType::IndexInterior => 12,
PageType::TableInterior => 12,
PageType::IndexLeaf => 8,
PageType::TableLeaf => 8,
};
assert!(idx < ncells, "cell_get: idx out of bounds");
let cell_pointer = cell_start + (idx * 2);
let cell_pointer = cell_pointer_array_start + (idx * 2); // pointers are 2 bytes each
let cell_pointer = self.read_u16(cell_pointer) as usize;
let start = cell_pointer;
let len = match self.page_type() {
PageType::IndexInterior => {
let (len_payload, n_payload) = read_varint(&buf[cell_pointer + 4..]).unwrap();
let (overflows, to_read) =
payload_overflows(len_payload as usize, max_local, min_local, usable_size);
let (overflows, to_read) = payload_overflows(
len_payload as usize,
payload_overflow_threshold_max,
payload_overflow_threshold_min,
usable_size,
);
if overflows {
4 + to_read + n_payload + 4
} else {
@@ -493,8 +603,12 @@ impl PageContent {
}
PageType::IndexLeaf => {
let (len_payload, n_payload) = read_varint(&buf[cell_pointer..]).unwrap();
let (overflows, to_read) =
payload_overflows(len_payload as usize, max_local, min_local, usable_size);
let (overflows, to_read) = payload_overflows(
len_payload as usize,
payload_overflow_threshold_max,
payload_overflow_threshold_min,
usable_size,
);
if overflows {
to_read + n_payload + 4
} else {
@@ -504,8 +618,12 @@ impl PageContent {
PageType::TableLeaf => {
let (len_payload, n_payload) = read_varint(&buf[cell_pointer..]).unwrap();
let (_, n_rowid) = read_varint(&buf[cell_pointer + n_payload..]).unwrap();
let (overflows, to_read) =
payload_overflows(len_payload as usize, max_local, min_local, usable_size);
let (overflows, to_read) = payload_overflows(
len_payload as usize,
payload_overflow_threshold_max,
payload_overflow_threshold_min,
usable_size,
);
if overflows {
to_read + n_payload + n_rowid
} else {
@@ -1170,28 +1288,46 @@ pub fn begin_write_wal_header(io: &Rc<dyn File>, header: &WalHeader) -> Result<(
Ok(())
}
/*
Checks if payload will overflow a cell based on max local and
it will return the min size that will be stored in that case,
including overflow pointer
*/
/// Checks if payload will overflow a cell based on the maximum allowed size.
/// It will return the min size that will be stored in that case,
/// including the overflow pointer.
/// see e.g. https://github.com/sqlite/sqlite/blob/9591d3fe93936533c8c3b0dc4d025ac999539e11/src/dbstat.c#L371
pub fn payload_overflows(
payload_size: usize,
max_local: usize,
min_local: usize,
payload_overflow_threshold_max: usize,
payload_overflow_threshold_min: usize,
usable_size: usize,
) -> (bool, usize) {
if payload_size <= max_local {
if payload_size <= payload_overflow_threshold_max {
return (false, 0);
}
let mut space_left = min_local + (payload_size - min_local) % (usable_size - 4);
if space_left > max_local {
space_left = min_local;
let mut space_left = payload_overflow_threshold_min
+ (payload_size - payload_overflow_threshold_min) % (usable_size - 4);
if space_left > payload_overflow_threshold_max {
space_left = payload_overflow_threshold_min;
}
(true, space_left + 4)
}
/// The checksum is computed by interpreting the input as an even number of unsigned 32-bit integers: x(0) through x(N).
/// The 32-bit integers are big-endian if the magic number in the first 4 bytes of the WAL header is 0x377f0683
/// and the integers are little-endian if the magic number is 0x377f0682.
/// The checksum values are always stored in the frame header in a big-endian format regardless of which byte order is used to compute the checksum.
/// The checksum algorithm only works for content which is a multiple of 8 bytes in length.
/// In other words, if the inputs are x(0) through x(N) then N must be odd.
/// The checksum algorithm is as follows:
///
/// s0 = s1 = 0
/// for i from 0 to n-1 step 2:
/// s0 += x(i) + s1;
/// s1 += x(i+1) + s0;
/// endfor
///
/// The outputs s0 and s1 are both weighted checksums using Fibonacci weights in reverse order.
/// (The largest Fibonacci weight occurs on the first element of the sequence being summed.)
/// The s1 value spans all 32-bit integer terms of the sequence whereas s0 omits the final term.
pub fn checksum_wal(
buf: &[u8],
_wal_header: &WalHeader,
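A standalone transcription of the algorithm just described (editor's sketch, not the crate's checksum_wal): the byte order of the input words follows the WAL magic, checksums continue from an initial (s0, s1) pair, and arithmetic wraps modulo 2^32:

// Fibonacci-weighted WAL checksum over a buffer whose length is a
// multiple of 8 bytes, continuing from an initial (s0, s1) pair.
fn wal_checksum(buf: &[u8], big_endian: bool, init: (u32, u32)) -> (u32, u32) {
    assert_eq!(buf.len() % 8, 0);
    let (mut s0, mut s1) = init;
    for chunk in buf.chunks_exact(8) {
        let word = |b: &[u8]| -> u32 {
            let b: [u8; 4] = b.try_into().unwrap();
            if big_endian { u32::from_be_bytes(b) } else { u32::from_le_bytes(b) }
        };
        let (x0, x1) = (word(&chunk[0..4]), word(&chunk[4..8]));
        s0 = s0.wrapping_add(x0).wrapping_add(s1); // s0 += x(i) + s1
        s1 = s1.wrapping_add(x1).wrapping_add(s0); // s1 += x(i+1) + s0
    }
    (s0, s1)
}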

View File

@@ -386,7 +386,7 @@ fn query_pragma(
match pragma {
PragmaName::CacheSize => {
program.emit_insn(Insn::Integer {
value: database_header.borrow().default_cache_size.into(),
value: database_header.borrow().default_page_cache_size.into(),
dest: register,
});
}
@@ -424,7 +424,7 @@ fn update_cache_size(value: i64, header: Rc<RefCell<DatabaseHeader>>, pager: Rc<
}
// update in-memory header
header.borrow_mut().default_cache_size = cache_size_unformatted
header.borrow_mut().default_page_cache_size = cache_size_unformatted
.try_into()
.unwrap_or_else(|_| panic!("invalid value, too big for a i32 {}", value));