diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 19ba4c7fa..bdd27932b 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -20,22 +20,37 @@ use super::sqlite3_ondisk::{ /* These are offsets of fields in the header of a b-tree page. */ -const BTREE_HEADER_OFFSET_TYPE: usize = 0; /* type of btree page -> u8 */ -const BTREE_HEADER_OFFSET_FREEBLOCK: usize = 1; /* pointer to first freeblock -> u16 */ -const BTREE_HEADER_OFFSET_CELL_COUNT: usize = 3; /* number of cells in the page -> u16 */ -const BTREE_HEADER_OFFSET_CELL_CONTENT: usize = 5; /* pointer to first byte of cell allocated content from top -> u16 */ -const BTREE_HEADER_OFFSET_FRAGMENTED: usize = 7; /* number of fragmented bytes -> u8 */ -const BTREE_HEADER_OFFSET_RIGHTMOST: usize = 8; /* if internalnode, pointer right most pointer (saved separately from cells) -> u32 */ -/* -** Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than -** this will be declared corrupt. This value is calculated based on a -** maximum database size of 2^31 pages a minimum fanout of 2 for a -** root-node and 3 for all other internal nodes. -** -** If a tree that appears to be taller than this is encountered, it is -** assumed that the database is corrupt. -*/ +/// type of btree page -> u8 +const PAGE_HEADER_OFFSET_PAGE_TYPE: usize = 0; +/// pointer to first freeblock -> u16 +/// The second field of the b-tree page header is the offset of the first freeblock, or zero if there are no freeblocks on the page. +/// A freeblock is a structure used to identify unallocated space within a b-tree page. +/// Freeblocks are organized as a chain. +/// +/// To be clear, freeblocks do not mean the regular unallocated free space to the left of the cell content area pointer, but instead +/// blocks of at least 4 bytes WITHIN the cell content area that are not in use due to e.g. deletions. 
+const PAGE_HEADER_OFFSET_FIRST_FREEBLOCK: usize = 1; +/// number of cells in the page -> u16 +const PAGE_HEADER_OFFSET_CELL_COUNT: usize = 3; +/// pointer to first byte of cell allocated content from top -> u16 +/// SQLite strives to place cells as far toward the end of the b-tree page as it can, +/// in order to leave space for future growth of the cell pointer array. +/// = the cell content area pointer moves leftward as cells are added to the page +const PAGE_HEADER_OFFSET_CELL_CONTENT_AREA: usize = 5; +/// number of fragmented bytes -> u8 +/// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area. +const PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT: usize = 7; +/// if internalnode, pointer right most pointer (saved separately from cells) -> u32 +const PAGE_HEADER_OFFSET_RIGHTMOST_PTR: usize = 8; + +/// Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than +/// this will be declared corrupt. This value is calculated based on a +/// maximum database size of 2^31 pages a minimum fanout of 2 for a +/// root-node and 3 for all other internal nodes. +/// +/// If a tree that appears to be taller than this is encountered, it is +/// assumed that the database is corrupt. pub const BTCURSOR_MAX_DEPTH: usize = 20; /// Evaluate a Result>, if IO return IO. @@ -57,6 +72,8 @@ macro_rules! return_if_locked { }}; } +/// State machine of a write operation. +/// May involve balancing due to overflow. #[derive(Debug)] enum WriteState { Start, @@ -67,11 +84,16 @@ enum WriteState { } struct WriteInfo { + /// State of the write operation state machine. state: WriteState, + /// Pages allocated during the write operation due to balancing. new_pages: RefCell>, + /// Scratch space used during balancing. scratch_cells: RefCell>, + /// Bookkeeping of the rightmost pointer so the PAGE_HEADER_OFFSET_RIGHTMOST_PTR can be updated. 
rightmost_pointer: RefCell>, - page_copy: RefCell>, // this holds the copy a of a page needed for buffer references + /// Copy of the current page needed for buffer references. + page_copy: RefCell>, } pub struct BTreeCursor { @@ -142,6 +164,8 @@ impl BTreeCursor { } } + /// Check if the table is empty. + /// This is done by checking if the root page has no cells. fn is_empty_table(&mut self) -> Result> { let page = self.pager.read_page(self.root_page)?; return_if_locked!(page); @@ -150,16 +174,18 @@ impl BTreeCursor { Ok(CursorResult::Ok(cell_count == 0)) } + /// Move the cursor to the previous record and return it. + /// Used in backwards iteration. fn get_prev_record(&mut self) -> Result, Option)>> { loop { let page = self.stack.top(); - let cell_idx = self.stack.current_index(); + let cell_idx = self.stack.current_cell_index(); - // moved to current page begin + // moved to beginning of current page // todo: find a better way to flag moved to end or begin of page - if self.stack.curr_idx_out_of_begin() { + if self.stack.current_cell_index_less_than_min() { loop { - if self.stack.current_index() > 0 { + if self.stack.current_cell_index() > 0 { self.stack.retreat(); break; } @@ -198,8 +224,8 @@ impl BTreeCursor { let cell = contents.cell_get( cell_idx, self.pager.clone(), - self.max_local(contents.page_type()), - self.min_local(contents.page_type()), + self.payload_overflow_threshold_max(contents.page_type()), + self.payload_overflow_threshold_min(contents.page_type()), self.usable_space(), )?; @@ -228,13 +254,15 @@ impl BTreeCursor { } } + /// Move the cursor to the next record and return it. + /// Used in forwards iteration, which is the default. 
fn get_next_record( &mut self, predicate: Option<(SeekKey<'_>, SeekOp)>, ) -> Result, Option)>> { loop { let mem_page_rc = self.stack.top(); - let cell_idx = self.stack.current_index() as usize; + let cell_idx = self.stack.current_cell_index() as usize; debug!("current id={} cell={}", mem_page_rc.get().id, cell_idx); return_if_locked!(mem_page_rc); @@ -286,8 +314,8 @@ impl BTreeCursor { let cell = contents.cell_get( cell_idx, self.pager.clone(), - self.max_local(contents.page_type()), - self.min_local(contents.page_type()), + self.payload_overflow_threshold_max(contents.page_type()), + self.payload_overflow_threshold_min(contents.page_type()), self.usable_space(), )?; match &cell { @@ -386,6 +414,9 @@ impl BTreeCursor { } } + /// Move the cursor to the record that matches the seek key and seek operation. + /// This may be used to seek to a specific record in a point query (e.g. SELECT * FROM table WHERE col = 10) + /// or e.g. find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10). fn seek( &mut self, key: SeekKey<'_>, @@ -403,8 +434,8 @@ impl BTreeCursor { let cell = contents.cell_get( cell_idx, self.pager.clone(), - self.max_local(contents.page_type()), - self.min_local(contents.page_type()), + self.payload_overflow_threshold_max(contents.page_type()), + self.payload_overflow_threshold_min(contents.page_type()), self.usable_space(), )?; match &cell { @@ -476,12 +507,14 @@ impl BTreeCursor { Ok(CursorResult::Ok((None, None))) } + /// Move the cursor to the root page of the btree. fn move_to_root(&mut self) { let mem_page = self.pager.read_page(self.root_page).unwrap(); self.stack.clear(); self.stack.push(mem_page); } + /// Move the cursor to the rightmost record in the btree. 
fn move_to_rightmost(&mut self) -> Result> { self.move_to_root(); @@ -553,8 +586,8 @@ impl BTreeCursor { match &contents.cell_get( cell_idx, self.pager.clone(), - self.max_local(contents.page_type()), - self.min_local(contents.page_type()), + self.payload_overflow_threshold_max(contents.page_type()), + self.payload_overflow_threshold_min(contents.page_type()), self.usable_space(), )? { BTreeCell::TableInteriorCell(TableInteriorCell { @@ -634,6 +667,8 @@ impl BTreeCursor { } } + /// Insert a record into the btree. + /// If the insert operation overflows the page, it will be split and the btree will be balanced. fn insert_into_page( &mut self, key: &OwnedValue, @@ -700,7 +735,11 @@ impl BTreeCursor { } } - /* insert to position and shift other pointers */ + /// Insert a record into a cell. + /// If the cell overflows, an overflow cell is created. + /// insert_into_cell() is called from insert_into_page(), + /// and the overflow cell count is used to determine if the page overflows, + /// i.e. whether we need to balance the btree after the insert. fn insert_into_cell(&self, page: &mut PageContent, payload: &[u8], cell_idx: usize) { let free = self.compute_free_space(page, RefCell::borrow(&self.database_header)); let enough_space = payload.len() + 2 <= free as usize; @@ -734,41 +773,54 @@ impl BTreeCursor { page.write_u16(pointer_area_pc_by_idx - page.offset, pc); // update first byte of content area - page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, pc); + page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, pc); // update cell count let new_n_cells = (page.cell_count() + 1) as u16; - page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, new_n_cells); + page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, new_n_cells); } + /// Free the range of bytes that a cell occupies. + /// This function also updates the freeblock list in the page. + /// Freeblocks are used to keep track of free space in the page, + /// and are organized as a linked list. 
fn free_cell_range(&self, page: &mut PageContent, offset: u16, len: u16) { + // if the freeblock list is empty, we set this block as the first freeblock in the page header. if page.first_freeblock() == 0 { - // insert into empty list - page.write_u16(offset as usize, 0); - page.write_u16(offset as usize + 2, len); - page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, offset); + page.write_u16(offset as usize, 0); // next freeblock = null + page.write_u16(offset as usize + 2, len); // size of this freeblock + page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, offset); // first freeblock in page = this block return; } let first_block = page.first_freeblock(); + // if the freeblock list is not empty, and the offset is less than the first freeblock, + // we insert this block at the head of the list if offset < first_block { - // insert into head of list - page.write_u16(offset as usize, first_block); - page.write_u16(offset as usize + 2, len); - page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, offset); + page.write_u16(offset as usize, first_block); // next freeblock = previous first freeblock + page.write_u16(offset as usize + 2, len); // size of this freeblock + page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, offset); // first freeblock in page = this block return; } + // if we clear space that is at the start of the cell content area, + // we need to update the cell content area pointer forward to account for the removed space + // FIXME: is offset ever < cell_content_area? cell content area grows leftwards and the pointer + // is to the start of the last allocated cell. should we assert!(offset >= page.cell_content_area()) + // and change this to if offset == page.cell_content_area()? 
if offset <= page.cell_content_area() { - // extend boundary of content area - page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, page.first_freeblock()); - page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, offset + len); + // FIXME: remove the line directly below this, it does not change anything. + page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, page.first_freeblock()); + page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, offset + len); return; } + // if the freeblock list is not empty, and the offset is greater than the first freeblock, + // then we need to do some more calculation to figure out where to insert the freeblock + // in the freeblock linked list. let maxpc = { let db_header = self.database_header.borrow(); - let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; + let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize; usable_space as u16 }; @@ -799,17 +851,23 @@ impl BTreeCursor { } } + /// Drop a cell from a page. + /// This is done by freeing the range of bytes that the cell occupies. fn drop_cell(&self, page: &mut PageContent, cell_idx: usize) { let (cell_start, cell_len) = page.cell_get_raw_region( cell_idx, - self.max_local(page.page_type()), - self.min_local(page.page_type()), + self.payload_overflow_threshold_max(page.page_type()), + self.payload_overflow_threshold_min(page.page_type()), self.usable_space(), ); self.free_cell_range(page, cell_start as u16, cell_len as u16); - page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1); + page.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, page.cell_count() as u16 - 1); } + /// Balance a leaf page. + /// Balancing is done when a page overflows. + /// see e.g. https://en.wikipedia.org/wiki/B-tree + /// /// This is a naive algorithm that doesn't try to distribute cells evenly by content. /// It will try to split the page in half by keys not by content. /// Sqlite tries to have a page at least 40% full. 
@@ -852,8 +910,8 @@ impl BTreeCursor { for cell_idx in 0..page_copy.cell_count() { let (start, len) = page_copy.cell_get_raw_region( cell_idx, - self.max_local(page_copy.page_type()), - self.min_local(page_copy.page_type()), + self.payload_overflow_threshold_max(page_copy.page_type()), + self.payload_overflow_threshold_min(page_copy.page_type()), self.usable_space(), ); let buf = page_copy.as_ptr(); @@ -930,14 +988,14 @@ impl BTreeCursor { assert_eq!(parent_contents.overflow_cells.len(), 0); // Right page pointer is u32 in right most pointer, and in cell is u32 too, so we can use a *u32 to hold where we want to change this value - let mut right_pointer = BTREE_HEADER_OFFSET_RIGHTMOST; + let mut right_pointer = PAGE_HEADER_OFFSET_RIGHTMOST_PTR; for cell_idx in 0..parent_contents.cell_count() { let cell = parent_contents .cell_get( cell_idx, self.pager.clone(), - self.max_local(page_type.clone()), - self.min_local(page_type.clone()), + self.payload_overflow_threshold_max(page_type.clone()), + self.payload_overflow_threshold_min(page_type.clone()), self.usable_space(), ) .unwrap(); @@ -950,8 +1008,8 @@ impl BTreeCursor { if found { let (start, _len) = parent_contents.cell_get_raw_region( cell_idx, - self.max_local(page_type.clone()), - self.min_local(page_type.clone()), + self.payload_overflow_threshold_max(page_type.clone()), + self.payload_overflow_threshold_min(page_type.clone()), self.usable_space(), ); right_pointer = start; @@ -967,17 +1025,20 @@ impl BTreeCursor { assert!(page.is_dirty()); let contents = page.get().contents.as_mut().unwrap(); - contents.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, 0); - contents.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, 0); + contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); + contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); let db_header = RefCell::borrow(&self.database_header); let cell_content_area_start = - db_header.page_size - db_header.unused_space as u16; - contents.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, 
cell_content_area_start); + db_header.page_size - db_header.reserved_space as u16; + contents.write_u16( + PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, + cell_content_area_start, + ); - contents.write_u8(BTREE_HEADER_OFFSET_FRAGMENTED, 0); + contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); if !contents.is_leaf() { - contents.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, 0); + contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, 0); } } @@ -1035,8 +1096,8 @@ impl BTreeCursor { .cell_get( contents.cell_count() - 1, self.pager.clone(), - self.max_local(contents.page_type()), - self.min_local(contents.page_type()), + self.payload_overflow_threshold_max(contents.page_type()), + self.payload_overflow_threshold_min(contents.page_type()), self.usable_space(), ) .unwrap(); @@ -1045,13 +1106,13 @@ impl BTreeCursor { _ => unreachable!(), }; self.drop_cell(contents, contents.cell_count() - 1); - contents.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, last_cell_pointer); + contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, last_cell_pointer); } // last page right most pointer points to previous right most pointer before splitting let last_page = new_pages.last().unwrap(); let last_page_contents = last_page.get().contents.as_mut().unwrap(); last_page_contents.write_u32( - BTREE_HEADER_OFFSET_RIGHTMOST, + PAGE_HEADER_OFFSET_RIGHTMOST_PTR, self.write_info.rightmost_pointer.borrow().unwrap(), ); } @@ -1069,8 +1130,8 @@ impl BTreeCursor { &contents.page_type(), 0, self.pager.clone(), - self.max_local(contents.page_type()), - self.min_local(contents.page_type()), + self.payload_overflow_threshold_max(contents.page_type()), + self.payload_overflow_threshold_min(contents.page_type()), self.usable_space(), ) .unwrap(); @@ -1119,6 +1180,9 @@ impl BTreeCursor { } } + /// Balance the root page. + /// This is done when the root page overflows, and we need to create a new root page. + /// See e.g. 
https://en.wikipedia.org/wiki/B-tree fn balance_root(&mut self) { /* todo: balance deeper, create child and copy contents of root there. Then split root */ /* if we are in root page then we just need to create a new root and push key there */ @@ -1145,8 +1209,8 @@ impl BTreeCursor { } // point new root right child to previous root new_root_page_contents - .write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, new_root_page_id as u32); - new_root_page_contents.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, 0); + .write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, new_root_page_id as u32); + new_root_page_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); } /* swap splitted page buffer with new root buffer so we don't have to update page idx */ @@ -1195,12 +1259,16 @@ impl BTreeCursor { } } + /// Allocate a new page to the btree via the pager. + /// This marks the page as dirty and writes the page header. fn allocate_page(&self, page_type: PageType, offset: usize) -> PageRef { let page = self.pager.allocate_page().unwrap(); btree_init_page(&page, page_type, &self.database_header.borrow(), offset); page } + /// Allocate a new overflow page. + /// This is done when a cell overflows and new space is needed. fn allocate_overflow_page(&self) -> PageRef { let page = self.pager.allocate_page().unwrap(); @@ -1212,9 +1280,7 @@ impl BTreeCursor { page } - /* - Allocate space for a cell on a page. - */ + /// Allocate space for a cell on a page. 
fn allocate_cell_space(&self, page_ref: &PageContent, amount: u16) -> u16 { let amount = amount as usize; @@ -1236,24 +1302,25 @@ impl BTreeCursor { if gap + 2 + amount > top { // defragment self.defragment_page(page_ref, RefCell::borrow(&self.database_header)); - top = page_ref.read_u16(BTREE_HEADER_OFFSET_CELL_CONTENT) as usize; + top = page_ref.read_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA) as usize; } let db_header = RefCell::borrow(&self.database_header); top -= amount; - page_ref.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, top as u16); + page_ref.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, top as u16); - let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; + let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize; assert!(top + amount <= usable_space); top as u16 } + /// Defragment a page. This means packing all the cells to the end of the page. fn defragment_page(&self, page: &PageContent, db_header: Ref) { log::debug!("defragment_page"); let cloned_page = page.clone(); // TODO(pere): usable space should include offset probably - let usable_space = (db_header.page_size - db_header.unused_space as u16) as u64; + let usable_space = (db_header.page_size - db_header.reserved_space as u16) as u64; let mut cbrk = usable_space; // TODO: implement fast algorithm @@ -1330,24 +1397,33 @@ impl BTreeCursor { let write_buf = page.as_ptr(); // set new first byte of cell content - page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, cbrk as u16); + page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, cbrk as u16); // set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start - page.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, 0); + page.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); // set unused space to 0 let first_cell = cloned_page.cell_content_area() as u64; assert!(first_cell <= cbrk); write_buf[first_cell as usize..cbrk as usize].fill(0); } - // Free blocks can be 
zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte - // and end of cell pointer area. + /// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte + /// and end of cell pointer area. #[allow(unused_assignments)] fn compute_free_space(&self, page: &PageContent, db_header: Ref) -> u16 { // TODO(pere): maybe free space is not calculated correctly with offset let buf = page.as_ptr(); - let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; + let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize; let mut first_byte_in_cell_content = page.cell_content_area(); + // A zero value for the cell content area pointer is interpreted as 65536. + // See https://www.sqlite.org/fileformat.html + // The max page size for a sqlite database is 64kiB i.e. 65536 bytes. + // 65536 is u16::MAX + 1, and since cell content grows from right to left, this means + // the cell content area pointer is at the end of the page, + // i.e. + // 1. the page size is 64kiB + // 2. there are no cells on the page + // 3. there is no reserved space at the end of the page if first_byte_in_cell_content == 0 { first_byte_in_cell_content = u16::MAX; } @@ -1360,12 +1436,16 @@ impl BTreeCursor { let child_pointer_size = if page.is_leaf() { 0 } else { 4 }; let first_cell = (page.offset + 8 + child_pointer_size + (2 * ncell)) as u16; + // The amount of free space is the sum of: + // 1. 0..first_byte_in_cell_content (everything to the left of the cell content area pointer is unused free space) + // 2. fragmented_free_bytes. let mut nfree = fragmented_free_bytes as usize + first_byte_in_cell_content as usize; let mut pc = free_block_pointer as usize; if pc > 0 { if pc < first_byte_in_cell_content as usize { - // corrupt + // Freeblocks exist in the cell content area e.g. 
after deletions + // They should never exist in the unused area of the page. todo!("corrupted page"); } @@ -1399,6 +1479,8 @@ impl BTreeCursor { nfree as u16 } + /// Fill in the cell payload with the record. + /// If the record is too large to fit in the cell, it will spill onto overflow pages. fn fill_cell_payload( &self, page_type: PageType, @@ -1423,13 +1505,13 @@ impl BTreeCursor { write_varint_to_vec(record_buf.len() as u64, cell_payload); } - let max_local = self.max_local(page_type.clone()); + let payload_overflow_threshold_max = self.payload_overflow_threshold_max(page_type.clone()); log::debug!( - "fill_cell_payload(record_size={}, max_local={})", + "fill_cell_payload(record_size={}, payload_overflow_threshold_max={})", record_buf.len(), - max_local + payload_overflow_threshold_max ); - if record_buf.len() <= max_local { + if record_buf.len() <= payload_overflow_threshold_max { // enough allowed space to fit inside a btree page cell_payload.extend_from_slice(record_buf.as_slice()); cell_payload.resize(cell_payload.len() + 4, 0); @@ -1437,11 +1519,13 @@ impl BTreeCursor { } log::debug!("fill_cell_payload(overflow)"); - let min_local = self.min_local(page_type); - let mut space_left = min_local + (record_buf.len() - min_local) % (self.usable_space() - 4); + let payload_overflow_threshold_min = self.payload_overflow_threshold_min(page_type); + // see e.g. https://github.com/sqlite/sqlite/blob/9591d3fe93936533c8c3b0dc4d025ac999539e11/src/dbstat.c#L371 + let mut space_left = payload_overflow_threshold_min + + (record_buf.len() - payload_overflow_threshold_min) % (self.usable_space() - 4); - if space_left > max_local { - space_left = min_local; + if space_left > payload_overflow_threshold_max { + space_left = payload_overflow_threshold_min; } // cell_size must be equal to first value of space_left as this will be the bytes copied to non-overflow page. 
@@ -1487,31 +1571,54 @@ impl BTreeCursor { assert_eq!(cell_size, cell_payload.len()); } - fn max_local(&self, page_type: PageType) -> usize { - let usable_space = self.usable_space(); + /// Returns the maximum payload size (X) that can be stored directly on a b-tree page without spilling to overflow pages. + /// + /// For table leaf pages: X = usable_size - 35 + /// For index pages: X = ((usable_size - 12) * 64/255) - 23 + /// + /// The usable size is the total page size less the reserved space at the end of each page. + /// These thresholds are designed to: + /// - Give a minimum fanout of 4 for index b-trees + /// - Ensure enough payload is on the b-tree page that the record header can usually be accessed + /// without consulting an overflow page + fn payload_overflow_threshold_max(&self, page_type: PageType) -> usize { + let usable_size = self.usable_space(); match page_type { - PageType::IndexInterior | PageType::TableInterior => { - (usable_space - 12) * 64 / 255 - 23 + PageType::IndexInterior | PageType::IndexLeaf => { + ((usable_size - 12) * 64 / 255) - 23 // Index page formula + } + PageType::TableInterior | PageType::TableLeaf => { + usable_size - 35 // Table leaf page formula } - PageType::IndexLeaf | PageType::TableLeaf => usable_space - 35, } } - fn min_local(&self, page_type: PageType) -> usize { - let usable_space = self.usable_space(); - match page_type { - PageType::IndexInterior | PageType::TableInterior => { - (usable_space - 12) * 32 / 255 - 23 - } - PageType::IndexLeaf | PageType::TableLeaf => (usable_space - 12) * 32 / 255 - 23, - } + /// Returns the minimum payload size (M) that must be stored on the b-tree page before spilling to overflow pages is allowed. 
+ ///
+ /// For all page types: M = ((usable_size - 12) * 32/255) - 23
+ ///
+ /// When payload size P exceeds payload_overflow_threshold_max():
+ /// - If K = M + ((P-M) % (usable_size-4)) <= payload_overflow_threshold_max(): store K bytes on page
+ /// - Otherwise: store M bytes on page
+ ///
+ /// The remaining bytes are stored on overflow pages in both cases.
+ fn payload_overflow_threshold_min(&self, _page_type: PageType) -> usize {
+ let usable_size = self.usable_space();
+ // Same formula for all page types
+ ((usable_size - 12) * 32 / 255) - 23
+ }
 
+ /// The "usable size" of a database page is the page size specified by the 2-byte integer at offset 16
+ /// in the header, minus the "reserved" space size recorded in the 1-byte integer at offset 20 in the header.
+ /// The usable size of a page might be an odd number. However, the usable size is not allowed to be less than 480.
+ /// In other words, if the page size is 512, then the reserved space size cannot exceed 32.
 fn usable_space(&self) -> usize {
 let db_header = RefCell::borrow(&self.database_header);
- (db_header.page_size - db_header.unused_space as u16) as usize
+ (db_header.page_size - db_header.reserved_space as u16) as usize
 }
 
+ /// Find the index of the cell in the page that contains the given rowid.
+ /// BTree tables only.
 fn find_cell(&self, page: &PageContent, int_key: u64) -> usize {
 let mut cell_idx = 0;
 let cell_count = page.cell_count();
 @@ -1520,8 +1627,8 @@ impl BTreeCursor {
 .cell_get(
 cell_idx,
 self.pager.clone(),
- self.max_local(page.page_type()),
- self.min_local(page.page_type()),
+ self.payload_overflow_threshold_max(page.page_type()),
+ self.payload_overflow_threshold_min(page.page_type()),
 self.usable_space(),
 )
 .unwrap()
 @@ -1545,6 +1652,8 @@ impl BTreeCursor {
 }
 
 impl PageStack {
+ /// Push a new page onto the stack.
+ /// This effectively means traversing to a child page. 
fn push(&self, page: PageRef) { debug!( "pagestack::push(current={}, new_page_id={})", @@ -1561,6 +1670,8 @@ impl PageStack { self.cell_indices.borrow_mut()[current as usize] = 0; } + /// Pop a page off the stack. + /// This effectively means traversing back up to a parent page. fn pop(&self) { let current = *self.current_page.borrow(); debug!("pagestack::pop(current={})", current); @@ -1569,6 +1680,8 @@ impl PageStack { *self.current_page.borrow_mut() -= 1; } + /// Get the top page on the stack. + /// This is the page that is currently being traversed. fn top(&self) -> PageRef { let current = *self.current_page.borrow(); let page = self.stack.borrow()[current as usize] @@ -1583,6 +1696,7 @@ impl PageStack { page } + /// Get the parent page of the current page. fn parent(&self) -> PageRef { let current = *self.current_page.borrow(); self.stack.borrow()[current as usize - 1] @@ -1597,13 +1711,15 @@ impl PageStack { } /// Cell index of the current page - fn current_index(&self) -> i32 { + fn current_cell_index(&self) -> i32 { let current = self.current(); self.cell_indices.borrow()[current] } - fn curr_idx_out_of_begin(&self) -> bool { - let cell_idx = self.current_index(); + /// Check if the current cell index is less than 0. + /// This means we have been iterating backwards and have reached the start of the page. 
+ fn current_cell_index_less_than_min(&self) -> bool { + let cell_idx = self.current_cell_index(); cell_idx < 0 } @@ -1639,7 +1755,7 @@ fn find_free_cell(page_ref: &PageContent, db_header: Ref, amount let buf = page_ref.as_ptr(); - let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; + let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize; let maxpc = usable_space - amount; let mut found = false; while pc <= maxpc { @@ -1785,8 +1901,8 @@ impl Cursor for BTreeCursor { let equals = match &contents.cell_get( cell_idx, self.pager.clone(), - self.max_local(contents.page_type()), - self.min_local(contents.page_type()), + self.payload_overflow_threshold_max(contents.page_type()), + self.payload_overflow_threshold_min(contents.page_type()), self.usable_space(), )? { BTreeCell::TableLeafCell(l) => l._rowid == int_key, @@ -1823,15 +1939,18 @@ pub fn btree_init_page( let contents = contents.contents.as_mut().unwrap(); contents.offset = offset; let id = page_type as u8; - contents.write_u8(BTREE_HEADER_OFFSET_TYPE, id); - contents.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, 0); - contents.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, 0); + contents.write_u8(PAGE_HEADER_OFFSET_PAGE_TYPE, id); + contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); + contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); - let cell_content_area_start = db_header.page_size - db_header.unused_space as u16; - contents.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, cell_content_area_start); + let cell_content_area_start = db_header.page_size - db_header.reserved_space as u16; + contents.write_u16( + PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, + cell_content_area_start, + ); - contents.write_u8(BTREE_HEADER_OFFSET_FRAGMENTED, 0); - contents.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, 0); + contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0); + contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, 0); } fn to_static_buf(buf: &[u8]) -> &'static 
[u8] { diff --git a/core/storage/pager.rs b/core/storage/pager.rs index 105a8a75a..cd934d42a 100644 --- a/core/storage/pager.rs +++ b/core/storage/pager.rs @@ -482,7 +482,7 @@ impl Pager { pub fn usable_size(&self) -> usize { let db_header = self.db_header.borrow(); - (db_header.page_size - db_header.unused_space as u16) as usize + (db_header.page_size - db_header.reserved_space as u16) as usize } } diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index a1a8aec0c..15d5b2c6c 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -64,30 +64,84 @@ const DEFAULT_CACHE_SIZE: i32 = -2000; // Minimum number of pages that cache can hold. pub const MIN_PAGE_CACHE_SIZE: usize = 10; +/// The database header. +/// The first 100 bytes of the database file comprise the database file header. +/// The database file header is divided into fields as shown by the table below. +/// All multibyte fields in the database file header are stored with the most significant byte first (big-endian). #[derive(Debug, Clone)] pub struct DatabaseHeader { + /// The header string: "SQLite format 3\0" magic: [u8; 16], + + /// The database page size in bytes. Must be a power of two between 512 and 32768 inclusive, + /// or the value 1 representing a page size of 65536. pub page_size: u16, + + /// File format write version. 1 for legacy; 2 for WAL. write_version: u8, + + /// File format read version. 1 for legacy; 2 for WAL. read_version: u8, - pub unused_space: u8, + + /// Bytes of unused "reserved" space at the end of each page. Usually 0. + /// SQLite has the ability to set aside a small number of extra bytes at the end of every page for use by extensions. + /// These extra bytes are used, for example, by the SQLite Encryption Extension to store a nonce and/or + /// cryptographic checksum associated with each page. + pub reserved_space: u8, + + /// Maximum embedded payload fraction. Must be 64. 
max_embed_frac: u8, + + /// Minimum embedded payload fraction. Must be 32. min_embed_frac: u8, + + /// Leaf payload fraction. Must be 32. min_leaf_frac: u8, + + /// File change counter, incremented when database is modified. change_counter: u32, + + /// Size of the database file in pages. The "in-header database size". pub database_size: u32, + + /// Page number of the first freelist trunk page. freelist_trunk_page: u32, + + /// Total number of freelist pages. freelist_pages: u32, + + /// The schema cookie. Incremented when the database schema changes. schema_cookie: u32, + + /// The schema format number. Supported formats are 1, 2, 3, and 4. schema_format: u32, - pub default_cache_size: i32, - vacuum: u32, + + /// Default page cache size. + pub default_page_cache_size: i32, + + /// The page number of the largest root b-tree page when in auto-vacuum or + /// incremental-vacuum modes, or zero otherwise. + vacuum_mode_largest_root_page: u32, + + /// The database text encoding. 1=UTF-8, 2=UTF-16le, 3=UTF-16be. text_encoding: u32, + + /// The "user version" as read and set by the user_version pragma. user_version: u32, - incremental_vacuum: u32, + + /// True (non-zero) for incremental-vacuum mode. False (zero) otherwise. + incremental_vacuum_enabled: u32, + + /// The "Application ID" set by PRAGMA application_id. application_id: u32, - reserved: [u8; 20], + + /// Reserved for expansion. Must be zero. + reserved_for_expansion: [u8; 20], + + /// The version-valid-for number. version_valid_for: u32, + + /// SQLITE_VERSION_NUMBER pub version_number: u32, } @@ -98,28 +152,62 @@ pub const WAL_FRAME_HEADER_SIZE: usize = 24; pub const WAL_MAGIC_LE: u32 = 0x377f0682; pub const WAL_MAGIC_BE: u32 = 0x377f0683; +/// The Write-Ahead Log (WAL) header. +/// The first 32 bytes of a WAL file comprise the WAL header. +/// The WAL header is divided into the following fields stored in big-endian order. 
#[derive(Debug, Default, Clone)] #[repr(C)] // This helps with encoding because rust does not respect the order in structs, so in // this case we want to keep the order pub struct WalHeader { + /// Magic number. 0x377f0682 or 0x377f0683 + /// If the LSB is 0, checksums are native byte order, else checksums are serialized pub magic: u32, + + /// WAL format version. Currently 3007000 pub file_format: u32, + + /// Database page size in bytes. Power of two between 512 and 32768 inclusive pub page_size: u32, + + /// Checkpoint sequence number. Increases with each checkpoint pub checkpoint_seq: u32, + + /// Random value used for the first salt in checksum calculations pub salt_1: u32, + + /// Random value used for the second salt in checksum calculations pub salt_2: u32, + + /// First checksum value in the wal-header pub checksum_1: u32, + + /// Second checksum value in the wal-header pub checksum_2: u32, } +/// Immediately following the wal-header are zero or more frames. +/// Each frame consists of a 24-byte frame-header followed by bytes of page data. +/// The frame-header is six big-endian 32-bit unsigned integer values, as follows: #[allow(dead_code)] #[derive(Debug, Default)] pub struct WalFrameHeader { + /// Page number page_number: u32, + + /// For commit records, the size of the database file in pages after the commit. + /// For all other records, zero. 
db_size: u32, + + /// Salt-1 copied from the WAL header salt_1: u32, + + /// Salt-2 copied from the WAL header salt_2: u32, + + /// Checksum-1: Cumulative checksum up through and including this page checksum_1: u32, + + /// Checksum-2: Second half of the cumulative checksum checksum_2: u32, } @@ -130,7 +218,7 @@ impl Default for DatabaseHeader { page_size: 4096, write_version: 2, read_version: 2, - unused_space: 0, + reserved_space: 0, max_embed_frac: 64, min_embed_frac: 32, min_leaf_frac: 32, @@ -140,13 +228,13 @@ impl Default for DatabaseHeader { freelist_pages: 0, schema_cookie: 0, schema_format: 4, // latest format, new sqlite3 databases use this format - default_cache_size: 500, // pages - vacuum: 0, + default_page_cache_size: 500, // pages + vacuum_mode_largest_root_page: 0, text_encoding: 1, // utf-8 user_version: 1, - incremental_vacuum: 0, + incremental_vacuum_enabled: 0, application_id: 0, - reserved: [0; 20], + reserved_for_expansion: [0; 20], version_valid_for: 3047000, version_number: 3047000, } @@ -180,7 +268,7 @@ fn finish_read_database_header( header.page_size = u16::from_be_bytes([buf[16], buf[17]]); header.write_version = buf[18]; header.read_version = buf[19]; - header.unused_space = buf[20]; + header.reserved_space = buf[20]; header.max_embed_frac = buf[21]; header.min_embed_frac = buf[22]; header.min_leaf_frac = buf[23]; @@ -190,16 +278,16 @@ fn finish_read_database_header( header.freelist_pages = u32::from_be_bytes([buf[36], buf[37], buf[38], buf[39]]); header.schema_cookie = u32::from_be_bytes([buf[40], buf[41], buf[42], buf[43]]); header.schema_format = u32::from_be_bytes([buf[44], buf[45], buf[46], buf[47]]); - header.default_cache_size = i32::from_be_bytes([buf[48], buf[49], buf[50], buf[51]]); - if header.default_cache_size == 0 { - header.default_cache_size = DEFAULT_CACHE_SIZE; + header.default_page_cache_size = i32::from_be_bytes([buf[48], buf[49], buf[50], buf[51]]); + if header.default_page_cache_size == 0 { + 
header.default_page_cache_size = DEFAULT_CACHE_SIZE; } - header.vacuum = u32::from_be_bytes([buf[52], buf[53], buf[54], buf[55]]); + header.vacuum_mode_largest_root_page = u32::from_be_bytes([buf[52], buf[53], buf[54], buf[55]]); header.text_encoding = u32::from_be_bytes([buf[56], buf[57], buf[58], buf[59]]); header.user_version = u32::from_be_bytes([buf[60], buf[61], buf[62], buf[63]]); - header.incremental_vacuum = u32::from_be_bytes([buf[64], buf[65], buf[66], buf[67]]); + header.incremental_vacuum_enabled = u32::from_be_bytes([buf[64], buf[65], buf[66], buf[67]]); header.application_id = u32::from_be_bytes([buf[68], buf[69], buf[70], buf[71]]); - header.reserved.copy_from_slice(&buf[72..92]); + header.reserved_for_expansion.copy_from_slice(&buf[72..92]); header.version_valid_for = u32::from_be_bytes([buf[92], buf[93], buf[94], buf[95]]); header.version_number = u32::from_be_bytes([buf[96], buf[97], buf[98], buf[99]]); Ok(()) @@ -258,7 +346,7 @@ fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) { buf[16..18].copy_from_slice(&header.page_size.to_be_bytes()); buf[18] = header.write_version; buf[19] = header.read_version; - buf[20] = header.unused_space; + buf[20] = header.reserved_space; buf[21] = header.max_embed_frac; buf[22] = header.min_embed_frac; buf[23] = header.min_leaf_frac; @@ -268,15 +356,15 @@ fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) { buf[36..40].copy_from_slice(&header.freelist_pages.to_be_bytes()); buf[40..44].copy_from_slice(&header.schema_cookie.to_be_bytes()); buf[44..48].copy_from_slice(&header.schema_format.to_be_bytes()); - buf[48..52].copy_from_slice(&header.default_cache_size.to_be_bytes()); + buf[48..52].copy_from_slice(&header.default_page_cache_size.to_be_bytes()); - buf[52..56].copy_from_slice(&header.vacuum.to_be_bytes()); + buf[52..56].copy_from_slice(&header.vacuum_mode_largest_root_page.to_be_bytes()); buf[56..60].copy_from_slice(&header.text_encoding.to_be_bytes()); 
buf[60..64].copy_from_slice(&header.user_version.to_be_bytes()); - buf[64..68].copy_from_slice(&header.incremental_vacuum.to_be_bytes()); + buf[64..68].copy_from_slice(&header.incremental_vacuum_enabled.to_be_bytes()); buf[68..72].copy_from_slice(&header.application_id.to_be_bytes()); - buf[72..92].copy_from_slice(&header.reserved); + buf[72..92].copy_from_slice(&header.reserved_for_expansion); buf[92..96].copy_from_slice(&header.version_valid_for.to_be_bytes()); buf[96..100].copy_from_slice(&header.version_number.to_be_bytes()); } @@ -387,6 +475,12 @@ impl PageContent { buf[self.offset + pos..self.offset + pos + 4].copy_from_slice(&value.to_be_bytes()); } + /// The second field of the b-tree page header is the offset of the first freeblock, or zero if there are no freeblocks on the page. + /// A freeblock is a structure used to identify unallocated space within a b-tree page. + /// Freeblocks are organized as a chain. + /// + /// To be clear, freeblocks do not mean the regular unallocated free space to the left of the cell content area pointer, but instead + /// blocks of at least 4 bytes WITHIN the cell content area that are not in use due to e.g. deletions. pub fn first_freeblock(&self) -> u16 { self.read_u16(1) } @@ -395,10 +489,16 @@ impl PageContent { self.read_u16(3) as usize } + /// The start of the cell content area. + /// SQLite strives to place cells as far toward the end of the b-tree page as it can, + /// in order to leave space for future growth of the cell pointer array. + /// = the cell content area pointer moves leftward as cells are added to the page pub fn cell_content_area(&self) -> u16 { self.read_u16(5) } + /// The total number of bytes in all fragments is stored in the fifth field of the b-tree page header. + /// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area. 
pub fn num_frag_free_bytes(&self) -> u8 { self.read_u8(7) } @@ -416,22 +516,24 @@ impl PageContent { &self, idx: usize, pager: Rc, - max_local: usize, - min_local: usize, + payload_overflow_threshold_max: usize, + payload_overflow_threshold_min: usize, usable_size: usize, ) -> Result { log::debug!("cell_get(idx={})", idx); let buf = self.as_ptr(); let ncells = self.cell_count(); - let cell_start = match self.page_type() { + // the page header is 12 bytes for interior pages, 8 bytes for leaf pages + // this is because the 4 last bytes in the interior page's header are used for the rightmost pointer. + let cell_pointer_array_start = match self.page_type() { PageType::IndexInterior => 12, PageType::TableInterior => 12, PageType::IndexLeaf => 8, PageType::TableLeaf => 8, }; assert!(idx < ncells, "cell_get: idx out of bounds"); - let cell_pointer = cell_start + (idx * 2); + let cell_pointer = cell_pointer_array_start + (idx * 2); let cell_pointer = self.read_u16(cell_pointer) as usize; read_btree_cell( @@ -439,13 +541,17 @@ impl PageContent { &self.page_type(), cell_pointer, pager, - max_local, - min_local, + payload_overflow_threshold_max, + payload_overflow_threshold_min, usable_size, ) } - - /// When using this fu + /// The cell pointer array of a b-tree page immediately follows the b-tree page header. + /// Let K be the number of cells on the btree. + /// The cell pointer array consists of K 2-byte integer offsets to the cell contents. + /// The cell pointers are arranged in key order with: + /// - left-most cell (the cell with the smallest key) first and + /// - the right-most cell (the cell with the largest key) last. 
pub fn cell_get_raw_pointer_region(&self) -> (usize, usize) { let cell_start = match self.page_type() { PageType::IndexInterior => 12, @@ -460,27 +566,31 @@ impl PageContent { pub fn cell_get_raw_region( &self, idx: usize, - max_local: usize, - min_local: usize, + payload_overflow_threshold_max: usize, + payload_overflow_threshold_min: usize, usable_size: usize, ) -> (usize, usize) { let buf = self.as_ptr(); let ncells = self.cell_count(); - let cell_start = match self.page_type() { + let cell_pointer_array_start = match self.page_type() { PageType::IndexInterior => 12, PageType::TableInterior => 12, PageType::IndexLeaf => 8, PageType::TableLeaf => 8, }; assert!(idx < ncells, "cell_get: idx out of bounds"); - let cell_pointer = cell_start + (idx * 2); + let cell_pointer = cell_pointer_array_start + (idx * 2); // pointers are 2 bytes each let cell_pointer = self.read_u16(cell_pointer) as usize; let start = cell_pointer; let len = match self.page_type() { PageType::IndexInterior => { let (len_payload, n_payload) = read_varint(&buf[cell_pointer + 4..]).unwrap(); - let (overflows, to_read) = - payload_overflows(len_payload as usize, max_local, min_local, usable_size); + let (overflows, to_read) = payload_overflows( + len_payload as usize, + payload_overflow_threshold_max, + payload_overflow_threshold_min, + usable_size, + ); if overflows { 4 + to_read + n_payload + 4 } else { @@ -493,8 +603,12 @@ impl PageContent { } PageType::IndexLeaf => { let (len_payload, n_payload) = read_varint(&buf[cell_pointer..]).unwrap(); - let (overflows, to_read) = - payload_overflows(len_payload as usize, max_local, min_local, usable_size); + let (overflows, to_read) = payload_overflows( + len_payload as usize, + payload_overflow_threshold_max, + payload_overflow_threshold_min, + usable_size, + ); if overflows { to_read + n_payload + 4 } else { @@ -504,8 +618,12 @@ impl PageContent { PageType::TableLeaf => { let (len_payload, n_payload) = read_varint(&buf[cell_pointer..]).unwrap(); let (_, 
n_rowid) = read_varint(&buf[cell_pointer + n_payload..]).unwrap(); - let (overflows, to_read) = - payload_overflows(len_payload as usize, max_local, min_local, usable_size); + let (overflows, to_read) = payload_overflows( + len_payload as usize, + payload_overflow_threshold_max, + payload_overflow_threshold_min, + usable_size, + ); if overflows { to_read + n_payload + n_rowid } else { @@ -1170,28 +1288,46 @@ pub fn begin_write_wal_header(io: &Rc, header: &WalHeader) -> Result<( Ok(()) } -/* - Checks if payload will overflow a cell based on max local and - it will return the min size that will be stored in that case, - including overflow pointer -*/ +/// Checks if payload will overflow a cell based on the maximum allowed size. +/// It will return the min size that will be stored in that case, +/// including overflow pointer +/// see e.g. https://github.com/sqlite/sqlite/blob/9591d3fe93936533c8c3b0dc4d025ac999539e11/src/dbstat.c#L371 pub fn payload_overflows( payload_size: usize, - max_local: usize, - min_local: usize, + payload_overflow_threshold_max: usize, + payload_overflow_threshold_min: usize, usable_size: usize, ) -> (bool, usize) { - if payload_size <= max_local { + if payload_size <= payload_overflow_threshold_max { return (false, 0); } - let mut space_left = min_local + (payload_size - min_local) % (usable_size - 4); - if space_left > max_local { - space_left = min_local; + let mut space_left = payload_overflow_threshold_min + + (payload_size - payload_overflow_threshold_min) % (usable_size - 4); + if space_left > payload_overflow_threshold_max { + space_left = payload_overflow_threshold_min; } (true, space_left + 4) } +/// The checksum is computed by interpreting the input as an even number of unsigned 32-bit integers: x(0) through x(N). +/// The 32-bit integers are big-endian if the magic number in the first 4 bytes of the WAL header is 0x377f0683 +/// and the integers are little-endian if the magic number is 0x377f0682. 
+/// The checksum values are always stored in the frame header in a big-endian format regardless of which byte order is used to compute the checksum. + +/// The checksum algorithm only works for content which is a multiple of 8 bytes in length. +/// In other words, if the inputs are x(0) through x(N) then N must be odd. +/// The checksum algorithm is as follows: +/// +/// s0 = s1 = 0 +/// for i from 0 to n-1 step 2: +/// s0 += x(i) + s1; +/// s1 += x(i+1) + s0; +/// endfor +/// +/// The outputs s0 and s1 are both weighted checksums using Fibonacci weights in reverse order. +/// (The largest Fibonacci weight occurs on the first element of the sequence being summed.) +/// The s1 value spans all 32-bit integer terms of the sequence whereas s0 omits the final term. pub fn checksum_wal( buf: &[u8], _wal_header: &WalHeader, diff --git a/core/translate/mod.rs b/core/translate/mod.rs index 2e5d86141..db69f1578 100644 --- a/core/translate/mod.rs +++ b/core/translate/mod.rs @@ -386,7 +386,7 @@ fn query_pragma( match pragma { PragmaName::CacheSize => { program.emit_insn(Insn::Integer { - value: database_header.borrow().default_cache_size.into(), + value: database_header.borrow().default_page_cache_size.into(), dest: register, }); } @@ -424,7 +424,7 @@ fn update_cache_size(value: i64, header: Rc>, pager: Rc< } // update in-memory header - header.borrow_mut().default_cache_size = cache_size_unformatted + header.borrow_mut().default_page_cache_size = cache_size_unformatted .try_into() .unwrap_or_else(|_| panic!("invalid value, too big for a i32 {}", value));