diff --git a/core/storage/btree.rs b/core/storage/btree.rs index dfc0c63c5..efd65d62d 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -68,7 +68,14 @@ pub mod offset { /// The number of cells in the page (u16). pub const BTREE_CELL_COUNT: usize = 3; - /// A pointer to first byte of cell allocated content from top (u16). + /// A pointer to the first byte of cell allocated content from top (u16). + /// + /// A zero value for this integer is interpreted as 65,536. + /// If a page contains no cells (which is only possible for a root page of a table that + /// contains no rows) then the offset to the cell content area will equal the page size minus + /// the bytes of reserved space. If the database uses a 65536-byte page size and the + /// reserved space is zero (the usual value for reserved space) then the cell content offset of + /// an empty page wants to be 65,536 /// /// SQLite strives to place cells as far toward the end of the b-tree page as it can, in /// order to leave space for future growth of the cell pointer array. 
This means that the @@ -2218,10 +2225,10 @@ impl BTreeCursor { cell_idx, self.usable_space() as u16, )?; - contents.overflow_cells.len() + !contents.overflow_cells.is_empty() }; self.stack.set_cell_index(cell_idx as i32); - if overflow > 0 { + if overflow { // A balance will happen so save the key we were inserting tracing::debug!(page = page.get().get().id, cell_idx, "balance triggered:"); self.save_context(match bkey { @@ -4280,13 +4287,10 @@ impl BTreeCursor { page.get().get_contents().page_type(), PageType::TableLeaf | PageType::TableInterior ) { - let _target_rowid = match return_if_io!(self.rowid()) { - Some(rowid) => rowid, - _ => { - self.state = CursorState::None; - return Ok(CursorResult::Ok(())); - } - }; + if return_if_io!(self.rowid()).is_none() { + self.state = CursorState::None; + return Ok(CursorResult::Ok(())); + } } else if self.reusable_immutable_record.borrow().is_none() { self.state = CursorState::None; return Ok(CursorResult::Ok(())); @@ -4395,8 +4399,6 @@ impl BTreeCursor { let page = page.get(); let contents = page.get_contents(); - let is_last_cell = cell_idx == contents.cell_count().saturating_sub(1); - let delete_info = self.state.mut_delete_info().unwrap(); if !contents.is_leaf() { delete_info.state = DeleteState::InteriorNodeReplacement { @@ -4405,7 +4407,7 @@ impl BTreeCursor { post_balancing_seek_key, }; } else { - let contents = page.get().contents.as_mut().unwrap(); + let is_last_cell = cell_idx == contents.cell_count().saturating_sub(1); drop_cell(contents, cell_idx, self.usable_space() as u16)?; let delete_info = self.state.mut_delete_info().unwrap(); @@ -6062,8 +6064,8 @@ fn free_cell_range( pc }; - if offset <= page.cell_content_area() { - if offset < page.cell_content_area() { + if (offset as u32) <= page.cell_content_area() { + if (offset as u32) < page.cell_content_area() { return_corrupt!("Free block before content area"); } if pointer_to_pc != page.offset as u16 + offset::BTREE_FIRST_FREEBLOCK as u16 { @@ -6238,8 +6240,13 
@@ fn insert_into_cell( Ok(()) } -/// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte -/// and end of cell pointer area. +/// The amount of free space is the sum of: +/// #1. The size of the unallocated region +/// #2. Fragments (isolated 1-3 byte chunks of free space within the cell content area) +/// #3. freeblocks (linked list of blocks of at least 4 bytes within the cell content area that +/// are not in use due to e.g. deletions) +/// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected +/// to be between first cell byte and end of cell pointer area. #[allow(unused_assignments)] fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { // TODO(pere): maybe free space is not calculated correctly with offset @@ -6248,38 +6255,14 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { // space that is not reserved for extensions by sqlite. Usually reserved_space is 0. let usable_space = usable_space as usize; - let mut cell_content_area_start = page.cell_content_area(); - // A zero value for the cell content area pointer is interpreted as 65536. - // See https://www.sqlite.org/fileformat.html - // The max page size for a sqlite database is 64kiB i.e. 65536 bytes. - // 65536 is u16::MAX + 1, and since cell content grows from right to left, this means - // the cell content area pointer is at the end of the page, - // i.e. - // 1. the page size is 64kiB - // 2. there are no cells on the page - // 3. there is no reserved space at the end of the page - if cell_content_area_start == 0 { - cell_content_area_start = u16::MAX; - } - - // The amount of free space is the sum of: - // #1. the size of the unallocated region - // #2. fragments (isolated 1-3 byte chunks of free space within the cell content area) - // #3. 
freeblocks (linked list of blocks of at least 4 bytes within the cell content area that are not in use due to e.g. deletions) - - let pointer_size = if matches!(page.page_type(), PageType::TableLeaf | PageType::IndexLeaf) { - 0 - } else { - 4 - }; - let first_cell = page.offset + 8 + pointer_size + (2 * page.cell_count()); - let mut free_space_bytes = - cell_content_area_start as usize + page.num_frag_free_bytes() as usize; + let first_cell = page.offset + page.header_size() + (2 * page.cell_count()); + let cell_content_area_start = page.cell_content_area() as usize; + let mut free_space_bytes = cell_content_area_start + page.num_frag_free_bytes() as usize; // #3 is computed by iterating over the freeblocks linked list let mut cur_freeblock_ptr = page.first_freeblock() as usize; if cur_freeblock_ptr > 0 { - if cur_freeblock_ptr < cell_content_area_start as usize { + if cur_freeblock_ptr < cell_content_area_start { // Freeblocks exist in the cell content area e.g. after deletions // They should never exist in the unused area of the page. todo!("corrupted page"); @@ -6293,7 +6276,7 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { size = page.read_u16_no_offset(cur_freeblock_ptr + 2) as usize; // next 2 bytes in freeblock = size of current freeblock free_space_bytes += size; // Freeblocks are in order from left to right on the page, - // so next pointer should > current pointer + its size, or 0 if no next block exists. + // so the next pointer should be > current pointer + its size, or 0 if no next block exists. 
if next <= cur_freeblock_ptr + size + 3 { break; } @@ -6301,8 +6284,8 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { } // Next should always be 0 (NULL) at this point since we have reached the end of the freeblocks linked list - assert!( - next == 0, + assert_eq!( + next, 0, "corrupted page: freeblocks list not in ascending order" ); @@ -6317,10 +6300,6 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 { "corrupted page: free space is greater than usable space" ); - // if( nFree>usableSize || nFree>, pub overflow_cells: Vec, @@ -376,6 +381,7 @@ impl Clone for PageContent { } } +const CELL_POINTER_SIZE_BYTES: usize = 2; impl PageContent { pub fn new(offset: usize, buffer: Arc>) -> Self { Self { @@ -386,7 +392,7 @@ impl PageContent { } pub fn page_type(&self) -> PageType { - self.read_u8(0).try_into().unwrap() + self.read_u8(BTREE_PAGE_TYPE).try_into().unwrap() } pub fn maybe_page_type(&self) -> Option { @@ -455,19 +461,14 @@ impl PageContent { buf[self.offset + pos..self.offset + pos + 4].copy_from_slice(&value.to_be_bytes()); } - /// The second field of the b-tree page header is the offset of the first freeblock, or zero if there are no freeblocks on the page. - /// A freeblock is a structure used to identify unallocated space within a b-tree page. - /// Freeblocks are organized as a chain. - /// - /// To be clear, freeblocks do not mean the regular unallocated free space to the left of the cell content area pointer, but instead - /// blocks of at least 4 bytes WITHIN the cell content area that are not in use due to e.g. deletions. + /// The offset of the first freeblock, or zero if there are no freeblocks on the page. pub fn first_freeblock(&self) -> u16 { - self.read_u16(1) + self.read_u16(BTREE_FIRST_FREEBLOCK) } /// The number of cells on the page. pub fn cell_count(&self) -> usize { - self.read_u16(3) as usize + self.read_u16(BTREE_CELL_COUNT) as usize } /// The size of the cell pointer array in bytes. 
@@ -489,11 +490,13 @@ impl PageContent { } /// The start of the cell content area. - /// SQLite strives to place cells as far toward the end of the b-tree page as it can, - /// in order to leave space for future growth of the cell pointer array. - /// = the cell content area pointer moves leftward as cells are added to the page - pub fn cell_content_area(&self) -> u16 { - self.read_u16(5) + pub fn cell_content_area(&self) -> u32 { + let offset = self.read_u16(BTREE_CELL_CONTENT_AREA); + if offset == 0 { + MAX_PAGE_SIZE + } else { + offset as u32 + } } /// The size of the page header in bytes. @@ -507,16 +510,15 @@ impl PageContent { } } - /// The total number of bytes in all fragments is stored in the fifth field of the b-tree page header. - /// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area. + /// The total number of bytes in all fragments pub fn num_frag_free_bytes(&self) -> u8 { - self.read_u8(7) + self.read_u8(BTREE_FRAGMENTED_BYTES_COUNT) } pub fn rightmost_pointer(&self) -> Option { match self.page_type() { - PageType::IndexInterior => Some(self.read_u32(8)), - PageType::TableInterior => Some(self.read_u32(8)), + PageType::IndexInterior => Some(self.read_u32(BTREE_RIGHTMOST_PTR)), + PageType::TableInterior => Some(self.read_u32(BTREE_RIGHTMOST_PTR)), PageType::IndexLeaf => None, PageType::TableLeaf => None, } @@ -524,9 +526,11 @@ impl PageContent { pub fn rightmost_pointer_raw(&self) -> Option<*mut u8> { match self.page_type() { - PageType::IndexInterior | PageType::TableInterior => { - Some(unsafe { self.as_ptr().as_mut_ptr().add(self.offset + 8) }) - } + PageType::IndexInterior | PageType::TableInterior => Some(unsafe { + self.as_ptr() + .as_mut_ptr() + .add(self.offset + BTREE_RIGHTMOST_PTR) + }), PageType::IndexLeaf => None, PageType::TableLeaf => None, } @@ -543,16 +547,14 @@ impl PageContent { let buf = self.as_ptr(); let ncells = self.cell_count(); - // the page header is 12 bytes for interior pages, 8 bytes for 
leaf pages - // this is because the 4 last bytes in the interior page's header are used for the rightmost pointer. - let cell_pointer_array_start = self.header_size(); assert!( idx < ncells, "cell_get: idx out of bounds: idx={}, ncells={}", idx, ncells ); - let cell_pointer = cell_pointer_array_start + (idx * 2); + let cell_pointer_array_start = self.header_size(); + let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES); let cell_pointer = self.read_u16(cell_pointer) as usize; // SAFETY: this buffer is valid as long as the page is alive. We could store the page in the cell and do some lifetime magic @@ -573,9 +575,8 @@ impl PageContent { pub fn cell_table_interior_read_rowid(&self, idx: usize) -> Result { debug_assert!(self.page_type() == PageType::TableInterior); let buf = self.as_ptr(); - const INTERIOR_PAGE_HEADER_SIZE_BYTES: usize = 12; - let cell_pointer_array_start = INTERIOR_PAGE_HEADER_SIZE_BYTES; - let cell_pointer = cell_pointer_array_start + (idx * 2); + let cell_pointer_array_start = self.header_size(); + let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES); let cell_pointer = self.read_u16(cell_pointer) as usize; const LEFT_CHILD_PAGE_SIZE_BYTES: usize = 4; let (rowid, _) = read_varint(&buf[cell_pointer + LEFT_CHILD_PAGE_SIZE_BYTES..])?; @@ -590,9 +591,8 @@ impl PageContent { || self.page_type() == PageType::IndexInterior ); let buf = self.as_ptr(); - const INTERIOR_PAGE_HEADER_SIZE_BYTES: usize = 12; - let cell_pointer_array_start = INTERIOR_PAGE_HEADER_SIZE_BYTES; - let cell_pointer = cell_pointer_array_start + (idx * 2); + let cell_pointer_array_start = self.header_size(); + let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES); let cell_pointer = self.read_u16(cell_pointer) as usize; u32::from_be_bytes([ buf[cell_pointer], @@ -607,9 +607,8 @@ impl PageContent { pub fn cell_table_leaf_read_rowid(&self, idx: usize) -> Result { debug_assert!(self.page_type() == 
PageType::TableLeaf); let buf = self.as_ptr(); - const LEAF_PAGE_HEADER_SIZE_BYTES: usize = 8; - let cell_pointer_array_start = LEAF_PAGE_HEADER_SIZE_BYTES; - let cell_pointer = cell_pointer_array_start + (idx * 2); + let cell_pointer_array_start = self.header_size(); + let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES); let cell_pointer = self.read_u16(cell_pointer) as usize; let mut pos = cell_pointer; let (_, nr) = read_varint(&buf[pos..])?; @@ -629,7 +628,7 @@ impl PageContent { (self.offset + header_size, self.cell_pointer_array_size()) } - /// Get region of a cell's payload + /// Get the region (start and length) of a cell's payload pub fn cell_get_raw_region( &self, idx: usize, @@ -641,7 +640,7 @@ impl PageContent { let ncells = self.cell_count(); let (cell_pointer_array_start, _) = self.cell_pointer_array_offset_and_size(); assert!(idx < ncells, "cell_get: idx out of bounds"); - let cell_pointer = cell_pointer_array_start + (idx * 2); // pointers are 2 bytes each + let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES); let cell_pointer = self.read_u16_no_offset(cell_pointer) as usize; let start = cell_pointer; let len = match self.page_type() {