From 91cca0d5b7b337249a555cfdfa78fc7a4c5ba878 Mon Sep 17 00:00:00 2001 From: jussisaurio Date: Tue, 24 Dec 2024 10:28:53 +0200 Subject: [PATCH 1/4] use more descriptive names in BTreeCursor::insert_into_cell() --- core/storage/btree.rs | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index bdd27932b..d62173342 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -742,7 +742,8 @@ impl BTreeCursor { /// i.e. whether we need to balance the btree after the insert. fn insert_into_cell(&self, page: &mut PageContent, payload: &[u8], cell_idx: usize) { let free = self.compute_free_space(page, RefCell::borrow(&self.database_header)); - let enough_space = payload.len() + 2 <= free as usize; + const CELL_POINTER_SIZE_BYTES: usize = 2; + let enough_space = payload.len() + CELL_POINTER_SIZE_BYTES <= free as usize; if !enough_space { // add to overflow cell page.overflow_cells.push(OverflowCell { @@ -753,27 +754,30 @@ impl BTreeCursor { } // TODO: insert into cell payload in internal page - let pc = self.allocate_cell_space(page, payload.len() as u16); + let new_cell_data_pointer = self.allocate_cell_space(page, payload.len() as u16); let buf = page.as_ptr(); // copy data - buf[pc as usize..pc as usize + payload.len()].copy_from_slice(payload); + buf[new_cell_data_pointer as usize..new_cell_data_pointer as usize + payload.len()] + .copy_from_slice(payload); // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); - let (pointer_area_pc_by_idx, _) = page.cell_get_raw_pointer_region(); - let pointer_area_pc_by_idx = pointer_area_pc_by_idx + (2 * cell_idx); + let (cell_pointer_array_start, _) = page.cell_get_raw_pointer_region(); + let cell_pointer_cur_idx = cell_pointer_array_start + (CELL_POINTER_SIZE_BYTES * cell_idx); - // move previous pointers forward and insert new pointer there - let n_cells_forward = 2 * (page.cell_count() - cell_idx); - if n_cells_forward > 0 { + // move existing pointers forward by CELL_POINTER_SIZE_BYTES... + let n_cells_forward = page.cell_count() - cell_idx; + let n_bytes_forward = CELL_POINTER_SIZE_BYTES * n_cells_forward; + if n_bytes_forward > 0 { buf.copy_within( - pointer_area_pc_by_idx..pointer_area_pc_by_idx + n_cells_forward, - pointer_area_pc_by_idx + 2, + cell_pointer_cur_idx..cell_pointer_cur_idx + n_bytes_forward, + cell_pointer_cur_idx + CELL_POINTER_SIZE_BYTES, ); } - page.write_u16(pointer_area_pc_by_idx - page.offset, pc); + // ...and insert new cell pointer at the current index + page.write_u16(cell_pointer_cur_idx - page.offset, new_cell_data_pointer); - // update first byte of content area - page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, pc); + // update first byte of content area (cell data always appended to the left, so cell content area pointer moves to point to the new cell data) + page.write_u16(PAGE_HEADER_OFFSET_CELL_CONTENT_AREA, new_cell_data_pointer); // update cell count let new_n_cells = (page.cell_count() + 1) as u16; From c6b7ddf77adee9e57fcfa99e34b295914ad3e438 Mon Sep 17 00:00:00 2001 From: jussisaurio Date: Tue, 24 Dec 2024 10:30:27 +0200 Subject: [PATCH 2/4] Improve comments in BTreeCursor::compute_free_space() --- core/storage/btree.rs | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index d62173342..5a0a51e33 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1441,10 +1441,14 @@ impl BTreeCursor { let first_cell = (page.offset + 8 + child_pointer_size + (2 * ncell)) as u16; // The amount of free space is the sum of: - // 1. 0..first_byte_in_cell_content (everything to the left of the cell content area pointer is unused free space) - // 2. fragmented_free_bytes. + // #1. space to the left of the cell content area pointer + // #2. fragmented_free_bytes (isolated 1-3 byte chunks of free space within the cell content area) + // #3. freeblocks (linked list of blocks of at least 4 bytes within the cell content area that are not in use due to e.g. deletions) + + // #1 and #2 are known from the page header let mut nfree = fragmented_free_bytes as usize + first_byte_in_cell_content as usize; + // #3 is computed by iterating over the freeblocks linked list let mut pc = free_block_pointer as usize; if pc > 0 { if pc < first_byte_in_cell_content as usize { @@ -1457,28 +1461,33 @@ impl BTreeCursor { let mut size = 0; loop { // TODO: check corruption icellast - next = u16::from_be_bytes(buf[pc..pc + 2].try_into().unwrap()) as usize; - size = u16::from_be_bytes(buf[pc + 2..pc + 4].try_into().unwrap()) as usize; + next = u16::from_be_bytes(buf[pc..pc + 2].try_into().unwrap()) as usize; // first 2 bytes in freeblock = next freeblock pointer + size = u16::from_be_bytes(buf[pc + 2..pc + 4].try_into().unwrap()) as usize; // next 2 bytes in freeblock = size of current freeblock nfree += size; + // Freeblocks are in order from left to right on the page, + // so next pointer should > current pointer + its size, or 0 if no next block exists. if next <= pc + size + 3 { break; } pc = next; } - if next > 0 { - todo!("corrupted page ascending order"); - } + // Next should always be 0 (NULL) at this point since we have reached the end of the freeblocks linked list + assert!( + next == 0, + "corrupted page: freeblocks list not in ascending order" + ); - if pc + size > usable_space { - todo!("corrupted page last freeblock extends last page end"); - } + assert!( + pc + size <= usable_space, + "corrupted page: last freeblock extends last page end" + ); } // if( nFree>usableSize || nFree Date: Tue, 24 Dec 2024 18:50:16 +0200 Subject: [PATCH 3/4] refactor compute_free_space() --- core/storage/btree.rs | 63 +++++++++++++++++++--------------- core/storage/sqlite3_ondisk.rs | 50 +++++++++++++++++---------- 2 files changed, 67 insertions(+), 46 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 5a0a51e33..1ff53e7d2 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -1415,10 +1415,12 @@ impl BTreeCursor { #[allow(unused_assignments)] fn compute_free_space(&self, page: &PageContent, db_header: Ref) -> u16 { // TODO(pere): maybe free space is not calculated correctly with offset - let buf = page.as_ptr(); + // Usable space, not the same as free space, simply means: + // space that is not reserved for extensions by sqlite. Usually reserved_space is 0. let usable_space = (db_header.page_size - db_header.reserved_space as u16) as usize; - let mut first_byte_in_cell_content = page.cell_content_area(); + + let mut cell_content_area_start = page.cell_content_area(); // A zero value for the cell content area pointer is interpreted as 65536. // See https://www.sqlite.org/fileformat.html // The max page size for a sqlite database is 64kiB i.e. 65536 bytes. @@ -1428,30 +1430,23 @@ impl BTreeCursor { // 1. the page size is 64kiB // 2. there are no cells on the page // 3. there is no reserved space at the end of the page - if first_byte_in_cell_content == 0 { - first_byte_in_cell_content = u16::MAX; + if cell_content_area_start == 0 { + cell_content_area_start = u16::MAX; } - let fragmented_free_bytes = page.num_frag_free_bytes(); - let free_block_pointer = page.first_freeblock(); - let ncell = page.cell_count(); - - // 8 + 4 == header end - let child_pointer_size = if page.is_leaf() { 0 } else { 4 }; - let first_cell = (page.offset + 8 + child_pointer_size + (2 * ncell)) as u16; - // The amount of free space is the sum of: - // #1. space to the left of the cell content area pointer - // #2. fragmented_free_bytes (isolated 1-3 byte chunks of free space within the cell content area) + // #1. the size of the unallocated region + // #2. fragments (isolated 1-3 byte chunks of free space within the cell content area) // #3. freeblocks (linked list of blocks of at least 4 bytes within the cell content area that are not in use due to e.g. deletions) - // #1 and #2 are known from the page header - let mut nfree = fragmented_free_bytes as usize + first_byte_in_cell_content as usize; + let mut free_space_bytes = + page.unallocated_region_size() as usize + page.num_frag_free_bytes() as usize; // #3 is computed by iterating over the freeblocks linked list - let mut pc = free_block_pointer as usize; - if pc > 0 { - if pc < first_byte_in_cell_content as usize { + let mut cur_freeblock_ptr = page.first_freeblock() as usize; + let page_buf = page.as_ptr(); + if cur_freeblock_ptr > 0 { + if cur_freeblock_ptr < cell_content_area_start as usize { // Freeblocks exist in the cell content area e.g. after deletions // They should never exist in the unused area of the page. todo!("corrupted page"); @@ -1461,15 +1456,23 @@ impl BTreeCursor { let mut size = 0; loop { // TODO: check corruption icellast - next = u16::from_be_bytes(buf[pc..pc + 2].try_into().unwrap()) as usize; // first 2 bytes in freeblock = next freeblock pointer - size = u16::from_be_bytes(buf[pc + 2..pc + 4].try_into().unwrap()) as usize; // next 2 bytes in freeblock = size of current freeblock - nfree += size; + next = u16::from_be_bytes( + page_buf[cur_freeblock_ptr..cur_freeblock_ptr + 2] + .try_into() + .unwrap(), + ) as usize; // first 2 bytes in freeblock = next freeblock pointer + size = u16::from_be_bytes( + page_buf[cur_freeblock_ptr + 2..cur_freeblock_ptr + 4] + .try_into() + .unwrap(), + ) as usize; // next 2 bytes in freeblock = size of current freeblock + free_space_bytes += size; // Freeblocks are in order from left to right on the page, // so next pointer should > current pointer + its size, or 0 if no next block exists. - if next <= pc + size + 3 { + if next <= cur_freeblock_ptr + size + 3 { break; } - pc = next; + cur_freeblock_ptr = next; } // Next should always be 0 (NULL) at this point since we have reached the end of the freeblocks linked list @@ -1479,17 +1482,21 @@ impl BTreeCursor { ); assert!( - pc + size <= usable_space, + cur_freeblock_ptr + size <= usable_space, "corrupted page: last freeblock extends last page end" ); } + assert!( + free_space_bytes <= usable_space, + "corrupted page: free space is greater than usable space" + ); + // if( nFree>usableSize || nFree usize { self.read_u16(3) as usize } + /// The size of the cell pointer array in bytes. + /// 2 bytes per cell pointer + pub fn cell_pointer_array_size(&self) -> usize { + const CELL_POINTER_SIZE_BYTES: usize = 2; + self.cell_count() * CELL_POINTER_SIZE_BYTES + } + + /// The start of the unallocated region. + /// Effectively: the offset after the page header + the cell pointer array. + pub fn unallocated_region_start(&self) -> usize { + self.offset + self.header_size() + self.cell_pointer_array_size() + } + + pub fn unallocated_region_size(&self) -> usize { + self.cell_content_area() as usize - self.unallocated_region_start() + } + /// The start of the cell content area. /// SQLite strives to place cells as far toward the end of the b-tree page as it can, /// in order to leave space for future growth of the cell pointer array. @@ -497,6 +515,17 @@ impl PageContent { self.read_u16(5) } + /// The size of the page header in bytes. + /// 8 bytes for leaf pages, 12 bytes for interior pages (due to storing rightmost child pointer) + pub fn header_size(&self) -> usize { + match self.page_type() { + PageType::IndexInterior => 12, + PageType::TableInterior => 12, + PageType::IndexLeaf => 8, + PageType::TableLeaf => 8, + } + } + /// The total number of bytes in all fragments is stored in the fifth field of the b-tree page header. /// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area. pub fn num_frag_free_bytes(&self) -> u8 { @@ -526,12 +555,7 @@ impl PageContent { let ncells = self.cell_count(); // the page header is 12 bytes for interior pages, 8 bytes for leaf pages // this is because the 4 last bytes in the interior page's header are used for the rightmost pointer. - let cell_pointer_array_start = match self.page_type() { - PageType::IndexInterior => 12, - PageType::TableInterior => 12, - PageType::IndexLeaf => 8, - PageType::TableLeaf => 8, - }; + let cell_pointer_array_start = self.header_size(); assert!(idx < ncells, "cell_get: idx out of bounds"); let cell_pointer = cell_pointer_array_start + (idx * 2); let cell_pointer = self.read_u16(cell_pointer) as usize; @@ -553,12 +577,7 @@ impl PageContent { /// - left-most cell (the cell with the smallest key) first and /// - the right-most cell (the cell with the largest key) last. pub fn cell_get_raw_pointer_region(&self) -> (usize, usize) { - let cell_start = match self.page_type() { - PageType::IndexInterior => 12, - PageType::TableInterior => 12, - PageType::IndexLeaf => 8, - PageType::TableLeaf => 8, - }; + let cell_start = self.header_size(); (self.offset + cell_start, self.cell_count() * 2) } @@ -572,12 +591,7 @@ impl PageContent { ) -> (usize, usize) { let buf = self.as_ptr(); let ncells = self.cell_count(); - let cell_pointer_array_start = match self.page_type() { - PageType::IndexInterior => 12, - PageType::TableInterior => 12, - PageType::IndexLeaf => 8, - PageType::TableLeaf => 8, - }; + let cell_pointer_array_start = self.header_size(); assert!(idx < ncells, "cell_get: idx out of bounds"); let cell_pointer = cell_pointer_array_start + (idx * 2); // pointers are 2 bytes each let cell_pointer = self.read_u16(cell_pointer) as usize; From 42ea9041e1f3ce30ca95a843426d1fa64b0f9015 Mon Sep 17 00:00:00 2001 From: jussisaurio Date: Tue, 24 Dec 2024 19:21:44 +0200 Subject: [PATCH 4/4] rename cell_get_raw_pointer_region() and refactor a bit --- core/storage/btree.rs | 11 ++++------- core/storage/sqlite3_ondisk.rs | 9 +++++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/core/storage/btree.rs b/core/storage/btree.rs index 1ff53e7d2..387351225 100644 --- a/core/storage/btree.rs +++ b/core/storage/btree.rs @@ -761,7 +761,7 @@ impl BTreeCursor { buf[new_cell_data_pointer as usize..new_cell_data_pointer as usize + payload.len()] .copy_from_slice(payload); // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); - let (cell_pointer_array_start, _) = page.cell_get_raw_pointer_region(); + let (cell_pointer_array_start, _) = page.cell_pointer_array_offset_and_size(); let cell_pointer_cur_idx = cell_pointer_array_start + (CELL_POINTER_SIZE_BYTES * cell_idx); // move existing pointers forward by CELL_POINTER_SIZE_BYTES... @@ -1232,7 +1232,7 @@ impl BTreeCursor { if is_page_1 { // Remove header from child and set offset to 0 let contents = child.get().contents.as_mut().unwrap(); - let (cell_pointer_offset, _) = contents.cell_get_raw_pointer_region(); + let (cell_pointer_offset, _) = contents.cell_pointer_array_offset_and_size(); // change cell pointers for cell_idx in 0..contents.cell_count() { let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx) - offset; @@ -1288,7 +1288,7 @@ impl BTreeCursor { fn allocate_cell_space(&self, page_ref: &PageContent, amount: u16) -> u16 { let amount = amount as usize; - let (cell_offset, _) = page_ref.cell_get_raw_pointer_region(); + let (cell_offset, _) = page_ref.cell_pointer_array_offset_and_size(); let gap = cell_offset + 2 * page_ref.cell_count(); let mut top = page_ref.cell_content_area() as usize; @@ -1330,10 +1330,7 @@ impl BTreeCursor { // TODO: implement fast algorithm let last_cell = usable_space - 4; - let first_cell = { - let (start, end) = cloned_page.cell_get_raw_pointer_region(); - start + end - }; + let first_cell = cloned_page.unallocated_region_start() as u64; if cloned_page.cell_count() > 0 { let page_type = page.page_type(); diff --git a/core/storage/sqlite3_ondisk.rs b/core/storage/sqlite3_ondisk.rs index eb57ff041..0403bee87 100644 --- a/core/storage/sqlite3_ondisk.rs +++ b/core/storage/sqlite3_ondisk.rs @@ -500,7 +500,8 @@ impl PageContent { /// The start of the unallocated region. /// Effectively: the offset after the page header + the cell pointer array. pub fn unallocated_region_start(&self) -> usize { - self.offset + self.header_size() + self.cell_pointer_array_size() + let (cell_ptr_array_start, cell_ptr_array_size) = self.cell_pointer_array_offset_and_size(); + cell_ptr_array_start + cell_ptr_array_size } pub fn unallocated_region_size(&self) -> usize { @@ -576,9 +577,9 @@ impl PageContent { /// The cell pointers are arranged in key order with: /// - left-most cell (the cell with the smallest key) first and /// - the right-most cell (the cell with the largest key) last. - pub fn cell_get_raw_pointer_region(&self) -> (usize, usize) { - let cell_start = self.header_size(); - (self.offset + cell_start, self.cell_count() * 2) + pub fn cell_pointer_array_offset_and_size(&self) -> (usize, usize) { + let header_size = self.header_size(); + (self.offset + header_size, self.cell_pointer_array_size()) } /* Get region of a cell's payload */