From 35c3fe7448c4c34f1cafa8435a9ddf60013f3243 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 26 Jul 2024 21:05:08 +0200 Subject: [PATCH] core: refactor page in memory representation --- core/btree.rs | 69 ++++++++++------------ core/pager.rs | 6 +- core/sqlite3_ondisk.rs | 130 +++++++++++++++++++++++++---------------- 3 files changed, 113 insertions(+), 92 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index 6dd36b492..91dbc1ca9 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -1,6 +1,6 @@ use crate::pager::Pager; use crate::sqlite3_ondisk::{ - read_varint, write_varint, BTreeCell, BTreePage, DatabaseHeader, PageType, TableInteriorCell, + read_varint, write_varint, BTreeCell, DatabaseHeader, PageContent, PageType, TableInteriorCell, TableLeafCell, }; use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue}; @@ -76,9 +76,9 @@ impl BTreeCursor { } let page = page.contents.read().unwrap(); let page = page.as_ref().unwrap(); - if mem_page.cell_idx() >= page.cells.len() { + if mem_page.cell_idx() >= page.cell_count() { let parent = mem_page.parent.clone(); - match page.header.right_most_pointer { + match page.rightmost_pointer() { Some(right_most_pointer) => { let mem_page = MemPage::new(parent.clone(), right_most_pointer as usize, 0); self.page.replace(Some(Rc::new(mem_page))); @@ -95,7 +95,7 @@ impl BTreeCursor { }, } } - let cell = &page.cells[mem_page.cell_idx()]; + let cell = page.cell_get(mem_page.cell_idx())?; match &cell { BTreeCell::TableInteriorCell(TableInteriorCell { _left_child_page, @@ -153,8 +153,8 @@ impl BTreeCursor { } let mut found_cell = false; - for cell in &page.cells { - match &cell { + for cell_idx in 0..page.cell_count() { + match &page.cell_get(cell_idx)? { BTreeCell::TableInteriorCell(TableInteriorCell { _left_child_page, _rowid, @@ -188,7 +188,7 @@ impl BTreeCursor { if !found_cell { let parent = mem_page.parent.clone(); - match page.header.right_most_pointer { + match page.rightmost_pointer() { Some(right_most_pointer) => { let mem_page = MemPage::new(parent, right_most_pointer as usize, 0); self.page.replace(Some(Rc::new(mem_page))); @@ -224,7 +224,7 @@ impl BTreeCursor { let mut page = page.contents.write().unwrap(); let page = page.as_mut().unwrap(); - assert!(matches!(page.header.page_type, PageType::TableLeaf)); + assert!(matches!(page.page_type(), PageType::TableLeaf)); let free = self.compute_free_space(page, self.database_header.borrow()); @@ -283,7 +283,7 @@ impl BTreeCursor { let pointer_area_pc_by_idx = 8 + 2 * cell_idx; // move previous pointers forward and insert new pointer there - let n_cells_forward = 2 * (page.cells.len() - cell_idx); + let n_cells_forward = 2 * (page.cell_count() - cell_idx); buf.copy_within( pointer_area_pc_by_idx..pointer_area_pc_by_idx + n_cells_forward, pointer_area_pc_by_idx + 2, @@ -295,36 +295,27 @@ impl BTreeCursor { buf[5..7].copy_from_slice(&pc.to_be_bytes()); // update cell count - let new_n_cells = (page.cells.len() + 1) as u16; + let new_n_cells = (page.cell_count() + 1) as u16; buf[3..5].copy_from_slice(&new_n_cells.to_be_bytes()); - // TODo: refactor cells to be lazy loadable because this will be crazy slow let mut payload_for_cell_in_memory: Vec = Vec::new(); _record.serialize(&mut payload_for_cell_in_memory); - page.cells.insert( - cell_idx, - BTreeCell::TableLeafCell(TableLeafCell { - _rowid: int_key, - _payload: payload_for_cell_in_memory, - first_overflow_page: None, - }), - ); } Ok(CursorResult::Ok(())) } - fn allocate_cell_space(&mut self, page_ref: &BTreePage, amount: u16) -> u16 { + fn allocate_cell_space(&mut self, page_ref: &PageContent, amount: u16) -> u16 { let amount = amount as usize; let mut buf_ref = RefCell::borrow_mut(&page_ref.buffer); let buf = buf_ref.as_mut_slice(); let cell_offset = 8; - let gap = cell_offset + 2 * page_ref.cells.len(); - let mut top = page_ref.header._cell_content_area as usize; + let gap = cell_offset + 2 * page_ref.cell_count(); + let mut top = page_ref.cell_content_area() as usize; // there are free blocks and enough space - if page_ref.header._first_freeblock_offset != 0 && gap + 2 <= top { + if page_ref.first_freeblock() != 0 && gap + 2 <= top { // find slot let db_header = self.database_header.borrow(); let pc = find_free_cell(page_ref, db_header, amount, buf); @@ -346,7 +337,7 @@ impl BTreeCursor { return top as u16; } - fn defragment_page(&self, page: &BTreePage, db_header: Ref) { + fn defragment_page(&self, page: &PageContent, db_header: Ref) { let cloned_page = page.clone(); let usable_space = (db_header.page_size - db_header.unused_space as u16) as u64; let mut cbrk = usable_space as u64; @@ -354,14 +345,14 @@ impl BTreeCursor { // TODO: implement fast algorithm let last_cell = (usable_space - 4) as u64; - let first_cell = cloned_page.header._cell_content_area as u64; - if cloned_page.cells.len() > 0 { + let first_cell = cloned_page.cell_content_area() as u64; + if cloned_page.cell_count() > 0 { let buf = cloned_page.buffer.borrow(); let buf = buf.as_slice(); let mut write_buf = RefCell::borrow_mut(&page.buffer); let write_buf = write_buf.as_mut_slice(); - for i in 0..cloned_page.cells.len() { + for i in 0..cloned_page.cell_count() { let cell_offset = 8; let cell_idx = cell_offset + i * 2; @@ -411,19 +402,19 @@ impl BTreeCursor { // Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte // and end of cell pointer area. - fn compute_free_space(&self, page: &BTreePage, db_header: Ref) -> u16 { + fn compute_free_space(&self, page: &PageContent, db_header: Ref) -> u16 { let buffer = page.buffer.borrow(); let buf = buffer.as_slice(); let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; - let mut first_byte_in_cell_content = page.header._cell_content_area; + let mut first_byte_in_cell_content = page.cell_content_area(); if first_byte_in_cell_content == 0 { first_byte_in_cell_content = u16::MAX; } - let fragmented_free_bytes = page.header._num_frag_free_bytes; - let free_block_pointer = page.header._first_freeblock_offset; - let ncell = page.cells.len(); + let fragmented_free_bytes = page.num_frag_free_bytes(); + let free_block_pointer = page.first_freeblock(); + let ncell = page.cell_count(); // 8 + 4 == header end let first_cell = 8 + 4 + (2 * ncell) as u16; @@ -469,14 +460,14 @@ impl BTreeCursor { } fn find_free_cell( - page_ref: &BTreePage, + page_ref: &PageContent, db_header: Ref, amount: usize, buf: &[u8], ) -> usize { // NOTE: freelist is in ascending order of keys and pc // unuse_space is reserved bytes at the end of page, therefore we must substract from maxpc - let mut pc = page_ref.header._first_freeblock_offset as usize; + let mut pc = page_ref.first_freeblock() as usize; let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; let maxpc = (usable_space - amount as usize) as usize; let mut found = false; @@ -598,10 +589,10 @@ impl Cursor for BTreeCursor { _ => unreachable!("btree tables are indexed by integers!"), }; let cell_idx = find_cell(page, int_key); - if cell_idx >= page.cells.len() { + if cell_idx >= page.cell_count() { Ok(CursorResult::Ok(false)) } else { - let equals = match &page.cells[cell_idx] { + let equals = match &page.cell_get(cell_idx)? { BTreeCell::TableLeafCell(l) => l._rowid == int_key, _ => unreachable!(), }; @@ -610,10 +601,10 @@ impl Cursor for BTreeCursor { } } -fn find_cell(page: &BTreePage, int_key: u64) -> usize { +fn find_cell(page: &PageContent, int_key: u64) -> usize { let mut cell_idx = 0; - for cell in &page.cells { - match cell { + while cell_idx < page.cell_count() { + match page.cell_get(cell_idx).unwrap() { BTreeCell::TableLeafCell(cell) => { if int_key <= cell._rowid { break; diff --git a/core/pager.rs b/core/pager.rs index a2ec517f5..5f409ff5d 100644 --- a/core/pager.rs +++ b/core/pager.rs @@ -1,5 +1,5 @@ use crate::buffer_pool::BufferPool; -use crate::sqlite3_ondisk::BTreePage; +use crate::sqlite3_ondisk::PageContent; use crate::sqlite3_ondisk::{self, DatabaseHeader}; use crate::{PageSource, Result}; use log::trace; @@ -14,7 +14,7 @@ use std::sync::{Arc, RwLock}; pub struct Page { flags: AtomicUsize, - pub contents: RwLock>, + pub contents: RwLock>, pub id: usize, } @@ -305,7 +305,7 @@ impl Pager { } let page = Rc::new(RefCell::new(Page::new(page_idx))); page.borrow().set_locked(); - sqlite3_ondisk::begin_read_btree_page( + sqlite3_ondisk::begin_read_page( &self.page_source, self.buffer_pool.clone(), page.clone(), diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index d4834f465..831065f5b 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -188,17 +188,6 @@ pub fn begin_write_database_header(header: &DatabaseHeader, pager: &Pager) -> Re Ok(()) } -#[derive(Debug, Clone)] -pub struct BTreePageHeader { - pub(crate) page_type: PageType, - pub(crate) _first_freeblock_offset: u16, - pub(crate) num_cells: u16, - // First byte of content area - pub(crate) _cell_content_area: u16, - pub(crate) _num_frag_free_bytes: u8, - pub(crate) right_most_pointer: Option, -} - #[repr(u8)] #[derive(Debug, PartialEq, Clone)] pub enum PageType { @@ -223,15 +212,84 @@ impl TryFrom for PageType { } #[derive(Debug, Clone)] -pub struct BTreePage { - pub header: BTreePageHeader, - pub cells: Vec, +pub struct PageContent { + pub offset: usize, pub buffer: Rc>, } -impl BTreePage { +impl PageContent { + pub fn page_type(&self) -> PageType { + let buf = self.buffer.borrow(); + let buf = buf.as_slice(); + buf[self.offset].try_into().unwrap() + } + + fn read_u16(&self, pos: usize) -> u16 { + unsafe { + let buf_pointer = &self.buffer.as_ptr(); + let buf = (*buf_pointer).as_ref().unwrap().as_slice(); + u16::from_be_bytes([buf[self.offset + pos], buf[self.offset + pos + 1]]) + } + } + + fn read_u32(&self, pos: usize) -> u32 { + unsafe { + let buf_pointer = &self.buffer.as_ptr(); + let buf = (*buf_pointer).as_ref().unwrap().as_slice(); + u32::from_be_bytes([ + buf[self.offset + pos], + buf[self.offset + pos + 1], + buf[self.offset + pos + 2], + buf[self.offset + pos + 3], + ]) + } + } + + pub fn first_freeblock(&self) -> u16 { + self.read_u16(1) + } + + pub fn cell_count(&self) -> usize { + self.read_u16(3) as usize + } + + pub fn cell_content_area(&self) -> u16 { + self.read_u16(5) as u16 + } + + pub fn num_frag_free_bytes(&self) -> u16 { + self.read_u16(7) as u16 + } + + pub fn rightmost_pointer(&self) -> Option { + match self.page_type() { + PageType::IndexInterior => Some(self.read_u32(8)), + PageType::TableInterior => Some(self.read_u32(8)), + PageType::IndexLeaf => None, + PageType::TableLeaf => None, + } + } + + pub fn cell_get(&self, idx: usize) -> Result { + let buf = self.buffer.borrow(); + let buf = buf.as_slice(); + + let ncells = self.cell_count(); + let cell_start = match self.page_type() { + PageType::IndexInterior => 12, + PageType::TableInterior => 12, + PageType::IndexLeaf => 8, + PageType::TableLeaf => 8, + }; + assert!(idx < ncells, "cell_get: idx out of bounds"); + let cell_pointer = cell_start + (idx * 2); + let cell_pointer = self.read_u16(cell_pointer) as usize; + + read_btree_cell(buf, &self.page_type(), cell_pointer) + } + pub fn is_leaf(&self) -> bool { - match self.header.page_type { + match self.page_type() { PageType::IndexInterior => false, PageType::TableInterior => false, PageType::IndexLeaf => true, @@ -240,7 +298,7 @@ impl BTreePage { } } -pub fn begin_read_btree_page( +pub fn begin_read_page( page_source: &PageSource, buffer_pool: Rc, page: Rc>, @@ -255,7 +313,7 @@ pub fn begin_read_btree_page( let buf = Rc::new(RefCell::new(Buffer::new(buf, drop_fn))); let complete = Box::new(move |buf: Rc>| { let page = page.clone(); - if finish_read_btree_page(page_idx, buf, page.clone()).is_err() { + if finish_read_page(page_idx, buf, page.clone()).is_err() { page.borrow_mut().set_error(); } }); @@ -264,47 +322,19 @@ pub fn begin_read_btree_page( Ok(()) } -fn finish_read_btree_page( +fn finish_read_page( page_idx: usize, buffer_ref: Rc>, page: Rc>, ) -> Result<()> { trace!("finish_read_btree_page(page_idx = {})", page_idx); - let mut pos = if page_idx == 1 { + let pos = if page_idx == 1 { DATABASE_HEADER_SIZE } else { 0 }; - let buf = buffer_ref.borrow(); - let buf = buf.as_slice(); - let mut header = BTreePageHeader { - page_type: buf[pos].try_into()?, - _first_freeblock_offset: u16::from_be_bytes([buf[pos + 1], buf[pos + 2]]), - num_cells: u16::from_be_bytes([buf[pos + 3], buf[pos + 4]]), - _cell_content_area: u16::from_be_bytes([buf[pos + 5], buf[pos + 6]]), - _num_frag_free_bytes: buf[pos + 7], - right_most_pointer: None, - }; - pos += 8; - if header.page_type == PageType::IndexInterior || header.page_type == PageType::TableInterior { - header.right_most_pointer = Some(u32::from_be_bytes([ - buf[pos], - buf[pos + 1], - buf[pos + 2], - buf[pos + 3], - ])); - pos += 4; - } - let mut cells = Vec::with_capacity(header.num_cells as usize); - for _ in 0..header.num_cells { - let cell_pointer = u16::from_be_bytes([buf[pos], buf[pos + 1]]); - pos += 2; - let cell = read_btree_cell(buf, &header.page_type, cell_pointer as usize)?; - cells.push(cell); - } - let inner = BTreePage { - header, - cells, + let inner = PageContent { + offset: pos, buffer: buffer_ref.clone(), }; {