From bbed54d11c3ceff8b841b99baeba10279f584752 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sat, 20 Jul 2024 15:47:59 +0200 Subject: [PATCH 01/35] core: basic op explain insert --- core/translate/insert.rs | 96 ++++++++++++++++++++++++++++ core/translate/mod.rs | 19 +++++- core/vdbe/explain.rs | 131 +++++++++++++++++++++++++++++++++++++++ core/vdbe/mod.rs | 94 ++++++++++++++++++++++++++++ 4 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 core/translate/insert.rs diff --git a/core/translate/insert.rs b/core/translate/insert.rs new file mode 100644 index 000000000..851384a66 --- /dev/null +++ b/core/translate/insert.rs @@ -0,0 +1,96 @@ +use std::{ops::Deref, rc::Rc}; + +use sqlite3_parser::ast::{ + DistinctNames, InsertBody, Name, QualifiedName, ResolveType, ResultColumn, Select, With, +}; + +use crate::Result; +use crate::{ + schema::{self, Schema, Table}, + translate::expr::resolve_ident_qualified, + vdbe::{builder::ProgramBuilder, Insn, Program}, +}; + +pub fn translate_insert( + schema: &Schema, + with: &Option, + or_conflict: &Option, + tbl_name: &QualifiedName, + columns: &Option, + body: &InsertBody, + returning: &Option>, +) -> Result { + assert!(with.is_none()); + assert!(or_conflict.is_none()); + let mut program = ProgramBuilder::new(); + let init_label = program.allocate_label(); + program.emit_insn_with_label_dependency( + Insn::Init { + target_pc: init_label, + }, + init_label, + ); + let start_offset = program.offset(); + + dbg!(tbl_name); + dbg!(columns); + dbg!(returning); + dbg!(with); + dbg!(body); + + let yield_reg = program.alloc_register(); + let jump_on_definition_label = program.allocate_label(); + program.emit_insn(Insn::InitCoroutine { + yield_reg, + jump_on_definition: jump_on_definition_label, + start_offset: program.offset() + 1, + }); + match body { + InsertBody::Select(select, None) => match &select.body.select { + sqlite3_parser::ast::OneSelect::Select { + distinctness: _, + columns: _, + from: _, + where_clause: 
_, + group_by: _, + window_clause: _, + } => todo!(), + sqlite3_parser::ast::OneSelect::Values(values) => {} + }, + InsertBody::DefaultValues => todo!("default values not yet supported"), + _ => todo!(), + } + program.emit_insn(Insn::EndCoroutine { yield_reg }); + + // open table + let table_name = &tbl_name.name; + + let table = match schema.get_table(table_name.0.as_str()) { + Some(table) => table, + None => crate::bail_corrupt_error!("Parse error: no such table: {}", table_name), + }; + let table = Rc::new(Table::BTree(table)); + let cursor_id = program.alloc_cursor_id( + Some(table_name.0.clone()), + Some(table.clone().deref().clone()), + ); + let root_page = match table.as_ref() { + Table::BTree(btree) => btree.root_page, + Table::Pseudo(_) => todo!(), + }; + program.emit_insn(Insn::OpenWriteAsync { + cursor_id, + root_page, + }); + program.emit_insn(Insn::OpenWriteAwait {}); + + program.emit_insn(Insn::Halt); + program.resolve_label(init_label, program.offset()); + program.emit_insn(Insn::Transaction); + program.emit_constant_insns(); + program.emit_insn(Insn::Goto { + target_pc: start_offset, + }); + program.resolve_deferred_labels(); + Ok(program.build()) +} diff --git a/core/translate/mod.rs b/core/translate/mod.rs index 53c288ab9..48ebe1746 100644 --- a/core/translate/mod.rs +++ b/core/translate/mod.rs @@ -8,6 +8,7 @@ //! will read rows from the database and filter them according to a WHERE clause. pub(crate) mod expr; +pub(crate) mod insert; pub(crate) mod select; pub(crate) mod where_clause; @@ -20,6 +21,7 @@ use crate::sqlite3_ondisk::{DatabaseHeader, MIN_PAGE_CACHE_SIZE}; use crate::util::normalize_ident; use crate::vdbe::{builder::ProgramBuilder, Insn, Program}; use crate::{bail_parse_error, Result}; +use insert::translate_insert; use select::{prepare_select, translate_select}; use sqlite3_parser::ast; @@ -49,7 +51,6 @@ pub fn translate( ast::Stmt::DropTable { .. } => bail_parse_error!("DROP TABLE not supported yet"), ast::Stmt::DropTrigger { .. 
} => bail_parse_error!("DROP TRIGGER not supported yet"), ast::Stmt::DropView { .. } => bail_parse_error!("DROP VIEW not supported yet"), - ast::Stmt::Insert { .. } => bail_parse_error!("INSERT not supported yet"), ast::Stmt::Pragma(name, body) => translate_pragma(&name, body, database_header, pager), ast::Stmt::Reindex { .. } => bail_parse_error!("REINDEX not supported yet"), ast::Stmt::Release(_) => bail_parse_error!("RELEASE not supported yet"), @@ -61,6 +62,22 @@ pub fn translate( } ast::Stmt::Update { .. } => bail_parse_error!("UPDATE not supported yet"), ast::Stmt::Vacuum(_, _) => bail_parse_error!("VACUUM not supported yet"), + ast::Stmt::Insert { + with, + or_conflict, + tbl_name, + columns, + body, + returning, + } => translate_insert( + schema, + &with, + &or_conflict, + &tbl_name, + &columns, + &body, + &returning, + ), } } diff --git a/core/vdbe/explain.rs b/core/vdbe/explain.rs index 8f29c360b..f13aaf00a 100644 --- a/core/vdbe/explain.rs +++ b/core/vdbe/explain.rs @@ -507,6 +507,137 @@ pub fn insn_to_str(program: &Program, addr: InsnReference, insn: &Insn, indent: 0, format!("r[{}]=func(r[{}..])", dest, start_reg), ), + Insn::InitCoroutine { + yield_reg, + jump_on_definition, + start_offset, + } => ( + "InitCoroutine", + *yield_reg as i32, + *jump_on_definition as i32, + *start_offset as i32, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), + Insn::EndCoroutine { yield_reg } => ( + "EndCoroutine", + *yield_reg as i32, + 0, + 0, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), + Insn::Yield { + yield_reg, + end_offset, + } => ( + "Yield", + *yield_reg as i32, + *end_offset as i32, + 0, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), + Insn::InsertAsync { + cursor, + key_reg, + record_reg, + flag, + } => ( + "InsertAsync", + *cursor as i32, + *record_reg as i32, + *key_reg as i32, + OwnedValue::Text(Rc::new("".to_string())), + *flag as u16, + format!(""), + ), + Insn::InsertAwait {} => ( + 
"InsertAwait", + 0, + 0, + 0, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), + Insn::NewRowid { reg } => ( + "NewRowId", + 0, + *reg as i32, + 0, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), + Insn::MustBeInt { reg } => ( + "MustBeInt", + *reg as i32, + 0, + 0, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), + Insn::SoftNull { reg } => ( + "SoftNull", + *reg as i32, + 0, + 0, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), + Insn::NotExists { + cursor, + rowid_reg, + target_pc, + } => ( + "NotExists", + *cursor as i32, + *target_pc as i32, + *rowid_reg as i32, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), + Insn::OpenWriteAsync { + cursor_id, + root_page, + } => ( + "OpenWriteAsync", + *cursor_id as i32, + *root_page as i32, + 0, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), + Insn::OpenWriteAwait {} => ( + "OpenWriteAsync", + 0, + 0, + 0, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), + Insn::Copy { + src_reg, + dst_reg, + amount, + } => ( + "Copy", + *src_reg as i32, + *dst_reg as i32, + *amount as i32, + OwnedValue::Text(Rc::new("".to_string())), + 0, + format!(""), + ), }; format!( "{:<4} {:<17} {:<4} {:<4} {:<4} {:<13} {:<2} {}", diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 587536bed..437d57523 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -279,6 +279,61 @@ pub enum Insn { dest: usize, // P3 func: ScalarFunc, // P4 }, + + InitCoroutine { + yield_reg: usize, + jump_on_definition: BranchOffset, + start_offset: BranchOffset, + }, + + EndCoroutine { + yield_reg: usize, + }, + + Yield { + yield_reg: usize, + end_offset: BranchOffset, + }, + + InsertAsync { + cursor: CursorID, + key_reg: usize, // Must be int. + record_reg: usize, // Blob of record data. + flag: usize, // Flags used by insert, for now not used. 
+ }, + + InsertAwait {}, + + NewRowid { + reg: usize, + }, + + MustBeInt { + reg: usize, + }, + + SoftNull { + reg: usize, + }, + + NotExists { + cursor: CursorID, + rowid_reg: usize, + target_pc: BranchOffset, + }, + + OpenWriteAsync { + cursor_id: CursorID, + root_page: PageIdx, + }, + + OpenWriteAwait {}, + + Copy { + src_reg: usize, + dst_reg: usize, + amount: usize, // 0 amount means we include src_reg, dst_reg..=dst_reg+amount = src_reg..=src_reg+amount + }, } // Index of insn in list of insns @@ -1198,6 +1253,45 @@ impl Program { state.pc += 1; } }, + Insn::InitCoroutine { + yield_reg, + jump_on_definition, + start_offset, + } => todo!(), + Insn::EndCoroutine { yield_reg } => todo!(), + Insn::Yield { + yield_reg, + end_offset, + } => todo!(), + Insn::InsertAsync { + cursor, + key_reg, + record_reg, + flag, + } => todo!(), + Insn::InsertAwait {} => todo!(), + Insn::NewRowid { reg } => todo!(), + Insn::MustBeInt { reg } => todo!(), + Insn::SoftNull { reg } => todo!(), + Insn::NotExists { + cursor, + rowid_reg, + target_pc, + } => todo!(), + Insn::OpenWriteAsync { + cursor_id, + root_page, + } => todo!(), + Insn::OpenWriteAwait {} => todo!(), + Insn::Copy { + src_reg, + dst_reg, + amount, + } => { + for i in 0..=*amount { + state.registers[*dst_reg + i] = state.registers[*src_reg + i].clone(); + } + } } } } From affe3443cc5dd963dc82c51d6b7872aed10f8c97 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sat, 20 Jul 2024 17:46:00 +0200 Subject: [PATCH 02/35] core: vbde coroutine generation with rowid insert --- core/schema.rs | 7 ++ core/translate/expr.rs | 8 +- core/translate/insert.rs | 146 +++++++++++++++++++++++++++------ core/translate/select.rs | 44 +++++++--- core/translate/where_clause.rs | 8 +- 5 files changed, 166 insertions(+), 47 deletions(-) diff --git a/core/schema.rs b/core/schema.rs index d91a8a9e8..b910a81e2 100644 --- a/core/schema.rs +++ b/core/schema.rs @@ -83,6 +83,13 @@ impl Table { Table::Pseudo(table) => &table.columns, } } + + pub fn 
has_rowid(&self) -> bool { + match self { + Table::BTree(table) => table.has_rowid, + Table::Pseudo(_) => todo!(), + } + } } impl PartialEq for Table { diff --git a/core/translate/expr.rs b/core/translate/expr.rs index 163357978..1d5c5ea6e 100644 --- a/core/translate/expr.rs +++ b/core/translate/expr.rs @@ -11,7 +11,7 @@ use crate::{ pub fn translate_expr( program: &mut ProgramBuilder, - select: &Select, + select: Option<&Select>, expr: &ast::Expr, target_register: usize, cursor_hint: Option, @@ -435,7 +435,7 @@ pub fn translate_expr( ast::Expr::Parenthesized(_) => todo!(), ast::Expr::Qualified(tbl, ident) => { let (idx, col_type, cursor_id, is_primary_key) = - resolve_ident_qualified(program, &tbl.0, &ident.0, select, cursor_hint)?; + resolve_ident_qualified(program, &tbl.0, &ident.0, select.unwrap(), cursor_hint)?; if is_primary_key { program.emit_insn(Insn::RowId { cursor_id, @@ -614,12 +614,12 @@ pub fn resolve_ident_qualified( pub fn resolve_ident_table( program: &ProgramBuilder, ident: &String, - select: &Select, + select: Option<&Select>, cursor_hint: Option, ) -> Result<(usize, Type, usize, bool)> { let ident = normalize_ident(ident); let mut found = Vec::new(); - for join in &select.src_tables { + for join in &select.unwrap().src_tables { match join.table { Table::BTree(ref table) => { let res = table diff --git a/core/translate/insert.rs b/core/translate/insert.rs index 851384a66..3a614ae4b 100644 --- a/core/translate/insert.rs +++ b/core/translate/insert.rs @@ -7,7 +7,7 @@ use sqlite3_parser::ast::{ use crate::Result; use crate::{ schema::{self, Schema, Table}, - translate::expr::resolve_ident_qualified, + translate::expr::{resolve_ident_qualified, translate_expr}, vdbe::{builder::ProgramBuilder, Insn, Program}, }; @@ -37,31 +37,6 @@ pub fn translate_insert( dbg!(returning); dbg!(with); dbg!(body); - - let yield_reg = program.alloc_register(); - let jump_on_definition_label = program.allocate_label(); - program.emit_insn(Insn::InitCoroutine { - 
yield_reg, - jump_on_definition: jump_on_definition_label, - start_offset: program.offset() + 1, - }); - match body { - InsertBody::Select(select, None) => match &select.body.select { - sqlite3_parser::ast::OneSelect::Select { - distinctness: _, - columns: _, - from: _, - where_clause: _, - group_by: _, - window_clause: _, - } => todo!(), - sqlite3_parser::ast::OneSelect::Values(values) => {} - }, - InsertBody::DefaultValues => todo!("default values not yet supported"), - _ => todo!(), - } - program.emit_insn(Insn::EndCoroutine { yield_reg }); - // open table let table_name = &tbl_name.name; @@ -78,12 +53,131 @@ pub fn translate_insert( Table::BTree(btree) => btree.root_page, Table::Pseudo(_) => todo!(), }; + + let mut num_cols = table.columns().len(); + if table.has_rowid() { + num_cols += 1; + } + // column_registers_start[0] == rowid if has rowid + let column_registers_start = program.alloc_registers(num_cols); + + // Coroutine for values + let yield_reg = program.alloc_register(); + let jump_on_definition_label = program.allocate_label(); + { + program.emit_insn_with_label_dependency( + Insn::InitCoroutine { + yield_reg, + jump_on_definition: jump_on_definition_label, + start_offset: program.offset() + 1, + }, + jump_on_definition_label, + ); + match body { + InsertBody::Select(select, None) => match &select.body.select { + sqlite3_parser::ast::OneSelect::Select { + distinctness: _, + columns: _, + from: _, + where_clause: _, + group_by: _, + window_clause: _, + } => todo!(), + sqlite3_parser::ast::OneSelect::Values(values) => { + for value in values { + for (col, expr) in value.iter().enumerate() { + let mut col = col; + if table.has_rowid() { + col += 1; + } + translate_expr( + &mut program, + None, + expr, + column_registers_start + col, + None, + )?; + } + program.emit_insn(Insn::Yield { + yield_reg, + end_offset: 0, + }); + } + } + }, + InsertBody::DefaultValues => todo!("default values not yet supported"), + _ => todo!(), + } + 
program.emit_insn(Insn::EndCoroutine { yield_reg }); + } + + program.resolve_label(jump_on_definition_label, program.offset()); program.emit_insn(Insn::OpenWriteAsync { cursor_id, root_page, }); program.emit_insn(Insn::OpenWriteAwait {}); + // Main loop + let record_register = program.alloc_register(); + let halt_label = program.allocate_label(); + program.emit_insn_with_label_dependency( + Insn::Yield { + yield_reg, + end_offset: halt_label, + }, + halt_label, + ); + + if table.has_rowid() { + let key_reg = column_registers_start + 1; + let row_id_reg = column_registers_start; + // copy key to rowid + program.emit_insn(Insn::Copy { + src_reg: key_reg, + dst_reg: row_id_reg, + amount: 0, + }); + program.emit_insn(Insn::SoftNull { reg: key_reg }); + + let notnull_label = program.allocate_label(); + program.emit_insn_with_label_dependency( + Insn::NotNull { + reg: row_id_reg, + target_pc: notnull_label, + }, + notnull_label, + ); + program.emit_insn(Insn::NewRowid { reg: row_id_reg }); + + program.resolve_label(notnull_label, program.offset()); + program.emit_insn(Insn::MustBeInt { reg: row_id_reg }); + let make_record_label = program.allocate_label(); + program.emit_insn_with_label_dependency( + Insn::NotExists { + cursor: cursor_id, + rowid_reg: row_id_reg, + target_pc: make_record_label, + }, + make_record_label, + ); + program.emit_insn(Insn::Halt); // Add error code 1555 and rollback + program.resolve_label(make_record_label, program.offset()); + program.emit_insn(Insn::MakeRecord { + start_reg: column_registers_start, + count: num_cols, + dest_reg: record_register, + }); + program.emit_insn(Insn::InsertAsync { + cursor: cursor_id, + key_reg: column_registers_start, + record_reg: record_register, + flag: 0, + }); + program.emit_insn(Insn::InsertAwait {}); + } + + program.resolve_label(halt_label, program.offset()); program.emit_insn(Insn::Halt); program.resolve_label(init_label, program.offset()); program.emit_insn(Insn::Transaction); diff --git 
a/core/translate/select.rs b/core/translate/select.rs index fe832dcaf..5e79039d5 100644 --- a/core/translate/select.rs +++ b/core/translate/select.rs @@ -274,7 +274,13 @@ pub fn translate_select(mut select: Select) -> Result { let limit_info = if let Some(limit) = &select.limit { assert!(limit.offset.is_none()); let target_register = program.alloc_register(); - let limit_reg = translate_expr(&mut program, &select, &limit.expr, target_register, None)?; + let limit_reg = translate_expr( + &mut program, + Some(&select), + &limit.expr, + target_register, + None, + )?; let num = if let ast::Expr::Literal(ast::Literal::Numeric(num)) = &limit.expr { num.parse::()? } else { @@ -326,7 +332,7 @@ pub fn translate_select(mut select: Select) -> Result { } else { &col.expr }; - translate_expr(&mut program, &select, sort_col_expr, target, None)?; + translate_expr(&mut program, Some(&select), sort_col_expr, target, None)?; } let (_, result_cols_count) = translate_columns(&mut program, &select, None)?; sort_info @@ -742,7 +748,7 @@ fn translate_column( cursor_hint, )?; } else { - let _ = translate_expr(program, select, expr, target_register, cursor_hint)?; + let _ = translate_expr(program, Some(select), expr, target_register, cursor_hint)?; } } ast::ResultColumn::Star => { @@ -807,7 +813,7 @@ fn translate_aggregation( } let expr = &args[0]; let expr_reg = program.alloc_register(); - let _ = translate_expr(program, select, expr, expr_reg, cursor_hint)?; + let _ = translate_expr(program, Some(select), expr, expr_reg, cursor_hint)?; program.emit_insn(Insn::AggStep { acc_reg: target_register, col: expr_reg, @@ -822,7 +828,7 @@ fn translate_aggregation( } else { let expr = &args[0]; let expr_reg = program.alloc_register(); - let _ = translate_expr(program, select, expr, expr_reg, cursor_hint); + let _ = translate_expr(program, Some(select), expr, expr_reg, cursor_hint); expr_reg }; program.emit_insn(Insn::AggStep { @@ -865,8 +871,14 @@ fn translate_aggregation( 
ast::Expr::Literal(ast::Literal::String(String::from("\",\""))); } - translate_expr(program, select, expr, expr_reg, cursor_hint)?; - translate_expr(program, select, &delimiter_expr, delimiter_reg, cursor_hint)?; + translate_expr(program, Some(select), expr, expr_reg, cursor_hint)?; + translate_expr( + program, + Some(select), + &delimiter_expr, + delimiter_reg, + cursor_hint, + )?; program.emit_insn(Insn::AggStep { acc_reg: target_register, @@ -883,7 +895,7 @@ fn translate_aggregation( } let expr = &args[0]; let expr_reg = program.alloc_register(); - let _ = translate_expr(program, select, expr, expr_reg, cursor_hint); + let _ = translate_expr(program, Some(select), expr, expr_reg, cursor_hint); program.emit_insn(Insn::AggStep { acc_reg: target_register, col: expr_reg, @@ -898,7 +910,7 @@ fn translate_aggregation( } let expr = &args[0]; let expr_reg = program.alloc_register(); - let _ = translate_expr(program, select, expr, expr_reg, cursor_hint); + let _ = translate_expr(program, Some(select), expr, expr_reg, cursor_hint); program.emit_insn(Insn::AggStep { acc_reg: target_register, col: expr_reg, @@ -932,8 +944,14 @@ fn translate_aggregation( _ => crate::bail_parse_error!("Incorrect delimiter parameter"), }; - translate_expr(program, select, expr, expr_reg, cursor_hint)?; - translate_expr(program, select, &delimiter_expr, delimiter_reg, cursor_hint)?; + translate_expr(program, Some(select), expr, expr_reg, cursor_hint)?; + translate_expr( + program, + Some(select), + &delimiter_expr, + delimiter_reg, + cursor_hint, + )?; program.emit_insn(Insn::AggStep { acc_reg: target_register, @@ -950,7 +968,7 @@ fn translate_aggregation( } let expr = &args[0]; let expr_reg = program.alloc_register(); - let _ = translate_expr(program, select, expr, expr_reg, cursor_hint)?; + let _ = translate_expr(program, Some(select), expr, expr_reg, cursor_hint)?; program.emit_insn(Insn::AggStep { acc_reg: target_register, col: expr_reg, @@ -965,7 +983,7 @@ fn translate_aggregation( } let 
expr = &args[0]; let expr_reg = program.alloc_register(); - let _ = translate_expr(program, select, expr, expr_reg, cursor_hint)?; + let _ = translate_expr(program, Some(select), expr, expr_reg, cursor_hint)?; program.emit_insn(Insn::AggStep { acc_reg: target_register, col: expr_reg, diff --git a/core/translate/where_clause.rs b/core/translate/where_clause.rs index 4b6e89acb..e4ec529ba 100644 --- a/core/translate/where_clause.rs +++ b/core/translate/where_clause.rs @@ -306,12 +306,12 @@ fn translate_condition_expr( ast::Expr::Binary(lhs, op, rhs) => { let lhs_reg = program.alloc_register(); let rhs_reg = program.alloc_register(); - let _ = translate_expr(program, select, lhs, lhs_reg, cursor_hint); + let _ = translate_expr(program, Some(select), lhs, lhs_reg, cursor_hint); match lhs.as_ref() { ast::Expr::Literal(_) => program.mark_last_insn_constant(), _ => {} } - let _ = translate_expr(program, select, rhs, rhs_reg, cursor_hint); + let _ = translate_expr(program, Some(select), rhs, rhs_reg, cursor_hint); match rhs.as_ref() { ast::Expr::Literal(_) => program.mark_last_insn_constant(), _ => {} @@ -657,9 +657,9 @@ fn translate_condition_expr( let pattern_reg = program.alloc_register(); let column_reg = program.alloc_register(); // LIKE(pattern, column). 
We should translate the pattern first before the column - let _ = translate_expr(program, select, rhs, pattern_reg, cursor_hint)?; + let _ = translate_expr(program, Some(select), rhs, pattern_reg, cursor_hint)?; program.mark_last_insn_constant(); - let _ = translate_expr(program, select, lhs, column_reg, cursor_hint)?; + let _ = translate_expr(program, Some(select), lhs, column_reg, cursor_hint)?; program.emit_insn(Insn::Function { func: ScalarFunc::Like, start_reg: pattern_reg, From 6357e88b46f99b73823286aa5750b69d15c3bd8e Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sun, 21 Jul 2024 08:56:28 +0200 Subject: [PATCH 03/35] core: implement vdbe opcodes minus newrowid --- core/btree.rs | 6 ++- core/pseudo.rs | 4 ++ core/translate/insert.rs | 4 +- core/types.rs | 1 + core/vdbe/explain.rs | 6 +-- core/vdbe/mod.rs | 86 +++++++++++++++++++++++++++++++++++----- core/vdbe/sorter.rs | 6 ++- 7 files changed, 96 insertions(+), 17 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index 41becb614..ae5b826ed 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -1,6 +1,6 @@ use crate::pager::Pager; use crate::sqlite3_ondisk::{BTreeCell, TableInteriorCell, TableLeafCell}; -use crate::types::{Cursor, CursorResult, OwnedRecord}; +use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue}; use crate::Result; use std::cell::{Ref, RefCell}; @@ -170,4 +170,8 @@ impl Cursor for BTreeCursor { fn get_null_flag(&self) -> bool { self.null_flag } + + fn exists(&mut self, key: &OwnedValue) -> Result { + unimplemented!() + } } diff --git a/core/pseudo.rs b/core/pseudo.rs index dfd3212ad..4d6b6e78a 100644 --- a/core/pseudo.rs +++ b/core/pseudo.rs @@ -62,4 +62,8 @@ impl Cursor for PseudoCursor { fn set_null_flag(&mut self, _null_flag: bool) { // Do nothing } + + fn exists(&mut self, key: &OwnedValue) -> Result { + todo!() + } } diff --git a/core/translate/insert.rs b/core/translate/insert.rs index 3a614ae4b..1d34cf4dc 100644 --- a/core/translate/insert.rs +++ 
b/core/translate/insert.rs @@ -174,7 +174,9 @@ pub fn translate_insert( record_reg: record_register, flag: 0, }); - program.emit_insn(Insn::InsertAwait {}); + program.emit_insn(Insn::InsertAwait { + cursor_id: cursor_id, + }); } program.resolve_label(halt_label, program.offset()); diff --git a/core/types.rs b/core/types.rs index 2562fc9de..1369aa358 100644 --- a/core/types.rs +++ b/core/types.rs @@ -316,6 +316,7 @@ pub trait Cursor { fn rowid(&self) -> Result>; fn record(&self) -> Result>>; fn insert(&mut self, record: &OwnedRecord) -> Result<()>; + fn exists(&mut self, key: &OwnedValue) -> Result; fn set_null_flag(&mut self, flag: bool); fn get_null_flag(&self) -> bool; } diff --git a/core/vdbe/explain.rs b/core/vdbe/explain.rs index f13aaf00a..242428170 100644 --- a/core/vdbe/explain.rs +++ b/core/vdbe/explain.rs @@ -555,9 +555,9 @@ pub fn insn_to_str(program: &Program, addr: InsnReference, insn: &Insn, indent: *flag as u16, format!(""), ), - Insn::InsertAwait {} => ( + Insn::InsertAwait { cursor_id } => ( "InsertAwait", - 0, + *cursor_id as i32, 0, 0, OwnedValue::Text(Rc::new("".to_string())), @@ -617,7 +617,7 @@ pub fn insn_to_str(program: &Program, addr: InsnReference, insn: &Insn, indent: format!(""), ), Insn::OpenWriteAwait {} => ( - "OpenWriteAsync", + "OpenWriteAwait", 0, 0, 0, diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 437d57523..567763552 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -302,7 +302,9 @@ pub enum Insn { flag: usize, // Flags used by insert, for now not used. 
}, - InsertAwait {}, + InsertAwait { + cursor_id: usize, + }, NewRowid { reg: usize, @@ -350,6 +352,7 @@ pub struct ProgramState { pub pc: BranchOffset, cursors: RefCell>>, registers: Vec, + ended_coroutine: bool, // flag to notify yield coroutine finished } impl ProgramState { @@ -361,6 +364,7 @@ impl ProgramState { pc: 0, cursors, registers, + ended_coroutine: false, } } @@ -1257,32 +1261,92 @@ impl Program { yield_reg, jump_on_definition, start_offset, - } => todo!(), - Insn::EndCoroutine { yield_reg } => todo!(), + } => { + state.registers[*yield_reg] = OwnedValue::Integer(*start_offset); + state.pc = *jump_on_definition; + } + Insn::EndCoroutine { yield_reg } => { + if let OwnedValue::Integer(pc) = state.registers[*yield_reg] { + state.ended_coroutine = true; + state.pc = pc; + } else { + unreachable!(); + } + } Insn::Yield { yield_reg, end_offset, - } => todo!(), + } => { + if let OwnedValue::Integer(pc) = state.registers[*yield_reg] { + if state.ended_coroutine { + state.pc = *end_offset; + } else { + // swap + (state.pc, state.registers[*yield_reg]) = + (pc, OwnedValue::Integer(state.pc)); + } + } else { + unreachable!(); + } + } Insn::InsertAsync { cursor, key_reg, record_reg, flag, - } => todo!(), - Insn::InsertAwait {} => todo!(), + } => { + let mut cursors = state.cursors.borrow_mut(); + let cursor = cursors.get_mut(cursor).unwrap(); + let record = match &state.registers[*record_reg] { + OwnedValue::Record(r) => r, + _ => unreachable!("Not a record! 
Cannot insert a non record value."), + }; + cursor.insert(record).unwrap(); + } + Insn::InsertAwait { cursor_id } => { + let cursor = cursors.get_mut(cursor_id).unwrap(); + cursor.wait_for_completion()?; + state.pc += 1; + } Insn::NewRowid { reg } => todo!(), - Insn::MustBeInt { reg } => todo!(), - Insn::SoftNull { reg } => todo!(), + Insn::MustBeInt { reg } => { + match state.registers[*reg] { + OwnedValue::Integer(_) => {} + _ => { + crate::bail_parse_error!( + "MustBeInt: the value in the register is not an integer" + ); + } + }; + state.pc += 1; + } + Insn::SoftNull { reg } => { + state.registers[*reg] = OwnedValue::Null; + state.pc += 1; + } Insn::NotExists { cursor, rowid_reg, target_pc, - } => todo!(), + } => { + let mut cursors = state.cursors.borrow_mut(); + let cursor = cursors.get_mut(cursor).unwrap(); + cursor.exists(&state.registers[*rowid_reg]); + } // TODO(pere): how is not exists implemented? We probably need to traverse keys by pointing the cursor. + // this cursor may be reused for next insert + // Update: tablemoveto is used to traverse on not exists, on insert depending on flags if nonseek it traverses again. + // If not there might be some optimizations obviously. 
Insn::OpenWriteAsync { cursor_id, root_page, - } => todo!(), - Insn::OpenWriteAwait {} => todo!(), + } => { + let cursor = Box::new(BTreeCursor::new(pager.clone(), *root_page)); + cursors.insert(*cursor_id, cursor); + state.pc += 1; + } + Insn::OpenWriteAwait {} => { + state.pc += 1; + } Insn::Copy { src_reg, dst_reg, diff --git a/core/vdbe/sorter.rs b/core/vdbe/sorter.rs index de4f90296..70cff752c 100644 --- a/core/vdbe/sorter.rs +++ b/core/vdbe/sorter.rs @@ -1,5 +1,5 @@ use crate::{ - types::{Cursor, CursorResult, OwnedRecord}, + types::{Cursor, CursorResult, OwnedRecord, OwnedValue}, Result, }; use std::{ @@ -93,4 +93,8 @@ impl Cursor for Sorter { fn get_null_flag(&self) -> bool { todo!(); } + + fn exists(&mut self, key: &OwnedValue) -> Result { + todo!() + } } From a441e5e302757ff74403ff4a6505763bc16e60a4 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 22 Jul 2024 13:24:40 +0200 Subject: [PATCH 04/35] core: new custom lru page cache --- core/btree.rs | 6 ++ core/pager.rs | 178 +++++++++++++++++++++++++++++++++++++++-- core/sqlite3_ondisk.rs | 15 ++-- 3 files changed, 187 insertions(+), 12 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index ae5b826ed..65c1d68e7 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -61,6 +61,7 @@ impl BTreeCursor { }; let page_idx = mem_page.page_idx; let page = self.pager.read_page(page_idx)?; + let page = page.borrow(); if page.is_locked() { return Ok(CursorResult::IO); } @@ -115,6 +116,11 @@ impl BTreeCursor { } } } + + fn move_to_root(&mut self) { + let root_page = self.pager.read_page(self.root_page).unwrap(); + let current_page = root_page; + } } impl Cursor for BTreeCursor { diff --git a/core/pager.rs b/core/pager.rs index de7a1d6bf..ecca1b566 100644 --- a/core/pager.rs +++ b/core/pager.rs @@ -1,11 +1,16 @@ +#![feature(box_into_raw_non_null)] use crate::buffer_pool::BufferPool; use crate::sqlite3_ondisk::BTreePage; use crate::sqlite3_ondisk::{self, DatabaseHeader}; use crate::{PageSource, Result}; use 
log::trace; use sieve_cache::SieveCache; +use std::borrow::BorrowMut; use std::cell::RefCell; +use std::collections::HashMap; use std::hash::Hash; +use std::mem; +use std::ptr::{drop_in_place, NonNull}; use std::rc::Rc; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, RwLock}; @@ -21,6 +26,8 @@ const PAGE_UPTODATE: usize = 0b001; const PAGE_LOCKED: usize = 0b010; /// Page had an I/O error. const PAGE_ERROR: usize = 0b100; +/// Page is marked dirty; the page cache does not evict dirty pages. +const PAGE_DIRTY: usize = 0b1000; impl Default for Page { fn default() -> Self { @@ -71,6 +78,166 @@ impl Page { pub fn clear_error(&self) { self.flags.fetch_and(!PAGE_ERROR, Ordering::SeqCst); } + + pub fn is_dirty(&self) -> bool { + self.flags.load(Ordering::SeqCst) & PAGE_DIRTY != 0 + } + + pub fn set_dirty(&self) { + self.flags.fetch_or(PAGE_DIRTY, Ordering::SeqCst); + } + + pub fn clear_dirty(&self) { + self.flags.fetch_and(!PAGE_DIRTY, Ordering::SeqCst); + } +} + +struct PageCacheEntry { + key: usize, + page: Rc>, + prev: Option>, + next: Option>, +} + +impl PageCacheEntry { + fn into_non_null(&mut self) -> NonNull { + NonNull::new(&mut *self).unwrap() + } + + unsafe fn from_non_null(ptr: NonNull) -> Box { + Box::from_raw(ptr.as_ptr()) + } +} + +struct DumbLruPageCache { + capacity: usize, + map: RefCell>>, + head: RefCell>>, + tail: RefCell>>, +} + +impl DumbLruPageCache { + pub fn new(capacity: usize) -> Self { + Self { + capacity: capacity, + map: RefCell::new(HashMap::new()), + head: RefCell::new(None), + tail: RefCell::new(None), + } + } + + pub fn insert(&mut self, key: usize, value: Rc>) { + self.delete(key); + let mut entry = Box::new(PageCacheEntry { + key: key, + next: None, + prev: None, + page: value, + }); + self.touch(&mut entry); + + if self.map.borrow().len() >= self.capacity { + self.pop_if_not_dirty(); + } + let b = Box::into_raw(entry); + let as_non_null = NonNull::new(b).unwrap(); + self.map.borrow_mut().insert(key, as_non_null); + } + + pub fn delete(&mut self, key: usize) 
{ + let ptr = self.map.borrow_mut().remove(&key); + if ptr.is_none() { + return; + } + let mut ptr = ptr.unwrap(); + { + let ptr = unsafe { ptr.as_mut() }; + self.detach(ptr); + } + unsafe { drop_in_place(ptr.as_ptr()) }; + } + + fn get_ptr(&mut self, key: usize) -> Option> { + let m = self.map.borrow_mut(); + let ptr = m.get(&key); + match ptr { + Some(v) => Some(*v), + None => None, + } + } + + pub fn get(&mut self, key: &usize) -> Option>> { + let ptr = self.get_ptr(*key); + if ptr.is_none() { + return None; + } + let ptr = unsafe { ptr.unwrap().as_mut() }; + let page = ptr.page.clone(); + self.detach(ptr); + self.touch(ptr); + return Some(page); + } + + pub fn resize(&mut self, capacity: usize) { + todo!(); + } + + fn detach(&mut self, entry: &mut PageCacheEntry) { + let mut current = entry.into_non_null(); + + let mut next = None; + let mut prev = None; + unsafe { + let c = current.as_mut(); + next = c.next; + prev = c.prev; + c.prev = None; + c.next = None; + } + + // detach + match (prev, next) { + (None, None) => {} + (None, Some(_)) => todo!(), + (Some(p), None) => { + self.tail = RefCell::new(Some(p)); + } + (Some(mut p), Some(mut n)) => unsafe { + let p_mut = p.as_mut(); + p_mut.next = Some(n); + let n_mut = n.as_mut(); + n_mut.prev = Some(p); + }, + }; + } + + fn touch(&mut self, entry: &mut PageCacheEntry) { + let mut current = entry.into_non_null(); + unsafe { + let c = current.as_mut(); + c.next = *self.head.borrow(); + } + + if let Some(mut head) = *self.head.borrow_mut() { + unsafe { + let head = head.as_mut(); + head.prev = Some(current); + } + } + } + + fn pop_if_not_dirty(&mut self) { + let tail = *self.tail.borrow(); + if tail.is_none() { + return; + } + let tail = unsafe { tail.unwrap().as_mut() }; + if tail.page.borrow().is_dirty() { + // TODO: drop from another clean entry? + return; + } + self.detach(tail); + } } pub struct PageCache { @@ -101,8 +268,7 @@ impl PageCache { pub struct Pager { /// Source of the database pages. 
pub page_source: PageSource, - /// Cache for storing loaded pages. - page_cache: RefCell>>, + page_cache: RefCell, /// Buffer pool for temporary data storage. buffer_pool: Rc, /// I/O interface for input/output operations. @@ -124,7 +290,7 @@ impl Pager { let db_header = db_header.borrow(); let page_size = db_header.page_size as usize; let buffer_pool = Rc::new(BufferPool::new(page_size)); - let page_cache = RefCell::new(PageCache::new(SieveCache::new(10).unwrap())); + let page_cache = RefCell::new(DumbLruPageCache::new(10)); Ok(Self { page_source, buffer_pool, @@ -134,14 +300,14 @@ impl Pager { } /// Reads a page from the database. - pub fn read_page(&self, page_idx: usize) -> Result> { + pub fn read_page(&self, page_idx: usize) -> crate::Result>> { trace!("read_page(page_idx = {})", page_idx); let mut page_cache = self.page_cache.borrow_mut(); if let Some(page) = page_cache.get(&page_idx) { return Ok(page.clone()); } - let page = Rc::new(Page::new()); - page.set_locked(); + let page = Rc::new(RefCell::new(Page::new())); + page.borrow().set_locked(); sqlite3_ondisk::begin_read_btree_page( &self.page_source, self.buffer_pool.clone(), diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 8a4216853..419500821 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -226,7 +226,7 @@ pub struct BTreePage { pub fn begin_read_btree_page( page_source: &PageSource, buffer_pool: Rc, - page: Rc, + page: Rc>, page_idx: usize, ) -> Result<()> { trace!("begin_read_btree_page(page_idx = {})", page_idx); @@ -239,7 +239,7 @@ pub fn begin_read_btree_page( let complete = Box::new(move |buf: &Buffer| { let page = page.clone(); if finish_read_btree_page(page_idx, buf, page.clone()).is_err() { - page.set_error(); + page.borrow_mut().set_error(); } }); let c = Rc::new(Completion::new(buf, complete)); @@ -247,7 +247,7 @@ pub fn begin_read_btree_page( Ok(()) } -fn finish_read_btree_page(page_idx: usize, buf: &Buffer, page: Rc) -> Result<()> { +fn 
finish_read_btree_page(page_idx: usize, buf: &Buffer, page: Rc>) -> Result<()> { trace!("finish_read_btree_page(page_idx = {})", page_idx); let mut pos = if page_idx == 1 { DATABASE_HEADER_SIZE @@ -281,9 +281,12 @@ fn finish_read_btree_page(page_idx: usize, buf: &Buffer, page: Rc) -> Resu cells.push(cell); } let inner = BTreePage { header, cells }; - page.contents.write().unwrap().replace(inner); - page.set_uptodate(); - page.clear_locked(); + { + let page = page.borrow_mut(); + page.contents.write().unwrap().replace(inner); + page.set_uptodate(); + page.clear_locked(); + } Ok(()) } From 6db82abf63b9020adb4a9eaba15770f646de7865 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 22 Jul 2024 19:35:45 +0200 Subject: [PATCH 05/35] core: move_to --- core/btree.rs | 83 ++++++++++++++++++++++++++++++++++++++++-- core/sqlite3_ondisk.rs | 11 ++++++ core/types.rs | 2 +- core/vdbe/mod.rs | 9 ++++- 4 files changed, 99 insertions(+), 6 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index 65c1d68e7..f453d9bab 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -118,8 +118,79 @@ impl BTreeCursor { } fn move_to_root(&mut self) { - let root_page = self.pager.read_page(self.root_page).unwrap(); - let current_page = root_page; + self.page + .replace(Some(Rc::new(MemPage::new(None, self.root_page, 0)))); + } + + pub fn move_to(&mut self, key: u64) -> Result> { + self.move_to_root(); + + loop { + let mem_page = { + let mem_page = self.page.borrow(); + let mem_page = mem_page.as_ref().unwrap(); + mem_page.clone() + }; + let page_idx = mem_page.page_idx; + let page = self.pager.read_page(page_idx)?; + let page = page.borrow(); + if page.is_locked() { + return Ok(CursorResult::IO); + } + let page = page.contents.read().unwrap(); + let page = page.as_ref().unwrap(); + if page.is_leaf() { + return Ok(CursorResult::Ok(())); + } + + let mut found_cell = false; + for cell in &page.cells { + match &cell { + BTreeCell::TableInteriorCell(TableInteriorCell { + _left_child_page, + 
_rowid, + }) => { + if key < *_rowid { + mem_page.advance(); + let mem_page = + MemPage::new(Some(mem_page.clone()), *_left_child_page as usize, 0); + self.page.replace(Some(Rc::new(mem_page))); + found_cell = true; + break; + } + } + BTreeCell::TableLeafCell(TableLeafCell { + _rowid: _, + _payload: _, + first_overflow_page: _, + }) => { + unreachable!( + "we don't iterate leaf cells while trying to move to a leaf cell" + ); + } + BTreeCell::IndexInteriorCell(_) => { + unimplemented!(); + } + BTreeCell::IndexLeafCell(_) => { + unimplemented!(); + } + } + } + + if !found_cell { + let parent = mem_page.parent.clone(); + match page.header.right_most_pointer { + Some(right_most_pointer) => { + let mem_page = MemPage::new(parent, right_most_pointer as usize, 0); + self.page.replace(Some(Rc::new(mem_page))); + continue; + } + None => { + unreachable!("we shall not go back up! The only way is down the slope") + } + } + } + } } } @@ -165,8 +236,12 @@ impl Cursor for BTreeCursor { Ok(self.record.borrow()) } - fn insert(&mut self, _record: &OwnedRecord) -> Result<()> { - unimplemented!() + fn insert(&mut self, key: &OwnedValue, _record: &OwnedRecord) -> Result> { + let int_key = match key { + OwnedValue::Integer(i) => i, + _ => unreachable!("btree tables are indexed by integers!"), + }; + self.move_to(*int_key as u64) } fn set_null_flag(&mut self, flag: bool) { diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 419500821..35f170475 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -223,6 +223,17 @@ pub struct BTreePage { pub cells: Vec, } +impl BTreePage { + pub fn is_leaf(&self) -> bool { + match self.header.page_type { + PageType::IndexInterior => false, + PageType::TableInterior => false, + PageType::IndexLeaf => true, + PageType::TableLeaf => true, + } + } +} + pub fn begin_read_btree_page( page_source: &PageSource, buffer_pool: Rc, diff --git a/core/types.rs b/core/types.rs index 1369aa358..c5b46778f 100644 --- a/core/types.rs +++ 
b/core/types.rs @@ -315,7 +315,7 @@ pub trait Cursor { fn wait_for_completion(&mut self) -> Result<()>; fn rowid(&self) -> Result>; fn record(&self) -> Result>>; - fn insert(&mut self, record: &OwnedRecord) -> Result<()>; + fn insert(&mut self, key: &OwnedValue, record: &OwnedRecord) -> Result>; fn exists(&mut self, key: &OwnedValue) -> Result; fn set_null_flag(&mut self, flag: bool); fn get_null_flag(&self) -> bool; diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 567763552..42994b101 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -1301,7 +1301,14 @@ impl Program { OwnedValue::Record(r) => r, _ => unreachable!("Not a record! Cannot insert a non record value."), }; - cursor.insert(record).unwrap(); + let key = &state.registers[*key_reg]; + match cursor.insert(key, record)? { + CursorResult::Ok(_) => {} + CursorResult::IO => { + // If there is I/O, the instruction is restarted. + return Ok(StepResult::IO); + } + } } Insn::InsertAwait { cursor_id } => { let cursor = cursors.get_mut(cursor_id).unwrap(); From a09f3485f98c50e7c3a92de7b9610a3cb5223ee0 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 22 Jul 2024 20:37:28 +0200 Subject: [PATCH 06/35] core: fix op generation --- core/translate/insert.rs | 5 +++++ core/vdbe/mod.rs | 23 ++++++++++++++++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/core/translate/insert.rs b/core/translate/insert.rs index 1d34cf4dc..70637ace1 100644 --- a/core/translate/insert.rs +++ b/core/translate/insert.rs @@ -121,6 +121,7 @@ pub fn translate_insert( // Main loop let record_register = program.alloc_register(); let halt_label = program.allocate_label(); + let loop_start_offset = program.offset(); program.emit_insn_with_label_dependency( Insn::Yield { yield_reg, @@ -179,6 +180,10 @@ pub fn translate_insert( }); } + program.emit_insn(Insn::Goto { + target_pc: loop_start_offset, + }); + program.resolve_label(halt_label, program.offset()); program.emit_insn(Insn::Halt); 
program.resolve_label(init_label, program.offset()); diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 42994b101..d638a302c 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -410,6 +410,8 @@ impl Program { loop { let insn = &self.insns[state.pc as usize]; trace_insn(self, state.pc as InsnReference, insn); + dbg!(state.pc); + dbg!(insn); let mut cursors = state.cursors.borrow_mut(); match insn { Insn::Init { target_pc } => { @@ -1068,7 +1070,8 @@ impl Program { OwnedValue::Record(record) => record, _ => unreachable!("SorterInsert on non-record register"), }; - cursor.insert(record)?; + // TODO: set correct key + cursor.insert(&OwnedValue::Integer(0), record)?; state.pc += 1; } Insn::SorterSort { @@ -1268,7 +1271,8 @@ impl Program { Insn::EndCoroutine { yield_reg } => { if let OwnedValue::Integer(pc) = state.registers[*yield_reg] { state.ended_coroutine = true; - state.pc = pc; + println!("jumping to {}", pc); + state.pc = pc - 1; // yield jump is always next to yield. Here we substract 1 to go back to yield instruction } else { unreachable!(); } @@ -1278,12 +1282,13 @@ impl Program { end_offset, } => { if let OwnedValue::Integer(pc) = state.registers[*yield_reg] { + println!("yield {} to {}", state.pc, pc); if state.ended_coroutine { state.pc = *end_offset; } else { // swap (state.pc, state.registers[*yield_reg]) = - (pc, OwnedValue::Integer(state.pc)); + (pc, OwnedValue::Integer(state.pc + 1)); } } else { unreachable!(); @@ -1295,7 +1300,6 @@ impl Program { record_reg, flag, } => { - let mut cursors = state.cursors.borrow_mut(); let cursor = cursors.get_mut(cursor).unwrap(); let record = match &state.registers[*record_reg] { OwnedValue::Record(r) => r, @@ -1303,7 +1307,9 @@ impl Program { }; let key = &state.registers[*key_reg]; match cursor.insert(key, record)? { - CursorResult::Ok(_) => {} + CursorResult::Ok(_) => { + state.pc += 1; + } CursorResult::IO => { // If there is I/O, the instruction is restarted. 
return Ok(StepResult::IO); @@ -1336,9 +1342,11 @@ impl Program { rowid_reg, target_pc, } => { - let mut cursors = state.cursors.borrow_mut(); let cursor = cursors.get_mut(cursor).unwrap(); - cursor.exists(&state.registers[*rowid_reg]); + match cursor.exists(&state.registers[*rowid_reg])? { + true => state.pc += 1, + false => state.pc = *target_pc, + }; } // TODO(pere): how is not exists implemented? We probably need to traverse keys my pointing cursor. // this cursor may be reused for next insert // Update: tablemoveto is used to travers on not exists, on insert depending on flags if nonseek it traverses again. @@ -1362,6 +1370,7 @@ impl Program { for i in 0..=*amount { state.registers[*dst_reg + i] = state.registers[*src_reg + i].clone(); } + state.pc += 1; } } } From bbf238a6a4d6f1f568858299f0557a5cd708b19a Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 22 Jul 2024 21:24:23 +0200 Subject: [PATCH 07/35] core/io: save raw buffers on page necessary for future lazy addressing of values and writes to page data --- core/io/mod.rs | 18 ++++++++++++------ core/sqlite3_ondisk.rs | 35 +++++++++++++++++++++++++---------- core/vdbe/mod.rs | 2 -- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/core/io/mod.rs b/core/io/mod.rs index 98d903adc..36d9b985a 100644 --- a/core/io/mod.rs +++ b/core/io/mod.rs @@ -1,7 +1,9 @@ use crate::Result; use cfg_block::cfg_block; +use std::fmt; use std::{ cell::{Ref, RefCell, RefMut}, + fmt::Debug, mem::ManuallyDrop, pin::Pin, rc::Rc, @@ -21,11 +23,11 @@ pub trait IO { fn run_once(&self) -> Result<()>; } -pub type Complete = dyn Fn(&Buffer); +pub type Complete = dyn Fn(Rc>); pub type WriteComplete = dyn Fn(usize); pub struct Completion { - pub buf: RefCell, + pub buf: Rc>, pub complete: Box, } @@ -34,8 +36,7 @@ pub struct WriteCompletion { } impl Completion { - pub fn new(buf: Buffer, complete: Box) -> Self { - let buf = RefCell::new(buf); + pub fn new(buf: Rc>, complete: Box) -> Self { Self { buf, complete } } @@ -48,8 
+49,7 @@ impl Completion { } pub fn complete(&self) { - let buf = self.buf.borrow_mut(); - (self.complete)(&buf); + (self.complete)(self.buf.clone()); } } @@ -72,6 +72,12 @@ pub struct Buffer { drop: BufferDropFn, } +impl Debug for Buffer { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?}", self.data) + } +} + impl Drop for Buffer { fn drop(&mut self) { let data = unsafe { ManuallyDrop::take(&mut self.data) }; diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 35f170475..e7ba274c4 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -31,6 +31,7 @@ use crate::types::{OwnedRecord, OwnedValue}; use crate::{PageSource, Result}; use log::trace; use std::cell::RefCell; +use std::ptr::NonNull; use std::rc::Rc; /// The size of the database header in bytes. @@ -70,10 +71,10 @@ pub struct DatabaseHeader { pub fn begin_read_database_header(page_source: &PageSource) -> Result>> { let drop_fn = Rc::new(|_buf| {}); - let buf = Buffer::allocate(512, drop_fn); + let buf = Rc::new(RefCell::new(Buffer::allocate(512, drop_fn))); let result = Rc::new(RefCell::new(DatabaseHeader::default())); let header = result.clone(); - let complete = Box::new(move |buf: &Buffer| { + let complete = Box::new(move |buf: Rc>| { let header = header.clone(); finish_read_database_header(buf, header).unwrap(); }); @@ -82,7 +83,11 @@ pub fn begin_read_database_header(page_source: &PageSource) -> Result>) -> Result<()> { +fn finish_read_database_header( + buf: Rc>, + header: Rc>, +) -> Result<()> { + let buf = buf.borrow(); let buf = buf.as_slice(); let mut header = std::cell::RefCell::borrow_mut(&header); header.magic.copy_from_slice(&buf[0..16]); @@ -123,9 +128,9 @@ pub fn begin_write_database_header(header: &DatabaseHeader, pager: &Pager) -> Re let buffer_to_copy_in_cb = buffer_to_copy.clone(); let header_cb = header.clone(); - let complete = Box::new(move |buffer: &Buffer| { + let complete = Box::new(move |buffer: Rc>| { let header = 
header_cb.clone(); - let buffer: Buffer = buffer.clone(); + let buffer: Buffer = buffer.borrow().clone(); let buffer = Rc::new(RefCell::new(buffer)); { let mut buf_mut = std::cell::RefCell::borrow_mut(&buffer); @@ -163,7 +168,7 @@ pub fn begin_write_database_header(header: &DatabaseHeader, pager: &Pager) -> Re }); let drop_fn = Rc::new(|_buf| {}); - let buf = Buffer::allocate(512, drop_fn); + let buf = Rc::new(RefCell::new(Buffer::allocate(512, drop_fn))); let c = Rc::new(Completion::new(buf.clone(), complete)); page_source.get(1, c.clone())?; // run get header block @@ -221,6 +226,7 @@ impl TryFrom for PageType { pub struct BTreePage { pub header: BTreePageHeader, pub cells: Vec, + pub buffer: Rc>, } impl BTreePage { @@ -246,8 +252,8 @@ pub fn begin_read_btree_page( let buffer_pool = buffer_pool.clone(); buffer_pool.put(buf); }); - let buf = Buffer::new(buf, drop_fn); - let complete = Box::new(move |buf: &Buffer| { + let buf = Rc::new(RefCell::new(Buffer::new(buf, drop_fn))); + let complete = Box::new(move |buf: Rc>| { let page = page.clone(); if finish_read_btree_page(page_idx, buf, page.clone()).is_err() { page.borrow_mut().set_error(); @@ -258,13 +264,18 @@ pub fn begin_read_btree_page( Ok(()) } -fn finish_read_btree_page(page_idx: usize, buf: &Buffer, page: Rc>) -> Result<()> { +fn finish_read_btree_page( + page_idx: usize, + buffer_ref: Rc>, + page: Rc>, +) -> Result<()> { trace!("finish_read_btree_page(page_idx = {})", page_idx); let mut pos = if page_idx == 1 { DATABASE_HEADER_SIZE } else { 0 }; + let buf = buffer_ref.borrow(); let buf = buf.as_slice(); let mut header = BTreePageHeader { page_type: buf[pos].try_into()?, @@ -291,7 +302,11 @@ fn finish_read_btree_page(page_idx: usize, buf: &Buffer, page: Rc> let cell = read_btree_cell(buf, &header.page_type, cell_pointer as usize)?; cells.push(cell); } - let inner = BTreePage { header, cells }; + let inner = BTreePage { + header, + cells, + buffer: buffer_ref.clone(), + }; { let page = page.borrow_mut(); 
page.contents.write().unwrap().replace(inner); diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index d638a302c..929b8c66a 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -410,8 +410,6 @@ impl Program { loop { let insn = &self.insns[state.pc as usize]; trace_insn(self, state.pc as InsnReference, insn); - dbg!(state.pc); - dbg!(insn); let mut cursors = state.cursors.borrow_mut(); match insn { Insn::Init { target_pc } => { From 4474317aa80eff54d30c414ae2cde10644c78f8b Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 24 Jul 2024 09:34:23 +0200 Subject: [PATCH 08/35] core: compute free space --- core/btree.rs | 103 +++++++++++++++++++++++++++++++++++++++-- core/sqlite3_ondisk.rs | 10 ++-- 2 files changed, 105 insertions(+), 8 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index f453d9bab..b9ccb89af 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -1,5 +1,5 @@ use crate::pager::Pager; -use crate::sqlite3_ondisk::{BTreeCell, TableInteriorCell, TableLeafCell}; +use crate::sqlite3_ondisk::{BTreeCell, BTreePage, TableInteriorCell, TableLeafCell}; use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue}; use crate::Result; @@ -192,6 +192,95 @@ impl BTreeCursor { } } } + + fn insert_to_page( + &mut self, + key: &OwnedValue, + _record: &OwnedRecord, + ) -> Result> { + let mem_page = { + let mem_page = self.page.borrow(); + let mem_page = mem_page.as_ref().unwrap(); + mem_page.clone() + }; + let page_idx = mem_page.page_idx; + let page = self.pager.read_page(page_idx)?; + let page = page.borrow(); + if page.is_locked() { + return Ok(CursorResult::IO); + } + + page.set_dirty(); + + let page = page.contents.read().unwrap(); + let page = page.as_ref().unwrap(); + + let free = self.compute_free_space(page); + dbg!(free); + + Ok(CursorResult::Ok(())) + } + + fn compute_free_space(&self, page: &BTreePage) -> u16 { + let buffer = page.buffer.borrow(); + let buf = buffer.as_slice(); + + let mut first_byte_in_cell_content = 
page.header._cell_content_area; + if first_byte_in_cell_content == 0 { + first_byte_in_cell_content = u16::MAX; + } + + let fragmented_free_bytes = page.header._num_frag_free_bytes; + let free_block_pointer = page.header._first_freeblock_offset; + let ncell = page.cells.len(); + + // 8 + 4 == header end + let first_cell = 8 + 4 + (2 * ncell) as u16; + + dbg!(first_byte_in_cell_content); + dbg!(fragmented_free_bytes); + let mut nfree = fragmented_free_bytes as usize + first_byte_in_cell_content as usize; + + dbg!(nfree); + let mut pc = free_block_pointer as usize; + if pc > 0 { + let mut next = 0; + let mut size = 0; + if pc < first_byte_in_cell_content as usize { + // corrupt + todo!("corrupted page"); + } + + loop { + // TODO: check corruption icellast + next = u16::from_be_bytes(buf[pc..pc + 2].try_into().unwrap()) as usize; + size = u16::from_be_bytes(buf[pc + 2..pc + 4].try_into().unwrap()) as usize; + nfree += size as usize; + if next <= pc + size + 3 { + break; + } + pc = next as usize; + } + + if next > 0 { + /* Freeblock not in ascending order */ + todo!("corrupted page ascending order"); + } + // if( pc+size>(unsigned int)usableSize ){ + // /* Last freeblock extends past page end */ + // todo!("corrupted page last freeblock extends last page end"); + // } + } + + // if( nFree>usableSize || nFreenFree = (u16)(nFree - iCellFirst); + + // don't count header and cell pointers? + nfree = nfree - first_cell as usize; + return nfree as u16; + } } impl Cursor for BTreeCursor { @@ -241,7 +330,15 @@ impl Cursor for BTreeCursor { OwnedValue::Integer(i) => i, _ => unreachable!("btree tables are indexed by integers!"), }; - self.move_to(*int_key as u64) + match self.move_to(*int_key as u64)? { + CursorResult::Ok(_) => {} + CursorResult::IO => return Ok(CursorResult::IO), + }; + + match self.insert_to_page(key, _record)? 
{ + CursorResult::Ok(_) => Ok(CursorResult::Ok(())), + CursorResult::IO => Ok(CursorResult::IO), + } } fn set_null_flag(&mut self, flag: bool) { @@ -253,6 +350,6 @@ impl Cursor for BTreeCursor { } fn exists(&mut self, key: &OwnedValue) -> Result { - unimplemented!() + Ok(false) } } diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index e7ba274c4..18e898384 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -191,11 +191,11 @@ pub fn begin_write_database_header(header: &DatabaseHeader, pager: &Pager) -> Re #[derive(Debug)] pub struct BTreePageHeader { - page_type: PageType, - _first_freeblock_offset: u16, - num_cells: u16, - _cell_content_area: u16, - _num_frag_free_bytes: u8, + pub(crate) page_type: PageType, + pub(crate) _first_freeblock_offset: u16, + pub(crate) num_cells: u16, + pub(crate) _cell_content_area: u16, + pub(crate) _num_frag_free_bytes: u8, pub(crate) right_most_pointer: Option, } From 661573f2bd09606e530c7491b9647ce3e01574e0 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 24 Jul 2024 12:11:26 +0200 Subject: [PATCH 09/35] core: write_varint --- core/sqlite3_ondisk.rs | 70 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 18e898384..0ef380443 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -592,6 +592,46 @@ fn read_varint(buf: &[u8]) -> Result<(u64, usize)> { Ok((v, 9)) } +fn write_varint(buf: &mut [u8], value: u64) -> usize { + if value <= 0x7f { + buf[0] = (value & 0x7f) as u8; + return 1; + } + + if value <= 0x3fff { + buf[0] = (((value >> 7) & 0x7f) | 0x80) as u8; + buf[1] = (value & 0x7f) as u8; + return 2; + } + + let mut value = value; + if (value & ((0xff000000_u64) << 32)) > 0 { + buf[8] = value as u8; + value >>= 8; + for i in (0..8).rev() { + buf[i] = ((value & 0x7f) | 0x80) as u8; + value >>= 7; + } + return 9; + } + + let mut encoded: [u8; 10] = [0; 10]; + let mut bytes = value; + let mut n = 
0; + while bytes != 0 { + let v = 0x80 | (bytes & 0x7f); + encoded[n] = v as u8; + bytes >>= 7; + n += 1; + } + encoded[0] &= 0x7f; + dbg!(encoded); + for i in 0..n { + buf[i] = encoded[n - 1 - i]; + } + return n; +} + #[cfg(test)] mod tests { use super::*; @@ -680,4 +720,34 @@ mod tests { let result = read_varint(&buf); assert!(result.is_err()); } + + // ** 0x00 becomes 0x00000000 + // ** 0x7f becomes 0x0000007f + // ** 0x81 0x00 becomes 0x00000080 + // ** 0x82 0x00 becomes 0x00000100 + // ** 0x80 0x7f becomes 0x0000007f + // ** 0x81 0x91 0xd1 0xac 0x78 becomes 0x12345678 + // ** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081 + #[rstest] + #[case((0, 1), &[0x00])] + #[case((1, 1), &[0x01])] + #[case((129, 2), &[0x81, 0x01] )] + #[case((16513, 3), &[0x81, 0x81, 0x01] )] + #[case((2113665, 4), &[0x81, 0x81, 0x81, 0x01] )] + #[case((270549121, 5), &[0x81, 0x81, 0x81, 0x81, 0x01] )] + #[case((34630287489, 6), &[0x81, 0x81, 0x81, 0x81, 0x81, 0x01] )] + #[case((4432676798593, 7), &[0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x01] )] + #[case((567382630219905, 8), &[0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x01] )] + #[case((145249953336295681, 9), &[0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x01] )] + fn test_write_varint(#[case] value: (u64, usize), #[case] output: &[u8]) { + let mut buf: [u8; 10] = [0; 10]; + let n = write_varint(&mut buf, value.0); + assert_eq!(n, value.1); + dbg!(value); + dbg!(buf); + dbg!(output); + for i in 0..output.len() { + assert_eq!(buf[i], output[i]); + } + } } From 1820761335f2721eb7a3b4c573f0147df0bb32a1 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 24 Jul 2024 12:58:19 +0200 Subject: [PATCH 10/35] core: serialize record --- core/sqlite3_ondisk.rs | 2 +- core/types.rs | 76 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 0ef380443..ad721c058 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -592,7 +592,7 
@@ fn read_varint(buf: &[u8]) -> Result<(u64, usize)> { Ok((v, 9)) } -fn write_varint(buf: &mut [u8], value: u64) -> usize { +pub fn write_varint(buf: &mut [u8], value: u64) -> usize { if value <= 0x7f { buf[0] = (value & 0x7f) as u8; return 1; diff --git a/core/types.rs b/core/types.rs index c5b46778f..2ced3f49d 100644 --- a/core/types.rs +++ b/core/types.rs @@ -1,9 +1,12 @@ use std::fmt::Display; +use std::io::Read; use std::{cell::Ref, rc::Rc}; use crate::error::LimboError; use crate::Result; +use crate::sqlite3_ondisk::write_varint; + #[derive(Debug, Clone, PartialEq)] pub enum Value<'a> { Null, @@ -301,6 +304,79 @@ impl OwnedRecord { pub fn new(values: Vec) -> Self { Self { values } } + + pub fn serialize(&self) -> Vec { + let mut buf: Vec = Vec::new(); + let mut header_bytes: usize = 0; + let mut buf_i = 0; + + let mut write_and_advance = |value: u64| { + if buf.len() - buf_i < 9 { + // ensure we have enough space for 9 bytes + buf.extend(std::iter::repeat(0).take(9)); + } + let n = write_varint(&mut buf.as_mut_slice()[buf_i..], value); + buf_i += n; + return n; + }; + + // calculate header_bytes and write serial types + for value in &self.values { + let n = match value { + OwnedValue::Null => write_and_advance(0), + OwnedValue::Integer(_) => write_and_advance(6), // for now let's only do i64 + OwnedValue::Float(_) => write_and_advance(7), + OwnedValue::Text(t) => write_and_advance((t.len() * 2 - 12) as u64), + OwnedValue::Blob(b) => write_and_advance((b.len() * 2 - 13) as u64), + // not serializable values + OwnedValue::Agg(_) => unreachable!(), + OwnedValue::Record(_) => unreachable!(), + }; + header_bytes += n; + } + + let mut write_and_advance_payload = |data: &[u8]| { + if buf.len() - buf_i < data.len() { + // ensure we have enough space for data + buf.extend(std::iter::repeat(0).take(data.len())); + } + let n = buf.as_mut_slice()[buf_i..].clone_from_slice(data); + buf_i += data.len(); + return n; + }; + // write content + for value in &self.values { + 
match value { + OwnedValue::Null => {} + OwnedValue::Integer(i) => { + write_and_advance_payload(&i.to_be_bytes()); + } + OwnedValue::Float(f) => { + write_and_advance_payload(&f.to_be_bytes()); + } + OwnedValue::Text(t) => { + write_and_advance_payload(t.as_bytes()); + } + OwnedValue::Blob(b) => { + write_and_advance_payload(b.as_slice()); + } + // non serializable + OwnedValue::Agg(_) => unreachable!(), + OwnedValue::Record(_) => unreachable!(), + }; + } + + let mut header_bytes_buf: Vec = Vec::new(); + header_bytes_buf.extend(std::iter::repeat(0).take(9)); + let n = write_varint(&mut header_bytes_buf.as_mut_slice(), header_bytes as u64); + buf.splice(0..0, header_bytes_buf.iter().cloned()); + buf_i += n; + + // cleanup extra extends + buf.truncate(buf_i); + + buf + } } pub enum CursorResult { From e6f8b34f2b6ffa6894e54bb710e23801ea37ea10 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 25 Jul 2024 01:19:40 +0200 Subject: [PATCH 11/35] core: insert_to_page almost complete --- core/btree.rs | 250 ++++++++++++++++++++++++++++++++++++--- core/pseudo.rs | 4 +- core/sqlite3_ondisk.rs | 5 +- core/translate/insert.rs | 6 +- core/translate/mod.rs | 12 +- core/translate/select.rs | 9 +- core/types.rs | 13 +- core/vdbe/builder.rs | 7 +- core/vdbe/mod.rs | 16 ++- core/vdbe/sorter.rs | 4 +- 10 files changed, 283 insertions(+), 43 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index b9ccb89af..a498adafe 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -1,5 +1,8 @@ use crate::pager::Pager; -use crate::sqlite3_ondisk::{BTreeCell, BTreePage, TableInteriorCell, TableLeafCell}; +use crate::sqlite3_ondisk::{ + read_varint, write_varint, BTreeCell, BTreePage, DatabaseHeader, PageType, TableInteriorCell, + TableLeafCell, +}; use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue}; use crate::Result; @@ -38,10 +41,15 @@ pub struct BTreeCursor { rowid: RefCell>, record: RefCell>, null_flag: bool, + database_header: Rc>, } impl BTreeCursor { - pub fn 
new(pager: Rc, root_page: usize) -> Self { + pub fn new( + pager: Rc, + root_page: usize, + database_header: Rc>, + ) -> Self { Self { pager, root_page, @@ -49,6 +57,7 @@ impl BTreeCursor { rowid: RefCell::new(None), record: RefCell::new(None), null_flag: false, + database_header, } } @@ -205,26 +214,210 @@ impl BTreeCursor { }; let page_idx = mem_page.page_idx; let page = self.pager.read_page(page_idx)?; - let page = page.borrow(); + let page = page.borrow_mut(); if page.is_locked() { return Ok(CursorResult::IO); } page.set_dirty(); - let page = page.contents.read().unwrap(); - let page = page.as_ref().unwrap(); + let mut page = page.contents.write().unwrap(); + let page = page.as_mut().unwrap(); - let free = self.compute_free_space(page); - dbg!(free); + let free = self.compute_free_space(page, self.database_header.borrow()); + + // find cell + let int_key = match key { + OwnedValue::Integer(i) => *i as u64, + _ => unreachable!("btree tables are indexed by integers!"), + }; + let mut cell_idx = 0; + for cell in &page.cells { + match cell { + BTreeCell::TableLeafCell(cell) => { + if int_key >= cell._rowid { + break; + } + } + _ => todo!(), + } + cell_idx += 1; + } + + // if overwrite drop cell + + // insert cell + assert!(page.header.page_type == PageType::TableLeaf); + let mut payload: Vec = Vec::new(); + + { + // Data len will be prepended later + // Key + let mut key_varint: Vec = Vec::new(); + key_varint.extend(std::iter::repeat(0).take(9)); + let n = write_varint(&mut key_varint.as_mut_slice()[0..9], int_key); + write_varint(&mut key_varint, int_key); + key_varint.truncate(n); + payload.extend_from_slice(&key_varint); + } + + // Data payload + let payload_size_before_record = payload.len(); + _record.serialize(&mut payload); + let data_size = payload.len() - payload_size_before_record; + payload[0..8].copy_from_slice(&(data_size as u64).to_be_bytes()); + + { + // Data len + let mut data_len_varint: Vec = Vec::new(); + 
data_len_varint.extend(std::iter::repeat(0).take(9)); + let n = write_varint(&mut data_len_varint.as_mut_slice()[0..9], int_key); + write_varint(&mut data_len_varint, int_key); + data_len_varint.truncate(n); + payload.splice(0..0, data_len_varint.iter().cloned()); + } + + if payload.len() + 2 > free as usize { + // overflow or balance + todo!("overflow/balance"); + } else { + // insert + let pc = self.allocate_cell_space(page, payload.len() as u16); + let mut buf = page.buffer.borrow_mut(); + let mut buf = buf.as_mut_slice(); + buf[pc as usize..pc as usize + payload.len()].copy_from_slice(&payload); + // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); + let pointer_area_pc_by_idx = 8 + 2 * cell_idx; + // move previous pointers forward and insert new pointer there + let n_cells_forward = 2 * (page.cells.len() - cell_idx); + buf.copy_within( + pointer_area_pc_by_idx..pointer_area_pc_by_idx + n_cells_forward, + pointer_area_pc_by_idx + 2, + ); + // TODo: refactor cells to be lazy loadable because this will be crazy slow + let mut payload_for_cell_in_memory: Vec = Vec::new(); + _record.serialize(&mut payload_for_cell_in_memory); + page.cells.insert( + cell_idx, + BTreeCell::TableLeafCell(TableLeafCell { + _rowid: int_key, + _payload: payload_for_cell_in_memory, + first_overflow_page: None, + }), + ); + + dbg!(pc); + } Ok(CursorResult::Ok(())) } - fn compute_free_space(&self, page: &BTreePage) -> u16 { + fn allocate_cell_space(&mut self, page_ref: &BTreePage, amount: u16) -> u16 { + let amount = amount as usize; + let mut page = page_ref.buffer.borrow_mut(); + let buf = page.as_mut_slice(); + + let cell_offset = 8; + let mut gap = cell_offset + 2 * page_ref.cells.len(); + let mut top = page_ref.header._cell_content_area as usize; + + dbg!(gap); + dbg!(top); + // there are free blocks and enough space + if page_ref.header._first_freeblock_offset != 0 && gap + 2 <= top { + // find slot + let db_header = self.database_header.borrow(); + let pc = find_free_cell(page_ref, 
db_header, amount, buf); + dbg!("found"); + dbg!(pc); + return pc as u16; + } + + if gap + 2 + amount as usize > top { + // defragment + self.defragment_page(page_ref, self.database_header.borrow()); + top = u16::from_be_bytes([buf[5], buf[6]]) as usize; + return 0; + } + + let db_header = self.database_header.borrow(); + top -= amount; + buf[5..7].copy_from_slice(&(top as u16).to_be_bytes()); + let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; + assert!(top + amount <= usable_space); + return top as u16; + } + + fn defragment_page(&self, page: &BTreePage, db_header: Ref) { + let cloned_page = page.clone(); + let usable_space = (db_header.page_size - db_header.unused_space as u16) as u64; + let mut cbrk = usable_space as u64; + + // TODO: implement fast algorithm + + let last_cell = (usable_space - 4) as u64; + let first_cell = cloned_page.header._cell_content_area as u64; + if cloned_page.cells.len() > 0 { + let buf = cloned_page.buffer.borrow(); + let buf = buf.as_slice(); + let mut write_buf = page.buffer.borrow_mut(); + let write_buf = write_buf.as_mut_slice(); + + for i in 0..cloned_page.cells.len() { + let cell_offset = 8; + let cell_idx = cell_offset + i * 2; + + let pc = u16::from_be_bytes([buf[cell_idx], buf[cell_idx + 1]]) as u64; + if pc > last_cell { + unimplemented!("corrupted page"); + } + + assert!(pc <= last_cell); + + let size = match read_varint(&buf[pc as usize..pc as usize + 9]) { + Ok(v) => v.0, + Err(_) => todo!( + "error while parsing varint from cell, probably treat this as corruption?" 
+ ), + }; + cbrk -= size; + if cbrk < first_cell as u64 || pc as u64 + size > usable_space as u64 { + todo!("corrupt"); + } + assert!(cbrk + size <= usable_space && cbrk >= first_cell); + // set new pointer + write_buf[cell_idx..cell_idx + 2].copy_from_slice(&cbrk.to_be_bytes()); + // copy payload + write_buf[cbrk as usize..cbrk as usize + size as usize] + .copy_from_slice(&buf[pc as usize..pc as usize + size as usize]); + } + } + + // assert!( nfree >= 0 ); + // if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){ + // return SQLITE_CORRUPT_PAGE(pPage); + // } + assert!(cbrk >= first_cell); + let mut write_buf = page.buffer.borrow_mut(); + let write_buf = write_buf.as_mut_slice(); + + // set new first byte of cell content + write_buf[5..7].copy_from_slice(&cbrk.to_be_bytes()); + // set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start + write_buf[1] = 0; + write_buf[2] = 0; + // set unused space to 0 + write_buf[first_cell as usize..first_cell as usize + cbrk as usize - first_cell as usize] + .fill(0); + } + + // Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte + // and end of cell pointer area. 
+ fn compute_free_space(&self, page: &BTreePage, db_header: Ref) -> u16 { let buffer = page.buffer.borrow(); let buf = buffer.as_slice(); + let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; let mut first_byte_in_cell_content = page.header._cell_content_area; if first_byte_in_cell_content == 0 { first_byte_in_cell_content = u16::MAX; @@ -237,11 +430,8 @@ impl BTreeCursor { // 8 + 4 == header end let first_cell = 8 + 4 + (2 * ncell) as u16; - dbg!(first_byte_in_cell_content); - dbg!(fragmented_free_bytes); let mut nfree = fragmented_free_bytes as usize + first_byte_in_cell_content as usize; - dbg!(nfree); let mut pc = free_block_pointer as usize; if pc > 0 { let mut next = 0; @@ -263,26 +453,50 @@ impl BTreeCursor { } if next > 0 { - /* Freeblock not in ascending order */ todo!("corrupted page ascending order"); } - // if( pc+size>(unsigned int)usableSize ){ - // /* Last freeblock extends past page end */ - // todo!("corrupted page last freeblock extends last page end"); - // } + + if pc + size > usable_space { + todo!("corrupted page last freeblock extends last page end"); + } } // if( nFree>usableSize || nFreenFree = (u16)(nFree - iCellFirst); - // don't count header and cell pointers? 
nfree = nfree - first_cell as usize; return nfree as u16; } } +fn find_free_cell( + page_ref: &BTreePage, + db_header: Ref, + amount: usize, + buf: &[u8], +) -> usize { + // NOTE: freelist is in ascending order of keys and pc + // unuse_space is reserved bytes at the end of page, therefore we must substract from maxpc + let mut pc = page_ref.header._first_freeblock_offset as usize; + let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; + let maxpc = (usable_space - amount as usize) as usize; + let mut found = false; + while pc <= maxpc { + let next = u16::from_be_bytes(buf[pc..pc + 2].try_into().unwrap()); + let size = u16::from_be_bytes(buf[pc + 2..pc + 4].try_into().unwrap()); + if amount <= size as usize { + found = true; + break; + } + pc = next as usize; + } + if !found { + unimplemented!("recover for fragmented space"); + } + pc +} + impl Cursor for BTreeCursor { fn is_empty(&self) -> bool { self.record.borrow().is_none() diff --git a/core/pseudo.rs b/core/pseudo.rs index 4d6b6e78a..92c69f7ee 100644 --- a/core/pseudo.rs +++ b/core/pseudo.rs @@ -50,9 +50,9 @@ impl Cursor for PseudoCursor { Ok(self.current.borrow()) } - fn insert(&mut self, record: &OwnedRecord) -> Result<()> { + fn insert(&mut self, key: &OwnedValue, record: &OwnedRecord) -> Result> { *self.current.borrow_mut() = Some(record.clone()); - Ok(()) + Ok(CursorResult::Ok(())) } fn get_null_flag(&self) -> bool { diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index ad721c058..9255257bd 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -48,7 +48,7 @@ pub struct DatabaseHeader { pub page_size: u16, write_version: u8, read_version: u8, - unused_space: u8, + pub unused_space: u8, max_embed_frac: u8, min_embed_frac: u8, min_leaf_frac: u8, @@ -194,6 +194,7 @@ pub struct BTreePageHeader { pub(crate) page_type: PageType, pub(crate) _first_freeblock_offset: u16, pub(crate) num_cells: u16, + // First byte of content area pub(crate) 
_cell_content_area: u16, pub(crate) _num_frag_free_bytes: u8, pub(crate) right_most_pointer: Option, @@ -573,7 +574,7 @@ pub fn read_value(buf: &[u8], serial_type: &SerialType) -> Result<(OwnedValue, u } } -fn read_varint(buf: &[u8]) -> Result<(u64, usize)> { +pub fn read_varint(buf: &[u8]) -> Result<(u64, usize)> { let mut v: u64 = 0; for i in 0..8 { match buf.get(i) { diff --git a/core/translate/insert.rs b/core/translate/insert.rs index 70637ace1..7ada31f75 100644 --- a/core/translate/insert.rs +++ b/core/translate/insert.rs @@ -1,4 +1,4 @@ -use std::{ops::Deref, rc::Rc}; +use std::{cell::RefCell, ops::Deref, rc::Rc}; use sqlite3_parser::ast::{ DistinctNames, InsertBody, Name, QualifiedName, ResolveType, ResultColumn, Select, With, @@ -7,6 +7,7 @@ use sqlite3_parser::ast::{ use crate::Result; use crate::{ schema::{self, Schema, Table}, + sqlite3_ondisk::DatabaseHeader, translate::expr::{resolve_ident_qualified, translate_expr}, vdbe::{builder::ProgramBuilder, Insn, Program}, }; @@ -19,6 +20,7 @@ pub fn translate_insert( columns: &Option, body: &InsertBody, returning: &Option>, + database_header: Rc>, ) -> Result { assert!(with.is_none()); assert!(or_conflict.is_none()); @@ -193,5 +195,5 @@ pub fn translate_insert( target_pc: start_offset, }); program.resolve_deferred_labels(); - Ok(program.build()) + Ok(program.build(database_header)) } diff --git a/core/translate/mod.rs b/core/translate/mod.rs index 48ebe1746..280cbc372 100644 --- a/core/translate/mod.rs +++ b/core/translate/mod.rs @@ -58,7 +58,7 @@ pub fn translate( ast::Stmt::Savepoint(_) => bail_parse_error!("SAVEPOINT not supported yet"), ast::Stmt::Select(select) => { let select = prepare_select(schema, &select)?; - translate_select(select) + translate_select(select, database_header) } ast::Stmt::Update { .. 
} => bail_parse_error!("UPDATE not supported yet"), ast::Stmt::Vacuum(_, _) => bail_parse_error!("VACUUM not supported yet"), @@ -77,6 +77,7 @@ pub fn translate( &columns, &body, &returning, + database_header, ), } } @@ -124,7 +125,12 @@ fn translate_pragma( }, _ => 0, }; - update_pragma(&name.name.0, value_to_update, database_header, pager); + update_pragma( + &name.name.0, + value_to_update, + database_header.clone(), + pager, + ); } Some(ast::PragmaBody::Call(_)) => { todo!() @@ -138,7 +144,7 @@ fn translate_pragma( target_pc: start_offset, }); program.resolve_deferred_labels(); - Ok(program.build()) + Ok(program.build(database_header)) } fn update_pragma(name: &str, value: i64, header: Rc>, pager: Rc) { diff --git a/core/translate/select.rs b/core/translate/select.rs index 5e79039d5..154b91be5 100644 --- a/core/translate/select.rs +++ b/core/translate/select.rs @@ -1,5 +1,6 @@ use crate::function::{AggFunc, Func}; use crate::schema::{Column, PseudoTable, Schema, Table}; +use crate::sqlite3_ondisk::DatabaseHeader; use crate::translate::expr::{analyze_columns, maybe_apply_affinity, translate_expr}; use crate::translate::where_clause::{ process_where, translate_processed_where, translate_tableless_where, ProcessedWhereClause, @@ -11,6 +12,7 @@ use crate::Result; use sqlite3_parser::ast::{self, JoinOperator, JoinType, ResultColumn}; +use std::cell::RefCell; use std::rc::Rc; /// A representation of a `SELECT` statement that has all the information @@ -235,7 +237,10 @@ pub fn prepare_select<'a>(schema: &Schema, select: &'a ast::Select) -> Result Result { +pub fn translate_select( + mut select: Select, + database_header: Rc>, +) -> Result { let mut program = ProgramBuilder::new(); let init_label = program.allocate_label(); let early_terminate_label = program.allocate_label(); @@ -423,7 +428,7 @@ pub fn translate_select(mut select: Select) -> Result { target_pc: start_offset, }); program.resolve_deferred_labels(); - Ok(program.build()) + 
Ok(program.build(database_header)) } fn emit_limit_insn(limit_info: &Option, program: &mut ProgramBuilder) { diff --git a/core/types.rs b/core/types.rs index 2ced3f49d..dd130a9f8 100644 --- a/core/types.rs +++ b/core/types.rs @@ -305,8 +305,7 @@ impl OwnedRecord { Self { values } } - pub fn serialize(&self) -> Vec { - let mut buf: Vec = Vec::new(); + pub fn serialize(&self, buf: &mut Vec) { let mut header_bytes: usize = 0; let mut buf_i = 0; @@ -315,7 +314,7 @@ impl OwnedRecord { // ensure we have enough space for 9 bytes buf.extend(std::iter::repeat(0).take(9)); } - let n = write_varint(&mut buf.as_mut_slice()[buf_i..], value); + let n = write_varint(&mut buf.as_mut_slice()[buf_i..buf_i + 9], value); buf_i += n; return n; }; @@ -326,8 +325,8 @@ impl OwnedRecord { OwnedValue::Null => write_and_advance(0), OwnedValue::Integer(_) => write_and_advance(6), // for now let's only do i64 OwnedValue::Float(_) => write_and_advance(7), - OwnedValue::Text(t) => write_and_advance((t.len() * 2 - 12) as u64), - OwnedValue::Blob(b) => write_and_advance((b.len() * 2 - 13) as u64), + OwnedValue::Text(t) => write_and_advance((t.len() * 2 + 13) as u64), + OwnedValue::Blob(b) => write_and_advance((b.len() * 2 + 12) as u64), // not serializable values OwnedValue::Agg(_) => unreachable!(), OwnedValue::Record(_) => unreachable!(), @@ -340,7 +339,7 @@ impl OwnedRecord { // ensure we have enough space for data buf.extend(std::iter::repeat(0).take(data.len())); } - let n = buf.as_mut_slice()[buf_i..].clone_from_slice(data); + let n = buf.as_mut_slice()[buf_i..buf_i + data.len()].clone_from_slice(data); buf_i += data.len(); return n; }; @@ -374,8 +373,6 @@ impl OwnedRecord { // cleanup extra extends buf.truncate(buf_i); - - buf } } diff --git a/core/vdbe/builder.rs b/core/vdbe/builder.rs index 382bc8731..c1ddb380c 100644 --- a/core/vdbe/builder.rs +++ b/core/vdbe/builder.rs @@ -1,3 +1,7 @@ +use std::{cell::RefCell, rc::Rc}; + +use crate::sqlite3_ondisk::DatabaseHeader; + use 
super::{BranchOffset, CursorID, Insn, InsnReference, Program, Table}; pub struct ProgramBuilder { @@ -281,7 +285,7 @@ impl ProgramBuilder { self.deferred_label_resolutions.clear(); } - pub fn build(self) -> Program { + pub fn build(self, database_header: Rc>) -> Program { assert!( self.deferred_label_resolutions.is_empty(), "deferred_label_resolutions is not empty when build() is called, did you forget to call resolve_deferred_labels()?" @@ -294,6 +298,7 @@ impl ProgramBuilder { max_registers: self.next_free_register, insns: self.insns, cursor_ref: self.cursor_ref, + database_header, } } } diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 929b8c66a..08a77c937 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -28,6 +28,7 @@ use crate::function::{AggFunc, ScalarFunc}; use crate::pager::Pager; use crate::pseudo::PseudoCursor; use crate::schema::Table; +use crate::sqlite3_ondisk::DatabaseHeader; use crate::types::{AggContext, Cursor, CursorResult, OwnedRecord, OwnedValue, Record}; use crate::Result; @@ -381,6 +382,7 @@ pub struct Program { pub max_registers: usize, pub insns: Vec, pub cursor_ref: Vec<(Option, Option)>, + pub database_header: Rc>, } impl Program { @@ -639,7 +641,11 @@ impl Program { cursor_id, root_page, } => { - let cursor = Box::new(BTreeCursor::new(pager.clone(), *root_page)); + let cursor = Box::new(BTreeCursor::new( + pager.clone(), + *root_page, + self.database_header.clone(), + )); cursors.insert(*cursor_id, cursor); state.pc += 1; } @@ -1056,7 +1062,7 @@ impl Program { }; state.registers[*dest_reg] = OwnedValue::Record(record.clone()); let sorter_cursor = cursors.get_mut(sorter_cursor).unwrap(); - sorter_cursor.insert(&record)?; + sorter_cursor.insert(&OwnedValue::Integer(0), &record)?; // fix key later state.pc += 1; } Insn::SorterInsert { @@ -1353,7 +1359,11 @@ impl Program { cursor_id, root_page, } => { - let cursor = Box::new(BTreeCursor::new(pager.clone(), *root_page)); + let cursor = Box::new(BTreeCursor::new( + pager.clone(), + 
*root_page, + self.database_header.clone(), + )); cursors.insert(*cursor_id, cursor); state.pc += 1; } diff --git a/core/vdbe/sorter.rs b/core/vdbe/sorter.rs index 70cff752c..951c56615 100644 --- a/core/vdbe/sorter.rs +++ b/core/vdbe/sorter.rs @@ -79,11 +79,11 @@ impl Cursor for Sorter { Ok(self.current.borrow()) } - fn insert(&mut self, record: &OwnedRecord) -> Result<()> { + fn insert(&mut self, key: &OwnedValue, record: &OwnedRecord) -> Result> { let key_fields = self.order.len(); let key = OwnedRecord::new(record.values[0..key_fields].to_vec()); self.insert(key, OwnedRecord::new(record.values[key_fields..].to_vec())); - Ok(()) + Ok(CursorResult::Ok(())) } fn set_null_flag(&mut self, _flag: bool) { From 7846a3b29c0454a88ac334cd00e7b92a7cc71525 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 25 Jul 2024 12:21:20 +0200 Subject: [PATCH 12/35] core: fix calculation of record header size varint --- core/btree.rs | 17 ++++----- core/sqlite3_ondisk.rs | 8 ++--- core/translate/insert.rs | 9 ++--- core/types.rs | 75 ++++++++++++++++------------------------ core/vdbe/mod.rs | 2 -- 5 files changed, 39 insertions(+), 72 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index a498adafe..f8b0e11e0 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -235,7 +235,7 @@ impl BTreeCursor { for cell in &page.cells { match cell { BTreeCell::TableLeafCell(cell) => { - if int_key >= cell._rowid { + if int_key <= cell._rowid { break; } } @@ -264,15 +264,16 @@ impl BTreeCursor { // Data payload let payload_size_before_record = payload.len(); _record.serialize(&mut payload); - let data_size = payload.len() - payload_size_before_record; - payload[0..8].copy_from_slice(&(data_size as u64).to_be_bytes()); + let header_size = payload.len() - payload_size_before_record; { // Data len let mut data_len_varint: Vec = Vec::new(); data_len_varint.extend(std::iter::repeat(0).take(9)); - let n = write_varint(&mut data_len_varint.as_mut_slice()[0..9], int_key); - write_varint(&mut 
data_len_varint, int_key); + let n = write_varint( + &mut data_len_varint.as_mut_slice()[0..9], + header_size as u64, + ); data_len_varint.truncate(n); payload.splice(0..0, data_len_varint.iter().cloned()); } @@ -305,8 +306,6 @@ impl BTreeCursor { first_overflow_page: None, }), ); - - dbg!(pc); } Ok(CursorResult::Ok(())) @@ -321,15 +320,11 @@ impl BTreeCursor { let mut gap = cell_offset + 2 * page_ref.cells.len(); let mut top = page_ref.header._cell_content_area as usize; - dbg!(gap); - dbg!(top); // there are free blocks and enough space if page_ref.header._first_freeblock_offset != 0 && gap + 2 <= top { // find slot let db_header = self.database_header.borrow(); let pc = find_free_cell(page_ref, db_header, amount, buf); - dbg!("found"); - dbg!(pc); return pc as u16; } diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 9255257bd..9cf03209a 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -479,8 +479,8 @@ pub fn read_record(payload: &[u8]) -> Result { } let mut values = Vec::with_capacity(serial_types.len()); for serial_type in &serial_types { - let (value, usize) = read_value(&payload[pos..], serial_type)?; - pos += usize; + let (value, n) = read_value(&payload[pos..], serial_type)?; + pos += n; values.push(value); } Ok(OwnedRecord::new(values)) @@ -626,7 +626,6 @@ pub fn write_varint(buf: &mut [u8], value: u64) -> usize { n += 1; } encoded[0] &= 0x7f; - dbg!(encoded); for i in 0..n { buf[i] = encoded[n - 1 - i]; } @@ -744,9 +743,6 @@ mod tests { let mut buf: [u8; 10] = [0; 10]; let n = write_varint(&mut buf, value.0); assert_eq!(n, value.1); - dbg!(value); - dbg!(buf); - dbg!(output); for i in 0..output.len() { assert_eq!(buf[i], output[i]); } diff --git a/core/translate/insert.rs b/core/translate/insert.rs index 7ada31f75..11a5a558e 100644 --- a/core/translate/insert.rs +++ b/core/translate/insert.rs @@ -34,11 +34,6 @@ pub fn translate_insert( ); let start_offset = program.offset(); - dbg!(tbl_name); - dbg!(columns); - 
dbg!(returning); - dbg!(with); - dbg!(body); // open table let table_name = &tbl_name.name; @@ -167,8 +162,8 @@ pub fn translate_insert( program.emit_insn(Insn::Halt); // Add error code 1555 and rollback program.resolve_label(make_record_label, program.offset()); program.emit_insn(Insn::MakeRecord { - start_reg: column_registers_start, - count: num_cols, + start_reg: column_registers_start + 1, + count: num_cols - 1, dest_reg: record_register, }); program.emit_insn(Insn::InsertAsync { diff --git a/core/types.rs b/core/types.rs index dd130a9f8..fc8c98413 100644 --- a/core/types.rs +++ b/core/types.rs @@ -306,59 +306,35 @@ impl OwnedRecord { } pub fn serialize(&self, buf: &mut Vec) { - let mut header_bytes: usize = 0; - let mut buf_i = 0; + let initial_i = buf.len(); - let mut write_and_advance = |value: u64| { - if buf.len() - buf_i < 9 { - // ensure we have enough space for 9 bytes - buf.extend(std::iter::repeat(0).take(9)); - } - let n = write_varint(&mut buf.as_mut_slice()[buf_i..buf_i + 9], value); - buf_i += n; - return n; - }; - - // calculate header_bytes and write serial types for value in &self.values { - let n = match value { - OwnedValue::Null => write_and_advance(0), - OwnedValue::Integer(_) => write_and_advance(6), // for now let's only do i64 - OwnedValue::Float(_) => write_and_advance(7), - OwnedValue::Text(t) => write_and_advance((t.len() * 2 + 13) as u64), - OwnedValue::Blob(b) => write_and_advance((b.len() * 2 + 12) as u64), + let serial_type = match value { + OwnedValue::Null => 0, + OwnedValue::Integer(_) => 6, // for now let's only do i64 + OwnedValue::Float(_) => 7, + OwnedValue::Text(t) => (t.len() * 2 + 13) as u64, + OwnedValue::Blob(b) => (b.len() * 2 + 12) as u64, // not serializable values OwnedValue::Agg(_) => unreachable!(), OwnedValue::Record(_) => unreachable!(), }; - header_bytes += n; + + buf.resize(buf.len() + 9, 0); // Ensure space for varint + let len = buf.len(); + let n = write_varint(&mut buf[len - 9..], serial_type); + 
buf.truncate(buf.len() - 9 + n); // Remove unused bytes } - let mut write_and_advance_payload = |data: &[u8]| { - if buf.len() - buf_i < data.len() { - // ensure we have enough space for data - buf.extend(std::iter::repeat(0).take(data.len())); - } - let n = buf.as_mut_slice()[buf_i..buf_i + data.len()].clone_from_slice(data); - buf_i += data.len(); - return n; - }; + let mut header_size = buf.len() - initial_i; // write content for value in &self.values { match value { OwnedValue::Null => {} - OwnedValue::Integer(i) => { - write_and_advance_payload(&i.to_be_bytes()); - } - OwnedValue::Float(f) => { - write_and_advance_payload(&f.to_be_bytes()); - } - OwnedValue::Text(t) => { - write_and_advance_payload(t.as_bytes()); - } - OwnedValue::Blob(b) => { - write_and_advance_payload(b.as_slice()); - } + OwnedValue::Integer(i) => buf.extend_from_slice(&i.to_be_bytes()), + OwnedValue::Float(f) => buf.extend_from_slice(&f.to_be_bytes()), + OwnedValue::Text(t) => buf.extend_from_slice(t.as_bytes()), + OwnedValue::Blob(b) => buf.extend_from_slice(b), // non serializable OwnedValue::Agg(_) => unreachable!(), OwnedValue::Record(_) => unreachable!(), @@ -366,13 +342,20 @@ impl OwnedRecord { } let mut header_bytes_buf: Vec = Vec::new(); + if header_size <= 126 { + // common case + header_size += 1; + } else { + todo!("calculate big header size extra bytes"); + // get header varint len + // header_size += n; + // if( nVarint { if let OwnedValue::Integer(pc) = state.registers[*yield_reg] { state.ended_coroutine = true; - println!("jumping to {}", pc); state.pc = pc - 1; // yield jump is always next to yield. 
Here we substract 1 to go back to yield instruction } else { unreachable!(); @@ -1286,7 +1285,6 @@ impl Program { end_offset, } => { if let OwnedValue::Integer(pc) = state.registers[*yield_reg] { - println!("yield {} to {}", state.pc, pc); if state.ended_coroutine { state.pc = *end_offset; } else { From 845a1ea175b6e441fa9a4566a0f7e7b5129db64e Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 25 Jul 2024 17:46:38 +0200 Subject: [PATCH 13/35] core: cacheflush and fix *Completion casting --- core/btree.rs | 20 +++++++++++--------- core/io/linux.rs | 10 +++++++--- core/io/mod.rs | 21 +++++++++++++++++---- core/lib.rs | 5 +++++ core/pager.rs | 29 ++++++++++++++++++++++++++--- core/sqlite3_ondisk.rs | 35 +++++++++++++++++++++++++++++------ core/storage.rs | 29 ++++++++++++++--------------- 7 files changed, 109 insertions(+), 40 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index f8b0e11e0..16020070b 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -6,6 +6,7 @@ use crate::sqlite3_ondisk::{ use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue}; use crate::Result; +use std::borrow::BorrowMut; use std::cell::{Ref, RefCell}; use std::rc::Rc; @@ -213,13 +214,14 @@ impl BTreeCursor { mem_page.clone() }; let page_idx = mem_page.page_idx; - let page = self.pager.read_page(page_idx)?; - let page = page.borrow_mut(); + let page_ref = self.pager.read_page(page_idx)?; + let page = page_ref.borrow(); if page.is_locked() { return Ok(CursorResult::IO); } page.set_dirty(); + self.pager.add_dirty(page_ref.clone()); let mut page = page.contents.write().unwrap(); let page = page.as_mut().unwrap(); @@ -284,8 +286,8 @@ impl BTreeCursor { } else { // insert let pc = self.allocate_cell_space(page, payload.len() as u16); - let mut buf = page.buffer.borrow_mut(); - let mut buf = buf.as_mut_slice(); + let mut buf_ref = RefCell::borrow_mut(&page.buffer); + let buf: &mut [u8] = buf_ref.as_mut_slice(); buf[pc as usize..pc as usize + 
payload.len()].copy_from_slice(&payload); // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); let pointer_area_pc_by_idx = 8 + 2 * cell_idx; @@ -313,11 +315,11 @@ impl BTreeCursor { fn allocate_cell_space(&mut self, page_ref: &BTreePage, amount: u16) -> u16 { let amount = amount as usize; - let mut page = page_ref.buffer.borrow_mut(); - let buf = page.as_mut_slice(); + let mut buf_ref = RefCell::borrow_mut(&page_ref.buffer); + let buf = buf_ref.as_mut_slice(); let cell_offset = 8; - let mut gap = cell_offset + 2 * page_ref.cells.len(); + let gap = cell_offset + 2 * page_ref.cells.len(); let mut top = page_ref.header._cell_content_area as usize; // there are free blocks and enough space @@ -355,7 +357,7 @@ impl BTreeCursor { if cloned_page.cells.len() > 0 { let buf = cloned_page.buffer.borrow(); let buf = buf.as_slice(); - let mut write_buf = page.buffer.borrow_mut(); + let mut write_buf = RefCell::borrow_mut(&page.buffer); let write_buf = write_buf.as_mut_slice(); for i in 0..cloned_page.cells.len() { @@ -393,7 +395,7 @@ impl BTreeCursor { // return SQLITE_CORRUPT_PAGE(pPage); // } assert!(cbrk >= first_cell); - let mut write_buf = page.buffer.borrow_mut(); + let mut write_buf = RefCell::borrow_mut(&page.buffer); let write_buf = write_buf.as_mut_slice(); // set new first byte of cell content diff --git a/core/io/linux.rs b/core/io/linux.rs index 71a5cd58d..3aafbf4ca 100644 --- a/core/io/linux.rs +++ b/core/io/linux.rs @@ -159,11 +159,15 @@ impl File for LinuxFile { } fn pread(&self, pos: usize, c: Rc) -> Result<()> { - trace!("pread(pos = {}, length = {})", pos, c.buf().len()); + let r = match &(*c) { + Completion::Read(r) => r, + Completion::Write(_) => unreachable!(), + }; + trace!("pread(pos = {}, length = {})", pos, r.buf().len()); let fd = io_uring::types::Fd(self.file.as_raw_fd()); let mut io = self.io.borrow_mut(); let read_e = { - let mut buf = c.buf_mut(); + let mut buf = r.buf_mut(); let len = buf.len(); let buf = buf.as_mut_ptr(); let ptr = 
Rc::into_raw(c.clone()); @@ -186,7 +190,7 @@ impl File for LinuxFile { &self, pos: usize, buffer: Rc>, - c: Rc, + c: Rc, ) -> Result<()> { let mut io = self.io.borrow_mut(); let fd = io_uring::types::Fd(self.file.as_raw_fd()); diff --git a/core/io/mod.rs b/core/io/mod.rs index 36d9b985a..cf8ef25a3 100644 --- a/core/io/mod.rs +++ b/core/io/mod.rs @@ -13,8 +13,7 @@ pub trait File { fn lock_file(&self, exclusive: bool) -> Result<()>; fn unlock_file(&self) -> Result<()>; fn pread(&self, pos: usize, c: Rc) -> Result<()>; - fn pwrite(&self, pos: usize, buffer: Rc>, c: Rc) - -> Result<()>; + fn pwrite(&self, pos: usize, buffer: Rc>, c: Rc) -> Result<()>; } pub trait IO { @@ -26,16 +25,30 @@ pub trait IO { pub type Complete = dyn Fn(Rc>); pub type WriteComplete = dyn Fn(usize); -pub struct Completion { +pub enum Completion { + Read(ReadCompletion), + Write(WriteCompletion), +} + +pub struct ReadCompletion { pub buf: Rc>, pub complete: Box, } +impl Completion { + pub fn complete(&self) { + match self { + Completion::Read(r) => r.complete(), + Completion::Write(w) => w.complete(234234), // fix + } + } +} + pub struct WriteCompletion { pub complete: Box, } -impl Completion { +impl ReadCompletion { pub fn new(buf: Rc>, complete: Box) -> Self { Self { buf, complete } } diff --git a/core/lib.rs b/core/lib.rs index 593e8765e..eefd2acde 100644 --- a/core/lib.rs +++ b/core/lib.rs @@ -201,6 +201,11 @@ impl Connection { } Ok(()) } + + pub fn cacheflush(&self) -> Result<()> { + self.pager.cacheflush()?; + Ok(()) + } } pub struct Statement { diff --git a/core/pager.rs b/core/pager.rs index ecca1b566..e38a266e5 100644 --- a/core/pager.rs +++ b/core/pager.rs @@ -18,6 +18,7 @@ use std::sync::{Arc, RwLock}; pub struct Page { flags: AtomicUsize, pub contents: RwLock>, + pub id: usize, } /// Page is up-to-date. 
@@ -31,15 +32,16 @@ const PAGE_DIRTY: usize = 0b1000; impl Default for Page { fn default() -> Self { - Self::new() + Self::new(0) } } impl Page { - pub fn new() -> Page { + pub fn new(id: usize) -> Page { Page { flags: AtomicUsize::new(0), contents: RwLock::new(None), + id, } } @@ -273,6 +275,7 @@ pub struct Pager { buffer_pool: Rc, /// I/O interface for input/output operations. pub io: Arc, + dirty_pages: Rc>>>>, } impl Pager { @@ -296,6 +299,7 @@ impl Pager { buffer_pool, page_cache, io, + dirty_pages: Rc::new(RefCell::new(Vec::new())), }) } @@ -306,7 +310,7 @@ impl Pager { if let Some(page) = page_cache.get(&page_idx) { return Ok(page.clone()); } - let page = Rc::new(RefCell::new(Page::new())); + let page = Rc::new(RefCell::new(Page::new(page_idx))); page.borrow().set_locked(); sqlite3_ondisk::begin_read_btree_page( &self.page_source, @@ -327,4 +331,23 @@ impl Pager { pub fn change_page_cache_size(&self, capacity: usize) { self.page_cache.borrow_mut().resize(capacity); } + + pub fn add_dirty(&self, page: Rc>) { + // TODO: cehck duplicates? 
+ let mut dirty_pages = RefCell::borrow_mut(&self.dirty_pages); + dirty_pages.push(page); + } + + pub fn cacheflush(&self) -> anyhow::Result<()> { + let mut dirty_pages = RefCell::borrow_mut(&self.dirty_pages); + loop { + if dirty_pages.len() == 0 { + break; + } + let page = dirty_pages.pop().unwrap(); + sqlite3_ondisk::begin_write_btree_page(self, &page)?; + self.io.run_once()?; + } + Ok(()) + } } diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 9cf03209a..471bf4be4 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -25,13 +25,12 @@ /// For more information, see: https://www.sqlite.org/fileformat.html use crate::buffer_pool::BufferPool; use crate::error::LimboError; -use crate::io::{Buffer, Completion, WriteCompletion}; +use crate::io::{Buffer, Completion, ReadCompletion, WriteCompletion}; use crate::pager::{Page, Pager}; use crate::types::{OwnedRecord, OwnedValue}; use crate::{PageSource, Result}; use log::trace; use std::cell::RefCell; -use std::ptr::NonNull; use std::rc::Rc; /// The size of the database header in bytes. 
@@ -78,7 +77,7 @@ pub fn begin_read_database_header(page_source: &PageSource) -> Result Re let drop_fn = Rc::new(|_buf| {}); let buf = Rc::new(RefCell::new(Buffer::allocate(512, drop_fn))); - let c = Rc::new(Completion::new(buf.clone(), complete)); + let c = Rc::new(Completion::Read(ReadCompletion::new(buf.clone(), complete))); page_source.get(1, c.clone())?; // run get header block pager.io.run_once()?; @@ -183,7 +182,7 @@ pub fn begin_write_database_header(header: &DatabaseHeader, pager: &Pager) -> Re } // finish_read_database_header(buf, header).unwrap(); }); - let c = Rc::new(WriteCompletion::new(write_complete)); + let c = Rc::new(Completion::Write(WriteCompletion::new(write_complete))); page_source.write(0, buffer_to_copy.clone(), c).unwrap(); Ok(()) @@ -260,7 +259,7 @@ pub fn begin_read_btree_page( page.borrow_mut().set_error(); } }); - let c = Rc::new(Completion::new(buf, complete)); + let c = Rc::new(Completion::Read(ReadCompletion::new(buf, complete))); page_source.get(page_idx, c.clone())?; Ok(()) } @@ -317,6 +316,30 @@ fn finish_read_btree_page( Ok(()) } +pub fn begin_write_btree_page(pager: &Pager, page: &Rc>) -> Result<()> { + let page_source = &pager.page_source; + let page = page.borrow(); + let contents = page.contents.read().unwrap(); + let contents = contents.as_ref().unwrap(); + let buffer = contents.buffer.clone(); + let write_complete = { + let buf_copy = buffer.clone(); + Box::new(move |bytes_written: usize| { + let buf_copy = buf_copy.clone(); + let buf_len = buf_copy.borrow().len(); + if bytes_written < buf_len { + log::error!("wrote({bytes_written}) less than expected({buf_len})"); + } + println!("done"); + // finish_read_database_header(buf, header).unwrap(); + }) + }; + dbg!(buffer.borrow().len()); + let c = Rc::new(Completion::Write(WriteCompletion::new(write_complete))); + page_source.write(page.id, buffer.clone(), c)?; + Ok(()) +} + #[derive(Debug)] pub enum BTreeCell { TableInteriorCell(TableInteriorCell), diff --git 
a/core/storage.rs b/core/storage.rs index bb62df0bd..3eed75797 100644 --- a/core/storage.rs +++ b/core/storage.rs @@ -39,7 +39,7 @@ impl PageSource { &self, page_idx: usize, buffer: Rc>, - c: Rc, + c: Rc, ) -> Result<()> { self.io.write(page_idx, buffer, c) } @@ -47,12 +47,7 @@ impl PageSource { pub trait PageIO { fn get(&self, page_idx: usize, c: Rc) -> Result<()>; - fn write( - &self, - page_idx: usize, - buffer: Rc>, - c: Rc, - ) -> Result<()>; + fn write(&self, page_idx: usize, buffer: Rc>, c: Rc) -> Result<()>; } #[cfg(feature = "fs")] @@ -63,7 +58,11 @@ struct FileStorage { #[cfg(feature = "fs")] impl PageIO for FileStorage { fn get(&self, page_idx: usize, c: Rc) -> Result<()> { - let size = c.buf().len(); + let r = match &(*c) { + Completion::Read(r) => r, + Completion::Write(_) => unreachable!(), + }; + let size = r.buf().len(); assert!(page_idx > 0); if size < 512 || size > 65536 || size & (size - 1) != 0 { return Err(LimboError::NotADB.into()); @@ -73,17 +72,17 @@ impl PageIO for FileStorage { Ok(()) } - fn write( - &self, - page_idx: usize, - buffer: Rc>, - c: Rc, - ) -> Result<()> { + fn write(&self, page_idx: usize, buffer: Rc>, c: Rc) -> Result<()> { + let w = match &(*c) { + Completion::Read(_) => unreachable!(), + Completion::Write(w) => w, + }; let buffer_size = buffer.borrow().len(); assert!(buffer_size >= 512); assert!(buffer_size <= 65536); assert!((buffer_size & (buffer_size - 1)) == 0); - self.file.pwrite(page_idx, buffer, c)?; + let pos = (page_idx - 1) * buffer_size; + self.file.pwrite(pos, buffer, c)?; Ok(()) } } From 037e2606829ad9ed296a4eef4560cd4d56baa732 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Thu, 25 Jul 2024 17:47:31 +0200 Subject: [PATCH 14/35] core: add cacheflush in cli query --- cli/main.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cli/main.rs b/cli/main.rs index 678bc4afd..bae51058e 100644 --- a/cli/main.rs +++ b/cli/main.rs @@ -277,5 +277,7 @@ fn query( eprintln!("{}", err); } } + // for now let's cache 
flush always + conn.cacheflush()?; Ok(()) } From 463292c2fee84ea7c755e4ec3d1588e2e6e74363 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 26 Jul 2024 12:44:37 +0200 Subject: [PATCH 15/35] core: fix rebase errors --- core/pager.rs | 2 +- core/vdbe/builder.rs | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/core/pager.rs b/core/pager.rs index e38a266e5..d39912083 100644 --- a/core/pager.rs +++ b/core/pager.rs @@ -338,7 +338,7 @@ impl Pager { dirty_pages.push(page); } - pub fn cacheflush(&self) -> anyhow::Result<()> { + pub fn cacheflush(&self) -> Result<()> { let mut dirty_pages = RefCell::borrow_mut(&self.dirty_pages); loop { if dirty_pages.len() == 0 { diff --git a/core/vdbe/builder.rs b/core/vdbe/builder.rs index c1ddb380c..d19c2f34c 100644 --- a/core/vdbe/builder.rs +++ b/core/vdbe/builder.rs @@ -250,6 +250,26 @@ impl ProgramBuilder { assert!(*pc_if_next < 0); *pc_if_next = to_offset; } + Insn::InitCoroutine { + yield_reg, + jump_on_definition, + start_offset, + } => { + *jump_on_definition = to_offset; + } + Insn::NotExists { + cursor, + rowid_reg, + target_pc, + } => { + *target_pc = to_offset; + } + Insn::Yield { + yield_reg, + end_offset, + } => { + *end_offset = to_offset; + } _ => { todo!("missing resolve_label for {:?}", insn); } From 84bf0ea96a856f9be34c8208acf30adef042a304 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 26 Jul 2024 12:56:59 +0200 Subject: [PATCH 16/35] core: remove a bunch of warnings Signed-off-by: Pere Diaz Bou --- core/btree.rs | 1 - core/io/linux.rs | 13 +++++++++---- core/pager.rs | 19 ++++++------------- core/pseudo.rs | 2 ++ core/sqlite3_ondisk.rs | 16 ++++++++-------- core/storage.rs | 10 +--------- core/translate/expr.rs | 2 +- core/translate/insert.rs | 10 +++++----- core/types.rs | 1 - core/vdbe/builder.rs | 10 +++++----- core/vdbe/mod.rs | 4 ++-- core/vdbe/sorter.rs | 2 ++ 12 files changed, 41 insertions(+), 49 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index 
16020070b..9c5d12b38 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -6,7 +6,6 @@ use crate::sqlite3_ondisk::{ use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue}; use crate::Result; -use std::borrow::BorrowMut; use std::cell::{Ref, RefCell}; use std::rc::Rc; diff --git a/core/io/linux.rs b/core/io/linux.rs index 3aafbf4ca..d5da77c52 100644 --- a/core/io/linux.rs +++ b/core/io/linux.rs @@ -1,5 +1,5 @@ -use super::{common, Completion, File, WriteCompletion, IO}; -use crate::{Result, LimboError}; +use super::{common, Completion, File, IO}; +use crate::{LimboError, Result}; use libc::{c_short, fcntl, flock, iovec, F_SETLK}; use log::{debug, trace}; use nix::fcntl::{FcntlArg, OFlag}; @@ -95,7 +95,10 @@ impl IO for LinuxIO { while let Some(cqe) = ring.completion().next() { let result = cqe.result(); if result < 0 { - return Err(LimboError::LinuxIOError(format!("{}", LinuxIOError::IOUringCQError(result)))); + return Err(LimboError::LinuxIOError(format!( + "{}", + LinuxIOError::IOUringCQError(result) + ))); } let c = unsafe { Rc::from_raw(cqe.user_data() as *const Completion) }; c.complete(); @@ -130,7 +133,9 @@ impl File for LinuxFile { if lock_result == -1 { let err = std::io::Error::last_os_error(); if err.kind() == std::io::ErrorKind::WouldBlock { - return Err(LimboError::LockingError("File is locked by another process".into())); + return Err(LimboError::LockingError( + "File is locked by another process".into(), + )); } else { return Err(LimboError::IOError(err)); } diff --git a/core/pager.rs b/core/pager.rs index d39912083..316dbe292 100644 --- a/core/pager.rs +++ b/core/pager.rs @@ -1,15 +1,12 @@ -#![feature(box_into_raw_non_null)] use crate::buffer_pool::BufferPool; use crate::sqlite3_ondisk::BTreePage; use crate::sqlite3_ondisk::{self, DatabaseHeader}; use crate::{PageSource, Result}; use log::trace; use sieve_cache::SieveCache; -use std::borrow::BorrowMut; use std::cell::RefCell; use std::collections::HashMap; use std::hash::Hash; -use std::mem; 
use std::ptr::{drop_in_place, NonNull}; use std::rc::Rc; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -105,10 +102,6 @@ impl PageCacheEntry { fn into_non_null(&mut self) -> NonNull { NonNull::new(&mut *self).unwrap() } - - unsafe fn from_non_null(ptr: NonNull) -> Box { - Box::from_raw(ptr.as_ptr()) - } } struct DumbLruPageCache { @@ -181,21 +174,21 @@ impl DumbLruPageCache { } pub fn resize(&mut self, capacity: usize) { + let _ = capacity; todo!(); } fn detach(&mut self, entry: &mut PageCacheEntry) { let mut current = entry.into_non_null(); - let mut next = None; - let mut prev = None; - unsafe { + let (next, prev) = unsafe { let c = current.as_mut(); - next = c.next; - prev = c.prev; + let next = c.next; + let prev = c.prev; c.prev = None; c.next = None; - } + (next, prev) + }; // detach match (prev, next) { diff --git a/core/pseudo.rs b/core/pseudo.rs index 92c69f7ee..ef35a9dc6 100644 --- a/core/pseudo.rs +++ b/core/pseudo.rs @@ -51,6 +51,7 @@ impl Cursor for PseudoCursor { } fn insert(&mut self, key: &OwnedValue, record: &OwnedRecord) -> Result> { + let _ = key; *self.current.borrow_mut() = Some(record.clone()); Ok(CursorResult::Ok(())) } @@ -64,6 +65,7 @@ impl Cursor for PseudoCursor { } fn exists(&mut self, key: &OwnedValue) -> Result { + let _ = key; todo!() } } diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 471bf4be4..528e4a1f2 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -188,7 +188,7 @@ pub fn begin_write_database_header(header: &DatabaseHeader, pager: &Pager) -> Re Ok(()) } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct BTreePageHeader { pub(crate) page_type: PageType, pub(crate) _first_freeblock_offset: u16, @@ -200,7 +200,7 @@ pub struct BTreePageHeader { } #[repr(u8)] -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Clone)] pub enum PageType { IndexInterior = 2, TableInterior = 5, @@ -222,7 +222,7 @@ impl TryFrom for PageType { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct 
BTreePage { pub header: BTreePageHeader, pub cells: Vec, @@ -340,7 +340,7 @@ pub fn begin_write_btree_page(pager: &Pager, page: &Rc>) -> Result Ok(()) } -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum BTreeCell { TableInteriorCell(TableInteriorCell), TableLeafCell(TableLeafCell), @@ -348,27 +348,27 @@ pub enum BTreeCell { IndexLeafCell(IndexLeafCell), } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct TableInteriorCell { pub _left_child_page: u32, pub _rowid: u64, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct TableLeafCell { pub _rowid: u64, pub _payload: Vec, pub first_overflow_page: Option, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct IndexInteriorCell { pub left_child_page: u32, pub payload: Vec, pub first_overflow_page: Option, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct IndexLeafCell { pub payload: Vec, pub first_overflow_page: Option, diff --git a/core/storage.rs b/core/storage.rs index 3eed75797..54cab056f 100644 --- a/core/storage.rs +++ b/core/storage.rs @@ -1,10 +1,6 @@ #[cfg(feature = "fs")] use crate::io::File; -use crate::{ - error::LimboError, - io::{Completion, WriteCompletion}, - Buffer, Result, -}; +use crate::{error::LimboError, io::Completion, Buffer, Result}; use std::{cell::RefCell, rc::Rc}; pub struct PageSource { @@ -73,10 +69,6 @@ impl PageIO for FileStorage { } fn write(&self, page_idx: usize, buffer: Rc>, c: Rc) -> Result<()> { - let w = match &(*c) { - Completion::Read(_) => unreachable!(), - Completion::Write(w) => w, - }; let buffer_size = buffer.borrow().len(); assert!(buffer_size >= 512); assert!(buffer_size <= 65536); diff --git a/core/translate/expr.rs b/core/translate/expr.rs index 1d5c5ea6e..deb81ecc0 100644 --- a/core/translate/expr.rs +++ b/core/translate/expr.rs @@ -3,7 +3,7 @@ use sqlite3_parser::ast::{self, Expr, UnaryOperator}; use crate::{ function::{Func, ScalarFunc}, - schema::{Schema, Table, Type}, + schema::{Table, Type}, translate::select::{ColumnInfo, Select, SrcTable}, 
util::normalize_ident, vdbe::{builder::ProgramBuilder, BranchOffset, Insn}, diff --git a/core/translate/insert.rs b/core/translate/insert.rs index 11a5a558e..a54c3a2ce 100644 --- a/core/translate/insert.rs +++ b/core/translate/insert.rs @@ -1,14 +1,14 @@ use std::{cell::RefCell, ops::Deref, rc::Rc}; use sqlite3_parser::ast::{ - DistinctNames, InsertBody, Name, QualifiedName, ResolveType, ResultColumn, Select, With, + DistinctNames, InsertBody, QualifiedName, ResolveType, ResultColumn, With, }; use crate::Result; use crate::{ - schema::{self, Schema, Table}, + schema::{Schema, Table}, sqlite3_ondisk::DatabaseHeader, - translate::expr::{resolve_ident_qualified, translate_expr}, + translate::expr::translate_expr, vdbe::{builder::ProgramBuilder, Insn, Program}, }; @@ -17,9 +17,9 @@ pub fn translate_insert( with: &Option, or_conflict: &Option, tbl_name: &QualifiedName, - columns: &Option, + _columns: &Option, body: &InsertBody, - returning: &Option>, + _returning: &Option>, database_header: Rc>, ) -> Result { assert!(with.is_none()); diff --git a/core/types.rs b/core/types.rs index fc8c98413..44f250cc5 100644 --- a/core/types.rs +++ b/core/types.rs @@ -1,5 +1,4 @@ use std::fmt::Display; -use std::io::Read; use std::{cell::Ref, rc::Rc}; use crate::error::LimboError; diff --git a/core/vdbe/builder.rs b/core/vdbe/builder.rs index d19c2f34c..101177a4d 100644 --- a/core/vdbe/builder.rs +++ b/core/vdbe/builder.rs @@ -251,21 +251,21 @@ impl ProgramBuilder { *pc_if_next = to_offset; } Insn::InitCoroutine { - yield_reg, + yield_reg: _, jump_on_definition, - start_offset, + start_offset: _, } => { *jump_on_definition = to_offset; } Insn::NotExists { - cursor, - rowid_reg, + cursor: _, + rowid_reg: _, target_pc, } => { *target_pc = to_offset; } Insn::Yield { - yield_reg, + yield_reg: _, end_offset, } => { *end_offset = to_offset; diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 6e9c5c3fe..6a2209acb 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -1300,7 +1300,7 @@ 
impl Program { cursor, key_reg, record_reg, - flag, + flag: _, } => { let cursor = cursors.get_mut(cursor).unwrap(); let record = match &state.registers[*record_reg] { @@ -1323,7 +1323,7 @@ impl Program { cursor.wait_for_completion()?; state.pc += 1; } - Insn::NewRowid { reg } => todo!(), + Insn::NewRowid { reg: _ } => todo!(), Insn::MustBeInt { reg } => { match state.registers[*reg] { OwnedValue::Integer(_) => {} diff --git a/core/vdbe/sorter.rs b/core/vdbe/sorter.rs index 951c56615..51bdc6b29 100644 --- a/core/vdbe/sorter.rs +++ b/core/vdbe/sorter.rs @@ -80,6 +80,7 @@ impl Cursor for Sorter { } fn insert(&mut self, key: &OwnedValue, record: &OwnedRecord) -> Result> { + let _ = key; let key_fields = self.order.len(); let key = OwnedRecord::new(record.values[0..key_fields].to_vec()); self.insert(key, OwnedRecord::new(record.values[key_fields..].to_vec())); @@ -95,6 +96,7 @@ impl Cursor for Sorter { } fn exists(&mut self, key: &OwnedValue) -> Result { + let _ = key; todo!() } } From cfeddeaadffdda02c95b0ead030f00a0ab275d7c Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 26 Jul 2024 14:00:24 +0200 Subject: [PATCH 17/35] core: fix payload serialization --- core/btree.rs | 14 +++++++++++++- core/types.rs | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index 9c5d12b38..fcd74b78c 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -248,7 +248,6 @@ impl BTreeCursor { // if overwrite drop cell // insert cell - assert!(page.header.page_type == PageType::TableLeaf); let mut payload: Vec = Vec::new(); { @@ -287,15 +286,28 @@ impl BTreeCursor { let pc = self.allocate_cell_space(page, payload.len() as u16); let mut buf_ref = RefCell::borrow_mut(&page.buffer); let buf: &mut [u8] = buf_ref.as_mut_slice(); + + // copy data buf[pc as usize..pc as usize + payload.len()].copy_from_slice(&payload); // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); let pointer_area_pc_by_idx = 8 + 2 * cell_idx; + // move previous pointers forward 
and insert new pointer there let n_cells_forward = 2 * (page.cells.len() - cell_idx); buf.copy_within( pointer_area_pc_by_idx..pointer_area_pc_by_idx + n_cells_forward, pointer_area_pc_by_idx + 2, ); + buf[pointer_area_pc_by_idx..pointer_area_pc_by_idx + 2] + .copy_from_slice(&pc.to_be_bytes()); + + // update first byte of content area + buf[5..7].copy_from_slice(&pc.to_be_bytes()); + + // update cell count + let new_n_cells = (page.cells.len() + 1) as u16; + buf[3..5].copy_from_slice(&new_n_cells.to_be_bytes()); + // TODo: refactor cells to be lazy loadable because this will be crazy slow let mut payload_for_cell_in_memory: Vec = Vec::new(); _record.serialize(&mut payload_for_cell_in_memory); diff --git a/core/types.rs b/core/types.rs index 44f250cc5..024541eae 100644 --- a/core/types.rs +++ b/core/types.rs @@ -354,7 +354,7 @@ impl OwnedRecord { header_bytes_buf.extend(std::iter::repeat(0).take(9)); let n = write_varint(&mut header_bytes_buf.as_mut_slice(), header_size as u64); header_bytes_buf.truncate(n); - buf.splice(0..0, header_bytes_buf.iter().cloned()); + buf.splice(initial_i..initial_i, header_bytes_buf.iter().cloned()); } } From d088640855c8bba435e64f4037d8c1d87825afde Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 26 Jul 2024 14:26:41 +0200 Subject: [PATCH 18/35] core: cqe result Signed-off-by: Pere Diaz Bou --- core/io/linux.rs | 2 +- core/io/mod.rs | 8 ++++---- core/sqlite3_ondisk.rs | 10 ++++------ 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/core/io/linux.rs b/core/io/linux.rs index d5da77c52..24621dd93 100644 --- a/core/io/linux.rs +++ b/core/io/linux.rs @@ -101,7 +101,7 @@ impl IO for LinuxIO { ))); } let c = unsafe { Rc::from_raw(cqe.user_data() as *const Completion) }; - c.complete(); + c.complete(cqe.result()); } Ok(()) } diff --git a/core/io/mod.rs b/core/io/mod.rs index cf8ef25a3..cb26bb832 100644 --- a/core/io/mod.rs +++ b/core/io/mod.rs @@ -23,7 +23,7 @@ pub trait IO { } pub type Complete = dyn Fn(Rc>); -pub type 
WriteComplete = dyn Fn(usize); +pub type WriteComplete = dyn Fn(i32); pub enum Completion { Read(ReadCompletion), @@ -36,10 +36,10 @@ pub struct ReadCompletion { } impl Completion { - pub fn complete(&self) { + pub fn complete(&self, result: i32) { match self { Completion::Read(r) => r.complete(), - Completion::Write(w) => w.complete(234234), // fix + Completion::Write(w) => w.complete(result), // fix } } } @@ -70,7 +70,7 @@ impl WriteCompletion { pub fn new(complete: Box) -> Self { Self { complete } } - pub fn complete(&self, bytes_written: usize) { + pub fn complete(&self, bytes_written: i32) { (self.complete)(bytes_written); } } diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 528e4a1f2..ad9a37b55 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -174,10 +174,10 @@ pub fn begin_write_database_header(header: &DatabaseHeader, pager: &Pager) -> Re pager.io.run_once()?; let buffer_in_cb = buffer_to_copy.clone(); - let write_complete = Box::new(move |bytes_written: usize| { + let write_complete = Box::new(move |bytes_written: i32| { let buf = buffer_in_cb.clone(); let buf_len = std::cell::RefCell::borrow(&buf).len(); - if bytes_written < buf_len { + if bytes_written < buf_len as i32 { log::error!("wrote({bytes_written}) less than expected({buf_len})"); } // finish_read_database_header(buf, header).unwrap(); @@ -324,14 +324,12 @@ pub fn begin_write_btree_page(pager: &Pager, page: &Rc>) -> Result let buffer = contents.buffer.clone(); let write_complete = { let buf_copy = buffer.clone(); - Box::new(move |bytes_written: usize| { + Box::new(move |bytes_written: i32| { let buf_copy = buf_copy.clone(); let buf_len = buf_copy.borrow().len(); - if bytes_written < buf_len { + if bytes_written < buf_len as i32 { log::error!("wrote({bytes_written}) less than expected({buf_len})"); } - println!("done"); - // finish_read_database_header(buf, header).unwrap(); }) }; dbg!(buffer.borrow().len()); From b6468f11e72dccb5f8f8893fcb1f16014d8d0a3f Mon 
Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 26 Jul 2024 14:33:58 +0200 Subject: [PATCH 19/35] core: clear dirty on finish write Signed-off-by: Pere Diaz Bou --- core/pager.rs | 7 +++++-- core/sqlite3_ondisk.rs | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/core/pager.rs b/core/pager.rs index 316dbe292..a2ec517f5 100644 --- a/core/pager.rs +++ b/core/pager.rs @@ -24,7 +24,7 @@ const PAGE_UPTODATE: usize = 0b001; const PAGE_LOCKED: usize = 0b010; /// Page had an I/O error. const PAGE_ERROR: usize = 0b100; -/// Page had an I/O error. +/// Page is dirty. Flush needed. const PAGE_DIRTY: usize = 0b1000; impl Default for Page { @@ -333,14 +333,17 @@ impl Pager { pub fn cacheflush(&self) -> Result<()> { let mut dirty_pages = RefCell::borrow_mut(&self.dirty_pages); + if dirty_pages.len() == 0 { + return Ok(()); + } loop { if dirty_pages.len() == 0 { break; } let page = dirty_pages.pop().unwrap(); sqlite3_ondisk::begin_write_btree_page(self, &page)?; - self.io.run_once()?; } + self.io.run_once()?; Ok(()) } } diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index ad9a37b55..d2ff1cac4 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -318,6 +318,7 @@ fn finish_read_btree_page( pub fn begin_write_btree_page(pager: &Pager, page: &Rc>) -> Result<()> { let page_source = &pager.page_source; + let page_finish = page.clone(); let page = page.borrow(); let contents = page.contents.read().unwrap(); let contents = contents.as_ref().unwrap(); @@ -327,6 +328,7 @@ pub fn begin_write_btree_page(pager: &Pager, page: &Rc>) -> Result Box::new(move |bytes_written: i32| { let buf_copy = buf_copy.clone(); let buf_len = buf_copy.borrow().len(); + page_finish.borrow_mut().clear_dirty(); if bytes_written < buf_len as i32 { log::error!("wrote({bytes_written}) less than expected({buf_len})"); } From 3b9f5aa51178b3abe3f7d589fe5dbd49a8130375 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 26 Jul 2024 14:52:38 +0200 Subject: [PATCH 
20/35] core: implement exists --- core/btree.rs | 72 +++++++++++++++++++++++++++++++++--------- core/pseudo.rs | 2 +- core/sqlite3_ondisk.rs | 1 - core/types.rs | 2 +- core/vdbe/mod.rs | 7 ++-- core/vdbe/sorter.rs | 2 +- 6 files changed, 64 insertions(+), 22 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index fcd74b78c..a0388f269 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -232,20 +232,9 @@ impl BTreeCursor { OwnedValue::Integer(i) => *i as u64, _ => unreachable!("btree tables are indexed by integers!"), }; - let mut cell_idx = 0; - for cell in &page.cells { - match cell { - BTreeCell::TableLeafCell(cell) => { - if int_key <= cell._rowid { - break; - } - } - _ => todo!(), - } - cell_idx += 1; - } + let cell_idx = find_cell(page, int_key); - // if overwrite drop cell + // TODO: if overwrite drop cell // insert cell let mut payload: Vec = Vec::new(); @@ -571,7 +560,60 @@ impl Cursor for BTreeCursor { self.null_flag } - fn exists(&mut self, key: &OwnedValue) -> Result { - Ok(false) + fn exists(&mut self, key: &OwnedValue) -> Result> { + let int_key = match key { + OwnedValue::Integer(i) => i, + _ => unreachable!("btree tables are indexed by integers!"), + }; + match self.move_to(*int_key as u64)? 
{ + CursorResult::Ok(_) => {} + CursorResult::IO => return Ok(CursorResult::IO), + }; + let mem_page = { + let mem_page = self.page.borrow(); + let mem_page = mem_page.as_ref().unwrap(); + mem_page.clone() + }; + let page_idx = mem_page.page_idx; + let page_ref = self.pager.read_page(page_idx)?; + let page = page_ref.borrow(); + if page.is_locked() { + return Ok(CursorResult::IO); + } + + let page = page.contents.read().unwrap(); + let page = page.as_ref().unwrap(); + + // find cell + let int_key = match key { + OwnedValue::Integer(i) => *i as u64, + _ => unreachable!("btree tables are indexed by integers!"), + }; + let cell_idx = find_cell(page, int_key); + if cell_idx >= page.cells.len() { + Ok(CursorResult::Ok(false)) + } else { + let equals = match &page.cells[cell_idx] { + BTreeCell::TableLeafCell(l) => l._rowid == int_key, + _ => unreachable!(), + }; + Ok(CursorResult::Ok(equals)) + } } } + +fn find_cell(page: &BTreePage, int_key: u64) -> usize { + let mut cell_idx = 0; + for cell in &page.cells { + match cell { + BTreeCell::TableLeafCell(cell) => { + if int_key <= cell._rowid { + break; + } + } + _ => todo!(), + } + cell_idx += 1; + } + cell_idx +} diff --git a/core/pseudo.rs b/core/pseudo.rs index ef35a9dc6..04881597d 100644 --- a/core/pseudo.rs +++ b/core/pseudo.rs @@ -64,7 +64,7 @@ impl Cursor for PseudoCursor { // Do nothing } - fn exists(&mut self, key: &OwnedValue) -> Result { + fn exists(&mut self, key: &OwnedValue) -> Result> { let _ = key; todo!() } diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index d2ff1cac4..d4834f465 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -334,7 +334,6 @@ pub fn begin_write_btree_page(pager: &Pager, page: &Rc>) -> Result } }) }; - dbg!(buffer.borrow().len()); let c = Rc::new(Completion::Write(WriteCompletion::new(write_complete))); page_source.write(page.id, buffer.clone(), c)?; Ok(()) diff --git a/core/types.rs b/core/types.rs index 024541eae..2eb6c9ef9 100644 --- a/core/types.rs +++ 
b/core/types.rs @@ -371,7 +371,7 @@ pub trait Cursor { fn rowid(&self) -> Result>; fn record(&self) -> Result>>; fn insert(&mut self, key: &OwnedValue, record: &OwnedRecord) -> Result>; - fn exists(&mut self, key: &OwnedValue) -> Result; + fn exists(&mut self, key: &OwnedValue) -> Result>; fn set_null_flag(&mut self, flag: bool); fn get_null_flag(&self) -> bool; } diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 6a2209acb..2fc2f8e17 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -1346,10 +1346,11 @@ impl Program { } => { let cursor = cursors.get_mut(cursor).unwrap(); match cursor.exists(&state.registers[*rowid_reg])? { - true => state.pc += 1, - false => state.pc = *target_pc, + CursorResult::Ok(true) => state.pc += 1, + CursorResult::Ok(false) => state.pc = *target_pc, + CursorResult::IO => return Ok(StepResult::IO), }; - } // TODO(pere): how is not exists implemented? We probably need to traverse keys my pointing cursor. + } // this cursor may be reused for next insert // Update: tablemoveto is used to travers on not exists, on insert depending on flags if nonseek it traverses again. // If not there might be some optimizations obviously. 
diff --git a/core/vdbe/sorter.rs b/core/vdbe/sorter.rs index 51bdc6b29..7ec6753c5 100644 --- a/core/vdbe/sorter.rs +++ b/core/vdbe/sorter.rs @@ -95,7 +95,7 @@ impl Cursor for Sorter { todo!(); } - fn exists(&mut self, key: &OwnedValue) -> Result { + fn exists(&mut self, key: &OwnedValue) -> Result> { let _ = key; todo!() } From 20dc068a9d97639528684bf858c8530ca427434d Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 26 Jul 2024 14:58:18 +0200 Subject: [PATCH 21/35] core: don't traverse twice --- core/btree.rs | 18 +++++++++++++----- core/pseudo.rs | 8 +++++++- core/types.rs | 7 ++++++- core/vdbe/mod.rs | 6 +++--- core/vdbe/sorter.rs | 8 +++++++- 5 files changed, 36 insertions(+), 11 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index a0388f269..6dd36b492 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -224,6 +224,7 @@ impl BTreeCursor { let mut page = page.contents.write().unwrap(); let page = page.as_mut().unwrap(); + assert!(matches!(page.header.page_type, PageType::TableLeaf)); let free = self.compute_free_space(page, self.database_header.borrow()); @@ -536,15 +537,22 @@ impl Cursor for BTreeCursor { Ok(self.record.borrow()) } - fn insert(&mut self, key: &OwnedValue, _record: &OwnedRecord) -> Result> { + fn insert( + &mut self, + key: &OwnedValue, + _record: &OwnedRecord, + moved_before: bool, /* Indicate whether it's necessary to traverse to find the leaf page */ + ) -> Result> { let int_key = match key { OwnedValue::Integer(i) => i, _ => unreachable!("btree tables are indexed by integers!"), }; - match self.move_to(*int_key as u64)? { - CursorResult::Ok(_) => {} - CursorResult::IO => return Ok(CursorResult::IO), - }; + if !moved_before { + match self.move_to(*int_key as u64)? { + CursorResult::Ok(_) => {} + CursorResult::IO => return Ok(CursorResult::IO), + }; + } match self.insert_to_page(key, _record)? 
{ CursorResult::Ok(_) => Ok(CursorResult::Ok(())), diff --git a/core/pseudo.rs b/core/pseudo.rs index 04881597d..3cb8e421c 100644 --- a/core/pseudo.rs +++ b/core/pseudo.rs @@ -50,8 +50,14 @@ impl Cursor for PseudoCursor { Ok(self.current.borrow()) } - fn insert(&mut self, key: &OwnedValue, record: &OwnedRecord) -> Result> { + fn insert( + &mut self, + key: &OwnedValue, + record: &OwnedRecord, + moved_before: bool, + ) -> Result> { let _ = key; + let _ = moved_before; *self.current.borrow_mut() = Some(record.clone()); Ok(CursorResult::Ok(())) } diff --git a/core/types.rs b/core/types.rs index 2eb6c9ef9..67d4162cc 100644 --- a/core/types.rs +++ b/core/types.rs @@ -370,7 +370,12 @@ pub trait Cursor { fn wait_for_completion(&mut self) -> Result<()>; fn rowid(&self) -> Result>; fn record(&self) -> Result>>; - fn insert(&mut self, key: &OwnedValue, record: &OwnedRecord) -> Result>; + fn insert( + &mut self, + key: &OwnedValue, + record: &OwnedRecord, + moved_before: bool, /* Tells inserter that it doesn't need to traverse in order to find leaf page */ + ) -> Result>; // fn exists(&mut self, key: &OwnedValue) -> Result>; fn set_null_flag(&mut self, flag: bool); fn get_null_flag(&self) -> bool; diff --git a/core/vdbe/mod.rs b/core/vdbe/mod.rs index 2fc2f8e17..5e68f93b3 100644 --- a/core/vdbe/mod.rs +++ b/core/vdbe/mod.rs @@ -1062,7 +1062,7 @@ impl Program { }; state.registers[*dest_reg] = OwnedValue::Record(record.clone()); let sorter_cursor = cursors.get_mut(sorter_cursor).unwrap(); - sorter_cursor.insert(&OwnedValue::Integer(0), &record)?; // fix key later + sorter_cursor.insert(&OwnedValue::Integer(0), &record, false)?; // fix key later state.pc += 1; } Insn::SorterInsert { @@ -1075,7 +1075,7 @@ impl Program { _ => unreachable!("SorterInsert on non-record register"), }; // TODO: set correct key - cursor.insert(&OwnedValue::Integer(0), record)?; + cursor.insert(&OwnedValue::Integer(0), record, false)?; state.pc += 1; } Insn::SorterSort { @@ -1308,7 +1308,7 @@ impl 
Program { _ => unreachable!("Not a record! Cannot insert a non record value."), }; let key = &state.registers[*key_reg]; - match cursor.insert(key, record)? { + match cursor.insert(key, record, true)? { CursorResult::Ok(_) => { state.pc += 1; } diff --git a/core/vdbe/sorter.rs b/core/vdbe/sorter.rs index 7ec6753c5..26704ae4b 100644 --- a/core/vdbe/sorter.rs +++ b/core/vdbe/sorter.rs @@ -79,8 +79,14 @@ impl Cursor for Sorter { Ok(self.current.borrow()) } - fn insert(&mut self, key: &OwnedValue, record: &OwnedRecord) -> Result> { + fn insert( + &mut self, + key: &OwnedValue, + record: &OwnedRecord, + moved_before: bool, + ) -> Result> { let _ = key; + let _ = moved_before; let key_fields = self.order.len(); let key = OwnedRecord::new(record.values[0..key_fields].to_vec()); self.insert(key, OwnedRecord::new(record.values[key_fields..].to_vec())); From 35c3fe7448c4c34f1cafa8435a9ddf60013f3243 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Fri, 26 Jul 2024 21:05:08 +0200 Subject: [PATCH 22/35] core: refactor page in memory representation --- core/btree.rs | 69 ++++++++++------------ core/pager.rs | 6 +- core/sqlite3_ondisk.rs | 130 +++++++++++++++++++++++++---------------- 3 files changed, 113 insertions(+), 92 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index 6dd36b492..91dbc1ca9 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -1,6 +1,6 @@ use crate::pager::Pager; use crate::sqlite3_ondisk::{ - read_varint, write_varint, BTreeCell, BTreePage, DatabaseHeader, PageType, TableInteriorCell, + read_varint, write_varint, BTreeCell, DatabaseHeader, PageContent, PageType, TableInteriorCell, TableLeafCell, }; use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue}; @@ -76,9 +76,9 @@ impl BTreeCursor { } let page = page.contents.read().unwrap(); let page = page.as_ref().unwrap(); - if mem_page.cell_idx() >= page.cells.len() { + if mem_page.cell_idx() >= page.cell_count() { let parent = mem_page.parent.clone(); - match page.header.right_most_pointer { 
+ match page.rightmost_pointer() { Some(right_most_pointer) => { let mem_page = MemPage::new(parent.clone(), right_most_pointer as usize, 0); self.page.replace(Some(Rc::new(mem_page))); @@ -95,7 +95,7 @@ impl BTreeCursor { }, } } - let cell = &page.cells[mem_page.cell_idx()]; + let cell = page.cell_get(mem_page.cell_idx())?; match &cell { BTreeCell::TableInteriorCell(TableInteriorCell { _left_child_page, @@ -153,8 +153,8 @@ impl BTreeCursor { } let mut found_cell = false; - for cell in &page.cells { - match &cell { + for cell_idx in 0..page.cell_count() { + match &page.cell_get(cell_idx)? { BTreeCell::TableInteriorCell(TableInteriorCell { _left_child_page, _rowid, @@ -188,7 +188,7 @@ impl BTreeCursor { if !found_cell { let parent = mem_page.parent.clone(); - match page.header.right_most_pointer { + match page.rightmost_pointer() { Some(right_most_pointer) => { let mem_page = MemPage::new(parent, right_most_pointer as usize, 0); self.page.replace(Some(Rc::new(mem_page))); @@ -224,7 +224,7 @@ impl BTreeCursor { let mut page = page.contents.write().unwrap(); let page = page.as_mut().unwrap(); - assert!(matches!(page.header.page_type, PageType::TableLeaf)); + assert!(matches!(page.page_type(), PageType::TableLeaf)); let free = self.compute_free_space(page, self.database_header.borrow()); @@ -283,7 +283,7 @@ impl BTreeCursor { let pointer_area_pc_by_idx = 8 + 2 * cell_idx; // move previous pointers forward and insert new pointer there - let n_cells_forward = 2 * (page.cells.len() - cell_idx); + let n_cells_forward = 2 * (page.cell_count() - cell_idx); buf.copy_within( pointer_area_pc_by_idx..pointer_area_pc_by_idx + n_cells_forward, pointer_area_pc_by_idx + 2, @@ -295,36 +295,27 @@ impl BTreeCursor { buf[5..7].copy_from_slice(&pc.to_be_bytes()); // update cell count - let new_n_cells = (page.cells.len() + 1) as u16; + let new_n_cells = (page.cell_count() + 1) as u16; buf[3..5].copy_from_slice(&new_n_cells.to_be_bytes()); - // TODo: refactor cells to be lazy loadable 
because this will be crazy slow let mut payload_for_cell_in_memory: Vec = Vec::new(); _record.serialize(&mut payload_for_cell_in_memory); - page.cells.insert( - cell_idx, - BTreeCell::TableLeafCell(TableLeafCell { - _rowid: int_key, - _payload: payload_for_cell_in_memory, - first_overflow_page: None, - }), - ); } Ok(CursorResult::Ok(())) } - fn allocate_cell_space(&mut self, page_ref: &BTreePage, amount: u16) -> u16 { + fn allocate_cell_space(&mut self, page_ref: &PageContent, amount: u16) -> u16 { let amount = amount as usize; let mut buf_ref = RefCell::borrow_mut(&page_ref.buffer); let buf = buf_ref.as_mut_slice(); let cell_offset = 8; - let gap = cell_offset + 2 * page_ref.cells.len(); - let mut top = page_ref.header._cell_content_area as usize; + let gap = cell_offset + 2 * page_ref.cell_count(); + let mut top = page_ref.cell_content_area() as usize; // there are free blocks and enough space - if page_ref.header._first_freeblock_offset != 0 && gap + 2 <= top { + if page_ref.first_freeblock() != 0 && gap + 2 <= top { // find slot let db_header = self.database_header.borrow(); let pc = find_free_cell(page_ref, db_header, amount, buf); @@ -346,7 +337,7 @@ impl BTreeCursor { return top as u16; } - fn defragment_page(&self, page: &BTreePage, db_header: Ref) { + fn defragment_page(&self, page: &PageContent, db_header: Ref) { let cloned_page = page.clone(); let usable_space = (db_header.page_size - db_header.unused_space as u16) as u64; let mut cbrk = usable_space as u64; @@ -354,14 +345,14 @@ impl BTreeCursor { // TODO: implement fast algorithm let last_cell = (usable_space - 4) as u64; - let first_cell = cloned_page.header._cell_content_area as u64; - if cloned_page.cells.len() > 0 { + let first_cell = cloned_page.cell_content_area() as u64; + if cloned_page.cell_count() > 0 { let buf = cloned_page.buffer.borrow(); let buf = buf.as_slice(); let mut write_buf = RefCell::borrow_mut(&page.buffer); let write_buf = write_buf.as_mut_slice(); - for i in 
0..cloned_page.cells.len() { + for i in 0..cloned_page.cell_count() { let cell_offset = 8; let cell_idx = cell_offset + i * 2; @@ -411,19 +402,19 @@ impl BTreeCursor { // Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte // and end of cell pointer area. - fn compute_free_space(&self, page: &BTreePage, db_header: Ref) -> u16 { + fn compute_free_space(&self, page: &PageContent, db_header: Ref) -> u16 { let buffer = page.buffer.borrow(); let buf = buffer.as_slice(); let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; - let mut first_byte_in_cell_content = page.header._cell_content_area; + let mut first_byte_in_cell_content = page.cell_content_area(); if first_byte_in_cell_content == 0 { first_byte_in_cell_content = u16::MAX; } - let fragmented_free_bytes = page.header._num_frag_free_bytes; - let free_block_pointer = page.header._first_freeblock_offset; - let ncell = page.cells.len(); + let fragmented_free_bytes = page.num_frag_free_bytes(); + let free_block_pointer = page.first_freeblock(); + let ncell = page.cell_count(); // 8 + 4 == header end let first_cell = 8 + 4 + (2 * ncell) as u16; @@ -469,14 +460,14 @@ impl BTreeCursor { } fn find_free_cell( - page_ref: &BTreePage, + page_ref: &PageContent, db_header: Ref, amount: usize, buf: &[u8], ) -> usize { // NOTE: freelist is in ascending order of keys and pc // unuse_space is reserved bytes at the end of page, therefore we must substract from maxpc - let mut pc = page_ref.header._first_freeblock_offset as usize; + let mut pc = page_ref.first_freeblock() as usize; let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; let maxpc = (usable_space - amount as usize) as usize; let mut found = false; @@ -598,10 +589,10 @@ impl Cursor for BTreeCursor { _ => unreachable!("btree tables are indexed by integers!"), }; let cell_idx = find_cell(page, int_key); - if cell_idx >= page.cells.len() { + 
if cell_idx >= page.cell_count() { Ok(CursorResult::Ok(false)) } else { - let equals = match &page.cells[cell_idx] { + let equals = match &page.cell_get(cell_idx)? { BTreeCell::TableLeafCell(l) => l._rowid == int_key, _ => unreachable!(), }; @@ -610,10 +601,10 @@ impl Cursor for BTreeCursor { } } -fn find_cell(page: &BTreePage, int_key: u64) -> usize { +fn find_cell(page: &PageContent, int_key: u64) -> usize { let mut cell_idx = 0; - for cell in &page.cells { - match cell { + while cell_idx < page.cell_count() { + match page.cell_get(cell_idx).unwrap() { BTreeCell::TableLeafCell(cell) => { if int_key <= cell._rowid { break; diff --git a/core/pager.rs b/core/pager.rs index a2ec517f5..5f409ff5d 100644 --- a/core/pager.rs +++ b/core/pager.rs @@ -1,5 +1,5 @@ use crate::buffer_pool::BufferPool; -use crate::sqlite3_ondisk::BTreePage; +use crate::sqlite3_ondisk::PageContent; use crate::sqlite3_ondisk::{self, DatabaseHeader}; use crate::{PageSource, Result}; use log::trace; @@ -14,7 +14,7 @@ use std::sync::{Arc, RwLock}; pub struct Page { flags: AtomicUsize, - pub contents: RwLock>, + pub contents: RwLock>, pub id: usize, } @@ -305,7 +305,7 @@ impl Pager { } let page = Rc::new(RefCell::new(Page::new(page_idx))); page.borrow().set_locked(); - sqlite3_ondisk::begin_read_btree_page( + sqlite3_ondisk::begin_read_page( &self.page_source, self.buffer_pool.clone(), page.clone(), diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index d4834f465..831065f5b 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -188,17 +188,6 @@ pub fn begin_write_database_header(header: &DatabaseHeader, pager: &Pager) -> Re Ok(()) } -#[derive(Debug, Clone)] -pub struct BTreePageHeader { - pub(crate) page_type: PageType, - pub(crate) _first_freeblock_offset: u16, - pub(crate) num_cells: u16, - // First byte of content area - pub(crate) _cell_content_area: u16, - pub(crate) _num_frag_free_bytes: u8, - pub(crate) right_most_pointer: Option, -} - #[repr(u8)] #[derive(Debug, 
PartialEq, Clone)] pub enum PageType { @@ -223,15 +212,84 @@ impl TryFrom for PageType { } #[derive(Debug, Clone)] -pub struct BTreePage { - pub header: BTreePageHeader, - pub cells: Vec, +pub struct PageContent { + pub offset: usize, pub buffer: Rc>, } -impl BTreePage { +impl PageContent { + pub fn page_type(&self) -> PageType { + let buf = self.buffer.borrow(); + let buf = buf.as_slice(); + buf[self.offset].try_into().unwrap() + } + + fn read_u16(&self, pos: usize) -> u16 { + unsafe { + let buf_pointer = &self.buffer.as_ptr(); + let buf = (*buf_pointer).as_ref().unwrap().as_slice(); + u16::from_be_bytes([buf[self.offset + pos], buf[self.offset + pos + 1]]) + } + } + + fn read_u32(&self, pos: usize) -> u32 { + unsafe { + let buf_pointer = &self.buffer.as_ptr(); + let buf = (*buf_pointer).as_ref().unwrap().as_slice(); + u32::from_be_bytes([ + buf[self.offset + pos], + buf[self.offset + pos + 1], + buf[self.offset + pos + 2], + buf[self.offset + pos + 3], + ]) + } + } + + pub fn first_freeblock(&self) -> u16 { + self.read_u16(1) + } + + pub fn cell_count(&self) -> usize { + self.read_u16(3) as usize + } + + pub fn cell_content_area(&self) -> u16 { + self.read_u16(5) as u16 + } + + pub fn num_frag_free_bytes(&self) -> u16 { + self.read_u16(7) as u16 + } + + pub fn rightmost_pointer(&self) -> Option { + match self.page_type() { + PageType::IndexInterior => Some(self.read_u32(8)), + PageType::TableInterior => Some(self.read_u32(8)), + PageType::IndexLeaf => None, + PageType::TableLeaf => None, + } + } + + pub fn cell_get(&self, idx: usize) -> Result { + let buf = self.buffer.borrow(); + let buf = buf.as_slice(); + + let ncells = self.cell_count(); + let cell_start = match self.page_type() { + PageType::IndexInterior => 12, + PageType::TableInterior => 12, + PageType::IndexLeaf => 8, + PageType::TableLeaf => 8, + }; + assert!(idx < ncells, "cell_get: idx out of bounds"); + let cell_pointer = cell_start + (idx * 2); + let cell_pointer = self.read_u16(cell_pointer) as 
usize; + + read_btree_cell(buf, &self.page_type(), cell_pointer) + } + pub fn is_leaf(&self) -> bool { - match self.header.page_type { + match self.page_type() { PageType::IndexInterior => false, PageType::TableInterior => false, PageType::IndexLeaf => true, @@ -240,7 +298,7 @@ impl BTreePage { } } -pub fn begin_read_btree_page( +pub fn begin_read_page( page_source: &PageSource, buffer_pool: Rc, page: Rc>, @@ -255,7 +313,7 @@ pub fn begin_read_btree_page( let buf = Rc::new(RefCell::new(Buffer::new(buf, drop_fn))); let complete = Box::new(move |buf: Rc>| { let page = page.clone(); - if finish_read_btree_page(page_idx, buf, page.clone()).is_err() { + if finish_read_page(page_idx, buf, page.clone()).is_err() { page.borrow_mut().set_error(); } }); @@ -264,47 +322,19 @@ pub fn begin_read_btree_page( Ok(()) } -fn finish_read_btree_page( +fn finish_read_page( page_idx: usize, buffer_ref: Rc>, page: Rc>, ) -> Result<()> { trace!("finish_read_btree_page(page_idx = {})", page_idx); - let mut pos = if page_idx == 1 { + let pos = if page_idx == 1 { DATABASE_HEADER_SIZE } else { 0 }; - let buf = buffer_ref.borrow(); - let buf = buf.as_slice(); - let mut header = BTreePageHeader { - page_type: buf[pos].try_into()?, - _first_freeblock_offset: u16::from_be_bytes([buf[pos + 1], buf[pos + 2]]), - num_cells: u16::from_be_bytes([buf[pos + 3], buf[pos + 4]]), - _cell_content_area: u16::from_be_bytes([buf[pos + 5], buf[pos + 6]]), - _num_frag_free_bytes: buf[pos + 7], - right_most_pointer: None, - }; - pos += 8; - if header.page_type == PageType::IndexInterior || header.page_type == PageType::TableInterior { - header.right_most_pointer = Some(u32::from_be_bytes([ - buf[pos], - buf[pos + 1], - buf[pos + 2], - buf[pos + 3], - ])); - pos += 4; - } - let mut cells = Vec::with_capacity(header.num_cells as usize); - for _ in 0..header.num_cells { - let cell_pointer = u16::from_be_bytes([buf[pos], buf[pos + 1]]); - pos += 2; - let cell = read_btree_cell(buf, &header.page_type, cell_pointer as 
usize)?; - cells.push(cell); - } - let inner = BTreePage { - header, - cells, + let inner = PageContent { + offset: pos, buffer: buffer_ref.clone(), }; { From dac2868c66902782d9df0e2bbe38110cb13f107d Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Sat, 27 Jul 2024 17:00:43 +0200 Subject: [PATCH 23/35] core: pager allocate page --- core/btree.rs | 121 ++++++++++++++++++++++++++++++++--------- core/pager.rs | 53 ++++++++++++++++-- core/sqlite3_ondisk.rs | 86 ++++++++++++++++++++--------- 3 files changed, 202 insertions(+), 58 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index 91dbc1ca9..bb3f6d4b7 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -1,4 +1,4 @@ -use crate::pager::Pager; +use crate::pager::{Page, Pager}; use crate::sqlite3_ondisk::{ read_varint, write_varint, BTreeCell, DatabaseHeader, PageContent, PageType, TableInteriorCell, TableLeafCell, @@ -6,6 +6,8 @@ use crate::sqlite3_ondisk::{ use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue}; use crate::Result; +use std::any::Any; +use std::borrow::BorrowMut; use std::cell::{Ref, RefCell}; use std::rc::Rc; @@ -70,7 +72,7 @@ impl BTreeCursor { }; let page_idx = mem_page.page_idx; let page = self.pager.read_page(page_idx)?; - let page = page.borrow(); + let page = RefCell::borrow(&page); if page.is_locked() { return Ok(CursorResult::IO); } @@ -142,7 +144,7 @@ impl BTreeCursor { }; let page_idx = mem_page.page_idx; let page = self.pager.read_page(page_idx)?; - let page = page.borrow(); + let page = RefCell::borrow(&page); if page.is_locked() { return Ok(CursorResult::IO); } @@ -207,14 +209,8 @@ impl BTreeCursor { key: &OwnedValue, _record: &OwnedRecord, ) -> Result> { - let mem_page = { - let mem_page = self.page.borrow(); - let mem_page = mem_page.as_ref().unwrap(); - mem_page.clone() - }; - let page_idx = mem_page.page_idx; - let page_ref = self.pager.read_page(page_idx)?; - let page = page_ref.borrow(); + let page_ref = self.get_page()?; + let page = RefCell::borrow(&page_ref); if 
page.is_locked() { return Ok(CursorResult::IO); } @@ -226,8 +222,6 @@ impl BTreeCursor { let page = page.as_mut().unwrap(); assert!(matches!(page.page_type(), PageType::TableLeaf)); - let free = self.compute_free_space(page, self.database_header.borrow()); - // find cell let int_key = match key { OwnedValue::Integer(i) => *i as u64, @@ -268,9 +262,18 @@ impl BTreeCursor { payload.splice(0..0, data_len_varint.iter().cloned()); } + let usable_space = { + let db_header = RefCell::borrow(&self.database_header); + (db_header.page_size - db_header.unused_space as u16) as usize + }; + let free = self.compute_free_space(page, RefCell::borrow(&self.database_header)); + assert!( + payload.len() <= usable_space - 100, /* 100 bytes minus for precaution to remember */ + "need to implemented overflow pages, too big to even add to a an empty page" + ); if payload.len() + 2 > free as usize { // overflow or balance - todo!("overflow/balance"); + self.balance_leaf(int_key, &payload); } else { // insert let pc = self.allocate_cell_space(page, payload.len() as u16); @@ -305,6 +308,76 @@ impl BTreeCursor { Ok(CursorResult::Ok(())) } + fn get_page(&mut self) -> crate::Result>> { + let mem_page = { + let mem_page = self.page.borrow(); + let mem_page = mem_page.as_ref().unwrap(); + mem_page.clone() + }; + let page_idx = mem_page.page_idx; + let page_ref = self.pager.read_page(page_idx)?; + Ok(page_ref) + } + + fn balance_leaf(&mut self, key: u64, payload: &Vec) { + // This is a naive algorithm that doesn't try to distribute cells evenly by content. + // It will try to split the page in half by keys not by content. + // Sqlite tries to have a page at least 40% full. 
+ loop { + let mem_page = { + let mem_page = self.page.borrow(); + let mem_page = mem_page.as_ref().unwrap(); + mem_page.clone() + }; + let page_ref = self.read_page_sync(mem_page.page_idx); + let page = RefCell::borrow_mut(&page_ref); + let mut page = page.contents.write().unwrap(); + let page = page.as_mut().unwrap(); + let free = self.compute_free_space(page, RefCell::borrow(&self.database_header)); + if payload.len() + 2 <= free as usize { + break; + } + + let right_page_ref = self.allocate_page(page.page_type()); + let right_page = RefCell::borrow_mut(&right_page_ref); + let mut right_page = right_page.contents.write().unwrap(); + let right_page = right_page.as_mut().unwrap(); + } + } + + fn read_page_sync(&mut self, page_idx: usize) -> Rc> { + loop { + let page_ref = self.pager.read_page(page_idx); + match page_ref { + Ok(p) => return p, + Err(_) => {} + } + } + } + + fn allocate_page(&mut self, page_type: PageType) -> Rc> { + let page = self.pager.allocate_page().unwrap(); + + { + // setup btree page + let contents = RefCell::borrow(&page); + let mut contents = contents.contents.write().unwrap(); + let contents = contents.as_mut().unwrap(); + let id = page_type as u8; + contents.write_u8(0, id); + contents.write_u16(1, 0); + contents.write_u16(3, 0); + contents.write_u16(5, 0); + contents.write_u8(7, 0); + contents.write_u32(8, 0); + } + + page + } + + /* + Allocate space for a cell on a page. 
+ */ fn allocate_cell_space(&mut self, page_ref: &PageContent, amount: u16) -> u16 { let amount = amount as usize; let mut buf_ref = RefCell::borrow_mut(&page_ref.buffer); @@ -317,19 +390,19 @@ impl BTreeCursor { // there are free blocks and enough space if page_ref.first_freeblock() != 0 && gap + 2 <= top { // find slot - let db_header = self.database_header.borrow(); + let db_header = RefCell::borrow(&self.database_header); let pc = find_free_cell(page_ref, db_header, amount, buf); return pc as u16; } if gap + 2 + amount as usize > top { // defragment - self.defragment_page(page_ref, self.database_header.borrow()); + self.defragment_page(page_ref, RefCell::borrow(&self.database_header)); top = u16::from_be_bytes([buf[5], buf[6]]) as usize; return 0; } - let db_header = self.database_header.borrow(); + let db_header = RefCell::borrow(&self.database_header); top -= amount; buf[5..7].copy_from_slice(&(top as u16).to_be_bytes()); let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; @@ -347,7 +420,7 @@ impl BTreeCursor { let last_cell = (usable_space - 4) as u64; let first_cell = cloned_page.cell_content_area() as u64; if cloned_page.cell_count() > 0 { - let buf = cloned_page.buffer.borrow(); + let buf = RefCell::borrow(&cloned_page.buffer); let buf = buf.as_slice(); let mut write_buf = RefCell::borrow_mut(&page.buffer); let write_buf = write_buf.as_mut_slice(); @@ -403,7 +476,7 @@ impl BTreeCursor { // Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte // and end of cell pointer area. 
fn compute_free_space(&self, page: &PageContent, db_header: Ref) -> u16 { - let buffer = page.buffer.borrow(); + let buffer = RefCell::borrow(&page.buffer); let buf = buffer.as_slice(); let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; @@ -568,14 +641,8 @@ impl Cursor for BTreeCursor { CursorResult::Ok(_) => {} CursorResult::IO => return Ok(CursorResult::IO), }; - let mem_page = { - let mem_page = self.page.borrow(); - let mem_page = mem_page.as_ref().unwrap(); - mem_page.clone() - }; - let page_idx = mem_page.page_idx; - let page_ref = self.pager.read_page(page_idx)?; - let page = page_ref.borrow(); + let page_ref = self.get_page()?; + let page = RefCell::borrow(&page_ref); if page.is_locked() { return Ok(CursorResult::IO); } diff --git a/core/pager.rs b/core/pager.rs index 5f409ff5d..e05dea4c2 100644 --- a/core/pager.rs +++ b/core/pager.rs @@ -1,9 +1,10 @@ use crate::buffer_pool::BufferPool; use crate::sqlite3_ondisk::PageContent; use crate::sqlite3_ondisk::{self, DatabaseHeader}; -use crate::{PageSource, Result}; +use crate::{Buffer, PageSource, Result}; use log::trace; use sieve_cache::SieveCache; +use std::borrow::Borrow; use std::cell::RefCell; use std::collections::HashMap; use std::hash::Hash; @@ -227,7 +228,7 @@ impl DumbLruPageCache { return; } let tail = unsafe { tail.unwrap().as_mut() }; - if tail.page.borrow().is_dirty() { + if RefCell::borrow(&tail.page).is_dirty() { // TODO: drop from another clean entry? return; } @@ -269,6 +270,7 @@ pub struct Pager { /// I/O interface for input/output operations. pub io: Arc, dirty_pages: Rc>>>>, + db_header: Rc>, } impl Pager { @@ -279,11 +281,11 @@ impl Pager { /// Completes opening a database by initializing the Pager with the database header. 
pub fn finish_open( - db_header: Rc>, + db_header_ref: Rc>, page_source: PageSource, io: Arc, ) -> Result { - let db_header = db_header.borrow(); + let db_header = RefCell::borrow(&db_header_ref); let page_size = db_header.page_size as usize; let buffer_pool = Rc::new(BufferPool::new(page_size)); let page_cache = RefCell::new(DumbLruPageCache::new(10)); @@ -293,6 +295,7 @@ impl Pager { page_cache, io, dirty_pages: Rc::new(RefCell::new(Vec::new())), + db_header: db_header_ref.clone(), }) } @@ -304,7 +307,7 @@ impl Pager { return Ok(page.clone()); } let page = Rc::new(RefCell::new(Page::new(page_idx))); - page.borrow().set_locked(); + RefCell::borrow(&page).set_locked(); sqlite3_ondisk::begin_read_page( &self.page_source, self.buffer_pool.clone(), @@ -346,4 +349,44 @@ impl Pager { self.io.run_once()?; Ok(()) } + + /* + Get's a new page that increasing the size of the page or uses a free page. + Currently free list pages are not yet supported. + */ + pub fn allocate_page(&self) -> Result>> { + let header = &self.db_header; + let mut header = RefCell::borrow_mut(&header); + header.database_size += 1; + { + // update database size + let first_page_ref = self.read_page(1).unwrap(); + let first_page = RefCell::borrow_mut(&first_page_ref); + first_page.set_dirty(); + self.add_dirty(first_page_ref.clone()); + + let contents = first_page.contents.write().unwrap(); + let contents = contents.as_ref().unwrap(); + contents.write_database_header(&header); + } + + let page_ref = Rc::new(RefCell::new(Page::new(0))); + { + // setup page and add to cache + self.add_dirty(page_ref.clone()); + let mut page = RefCell::borrow_mut(&page_ref); + page.set_dirty(); + page.id = header.database_size as usize; + let buffer = self.buffer_pool.get(); + let bp = self.buffer_pool.clone(); + let drop_fn = Rc::new(move |buf| { + bp.put(buf); + }); + let buffer = Rc::new(RefCell::new(Buffer::new(buffer, drop_fn))); + page.contents = RwLock::new(Some(PageContent { offset: 0, buffer })); + let mut cache 
= RefCell::borrow_mut(&self.page_cache); + cache.insert(page.id, page_ref.clone()); + } + Ok(page_ref) + } } diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 831065f5b..d55d03df2 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -52,7 +52,7 @@ pub struct DatabaseHeader { min_embed_frac: u8, min_leaf_frac: u8, change_counter: u32, - database_size: u32, + pub database_size: u32, freelist_trunk_page: u32, freelist_pages: u32, schema_cookie: u32, @@ -134,31 +134,7 @@ pub fn begin_write_database_header(header: &DatabaseHeader, pager: &Pager) -> Re { let mut buf_mut = std::cell::RefCell::borrow_mut(&buffer); let buf = buf_mut.as_mut_slice(); - buf[0..16].copy_from_slice(&header.magic); - buf[16..18].copy_from_slice(&header.page_size.to_be_bytes()); - buf[18] = header.write_version; - buf[19] = header.read_version; - buf[20] = header.unused_space; - buf[21] = header.max_embed_frac; - buf[22] = header.min_embed_frac; - buf[23] = header.min_leaf_frac; - buf[24..28].copy_from_slice(&header.change_counter.to_be_bytes()); - buf[28..32].copy_from_slice(&header.database_size.to_be_bytes()); - buf[32..36].copy_from_slice(&header.freelist_trunk_page.to_be_bytes()); - buf[36..40].copy_from_slice(&header.freelist_pages.to_be_bytes()); - buf[40..44].copy_from_slice(&header.schema_cookie.to_be_bytes()); - buf[44..48].copy_from_slice(&header.schema_format.to_be_bytes()); - buf[48..52].copy_from_slice(&header.default_cache_size.to_be_bytes()); - - buf[52..56].copy_from_slice(&header.vacuum.to_be_bytes()); - buf[56..60].copy_from_slice(&header.text_encoding.to_be_bytes()); - buf[60..64].copy_from_slice(&header.user_version.to_be_bytes()); - buf[64..68].copy_from_slice(&header.incremental_vacuum.to_be_bytes()); - - buf[68..72].copy_from_slice(&header.application_id.to_be_bytes()); - buf[72..92].copy_from_slice(&header.reserved); - buf[92..96].copy_from_slice(&header.version_valid_for.to_be_bytes()); - 
buf[96..100].copy_from_slice(&header.version_number.to_be_bytes()); + write_header_to_buf(buf, &header); let mut buffer_to_copy = std::cell::RefCell::borrow_mut(&buffer_to_copy_in_cb); let buffer_to_copy_slice = buffer_to_copy.as_mut_slice(); @@ -188,6 +164,34 @@ pub fn begin_write_database_header(header: &DatabaseHeader, pager: &Pager) -> Re Ok(()) } +fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) { + buf[0..16].copy_from_slice(&header.magic); + buf[16..18].copy_from_slice(&header.page_size.to_be_bytes()); + buf[18] = header.write_version; + buf[19] = header.read_version; + buf[20] = header.unused_space; + buf[21] = header.max_embed_frac; + buf[22] = header.min_embed_frac; + buf[23] = header.min_leaf_frac; + buf[24..28].copy_from_slice(&header.change_counter.to_be_bytes()); + buf[28..32].copy_from_slice(&header.database_size.to_be_bytes()); + buf[32..36].copy_from_slice(&header.freelist_trunk_page.to_be_bytes()); + buf[36..40].copy_from_slice(&header.freelist_pages.to_be_bytes()); + buf[40..44].copy_from_slice(&header.schema_cookie.to_be_bytes()); + buf[44..48].copy_from_slice(&header.schema_format.to_be_bytes()); + buf[48..52].copy_from_slice(&header.default_cache_size.to_be_bytes()); + + buf[52..56].copy_from_slice(&header.vacuum.to_be_bytes()); + buf[56..60].copy_from_slice(&header.text_encoding.to_be_bytes()); + buf[60..64].copy_from_slice(&header.user_version.to_be_bytes()); + buf[64..68].copy_from_slice(&header.incremental_vacuum.to_be_bytes()); + + buf[68..72].copy_from_slice(&header.application_id.to_be_bytes()); + buf[72..92].copy_from_slice(&header.reserved); + buf[92..96].copy_from_slice(&header.version_valid_for.to_be_bytes()); + buf[96..100].copy_from_slice(&header.version_number.to_be_bytes()); +} + #[repr(u8)] #[derive(Debug, PartialEq, Clone)] pub enum PageType { @@ -245,6 +249,30 @@ impl PageContent { } } + pub fn write_u8(&self, pos: usize, value: u8) { + unsafe { + let buf_pointer = &self.buffer.as_ptr(); + let buf = 
(*buf_pointer).as_mut().unwrap().as_mut_slice(); + buf[self.offset + pos] = value; + } + } + + pub fn write_u16(&self, pos: usize, value: u16) { + unsafe { + let buf_pointer = &self.buffer.as_ptr(); + let buf = (*buf_pointer).as_mut().unwrap().as_mut_slice(); + buf[self.offset + pos..self.offset + pos + 2].copy_from_slice(&value.to_be_bytes()); + } + } + + pub fn write_u32(&self, pos: usize, value: u32) { + unsafe { + let buf_pointer = &self.buffer.as_ptr(); + let buf = (*buf_pointer).as_mut().unwrap().as_mut_slice(); + buf[self.offset + pos..self.offset + pos + 4].copy_from_slice(&value.to_be_bytes()); + } + } + pub fn first_freeblock(&self) -> u16 { self.read_u16(1) } @@ -296,6 +324,12 @@ impl PageContent { PageType::TableLeaf => true, } } + + pub fn write_database_header(&self, header: &DatabaseHeader) { + let mut buf = self.buffer.borrow_mut(); + let buf = buf.as_mut_slice(); + write_header_to_buf(buf, header); + } } pub fn begin_read_page( From 61cfad203e58fc275661151a5d12e6a983414bb2 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 30 Jul 2024 10:59:56 +0200 Subject: [PATCH 24/35] core: balance --- core/btree.rs | 269 ++++++++++++++++++++++++++++++++++------- core/sqlite3_ondisk.rs | 48 +++++++- 2 files changed, 270 insertions(+), 47 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index bb3f6d4b7..71364a61f 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -6,11 +6,20 @@ use crate::sqlite3_ondisk::{ use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue}; use crate::Result; -use std::any::Any; -use std::borrow::BorrowMut; use std::cell::{Ref, RefCell}; +use std::mem; use std::rc::Rc; +/* + These are offsets of fields in the header of a b-tree page. 
+*/ +const BTREE_HEADER_OFFSET_TYPE: usize = 0; /* type of btree page -> u8 */ +const BTREE_HEADER_OFFSET_FREEBLOCK: usize = 1; /* pointer to first freeblock -> u16 */ +const BTREE_HEADER_OFFSET_CELL_COUNT: usize = 3; /* number of cells in the page -> u16 */ +const BTREE_HEADER_OFFSET_CELL_CONTENT: usize = 5; /* pointer to first byte of cell allocated content from top -> u16 */ +const BTREE_HEADER_OFFSET_FRAGMENTED: usize = 7; /* number of fragmented bytes -> u8 */ +const BTREE_HEADER_OFFSET_RIGHTMOST: usize = 8; /* if internalnode, pointer right most pointer (saved separately from cells) -> u32 */ + pub struct MemPage { parent: Option>, page_idx: usize, @@ -273,41 +282,48 @@ impl BTreeCursor { ); if payload.len() + 2 > free as usize { // overflow or balance - self.balance_leaf(int_key, &payload); + self.balance_leaf(int_key, payload); } else { // insert - let pc = self.allocate_cell_space(page, payload.len() as u16); - let mut buf_ref = RefCell::borrow_mut(&page.buffer); - let buf: &mut [u8] = buf_ref.as_mut_slice(); - - // copy data - buf[pc as usize..pc as usize + payload.len()].copy_from_slice(&payload); - // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); - let pointer_area_pc_by_idx = 8 + 2 * cell_idx; - - // move previous pointers forward and insert new pointer there - let n_cells_forward = 2 * (page.cell_count() - cell_idx); - buf.copy_within( - pointer_area_pc_by_idx..pointer_area_pc_by_idx + n_cells_forward, - pointer_area_pc_by_idx + 2, - ); - buf[pointer_area_pc_by_idx..pointer_area_pc_by_idx + 2] - .copy_from_slice(&pc.to_be_bytes()); - - // update first byte of content area - buf[5..7].copy_from_slice(&pc.to_be_bytes()); - - // update cell count - let new_n_cells = (page.cell_count() + 1) as u16; - buf[3..5].copy_from_slice(&new_n_cells.to_be_bytes()); - - let mut payload_for_cell_in_memory: Vec = Vec::new(); - _record.serialize(&mut payload_for_cell_in_memory); + self.insert_into_cell(page, &payload, cell_idx); } Ok(CursorResult::Ok(())) } + /* insert 
to postion and shift other pointers */ + fn insert_into_cell(&mut self, page: &mut PageContent, payload: &Vec, cell_idx: usize) { + dbg!(page.is_leaf(), cell_idx); + assert!( + page.is_leaf() || (!page.is_leaf() && cell_idx < page.cell_count()), + "if it's greater it might mean we need to insert in a rightmost pointer?" + ); + // TODO: insert into cell payload in internal page + let pc = self.allocate_cell_space(page, payload.len() as u16); + let mut buf_ref = RefCell::borrow_mut(&page.buffer); + let buf: &mut [u8] = buf_ref.as_mut_slice(); + + // copy data + buf[pc as usize..pc as usize + payload.len()].copy_from_slice(&payload); + // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); + let pointer_area_pc_by_idx = page.offset + 8 + 2 * cell_idx; + + // move previous pointers forward and insert new pointer there + let n_cells_forward = 2 * (page.cell_count() - cell_idx); + buf.copy_within( + pointer_area_pc_by_idx..pointer_area_pc_by_idx + n_cells_forward, + pointer_area_pc_by_idx + 2, + ); + buf[pointer_area_pc_by_idx..pointer_area_pc_by_idx + 2].copy_from_slice(&pc.to_be_bytes()); + + // update first byte of content area + buf[5..7].copy_from_slice(&pc.to_be_bytes()); + + // update cell count + let new_n_cells = (page.cell_count() + 1) as u16; + buf[3..5].copy_from_slice(&new_n_cells.to_be_bytes()); + } + fn get_page(&mut self) -> crate::Result>> { let mem_page = { let mem_page = self.page.borrow(); @@ -319,10 +335,12 @@ impl BTreeCursor { Ok(page_ref) } - fn balance_leaf(&mut self, key: u64, payload: &Vec) { + fn balance_leaf(&mut self, key: u64, payload: Vec) { // This is a naive algorithm that doesn't try to distribute cells evenly by content. // It will try to split the page in half by keys not by content. // Sqlite tries to have a page at least 40% full. 
+ let mut key = key; + let mut payload = payload; loop { let mem_page = { let mem_page = self.page.borrow(); @@ -330,18 +348,173 @@ impl BTreeCursor { mem_page.clone() }; let page_ref = self.read_page_sync(mem_page.page_idx); - let page = RefCell::borrow_mut(&page_ref); - let mut page = page.contents.write().unwrap(); - let page = page.as_mut().unwrap(); - let free = self.compute_free_space(page, RefCell::borrow(&self.database_header)); - if payload.len() + 2 <= free as usize { + let mut page_rc = RefCell::borrow_mut(&page_ref); + + let right_page_id = { + // split procedure + let mut page = page_rc.contents.write().unwrap(); + let page = page.as_mut().unwrap(); + let free = self.compute_free_space(page, RefCell::borrow(&self.database_header)); + assert!( + matches!( + page.page_type(), + PageType::TableLeaf | PageType::TableInterior + ), + "indexes still not supported " + ); + if payload.len() + 2 <= free as usize { + let cell_idx = find_cell(page, key); + self.insert_into_cell(page, &payload, cell_idx); + break; + } + + let right_page_ref = self.allocate_page(page.page_type()); + let right_page = RefCell::borrow_mut(&right_page_ref); + let right_page_id = right_page.id; + let mut right_page = right_page.contents.write().unwrap(); + let right_page = right_page.as_mut().unwrap(); + { + // move data from one buffer to another + // done in a separate block to satisfy borrow checker + let mut left_buf = RefCell::borrow_mut(&page.buffer); + let left_buf: &mut [u8] = left_buf.as_mut_slice(); + let mut right_buf = RefCell::borrow_mut(&right_page.buffer); + let right_buf: &mut [u8] = right_buf.as_mut_slice(); + + let mut rbrk = right_page.cell_content_area() as usize; + + // move half of cells to right page + for cell_idx in 0..page.cell_count() { + let (start, len) = page.cell_get_raw_region(cell_idx); + rbrk -= len; + right_buf[rbrk..rbrk + len].copy_from_slice(&left_buf[start..start + len]); + } + // move half of keys to right page + let (src_pointers_start, 
src_pointers_len) = page.cell_get_raw_pointer_region(); + assert!(page.cell_count() >= 2); + let keys_to_move_start = page.cell_count() / 2; + let (dst_pointers_start, _) = right_page.cell_get_raw_pointer_region(); + /* + Copy half + count = 8 + k-v = 2 bytes + keys_to_move_start + V + ------------------------------------------------- + | 0k-v | 1k-v | 2k-v | 3k-v | 4k-v | 5k-v | 7k-v | + ------------------------------------------------- + + */ + let pointer_data_to_move = (page.cell_count() - keys_to_move_start - 1) * 2; + right_buf[dst_pointers_start + pointer_data_to_move + ..dst_pointers_start + pointer_data_to_move] + .copy_from_slice( + &left_buf[src_pointers_start + pointer_data_to_move + ..src_pointers_start + (pointer_data_to_move * 2)], + ); + // update cell count in both pages + let keys_moved = page.cell_count() - keys_to_move_start + 1; + page.write_u16( + BTREE_HEADER_OFFSET_CELL_COUNT, + (page.cell_count() - keys_moved) as u16, + ); + right_page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, keys_moved as u16); + // update cell content are start + right_page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, rbrk as u16); + } + let last_cell = page.cell_get(page.cell_count() - 1).unwrap(); + let last_cell_key = match &last_cell { + BTreeCell::TableLeafCell(cell) => cell._rowid, + BTreeCell::TableInteriorCell(cell) => cell._rowid, + _ => unreachable!(), /* not yet supported index tables */ + }; + // if not leaf page update rightmost pointer + if let PageType::TableInterior = page.page_type() { + right_page.write_u32( + BTREE_HEADER_OFFSET_RIGHTMOST, + page.rightmost_pointer().unwrap(), + ); + // convert last cell to rightmost pointer + let BTreeCell::TableInteriorCell(last_cell) = &last_cell else { + unreachable!(); + }; + page.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, last_cell._left_child_page); + // page count now has one less cell because we've added the last one to rightmost pointer + page.write_u16( + BTREE_HEADER_OFFSET_CELL_COUNT, + (page.cell_count() - 1) 
as u16, + ); + } + + // update free list block by defragmenting page + self.defragment_page(page, RefCell::borrow(&self.database_header)); + // insert into one of the pages + if key < last_cell_key { + let cell_idx = find_cell(page, key); + self.insert_into_cell(page, &payload, cell_idx); + } else { + let cell_idx = find_cell(right_page, key); + self.insert_into_cell(right_page, &payload, cell_idx); + } + // propagate parent split + key = last_cell_key; + right_page_id + }; + + payload = Vec::new(); + if mem_page.page_idx == self.root_page { + /* if we are in root page then we just need to create a new root and push key there */ + let new_root_page_ref = self.allocate_page(PageType::TableInterior); + let mut new_root_page = RefCell::borrow_mut(&new_root_page_ref); + let new_root_page_id = new_root_page.id; + new_root_page.set_dirty(); + self.pager.add_dirty(new_root_page_ref.clone()); + { + let mut new_root_page_contents = new_root_page.contents.write().unwrap(); + let new_root_page_contents = new_root_page_contents.as_mut().unwrap(); + /* + Note that we set cell pointer to point to itself, because we will later swap this page's + content with splitted page in order to not update root page idx. 
+ */ + payload.extend_from_slice(&(new_root_page_id as u32).to_be_bytes()); + payload.extend(std::iter::repeat(0).take(9)); + let n = write_varint(&mut payload.as_mut_slice()[0..9], key as u64); + payload.truncate(n); + + // write left child cell + self.insert_into_cell(new_root_page_contents, &payload, 0); + + // write right child cell + new_root_page_contents + .write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, right_page_id as u32); + } + + /* swap splitted page buffer with new root buffer so we don't have to update page idx */ + { + let mut new_root_page_contents = new_root_page.contents.write().unwrap(); + let new_root_page_contents = new_root_page_contents.as_mut().unwrap(); + let root_buf = new_root_page_contents.buffer.as_ptr(); + let root_buf = unsafe { (*root_buf).as_mut_slice() }; + let mut page = page_rc.contents.write().unwrap(); + let page = page.as_mut().unwrap(); + let mut left_buf = RefCell::borrow_mut(&page.buffer); + let left_buf: &mut [u8] = left_buf.as_mut_slice(); + + left_buf.swap_with_slice(root_buf); + } + // swap in memory state of pages + mem::swap(&mut page_rc.id, &mut new_root_page.id); + self.page = RefCell::new(Some(Rc::new(MemPage::new(None, new_root_page.id, 0)))); + break; } - let right_page_ref = self.allocate_page(page.page_type()); - let right_page = RefCell::borrow_mut(&right_page_ref); - let mut right_page = right_page.contents.write().unwrap(); - let right_page = right_page.as_mut().unwrap(); + payload.extend_from_slice(&(mem_page.page_idx as u32).to_be_bytes()); + payload.extend(std::iter::repeat(0).take(9)); + let n = write_varint(&mut payload.as_mut_slice()[0..9], key as u64); + payload.truncate(n); + + self.page = RefCell::new(Some(mem_page.parent.as_ref().unwrap().clone())); } } @@ -364,12 +537,16 @@ impl BTreeCursor { let mut contents = contents.contents.write().unwrap(); let contents = contents.as_mut().unwrap(); let id = page_type as u8; - contents.write_u8(0, id); - contents.write_u16(1, 0); - contents.write_u16(3, 0); - 
contents.write_u16(5, 0); - contents.write_u8(7, 0); - contents.write_u32(8, 0); + contents.write_u8(BTREE_HEADER_OFFSET_TYPE, id); + contents.write_u16(BTREE_HEADER_OFFSET_FREEBLOCK, 0); + contents.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, 0); + + let db_header = RefCell::borrow(&self.database_header); + let cell_content_area_start = db_header.page_size - db_header.unused_space as u16; + contents.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, cell_content_area_start); + + contents.write_u8(BTREE_HEADER_OFFSET_FRAGMENTED, 0); + contents.write_u32(BTREE_HEADER_OFFSET_RIGHTMOST, 0); } page diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index d55d03df2..696f386a0 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -316,6 +316,53 @@ impl PageContent { read_btree_cell(buf, &self.page_type(), cell_pointer) } + pub fn cell_get_raw_pointer_region(&self) -> (usize, usize) { + let cell_start = match self.page_type() { + PageType::IndexInterior => 12, + PageType::TableInterior => 12, + PageType::IndexLeaf => 8, + PageType::TableLeaf => 8, + }; + (cell_start, self.cell_count() * 2) + } + + pub fn cell_get_raw_region(&self, idx: usize) -> (usize, usize) { + let buf = self.buffer.borrow(); + let buf = buf.as_slice(); + + let ncells = self.cell_count(); + let cell_start = match self.page_type() { + PageType::IndexInterior => 12, + PageType::TableInterior => 12, + PageType::IndexLeaf => 8, + PageType::TableLeaf => 8, + }; + assert!(idx < ncells, "cell_get: idx out of bounds"); + let cell_pointer = cell_start + (idx * 2); + let cell_pointer = self.read_u16(cell_pointer) as usize; + let start = cell_pointer; + let len = match self.page_type() { + PageType::IndexInterior => { + let (len_payload, n_payload) = read_varint(&buf[cell_pointer + 4..]).unwrap(); + 4 + len_payload as usize + n_payload + 4 + } + PageType::TableInterior => { + let (_, n_rowid) = read_varint(&buf[cell_pointer + 4..]).unwrap(); + 4 + n_rowid + } + PageType::IndexLeaf => { + let 
(len_payload, n_payload) = read_varint(&buf[cell_pointer..]).unwrap(); + len_payload as usize + n_payload + 4 + } + PageType::TableLeaf => { + let (len_payload, n_payload) = read_varint(&buf[cell_pointer..]).unwrap(); + let (_, n_rowid) = read_varint(&buf[cell_pointer + n_payload..]).unwrap(); + len_payload as usize + n_payload + n_rowid + 4 + } + }; + (start, len) + } + pub fn is_leaf(&self) -> bool { match self.page_type() { PageType::IndexInterior => false, @@ -558,7 +605,6 @@ pub fn read_record(payload: &[u8]) -> Result { let (serial_type, nr) = read_varint(&payload[pos..])?; let serial_type = SerialType::try_from(serial_type)?; serial_types.push(serial_type); - assert!(pos + nr < payload.len()); pos += nr; assert!(header_size >= nr); header_size -= nr; From 68e7a062a4b11eb8eb92b22d466468b354f5cf48 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Tue, 30 Jul 2024 16:25:54 +0200 Subject: [PATCH 25/35] core: fix defragmentation Signed-off-by: Pere Diaz Bou --- core/btree.rs | 99 +++++++++++++++++++++++++++++------------- core/sqlite3_ondisk.rs | 25 +++++++++-- 2 files changed, 89 insertions(+), 35 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index 71364a61f..334699a02 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -293,7 +293,6 @@ impl BTreeCursor { /* insert to postion and shift other pointers */ fn insert_into_cell(&mut self, page: &mut PageContent, payload: &Vec, cell_idx: usize) { - dbg!(page.is_leaf(), cell_idx); assert!( page.is_leaf() || (!page.is_leaf() && cell_idx < page.cell_count()), "if it's greater it might mean we need to insert in a rightmost pointer?" 
@@ -557,10 +556,8 @@ impl BTreeCursor { */ fn allocate_cell_space(&mut self, page_ref: &PageContent, amount: u16) -> u16 { let amount = amount as usize; - let mut buf_ref = RefCell::borrow_mut(&page_ref.buffer); - let buf = buf_ref.as_mut_slice(); - let cell_offset = 8; + let (cell_offset, _) = page_ref.cell_get_raw_pointer_region(); let gap = cell_offset + 2 * page_ref.cell_count(); let mut top = page_ref.cell_content_area() as usize; @@ -568,20 +565,30 @@ impl BTreeCursor { if page_ref.first_freeblock() != 0 && gap + 2 <= top { // find slot let db_header = RefCell::borrow(&self.database_header); - let pc = find_free_cell(page_ref, db_header, amount, buf); - return pc as u16; + let pc = find_free_cell(page_ref, db_header, amount); + if pc != 0 { + return pc as u16; + } + /* fall through, we might need to defragment */ } if gap + 2 + amount as usize > top { // defragment self.defragment_page(page_ref, RefCell::borrow(&self.database_header)); + let mut buf_ref = RefCell::borrow_mut(&page_ref.buffer); + let buf = buf_ref.as_mut_slice(); top = u16::from_be_bytes([buf[5], buf[6]]) as usize; - return 0; } let db_header = RefCell::borrow(&self.database_header); top -= amount; - buf[5..7].copy_from_slice(&(top as u16).to_be_bytes()); + + { + let mut buf_ref = RefCell::borrow_mut(&page_ref.buffer); + let buf = buf_ref.as_mut_slice(); + buf[5..7].copy_from_slice(&(top as u16).to_be_bytes()); + } + let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; assert!(top + amount <= usable_space); return top as u16; @@ -595,15 +602,20 @@ impl BTreeCursor { // TODO: implement fast algorithm let last_cell = (usable_space - 4) as u64; - let first_cell = cloned_page.cell_content_area() as u64; + let first_cell = { + let (start, end) = cloned_page.cell_get_raw_pointer_region(); + start + end + }; + if cloned_page.cell_count() > 0 { + let page_type = page.page_type(); let buf = RefCell::borrow(&cloned_page.buffer); let buf = buf.as_slice(); let mut write_buf = 
RefCell::borrow_mut(&page.buffer); let write_buf = write_buf.as_mut_slice(); for i in 0..cloned_page.cell_count() { - let cell_offset = 8; + let cell_offset = page.offset + 8; let cell_idx = cell_offset + i * 2; let pc = u16::from_be_bytes([buf[cell_idx], buf[cell_idx + 1]]) as u64; @@ -613,19 +625,42 @@ impl BTreeCursor { assert!(pc <= last_cell); - let size = match read_varint(&buf[pc as usize..pc as usize + 9]) { - Ok(v) => v.0, - Err(_) => todo!( - "error while parsing varint from cell, probably treat this as corruption?" - ), + let size = match page_type { + PageType::TableInterior => { + let (_, nr_key) = match read_varint(&buf[pc as usize ..]) { + Ok(v) => v, + Err(_) => todo!( + "error while parsing varint from cell, probably treat this as corruption?" + ), + }; + 4 + nr_key as u64 + } + PageType::TableLeaf => { + let (payload_size, nr_payload) = match read_varint(&buf[pc as usize..]) { + Ok(v) => v, + Err(_) => todo!( + "error while parsing varint from cell, probably treat this as corruption?" + ), + }; + let (_, nr_key) = match read_varint(&buf[pc as usize + nr_payload as usize..]) { + Ok(v) => v, + Err(_) => todo!( + "error while parsing varint from cell, probably treat this as corruption?" 
+ ), + }; + // TODO: add overflow page calculation + payload_size + nr_payload as u64 + nr_key as u64 + } + PageType::IndexInterior => todo!(), + PageType::IndexLeaf => todo!(), }; cbrk -= size; if cbrk < first_cell as u64 || pc as u64 + size > usable_space as u64 { todo!("corrupt"); } - assert!(cbrk + size <= usable_space && cbrk >= first_cell); + assert!(cbrk + size <= usable_space && cbrk >= first_cell as u64); // set new pointer - write_buf[cell_idx..cell_idx + 2].copy_from_slice(&cbrk.to_be_bytes()); + write_buf[cell_idx..cell_idx + 2].copy_from_slice(&(cbrk as u16).to_be_bytes()); // copy payload write_buf[cbrk as usize..cbrk as usize + size as usize] .copy_from_slice(&buf[pc as usize..pc as usize + size as usize]); @@ -636,18 +671,19 @@ impl BTreeCursor { // if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){ // return SQLITE_CORRUPT_PAGE(pPage); // } - assert!(cbrk >= first_cell); + assert!(cbrk >= first_cell as u64); let mut write_buf = RefCell::borrow_mut(&page.buffer); let write_buf = write_buf.as_mut_slice(); // set new first byte of cell content - write_buf[5..7].copy_from_slice(&cbrk.to_be_bytes()); + write_buf[5..7].copy_from_slice(&(cbrk as u16).to_be_bytes()); // set free block to 0, unused spaced can be retrieved from gap between cell pointer end and content start write_buf[1] = 0; write_buf[2] = 0; // set unused space to 0 - write_buf[first_cell as usize..first_cell as usize + cbrk as usize - first_cell as usize] - .fill(0); + let first_cell = cloned_page.cell_content_area() as u64; + assert!(first_cell <= cbrk); + write_buf[first_cell as usize..cbrk as usize].fill(0); } // Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte @@ -667,7 +703,7 @@ impl BTreeCursor { let ncell = page.cell_count(); // 8 + 4 == header end - let first_cell = 8 + 4 + (2 * ncell) as u16; + let first_cell = (page.offset + 8 + 4 + (2 * ncell)) as u16; let mut nfree = fragmented_free_bytes as usize + 
first_byte_in_cell_content as usize; @@ -709,15 +745,14 @@ impl BTreeCursor { } } -fn find_free_cell( - page_ref: &PageContent, - db_header: Ref, - amount: usize, - buf: &[u8], -) -> usize { +fn find_free_cell(page_ref: &PageContent, db_header: Ref, amount: usize) -> usize { // NOTE: freelist is in ascending order of keys and pc // unuse_space is reserved bytes at the end of page, therefore we must substract from maxpc let mut pc = page_ref.first_freeblock() as usize; + + let buf_ref = RefCell::borrow(&page_ref.buffer); + let buf = buf_ref.as_slice(); + let usable_space = (db_header.page_size - db_header.unused_space as u16) as usize; let maxpc = (usable_space - amount as usize) as usize; let mut found = false; @@ -731,9 +766,10 @@ fn find_free_cell( pc = next as usize; } if !found { - unimplemented!("recover for fragmented space"); + 0 + } else { + pc } - pc } impl Cursor for BTreeCursor { @@ -847,7 +883,8 @@ impl Cursor for BTreeCursor { fn find_cell(page: &PageContent, int_key: u64) -> usize { let mut cell_idx = 0; - while cell_idx < page.cell_count() { + let cell_count = page.cell_count(); + while cell_idx < cell_count { match page.cell_get(cell_idx).unwrap() { BTreeCell::TableLeafCell(cell) => { if int_key <= cell._rowid { diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index 696f386a0..b7739d8f8 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -215,12 +215,21 @@ impl TryFrom for PageType { } } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct PageContent { pub offset: usize, pub buffer: Rc>, } +impl Clone for PageContent { + fn clone(&self) -> Self { + Self { + offset: self.offset, + buffer: Rc::new(RefCell::new((*self.buffer.borrow()).clone())), + } + } +} + impl PageContent { pub fn page_type(&self) -> PageType { let buf = self.buffer.borrow(); @@ -228,6 +237,14 @@ impl PageContent { buf[self.offset].try_into().unwrap() } + fn read_u8(&self, pos: usize) -> u8 { + unsafe { + let buf_pointer = &self.buffer.as_ptr(); + let 
buf = (*buf_pointer).as_ref().unwrap().as_slice(); + buf[pos] + } + } + fn read_u16(&self, pos: usize) -> u16 { unsafe { let buf_pointer = &self.buffer.as_ptr(); @@ -285,8 +302,8 @@ impl PageContent { self.read_u16(5) as u16 } - pub fn num_frag_free_bytes(&self) -> u16 { - self.read_u16(7) as u16 + pub fn num_frag_free_bytes(&self) -> u8 { + self.read_u8(7) as u8 } pub fn rightmost_pointer(&self) -> Option { @@ -323,7 +340,7 @@ impl PageContent { PageType::IndexLeaf => 8, PageType::TableLeaf => 8, }; - (cell_start, self.cell_count() * 2) + (self.offset + cell_start, self.cell_count() * 2) } pub fn cell_get_raw_region(&self, idx: usize) -> (usize, usize) { From 836aa6ee07ff7485082434e1a1123e5ff3b2cf31 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 31 Jul 2024 13:03:16 +0200 Subject: [PATCH 26/35] core: fix split cells to right page --- core/btree.rs | 139 +++++++++++++++++++---------------------- core/pager.rs | 2 +- core/sqlite3_ondisk.rs | 16 +++-- 3 files changed, 75 insertions(+), 82 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index 334699a02..ad11a7b43 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -219,24 +219,27 @@ impl BTreeCursor { _record: &OwnedRecord, ) -> Result> { let page_ref = self.get_page()?; - let page = RefCell::borrow(&page_ref); - if page.is_locked() { - return Ok(CursorResult::IO); - } - - page.set_dirty(); - self.pager.add_dirty(page_ref.clone()); - - let mut page = page.contents.write().unwrap(); - let page = page.as_mut().unwrap(); - assert!(matches!(page.page_type(), PageType::TableLeaf)); - - // find cell let int_key = match key { OwnedValue::Integer(i) => *i as u64, _ => unreachable!("btree tables are indexed by integers!"), }; - let cell_idx = find_cell(page, int_key); + + let cell_idx = { + let page = RefCell::borrow(&page_ref); + if page.is_locked() { + return Ok(CursorResult::IO); + } + + page.set_dirty(); + self.pager.add_dirty(page_ref.clone()); + + let mut page = page.contents.write().unwrap(); + let page = 
page.as_mut().unwrap(); + assert!(matches!(page.page_type(), PageType::TableLeaf)); + + // find cell + find_cell(page, int_key) + }; // TODO: if overwrite drop cell @@ -275,7 +278,12 @@ impl BTreeCursor { let db_header = RefCell::borrow(&self.database_header); (db_header.page_size - db_header.unused_space as u16) as usize }; - let free = self.compute_free_space(page, RefCell::borrow(&self.database_header)); + let free = { + let page = RefCell::borrow(&page_ref); + let mut page = page.contents.write().unwrap(); + let page = page.as_mut().unwrap(); + self.compute_free_space(page, RefCell::borrow(&self.database_header)) + }; assert!( payload.len() <= usable_space - 100, /* 100 bytes minus for precaution to remember */ "need to implemented overflow pages, too big to even add to a an empty page" @@ -285,6 +293,10 @@ impl BTreeCursor { self.balance_leaf(int_key, payload); } else { // insert + let page = RefCell::borrow(&page_ref); + + let mut page = page.contents.write().unwrap(); + let page = page.as_mut().unwrap(); self.insert_into_cell(page, &payload, cell_idx); } @@ -293,10 +305,6 @@ impl BTreeCursor { /* insert to postion and shift other pointers */ fn insert_into_cell(&mut self, page: &mut PageContent, payload: &Vec, cell_idx: usize) { - assert!( - page.is_leaf() || (!page.is_leaf() && cell_idx < page.cell_count()), - "if it's greater it might mean we need to insert in a rightmost pointer?" 
- ); // TODO: insert into cell payload in internal page let pc = self.allocate_cell_space(page, payload.len() as u16); let mut buf_ref = RefCell::borrow_mut(&page.buffer); @@ -305,22 +313,25 @@ impl BTreeCursor { // copy data buf[pc as usize..pc as usize + payload.len()].copy_from_slice(&payload); // memmove(pIns+2, pIns, 2*(pPage->nCell - i)); - let pointer_area_pc_by_idx = page.offset + 8 + 2 * cell_idx; + let (pointer_area_pc_by_idx, _) = page.cell_get_raw_pointer_region(); + let pointer_area_pc_by_idx = pointer_area_pc_by_idx + (2 * cell_idx); // move previous pointers forward and insert new pointer there let n_cells_forward = 2 * (page.cell_count() - cell_idx); - buf.copy_within( - pointer_area_pc_by_idx..pointer_area_pc_by_idx + n_cells_forward, - pointer_area_pc_by_idx + 2, - ); - buf[pointer_area_pc_by_idx..pointer_area_pc_by_idx + 2].copy_from_slice(&pc.to_be_bytes()); + if n_cells_forward > 0 { + buf.copy_within( + pointer_area_pc_by_idx..pointer_area_pc_by_idx + n_cells_forward, + pointer_area_pc_by_idx + 2, + ); + } + page.write_u16(pointer_area_pc_by_idx, pc); // update first byte of content area - buf[5..7].copy_from_slice(&pc.to_be_bytes()); + page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, pc); // update cell count let new_n_cells = (page.cell_count() + 1) as u16; - buf[3..5].copy_from_slice(&new_n_cells.to_be_bytes()); + page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, new_n_cells); } fn get_page(&mut self) -> crate::Result>> { @@ -382,45 +393,26 @@ impl BTreeCursor { let mut rbrk = right_page.cell_content_area() as usize; + let cells_to_move = page.cell_count() / 2; + let (mut cell_pointer_idx, _) = page.cell_get_raw_pointer_region(); // move half of cells to right page - for cell_idx in 0..page.cell_count() { - let (start, len) = page.cell_get_raw_region(cell_idx); + for cell_idx in cells_to_move..page.cell_count() { + let (start, len) = page.cell_get_raw_region_borrowed(cell_idx, left_buf); + // copy data rbrk -= len; right_buf[rbrk..rbrk + 
len].copy_from_slice(&left_buf[start..start + len]); + // set pointer + right_page.write_u16(cell_pointer_idx, rbrk as u16); + cell_pointer_idx += 2; } - // move half of keys to right page - let (src_pointers_start, src_pointers_len) = page.cell_get_raw_pointer_region(); - assert!(page.cell_count() >= 2); - let keys_to_move_start = page.cell_count() / 2; - let (dst_pointers_start, _) = right_page.cell_get_raw_pointer_region(); - /* - Copy half - count = 8 - k-v = 2 bytes - keys_to_move_start - V - ------------------------------------------------- - | 0k-v | 1k-v | 2k-v | 3k-v | 4k-v | 5k-v | 7k-v | - ------------------------------------------------- - - */ - let pointer_data_to_move = (page.cell_count() - keys_to_move_start - 1) * 2; - right_buf[dst_pointers_start + pointer_data_to_move - ..dst_pointers_start + pointer_data_to_move] - .copy_from_slice( - &left_buf[src_pointers_start + pointer_data_to_move - ..src_pointers_start + (pointer_data_to_move * 2)], - ); // update cell count in both pages - let keys_moved = page.cell_count() - keys_to_move_start + 1; - page.write_u16( - BTREE_HEADER_OFFSET_CELL_COUNT, - (page.cell_count() - keys_moved) as u16, - ); + let keys_moved = page.cell_count() - cells_to_move; + page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, cells_to_move as u16); right_page.write_u16(BTREE_HEADER_OFFSET_CELL_COUNT, keys_moved as u16); // update cell content are start right_page.write_u16(BTREE_HEADER_OFFSET_CELL_CONTENT, rbrk as u16); } + let last_cell = page.cell_get(page.cell_count() - 1).unwrap(); let last_cell_key = match &last_cell { BTreeCell::TableLeafCell(cell) => cell._rowid, @@ -462,23 +454,23 @@ impl BTreeCursor { payload = Vec::new(); if mem_page.page_idx == self.root_page { + /* todo: balance deeper, create child and copy contents of root there. 
Then split root */ /* if we are in root page then we just need to create a new root and push key there */ let new_root_page_ref = self.allocate_page(PageType::TableInterior); - let mut new_root_page = RefCell::borrow_mut(&new_root_page_ref); - let new_root_page_id = new_root_page.id; - new_root_page.set_dirty(); - self.pager.add_dirty(new_root_page_ref.clone()); { + let new_root_page = RefCell::borrow_mut(&new_root_page_ref); + let new_root_page_id = new_root_page.id; let mut new_root_page_contents = new_root_page.contents.write().unwrap(); let new_root_page_contents = new_root_page_contents.as_mut().unwrap(); /* Note that we set cell pointer to point to itself, because we will later swap this page's content with splitted page in order to not update root page idx. */ + let new_root_page_id = new_root_page_id as u32; payload.extend_from_slice(&(new_root_page_id as u32).to_be_bytes()); payload.extend(std::iter::repeat(0).take(9)); - let n = write_varint(&mut payload.as_mut_slice()[0..9], key as u64); - payload.truncate(n); + let n = write_varint(&mut payload.as_mut_slice()[4..], key as u64); + payload.truncate(4 + n); // write left child cell self.insert_into_cell(new_root_page_contents, &payload, 0); @@ -490,20 +482,17 @@ impl BTreeCursor { /* swap splitted page buffer with new root buffer so we don't have to update page idx */ { - let mut new_root_page_contents = new_root_page.contents.write().unwrap(); - let new_root_page_contents = new_root_page_contents.as_mut().unwrap(); - let root_buf = new_root_page_contents.buffer.as_ptr(); - let root_buf = unsafe { (*root_buf).as_mut_slice() }; - let mut page = page_rc.contents.write().unwrap(); - let page = page.as_mut().unwrap(); - let mut left_buf = RefCell::borrow_mut(&page.buffer); - let left_buf: &mut [u8] = left_buf.as_mut_slice(); + let mut new_root_page = RefCell::borrow_mut(&new_root_page_ref); + mem::swap(&mut *new_root_page, &mut *page_rc); - left_buf.swap_with_slice(root_buf); + // now swap contents + let mut 
new_root_page_contents = new_root_page.contents.write().unwrap(); + let mut page_contents = page_rc.contents.write().unwrap(); + std::mem::swap(&mut *new_root_page_contents, &mut *page_contents); + + self.page = + RefCell::new(Some(Rc::new(MemPage::new(None, new_root_page.id, 0)))); } - // swap in memory state of pages - mem::swap(&mut page_rc.id, &mut new_root_page.id); - self.page = RefCell::new(Some(Rc::new(MemPage::new(None, new_root_page.id, 0)))); break; } diff --git a/core/pager.rs b/core/pager.rs index e05dea4c2..876c5c242 100644 --- a/core/pager.rs +++ b/core/pager.rs @@ -14,7 +14,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, RwLock}; pub struct Page { - flags: AtomicUsize, + pub flags: AtomicUsize, pub contents: RwLock>, pub id: usize, } diff --git a/core/sqlite3_ondisk.rs b/core/sqlite3_ondisk.rs index b7739d8f8..d57efef75 100644 --- a/core/sqlite3_ondisk.rs +++ b/core/sqlite3_ondisk.rs @@ -232,12 +232,11 @@ impl Clone for PageContent { impl PageContent { pub fn page_type(&self) -> PageType { - let buf = self.buffer.borrow(); - let buf = buf.as_slice(); - buf[self.offset].try_into().unwrap() + self.read_u8(self.offset).try_into().unwrap() } fn read_u8(&self, pos: usize) -> u8 { + // unsafe trick to borrow twice unsafe { let buf_pointer = &self.buffer.as_ptr(); let buf = (*buf_pointer).as_ref().unwrap().as_slice(); @@ -344,9 +343,12 @@ impl PageContent { } pub fn cell_get_raw_region(&self, idx: usize) -> (usize, usize) { - let buf = self.buffer.borrow(); - let buf = buf.as_slice(); + let mut buf = self.buffer.borrow_mut(); + let buf = buf.as_mut_slice(); + self.cell_get_raw_region_borrowed(idx, buf) + } + pub fn cell_get_raw_region_borrowed(&self, idx: usize, buf: &mut [u8]) -> (usize, usize) { let ncells = self.cell_count(); let cell_start = match self.page_type() { PageType::IndexInterior => 12, @@ -374,7 +376,8 @@ impl PageContent { PageType::TableLeaf => { let (len_payload, n_payload) = 
read_varint(&buf[cell_pointer..]).unwrap(); let (_, n_rowid) = read_varint(&buf[cell_pointer + n_payload..]).unwrap(); - len_payload as usize + n_payload + n_rowid + 4 + // TODO: add overflow 4 bytes + len_payload as usize + n_payload + n_rowid } }; (start, len) @@ -448,6 +451,7 @@ pub fn begin_write_btree_page(pager: &Pager, page: &Rc>) -> Result let page_source = &pager.page_source; let page_finish = page.clone(); let page = page.borrow(); + let contents = page.contents.read().unwrap(); let contents = contents.as_ref().unwrap(); let buffer = contents.buffer.clone(); From 8c654adc5027589e608b90c701936e657ea82f99 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 31 Jul 2024 13:56:07 +0200 Subject: [PATCH 27/35] core: fix propagation of key to top Signed-off-by: Pere Diaz Bou --- core/btree.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/btree.rs b/core/btree.rs index ad11a7b43..4aa5fe640 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -497,9 +497,10 @@ impl BTreeCursor { break; } + // Propagate split divided to top. 
payload.extend_from_slice(&(mem_page.page_idx as u32).to_be_bytes()); payload.extend(std::iter::repeat(0).take(9)); - let n = write_varint(&mut payload.as_mut_slice()[0..9], key as u64); + let n = write_varint(&mut payload.as_mut_slice()[4..], key as u64); payload.truncate(n); self.page = RefCell::new(Some(mem_page.parent.as_ref().unwrap().clone())); From 8810a5c11ee5871b4901500419783d9a9ee6c867 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 31 Jul 2024 14:13:10 +0200 Subject: [PATCH 28/35] core: fix move_to replace cursor current page Signed-off-by: Pere Diaz Bou --- core/btree.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/core/btree.rs b/core/btree.rs index 4aa5fe640..9b1318e0a 100644 --- a/core/btree.rs +++ b/core/btree.rs @@ -198,15 +198,15 @@ impl BTreeCursor { } if !found_cell { - let parent = mem_page.parent.clone(); + let parent = mem_page.clone(); match page.rightmost_pointer() { Some(right_most_pointer) => { - let mem_page = MemPage::new(parent, right_most_pointer as usize, 0); + let mem_page = MemPage::new(Some(parent), right_most_pointer as usize, 0); self.page.replace(Some(Rc::new(mem_page))); continue; } None => { - unreachable!("we shall not go back up! The only way is down the slope") + unreachable!("we shall not go back up! 
The only way is down the slope"); } } } @@ -881,6 +881,11 @@ fn find_cell(page: &PageContent, int_key: u64) -> usize { break; } } + BTreeCell::TableInteriorCell(cell) => { + if int_key <= cell._rowid { + break; + } + } _ => todo!(), } cell_idx += 1; From 2b221d2b3c42287a7528e7bc44e71e90820f1931 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 31 Jul 2024 17:38:33 +0200 Subject: [PATCH 29/35] fix conflicts Signed-off-by: Pere Diaz Bou --- bindings/wasm/lib.rs | 2 +- core/translate/where_clause.rs | 6 +++--- simulator/main.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bindings/wasm/lib.rs b/bindings/wasm/lib.rs index 9b7582e7c..d859509d9 100644 --- a/bindings/wasm/lib.rs +++ b/bindings/wasm/lib.rs @@ -45,7 +45,7 @@ impl limbo_core::PageIO for PageIO { &self, _page_idx: usize, _buffer: Rc>, - _c: Rc, + _c: Rc, ) -> Result<()> { todo!() } diff --git a/core/translate/where_clause.rs b/core/translate/where_clause.rs index e4ec529ba..fca060a65 100644 --- a/core/translate/where_clause.rs +++ b/core/translate/where_clause.rs @@ -554,7 +554,7 @@ fn translate_condition_expr( // The left hand side only needs to be evaluated once we have a list of values to compare against. let lhs_reg = program.alloc_register(); - let _ = translate_expr(program, select, lhs, lhs_reg, cursor_hint)?; + let _ = translate_expr(program, Some(select), lhs, lhs_reg, cursor_hint)?; let rhs = rhs.as_ref().unwrap(); @@ -577,7 +577,7 @@ fn translate_condition_expr( for (i, expr) in rhs.iter().enumerate() { let rhs_reg = program.alloc_register(); let last_condition = i == rhs.len() - 1; - let _ = translate_expr(program, select, expr, rhs_reg, cursor_hint)?; + let _ = translate_expr(program, Some(select), expr, rhs_reg, cursor_hint)?; // If this is not the last condition, we need to jump to the 'jump_target_when_true' label if the condition is true. 
if !last_condition { program.emit_insn_with_label_dependency( @@ -614,7 +614,7 @@ fn translate_condition_expr( // If it's a NOT IN expression, we need to jump to the 'jump_target_when_false' label if any of the conditions are true. for expr in rhs.iter() { let rhs_reg = program.alloc_register(); - let _ = translate_expr(program, select, expr, rhs_reg, cursor_hint)?; + let _ = translate_expr(program, Some(select), expr, rhs_reg, cursor_hint)?; program.emit_insn_with_label_dependency( Insn::Eq { lhs: lhs_reg, diff --git a/simulator/main.rs b/simulator/main.rs index 628ba5526..4e26eea9f 100644 --- a/simulator/main.rs +++ b/simulator/main.rs @@ -162,7 +162,7 @@ impl limbo_core::File for SimulatorFile { &self, pos: usize, buffer: Rc>, - c: Rc, + c: Rc, ) -> Result<()> { if *self.fault.borrow() { *self.nr_pwrite_faults.borrow_mut() += 1; From 82ee0e4a00c3e6ff76d4328a75b733f6a828a00c Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 31 Jul 2024 17:58:45 +0200 Subject: [PATCH 30/35] core: fix completion generic io --- core/io/generic.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/core/io/generic.rs b/core/io/generic.rs index 0fb86d3d8..113a3aacd 100644 --- a/core/io/generic.rs +++ b/core/io/generic.rs @@ -1,4 +1,4 @@ -use crate::{Completion, File, Result, WriteCompletion, IO}; +use crate::{Completion, File, Result, IO}; use log::trace; use std::cell::RefCell; use std::io::{Read, Seek, Write}; @@ -45,11 +45,15 @@ impl File for GenericFile { let mut file = self.file.borrow_mut(); file.seek(std::io::SeekFrom::Start(pos as u64))?; { - let mut buf = c.buf_mut(); + let r = match &(*c) { + Completion::Read(r) => r, + Completion::Write(_) => unreachable!(), + }; + let mut buf = r.buf_mut(); let buf = buf.as_mut_slice(); file.read_exact(buf)?; } - c.complete(); + c.complete(0); Ok(()) } @@ -57,7 +61,7 @@ impl File for GenericFile { &self, pos: usize, buffer: Rc>, - c: Rc, + c: Rc, ) -> Result<()> { let mut file = self.file.borrow_mut(); 
file.seek(std::io::SeekFrom::Start(pos as u64))?; @@ -72,4 +76,4 @@ impl Drop for GenericFile { fn drop(&mut self) { self.unlock_file().expect("Failed to unlock file"); } -} \ No newline at end of file +} From 38c407a286e9f248f9a775decdb77546a03d78b6 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 31 Jul 2024 18:01:38 +0200 Subject: [PATCH 31/35] core: fix completion darwin io --- core/io/darwin.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/core/io/darwin.rs b/core/io/darwin.rs index 2b63da31e..f99d7bd62 100644 --- a/core/io/darwin.rs +++ b/core/io/darwin.rs @@ -67,7 +67,11 @@ impl IO for DarwinIO { match cf { CompletionCallback::Read(ref file, ref c, pos) => { let mut file = file.borrow_mut(); - let mut buf = c.buf_mut(); + let r = match &(*c) { + Completion::Read(r) => r, + Completion::Write(_) => unreachable!(), + }; + let mut buf = r.buf_mut(); file.seek(std::io::SeekFrom::Start(pos as u64))?; file.read(buf.as_mut_slice()) } @@ -83,7 +87,7 @@ impl IO for DarwinIO { std::result::Result::Ok(n) => { match cf { CompletionCallback::Read(_, ref c, _) => { - c.complete(); + c.complete(0); } CompletionCallback::Write(_, ref c, _, _) => { c.complete(n); @@ -142,7 +146,10 @@ impl File for DarwinFile { "Failed locking file. 
File is locked by another process" ))); } else { - return Err(LimboError::LockingError(format!("Failed locking file, {}", err))); + return Err(LimboError::LockingError(format!( + "Failed locking file, {}", + err + ))); } } Ok(()) From 10da6a673d90d00b7c938b63e01aba7159f0e5e5 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 31 Jul 2024 18:04:26 +0200 Subject: [PATCH 32/35] core: more fix completion darwin io --- core/io/darwin.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/core/io/darwin.rs b/core/io/darwin.rs index f99d7bd62..0cce7bea2 100644 --- a/core/io/darwin.rs +++ b/core/io/darwin.rs @@ -90,7 +90,7 @@ impl IO for DarwinIO { c.complete(0); } CompletionCallback::Write(_, ref c, _, _) => { - c.complete(n); + c.complete(n as i32); } } return Ok(()); @@ -178,14 +178,18 @@ impl File for DarwinFile { fn pread(&self, pos: usize, c: Rc) -> Result<()> { let file = self.file.borrow(); let result = { - let mut buf = c.buf_mut(); + let r = match &(*c) { + Completion::Read(r) => r, + Completion::Write(_) => unreachable!(), + }; + let mut buf = r.buf_mut(); rustix::io::pread(file.as_fd(), buf.as_mut_slice(), pos as u64) }; match result { std::result::Result::Ok(n) => { trace!("pread n: {}", n); // Read succeeded immediately - c.complete(); + c.complete(0); Ok(()) } Err(Errno::AGAIN) => { @@ -222,7 +226,7 @@ impl File for DarwinFile { std::result::Result::Ok(n) => { trace!("pwrite n: {}", n); // Read succeeded immediately - c.complete(n); + c.complete(n as i32); Ok(()) } Err(Errno::AGAIN) => { From b8e08dcdc4025c0dbdd2b94077d5b595c3b02e8b Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 31 Jul 2024 18:07:22 +0200 Subject: [PATCH 33/35] core: more2 fix completion darwin io --- core/io/darwin.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/io/darwin.rs b/core/io/darwin.rs index 0cce7bea2..ed8acfc3d 100644 --- a/core/io/darwin.rs +++ b/core/io/darwin.rs @@ -67,7 +67,7 @@ impl IO for DarwinIO { match 
cf { CompletionCallback::Read(ref file, ref c, pos) => { let mut file = file.borrow_mut(); - let r = match &(*c) { + let r = match *c { Completion::Read(r) => r, Completion::Write(_) => unreachable!(), }; @@ -178,7 +178,7 @@ impl File for DarwinFile { fn pread(&self, pos: usize, c: Rc) -> Result<()> { let file = self.file.borrow(); let result = { - let r = match &(*c) { + let r = match *c { Completion::Read(r) => r, Completion::Write(_) => unreachable!(), }; From e00690bf9b29ca38182602e34f8ec2c93fef4e52 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 31 Jul 2024 19:52:59 +0300 Subject: [PATCH 34/35] core: Fix I/O build on Darwin --- core/io/darwin.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/core/io/darwin.rs b/core/io/darwin.rs index ed8acfc3d..06b057f65 100644 --- a/core/io/darwin.rs +++ b/core/io/darwin.rs @@ -2,7 +2,7 @@ use crate::error::LimboError; use crate::io::common; use crate::Result; -use super::{Completion, File, WriteCompletion, IO}; +use super::{Completion, File, IO}; use libc::{c_short, fcntl, flock, F_SETLK}; use log::trace; use polling::{Event, Events, Poller}; @@ -67,7 +67,8 @@ impl IO for DarwinIO { match cf { CompletionCallback::Read(ref file, ref c, pos) => { let mut file = file.borrow_mut(); - let r = match *c { + let c: &Completion = &c; + let r = match c { Completion::Read(r) => r, Completion::Write(_) => unreachable!(), }; @@ -85,7 +86,7 @@ impl IO for DarwinIO { }; match result { std::result::Result::Ok(n) => { - match cf { + match &cf { CompletionCallback::Read(_, ref c, _) => { c.complete(0); } @@ -109,7 +110,7 @@ enum CompletionCallback { Read(Rc>, Rc, usize), Write( Rc>, - Rc, + Rc, Rc>, usize, ), @@ -175,10 +176,10 @@ impl File for DarwinFile { Ok(()) } - fn pread(&self, pos: usize, c: Rc) -> Result<()> { - let file = self.file.borrow(); + fn pread(&self, pos: usize, c: Rc) -> Result<()> { + let file = self.file.borrow(); let result = { - let r = match *c { + let r = match &(*c) { 
Completion::Read(r) => r, Completion::Write(_) => unreachable!(), }; @@ -215,7 +216,7 @@ impl File for DarwinFile { &self, pos: usize, buffer: Rc>, - c: Rc, + c: Rc, ) -> Result<()> { let file = self.file.borrow(); let result = { From 8f6a2fc814c3337a5b23c237a3d01ab531aa9663 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 31 Jul 2024 19:58:37 +0300 Subject: [PATCH 35/35] core: Fix I/O compilation on Windows ...just steal the generic implementation of pread() and pwrite(). --- core/io/windows.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/io/windows.rs b/core/io/windows.rs index cd0119b9c..ca48e8d0b 100644 --- a/core/io/windows.rs +++ b/core/io/windows.rs @@ -43,11 +43,15 @@ impl File for WindowsFile { let mut file = self.file.borrow_mut(); file.seek(std::io::SeekFrom::Start(pos as u64))?; { - let mut buf = c.buf_mut(); + let r = match &(*c) { + Completion::Read(r) => r, + Completion::Write(_) => unreachable!(), + }; + let mut buf = r.buf_mut(); let buf = buf.as_mut_slice(); file.read_exact(buf)?; } - c.complete(); + c.complete(0); Ok(()) } @@ -55,7 +59,7 @@ impl File for WindowsFile { &self, pos: usize, buffer: Rc>, - _c: Rc, + c: Rc, ) -> Result<()> { let mut file = self.file.borrow_mut(); file.seek(std::io::SeekFrom::Start(pos as u64))?;