wip: refactor insert to separate file

nazeh
2023-12-20 19:57:15 +03:00
parent 5d2f61ccc8
commit 2c29d021f0
7 changed files with 335 additions and 345 deletions
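Below is a minimal usage sketch (not part of the commit) of the public API after this refactor, mirroring the tests in treap.rs: open a redb database, create a named treap, and insert a key/value pair. The crate name `mast` is taken from the file paths in this commit; the treap name "example" and the use of `tempfile` outside of a test are assumptions.

use mast::treap::HashTreap;
use redb::Database;

fn main() {
    // A throwaway on-disk database, created the same way as in the existing tests.
    let file = tempfile::NamedTempFile::new().unwrap();
    let db = Database::create(file.path()).unwrap();

    // Treaps are now opened by name; the second argument is new in this commit.
    let mut treap = HashTreap::new(&db, "example");

    // `insert` is still a work in progress at this point in the refactor.
    treap.insert(b"key", b"value");
}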

View File

@@ -2,6 +2,9 @@
mod mermaid;
mod node;
mod operations;
pub mod treap;
pub(crate) use blake3::{Hash, Hasher};
pub const HASH_LEN: usize = 32;

View File

@@ -3,13 +3,13 @@ mod test {
use crate::node::Node;
use crate::treap::HashTreap;
impl<'a> HashTreap<'a> {
impl<'treap> HashTreap<'treap> {
pub fn as_mermaid_graph(&self) -> String {
let mut graph = String::new();
graph.push_str("graph TD;\n");
if let Some(root) = self.root.clone() {
if let Some(root) = self.root() {
self.build_graph_string(&root, &mut graph);
}

View File

@@ -1,11 +1,13 @@
use redb::{Database, ReadableTable, Table, TableDefinition, WriteTransaction};
//! In memory representation of a treap node.
use crate::{Hash, Hasher};
use redb::{ReadableTable, Table};
use crate::{Hash, Hasher, HASH_LEN};
// TODO: Are we creating too many hashers?
// TODO: are we calculating the rank and hash too often?
const HASH_LEN: usize = 32;
// TODO: remove unused
// TODO: remove unwrap
#[derive(Debug, Clone)]
/// In memory representation of a treap node.
@@ -28,6 +30,12 @@ pub(crate) enum Branch {
Right,
}
#[derive(Debug)]
enum RefCountDiff {
Increment,
Decrement,
}
impl Node {
pub fn new(key: &[u8], value: &[u8]) -> Self {
Self {
@@ -58,7 +66,7 @@ impl Node {
}
};
let (right, rest) = decode(rest);
let (right, _) = decode(rest);
let right = match right.len() {
0 => None,
32 => {
@@ -109,31 +117,7 @@ impl Node {
hash(&self.canonical_encode())
}
pub(crate) fn set_child(
&mut self,
branch: &Branch,
new_child: Option<Hash>,
table: &mut Table<&[u8], (u64, &[u8])>,
) {
let old_child = match branch {
Branch::Left => self.left,
Branch::Right => self.right,
};
// decrement the old child's ref count.
decrement_ref_count(old_child, table);
// increment new child's ref count.
increment_ref_count(new_child, table);
// set new child
match branch {
Branch::Left => self.left = new_child,
Branch::Right => self.right = new_child,
}
self.save(table);
}
pub(crate) fn decrement_ref_count(&self, table: &mut Table<&[u8], (u64, &[u8])>) {}
pub(crate) fn save(&self, table: &mut Table<&[u8], (u64, &[u8])>) {
let encoded = self.canonical_encode();
@@ -147,6 +131,27 @@ impl Node {
// === Private Methods ===
fn update_ref_count(&self, table: &mut Table<&[u8], (u64, &[u8])>, diff: RefCountDiff) {
let ref_count = match diff {
RefCountDiff::Increment => self.ref_count + 1,
RefCountDiff::Decrement => {
if self.ref_count > 0 {
self.ref_count - 1
} else {
self.ref_count
}
}
};
let bytes = self.canonical_encode();
let hash = hash(&bytes);
match ref_count {
0 => table.remove(hash.as_bytes().as_slice()),
_ => table.insert(hash.as_bytes().as_slice(), (ref_count, bytes.as_slice())),
};
}
fn canonical_encode(&self) -> Vec<u8> {
let mut bytes = vec![];
@@ -166,6 +171,40 @@ impl Node {
}
}
pub(crate) fn rank(key: &[u8]) -> Hash {
hash(key)
}
/// Returns the node for a given hash.
pub(crate) fn get_node<'a>(
table: &'a impl ReadableTable<&'static [u8], (u64, &'static [u8])>,
hash: &[u8],
) -> Option<Node> {
let existing = table.get(hash).unwrap();
if existing.is_none() {
return None;
}
let data = existing.unwrap();
Some(Node::decode(data.value()))
}
/// Returns the root hash for a given table.
pub(crate) fn get_root_hash<'a>(
table: &'a impl ReadableTable<&'static [u8], &'static [u8]>,
name: &str,
) -> Option<Hash> {
let existing = table.get(name.as_bytes()).unwrap();
if existing.is_none() {
return None;
}
let hash = existing.unwrap();
let hash: [u8; HASH_LEN] = hash.value().try_into().expect("Invalid root hash");
Some(Hash::from_bytes(hash))
}
fn encode(bytes: &[u8], out: &mut Vec<u8>) {
// TODO: find a better way to reserve bytes.
let current_len = out.len();
@@ -191,64 +230,3 @@ fn hash(bytes: &[u8]) -> Hash {
hasher.finalize()
}
#[derive(Debug)]
enum RefCountDiff {
Increment,
Decrement,
}
pub(crate) fn increment_ref_count(child: Option<Hash>, table: &mut Table<&[u8], (u64, &[u8])>) {
update_ref_count(child, RefCountDiff::Increment, table);
}
pub(crate) fn decrement_ref_count(child: Option<Hash>, table: &mut Table<&[u8], (u64, &[u8])>) {
update_ref_count(child, RefCountDiff::Decrement, table);
}
fn update_ref_count(
child: Option<Hash>,
ref_diff: RefCountDiff,
table: &mut Table<&[u8], (u64, &[u8])>,
) {
if let Some(hash) = child {
dbg!("should update child ref", &child);
let mut existing = table
.get(hash.as_bytes().as_slice())
.unwrap()
.expect("Child shouldn't be missing!");
let (ref_count, bytes) = {
let (r, v) = existing.value();
(r, v.to_vec())
};
drop(existing);
dbg!((
"\n\n decrmenting blah blah blah child",
&child,
&ref_count,
&ref_diff
));
let ref_count = match ref_diff {
RefCountDiff::Increment => ref_count + 1,
RefCountDiff::Decrement => {
if ref_count > 0 {
ref_count - 1
} else {
ref_count
}
}
};
match ref_count {
0 => {
// TODO: Confirm (read: test) this, because it is not easy to see in graphs.
table.remove(hash.as_bytes().as_slice());
}
_ => {
table.insert(hash.as_bytes().as_slice(), (ref_count, bytes.as_slice()));
}
}
}
}

mast/src/operations/gc.rs Normal file
View File

@@ -0,0 +1,56 @@
use blake3::Hash;
use redb::{Database, ReadableTable, Table, TableDefinition, WriteTransaction};
#[derive(Debug)]
enum RefCountDiff {
Increment,
Decrement,
}
pub(crate) fn increment_ref_count(node: Option<Hash>, table: &mut Table<&[u8], (u64, &[u8])>) {
update_ref_count(node, RefCountDiff::Increment, table);
}
pub(crate) fn decrement_ref_count(node: Option<Hash>, table: &mut Table<&[u8], (u64, &[u8])>) {
update_ref_count(node, RefCountDiff::Decrement, table);
}
fn update_ref_count(
node: Option<Hash>,
ref_diff: RefCountDiff,
table: &mut Table<&[u8], (u64, &[u8])>,
) {
if let Some(hash) = node {
let mut existing = table
.get(hash.as_bytes().as_slice())
.unwrap()
.expect("node shouldn't be missing!");
let (ref_count, bytes) = {
let (r, v) = existing.value();
(r, v.to_vec())
};
drop(existing);
let ref_count = match ref_diff {
RefCountDiff::Increment => ref_count + 1,
RefCountDiff::Decrement => {
if ref_count > 0 {
ref_count - 1
} else {
ref_count
}
}
};
match ref_count {
0 => {
// TODO: Confirm (read: test) this, because it is not easy to see in graphs.
table.remove(hash.as_bytes().as_slice());
}
_ => {
table.insert(hash.as_bytes().as_slice(), (ref_count, bytes.as_slice()));
}
}
}
}
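// --- Illustration (not part of this commit) ----------------------------------
// A crate-internal test sketching the intended ref-count behaviour of the two
// helpers above: incrementing bumps the stored count, decrementing lowers it,
// and a count that reaches zero removes the record entirely. The module name,
// the table name `test:gc:nodes:v0`, and the dummy node bytes are made up for
// this sketch; `tempfile` is used the same way as in the existing treap tests.
#[cfg(test)]
mod ref_count_sketch {
    use super::{decrement_ref_count, increment_ref_count};
    use redb::{Database, ReadableTable, TableDefinition};

    const TEST_NODES: TableDefinition<&[u8], (u64, &[u8])> =
        TableDefinition::new("test:gc:nodes:v0");

    #[test]
    fn increment_then_decrement_to_zero() {
        let file = tempfile::NamedTempFile::new().unwrap();
        let db = Database::create(file.path()).unwrap();

        let encoded: &[u8] = b"dummy encoded node";
        let node_hash = blake3::hash(encoded);

        let write_txn = db.begin_write().unwrap();
        {
            let mut table = write_txn.open_table(TEST_NODES).unwrap();

            // Seed a record with a ref count of 1, keyed by the node hash.
            table
                .insert(node_hash.as_bytes().as_slice(), (1u64, encoded))
                .unwrap();

            increment_ref_count(Some(node_hash), &mut table); // 1 -> 2
            decrement_ref_count(Some(node_hash), &mut table); // 2 -> 1
            decrement_ref_count(Some(node_hash), &mut table); // 1 -> 0, record removed

            assert!(table
                .get(node_hash.as_bytes().as_slice())
                .unwrap()
                .is_none());
        }
        write_txn.commit().unwrap();
    }
}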

View File

@@ -0,0 +1,142 @@
use crate::node::{get_node, get_root_hash, rank, Branch, Node};
use crate::treap::{HashTreap, NODES_TABLE, ROOTS_TABLE};
use crate::HASH_LEN;
use blake3::Hash;
use redb::{Database, ReadTransaction, ReadableTable, Table, TableDefinition, WriteTransaction};
// Watch this [video](https://youtu.be/NxRXhBur6Xs?si=GNwaUOfuGwr_tBKI&t=1763) for a good explanation of the unzipping algorithm.
// Also see the iterative insertion algorithm on page 12 of the [original paper](https://arxiv.org/pdf/1806.06726.pdf).
// The difference here is that in a Hash Treap, we need to update nodes bottom up.
// Let's say we have the following tree:
//
// F
// / \
// D P
// / / \
// C H X
// / / \ \
// A G M Y
// /
// I
//
// The binary search path for inserting `J` then is:
//
// F
// \
// P
// /
// H
// \
// M
// /
// I
//
// Then we define `upper_path` as the path from the root to the insertion point
// marked by the first node with a `rank` that is either:
//
// - less than the `rank` of the inserted key:
//
// F
// \
// P
// ∧-- / --∧ upper path if rank(J) > rank(H)
// -- H -- unzip path
// \
// M Note that this is an arbitrary example,
// / do not expect the actual ranks of these keys to be the same in the implementation.
// I
//
// Nodes on the upper path don't change much beyond updating the hash of the child that sits on
// this binary search path.
//
// We call the rest of the path the `unzipping path` or `split path`; this is where we create two
// new paths (left and right), each containing the nodes with keys smaller than or larger than the
// inserted key, respectively.
//
// We update these unzipped paths from the _bottom up_ to generate the new hashes for their
// parents.
// Once we have the two paths, we use their tips as the new children of the newly inserted node.
// Finally we update the hashes upwards until we reach the new root of the tree.
//
// - equal to the `rank` of the inserted key:
//
// F
// \
// P
// /
// H --^ upper path if
// rank(inserted key) = rank(H), i.e. the inserted key is `H` itself
//
// This (exact key match) is the only way for the rank to match
// for secure hashes like blake3.
//
// This is a different case since we don't really need to split (unzip) the lower path; we just
// need to update the hash of the node (according to the new value) and then the hashes of its
// parents until we reach the root.
//
// Also note that we need to update the `ref_count` of all the affected nodes, and delete the nodes
// whose `ref_count` drops to zero.
//
// The simplest way to do so is to decrement all the nodes in the search path, and then increment
// all the new nodes (in both the upper and lower paths) before committing the write transaction.
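// --- Illustration (not part of this commit) ----------------------------------
// A minimal, storage-free sketch of the unzip step described above. The names
// `SketchNode` and `unzip` are hypothetical and exist only for illustration;
// the real code works on hashes stored in redb tables and also has to maintain
// ref counts and re-hash nodes bottom up, which this sketch ignores (as it does
// the exact-key-match case, where no unzipping is needed).

#[derive(Debug)]
struct SketchNode {
    key: Vec<u8>,
    left: Option<Box<SketchNode>>,
    right: Option<Box<SketchNode>>,
}

/// Splits a subtree into the nodes with keys smaller than `key` (left) and the
/// nodes with keys greater than `key` (right), consuming the subtree.
fn unzip(
    node: Option<Box<SketchNode>>,
    key: &[u8],
) -> (Option<Box<SketchNode>>, Option<Box<SketchNode>>) {
    match node {
        None => (None, None),
        Some(mut n) => {
            if key < n.key.as_slice() {
                // `n` and everything to its right are greater than `key`;
                // keep unzipping its left subtree.
                let (smaller, greater) = unzip(n.left.take(), key);
                n.left = greater;
                (smaller, Some(n))
            } else {
                // `n` and everything to its left are smaller than `key`;
                // keep unzipping its right subtree.
                let (smaller, greater) = unzip(n.right.take(), key);
                n.right = smaller;
                (Some(n), greater)
            }
        }
    }
}

// For the tree above, unzipping the subtree rooted at `H` with the key `J`
// yields `H` (now holding `I` as its right child) as the tip of the left path
// and `M` as the tip of the right path; those two tips become the children of
// the newly inserted `J`, and hashes are then updated upwards along the upper
// path (`P`, then `F`).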
impl<'treap> HashTreap<'treap> {
pub fn insert(&mut self, key: &[u8], value: &[u8]) {
// TODO: validate key and value length.
let write_txn = self.db.begin_write().unwrap();
'transaction: {
let roots_table = write_txn.open_table(ROOTS_TABLE).unwrap();
let mut nodes_table = write_txn.open_table(NODES_TABLE).unwrap();
let root = get_root_hash(&roots_table, &self.name);
let mut path = upper_path(key, root, &nodes_table);
path.iter_mut()
.for_each(|node| node.decrement_ref_count(&mut nodes_table))
// if path.
};
// Finally commit the changes to the storage.
write_txn.commit().unwrap();
}
}
/// Returns the current nodes from the root to the insertion point on the binary search path.
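///
/// For the example tree in the comment block above, the search path for
/// inserting `J` is `F -> P -> H -> M -> I`, so all five nodes are collected in
/// that order; the loop below only stops early on an exact key match or a
/// missing child.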
fn upper_path<'a>(
key: &[u8],
root: Option<Hash>,
nodes_table: &'a impl ReadableTable<&'static [u8], (u64, &'static [u8])>,
) -> Vec<Node> {
let rank = rank(key);
let mut path: Vec<Node> = Vec::new();
let mut previous_hash = root;
while let Some(current_hash) = previous_hash {
let current_node = get_node(nodes_table, current_hash.as_bytes()).expect("Node not found!");
let current_key = current_node.key();
if key == current_key {
// We found an exact match, we don't need to unzip the rest.
path.push(current_node);
break;
}
if key < current_key {
previous_hash = *current_node.left();
} else {
previous_hash = *current_node.right();
}
path.push(current_node);
}
path
}

View File

@@ -0,0 +1,2 @@
pub mod gc;
mod insert;

View File

@@ -1,272 +1,61 @@
use blake3::{Hash, Hasher};
// use redb::{Database, ite, ReadableTable, Table, TableDefinition};
use blake3::Hash;
use redb::*;
use crate::node::{decrement_ref_count, increment_ref_count, Branch, Node};
// TODO: remove unused
// TODO: remove unwrap
#[derive(Debug)]
pub struct HashTreap<'a> {
/// Redb database to store the nodes.
pub(crate) db: &'a Database,
pub(crate) root: Option<Node>,
}
use crate::node::{get_node, get_root_hash, Node};
// Table: Nodes v0
// stores all the hash treap nodes from all the treaps in the storage.
//
// Key: `[u8; 32]` # Node hash
// Value: `(u64, [u8])` # (RefCount, EncodedNode)
const NODES_TABLE: TableDefinition<&[u8], (u64, &[u8])> =
pub const NODES_TABLE: TableDefinition<&[u8], (u64, &[u8])> =
TableDefinition::new("kytz:hash_treap:nodes:v0");
impl<'a> HashTreap<'a> {
// Table: Roots v0
// stores all the current roots for all treaps in the storage.
//
// Key: `[u8]` # Treap name
// Value: `[u8; 32]` # Hash
pub const ROOTS_TABLE: TableDefinition<&[u8], &[u8]> =
TableDefinition::new("kytz:hash_treap:roots:v0");
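// --- Illustration (not part of this commit) ----------------------------------
// A crate-internal test sketching how the two tables above chain together: the
// roots table maps a treap name to a 32-byte root hash, and the nodes table
// maps that hash to a `(ref_count, encoded node)` record. The module name and
// the treap name `"test"` are made up; the `Node` methods and the `get_node` /
// `get_root_hash` helpers are the ones introduced elsewhere in this commit.
#[cfg(test)]
mod schema_sketch {
    use super::{NODES_TABLE, ROOTS_TABLE};
    use crate::node::{get_node, get_root_hash, Node};
    use redb::Database;

    #[test]
    fn name_to_root_hash_to_node() {
        let file = tempfile::NamedTempFile::new().unwrap();
        let db = Database::create(file.path()).unwrap();

        let node = Node::new(b"key", b"value");
        let hash = node.hash();

        let write_txn = db.begin_write().unwrap();
        {
            let mut nodes = write_txn.open_table(NODES_TABLE).unwrap();
            let mut roots = write_txn.open_table(ROOTS_TABLE).unwrap();

            // Store the node and point the treap name at its hash.
            node.save(&mut nodes);
            roots
                .insert("test".as_bytes(), hash.as_bytes().as_slice())
                .unwrap();

            // name -> root hash -> node.
            let root_hash = get_root_hash(&roots, "test").expect("missing root");
            let root_node =
                get_node(&nodes, root_hash.as_bytes()).expect("missing node");
            assert_eq!(root_node.key(), &b"key"[..]);
        }
        write_txn.commit().unwrap();
    }
}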
#[derive(Debug)]
pub struct HashTreap<'treap> {
/// Redb database to store the nodes.
pub(crate) db: &'treap Database,
pub(crate) name: &'treap str,
}
impl<'treap> HashTreap<'treap> {
// TODO: add name to open from storage with.
pub fn new(db: &'a Database) -> Self {
pub fn new(db: &'treap Database, name: &'treap str) -> Self {
// Setup tables
let write_tx = db.begin_write().unwrap();
{
let _table = write_tx.open_table(NODES_TABLE).unwrap();
let _table = write_tx.open_table(ROOTS_TABLE).unwrap();
}
write_tx.commit().unwrap();
// TODO: Try to open root (using this treaps or tags table).
// TODO: should be checking for root on the fly probably!
Self { root: None, db }
Self { name, db }
}
pub fn insert(&mut self, key: &[u8], value: &[u8]) {
// TODO: validate key and value length.
// === Getters ===
let mut node = Node::new(key, value);
pub(crate) fn root(&self) -> Option<Node> {
let read_txn = self.db.begin_read().unwrap();
let write_txn = self.db.begin_write().unwrap();
let roots_table = read_txn.open_table(ROOTS_TABLE).unwrap();
let nodes_table = read_txn.open_table(NODES_TABLE).unwrap();
let _ = 'transaction: {
let mut nodes_table = write_txn.open_table(NODES_TABLE).unwrap();
if self.root.is_none() {
// We are done.
self.update_root(&node, &mut nodes_table);
break 'transaction;
}
// Watch this [video](https://youtu.be/NxRXhBur6Xs?si=GNwaUOfuGwr_tBKI&t=1763) for a good explanation of the unzipping algorithm.
// Also see the Iterative insertion algorithm in the page 12 of the [original paper](https://arxiv.org/pdf/1806.06726.pdf).
// The difference here is that in a Hash Treap, we need to update nodes bottom up.
// Let's say we have the following tree:
//
// F
// / \
// D P
// / / \
// C H X
// / / \ \
// A G M Y
// /
// I
//
// First we mark the binary search path to the leaf, going right if the key is greater than
// the current node's key and vice versa.
//
// F
// \
// P
// /
// H
// \
// M
// /
// I
//
// Path before insertion point. (Node, Branch to update)
let mut top_path: Vec<(Node, Branch)> = Vec::new();
// Subtree of nodes on the path smaller than the inserted key.
let mut left_unzip_path: Vec<Node> = Vec::new();
// Subtree of nodes on the path larger than the inserted key.
let mut right_unzip_path: Vec<Node> = Vec::new();
let mut next = self.root.clone().map(|n| n.hash());
// Top down traversal of the binary search path.
while let Some(current) = self.get_node(&next) {
let node_rank = node.rank();
let curr_rank = current.rank();
if node_rank.as_bytes() == curr_rank.as_bytes() {
// Same key, we should update the value and return.
self.update_root(&node, &mut nodes_table);
break 'transaction;
}
let should_zip = node_rank.as_bytes() > curr_rank.as_bytes();
// Traverse left or right.
if key < current.key() {
next = *current.left();
if should_zip {
right_unzip_path.push(current)
} else {
top_path.push((current, Branch::Left));
}
} else {
next = *current.right();
if should_zip {
left_unzip_path.push(current)
} else {
top_path.push((current, Branch::Right));
}
};
}
// === Updating hashes bottom up ===
// We are at the unzipping part of the path.
//
// First do the unzipping bottom up.
//
// H
// \
// M < current_right
// /
// I < current_left
//
// Into (hopefully you can see the "unzipping"):
//
// left right
// subtree subtree
//
// H |
// \ |
// I | M
// dbg!((
// "unzipping left",
// String::from_utf8(node.key().to_vec()).unwrap(),
// &left_unzip_path
// .iter()
// .map(|n| String::from_utf8(n.key().to_vec()).unwrap())
// .collect::<Vec<_>>(),
// &right_unzip_path
// .iter()
// .map(|n| String::from_utf8(n.key().to_vec()).unwrap())
// .collect::<Vec<_>>(),
// ));
let left_unzip_path_len = left_unzip_path.len();
for i in 0..left_unzip_path_len {
if i == left_unzip_path_len - 1 {
// The last node in the path is special, since we need to clear its right
// child from older versions.
let child = left_unzip_path.get_mut(i).unwrap();
child.set_child(&Branch::Right, None, &mut nodes_table);
// Skip the last element for the first iterator
break;
}
let (first, second) = left_unzip_path.split_at_mut(i + 1);
let child = &first[i];
let parent = &mut second[0];
parent.set_child(&Branch::Right, Some(child.hash()), &mut nodes_table);
}
let right_unzip_path_len = right_unzip_path.len();
for i in 0..right_unzip_path_len {
if i == right_unzip_path_len - 1 {
// The last node in the path is special, since we need to clear its right
// child from older versions.
let child = right_unzip_path.get_mut(i).unwrap();
dbg!(("clearing the left child fuckin please", &child));
child.set_child(&Branch::Left, None, &mut nodes_table);
dbg!(("clearing the left child fuckin please", &child));
// Skip the last element for the first iterator
break;
}
let (first, second) = right_unzip_path.split_at_mut(i + 1);
let child = &first[i];
let parent = &mut second[0];
parent.set_child(&Branch::Left, Some(child.hash()), &mut nodes_table);
}
// Done unzipping, join the current_left and current_right to J and update hashes upwards.
//
// J < Insertion point.
// / \
// H M
// \
// I
node.set_child(
&Branch::Left,
left_unzip_path.first().map(|n| n.hash()),
&mut nodes_table,
);
node.set_child(
&Branch::Right,
right_unzip_path.first().map(|n| n.hash()),
&mut nodes_table,
);
// Update the rest of the path upwards with the new hashes.
// So the final tree should look like:
//
// F
// / \
// D P
// / / \
// C J X
// / / \ \
// A H M Y
// / \
// G I
if top_path.is_empty() {
// The insertion point is at the root and we are done.
self.update_root(&node, &mut nodes_table)
}
let mut previous = node;
while let Some((mut parent, branch)) = top_path.pop() {
parent.set_child(&branch, Some(previous.hash()), &mut nodes_table);
previous = parent;
}
// Update the root pointer.
self.update_root(&previous, &mut nodes_table)
};
// Finally we should commit the changes to the storage.
write_txn.commit().unwrap();
let hash = get_root_hash(&roots_table, self.name);
hash.and_then(|hash| get_node(&nodes_table, hash.as_bytes()))
}
// === Private Methods ===
fn update_root(&mut self, node: &Node, table: &mut Table<&[u8], (u64, &[u8])>) {
// decrement_ref_count(self.root.clone().map(|n| n.hash()), table);
node.save(table);
// The tree is empty, the incoming node has to be the root, and we are done.
self.root = Some(node.clone());
// TODO: we need to persist the root change too to the storage.
// TODO: add a tag to persist snapshots.
increment_ref_count(self.root.clone().map(|n| n.hash()), table);
}
/// Create a read transaction and get a node from the nodes table.
pub(crate) fn get_node(&self, hash: &Option<Hash>) -> Option<Node> {
let read_txn = self.db.begin_read().unwrap();
let table = read_txn.open_table(NODES_TABLE).unwrap();
@@ -281,10 +70,12 @@ impl<'a> HashTreap<'a> {
// === Test Methods ===
// TODO: move tests and test helper methods to separate module.
// Only keep the public methods here, and probably move it to lib.rs too.
#[cfg(test)]
fn verify_ranks(&self) -> bool {
let node = self.get_node(&self.root.clone().map(|n| n.hash()));
self.check_rank(node)
self.check_rank(self.root())
}
#[cfg(test)]
@@ -292,11 +83,9 @@ impl<'a> HashTreap<'a> {
match node {
Some(n) => {
let left_check = self.get_node(n.left()).map_or(true, |left| {
dbg!(("left", &left));
n.rank().as_bytes() > left.rank().as_bytes() && self.check_rank(Some(left))
});
let right_check = self.get_node(n.right()).map_or(true, |right| {
dbg!(("right", &right));
n.rank().as_bytes() > right.rank().as_bytes() && self.check_rank(Some(right))
});
@@ -308,6 +97,7 @@ impl<'a> HashTreap<'a> {
#[cfg(test)]
fn list_all_nodes(&self) {
// TODO: return all the nodes to verify GC in the test, or verify it here.
let read_txn = self.db.begin_read().unwrap();
let nodes_table = read_txn.open_table(NODES_TABLE).unwrap();
@@ -337,17 +127,17 @@ mod test {
use super::HashTreap;
use super::Node;
use redb::backends::InMemoryBackend;
use redb::{Database, Error, ReadableTable, TableDefinition};
// TODO: write a good test for GC.
#[test]
fn sorted_insert() {
// Create an in-memory database
let file = tempfile::NamedTempFile::new().unwrap();
let db = Database::create(file.path()).unwrap();
let mut treap = HashTreap::new(&db);
let mut treap = HashTreap::new(&db, "test");
let mut keys = [
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q",
@@ -364,11 +154,10 @@ mod test {
#[test]
fn unsorted_insert() {
// Create an in-memory database
let file = tempfile::NamedTempFile::new().unwrap();
let db = Database::create(file.path()).unwrap();
let mut treap = HashTreap::new(&db);
let mut treap = HashTreap::new(&db, "test");
// TODO: fix these cases
let mut keys = [
@@ -399,7 +188,7 @@ mod test {
let file = tempfile::NamedTempFile::new().unwrap();
let db = Database::create(file.path()).unwrap();
let mut treap = HashTreap::new(&db);
let mut treap = HashTreap::new(&db, "test");
let mut keys = ["X", "X"];
@@ -413,4 +202,24 @@ mod test {
println!("{}", treap.as_mermaid_graph())
}
#[test]
fn upsert_deeper_than_root() {
let file = tempfile::NamedTempFile::new().unwrap();
let db = Database::create(file.path()).unwrap();
let mut treap = HashTreap::new(&db, "test");
let mut keys = ["F", "X", "X"];
for key in keys.iter() {
treap.insert(key.as_bytes(), b"0");
}
assert!(treap.verify_ranks(), "Ranks are not correct");
// TODO: check the value.
println!("{}", treap.as_mermaid_graph())
}
}