From eac90cc9fe7f6249e8361e2d8bc087e9f4e742af Mon Sep 17 00:00:00 2001
From: nazeh
Date: Tue, 19 Dec 2023 19:46:24 +0300
Subject: [PATCH] wip: fix zip path

---
 mast/src/lib.rs            |  11 --
 mast/src/mermaid.rs        |  14 +-
 mast/src/node.rs           | 134 +++++++++-----
 mast/src/storage/memory.rs |  39 -----
 mast/src/storage/mod.rs    |   1 -
 mast/src/treap.rs          | 345 ++++++++++++++++++++-----------------
 6 files changed, 289 insertions(+), 255 deletions(-)
 delete mode 100644 mast/src/storage/memory.rs
 delete mode 100644 mast/src/storage/mod.rs

diff --git a/mast/src/lib.rs b/mast/src/lib.rs
index dfadc2d..2a0d595 100644
--- a/mast/src/lib.rs
+++ b/mast/src/lib.rs
@@ -2,17 +2,6 @@
 mod mermaid;
 mod node;
-mod storage;
 pub mod treap;
 
 pub(crate) use blake3::{Hash, Hasher};
-
-pub(crate) use node::Node;
-pub(crate) use treap::HashTreap;
-
-// TODO: If we are going to use Iroh Bytes, might as well ues this from Iroh basics.
-/// The hash for the empty byte range (`b""`).
-pub(crate) const EMPTY_HASH: Hash = Hash::from_bytes([
-    175, 19, 73, 185, 245, 249, 161, 166, 160, 64, 77, 234, 54, 220, 201, 73, 155, 203, 37, 201,
-    173, 193, 18, 183, 204, 154, 147, 202, 228, 31, 50, 98,
-]);
diff --git a/mast/src/mermaid.rs b/mast/src/mermaid.rs
index e5481f6..9f177a0 100644
--- a/mast/src/mermaid.rs
+++ b/mast/src/mermaid.rs
@@ -1,6 +1,7 @@
 #[cfg(test)]
 mod test {
-    use crate::{HashTreap, Node};
+    use crate::node::Node;
+    use crate::treap::HashTreap;
 
     impl<'a> HashTreap<'a> {
         pub fn as_mermaid_graph(&self) -> String {
@@ -8,7 +9,7 @@ mod test {
 
             graph.push_str("graph TD;\n");
 
-            if let Some(root) = self.get_node(&self.root) {
+            if let Some(root) = self.root.clone() {
                 self.build_graph_string(&root, &mut graph);
             }
 
@@ -23,29 +24,30 @@ mod test {
             let key = bytes_to_string(node.key());
             let node_label = format!("{}(({}))", node.hash(), key);
 
-            graph.push_str(&format!(" {};\n", node_label));
-
+            // graph.push_str(&format!("## START node {}\n", node_label));
             if let Some(child) = self.get_node(node.left()) {
                 let key = bytes_to_string(child.key());
                 let child_label = format!("{}(({}))", child.hash(), key);
 
-                graph.push_str(&format!(" {} --> {};\n", node_label, child_label));
+                graph.push_str(&format!(" {} --l--> {};\n", node_label, child_label));
 
                 self.build_graph_string(&child, graph);
             } else {
                 graph.push_str(&format!(" {} -.-> {}l((l));\n", node_label, node.hash()));
                 graph.push_str(&format!(" class {}l null;\n", node.hash()));
             }
+            // graph.push_str(&format!("## done left at node {}\n", node_label));
 
             if let Some(child) = self.get_node(node.right()) {
                 let key = bytes_to_string(child.key());
                 let child_label = format!("{}(({}))", child.hash(), key);
 
-                graph.push_str(&format!(" {} --> {};\n", node_label, child_label));
+                graph.push_str(&format!(" {} --r--> {};\n", node_label, child_label));
 
                 self.build_graph_string(&child, graph);
             } else {
                 graph.push_str(&format!(" {} -.-> {}r((r));\n", node_label, node.hash()));
                 graph.push_str(&format!(" class {}r null;\n", node.hash()));
             }
+            // graph.push_str(&format!("## done right at node {}\n", node_label));
         }
     }
diff --git a/mast/src/node.rs b/mast/src/node.rs
index 828624b..bcf2477 100644
--- a/mast/src/node.rs
+++ b/mast/src/node.rs
@@ -1,10 +1,12 @@
 use redb::{Database, ReadableTable, Table, TableDefinition, WriteTransaction};
 
-use crate::{Hash, Hasher, EMPTY_HASH};
+use crate::{Hash, Hasher};
 
 // TODO: Are we creating too many hashers?
 // TODO: are we calculating the rank and hash too often?
 
+const HASH_LEN: usize = 32;
+
 #[derive(Debug, Clone)]
 /// In memory reprsentation of treap node.
 pub(crate) struct Node {
@@ -27,31 +29,6 @@ pub(crate) enum Branch {
 }
 
 impl Node {
-    pub fn from_bytes(bytes: &[u8]) -> Self {
-        let (size, remaining) = varu64::decode(bytes).unwrap();
-        let key = remaining[..size as usize].to_vec().into_boxed_slice();
-
-        let (size, remaining) = varu64::decode(&remaining[size as usize..]).unwrap();
-        let value = remaining[..size as usize].to_vec().into_boxed_slice();
-
-        let left = remaining[size as usize..((size as usize) + 32)]
-            .try_into()
-            .map_or(None, |h| Some(Hash::from_bytes(h)));
-
-        let right = remaining[(size as usize) + 32..((size as usize) + 32 + 32)]
-            .try_into()
-            .map_or(None, |h| Some(Hash::from_bytes(h)));
-
-        Node {
-            key,
-            value,
-            left,
-            right,
-
-            ref_count: 0,
-        }
-    }
-
     pub fn new(key: &[u8], value: &[u8]) -> Self {
         Self {
             key: key.into(),
@@ -62,7 +39,46 @@ impl Node {
             ref_count: 0,
         }
     }
-    // TODO: remember to update its hash.
+
+    pub fn decode(data: (u64, &[u8])) -> Node {
+        let (ref_count, encoded_node) = data;
+
+        let (key, rest) = decode(encoded_node);
+        let (value, rest) = decode(rest);
+
+        let (left, rest) = decode(rest);
+        let left = match left.len() {
+            0 => None,
+            32 => {
+                let bytes: [u8; HASH_LEN] = left.try_into().unwrap();
+                Some(Hash::from_bytes(bytes))
+            }
+            _ => {
+                panic!("invalid hash length!")
+            }
+        };
+
+        let (right, rest) = decode(rest);
+        let right = match right.len() {
+            0 => None,
+            32 => {
+                let bytes: [u8; HASH_LEN] = right.try_into().unwrap();
+                Some(Hash::from_bytes(bytes))
+            }
+            _ => {
+                panic!("invalid hash length!")
+            }
+        };
+
+        Node {
+            key: key.into(),
+            value: value.into(),
+            left,
+            right,
+
+            ref_count,
+        }
+    }
 
     // === Getters ===
 
@@ -116,9 +132,15 @@
             Branch::Right => self.right = new_child,
         }
 
+        self.save(table);
+    }
+
+    pub(crate) fn save(&self, table: &mut Table<&[u8], (u64, &[u8])>) {
         let encoded = self.canonical_encode();
+        let hash = hash(&encoded);
+
         table.insert(
-            hash(&encoded).as_bytes().as_slice(),
+            hash.as_bytes().as_slice(),
             (self.ref_count, encoded.as_slice()),
         );
     }
@@ -130,24 +152,39 @@ impl Node {
 
         encode(&self.key, &mut bytes);
         encode(&self.value, &mut bytes);
-        encode(
-            &self.left.map(|h| h.as_bytes().to_vec()).unwrap_or_default(),
-            &mut bytes,
-        );
-        encode(
-            &self.left.map(|h| h.as_bytes().to_vec()).unwrap_or_default(),
-            &mut bytes,
-        );
+
+        let left = &self.left.map(|h| h.as_bytes().to_vec()).unwrap_or_default();
+        let right = &self
+            .right
+            .map(|h| h.as_bytes().to_vec())
+            .unwrap_or_default();
+
+        encode(left, &mut bytes);
+        encode(right, &mut bytes);
 
         bytes
     }
 }
 
 fn encode(bytes: &[u8], out: &mut Vec<u8>) {
-    varu64::encode(bytes.len() as u64, out);
+    // TODO: find a better way to reserve bytes.
+    let current_len = out.len();
+    for _ in 0..varu64::encoding_length(bytes.len() as u64) {
+        out.push(0)
+    }
+    varu64::encode(bytes.len() as u64, &mut out[current_len..]);
+
     out.extend_from_slice(bytes);
 }
 
+fn decode(bytes: &[u8]) -> (&[u8], &[u8]) {
+    let (len, remaining) = varu64::decode(bytes).unwrap();
+    let value = &remaining[..len as usize];
+    let rest = &remaining[value.len() as usize..];
+
+    (value, rest)
+}
+
 fn hash(bytes: &[u8]) -> Hash {
     let mut hasher = Hasher::new();
     hasher.update(bytes);
@@ -176,9 +213,24 @@ fn update_ref_count(child: Option<Hash>, ref_diff: i8, table: &mut Table<&[u8],
         };
 
         drop(existing);
-        table.insert(
-            hash.as_bytes().as_slice(),
-            (ref_count + ref_diff as u64, bytes.as_slice()),
-        );
+        match ref_count {
+            0 => {
+                // TODO: This doesn't seem to work yet.
+                // I think we should keep doing it recursively.
+                // or wait for the GC to do it?
+                // TODO: Is it the case that we don't clean up the other branch when the tree requires that?
+                // Well that should not happen really, but it is probably caused by the fact that
+                // the order of keys is messed up (not history independent)
+                //
+                // TODO: Confirm (read: test) this, because it is not easy to see in graphs.
+                table.remove(hash.as_bytes().as_slice());
+            }
+            _ => {
+                table.insert(
+                    hash.as_bytes().as_slice(),
+                    (ref_count + ref_diff as u64, bytes.as_slice()),
+                );
+            }
+        }
     }
 }
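The `canonical_encode`/`decode` pair above pins down the node wire format: a varu64 length prefix followed by the key bytes, the same for the value, then the left and right child hashes, with an empty slice standing in for a missing child. A hand-written sketch of that layout for a leaf node follows; the function name is illustrative and not part of the patch, and it assumes varu64's single-byte encoding for lengths below 248.

// Sketch only: what `canonical_encode` is expected to produce for Node::new(b"A", b"B").
fn leaf_wire_format_example() {
    let expected: &[u8] = &[
        1, b'A', // varu64 length prefix + key bytes
        1, b'B', // varu64 length prefix + value bytes
        0,       // varu64(0): no left child hash
        0,       // varu64(0): no right child hash
    ];

    // The row key in the nodes table is the blake3 hash of exactly these bytes,
    // so identical subtrees always collapse into a single entry.
    let node_hash = blake3::hash(expected);
    println!("{} -> {:?}", node_hash, expected);
}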
diff --git a/mast/src/storage/memory.rs b/mast/src/storage/memory.rs
deleted file mode 100644
index 6025e3c..0000000
--- a/mast/src/storage/memory.rs
+++ /dev/null
@@ -1,39 +0,0 @@
-use blake3::Hash;
-use std::collections::HashMap;
-
-use crate::Node;
-
-#[derive(Debug)]
-pub struct MemoryStorage {
-    roots: HashMap<Box<[u8]>, Node>,
-    nodes: HashMap<Hash, Node>,
-    blobs: HashMap<Hash, Box<[u8]>>,
-}
-
-impl MemoryStorage {
-    pub(crate) fn new() -> Self {
-        Self {
-            roots: HashMap::new(),
-            nodes: HashMap::new(),
-            blobs: HashMap::new(),
-        }
-    }
-
-    // TODO: return result or something.
-
-    pub(crate) fn insert_root(&mut self, name: &[u8], node: Node) {
-        self.roots.insert(name.into(), node);
-    }
-
-    pub(crate) fn insert_node(&mut self, node: &Node) {
-        self.nodes.insert(*node.hash(), node.clone());
-    }
-
-    pub(crate) fn insert_blob(&mut self, hash: Hash, blob: &[u8]) {
-        self.blobs.insert(hash, blob.into());
-    }
-
-    pub(crate) fn get_node(&self, hash: &Hash) -> Option<Node> {
-        self.nodes.get(hash).cloned()
-    }
-}
diff --git a/mast/src/storage/mod.rs b/mast/src/storage/mod.rs
deleted file mode 100644
index eb29191..0000000
--- a/mast/src/storage/mod.rs
+++ /dev/null
@@ -1 +0,0 @@
-pub mod memory;
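With MemoryStorage gone, nodes live in a single redb table keyed by their blake3 hash, holding a `(ref_count, encoded_node)` pair, and there is no separate blob store any more. A rough, self-contained sketch of that round trip, using only the redb calls this patch itself relies on; the table name and function are made up for illustration, the real declaration is NODES_TABLE in treap.rs below.

use redb::{Database, ReadableTable, TableDefinition};

// Illustrative table with the same shape as the nodes table: hash -> (ref_count, bytes).
const EXAMPLE_TABLE: TableDefinition<&[u8], (u64, &[u8])> =
    TableDefinition::new("example:nodes");

fn example_roundtrip(db: &Database) {
    let encoded = [1u8, b'A', 1, b'B', 0, 0];
    let hash = blake3::hash(&encoded);

    // Writers create or update the row inside a write transaction.
    let write_txn = db.begin_write().unwrap();
    {
        let mut table = write_txn.open_table(EXAMPLE_TABLE).unwrap();
        table
            .insert(hash.as_bytes().as_slice(), (1u64, encoded.as_slice()))
            .unwrap();
    }
    write_txn.commit().unwrap();

    // Readers open their own transaction and decode the stored bytes on the way out.
    let read_txn = db.begin_read().unwrap();
    let table = read_txn.open_table(EXAMPLE_TABLE).unwrap();
    let row = table.get(hash.as_bytes().as_slice()).unwrap().unwrap();
    assert_eq!(row.value().0, 1);
    assert_eq!(row.value().1, encoded.as_slice());
}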
diff --git a/mast/src/treap.rs b/mast/src/treap.rs
index fe15786..6bb586c 100644
--- a/mast/src/treap.rs
+++ b/mast/src/treap.rs
@@ -1,212 +1,237 @@
 use blake3::{Hash, Hasher};
+use redb::{Database, ReadableTable, Table, TableDefinition};
 
-use crate::node::Branch;
-use crate::storage::memory::MemoryStorage;
-use crate::Node;
+use crate::node::{Branch, Node};
+
+// TODO: remove unused
+// TODO: remove unwrap
 
 #[derive(Debug)]
 pub struct HashTreap<'a> {
-    pub(crate) storage: &'a mut MemoryStorage,
-    pub(crate) root: Option<Hash>,
+    /// Redb database to store the nodes.
+    pub(crate) db: &'a Database,
+    pub(crate) root: Option<Node>,
 }
 
+// Table: Nodes v0
+// Key:   `[u8; 32]`    # Node hash
+// Value: `(u64, [u8])` # (RefCount, EncodedNode)
+const NODES_TABLE: TableDefinition<&[u8], (u64, &[u8])> =
+    TableDefinition::new("kytz:hash_treap:nodes:v0");
+
 impl<'a> HashTreap<'a> {
     // TODO: add name to open from storage with.
-    pub fn new(storage: &'a mut MemoryStorage) -> Self {
-        Self {
-            root: None,
-            storage,
+    pub fn new(db: &'a Database) -> Self {
+        // Setup tables
+
+        let write_tx = db.begin_write().unwrap();
+        {
+            let _table = write_tx.open_table(NODES_TABLE).unwrap();
         }
+        write_tx.commit().unwrap();
+
+        // TODO: Try to open root (using this treap's or tags table).
+        // TODO: should be checking for root on the fly probably!
+
+        Self { root: None, db }
     }
 
     pub fn insert(&mut self, key: &[u8], value: &[u8]) {
         // TODO: validate key and value length.
-        let value = self.insert_blob(value);
 
         let mut node = Node::new(key, value);
 
-        if self.root.is_none() {
-            node.update(self.storage);
-            self.update_root(*node.hash());
-            return;
-        }
+        let write_txn = self.db.begin_write().unwrap();
 
-        // Watch this [video](https://youtu.be/NxRXhBur6Xs?si=GNwaUOfuGwr_tBKI&t=1763) for a good explanation of the unzipping algorithm.
-        // Also see the Iterative insertion algorithm in the page 12 of the [original paper](https://arxiv.org/pdf/1806.06726.pdf).
-        // The difference here is that in a Hash Treap, we need to update nodes bottom up.
+        let _ = 'transaction: {
+            let mut nodes_table = write_txn.open_table(NODES_TABLE).unwrap();
 
-        // Let's say we have the following tree:
-        //
-        //          F
-        //         / \
-        //        D   P
-        //       /   / \
-        //      C   H   X
-        //     /   / \   \
-        //    A   G   M   Y
-        //           /
-        //          I
-        //
-        // First we mark the binary search path to the leaf, going right if the key is greater than
-        // the current node's key and vice versa.
-        //
-        // F
-        //  \
-        //   P
-        //  /
-        // H
-        //  \
-        //   M
-        //  /
-        // I
-        //
+            if self.root.is_none() {
+                // We are done.
+                self.update_root(&node, &mut nodes_table);
 
-        // Path before insertion point. (Node, Branch to update)
-        let mut top_path: Vec<(Node, Branch)> = Vec::new();
-        // Subtree of nodes on the path smaller than the inserted key.
-        let mut left_unzip_path: Vec<Node> = Vec::new();
-        // Subtree of nodes on the path larger than the inserted key.
-        let mut right_unzip_path: Vec<Node> = Vec::new();
+                break 'transaction;
+            }
 
-        let mut next = self.root;
+            // Watch this [video](https://youtu.be/NxRXhBur6Xs?si=GNwaUOfuGwr_tBKI&t=1763) for a good explanation of the unzipping algorithm.
+            // Also see the Iterative insertion algorithm in the page 12 of the [original paper](https://arxiv.org/pdf/1806.06726.pdf).
+            // The difference here is that in a Hash Treap, we need to update nodes bottom up.
 
-        // Top down traversal of the binary search path.
-        while let Some(current) = self.get_node(&next) {
-            let should_zip = node.rank().as_bytes() > current.rank().as_bytes();
+            // Let's say we have the following tree:
+            //
+            //          F
+            //         / \
+            //        D   P
+            //       /   / \
+            //      C   H   X
+            //     /   / \   \
+            //    A   G   M   Y
+            //           /
+            //          I
+            //
+            // First we mark the binary search path to the leaf, going right if the key is greater than
+            // the current node's key and vice versa.
+            //
+            // F
+            //  \
+            //   P
+            //  /
+            // H
+            //  \
+            //   M
+            //  /
+            // I
+            //
 
-            // Traverse left or right.
-            if key < current.key() {
-                next = *current.left();
+            // Path before insertion point. (Node, Branch to update)
+            let mut top_path: Vec<(Node, Branch)> = Vec::new();
+            // Subtree of nodes on the path smaller than the inserted key.
+            let mut left_unzip_path: Vec<Node> = Vec::new();
+            // Subtree of nodes on the path larger than the inserted key.
+            let mut right_unzip_path: Vec<Node> = Vec::new();
 
-                if should_zip {
-                    left_unzip_path.push(current)
+            let mut next = self.root.clone().map(|n| n.hash());
+
+            // Top down traversal of the binary search path.
+            while let Some(current) = self.get_node(&next) {
+                let should_zip = node.rank().as_bytes() > current.rank().as_bytes();
+
+                // Traverse left or right.
+                if key < current.key() {
+                    next = *current.left();
+
+                    if should_zip {
+                        right_unzip_path.push(current)
+                    } else {
+                        top_path.push((current, Branch::Left));
+                    }
                 } else {
-                    top_path.push((current, Branch::Left));
-                }
-            } else {
-                next = *current.right();
+                    next = *current.right();
 
-                if should_zip {
-                    right_unzip_path.push(current)
-                } else {
-                    top_path.push((current, Branch::Right));
-                }
-            };
-        }
+                    if should_zip {
+                        left_unzip_path.push(current)
+                    } else {
+                        top_path.push((current, Branch::Right));
+                    }
+                };
+            }
 
-        // === Updating hashes bottom up ===
+            // === Updating hashes bottom up ===
 
-        // We are at the unzipping part of the path.
-        //
-        // First do the unzipping bottom up.
- // - // H - // \ - // M < current_right - // / - // I < current_left - // - // Into (hopefully you can see the "unzipping"): - // - // left right - // subtree subtree - // - // H | - // \ | - // I | M + // We are at the unzipping part of the path. + // + // First do the unzipping bottom up. + // + // H + // \ + // M < current_right + // / + // I < current_left + // + // Into (hopefully you can see the "unzipping"): + // + // left right + // subtree subtree + // + // H | + // \ | + // I | M - while left_unzip_path.len() > 1 { - let child = left_unzip_path.pop().unwrap(); - let mut parent = left_unzip_path.last_mut().unwrap(); + while left_unzip_path.len() > 1 { + let child = left_unzip_path.pop().unwrap(); + let mut parent = left_unzip_path.last_mut().unwrap(); - parent.set_child(&Branch::Right, Some(*child.hash())); - parent.update(self.storage); - } + parent.set_child(&Branch::Right, Some(child.hash()), &mut nodes_table); + } - while right_unzip_path.len() > 1 { - let child = right_unzip_path.pop().unwrap(); - let mut parent = right_unzip_path.last_mut().unwrap(); + while right_unzip_path.len() > 1 { + let child = right_unzip_path.pop().unwrap(); + let mut parent = right_unzip_path.last_mut().unwrap(); - parent.set_child(&Branch::Left, Some(*child.hash())); - parent.update(self.storage); - } + parent.set_child(&Branch::Left, Some(child.hash()), &mut nodes_table); + } - // Done unzipping, join the current_left and current_right to J and update hashes upwards. - // - // J < Insertion point. - // / \ - // H M - // \ - // I + // Done unzipping, join the current_left and current_right to J and update hashes upwards. + // + // J < Insertion point. + // / \ + // H M + // \ + // I - node.set_child(&Branch::Left, left_unzip_path.first().map(|n| *n.hash())); - node.set_child(&Branch::Right, right_unzip_path.first().map(|n| *n.hash())); - // No more updates lower than the new node, save it to storage. - node.update(self.storage); + node.set_child( + &Branch::Left, + left_unzip_path.first().map(|n| n.hash()), + &mut nodes_table, + ); + node.set_child( + &Branch::Right, + right_unzip_path.first().map(|n| n.hash()), + &mut nodes_table, + ); - // Update the rest of the path upwards with the new hashes. - // So the final tree should look like: - // - // F - // / \ - // D P - // / / \ - // C J X - // / / \ \ - // A H M Y - // / \ - // G I + // Update the rest of the path upwards with the new hashes. + // So the final tree should look like: + // + // F + // / \ + // D P + // / / \ + // C J X + // / / \ \ + // A H M Y + // / \ + // G I - if top_path.is_empty() { - // The insertion point is at the root and we are done. - self.update_root(*node.hash()) - } + if top_path.is_empty() { + // The insertion point is at the root and we are done. + self.update_root(&node, &mut nodes_table) + } - let mut previous = node; + let mut previous = node; - while let Some((mut parent, branch)) = top_path.pop() { - parent.set_child(&branch, Some(*previous.hash())); - parent.update(self.storage); + while let Some((mut parent, branch)) = top_path.pop() { + parent.set_child(&branch, Some(previous.hash()), &mut nodes_table); - previous = parent; - } + previous = parent; + } - // Update the root pointer. - self.update_root(*previous.hash()) + // Update the root pointer. + self.update_root(&previous, &mut nodes_table) + }; // Finally we should commit the changes to the storage. 
-        // TODO: commit
+        write_txn.commit().unwrap();
     }
 
     // === Private Methods ===
 
-    fn update_root(&mut self, hash: Hash) {
+    fn update_root(&mut self, node: &Node, table: &mut Table<&[u8], (u64, &[u8])>) {
+        node.save(table);
+
         // The tree is empty, the incoming node has to be the root, and we are done.
-        self.root = Some(hash);
+        self.root = Some(node.clone());
 
         // TODO: we need to persist the root change too to the storage.
     }
 
-    // TODO: Add stream input API.
-    fn insert_blob(&mut self, blob: &[u8]) -> Hash {
-        let mut hasher = Hasher::new();
-        hasher.update(blob);
-        let hash = hasher.finalize();
-
-        self.storage.insert_blob(hash, blob);
-
-        hash
-    }
-
     pub(crate) fn get_node(&self, hash: &Option<Hash>) -> Option<Node> {
-        hash.and_then(|h| self.storage.get_node(&h))
+        let read_txn = self.db.begin_read().unwrap();
+        let table = read_txn.open_table(NODES_TABLE).unwrap();
+
+        hash.and_then(|h| {
+            table
+                .get(h.as_bytes().as_slice())
+                .unwrap()
+                .map(|existing| Node::decode(existing.value()))
+        })
     }
 
     // === Test Methods ===
 
     #[cfg(test)]
     fn verify_ranks(&self) -> bool {
-        let node = self.get_node(&self.root);
+        let node = self.get_node(&self.root.clone().map(|n| n.hash()));
 
         self.check_rank(node)
     }
@@ -231,19 +256,25 @@ impl<'a> HashTreap<'a> {
 #[cfg(test)]
 mod test {
     use super::HashTreap;
-    use super::MemoryStorage;
     use super::Node;
 
+    use redb::{Database, Error, ReadableTable, TableDefinition};
+
     #[test]
     fn basic() {
-        let mut storage = MemoryStorage::new();
-        let mut treap = HashTreap::new(&mut storage);
+        // Create an in-memory database
+        let file = tempfile::NamedTempFile::new().unwrap();
+        let db = Database::create(file.path()).unwrap();
+
+        let mut treap = HashTreap::new(&db);
 
         let mut keys = ["A", "C", "D", "F", "G", "H", "M", "P", "X", "Y"];
-        let mut keys = [
-            "D", "N", "P", "X", "F", "Z", "Y", "A", "G", "C", "M", "H", "I", "J",
-        ];
         let mut keys = ["A", "B", "C"];
+        // let mut keys = ["A", "B"];
+        // let mut keys = ["A"];
 
         // keys.reverse();
         // keys.reverse();
 
         // Overflowing stack! damn recursion.
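The point of re-zipping the search path by rank is that the finished tree, and therefore the root hash, should depend only on the set of keys, not on the order they were inserted in; that is the history independence the TODO in node.rs is chasing. A hypothetical regression test along those lines, written against the API exercised in this patch; the test name and the root-hash comparison are assumptions rather than code from the repo, and it would have to sit inside this crate's test module to reach the pub(crate) `root` field.

#[test]
fn root_hash_is_history_independent() {
    // Hypothetical check, not part of the patch: two treaps fed the same keys in
    // different orders should converge on the same root hash.
    let file_a = tempfile::NamedTempFile::new().unwrap();
    let file_b = tempfile::NamedTempFile::new().unwrap();
    let db_a = Database::create(file_a.path()).unwrap();
    let db_b = Database::create(file_b.path()).unwrap();

    let mut treap_a = HashTreap::new(&db_a);
    let mut treap_b = HashTreap::new(&db_b);

    let keys = ["A", "B", "C", "D"];
    for key in keys {
        treap_a.insert(key.as_bytes(), b"value");
    }
    for key in keys.iter().rev() {
        treap_b.insert(key.as_bytes(), b"value");
    }

    // Compare the trees by the hash of their root nodes.
    let root_a = treap_a.root.clone().map(|n| n.hash());
    let root_b = treap_b.root.clone().map(|n| n.hash());
    assert_eq!(root_a, root_b);
}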