wip: fix zip path

This commit is contained in:
nazeh
2023-12-19 19:46:24 +03:00
parent c88c085dec
commit eac90cc9fe
6 changed files with 289 additions and 255 deletions

View File

@@ -2,17 +2,6 @@
mod mermaid;
mod node;
mod storage;
pub mod treap;
pub(crate) use blake3::{Hash, Hasher};
pub(crate) use node::Node;
pub(crate) use treap::HashTreap;
// TODO: If we are going to use Iroh Bytes, might as well use this from Iroh basics.
/// The hash for the empty byte range (`b""`).
pub(crate) const EMPTY_HASH: Hash = Hash::from_bytes([
175, 19, 73, 185, 245, 249, 161, 166, 160, 64, 77, 234, 54, 220, 201, 73, 155, 203, 37, 201,
173, 193, 18, 183, 204, 154, 147, 202, 228, 31, 50, 98,
]);

View File

@@ -1,6 +1,7 @@
#[cfg(test)]
mod test {
use crate::{HashTreap, Node};
use crate::node::Node;
use crate::treap::HashTreap;
impl<'a> HashTreap<'a> {
pub fn as_mermaid_graph(&self) -> String {
@@ -8,7 +9,7 @@ mod test {
graph.push_str("graph TD;\n");
if let Some(root) = self.get_node(&self.root) {
if let Some(root) = self.root.clone() {
self.build_graph_string(&root, &mut graph);
}
@@ -23,29 +24,30 @@ mod test {
let key = bytes_to_string(node.key());
let node_label = format!("{}(({}))", node.hash(), key);
graph.push_str(&format!(" {};\n", node_label));
// graph.push_str(&format!("## START node {}\n", node_label));
if let Some(child) = self.get_node(node.left()) {
let key = bytes_to_string(child.key());
let child_label = format!("{}(({}))", child.hash(), key);
graph.push_str(&format!(" {} --> {};\n", node_label, child_label));
graph.push_str(&format!(" {} --l--> {};\n", node_label, child_label));
self.build_graph_string(&child, graph);
} else {
graph.push_str(&format!(" {} -.-> {}l((l));\n", node_label, node.hash()));
graph.push_str(&format!(" class {}l null;\n", node.hash()));
}
// graph.push_str(&format!("## done left at node {}\n", node_label));
if let Some(child) = self.get_node(node.right()) {
let key = bytes_to_string(child.key());
let child_label = format!("{}(({}))", child.hash(), key);
graph.push_str(&format!(" {} --> {};\n", node_label, child_label));
graph.push_str(&format!(" {} --r--> {};\n", node_label, child_label));
self.build_graph_string(&child, graph);
} else {
graph.push_str(&format!(" {} -.-> {}r((r));\n", node_label, node.hash()));
graph.push_str(&format!(" class {}r null;\n", node.hash()));
}
// graph.push_str(&format!("## done right at node {}\n", node_label));
}
}

View File

@@ -1,10 +1,12 @@
use redb::{Database, ReadableTable, Table, TableDefinition, WriteTransaction};
use crate::{Hash, Hasher, EMPTY_HASH};
use crate::{Hash, Hasher};
// TODO: Are we creating too many hashers?
// TODO: are we calculating the rank and hash too often?
const HASH_LEN: usize = 32;
#[derive(Debug, Clone)]
/// In-memory representation of a treap node.
pub(crate) struct Node {
@@ -27,31 +29,6 @@ pub(crate) enum Branch {
}
impl Node {
pub fn from_bytes(bytes: &[u8]) -> Self {
let (size, remaining) = varu64::decode(bytes).unwrap();
let key = remaining[..size as usize].to_vec().into_boxed_slice();
let (size, remaining) = varu64::decode(&remaining[size as usize..]).unwrap();
let value = remaining[..size as usize].to_vec().into_boxed_slice();
let left = remaining[size as usize..((size as usize) + 32)]
.try_into()
.map_or(None, |h| Some(Hash::from_bytes(h)));
let right = remaining[(size as usize) + 32..((size as usize) + 32 + 32)]
.try_into()
.map_or(None, |h| Some(Hash::from_bytes(h)));
Node {
key,
value,
left,
right,
ref_count: 0,
}
}
pub fn new(key: &[u8], value: &[u8]) -> Self {
Self {
key: key.into(),
@@ -62,7 +39,46 @@ impl Node {
ref_count: 0,
}
}
// TODO: remember to update its hash.
pub fn decode(data: (u64, &[u8])) -> Node {
let (ref_count, encoded_node) = data;
let (key, rest) = decode(encoded_node);
let (value, rest) = decode(rest);
let (left, rest) = decode(rest);
let left = match left.len() {
0 => None,
32 => {
let bytes: [u8; HASH_LEN] = left.try_into().unwrap();
Some(Hash::from_bytes(bytes))
}
_ => {
panic!("invalid hash length!")
}
};
let (right, rest) = decode(rest);
let right = match right.len() {
0 => None,
32 => {
let bytes: [u8; HASH_LEN] = right.try_into().unwrap();
Some(Hash::from_bytes(bytes))
}
_ => {
panic!("invalid hash length!")
}
};
Node {
key: key.into(),
value: value.into(),
left,
right,
ref_count,
}
}
// === Getters ===
@@ -116,9 +132,15 @@ impl Node {
Branch::Right => self.right = new_child,
}
self.save(table);
}
pub(crate) fn save(&self, table: &mut Table<&[u8], (u64, &[u8])>) {
let encoded = self.canonical_encode();
let hash = hash(&encoded);
table.insert(
hash(&encoded).as_bytes().as_slice(),
hash.as_bytes().as_slice(),
(self.ref_count, encoded.as_slice()),
);
}
@@ -130,24 +152,39 @@ impl Node {
encode(&self.key, &mut bytes);
encode(&self.value, &mut bytes);
encode(
&self.left.map(|h| h.as_bytes().to_vec()).unwrap_or_default(),
&mut bytes,
);
encode(
&self.left.map(|h| h.as_bytes().to_vec()).unwrap_or_default(),
&mut bytes,
);
let left = &self.left.map(|h| h.as_bytes().to_vec()).unwrap_or_default();
let right = &self
.right
.map(|h| h.as_bytes().to_vec())
.unwrap_or_default();
encode(left, &mut bytes);
encode(right, &mut bytes);
bytes
}
}
fn encode(bytes: &[u8], out: &mut Vec<u8>) {
varu64::encode(bytes.len() as u64, out);
// TODO: find a better way to reserve bytes.
let current_len = out.len();
for _ in 0..varu64::encoding_length(bytes.len() as u64) {
out.push(0)
}
varu64::encode(bytes.len() as u64, &mut out[current_len..]);
out.extend_from_slice(bytes);
}
fn decode(bytes: &[u8]) -> (&[u8], &[u8]) {
let (len, remaining) = varu64::decode(bytes).unwrap();
let value = &remaining[..len as usize];
let rest = &remaining[value.len() as usize..];
(value, rest)
}
fn hash(bytes: &[u8]) -> Hash {
let mut hasher = Hasher::new();
hasher.update(bytes);
@@ -176,9 +213,24 @@ fn update_ref_count(child: Option<Hash>, ref_diff: i8, table: &mut Table<&[u8],
};
drop(existing);
table.insert(
hash.as_bytes().as_slice(),
(ref_count + ref_diff as u64, bytes.as_slice()),
);
match ref_count {
0 => {
// TODO: This doesn't seem to work yet.
// I think we should keep doing it recursively.
// or wait for the GC to do it?
// TODO: Is it the case that we don't clean up the other branch when the tree requires that?
// Well that should not happen really, but it is probably caused by the fact that
// the order of keys is messed up (not history independent)
//
// TODO: Confirm (read: test) this, because it is not easy to see in graphs.
table.remove(hash.as_bytes().as_slice());
}
_ => {
table.insert(
hash.as_bytes().as_slice(),
(ref_count + ref_diff as u64, bytes.as_slice()),
);
}
}
}
}

View File

@@ -1,39 +0,0 @@
use blake3::Hash;
use std::collections::HashMap;
use crate::Node;
#[derive(Debug)]
pub struct MemoryStorage {
roots: HashMap<Box<[u8]>, Node>,
nodes: HashMap<Hash, Node>,
blobs: HashMap<Hash, Box<[u8]>>,
}
impl MemoryStorage {
pub(crate) fn new() -> Self {
Self {
roots: HashMap::new(),
nodes: HashMap::new(),
blobs: HashMap::new(),
}
}
// TODO: return result or something.
pub(crate) fn insert_root(&mut self, name: &[u8], node: Node) {
self.roots.insert(name.into(), node);
}
pub(crate) fn insert_node(&mut self, node: &Node) {
self.nodes.insert(*node.hash(), node.clone());
}
pub(crate) fn insert_blob(&mut self, hash: Hash, blob: &[u8]) {
self.blobs.insert(hash, blob.into());
}
pub(crate) fn get_node(&self, hash: &Hash) -> Option<Node> {
self.nodes.get(hash).cloned()
}
}

View File

@@ -1 +0,0 @@
pub mod memory;

View File

@@ -1,212 +1,237 @@
use blake3::{Hash, Hasher};
use redb::{Database, ReadableTable, Table, TableDefinition};
use crate::node::Branch;
use crate::storage::memory::MemoryStorage;
use crate::Node;
use crate::node::{Branch, Node};
// TODO: remove unused
// TODO: remove unwrap
#[derive(Debug)]
pub struct HashTreap<'a> {
pub(crate) storage: &'a mut MemoryStorage,
pub(crate) root: Option<Hash>,
/// Redb database to store the nodes.
pub(crate) db: &'a Database,
pub(crate) root: Option<Node>,
}
// Table: Nodes v0
// Key: `[u8; 32]` # Node hash
// Value: `(u64, [u8])` # (RefCount, EncodedNode)
const NODES_TABLE: TableDefinition<&[u8], (u64, &[u8])> =
TableDefinition::new("kytz:hash_treap:nodes:v0");
impl<'a> HashTreap<'a> {
// TODO: add name to open from storage with.
pub fn new(storage: &'a mut MemoryStorage) -> Self {
Self {
root: None,
storage,
pub fn new(db: &'a Database) -> Self {
// Setup tables
let write_tx = db.begin_write().unwrap();
{
let _table = write_tx.open_table(NODES_TABLE).unwrap();
}
write_tx.commit().unwrap();
// TODO: Try to open root (using this treaps or tags table).
// TODO: should probably be checking for the root on the fly!
Self { root: None, db }
}
pub fn insert(&mut self, key: &[u8], value: &[u8]) {
// TODO: validate key and value length.
let value = self.insert_blob(value);
let mut node = Node::new(key, value);
if self.root.is_none() {
node.update(self.storage);
self.update_root(*node.hash());
return;
}
let write_txn = self.db.begin_write().unwrap();
// Watch this [video](https://youtu.be/NxRXhBur6Xs?si=GNwaUOfuGwr_tBKI&t=1763) for a good explanation of the unzipping algorithm.
// Also see the Iterative insertion algorithm in the page 12 of the [original paper](https://arxiv.org/pdf/1806.06726.pdf).
// The difference here is that in a Hash Treap, we need to update nodes bottom up.
let _ = 'transaction: {
let mut nodes_table = write_txn.open_table(NODES_TABLE).unwrap();
// Let's say we have the following tree:
//
// F
// / \
// D P
// / / \
// C H X
// / / \ \
// A G M Y
// /
// I
//
// First we mark the binary search path to the leaf, going right if the key is greater than
// the current node's key and vice versa.
//
// F
// \
// P
// /
// H
// \
// M
// /
// I
//
if self.root.is_none() {
// We are done.
self.update_root(&node, &mut nodes_table);
// Path before insertion point. (Node, Branch to update)
let mut top_path: Vec<(Node, Branch)> = Vec::new();
// Subtree of nodes on the path smaller than the inserted key.
let mut left_unzip_path: Vec<Node> = Vec::new();
// Subtree of nodes on the path larger than the inserted key.
let mut right_unzip_path: Vec<Node> = Vec::new();
break 'transaction;
}
let mut next = self.root;
// Watch this [video](https://youtu.be/NxRXhBur6Xs?si=GNwaUOfuGwr_tBKI&t=1763) for a good explanation of the unzipping algorithm.
// Also see the Iterative insertion algorithm in the page 12 of the [original paper](https://arxiv.org/pdf/1806.06726.pdf).
// The difference here is that in a Hash Treap, we need to update nodes bottom up.
// Top down traversal of the binary search path.
while let Some(current) = self.get_node(&next) {
let should_zip = node.rank().as_bytes() > current.rank().as_bytes();
// Let's say we have the following tree:
//
// F
// / \
// D P
// / / \
// C H X
// / / \ \
// A G M Y
// /
// I
//
// First we mark the binary search path to the leaf, going right if the key is greater than
// the current node's key and vice versa.
//
// F
// \
// P
// /
// H
// \
// M
// /
// I
//
// Traverse left or right.
if key < current.key() {
next = *current.left();
// Path before insertion point. (Node, Branch to update)
let mut top_path: Vec<(Node, Branch)> = Vec::new();
// Subtree of nodes on the path smaller than the inserted key.
let mut left_unzip_path: Vec<Node> = Vec::new();
// Subtree of nodes on the path larger than the inserted key.
let mut right_unzip_path: Vec<Node> = Vec::new();
if should_zip {
left_unzip_path.push(current)
let mut next = self.root.clone().map(|n| n.hash());
// Top down traversal of the binary search path.
while let Some(current) = self.get_node(&next) {
let should_zip = node.rank().as_bytes() > current.rank().as_bytes();
// Traverse left or right.
if key < current.key() {
next = *current.left();
if should_zip {
right_unzip_path.push(current)
} else {
top_path.push((current, Branch::Left));
}
} else {
top_path.push((current, Branch::Left));
}
} else {
next = *current.right();
next = *current.right();
if should_zip {
right_unzip_path.push(current)
} else {
top_path.push((current, Branch::Right));
}
};
}
if should_zip {
left_unzip_path.push(current)
} else {
top_path.push((current, Branch::Right));
}
};
}
// === Updating hashes bottom up ===
// === Updating hashes bottom up ===
// We are at the unzipping part of the path.
//
// First do the unzipping bottom up.
//
// H
// \
// M < current_right
// /
// I < current_left
//
// Into (hopefully you can see the "unzipping"):
//
// left right
// subtree subtree
//
// H |
// \ |
// I | M
// We are at the unzipping part of the path.
//
// First do the unzipping bottom up.
//
// H
// \
// M < current_right
// /
// I < current_left
//
// Into (hopefully you can see the "unzipping"):
//
// left right
// subtree subtree
//
// H |
// \ |
// I | M
while left_unzip_path.len() > 1 {
let child = left_unzip_path.pop().unwrap();
let mut parent = left_unzip_path.last_mut().unwrap();
while left_unzip_path.len() > 1 {
let child = left_unzip_path.pop().unwrap();
let mut parent = left_unzip_path.last_mut().unwrap();
parent.set_child(&Branch::Right, Some(*child.hash()));
parent.update(self.storage);
}
parent.set_child(&Branch::Right, Some(child.hash()), &mut nodes_table);
}
while right_unzip_path.len() > 1 {
let child = right_unzip_path.pop().unwrap();
let mut parent = right_unzip_path.last_mut().unwrap();
while right_unzip_path.len() > 1 {
let child = right_unzip_path.pop().unwrap();
let mut parent = right_unzip_path.last_mut().unwrap();
parent.set_child(&Branch::Left, Some(*child.hash()));
parent.update(self.storage);
}
parent.set_child(&Branch::Left, Some(child.hash()), &mut nodes_table);
}
// Done unzipping, join the current_left and current_right to J and update hashes upwards.
//
// J < Insertion point.
// / \
// H M
// \
// I
// Done unzipping, join the current_left and current_right to J and update hashes upwards.
//
// J < Insertion point.
// / \
// H M
// \
// I
node.set_child(&Branch::Left, left_unzip_path.first().map(|n| *n.hash()));
node.set_child(&Branch::Right, right_unzip_path.first().map(|n| *n.hash()));
// No more updates lower than the new node, save it to storage.
node.update(self.storage);
node.set_child(
&Branch::Left,
left_unzip_path.first().map(|n| n.hash()),
&mut nodes_table,
);
node.set_child(
&Branch::Right,
right_unzip_path.first().map(|n| n.hash()),
&mut nodes_table,
);
// Update the rest of the path upwards with the new hashes.
// So the final tree should look like:
//
// F
// / \
// D P
// / / \
// C J X
// / / \ \
// A H M Y
// / \
// G I
// Update the rest of the path upwards with the new hashes.
// So the final tree should look like:
//
// F
// / \
// D P
// / / \
// C J X
// / / \ \
// A H M Y
// / \
// G I
if top_path.is_empty() {
// The insertion point is at the root and we are done.
self.update_root(*node.hash())
}
if top_path.is_empty() {
// The insertion point is at the root and we are done.
self.update_root(&node, &mut nodes_table)
}
let mut previous = node;
let mut previous = node;
while let Some((mut parent, branch)) = top_path.pop() {
parent.set_child(&branch, Some(*previous.hash()));
parent.update(self.storage);
while let Some((mut parent, branch)) = top_path.pop() {
parent.set_child(&branch, Some(previous.hash()), &mut nodes_table);
previous = parent;
}
previous = parent;
}
// Update the root pointer.
self.update_root(*previous.hash())
// Update the root pointer.
self.update_root(&previous, &mut nodes_table)
};
// Finally we should commit the changes to the storage.
// TODO: commit
write_txn.commit().unwrap();
}
// === Private Methods ===
fn update_root(&mut self, hash: Hash) {
fn update_root(&mut self, node: &Node, table: &mut Table<&[u8], (u64, &[u8])>) {
node.save(table);
// The tree is empty, the incoming node has to be the root, and we are done.
self.root = Some(hash);
self.root = Some(node.clone());
// TODO: we need to persist the root change too to the storage.
}
// TODO: Add stream input API.
fn insert_blob(&mut self, blob: &[u8]) -> Hash {
let mut hasher = Hasher::new();
hasher.update(blob);
let hash = hasher.finalize();
self.storage.insert_blob(hash, blob);
hash
}
pub(crate) fn get_node(&self, hash: &Option<Hash>) -> Option<Node> {
hash.and_then(|h| self.storage.get_node(&h))
let read_txn = self.db.begin_read().unwrap();
let table = read_txn.open_table(NODES_TABLE).unwrap();
hash.and_then(|h| {
table
.get(h.as_bytes().as_slice())
.unwrap()
.map(|existing| Node::decode(existing.value()))
})
}
// === Test Methods ===
#[cfg(test)]
fn verify_ranks(&self) -> bool {
let node = self.get_node(&self.root);
let node = self.get_node(&self.root.clone().map(|n| n.hash()));
self.check_rank(node)
}
@@ -231,19 +256,25 @@ impl<'a> HashTreap<'a> {
#[cfg(test)]
mod test {
use super::HashTreap;
use super::MemoryStorage;
use super::Node;
use redb::{Database, Error, ReadableTable, TableDefinition};
#[test]
fn basic() {
let mut storage = MemoryStorage::new();
let mut treap = HashTreap::new(&mut storage);
// Create an in-memory database
let file = tempfile::NamedTempFile::new().unwrap();
let db = Database::create(file.path()).unwrap();
let mut treap = HashTreap::new(&db);
let mut keys = ["A", "C", "D", "F", "G", "H", "M", "P", "X", "Y"];
let mut keys = [
"D", "N", "P", "X", "F", "Z", "Y", "A", "G", "C", "M", "H", "I", "J",
];
// let mut keys = [
// "D", "N", "P", "X", "F", "Z", "Y", "A", "G", "C", "M", "H", "I", "J",
// ];
let mut keys = ["A", "B", "C"];
// let mut keys = ["A", "B"];
// let mut keys = ["A"];
// keys.reverse();
// keys.reverse(); // Overflowing stack! damn recursion.