wip: fix zip path

This commit is contained in:
nazeh
2023-12-19 19:46:24 +03:00
parent c88c085dec
commit eac90cc9fe
6 changed files with 289 additions and 255 deletions

View File

@@ -2,17 +2,6 @@
mod mermaid;
mod node;
mod storage;
pub mod treap;
pub(crate) use blake3::{Hash, Hasher};
pub(crate) use node::Node;
pub(crate) use treap::HashTreap;
// TODO: If we are going to use Iroh Bytes, might as well use this from Iroh basics.
/// The hash for the empty byte range (`b""`).
pub(crate) const EMPTY_HASH: Hash = Hash::from_bytes([
175, 19, 73, 185, 245, 249, 161, 166, 160, 64, 77, 234, 54, 220, 201, 73, 155, 203, 37, 201,
173, 193, 18, 183, 204, 154, 147, 202, 228, 31, 50, 98,
]);

View File

@@ -1,6 +1,7 @@
#[cfg(test)]
mod test {
use crate::{HashTreap, Node};
use crate::node::Node;
use crate::treap::HashTreap;
impl<'a> HashTreap<'a> {
pub fn as_mermaid_graph(&self) -> String {
@@ -8,7 +9,7 @@ mod test {
graph.push_str("graph TD;\n");
if let Some(root) = self.get_node(&self.root) {
if let Some(root) = self.root.clone() {
self.build_graph_string(&root, &mut graph);
}
@@ -23,29 +24,30 @@ mod test {
let key = bytes_to_string(node.key());
let node_label = format!("{}(({}))", node.hash(), key);
graph.push_str(&format!(" {};\n", node_label));
// graph.push_str(&format!("## START node {}\n", node_label));
if let Some(child) = self.get_node(node.left()) {
let key = bytes_to_string(child.key());
let child_label = format!("{}(({}))", child.hash(), key);
graph.push_str(&format!(" {} --> {};\n", node_label, child_label));
graph.push_str(&format!(" {} --l--> {};\n", node_label, child_label));
self.build_graph_string(&child, graph);
} else {
graph.push_str(&format!(" {} -.-> {}l((l));\n", node_label, node.hash()));
graph.push_str(&format!(" class {}l null;\n", node.hash()));
}
// graph.push_str(&format!("## done left at node {}\n", node_label));
if let Some(child) = self.get_node(node.right()) {
let key = bytes_to_string(child.key());
let child_label = format!("{}(({}))", child.hash(), key);
graph.push_str(&format!(" {} --> {};\n", node_label, child_label));
graph.push_str(&format!(" {} --r--> {};\n", node_label, child_label));
self.build_graph_string(&child, graph);
} else {
graph.push_str(&format!(" {} -.-> {}r((r));\n", node_label, node.hash()));
graph.push_str(&format!(" class {}r null;\n", node.hash()));
}
// graph.push_str(&format!("## done right at node {}\n", node_label));
}
}

View File

@@ -1,10 +1,12 @@
use redb::{Database, ReadableTable, Table, TableDefinition, WriteTransaction};
use crate::{Hash, Hasher, EMPTY_HASH};
use crate::{Hash, Hasher};
// TODO: Are we creating too many hashers?
// TODO: are we calculating the rank and hash too often?
const HASH_LEN: usize = 32;
#[derive(Debug, Clone)]
/// In-memory representation of a treap node.
pub(crate) struct Node {
@@ -27,31 +29,6 @@ pub(crate) enum Branch {
}
impl Node {
pub fn from_bytes(bytes: &[u8]) -> Self {
let (size, remaining) = varu64::decode(bytes).unwrap();
let key = remaining[..size as usize].to_vec().into_boxed_slice();
let (size, remaining) = varu64::decode(&remaining[size as usize..]).unwrap();
let value = remaining[..size as usize].to_vec().into_boxed_slice();
let left = remaining[size as usize..((size as usize) + 32)]
.try_into()
.map_or(None, |h| Some(Hash::from_bytes(h)));
let right = remaining[(size as usize) + 32..((size as usize) + 32 + 32)]
.try_into()
.map_or(None, |h| Some(Hash::from_bytes(h)));
Node {
key,
value,
left,
right,
ref_count: 0,
}
}
pub fn new(key: &[u8], value: &[u8]) -> Self {
Self {
key: key.into(),
@@ -62,7 +39,46 @@ impl Node {
ref_count: 0,
}
}
// TODO: remember to update its hash.
pub fn decode(data: (u64, &[u8])) -> Node {
let (ref_count, encoded_node) = data;
let (key, rest) = decode(encoded_node);
let (value, rest) = decode(rest);
let (left, rest) = decode(rest);
let left = match left.len() {
0 => None,
32 => {
let bytes: [u8; HASH_LEN] = left.try_into().unwrap();
Some(Hash::from_bytes(bytes))
}
_ => {
panic!("invalid hash length!")
}
};
let (right, rest) = decode(rest);
let right = match right.len() {
0 => None,
32 => {
let bytes: [u8; HASH_LEN] = right.try_into().unwrap();
Some(Hash::from_bytes(bytes))
}
_ => {
panic!("invalid hash length!")
}
};
Node {
key: key.into(),
value: value.into(),
left,
right,
ref_count,
}
}
// === Getters ===
@@ -116,9 +132,15 @@ impl Node {
Branch::Right => self.right = new_child,
}
self.save(table);
}
pub(crate) fn save(&self, table: &mut Table<&[u8], (u64, &[u8])>) {
let encoded = self.canonical_encode();
let hash = hash(&encoded);
table.insert(
hash(&encoded).as_bytes().as_slice(),
hash.as_bytes().as_slice(),
(self.ref_count, encoded.as_slice()),
);
}
@@ -130,24 +152,39 @@ impl Node {
encode(&self.key, &mut bytes);
encode(&self.value, &mut bytes);
encode(
&self.left.map(|h| h.as_bytes().to_vec()).unwrap_or_default(),
&mut bytes,
);
encode(
&self.left.map(|h| h.as_bytes().to_vec()).unwrap_or_default(),
&mut bytes,
);
let left = &self.left.map(|h| h.as_bytes().to_vec()).unwrap_or_default();
let right = &self
.right
.map(|h| h.as_bytes().to_vec())
.unwrap_or_default();
encode(left, &mut bytes);
encode(right, &mut bytes);
bytes
}
}
fn encode(bytes: &[u8], out: &mut Vec<u8>) {
varu64::encode(bytes.len() as u64, out);
// TODO: find a better way to reserve bytes.
let current_len = out.len();
for _ in 0..varu64::encoding_length(bytes.len() as u64) {
out.push(0)
}
varu64::encode(bytes.len() as u64, &mut out[current_len..]);
out.extend_from_slice(bytes);
}
fn decode(bytes: &[u8]) -> (&[u8], &[u8]) {
let (len, remaining) = varu64::decode(bytes).unwrap();
let value = &remaining[..len as usize];
let rest = &remaining[value.len() as usize..];
(value, rest)
}
fn hash(bytes: &[u8]) -> Hash {
let mut hasher = Hasher::new();
hasher.update(bytes);
@@ -176,9 +213,24 @@ fn update_ref_count(child: Option<Hash>, ref_diff: i8, table: &mut Table<&[u8],
};
drop(existing);
table.insert(
hash.as_bytes().as_slice(),
(ref_count + ref_diff as u64, bytes.as_slice()),
);
match ref_count {
0 => {
// TODO: This doesn't seem to work yet.
// I think we should keep doing it recursively.
// or wait for the GC to do it?
// TODO: Is it the case that we don't clean up the other branch when the tree requires that?
// Well that should not happen really, but it is probably caused by the fact that
// the order of keys is messed up (not history independent)
//
// TODO: Confirm (read: test) this, because it is not easy to see in graphs.
table.remove(hash.as_bytes().as_slice());
}
_ => {
table.insert(
hash.as_bytes().as_slice(),
(ref_count + ref_diff as u64, bytes.as_slice()),
);
}
}
}
}

View File

@@ -1,39 +0,0 @@
use blake3::Hash;
use std::collections::HashMap;
use crate::Node;
#[derive(Debug)]
pub struct MemoryStorage {
roots: HashMap<Box<[u8]>, Node>,
nodes: HashMap<Hash, Node>,
blobs: HashMap<Hash, Box<[u8]>>,
}
impl MemoryStorage {
pub(crate) fn new() -> Self {
Self {
roots: HashMap::new(),
nodes: HashMap::new(),
blobs: HashMap::new(),
}
}
// TODO: return result or something.
pub(crate) fn insert_root(&mut self, name: &[u8], node: Node) {
self.roots.insert(name.into(), node);
}
pub(crate) fn insert_node(&mut self, node: &Node) {
self.nodes.insert(*node.hash(), node.clone());
}
pub(crate) fn insert_blob(&mut self, hash: Hash, blob: &[u8]) {
self.blobs.insert(hash, blob.into());
}
pub(crate) fn get_node(&self, hash: &Hash) -> Option<Node> {
self.nodes.get(hash).cloned()
}
}

View File

@@ -1 +0,0 @@
pub mod memory;

View File

@@ -1,212 +1,237 @@
use blake3::{Hash, Hasher};
use redb::{Database, ReadableTable, Table, TableDefinition};
use crate::node::Branch;
use crate::storage::memory::MemoryStorage;
use crate::Node;
use crate::node::{Branch, Node};
// TODO: remove unused
// TODO: remove unwrap
#[derive(Debug)]
pub struct HashTreap<'a> {
pub(crate) storage: &'a mut MemoryStorage,
pub(crate) root: Option<Hash>,
/// Redb database to store the nodes.
pub(crate) db: &'a Database,
pub(crate) root: Option<Node>,
}
// Table: Nodes v0
// Key: `[u8; 32]` # Node hash
// Value: `(u64, [u8])` # (RefCount, EncodedNode)
const NODES_TABLE: TableDefinition<&[u8], (u64, &[u8])> =
TableDefinition::new("kytz:hash_treap:nodes:v0");
impl<'a> HashTreap<'a> {
// TODO: add name to open from storage with.
pub fn new(storage: &'a mut MemoryStorage) -> Self {
Self {
root: None,
storage,
pub fn new(db: &'a Database) -> Self {
// Setup tables
let write_tx = db.begin_write().unwrap();
{
let _table = write_tx.open_table(NODES_TABLE).unwrap();
}
write_tx.commit().unwrap();
// TODO: Try to open root (using this treaps or tags table).
// TODO: should probably be checking for the root on the fly!
Self { root: None, db }
}
pub fn insert(&mut self, key: &[u8], value: &[u8]) {
// TODO: validate key and value length.
let value = self.insert_blob(value);
let mut node = Node::new(key, value);
if self.root.is_none() {
node.update(self.storage);
self.update_root(*node.hash());
return;
}
let write_txn = self.db.begin_write().unwrap();
// Watch this [video](https://youtu.be/NxRXhBur6Xs?si=GNwaUOfuGwr_tBKI&t=1763) for a good explanation of the unzipping algorithm.
// Also see the Iterative insertion algorithm in the page 12 of the [original paper](https://arxiv.org/pdf/1806.06726.pdf).
// The difference here is that in a Hash Treap, we need to update nodes bottom up.
let _ = 'transaction: {
let mut nodes_table = write_txn.open_table(NODES_TABLE).unwrap();
// Let's say we have the following tree:
//
// F
// / \
// D P
// / / \
// C H X
// / / \ \
// A G M Y
// /
// I
//
// First we mark the binary search path to the leaf, going right if the key is greater than
// the current node's key and vice versa.
//
// F
// \
// P
// /
// H
// \
// M
// /
// I
//
if self.root.is_none() {
// We are done.
self.update_root(&node, &mut nodes_table);
// Path before insertion point. (Node, Branch to update)
let mut top_path: Vec<(Node, Branch)> = Vec::new();
// Subtree of nodes on the path smaller than the inserted key.
let mut left_unzip_path: Vec<Node> = Vec::new();
// Subtree of nodes on the path larger than the inserted key.
let mut right_unzip_path: Vec<Node> = Vec::new();
break 'transaction;
}
let mut next = self.root;
// Watch this [video](https://youtu.be/NxRXhBur6Xs?si=GNwaUOfuGwr_tBKI&t=1763) for a good explanation of the unzipping algorithm.
// Also see the Iterative insertion algorithm in the page 12 of the [original paper](https://arxiv.org/pdf/1806.06726.pdf).
// The difference here is that in a Hash Treap, we need to update nodes bottom up.
// Top down traversal of the binary search path.
while let Some(current) = self.get_node(&next) {
let should_zip = node.rank().as_bytes() > current.rank().as_bytes();
// Let's say we have the following tree:
//
// F
// / \
// D P
// / / \
// C H X
// / / \ \
// A G M Y
// /
// I
//
// First we mark the binary search path to the leaf, going right if the key is greater than
// the current node's key and vice versa.
//
// F
// \
// P
// /
// H
// \
// M
// /
// I
//
// Traverse left or right.
if key < current.key() {
next = *current.left();
// Path before insertion point. (Node, Branch to update)
let mut top_path: Vec<(Node, Branch)> = Vec::new();
// Subtree of nodes on the path smaller than the inserted key.
let mut left_unzip_path: Vec<Node> = Vec::new();
// Subtree of nodes on the path larger than the inserted key.
let mut right_unzip_path: Vec<Node> = Vec::new();
if should_zip {
left_unzip_path.push(current)
let mut next = self.root.clone().map(|n| n.hash());
// Top down traversal of the binary search path.
while let Some(current) = self.get_node(&next) {
let should_zip = node.rank().as_bytes() > current.rank().as_bytes();
// Traverse left or right.
if key < current.key() {
next = *current.left();
if should_zip {
right_unzip_path.push(current)
} else {
top_path.push((current, Branch::Left));
}
} else {
top_path.push((current, Branch::Left));
}
} else {
next = *current.right();
next = *current.right();
if should_zip {
right_unzip_path.push(current)
} else {
top_path.push((current, Branch::Right));
}
};
}
if should_zip {
left_unzip_path.push(current)
} else {
top_path.push((current, Branch::Right));
}
};
}
// === Updating hashes bottom up ===
// === Updating hashes bottom up ===
// We are at the unzipping part of the path.
//
// First do the unzipping bottom up.
//
// H
// \
// M < current_right
// /
// I < current_left
//
// Into (hopefully you can see the "unzipping"):
//
// left right
// subtree subtree
//
// H |
// \ |
// I | M
// We are at the unzipping part of the path.
//
// First do the unzipping bottom up.
//
// H
// \
// M < current_right
// /
// I < current_left
//
// Into (hopefully you can see the "unzipping"):
//
// left right
// subtree subtree
//
// H |
// \ |
// I | M
while left_unzip_path.len() > 1 {
let child = left_unzip_path.pop().unwrap();
let mut parent = left_unzip_path.last_mut().unwrap();
while left_unzip_path.len() > 1 {
let child = left_unzip_path.pop().unwrap();
let mut parent = left_unzip_path.last_mut().unwrap();
parent.set_child(&Branch::Right, Some(*child.hash()));
parent.update(self.storage);
}
parent.set_child(&Branch::Right, Some(child.hash()), &mut nodes_table);
}
while right_unzip_path.len() > 1 {
let child = right_unzip_path.pop().unwrap();
let mut parent = right_unzip_path.last_mut().unwrap();
while right_unzip_path.len() > 1 {
let child = right_unzip_path.pop().unwrap();
let mut parent = right_unzip_path.last_mut().unwrap();
parent.set_child(&Branch::Left, Some(*child.hash()));
parent.update(self.storage);
}
parent.set_child(&Branch::Left, Some(child.hash()), &mut nodes_table);
}
// Done unzipping, join the current_left and current_right to J and update hashes upwards.
//
// J < Insertion point.
// / \
// H M
// \
// I
// Done unzipping, join the current_left and current_right to J and update hashes upwards.
//
// J < Insertion point.
// / \
// H M
// \
// I
node.set_child(&Branch::Left, left_unzip_path.first().map(|n| *n.hash()));
node.set_child(&Branch::Right, right_unzip_path.first().map(|n| *n.hash()));
// No more updates lower than the new node, save it to storage.
node.update(self.storage);
node.set_child(
&Branch::Left,
left_unzip_path.first().map(|n| n.hash()),
&mut nodes_table,
);
node.set_child(
&Branch::Right,
right_unzip_path.first().map(|n| n.hash()),
&mut nodes_table,
);
// Update the rest of the path upwards with the new hashes.
// So the final tree should look like:
//
// F
// / \
// D P
// / / \
// C J X
// / / \ \
// A H M Y
// / \
// G I
// Update the rest of the path upwards with the new hashes.
// So the final tree should look like:
//
// F
// / \
// D P
// / / \
// C J X
// / / \ \
// A H M Y
// / \
// G I
if top_path.is_empty() {
// The insertion point is at the root and we are done.
self.update_root(*node.hash())
}
if top_path.is_empty() {
// The insertion point is at the root and we are done.
self.update_root(&node, &mut nodes_table)
}
let mut previous = node;
let mut previous = node;
while let Some((mut parent, branch)) = top_path.pop() {
parent.set_child(&branch, Some(*previous.hash()));
parent.update(self.storage);
while let Some((mut parent, branch)) = top_path.pop() {
parent.set_child(&branch, Some(previous.hash()), &mut nodes_table);
previous = parent;
}
previous = parent;
}
// Update the root pointer.
self.update_root(*previous.hash())
// Update the root pointer.
self.update_root(&previous, &mut nodes_table)
};
// Finally we should commit the changes to the storage.
// TODO: commit
write_txn.commit().unwrap();
}
// === Private Methods ===
fn update_root(&mut self, hash: Hash) {
fn update_root(&mut self, node: &Node, table: &mut Table<&[u8], (u64, &[u8])>) {
node.save(table);
// The tree is empty, the incoming node has to be the root, and we are done.
self.root = Some(hash);
self.root = Some(node.clone());
// TODO: we need to persist the root change too to the storage.
}
// TODO: Add stream input API.
fn insert_blob(&mut self, blob: &[u8]) -> Hash {
let mut hasher = Hasher::new();
hasher.update(blob);
let hash = hasher.finalize();
self.storage.insert_blob(hash, blob);
hash
}
pub(crate) fn get_node(&self, hash: &Option<Hash>) -> Option<Node> {
hash.and_then(|h| self.storage.get_node(&h))
let read_txn = self.db.begin_read().unwrap();
let table = read_txn.open_table(NODES_TABLE).unwrap();
hash.and_then(|h| {
table
.get(h.as_bytes().as_slice())
.unwrap()
.map(|existing| Node::decode(existing.value()))
})
}
// === Test Methods ===
#[cfg(test)]
fn verify_ranks(&self) -> bool {
let node = self.get_node(&self.root);
let node = self.get_node(&self.root.clone().map(|n| n.hash()));
self.check_rank(node)
}
@@ -231,19 +256,25 @@ impl<'a> HashTreap<'a> {
#[cfg(test)]
mod test {
use super::HashTreap;
use super::MemoryStorage;
use super::Node;
use redb::{Database, Error, ReadableTable, TableDefinition};
#[test]
fn basic() {
let mut storage = MemoryStorage::new();
let mut treap = HashTreap::new(&mut storage);
// Create an in-memory database
let file = tempfile::NamedTempFile::new().unwrap();
let db = Database::create(file.path()).unwrap();
let mut treap = HashTreap::new(&db);
let mut keys = ["A", "C", "D", "F", "G", "H", "M", "P", "X", "Y"];
let mut keys = [
"D", "N", "P", "X", "F", "Z", "Y", "A", "G", "C", "M", "H", "I", "J",
];
// let mut keys = [
// "D", "N", "P", "X", "F", "Z", "Y", "A", "G", "C", "M", "H", "I", "J",
// ];
let mut keys = ["A", "B", "C"];
// let mut keys = ["A", "B"];
// let mut keys = ["A"];
// keys.reverse();
// keys.reverse(); // Overflowing stack! damn recursion.