From 82a5091c14cbf9aa45b5e30e49f3b1f61c05374c Mon Sep 17 00:00:00 2001 From: nazeh Date: Fri, 29 Dec 2023 23:45:29 +0300 Subject: [PATCH] feat: change node encoding --- mast/src/node.rs | 163 +++++++++++++++++++++++++--------- mast/src/operations/insert.rs | 14 +-- mast/src/operations/remove.rs | 2 +- mast/src/treap.rs | 1 + 4 files changed, 128 insertions(+), 52 deletions(-) diff --git a/mast/src/node.rs b/mast/src/node.rs index 1d71ff1..c11089a 100644 --- a/mast/src/node.rs +++ b/mast/src/node.rs @@ -4,9 +4,6 @@ use redb::{ReadableTable, Table}; use crate::{Hash, Hasher, HASH_LEN}; -// TODO: room for improvement (pending actual benchmarks to justify): -// - cache encoding - // TODO: remove unwrap // TODO: KeyType and ValueType @@ -182,20 +179,57 @@ impl Node { self } + /// Encodes the node in a canonical way: + /// - 1 byte header + /// - 0b1100_0000: Two reserved bits + /// - 0b0011_0000: Two bits represents the size of the key length (0, u8, u16, u32) + /// - 0b0000_1100: Two bits represents the size of the value length (0, u8, u16, u32) + /// - 0b0000_0010: left child is present + /// - 0b0000_0001: right child is present + /// - key + /// - value fn canonical_encode(&self) -> Vec { - let mut bytes = vec![]; + let key_length = self.key.len(); + let val_length = self.value.len(); - encode(&self.key, &mut bytes); - encode(&self.value, &mut bytes); + let key_length_encoding_length = len_encoding_length(key_length); + let val_length_encoding_length = len_encoding_length(val_length); - let left = &self.left.map(|h| h.as_bytes().to_vec()).unwrap_or_default(); - let right = &self - .right - .map(|h| h.as_bytes().to_vec()) - .unwrap_or_default(); + let header = 0_u8 + | (key_length_encoding_length << 4) + | (val_length_encoding_length << 2) + | ((self.left.is_some() as u8) << 1) + | (self.right.is_some() as u8); - encode(left, &mut bytes); - encode(right, &mut bytes); + let mut bytes = vec![header]; + + // Encode key length + match key_length_encoding_length { + 1 => 
bytes.push(key_length as u8), + 2 => bytes.extend_from_slice(&(key_length as u16).to_be_bytes()), + 3 => bytes.extend_from_slice(&(key_length as u32).to_be_bytes()), + _ => {} // Do nothing for 0 length + } + + // Encode value length + match val_length_encoding_length { + 1 => bytes.push(val_length as u8), + 2 => bytes.extend_from_slice(&(val_length as u16).to_be_bytes()), + 3 => bytes.extend_from_slice(&(val_length as u32).to_be_bytes()), + _ => {} // Do nothing for 0 length + } + + bytes.extend_from_slice(&self.key); + bytes.extend_from_slice(&self.value); + + if let Some(left) = &self.left { + bytes[0] |= 0b0000_0010; + bytes.extend_from_slice(left.as_bytes()); + } + if let Some(right) = &self.right { + bytes[0] |= 0b0000_0001; + bytes.extend_from_slice(right.as_bytes()); + } bytes } @@ -208,18 +242,7 @@ fn hash(bytes: &[u8]) -> Hash { hasher.finalize() } -fn encode(bytes: &[u8], out: &mut Vec) { - // TODO: find a better way to reserve bytes. - let current_len = out.len(); - for _ in 0..varu64::encoding_length(bytes.len() as u64) { - out.push(0) - } - varu64::encode(bytes.len() as u64, &mut out[current_len..]); - - out.extend_from_slice(bytes); -} - -fn decode(bytes: &[u8]) -> (&[u8], &[u8]) { +fn varu64_decode(bytes: &[u8]) -> (&[u8], &[u8]) { let (len, remaining) = varu64::decode(bytes).unwrap(); let value = &remaining[..len as usize]; let rest = &remaining[value.len()..]; @@ -230,30 +253,70 @@ fn decode(bytes: &[u8]) -> (&[u8], &[u8]) { fn decode_node(data: (u64, &[u8])) -> Node { let (ref_count, encoded_node) = data; - let (key, rest) = decode(encoded_node); - let (value, rest) = decode(rest); + // We can calculate the size of the node from the first few bytes. 
+ let header = encoded_node[0]; - let (left, rest) = decode(rest); - let left = match left.len() { - 0 => None, - 32 => { - let bytes: [u8; HASH_LEN] = left.try_into().unwrap(); - Some(Hash::from_bytes(bytes)) + let mut rest = &encoded_node[1..]; + + let key_length = match (header & 0b0011_0000) >> 4 { + 1 => { + let len = rest[0] as usize; + rest = &rest[1..]; + len } - _ => { - panic!("invalid hash length!") + 2 => { + let len = u16::from_be_bytes(rest[0..3].try_into().unwrap()) as usize; + rest = &rest[3..]; + len + } + 3 => { + let len = u32::from_be_bytes(rest[0..4].try_into().unwrap()) as usize; + rest = &rest[4..]; + len + } + _ => 0, + }; + + let val_length = match (header & 0b0000_1100) >> 2 { + 1 => { + let len = rest[0] as usize; + rest = &rest[1..]; + len + } + 2 => { + let len = u16::from_be_bytes(rest[0..3].try_into().unwrap()) as usize; + rest = &rest[3..]; + len + } + 3 => { + let len = u32::from_be_bytes(rest[0..4].try_into().unwrap()) as usize; + rest = &rest[4..]; + len + } + _ => 0, + }; + + let key = &rest[..key_length]; + rest = &rest[key_length..]; + + let value = &rest[..val_length]; + rest = &rest[val_length..]; + + let left = match header & 0b0000_0010 == 0 { + true => None, + false => { + let hash_bytes: [u8; HASH_LEN] = rest[0..32].try_into().unwrap(); + rest = &rest[32..]; + + Some(Hash::from_bytes(hash_bytes)) } }; - let (right, _) = decode(rest); - let right = match right.len() { - 0 => None, - 32 => { - let bytes: [u8; HASH_LEN] = right.try_into().unwrap(); - Some(Hash::from_bytes(bytes)) - } - _ => { - panic!("invalid hash length!") + let right = match header & 0b0000_0001 == 0 { + true => None, + false => { + let hash_bytes: [u8; HASH_LEN] = rest[0..32].try_into().unwrap(); + Some(Hash::from_bytes(hash_bytes)) } }; @@ -269,3 +332,15 @@ fn decode_node(data: (u64, &[u8])) -> Node { hash: None, } } + +fn len_encoding_length(len: usize) -> u8 { + if len == 0 { + 0 + } else if len <= u8::max_value() as usize { + 1 + } else if len <= 
u16::max_value() as usize { + 2 + } else { + 3 + } +} diff --git a/mast/src/operations/insert.rs b/mast/src/operations/insert.rs index 3489327..c1ad523 100644 --- a/mast/src/operations/insert.rs +++ b/mast/src/operations/insert.rs @@ -220,7 +220,7 @@ mod test { test_operations( &case.map(|key| Entry::insert(key.as_bytes(), &[b"v", key.as_bytes()].concat())), - Some("78fd7507ef338f1a5816ffd702394999680a9694a85f4b8af77795d9fdd5854d"), + Some("9fbdb0a2023f8029871b44722b2091a45b8209eaa5ce912740959fc00c611b91"), ) } @@ -233,7 +233,7 @@ mod test { test_operations( &case.map(|key| Entry::insert(key.as_bytes(), &[b"v", key.as_bytes()].concat())), - Some("02af3de6ed6368c5abc16f231a17d1140e7bfec483c8d0aa63af4ef744d29bc3"), + Some("26820b21fec1451a2478808bb8bc3ade05dcfbcd50d9556cca77d12d6239f4a7"), ); } @@ -247,7 +247,7 @@ mod test { test_operations( &case.map(|key| Entry::insert(key.as_bytes(), &[b"v", key.as_bytes()].concat())), - Some("02af3de6ed6368c5abc16f231a17d1140e7bfec483c8d0aa63af4ef744d29bc3"), + Some("26820b21fec1451a2478808bb8bc3ade05dcfbcd50d9556cca77d12d6239f4a7"), ) } @@ -257,7 +257,7 @@ mod test { test_operations( &case.map(|key| Entry::insert(key.as_bytes(), &[b"v", key.as_bytes()].concat())), - Some("0957cc9b87c11cef6d88a95328cfd9043a3d6a99e9ba35ee5c9c47e53fb6d42b"), + Some("96c3cff677fb331fe2901a6b5297395f089a38af9ab4ad310d362f557d60fca5"), ) } @@ -272,7 +272,7 @@ mod test { i += 1; Entry::insert(key.as_bytes(), i.to_string().as_bytes()) }), - Some("4538b4de5e58f9be9d54541e69fab8c94c31553a1dec579227ef9b572d1c1dff"), + Some("69e8b408d10174feb9d9befd0a3de95767cc0e342d0dba5f51139f4b49588fb7"), ) } @@ -288,7 +288,7 @@ mod test { i += 1; Entry::insert(key.as_bytes(), i.to_string().as_bytes()) }), - Some("c9f7aaefb18ec8569322b9621fc64f430a7389a790e0bf69ec0ad02879d6ce54"), + Some("9e73a80068adf0fb31382eb35d489aa9b50f91a3ad8e55523d5cca6d6247b15b"), ) } @@ -304,7 +304,7 @@ mod test { i += 1; Entry::insert(key.as_bytes(), i.to_string().as_bytes()) }), - 
Some("02e26311f2b55bf6d4a7163399f99e17c975891a05af2f1e09bc969f8bf0f95d"), + Some("8c3cb6bb83df437b73183692e4b1b3809afd6974aec49d67b1ce3266e909cb67"), ) } } diff --git a/mast/src/operations/remove.rs b/mast/src/operations/remove.rs index 25bfe52..c8b07e3 100644 --- a/mast/src/operations/remove.rs +++ b/mast/src/operations/remove.rs @@ -225,7 +225,7 @@ mod test { test_operations( &case, - Some("02af3de6ed6368c5abc16f231a17d1140e7bfec483c8d0aa63af4ef744d29bc3"), + Some("26820b21fec1451a2478808bb8bc3ade05dcfbcd50d9556cca77d12d6239f4a7"), ); } } diff --git a/mast/src/treap.rs b/mast/src/treap.rs index 626ba41..6d8acdf 100644 --- a/mast/src/treap.rs +++ b/mast/src/treap.rs @@ -57,6 +57,7 @@ impl<'treap> HashTreap<'treap> { pub fn insert(&mut self, key: &[u8], value: &[u8]) { // TODO: validate key and value length. + // key and value must be less than 2^32 bytes. let write_txn = self.db.begin_write().unwrap();