diff --git a/src/dict/mod.rs b/src/dict/mod.rs index b0d7c8be..2bd0e015 100644 --- a/src/dict/mod.rs +++ b/src/dict/mod.rs @@ -1342,6 +1342,262 @@ pub fn dict_merge( Ok(()) } +/// Builds a new dictionary from an iterator of sorted unique entries. +/// +/// It is a preferred way to build a large `Dict` as doing lots of +/// insertions is too slow. +pub fn build_dict_from_sorted_vec( + entries: I, + context: &mut dyn CellContext, +) -> Result, Error> +where + I: IntoIterator, + K: DictKey + Store + std::fmt::Debug, + V: Store + std::fmt::Debug, +{ + enum StackItem { + Leaf { + prefix: CellBuilder, + value: V, + prev_lcp_len: u16, + }, + Node { + prefix: CellBuilder, + value: Cell, + prev_lcp_len: u16, + }, + } + + struct DebugBinary(T); + + impl std::fmt::Debug for DebugBinary { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:b}", self.0) + } + } + + impl std::fmt::Debug for StackItem + where + V: std::fmt::Debug, + { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Leaf { + prefix, + value, + prev_lcp_len, + } => f + .debug_struct("Leaf") + .field("prefix", &DebugBinary(prefix.display_data())) + .field("value", value) + .field("prev_lcp_len", prev_lcp_len) + .finish(), + Self::Node { + prefix, + value, + prev_lcp_len, + } => f + .debug_struct("Node") + .field("prefix", &DebugBinary(prefix.display_data())) + .field("value", value) + .field("prev_lcp_len", prev_lcp_len) + .finish(), + } + } + } + + impl StackItem { + fn prefix_slice(&self) -> CellSlice<'_> { + match self { + Self::Leaf { prefix, .. } => prefix.as_data_slice(), + Self::Node { prefix, .. } => prefix.as_data_slice(), + } + } + + fn prev_lcp_len(&self) -> u16 { + match self { + Self::Leaf { prev_lcp_len, .. } => *prev_lcp_len, + Self::Node { prev_lcp_len, .. } => *prev_lcp_len, + } + } + + fn build(self, key_bit_len: u16, context: &mut dyn CellContext) -> Result { + match self { + Self::Leaf { prefix, value, .. } => { + make_leaf(&prefix.as_data_slice(), key_bit_len, &value, context) + } + Self::Node { value, .. } => Ok(value), + } + } + + fn merge( + self, + right: Self, + key_offset: u16, + key_bit_len: u16, + context: &mut dyn CellContext, + ) -> Result { + let split_at = right.prev_lcp_len(); + let label_len = split_at - key_offset; + + println!("key_offset={key_offset}, split_at={split_at}, label_len={label_len} key_bit_len={key_bit_len}"); + + let leaf_key_bit_len = key_bit_len - split_at - 1; + let key_bit_len = key_bit_len - key_offset; + + let mut builder = CellBuilder::new(); + + // Build the right cell and write the common prefix + let result_prefix; + let right_cell = match right { + Self::Leaf { prefix, value, .. } => { + let mut prefix_slice = prefix.as_data_slice(); + + // Write the common prefix as a label + let mut common_prefix = prefix_slice; + ok!(common_prefix.advance(key_offset, 0)); // TODO: Unwrap + + println!( + "LABEL (leaf): {:b}", + common_prefix.get_prefix(label_len, 0).display_data() + ); + + ok!(write_label( + &common_prefix.get_prefix(label_len, 0), + key_bit_len, + &mut builder, + )); + + // Build leaf from value + ok!(prefix_slice.advance(split_at + 1, 0)); // TODO: Unwrap + let value = ok!(make_leaf(&prefix_slice, leaf_key_bit_len, &value, context)); + + result_prefix = prefix; + value + } + Self::Node { prefix, value, .. } => { + // Write the common prefix as a label + let mut common_prefix = prefix.as_data_slice(); + ok!(common_prefix.advance(key_offset, 0)); // TODO: Unwrap + + println!( + "LABEL (node): {:b}", + common_prefix.get_prefix(label_len, 0).display_data() + ); + + ok!(write_label( + &common_prefix.get_prefix(label_len, 0), + key_bit_len, + &mut builder, + )); + + // Use value as is + result_prefix = prefix; + value + } + }; + + // Build the left cell + let left_cell = match self { + Self::Leaf { prefix, value, .. } => { + let mut prefix_slice = prefix.as_data_slice(); + ok!(prefix_slice.advance(split_at + 1, 0)); // TODO: Unwrap + ok!(make_leaf(&prefix_slice, leaf_key_bit_len, &value, context)) + } + Self::Node { value, .. } => value, + }; + + // Complete the node + ok!(builder.store_reference(left_cell)); + ok!(builder.store_reference(right_cell)); + + // Wrap into a new stack item + Ok(StackItem::Node { + prefix: result_prefix, + value: ok!(builder.build_ext(context)), + prev_lcp_len: key_offset, + }) + } + } + + let mut stack = Vec::>::new(); + + 'outer: for (key, value) in entries { + let prefix = { + let mut builder = CellBuilder::new(); + ok!(key.store_into(&mut builder, context)); + builder + }; + + println!("\n\nENTRY: ({key:?}, {value:?})"); + println!("PREFIX: {:b}", prefix.display_data()); + + let mut lcp_len = 0; + if let Some(last) = stack.last() { + let left_prefix = last.prefix_slice(); + let right_prefix = prefix.as_data_slice(); + + let lcp = left_prefix.longest_common_data_prefix(&right_prefix); + lcp_len = lcp.remaining_bits(); + println!("lcp_len: {lcp_len}"); + + match lcp_len.cmp(&last.prev_lcp_len()) { + // Skip duplicates and keep only the first one + std::cmp::Ordering::Equal => { + println!("continue?"); + continue 'outer; + } + // LCP increased, we might group the current item to the last one + std::cmp::Ordering::Greater => { + println!("skip last: {last:?}"); + } + // LCP decreased, we are not able to group the current item to the last one. + // At this point we can safely reduce the stack. + std::cmp::Ordering::Less => { + let mut key_offset = lcp_len + 1; + let mut right = stack.pop().unwrap(); + println!("right: {right:?}"); + while let Some(left) = stack.last() { + println!("left: {left:?}"); + if right.prev_lcp_len() < lcp_len { + println!("stop"); + break; + } + let left = stack.pop().unwrap(); + + key_offset = left.prev_lcp_len().max(key_offset); + + right = ok!(left.merge(right, key_offset, K::BITS, context)); + println!("rebuild right: {right:?}"); + } + stack.push(right); + } + } + } + + println!("TOTAL LCP: {lcp_len}"); + + stack.push(StackItem::Leaf { + prefix, + value, + prev_lcp_len: lcp_len, + }); + } + + let Some(mut result) = stack.pop() else { + return Ok(None); + }; + + println!("\nBUILD FINAL"); + while let Some(left) = stack.pop() { + result = ok!(left.merge(result, 0, K::BITS, context)); + } + + println!("---\n"); + + result.build(K::BITS, context).map(Some) +} + /// Creates a leaf node fn make_leaf( key: &CellSlice, @@ -1892,4 +2148,16 @@ mod tests { Ok(()) } + + #[test] + fn build_from_array() { + let entries = [(0u32, 1u32), (1, 2), (2, 3), (3, 4), (4, 5)]; + let result = build_dict_from_sorted_vec(entries, &mut Cell::empty_context()).unwrap(); + println!("{}", result.as_ref().unwrap().display_tree()); + + println!( + "BOC: {}", + crate::boc::BocRepr::encode_base64(result).unwrap() + ); + } }