Skip to content

Commit

Permalink
feat: build dict from sorted leaves (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
Rexagon committed Jul 2, 2024
1 parent 491a0b9 commit 435bd9d
Showing 1 changed file with 268 additions and 0 deletions.
268 changes: 268 additions & 0 deletions src/dict/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1342,6 +1342,262 @@ pub fn dict_merge(
Ok(())
}

/// Builds a new dictionary from an iterator of sorted unique entries.
///
/// It is a preferred way to build a large `Dict` as doing lots of
/// insertions is too slow.
pub fn build_dict_from_sorted_vec<K, V, I>(
entries: I,
context: &mut dyn CellContext,
) -> Result<Option<Cell>, Error>
where
I: IntoIterator<Item = (K, V)>,
K: DictKey + Store + std::fmt::Debug,
V: Store + std::fmt::Debug,
{
enum StackItem<V> {
Leaf {
prefix: CellBuilder,
value: V,
prev_lcp_len: u16,
},
Node {
prefix: CellBuilder,
value: Cell,
prev_lcp_len: u16,
},
}

struct DebugBinary<T>(T);

impl<T: std::fmt::Binary> std::fmt::Debug for DebugBinary<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:b}", self.0)
}
}

impl<V> std::fmt::Debug for StackItem<V>
where
V: std::fmt::Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Leaf {
prefix,
value,
prev_lcp_len,
} => f
.debug_struct("Leaf")
.field("prefix", &DebugBinary(prefix.display_data()))
.field("value", value)
.field("prev_lcp_len", prev_lcp_len)
.finish(),
Self::Node {
prefix,
value,
prev_lcp_len,
} => f
.debug_struct("Node")
.field("prefix", &DebugBinary(prefix.display_data()))
.field("value", value)
.field("prev_lcp_len", prev_lcp_len)
.finish(),
}
}
}

impl<V: Store> StackItem<V> {
fn prefix_slice(&self) -> CellSlice<'_> {
match self {
Self::Leaf { prefix, .. } => prefix.as_data_slice(),
Self::Node { prefix, .. } => prefix.as_data_slice(),
}
}

fn prev_lcp_len(&self) -> u16 {
match self {
Self::Leaf { prev_lcp_len, .. } => *prev_lcp_len,
Self::Node { prev_lcp_len, .. } => *prev_lcp_len,
}
}

fn build(self, key_bit_len: u16, context: &mut dyn CellContext) -> Result<Cell, Error> {
match self {
Self::Leaf { prefix, value, .. } => {
make_leaf(&prefix.as_data_slice(), key_bit_len, &value, context)
}
Self::Node { value, .. } => Ok(value),
}
}

fn merge(
self,
right: Self,
key_offset: u16,
key_bit_len: u16,
context: &mut dyn CellContext,
) -> Result<Self, Error> {
let split_at = right.prev_lcp_len();
let label_len = split_at - key_offset;

println!("key_offset={key_offset}, split_at={split_at}, label_len={label_len} key_bit_len={key_bit_len}");

let leaf_key_bit_len = key_bit_len - split_at - 1;
let key_bit_len = key_bit_len - key_offset;

let mut builder = CellBuilder::new();

// Build the right cell and write the common prefix
let result_prefix;
let right_cell = match right {
Self::Leaf { prefix, value, .. } => {
let mut prefix_slice = prefix.as_data_slice();

// Write the common prefix as a label
let mut common_prefix = prefix_slice;
ok!(common_prefix.advance(key_offset, 0)); // TODO: Unwrap

println!(
"LABEL (leaf): {:b}",
common_prefix.get_prefix(label_len, 0).display_data()
);

ok!(write_label(
&common_prefix.get_prefix(label_len, 0),
key_bit_len,
&mut builder,
));

// Build leaf from value
ok!(prefix_slice.advance(split_at + 1, 0)); // TODO: Unwrap
let value = ok!(make_leaf(&prefix_slice, leaf_key_bit_len, &value, context));

result_prefix = prefix;
value
}
Self::Node { prefix, value, .. } => {
// Write the common prefix as a label
let mut common_prefix = prefix.as_data_slice();
ok!(common_prefix.advance(key_offset, 0)); // TODO: Unwrap

println!(
"LABEL (node): {:b}",
common_prefix.get_prefix(label_len, 0).display_data()
);

ok!(write_label(
&common_prefix.get_prefix(label_len, 0),
key_bit_len,
&mut builder,
));

// Use value as is
result_prefix = prefix;
value
}
};

// Build the left cell
let left_cell = match self {
Self::Leaf { prefix, value, .. } => {
let mut prefix_slice = prefix.as_data_slice();
ok!(prefix_slice.advance(split_at + 1, 0)); // TODO: Unwrap
ok!(make_leaf(&prefix_slice, leaf_key_bit_len, &value, context))
}
Self::Node { value, .. } => value,
};

// Complete the node
ok!(builder.store_reference(left_cell));
ok!(builder.store_reference(right_cell));

// Wrap into a new stack item
Ok(StackItem::Node {
prefix: result_prefix,
value: ok!(builder.build_ext(context)),
prev_lcp_len: key_offset,
})
}
}

let mut stack = Vec::<StackItem<V>>::new();

'outer: for (key, value) in entries {
let prefix = {
let mut builder = CellBuilder::new();
ok!(key.store_into(&mut builder, context));
builder
};

println!("\n\nENTRY: ({key:?}, {value:?})");
println!("PREFIX: {:b}", prefix.display_data());

let mut lcp_len = 0;
if let Some(last) = stack.last() {
let left_prefix = last.prefix_slice();
let right_prefix = prefix.as_data_slice();

let lcp = left_prefix.longest_common_data_prefix(&right_prefix);
lcp_len = lcp.remaining_bits();
println!("lcp_len: {lcp_len}");

match lcp_len.cmp(&last.prev_lcp_len()) {
// Skip duplicates and keep only the first one
std::cmp::Ordering::Equal => {
println!("continue?");
continue 'outer;
}
// LCP increased, we might group the current item to the last one
std::cmp::Ordering::Greater => {
println!("skip last: {last:?}");
}
// LCP decreased, we are not able to group the current item to the last one.
// At this point we can safely reduce the stack.
std::cmp::Ordering::Less => {
let mut key_offset = lcp_len + 1;
let mut right = stack.pop().unwrap();
println!("right: {right:?}");
while let Some(left) = stack.last() {
println!("left: {left:?}");
if right.prev_lcp_len() < lcp_len {
println!("stop");
break;
}
let left = stack.pop().unwrap();

key_offset = left.prev_lcp_len().max(key_offset);

right = ok!(left.merge(right, key_offset, K::BITS, context));
println!("rebuild right: {right:?}");
}
stack.push(right);
}
}
}

println!("TOTAL LCP: {lcp_len}");

stack.push(StackItem::Leaf {
prefix,
value,
prev_lcp_len: lcp_len,
});
}

let Some(mut result) = stack.pop() else {
return Ok(None);
};

println!("\nBUILD FINAL");
while let Some(left) = stack.pop() {
result = ok!(left.merge(result, 0, K::BITS, context));
}

println!("---\n");

result.build(K::BITS, context).map(Some)
}

/// Creates a leaf node
fn make_leaf(
key: &CellSlice,
Expand Down Expand Up @@ -1892,4 +2148,16 @@ mod tests {

Ok(())
}

#[test]
fn build_from_array() {
let entries = [(0u32, 1u32), (1, 2), (2, 3), (3, 4), (4, 5)];
let result = build_dict_from_sorted_vec(entries, &mut Cell::empty_context()).unwrap();
println!("{}", result.as_ref().unwrap().display_tree());

println!(
"BOC: {}",
crate::boc::BocRepr::encode_base64(result).unwrap()
);
}
}

0 comments on commit 435bd9d

Please sign in to comment.