-
Notifications
You must be signed in to change notification settings - Fork 286
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Extended balance_root
to handle root underflow case
#621
base: main
Are you sure you want to change the base?
Changes from 3 commits
a1bd833
8a54812
0f682f1
9a96605
abed8c9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -667,7 +667,9 @@ impl BTreeCursor { | |||||||||||||||
continue; | ||||||||||||||||
} | ||||||||||||||||
None => { | ||||||||||||||||
unreachable!("we shall not go back up! The only way is down the slope"); | ||||||||||||||||
// We've hit either a leaf page or an overflow page | ||||||||||||||||
// Either way, we're done traversing | ||||||||||||||||
return Ok(CursorResult::Ok(())); | ||||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
|
@@ -699,10 +701,15 @@ impl BTreeCursor { | |||||||||||||||
self.pager.add_dirty(page.get().id); | ||||||||||||||||
|
||||||||||||||||
let page = page.get().contents.as_mut().unwrap(); | ||||||||||||||||
assert!(matches!(page.page_type(), PageType::TableLeaf)); | ||||||||||||||||
|
||||||||||||||||
let page_type = match page.maybe_page_type() { | ||||||||||||||||
Some(pt) => pt, | ||||||||||||||||
None => return Ok(CursorResult::Ok(())), // Handle overflow pages | ||||||||||||||||
}; | ||||||||||||||||
assert!(matches!(page_type, PageType::TableLeaf)); | ||||||||||||||||
|
||||||||||||||||
// find cell | ||||||||||||||||
(self.find_cell(page, int_key), page.page_type()) | ||||||||||||||||
(self.find_cell(page, int_key), page_type) | ||||||||||||||||
}; | ||||||||||||||||
|
||||||||||||||||
// TODO: if overwrite drop cell | ||||||||||||||||
|
@@ -1191,83 +1198,191 @@ impl BTreeCursor { | |||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
/// Balance the root page. | ||||||||||||||||
/// This is done when the root page overflows, and we need to create a new root page. | ||||||||||||||||
/// See e.g. https://en.wikipedia.org/wiki/B-tree | ||||||||||||||||
fn balance_root(&mut self) { | ||||||||||||||||
/* todo: balance deeper, create child and copy contents of root there. Then split root */ | ||||||||||||||||
/* if we are in root page then we just need to create a new root and push key there */ | ||||||||||||||||
fn calculate_used_space(&self, page: &PageContent) -> usize { | ||||||||||||||||
let mut total_used = 0; | ||||||||||||||||
total_used += page.header_size(); | ||||||||||||||||
let (_, cell_array_size) = page.cell_pointer_array_offset_and_size(); | ||||||||||||||||
total_used += cell_array_size; | ||||||||||||||||
for cell_idx in 0..page.cell_count() { | ||||||||||||||||
let (_, cell_len) = page.cell_get_raw_region( | ||||||||||||||||
cell_idx, | ||||||||||||||||
self.payload_overflow_threshold_max(page.page_type()), | ||||||||||||||||
self.payload_overflow_threshold_min(page.page_type()), | ||||||||||||||||
self.usable_space(), | ||||||||||||||||
); | ||||||||||||||||
total_used += cell_len; | ||||||||||||||||
} | ||||||||||||||||
for overflow_cell in &page.overflow_cells { | ||||||||||||||||
total_used += overflow_cell.payload.len(); | ||||||||||||||||
} | ||||||||||||||||
total_used -= page.num_frag_free_bytes() as usize; | ||||||||||||||||
|
||||||||||||||||
let is_page_1 = { | ||||||||||||||||
let current_root = self.stack.top(); | ||||||||||||||||
current_root.get().id == 1 | ||||||||||||||||
}; | ||||||||||||||||
total_used | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
fn estimate_cell_pointer_growth(&self) -> usize { | ||||||||||||||||
// In SQLite, we want to leave some space for future cell pointers | ||||||||||||||||
// Each cell pointer takes 2 bytes | ||||||||||||||||
const CELL_POINTER_SIZE: usize = 2; | ||||||||||||||||
const ESTIMATED_FUTURE_CELLS: usize = 4; // Leave room for a few more cells | ||||||||||||||||
|
||||||||||||||||
CELL_POINTER_SIZE * ESTIMATED_FUTURE_CELLS | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
fn is_overflow(&self, page: &PageContent) -> bool { | ||||||||||||||||
if !page.overflow_cells.is_empty() { | ||||||||||||||||
return true; | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
let used_space = self.calculate_used_space(page); | ||||||||||||||||
let available_space = self.usable_space(); | ||||||||||||||||
|
||||||||||||||||
// Check if we're about to exceed usable space | ||||||||||||||||
// We also need some buffer space for future cell pointers | ||||||||||||||||
let required_space = used_space + self.estimate_cell_pointer_growth(); | ||||||||||||||||
required_space > available_space | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
fn is_underflow(&self, page: &PageContent) -> bool { | ||||||||||||||||
// Root is special case - only underflow when empty with one child | ||||||||||||||||
let current_page = self.stack.top(); | ||||||||||||||||
if current_page.get().id == self.root_page { | ||||||||||||||||
return page.cell_count() == 0 && !page.is_leaf(); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
let used_space = self.calculate_used_space(page); | ||||||||||||||||
let usable_space = self.usable_space(); | ||||||||||||||||
|
||||||||||||||||
// Underflow if used space is less than 50% of usable space | ||||||||||||||||
used_space < (usable_space / 2) | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
fn balance_root(&mut self) { | ||||||||||||||||
let current_root = self.stack.top(); | ||||||||||||||||
let contents = current_root.get().contents.as_mut().unwrap(); | ||||||||||||||||
let is_page_1 = current_root.get().id == 1; | ||||||||||||||||
let offset = if is_page_1 { DATABASE_HEADER_SIZE } else { 0 }; | ||||||||||||||||
let new_root_page = self.allocate_page(PageType::TableInterior, offset); | ||||||||||||||||
{ | ||||||||||||||||
let current_root = self.stack.top(); | ||||||||||||||||
let current_root_contents = current_root.get().contents.as_ref().unwrap(); | ||||||||||||||||
|
||||||||||||||||
let new_root_page_id = new_root_page.get().id; | ||||||||||||||||
let new_root_page_contents = new_root_page.get().contents.as_mut().unwrap(); | ||||||||||||||||
if is_page_1 { | ||||||||||||||||
// Copy header | ||||||||||||||||
let current_root_buf = current_root_contents.as_ptr(); | ||||||||||||||||
let new_root_buf = new_root_page_contents.as_ptr(); | ||||||||||||||||
new_root_buf[0..DATABASE_HEADER_SIZE] | ||||||||||||||||
.copy_from_slice(¤t_root_buf[0..DATABASE_HEADER_SIZE]); | ||||||||||||||||
if self.is_underflow(contents) { | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe do: if self.is_overflow(contents) {
...
} else if self.is_underflow(contents) {
...
} else {
unreachable!("balance_root was called where we didn't have any overflow or underflow")
} |
||||||||||||||||
let child_page_id = contents.rightmost_pointer().unwrap(); | ||||||||||||||||
let child_page = self | ||||||||||||||||
.pager | ||||||||||||||||
.read_page(child_page_id as usize) | ||||||||||||||||
.expect("This shouldn't have happened, child page not found"); | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to propagate errors
Suggested change
And I think we are not handling the case where child page is not loaded/locked? |
||||||||||||||||
let child_contents = child_page.get().contents.as_mut().unwrap(); | ||||||||||||||||
|
||||||||||||||||
// Defragment child before inserting its cells into root because root is smaller than child due to it | ||||||||||||||||
// containing header. | ||||||||||||||||
self.defragment_page(child_contents, self.database_header.borrow()); | ||||||||||||||||
|
||||||||||||||||
let necessary_space = self.calculate_used_space(child_contents); | ||||||||||||||||
let available_space = if is_page_1 { | ||||||||||||||||
self.usable_space() - DATABASE_HEADER_SIZE | ||||||||||||||||
} else { | ||||||||||||||||
self.usable_space() | ||||||||||||||||
}; | ||||||||||||||||
|
||||||||||||||||
// If root can't consume child, leave it empty temporarily and handle it in balance_leaf step | ||||||||||||||||
if available_space < necessary_space { | ||||||||||||||||
return; | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
let grandchild_ptr = child_contents.rightmost_pointer(); | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. rename There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moreover, if we are balancing root because it doesn't have any more cells, why do we expect the child to not be a leaf page? I mean, if you take rightmost pointer it means that child is a interior page, therefore, the tree is not balanced. |
||||||||||||||||
|
||||||||||||||||
let mut cells = Vec::new(); | ||||||||||||||||
for idx in 0..child_contents.cell_count() { | ||||||||||||||||
let (start, len) = child_contents.cell_get_raw_region( | ||||||||||||||||
idx, | ||||||||||||||||
self.payload_overflow_threshold_max(child_contents.page_type()), | ||||||||||||||||
self.payload_overflow_threshold_min(child_contents.page_type()), | ||||||||||||||||
self.usable_space(), | ||||||||||||||||
); | ||||||||||||||||
let buf = child_contents.as_ptr(); | ||||||||||||||||
cells.push(buf[start..start + len].to_vec()); | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't we |
||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// TODO: free the child page by adding it to the free list. Functionality has to be added to pager.rs | ||||||||||||||||
|
||||||||||||||||
contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); | ||||||||||||||||
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); | ||||||||||||||||
if !contents.is_leaf() { | ||||||||||||||||
contents.write_u32( | ||||||||||||||||
PAGE_HEADER_OFFSET_RIGHTMOST_PTR, | ||||||||||||||||
grandchild_ptr.unwrap_or(0), | ||||||||||||||||
); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// Insert cells into root | ||||||||||||||||
for (idx, cell) in cells.iter().enumerate() { | ||||||||||||||||
self.insert_into_cell(contents, cell, idx); | ||||||||||||||||
} | ||||||||||||||||
// point new root right child to previous root | ||||||||||||||||
new_root_page_contents | ||||||||||||||||
.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, new_root_page_id as u32); | ||||||||||||||||
new_root_page_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
/* swap splitted page buffer with new root buffer so we don't have to update page idx */ | ||||||||||||||||
{ | ||||||||||||||||
let (root_id, child_id, child) = { | ||||||||||||||||
let page_ref = self.stack.top(); | ||||||||||||||||
let child = page_ref.clone(); | ||||||||||||||||
|
||||||||||||||||
// Swap the entire Page structs | ||||||||||||||||
std::mem::swap(&mut child.get().id, &mut new_root_page.get().id); | ||||||||||||||||
// TODO:: shift bytes by offset to left on child because now child has offset 100 | ||||||||||||||||
// and header bytes | ||||||||||||||||
// Also change the offset of page | ||||||||||||||||
// | ||||||||||||||||
if is_page_1 { | ||||||||||||||||
// Remove header from child and set offset to 0 | ||||||||||||||||
let contents = child.get().contents.as_mut().unwrap(); | ||||||||||||||||
let (cell_pointer_offset, _) = contents.cell_pointer_array_offset_and_size(); | ||||||||||||||||
// change cell pointers | ||||||||||||||||
for cell_idx in 0..contents.cell_count() { | ||||||||||||||||
let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx) - offset; | ||||||||||||||||
let pc = contents.read_u16(cell_pointer_offset); | ||||||||||||||||
contents.write_u16(cell_pointer_offset, pc - offset as u16); | ||||||||||||||||
} | ||||||||||||||||
if self.is_overflow(contents) { | ||||||||||||||||
// Remember original page type - the child will inherit this | ||||||||||||||||
let original_type = contents.page_type(); | ||||||||||||||||
|
||||||||||||||||
contents.offset = 0; | ||||||||||||||||
let buf = contents.as_ptr(); | ||||||||||||||||
buf.copy_within(DATABASE_HEADER_SIZE.., 0); | ||||||||||||||||
} | ||||||||||||||||
// Create new child page with root's original type | ||||||||||||||||
let child_page = self.allocate_page(original_type, 0); | ||||||||||||||||
let child_page_id = child_page.get().id; | ||||||||||||||||
let child_contents = child_page.get().contents.as_mut().unwrap(); | ||||||||||||||||
|
||||||||||||||||
self.pager.add_dirty(new_root_page.get().id); | ||||||||||||||||
self.pager.add_dirty(child.get().id); | ||||||||||||||||
(new_root_page.get().id, child.get().id, child) | ||||||||||||||||
}; | ||||||||||||||||
// Move all cells and right pointer from root to child | ||||||||||||||||
let mut cells = Vec::new(); | ||||||||||||||||
for idx in 0..contents.cell_count() { | ||||||||||||||||
let (start, len) = contents.cell_get_raw_region( | ||||||||||||||||
idx, | ||||||||||||||||
self.payload_overflow_threshold_max(contents.page_type()), | ||||||||||||||||
self.payload_overflow_threshold_min(contents.page_type()), | ||||||||||||||||
self.usable_space(), | ||||||||||||||||
); | ||||||||||||||||
let buf = contents.as_ptr(); | ||||||||||||||||
cells.push(buf[start..start + len].to_vec()); | ||||||||||||||||
} | ||||||||||||||||
let right_pointer = contents.rightmost_pointer(); | ||||||||||||||||
|
||||||||||||||||
// Clear root - IMPORTANT: Set type BEFORE clearing contents | ||||||||||||||||
contents.write_u8(PAGE_HEADER_OFFSET_PAGE_TYPE, PageType::TableInterior as u8); | ||||||||||||||||
contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0); | ||||||||||||||||
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0); | ||||||||||||||||
|
||||||||||||||||
// Initialize child with root's old contents | ||||||||||||||||
for (idx, cell) in cells.iter().enumerate() { | ||||||||||||||||
self.insert_into_cell(child_contents, cell, idx); | ||||||||||||||||
} | ||||||||||||||||
if !child_contents.is_leaf() { | ||||||||||||||||
child_contents | ||||||||||||||||
.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, right_pointer.unwrap_or(0)); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// Handle page 1 special case | ||||||||||||||||
if is_page_1 { | ||||||||||||||||
self.handle_page_one_special_case(child_contents, offset); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
debug!("Balancing root. root={}, rightmost={}", root_id, child_id); | ||||||||||||||||
let root = new_root_page.clone(); | ||||||||||||||||
self.pager.add_dirty(child_page_id); | ||||||||||||||||
|
||||||||||||||||
self.root_page = root_id; | ||||||||||||||||
// Update page stack to include both root and child | ||||||||||||||||
self.stack.clear(); | ||||||||||||||||
self.stack.push(root.clone()); | ||||||||||||||||
self.stack.push(child.clone()); | ||||||||||||||||
self.stack.push(current_root.clone()); | ||||||||||||||||
self.stack.push(child_page.clone()); | ||||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// Helper function to handle page 1's special case | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should at least describe what the special case is and why it occurs There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would rename the function to hint at what the special case is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not mentioned in SQLite. The overflow case is almost copied from previous
You can find |
||||||||||||||||
fn handle_page_one_special_case(&self, contents: &mut PageContent, offset: usize) { | ||||||||||||||||
let (cell_pointer_offset, _) = contents.cell_pointer_array_offset_and_size(); | ||||||||||||||||
|
||||||||||||||||
self.pager.put_loaded_page(root_id, root); | ||||||||||||||||
self.pager.put_loaded_page(child_id, child); | ||||||||||||||||
// Update cell pointers to account for removed header | ||||||||||||||||
for cell_idx in 0..contents.cell_count() { | ||||||||||||||||
let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx) - offset; | ||||||||||||||||
let pc = contents.read_u16(cell_pointer_offset); | ||||||||||||||||
contents.write_u16(cell_pointer_offset, pc - offset as u16); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
contents.offset = 0; | ||||||||||||||||
let buf = contents.as_ptr(); | ||||||||||||||||
buf.copy_within(DATABASE_HEADER_SIZE.., 0); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
/// Allocate a new page to the btree via the pager. | ||||||||||||||||
|
@@ -1285,8 +1400,7 @@ impl BTreeCursor { | |||||||||||||||
|
||||||||||||||||
// setup overflow page | ||||||||||||||||
let contents = page.get().contents.as_mut().unwrap(); | ||||||||||||||||
let buf = contents.as_ptr(); | ||||||||||||||||
buf.fill(0); | ||||||||||||||||
contents.write_u32(0, 0); // Initialize next overflow page pointer to 0 | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why 0, can we use one of the constants? This 0 means There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, what I did is wrong. Will fix it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @pereman2 can you please tell me what's happening here?: https://github.com/sqlite/sqlite/blob/c2e400af042bdd7d21e159a41fcf34c05398044c/src/btree.c#L7166-L7172 This is my understanding: // Writes the page number of the new overflow page (pgnoOvfl) to pPrior. For the first overflow page, pPrior points to the end of the local cell content.
put4byte(pPrior, pgnoOvfl);
// ignore this
releasePage(pToRelease);
pToRelease = pOvfl;
// Points pPrior to the start of the new overflow page's data area
pPrior = pOvfl->aData;
// Writes 0 at the start of the overflow page, which shows its the last overflow page
put4byte(pPrior, 0);
// Sets pPayload to point past the 4-byte pointer. This is where the actual data will be stored in the overflow page
pPayload = &pOvfl->aData[4];
//Calculates remaining space for data in this overflow page. Usable space minus 4 bytes used for the next-page pointer
spaceLeft = pBt->usableSize - 4; Where are we doing |
||||||||||||||||
|
||||||||||||||||
page | ||||||||||||||||
} | ||||||||||||||||
|
@@ -1645,13 +1759,17 @@ impl BTreeCursor { | |||||||||||||||
fn find_cell(&self, page: &PageContent, int_key: u64) -> usize { | ||||||||||||||||
let mut cell_idx = 0; | ||||||||||||||||
let cell_count = page.cell_count(); | ||||||||||||||||
let page_type = match page.maybe_page_type() { | ||||||||||||||||
Some(pt) => pt, | ||||||||||||||||
None => return 0, // Return 0 for overflow pages | ||||||||||||||||
}; | ||||||||||||||||
while cell_idx < cell_count { | ||||||||||||||||
match page | ||||||||||||||||
.cell_get( | ||||||||||||||||
cell_idx, | ||||||||||||||||
self.pager.clone(), | ||||||||||||||||
self.payload_overflow_threshold_max(page.page_type()), | ||||||||||||||||
self.payload_overflow_threshold_min(page.page_type()), | ||||||||||||||||
self.payload_overflow_threshold_max(page_type), | ||||||||||||||||
self.payload_overflow_threshold_min(page_type), | ||||||||||||||||
self.usable_space(), | ||||||||||||||||
) | ||||||||||||||||
.unwrap() | ||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this come from somewhere, e.g. sqlite source code? If so, good to leave a link as comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've removed this function to keep this PR close handling just the root and for checking root overflow, we just need to check if there the page contains any overflow cells. SQLite here: https://github.com/sqlite/sqlite/blob/c2e400af042bdd7d21e159a41fcf34c05398044c/src/btree.c#L9059-L9060