Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extended balance_root to handle root underflow case #621

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
248 changes: 174 additions & 74 deletions core/storage/btree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,9 @@ impl BTreeCursor {
continue;
}
None => {
unreachable!("we shall not go back up! The only way is down the slope");
// We've hit either a leaf page or an overflow page
// Either way, we're done traversing
return Ok(CursorResult::Ok(()));
}
}
}
Expand Down Expand Up @@ -699,10 +701,15 @@ impl BTreeCursor {
self.pager.add_dirty(page.get().id);

let page = page.get().contents.as_mut().unwrap();
assert!(matches!(page.page_type(), PageType::TableLeaf));

let page_type = match page.maybe_page_type() {
Some(pt) => pt,
None => return Ok(CursorResult::Ok(())), // Handle overflow pages
};
assert!(matches!(page_type, PageType::TableLeaf));

// find cell
(self.find_cell(page, int_key), page.page_type())
(self.find_cell(page, int_key), page_type)
};

// TODO: if overwrite drop cell
Expand Down Expand Up @@ -903,7 +910,7 @@ impl BTreeCursor {
}

if !self.stack.has_parent() {
self.balance_root();
self.balance_root()?;
return Ok(CursorResult::Ok(()));
}
debug!("Balancing leaf. leaf={}", current_page.get().id);
Expand Down Expand Up @@ -989,7 +996,7 @@ impl BTreeCursor {
let (page_type, current_idx) = {
let current_page = self.stack.top();
let contents = current_page.get().contents.as_ref().unwrap();
(contents.page_type().clone(), current_page.get().id)
(contents.page_type(), current_page.get().id)
};

parent.set_dirty();
Expand All @@ -1005,8 +1012,8 @@ impl BTreeCursor {
.cell_get(
cell_idx,
self.pager.clone(),
self.payload_overflow_threshold_max(page_type.clone()),
self.payload_overflow_threshold_min(page_type.clone()),
self.payload_overflow_threshold_max(page_type),
self.payload_overflow_threshold_min(page_type),
self.usable_space(),
)
.unwrap();
Expand All @@ -1019,8 +1026,8 @@ impl BTreeCursor {
if found {
let (start, _len) = parent_contents.cell_get_raw_region(
cell_idx,
self.payload_overflow_threshold_max(page_type.clone()),
self.payload_overflow_threshold_min(page_type.clone()),
self.payload_overflow_threshold_max(page_type),
self.payload_overflow_threshold_min(page_type),
self.usable_space(),
);
right_pointer = start;
Expand Down Expand Up @@ -1191,85 +1198,174 @@ impl BTreeCursor {
}
}

/// Balance the root page.
/// This is done when the root page overflows, and we need to create a new root page.
/// See e.g. https://en.wikipedia.org/wiki/B-tree
fn balance_root(&mut self) {
/* todo: balance deeper, create child and copy contents of root there. Then split root */
/* if we are in root page then we just need to create a new root and push key there */
fn calculate_used_space(&self, page: &PageContent) -> usize {
let mut total_used = 0;
total_used += page.header_size();
let (_, cell_array_size) = page.cell_pointer_array_offset_and_size();
total_used += cell_array_size;
for cell_idx in 0..page.cell_count() {
let (_, cell_len) = page.cell_get_raw_region(
cell_idx,
self.payload_overflow_threshold_max(page.page_type()),
self.payload_overflow_threshold_min(page.page_type()),
self.usable_space(),
);
total_used += cell_len;
}
for overflow_cell in &page.overflow_cells {
total_used += overflow_cell.payload.len();
}
total_used -= page.num_frag_free_bytes() as usize;

let is_page_1 = {
let current_root = self.stack.top();
current_root.get().id == 1
};
total_used
}

fn is_overflow(&self, page: &PageContent) -> bool {
!page.overflow_cells.is_empty()
}

fn is_underflow(&self, page: &PageContent) -> bool {
// Root is special case - only underflow when empty with one child
let current_page = self.stack.top();
if current_page.get().id == self.root_page {
return page.cell_count() == 0 && !page.is_leaf();
}

let used_space = self.calculate_used_space(page);
let usable_space = self.usable_space();

// Underflow if used space is less than 50% of usable space
used_space < (usable_space / 2)
}

fn balance_root(&mut self) -> Result<CursorResult<()>> {
let current_root = self.stack.top();
let contents = current_root.get().contents.as_mut().unwrap();
let is_page_1 = current_root.get().id == 1;
let offset = if is_page_1 { DATABASE_HEADER_SIZE } else { 0 };
let new_root_page = self.allocate_page(PageType::TableInterior, offset);
{
let current_root = self.stack.top();
let current_root_contents = current_root.get().contents.as_ref().unwrap();

let new_root_page_id = new_root_page.get().id;
let new_root_page_contents = new_root_page.get().contents.as_mut().unwrap();
if self.is_overflow(contents) {
// Remember original page type - the child will inherit this
let original_type = contents.page_type();

// Create new child page with root's original type
let child_page = self.allocate_page(original_type, 0);
let child_page_id = child_page.get().id;
let child_contents = child_page.get().contents.as_mut().unwrap();

// Move all cells and right pointer from root to child
let mut cells = Vec::new();
for idx in 0..contents.cell_count() {
let (start, len) = contents.cell_get_raw_region(
idx,
self.payload_overflow_threshold_max(contents.page_type()),
self.payload_overflow_threshold_min(contents.page_type()),
self.usable_space(),
);
let buf = contents.as_ptr();
cells.push(buf[start..start + len].to_vec());
}
let right_pointer = contents.rightmost_pointer();

// Clear root - IMPORTANT: Set type BEFORE clearing contents
contents.write_u8(PAGE_HEADER_OFFSET_PAGE_TYPE, PageType::TableInterior as u8);
contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);

// Initialize child with root's old contents
for (idx, cell) in cells.iter().enumerate() {
self.insert_into_cell(child_contents, cell, idx);
}
if !child_contents.is_leaf() {
child_contents
.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, right_pointer.unwrap_or(0));
}

// Handle page 1 special case
if is_page_1 {
// Copy header
let current_root_buf = current_root_contents.as_ptr();
let new_root_buf = new_root_page_contents.as_ptr();
new_root_buf[0..DATABASE_HEADER_SIZE]
.copy_from_slice(&current_root_buf[0..DATABASE_HEADER_SIZE]);
self.handle_page_one_special_case(child_contents, offset);
}
// point new root right child to previous root
new_root_page_contents
.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, new_root_page_id as u32);
new_root_page_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
}

/* swap splitted page buffer with new root buffer so we don't have to update page idx */
{
let (root_id, child_id, child) = {
let page_ref = self.stack.top();
let child = page_ref.clone();

// Swap the entire Page structs
std::mem::swap(&mut child.get().id, &mut new_root_page.get().id);
// TODO:: shift bytes by offset to left on child because now child has offset 100
// and header bytes
// Also change the offset of page
//
if is_page_1 {
// Remove header from child and set offset to 0
let contents = child.get().contents.as_mut().unwrap();
let (cell_pointer_offset, _) = contents.cell_pointer_array_offset_and_size();
// change cell pointers
for cell_idx in 0..contents.cell_count() {
let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx) - offset;
let pc = contents.read_u16(cell_pointer_offset);
contents.write_u16(cell_pointer_offset, pc - offset as u16);
}
self.pager.add_dirty(child_page_id);

contents.offset = 0;
let buf = contents.as_ptr();
buf.copy_within(DATABASE_HEADER_SIZE.., 0);
}
// Update page stack to include both root and child
self.stack.clear();
self.stack.push(current_root.clone());
self.stack.push(child_page.clone());
Ok(CursorResult::Ok(()))
} else if self.is_underflow(contents) {
let child_page_id = contents.rightmost_pointer().unwrap();
let child_page = self.pager.read_page(child_page_id as usize)?;

return_if_locked!(child_page);
if !child_page.is_loaded() {
self.pager.load_page(child_page.clone())?;
return Ok(CursorResult::IO);
}

self.pager.add_dirty(new_root_page.get().id);
self.pager.add_dirty(child.get().id);
(new_root_page.get().id, child.get().id, child)
let child_contents = child_page.get().contents.as_mut().unwrap();

// Defragment child before inserting its cells into root because root is smaller than child due to it
// containing header.
self.defragment_page(child_contents, self.database_header.borrow());

let necessary_space = self.calculate_used_space(child_contents);
let available_space = if is_page_1 {
self.usable_space() - DATABASE_HEADER_SIZE
} else {
self.usable_space()
};

debug!("Balancing root. root={}, rightmost={}", root_id, child_id);
let root = new_root_page.clone();
// If root can't consume child, leave it empty temporarily and handle it in balance_leaf step
if available_space < necessary_space {
return Ok(CursorResult::Ok(()));
}

self.root_page = root_id;
self.stack.clear();
self.stack.push(root.clone());
self.stack.push(child.clone());
let child_rightmost_pointer = child_contents.rightmost_pointer();

self.pager.put_loaded_page(root_id, root);
self.pager.put_loaded_page(child_id, child);
for idx in 0..child_contents.cell_count() {
let (start, len) = child_contents.cell_get_raw_region(
idx,
self.payload_overflow_threshold_max(child_contents.page_type()),
self.payload_overflow_threshold_min(child_contents.page_type()),
self.usable_space(),
);
let buf = child_contents.as_ptr();
self.insert_into_cell(contents, &buf[start..start + len], idx);
}

// TODO: free the child page by adding it to the free list. Functionality has to be added to pager.rs first.

contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
if !contents.is_leaf() {
contents.write_u32(
PAGE_HEADER_OFFSET_RIGHTMOST_PTR,
child_rightmost_pointer.unwrap_or(0),
);
}
Ok(CursorResult::Ok(()))
} else {
unreachable!("balance_root was called where we didn't have any overflow or underflow");
}
}

// Helper function to handle page 1's special case
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should at least describe what the special case is and why it occurs

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rename the function to hint at what the special case is

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not mentioned in SQLite. The overflow case is almost copied from previous balance_root implementation which handled only overflow case.

balance_deeper in SQLite handles the root overflow case. But it works a bit differently. It allocates a new child page and makes it right-child page and then it copies the contents into the child page. Then balance_nonroot will handle the child page overflow by splitting the page.

You can find balance_deeper docs and implementation here: https://github.com/sqlite/sqlite/blob/master/src/btree.c#L8940-L9004

fn handle_page_one_special_case(&self, contents: &mut PageContent, offset: usize) {
let (cell_pointer_offset, _) = contents.cell_pointer_array_offset_and_size();

// Update cell pointers to account for removed header
for cell_idx in 0..contents.cell_count() {
let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx) - offset;
let pc = contents.read_u16(cell_pointer_offset);
contents.write_u16(cell_pointer_offset, pc - offset as u16);
}

contents.offset = 0;
let buf = contents.as_ptr();
buf.copy_within(DATABASE_HEADER_SIZE.., 0);
}

/// Allocate a new page to the btree via the pager.
/// This marks the page as dirty and writes the page header.
fn allocate_page(&self, page_type: PageType, offset: usize) -> PageRef {
Expand Down Expand Up @@ -1645,13 +1741,17 @@ impl BTreeCursor {
fn find_cell(&self, page: &PageContent, int_key: u64) -> usize {
let mut cell_idx = 0;
let cell_count = page.cell_count();
let page_type = match page.maybe_page_type() {
Some(pt) => pt,
None => return 0, // Return 0 for overflow pages
};
while cell_idx < cell_count {
match page
.cell_get(
cell_idx,
self.pager.clone(),
self.payload_overflow_threshold_max(page.page_type()),
self.payload_overflow_threshold_min(page.page_type()),
self.payload_overflow_threshold_max(page_type),
self.payload_overflow_threshold_min(page_type),
self.usable_space(),
)
.unwrap()
Expand Down
39 changes: 25 additions & 14 deletions core/storage/sqlite3_ondisk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) {
}

#[repr(u8)]
#[derive(Debug, PartialEq, Clone)]
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum PageType {
IndexInterior = 2,
TableInterior = 5,
Expand Down Expand Up @@ -421,9 +421,13 @@ impl PageContent {
}

pub fn maybe_page_type(&self) -> Option<PageType> {
match self.read_u8(0).try_into() {
Ok(v) => Some(v),
Err(_) => None, // this could be an overflow page
let page_type = self.read_u8(0);
match page_type {
2 => Some(PageType::IndexInterior),
5 => Some(PageType::TableInterior),
10 => Some(PageType::IndexLeaf),
13 => Some(PageType::TableLeaf),
_ => None, // This handles overflow pages and any other non-btree page types
}
}

Expand Down Expand Up @@ -534,11 +538,14 @@ impl PageContent {
}

pub fn rightmost_pointer(&self) -> Option<u32> {
match self.page_type() {
PageType::IndexInterior => Some(self.read_u32(8)),
PageType::TableInterior => Some(self.read_u32(8)),
PageType::IndexLeaf => None,
PageType::TableLeaf => None,
match self.maybe_page_type() {
Some(page_type) => match page_type {
PageType::IndexInterior => Some(self.read_u32(8)),
PageType::TableInterior => Some(self.read_u32(8)),
PageType::IndexLeaf => None,
PageType::TableLeaf => None,
},
None => None, // Handle overflow pages by returning None
}
}

Expand Down Expand Up @@ -650,11 +657,15 @@ impl PageContent {
}

pub fn is_leaf(&self) -> bool {
match self.page_type() {
PageType::IndexInterior => false,
PageType::TableInterior => false,
PageType::IndexLeaf => true,
PageType::TableLeaf => true,
// First check if this is a btree page by looking at the type
match self.maybe_page_type() {
Some(page_type) => match page_type {
PageType::IndexInterior => false,
PageType::TableInterior => false,
PageType::IndexLeaf => true,
PageType::TableLeaf => true,
},
None => false, // Overflow pages are not leaf pages
}
}

Expand Down
Loading