Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extended balance_root to handle root underflow case #621

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
258 changes: 188 additions & 70 deletions core/storage/btree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,9 @@ impl BTreeCursor {
continue;
}
None => {
unreachable!("we shall not go back up! The only way is down the slope");
// We've hit either a leaf page or an overflow page
// Either way, we're done traversing
return Ok(CursorResult::Ok(()));
}
}
}
Expand Down Expand Up @@ -699,10 +701,15 @@ impl BTreeCursor {
self.pager.add_dirty(page.get().id);

let page = page.get().contents.as_mut().unwrap();
assert!(matches!(page.page_type(), PageType::TableLeaf));

let page_type = match page.maybe_page_type() {
Some(pt) => pt,
None => return Ok(CursorResult::Ok(())), // Handle overflow pages
};
assert!(matches!(page_type, PageType::TableLeaf));

// find cell
(self.find_cell(page, int_key), page.page_type())
(self.find_cell(page, int_key), page_type)
};

// TODO: if overwrite drop cell
Expand Down Expand Up @@ -1191,83 +1198,191 @@ impl BTreeCursor {
}
}

/// Balance the root page.
/// This is done when the root page overflows, and we need to create a new root page.
/// See e.g. https://en.wikipedia.org/wiki/B-tree
fn balance_root(&mut self) {
/* todo: balance deeper, create child and copy contents of root there. Then split root */
/* if we are in root page then we just need to create a new root and push key there */
fn calculate_used_space(&self, page: &PageContent) -> usize {
let mut total_used = 0;
total_used += page.header_size();
let (_, cell_array_size) = page.cell_pointer_array_offset_and_size();
total_used += cell_array_size;
for cell_idx in 0..page.cell_count() {
let (_, cell_len) = page.cell_get_raw_region(
cell_idx,
self.payload_overflow_threshold_max(page.page_type()),
self.payload_overflow_threshold_min(page.page_type()),
self.usable_space(),
);
total_used += cell_len;
}
for overflow_cell in &page.overflow_cells {
total_used += overflow_cell.payload.len();
}
total_used -= page.num_frag_free_bytes() as usize;

let is_page_1 = {
let current_root = self.stack.top();
current_root.get().id == 1
};
total_used
}

fn estimate_cell_pointer_growth(&self) -> usize {
// In SQLite, we want to leave some space for future cell pointers
// Each cell pointer takes 2 bytes
const CELL_POINTER_SIZE: usize = 2;
const ESTIMATED_FUTURE_CELLS: usize = 4; // Leave room for a few more cells
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this come from somewhere, e.g. sqlite source code? If so, good to leave a link as comment

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've removed this function to keep this PR close handling just the root and for checking root overflow, we just need to check if there the page contains any overflow cells. SQLite here: https://github.com/sqlite/sqlite/blob/c2e400af042bdd7d21e159a41fcf34c05398044c/src/btree.c#L9059-L9060


CELL_POINTER_SIZE * ESTIMATED_FUTURE_CELLS
}

fn is_overflow(&self, page: &PageContent) -> bool {
if !page.overflow_cells.is_empty() {
return true;
}

let used_space = self.calculate_used_space(page);
let available_space = self.usable_space();

// Check if we're about to exceed usable space
// We also need some buffer space for future cell pointers
let required_space = used_space + self.estimate_cell_pointer_growth();
required_space > available_space
}

fn is_underflow(&self, page: &PageContent) -> bool {
// Root is special case - only underflow when empty with one child
let current_page = self.stack.top();
if current_page.get().id == self.root_page {
return page.cell_count() == 0 && !page.is_leaf();
}

let used_space = self.calculate_used_space(page);
let usable_space = self.usable_space();

// Underflow if used space is less than 50% of usable space
used_space < (usable_space / 2)
}

fn balance_root(&mut self) {
let current_root = self.stack.top();
let contents = current_root.get().contents.as_mut().unwrap();
let is_page_1 = current_root.get().id == 1;
let offset = if is_page_1 { DATABASE_HEADER_SIZE } else { 0 };
let new_root_page = self.allocate_page(PageType::TableInterior, offset);
{
let current_root = self.stack.top();
let current_root_contents = current_root.get().contents.as_ref().unwrap();

let new_root_page_id = new_root_page.get().id;
let new_root_page_contents = new_root_page.get().contents.as_mut().unwrap();
if is_page_1 {
// Copy header
let current_root_buf = current_root_contents.as_ptr();
let new_root_buf = new_root_page_contents.as_ptr();
new_root_buf[0..DATABASE_HEADER_SIZE]
.copy_from_slice(&current_root_buf[0..DATABASE_HEADER_SIZE]);
if self.is_underflow(contents) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe do:

if self.is_overflow(contents) {
...
} else if self.is_underflow(contents) {
...
} else {
unreachable!("balance_root was called where we didn't have any overflow or underflow")
}

let child_page_id = contents.rightmost_pointer().unwrap();
let child_page = self
.pager
.read_page(child_page_id as usize)
.expect("This shouldn't have happened, child page not found");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to propagate errors

Suggested change
let child_page = self
.pager
.read_page(child_page_id as usize)
.expect("This shouldn't have happened, child page not found");
let child_page = self
.pager
.read_page(child_page_id as usize)?;

And I think we are not handling the case where child page is not loaded/locked?

let child_contents = child_page.get().contents.as_mut().unwrap();

// Defragment child before inserting its cells into root because root is smaller than child due to it
// containing header.
self.defragment_page(child_contents, self.database_header.borrow());

let necessary_space = self.calculate_used_space(child_contents);
let available_space = if is_page_1 {
self.usable_space() - DATABASE_HEADER_SIZE
} else {
self.usable_space()
};

// If root can't consume child, leave it empty temporarily and handle it in balance_leaf step
if available_space < necessary_space {
return;
}

let grandchild_ptr = child_contents.rightmost_pointer();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rename grandchild_ptr -> child_rightmost_pointer? There could be a lot of grand childs so it this feels more exact.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moreover, if we are balancing root because it doesn't have any more cells, why do we expect the child to not be a leaf page? I mean, if you take rightmost pointer it means that child is a interior page, therefore, the tree is not balanced.


let mut cells = Vec::new();
for idx in 0..child_contents.cell_count() {
let (start, len) = child_contents.cell_get_raw_region(
idx,
self.payload_overflow_threshold_max(child_contents.page_type()),
self.payload_overflow_threshold_min(child_contents.page_type()),
self.usable_space(),
);
let buf = child_contents.as_ptr();
cells.push(buf[start..start + len].to_vec());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we insert_into_cell here instead of cloning vecs?

}

// TODO: free the child page by adding it to the free list. Functionality has to be added to pager.rs

contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
if !contents.is_leaf() {
contents.write_u32(
PAGE_HEADER_OFFSET_RIGHTMOST_PTR,
grandchild_ptr.unwrap_or(0),
);
}

// Insert cells into root
for (idx, cell) in cells.iter().enumerate() {
self.insert_into_cell(contents, cell, idx);
}
// point new root right child to previous root
new_root_page_contents
.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, new_root_page_id as u32);
new_root_page_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
}

/* swap splitted page buffer with new root buffer so we don't have to update page idx */
{
let (root_id, child_id, child) = {
let page_ref = self.stack.top();
let child = page_ref.clone();

// Swap the entire Page structs
std::mem::swap(&mut child.get().id, &mut new_root_page.get().id);
// TODO:: shift bytes by offset to left on child because now child has offset 100
// and header bytes
// Also change the offset of page
//
if is_page_1 {
// Remove header from child and set offset to 0
let contents = child.get().contents.as_mut().unwrap();
let (cell_pointer_offset, _) = contents.cell_pointer_array_offset_and_size();
// change cell pointers
for cell_idx in 0..contents.cell_count() {
let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx) - offset;
let pc = contents.read_u16(cell_pointer_offset);
contents.write_u16(cell_pointer_offset, pc - offset as u16);
}
if self.is_overflow(contents) {
// Remember original page type - the child will inherit this
let original_type = contents.page_type();

contents.offset = 0;
let buf = contents.as_ptr();
buf.copy_within(DATABASE_HEADER_SIZE.., 0);
}
// Create new child page with root's original type
let child_page = self.allocate_page(original_type, 0);
let child_page_id = child_page.get().id;
let child_contents = child_page.get().contents.as_mut().unwrap();

self.pager.add_dirty(new_root_page.get().id);
self.pager.add_dirty(child.get().id);
(new_root_page.get().id, child.get().id, child)
};
// Move all cells and right pointer from root to child
let mut cells = Vec::new();
for idx in 0..contents.cell_count() {
let (start, len) = contents.cell_get_raw_region(
idx,
self.payload_overflow_threshold_max(contents.page_type()),
self.payload_overflow_threshold_min(contents.page_type()),
self.usable_space(),
);
let buf = contents.as_ptr();
cells.push(buf[start..start + len].to_vec());
}
let right_pointer = contents.rightmost_pointer();

// Clear root - IMPORTANT: Set type BEFORE clearing contents
contents.write_u8(PAGE_HEADER_OFFSET_PAGE_TYPE, PageType::TableInterior as u8);
contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
contents.write_u16(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);

// Initialize child with root's old contents
for (idx, cell) in cells.iter().enumerate() {
self.insert_into_cell(child_contents, cell, idx);
}
if !child_contents.is_leaf() {
child_contents
.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, right_pointer.unwrap_or(0));
}

// Handle page 1 special case
if is_page_1 {
self.handle_page_one_special_case(child_contents, offset);
}

debug!("Balancing root. root={}, rightmost={}", root_id, child_id);
let root = new_root_page.clone();
self.pager.add_dirty(child_page_id);

self.root_page = root_id;
// Update page stack to include both root and child
self.stack.clear();
self.stack.push(root.clone());
self.stack.push(child.clone());
self.stack.push(current_root.clone());
self.stack.push(child_page.clone());
}
}

// Helper function to handle page 1's special case
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should at least describe what the special case is and why it occurs

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rename the function to hint at what the special case is

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not mentioned in SQLite. The overflow case is almost copied from previous balance_root implementation which handled only overflow case.

balance_deeper in SQLite handles the root overflow case. But it works a bit differently. It allocates a new child page and makes it right-child page and then it copies the contents into the child page. Then balance_nonroot will handle the child page overflow by splitting the page.

You can find balance_deeper docs and implementation here: https://github.com/sqlite/sqlite/blob/master/src/btree.c#L8940-L9004

fn handle_page_one_special_case(&self, contents: &mut PageContent, offset: usize) {
let (cell_pointer_offset, _) = contents.cell_pointer_array_offset_and_size();

self.pager.put_loaded_page(root_id, root);
self.pager.put_loaded_page(child_id, child);
// Update cell pointers to account for removed header
for cell_idx in 0..contents.cell_count() {
let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx) - offset;
let pc = contents.read_u16(cell_pointer_offset);
contents.write_u16(cell_pointer_offset, pc - offset as u16);
}

contents.offset = 0;
let buf = contents.as_ptr();
buf.copy_within(DATABASE_HEADER_SIZE.., 0);
}

/// Allocate a new page to the btree via the pager.
Expand All @@ -1285,8 +1400,7 @@ impl BTreeCursor {

// setup overflow page
let contents = page.get().contents.as_mut().unwrap();
let buf = contents.as_ptr();
buf.fill(0);
contents.write_u32(0, 0); // Initialize next overflow page pointer to 0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why 0, can we use one of the constants? This 0 means PAGE_HEADER_OFFSET_PAGE_TYPE, are you sure this is what this 0 write means?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, what I did is wrong. Will fix it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pereman2 can you please tell me what's happening here?: https://github.com/sqlite/sqlite/blob/c2e400af042bdd7d21e159a41fcf34c05398044c/src/btree.c#L7166-L7172

This is my understanding:

// Writes the page number of the new overflow page (pgnoOvfl) to pPrior. For the first overflow page, pPrior points to the end of the local cell content.
put4byte(pPrior, pgnoOvfl);

// ignore this
releasePage(pToRelease);
pToRelease = pOvfl;

// Points pPrior to the start of the new overflow page's data area
pPrior = pOvfl->aData;

// Writes 0 at the start of the overflow page, which shows its the last overflow page
put4byte(pPrior, 0);

// Sets pPayload to point past the 4-byte pointer. This is where the actual data will be stored in the overflow page
pPayload = &pOvfl->aData[4];

//Calculates remaining space for data in this overflow page. Usable space minus 4 bytes used for the next-page pointer
spaceLeft = pBt->usableSize - 4;

Where are we doing put4byte(pPrior, 0); equivalent in limbo?


page
}
Expand Down Expand Up @@ -1645,13 +1759,17 @@ impl BTreeCursor {
fn find_cell(&self, page: &PageContent, int_key: u64) -> usize {
let mut cell_idx = 0;
let cell_count = page.cell_count();
let page_type = match page.maybe_page_type() {
Some(pt) => pt,
None => return 0, // Return 0 for overflow pages
};
while cell_idx < cell_count {
match page
.cell_get(
cell_idx,
self.pager.clone(),
self.payload_overflow_threshold_max(page.page_type()),
self.payload_overflow_threshold_min(page.page_type()),
self.payload_overflow_threshold_max(page_type),
self.payload_overflow_threshold_min(page_type),
self.usable_space(),
)
.unwrap()
Expand Down
39 changes: 25 additions & 14 deletions core/storage/sqlite3_ondisk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) {
}

#[repr(u8)]
#[derive(Debug, PartialEq, Clone)]
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum PageType {
IndexInterior = 2,
TableInterior = 5,
Expand Down Expand Up @@ -421,9 +421,13 @@ impl PageContent {
}

pub fn maybe_page_type(&self) -> Option<PageType> {
match self.read_u8(0).try_into() {
Ok(v) => Some(v),
Err(_) => None, // this could be an overflow page
let page_type = self.read_u8(0);
match page_type {
2 => Some(PageType::IndexInterior),
5 => Some(PageType::TableInterior),
10 => Some(PageType::IndexLeaf),
13 => Some(PageType::TableLeaf),
_ => None, // This handles overflow pages and any other non-btree page types
}
}

Expand Down Expand Up @@ -534,11 +538,14 @@ impl PageContent {
}

pub fn rightmost_pointer(&self) -> Option<u32> {
match self.page_type() {
PageType::IndexInterior => Some(self.read_u32(8)),
PageType::TableInterior => Some(self.read_u32(8)),
PageType::IndexLeaf => None,
PageType::TableLeaf => None,
match self.maybe_page_type() {
Some(page_type) => match page_type {
PageType::IndexInterior => Some(self.read_u32(8)),
PageType::TableInterior => Some(self.read_u32(8)),
PageType::IndexLeaf => None,
PageType::TableLeaf => None,
},
None => None, // Handle overflow pages by returning None
}
}

Expand Down Expand Up @@ -650,11 +657,15 @@ impl PageContent {
}

pub fn is_leaf(&self) -> bool {
match self.page_type() {
PageType::IndexInterior => false,
PageType::TableInterior => false,
PageType::IndexLeaf => true,
PageType::TableLeaf => true,
// First check if this is a btree page by looking at the type
match self.maybe_page_type() {
Some(page_type) => match page_type {
PageType::IndexInterior => false,
PageType::TableInterior => false,
PageType::IndexLeaf => true,
PageType::TableLeaf => true,
},
None => false, // Overflow pages are not leaf pages
}
}

Expand Down
Loading