From 72064f9403b8aba323105857fa88f7fc9e7388af Mon Sep 17 00:00:00 2001 From: Thomas Jungblut Date: Tue, 25 Jun 2024 16:11:06 +0200 Subject: [PATCH] introduce a freelist interface This introduces an interface for the freelist, splits it into two concrete implementations. fixes #773 Signed-off-by: Thomas Jungblut --- allocate_test.go | 7 +- bucket.go | 2 +- bucket_test.go | 3 + cmd/bbolt/command_version.go | 1 + concurrent_test.go | 18 +- db.go | 44 +- freelist.go | 418 --------------- freelist_test.go | 485 ------------------ internal/btesting/btesting.go | 8 +- internal/freelist/array.go | 116 +++++ internal/freelist/array_test.go | 52 ++ internal/freelist/freelist.go | 108 ++++ internal/freelist/freelist_test.go | 236 +++++++++ .../freelist/hashmap.go | 198 +++---- internal/freelist/hashmap_test.go | 33 ++ internal/freelist/serde.go | 79 +++ internal/freelist/serde_test.go | 55 ++ internal/freelist/shared.go | 158 ++++++ node.go | 4 +- tx.go | 30 +- tx_check.go | 6 +- 21 files changed, 1008 insertions(+), 1053 deletions(-) delete mode 100644 freelist.go delete mode 100644 freelist_test.go create mode 100644 internal/freelist/array.go create mode 100644 internal/freelist/array_test.go create mode 100644 internal/freelist/freelist.go create mode 100644 internal/freelist/freelist_test.go rename freelist_hmap.go => internal/freelist/hashmap.go (75%) create mode 100644 internal/freelist/hashmap_test.go create mode 100644 internal/freelist/serde.go create mode 100644 internal/freelist/serde_test.go create mode 100644 internal/freelist/shared.go diff --git a/allocate_test.go b/allocate_test.go index 9f08be1cf..81dd381c1 100644 --- a/allocate_test.go +++ b/allocate_test.go @@ -4,12 +4,13 @@ import ( "testing" "go.etcd.io/bbolt/internal/common" + "go.etcd.io/bbolt/internal/freelist" ) func TestTx_allocatePageStats(t *testing.T) { - f := newTestFreelist() + f := freelist.NewFreelist(freelist.FreelistArrayType) ids := []common.Pgid{2, 3} - f.readIDs(ids) + f.Init(ids) tx := &Tx{ db: &DB{ @@ -22,7 +23,7 @@ func TestTx_allocatePageStats(t *testing.T) { txStats := tx.Stats() prePageCnt := txStats.GetPageCount() - allocateCnt := f.free_count() + allocateCnt := f.FreeCount() if _, err := tx.allocate(allocateCnt); err != nil { t.Fatal(err) diff --git a/bucket.go b/bucket.go index 785ad9bd5..6371ace97 100644 --- a/bucket.go +++ b/bucket.go @@ -903,7 +903,7 @@ func (b *Bucket) free() { var tx = b.tx b.forEachPageNode(func(p *common.Page, n *node, _ int) { if p != nil { - tx.db.freelist.free(tx.meta.Txid(), p) + tx.db.freelist.Free(tx.meta.Txid(), p) } else { n.free() } diff --git a/bucket_test.go b/bucket_test.go index 3255e7b89..493d133a7 100644 --- a/bucket_test.go +++ b/bucket_test.go @@ -430,6 +430,9 @@ func TestBucket_Delete_FreelistOverflow(t *testing.T) { if reopenFreePages := db.Stats().FreePageN; freePages != reopenFreePages { t.Fatalf("expected %d free pages, got %+v", freePages, db.Stats()) } + if reopenPendingPages := db.Stats().PendingPageN; reopenPendingPages != 0 { + t.Fatalf("expected no pending pages, got %+v", db.Stats()) + } } // Ensure that deleting of non-existing key is a no-op. 
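For orientation before the remaining hunks: a minimal usage sketch of the freelist API this patch introduces. It is illustrative only and not part of the patch; the calls mirror the Freelist interface and the test usage added below (NewFreelist, Init, Allocate, Free, Release, FreeCount, PendingCount), the common.NewPage arguments follow the tests, and since internal/freelist and internal/common are internal packages this only builds from inside the bbolt module (the package main wrapper here is assumed for illustration).

package main

import (
	"fmt"

	"go.etcd.io/bbolt/internal/common"
	"go.etcd.io/bbolt/internal/freelist"
)

func main() {
	// Both backends satisfy the same freelist.Freelist interface; the
	// concrete implementation is chosen by FreelistType ("array" or "hashmap").
	f := freelist.NewFreelist(freelist.FreelistArrayType)

	// Seed the freelist with a sorted list of free page ids
	// (page ids 0 and 1 are reserved for meta pages).
	f.Init([]common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18})

	// Allocate a contiguous run of 3 pages for txid 1; Allocate returns the
	// starting page id, or 0 if no contiguous block of that size exists.
	start := f.Allocate(1, 3)
	fmt.Println("allocated at", start, "free pages left:", f.FreeCount())

	// Free the page under txid 100; it is held as "pending" until Release
	// moves pages freed by txid 100 (or older) back onto the free list.
	f.Free(100, common.NewPage(start, 0, 0, 0))
	fmt.Println("pending pages:", f.PendingCount())

	f.Release(100)
	fmt.Println("free:", f.FreeCount(), "pending:", f.PendingCount())
}
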
diff --git a/cmd/bbolt/command_version.go b/cmd/bbolt/command_version.go index 73019c798..39d756bd9 100644 --- a/cmd/bbolt/command_version.go +++ b/cmd/bbolt/command_version.go @@ -5,6 +5,7 @@ import ( "runtime" "github.com/spf13/cobra" + "go.etcd.io/bbolt/version" ) diff --git a/concurrent_test.go b/concurrent_test.go index 10f1a2f8e..98260730a 100644 --- a/concurrent_test.go +++ b/concurrent_test.go @@ -17,6 +17,8 @@ import ( "time" "unicode/utf8" + "go.etcd.io/bbolt/internal/freelist" + "github.com/stretchr/testify/require" "golang.org/x/sync/errgroup" @@ -235,9 +237,9 @@ func mustOpenDB(t *testing.T, dbPath string, o *bolt.Options) *bolt.DB { o = bolt.DefaultOptions } - freelistType := bolt.FreelistArrayType - if env := os.Getenv("TEST_FREELIST_TYPE"); env == string(bolt.FreelistMapType) { - freelistType = bolt.FreelistMapType + freelistType := freelist.FreelistArrayType + if env := os.Getenv("TEST_FREELIST_TYPE"); env == string(freelist.FreelistMapType) { + freelistType = freelist.FreelistMapType } o.FreelistType = freelistType @@ -767,29 +769,29 @@ func TestConcurrentRepeatableRead(t *testing.T) { testCases := []struct { name string noFreelistSync bool - freelistType bolt.FreelistType + freelistType freelist.FreelistType }{ // [array] freelist { name: "sync array freelist", noFreelistSync: false, - freelistType: bolt.FreelistArrayType, + freelistType: freelist.FreelistArrayType, }, { name: "not sync array freelist", noFreelistSync: true, - freelistType: bolt.FreelistArrayType, + freelistType: freelist.FreelistArrayType, }, // [map] freelist { name: "sync map freelist", noFreelistSync: false, - freelistType: bolt.FreelistMapType, + freelistType: freelist.FreelistMapType, }, { name: "not sync map freelist", noFreelistSync: true, - freelistType: bolt.FreelistMapType, + freelistType: freelist.FreelistMapType, }, } diff --git a/db.go b/db.go index cd3c5b0ba..d0afa0e74 100644 --- a/db.go +++ b/db.go @@ -13,25 +13,12 @@ import ( berrors "go.etcd.io/bbolt/errors" "go.etcd.io/bbolt/internal/common" + fl "go.etcd.io/bbolt/internal/freelist" ) // The time elapsed between consecutive file locking attempts. const flockRetryTimeout = 50 * time.Millisecond -// FreelistType is the type of the freelist backend -type FreelistType string - -// TODO(ahrtr): eventually we should (step by step) -// 1. default to `FreelistMapType`; -// 2. remove the `FreelistArrayType`, do not export `FreelistMapType` -// and remove field `FreelistType' from both `DB` and `Options`; -const ( - // FreelistArrayType indicates backend freelist type is array - FreelistArrayType = FreelistType("array") - // FreelistMapType indicates backend freelist type is hashmap - FreelistMapType = FreelistType("hashmap") -) - // DB represents a collection of buckets persisted to a file on disk. // All data access is performed through transactions which can be obtained through the DB. // All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called. @@ -70,7 +57,7 @@ type DB struct { // The alternative one is using hashmap, it is faster in almost all circumstances // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. // The default type is array - FreelistType FreelistType + FreelistType fl.FreelistType // When true, skips the truncate call when growing the database. // Setting this to true is only safe on non-ext3/ext4 systems. 
@@ -134,8 +121,9 @@ type DB struct { rwtx *Tx txs []*Tx - freelist *freelist - freelistLoad sync.Once + freelist fl.Freelist + freelistSerializer fl.Serializable + freelistLoad sync.Once pagePool sync.Pool @@ -190,6 +178,7 @@ func Open(path string, mode os.FileMode, options *Options) (db *DB, err error) { db.NoFreelistSync = options.NoFreelistSync db.PreLoadFreelist = options.PreLoadFreelist db.FreelistType = options.FreelistType + db.freelistSerializer = fl.Serializer{} db.Mlock = options.Mlock // Set default values for later DB operations. @@ -416,15 +405,16 @@ func (db *DB) getPageSizeFromSecondMeta() (int, bool, error) { // concurrent accesses being made to the freelist. func (db *DB) loadFreelist() { db.freelistLoad.Do(func() { - db.freelist = newFreelist(db.FreelistType) + db.freelist = fl.NewFreelist(db.FreelistType) if !db.hasSyncedFreelist() { // Reconstruct free list by scanning the DB. - db.freelist.readIDs(db.freepages()) + db.freelist.Init(db.freepages()) } else { // Read free list from freelist page. - db.freelist.read(db.page(db.meta().Freelist())) + db.freelistSerializer.Read(db.freelist, db.page(db.meta().Freelist())) } - db.stats.FreePageN = db.freelist.free_count() + db.stats.FreePageN = db.freelist.FreeCount() + db.stats.PendingPageN = db.freelist.PendingCount() }) } @@ -854,14 +844,14 @@ func (db *DB) freePages() { minid = db.txs[0].meta.Txid() } if minid > 0 { - db.freelist.release(minid - 1) + db.freelist.Release(minid - 1) } // Release unused txid extents. for _, t := range db.txs { - db.freelist.releaseRange(minid, t.meta.Txid()-1) + db.freelist.ReleaseRange(minid, t.meta.Txid()-1) minid = t.meta.Txid() + 1 } - db.freelist.releaseRange(minid, common.Txid(0xFFFFFFFFFFFFFFFF)) + db.freelist.ReleaseRange(minid, common.Txid(0xFFFFFFFFFFFFFFFF)) // Any page both allocated and freed in an extent is safe to release. } @@ -1176,7 +1166,7 @@ func (db *DB) allocate(txid common.Txid, count int) (*common.Page, error) { p.SetOverflow(uint32(count - 1)) // Use pages from the freelist if they are available. - p.SetId(db.freelist.allocate(txid, count)) + p.SetId(db.freelist.Allocate(txid, count)) if p.Id() != 0 { return p, nil } @@ -1305,7 +1295,7 @@ type Options struct { // The alternative one is using hashmap, it is faster in almost all circumstances // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. // The default type is array - FreelistType FreelistType + FreelistType fl.FreelistType // Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to // grab a shared lock (UNIX). @@ -1360,7 +1350,7 @@ func (o *Options) String() string { var DefaultOptions = &Options{ Timeout: 0, NoGrowSync: false, - FreelistType: FreelistArrayType, + FreelistType: fl.FreelistArrayType, } // Stats represents statistics about the database. diff --git a/freelist.go b/freelist.go deleted file mode 100644 index 5bbc27445..000000000 --- a/freelist.go +++ /dev/null @@ -1,418 +0,0 @@ -package bbolt - -import ( - "fmt" - "sort" - "unsafe" - - "go.etcd.io/bbolt/internal/common" -) - -// txPending holds a list of pgids and corresponding allocation txns -// that are pending to be freed. 
-type txPending struct { - ids []common.Pgid - alloctx []common.Txid // txids allocating the ids - lastReleaseBegin common.Txid // beginning txid of last matching releaseRange -} - -// pidSet holds the set of starting pgids which have the same span size -type pidSet map[common.Pgid]struct{} - -// freelist represents a list of all pages that are available for allocation. -// It also tracks pages that have been freed but are still in use by open transactions. -type freelist struct { - freelistType FreelistType // freelist type - ids []common.Pgid // all free and available free page ids. - allocs map[common.Pgid]common.Txid // mapping of Txid that allocated a pgid. - pending map[common.Txid]*txPending // mapping of soon-to-be free page ids by tx. - cache map[common.Pgid]struct{} // fast lookup of all free and pending page ids. - freemaps map[uint64]pidSet // key is the size of continuous pages(span), value is a set which contains the starting pgids of same size - forwardMap map[common.Pgid]uint64 // key is start pgid, value is its span size - backwardMap map[common.Pgid]uint64 // key is end pgid, value is its span size - freePagesCount uint64 // count of free pages(hashmap version) - allocate func(txid common.Txid, n int) common.Pgid // the freelist allocate func - free_count func() int // the function which gives you free page number - mergeSpans func(ids common.Pgids) // the mergeSpan func - getFreePageIDs func() []common.Pgid // get free pgids func - readIDs func(pgids []common.Pgid) // readIDs func reads list of pages and init the freelist -} - -// newFreelist returns an empty, initialized freelist. -func newFreelist(freelistType FreelistType) *freelist { - f := &freelist{ - freelistType: freelistType, - allocs: make(map[common.Pgid]common.Txid), - pending: make(map[common.Txid]*txPending), - cache: make(map[common.Pgid]struct{}), - freemaps: make(map[uint64]pidSet), - forwardMap: make(map[common.Pgid]uint64), - backwardMap: make(map[common.Pgid]uint64), - } - - if freelistType == FreelistMapType { - f.allocate = f.hashmapAllocate - f.free_count = f.hashmapFreeCount - f.mergeSpans = f.hashmapMergeSpans - f.getFreePageIDs = f.hashmapGetFreePageIDs - f.readIDs = f.hashmapReadIDs - } else { - f.allocate = f.arrayAllocate - f.free_count = f.arrayFreeCount - f.mergeSpans = f.arrayMergeSpans - f.getFreePageIDs = f.arrayGetFreePageIDs - f.readIDs = f.arrayReadIDs - } - - return f -} - -// size returns the size of the page after serialization. -func (f *freelist) size() int { - n := f.count() - if n >= 0xFFFF { - // The first element will be used to store the count. See freelist.write. - n++ - } - return int(common.PageHeaderSize) + (int(unsafe.Sizeof(common.Pgid(0))) * n) -} - -// count returns count of pages on the freelist -func (f *freelist) count() int { - return f.free_count() + f.pending_count() -} - -// arrayFreeCount returns count of free pages(array version) -func (f *freelist) arrayFreeCount() int { - return len(f.ids) -} - -// pending_count returns count of pending pages -func (f *freelist) pending_count() int { - var count int - for _, txp := range f.pending { - count += len(txp.ids) - } - return count -} - -// copyall copies a list of all free ids and all pending ids in one sorted list. -// f.count returns the minimum length required for dst. -func (f *freelist) copyall(dst []common.Pgid) { - m := make(common.Pgids, 0, f.pending_count()) - for _, txp := range f.pending { - m = append(m, txp.ids...) 
- } - sort.Sort(m) - common.Mergepgids(dst, f.getFreePageIDs(), m) -} - -// arrayAllocate returns the starting page id of a contiguous list of pages of a given size. -// If a contiguous block cannot be found then 0 is returned. -func (f *freelist) arrayAllocate(txid common.Txid, n int) common.Pgid { - if len(f.ids) == 0 { - return 0 - } - - var initial, previd common.Pgid - for i, id := range f.ids { - if id <= 1 { - panic(fmt.Sprintf("invalid page allocation: %d", id)) - } - - // Reset initial page if this is not contiguous. - if previd == 0 || id-previd != 1 { - initial = id - } - - // If we found a contiguous block then remove it and return it. - if (id-initial)+1 == common.Pgid(n) { - // If we're allocating off the beginning then take the fast path - // and just adjust the existing slice. This will use extra memory - // temporarily but the append() in free() will realloc the slice - // as is necessary. - if (i + 1) == n { - f.ids = f.ids[i+1:] - } else { - copy(f.ids[i-n+1:], f.ids[i+1:]) - f.ids = f.ids[:len(f.ids)-n] - } - - // Remove from the free cache. - for i := common.Pgid(0); i < common.Pgid(n); i++ { - delete(f.cache, initial+i) - } - f.allocs[initial] = txid - return initial - } - - previd = id - } - return 0 -} - -// free releases a page and its overflow for a given transaction id. -// If the page is already free then a panic will occur. -func (f *freelist) free(txid common.Txid, p *common.Page) { - if p.Id() <= 1 { - panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.Id())) - } - - // Free page and all its overflow pages. - txp := f.pending[txid] - if txp == nil { - txp = &txPending{} - f.pending[txid] = txp - } - allocTxid, ok := f.allocs[p.Id()] - if ok { - delete(f.allocs, p.Id()) - } else if p.IsFreelistPage() { - // Freelist is always allocated by prior tx. - allocTxid = txid - 1 - } - - for id := p.Id(); id <= p.Id()+common.Pgid(p.Overflow()); id++ { - // Verify that page is not already free. - if _, ok := f.cache[id]; ok { - panic(fmt.Sprintf("page %d already freed", id)) - } - // Add to the freelist and cache. - txp.ids = append(txp.ids, id) - txp.alloctx = append(txp.alloctx, allocTxid) - f.cache[id] = struct{}{} - } -} - -// release moves all page ids for a transaction id (or older) to the freelist. -func (f *freelist) release(txid common.Txid) { - m := make(common.Pgids, 0) - for tid, txp := range f.pending { - if tid <= txid { - // Move transaction's pending pages to the available freelist. - // Don't remove from the cache since the page is still free. - m = append(m, txp.ids...) - delete(f.pending, tid) - } - } - f.mergeSpans(m) -} - -// releaseRange moves pending pages allocated within an extent [begin,end] to the free list. -func (f *freelist) releaseRange(begin, end common.Txid) { - if begin > end { - return - } - var m common.Pgids - for tid, txp := range f.pending { - if tid < begin || tid > end { - continue - } - // Don't recompute freed pages if ranges haven't updated. - if txp.lastReleaseBegin == begin { - continue - } - for i := 0; i < len(txp.ids); i++ { - if atx := txp.alloctx[i]; atx < begin || atx > end { - continue - } - m = append(m, txp.ids[i]) - txp.ids[i] = txp.ids[len(txp.ids)-1] - txp.ids = txp.ids[:len(txp.ids)-1] - txp.alloctx[i] = txp.alloctx[len(txp.alloctx)-1] - txp.alloctx = txp.alloctx[:len(txp.alloctx)-1] - i-- - } - txp.lastReleaseBegin = begin - if len(txp.ids) == 0 { - delete(f.pending, tid) - } - } - f.mergeSpans(m) -} - -// rollback removes the pages from a given pending tx. 
-func (f *freelist) rollback(txid common.Txid) { - // Remove page ids from cache. - txp := f.pending[txid] - if txp == nil { - return - } - var m common.Pgids - for i, pgid := range txp.ids { - delete(f.cache, pgid) - tx := txp.alloctx[i] - if tx == 0 { - continue - } - if tx != txid { - // Pending free aborted; restore page back to alloc list. - f.allocs[pgid] = tx - } else { - // Freed page was allocated by this txn; OK to throw away. - m = append(m, pgid) - } - } - // Remove pages from pending list and mark as free if allocated by txid. - delete(f.pending, txid) - f.mergeSpans(m) -} - -// freed returns whether a given page is in the free list. -func (f *freelist) freed(pgId common.Pgid) bool { - _, ok := f.cache[pgId] - return ok -} - -// read initializes the freelist from a freelist page. -func (f *freelist) read(p *common.Page) { - if !p.IsFreelistPage() { - panic(fmt.Sprintf("invalid freelist page: %d, page type is %s", p.Id(), p.Typ())) - } - - ids := p.FreelistPageIds() - - // Copy the list of page ids from the freelist. - if len(ids) == 0 { - f.ids = nil - } else { - // copy the ids, so we don't modify on the freelist page directly - idsCopy := make([]common.Pgid, len(ids)) - copy(idsCopy, ids) - // Make sure they're sorted. - sort.Sort(common.Pgids(idsCopy)) - - f.readIDs(idsCopy) - } -} - -// arrayReadIDs initializes the freelist from a given list of ids. -func (f *freelist) arrayReadIDs(ids []common.Pgid) { - f.ids = ids - f.reindex() -} - -func (f *freelist) arrayGetFreePageIDs() []common.Pgid { - return f.ids -} - -// write writes the page ids onto a freelist page. All free and pending ids are -// saved to disk since in the event of a program crash, all pending ids will -// become free. -func (f *freelist) write(p *common.Page) error { - // Combine the old free pgids and pgids waiting on an open transaction. - - // Update the header flag. - p.SetFlags(common.FreelistPageFlag) - - // The page.count can only hold up to 64k elements so if we overflow that - // number then we handle it by putting the size in the first element. - l := f.count() - if l == 0 { - p.SetCount(uint16(l)) - } else if l < 0xFFFF { - p.SetCount(uint16(l)) - data := common.UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) - ids := unsafe.Slice((*common.Pgid)(data), l) - f.copyall(ids) - } else { - p.SetCount(0xFFFF) - data := common.UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) - ids := unsafe.Slice((*common.Pgid)(data), l+1) - ids[0] = common.Pgid(l) - f.copyall(ids[1:]) - } - - return nil -} - -// reload reads the freelist from a page and filters out pending items. -func (f *freelist) reload(p *common.Page) { - f.read(p) - - // Build a cache of only pending pages. - pcache := make(map[common.Pgid]bool) - for _, txp := range f.pending { - for _, pendingID := range txp.ids { - pcache[pendingID] = true - } - } - - // Check each page in the freelist and build a new available freelist - // with any pages not in the pending lists. - var a []common.Pgid - for _, id := range f.getFreePageIDs() { - if !pcache[id] { - a = append(a, id) - } - } - - f.readIDs(a) -} - -// noSyncReload reads the freelist from Pgids and filters out pending items. -func (f *freelist) noSyncReload(Pgids []common.Pgid) { - // Build a cache of only pending pages. - pcache := make(map[common.Pgid]bool) - for _, txp := range f.pending { - for _, pendingID := range txp.ids { - pcache[pendingID] = true - } - } - - // Check each page in the freelist and build a new available freelist - // with any pages not in the pending lists. 
- var a []common.Pgid - for _, id := range Pgids { - if !pcache[id] { - a = append(a, id) - } - } - - f.readIDs(a) -} - -// reindex rebuilds the free cache based on available and pending free lists. -func (f *freelist) reindex() { - ids := f.getFreePageIDs() - f.cache = make(map[common.Pgid]struct{}, len(ids)) - for _, id := range ids { - f.cache[id] = struct{}{} - } - for _, txp := range f.pending { - for _, pendingID := range txp.ids { - f.cache[pendingID] = struct{}{} - } - } -} - -// arrayMergeSpans try to merge list of pages(represented by pgids) with existing spans but using array -func (f *freelist) arrayMergeSpans(ids common.Pgids) { - sort.Sort(ids) - common.Verify(func() { - idsIdx := make(map[common.Pgid]struct{}) - for _, id := range f.ids { - // The existing f.ids shouldn't have duplicated free ID. - if _, ok := idsIdx[id]; ok { - panic(fmt.Sprintf("detected duplicated free page ID: %d in existing f.ids: %v", id, f.ids)) - } - idsIdx[id] = struct{}{} - } - - prev := common.Pgid(0) - for _, id := range ids { - // The ids shouldn't have duplicated free ID. Note page 0 and 1 - // are reserved for meta pages, so they can never be free page IDs. - if prev == id { - panic(fmt.Sprintf("detected duplicated free ID: %d in ids: %v", id, ids)) - } - prev = id - - // The ids shouldn't have any overlap with the existing f.ids. - if _, ok := idsIdx[id]; ok { - panic(fmt.Sprintf("detected overlapped free page ID: %d between ids: %v and existing f.ids: %v", id, ids, f.ids)) - } - } - }) - f.ids = common.Pgids(f.ids).Merge(ids) -} diff --git a/freelist_test.go b/freelist_test.go deleted file mode 100644 index 5cf40bd1c..000000000 --- a/freelist_test.go +++ /dev/null @@ -1,485 +0,0 @@ -package bbolt - -import ( - "math/rand" - "os" - "reflect" - "sort" - "testing" - "unsafe" - - "go.etcd.io/bbolt/internal/common" -) - -// TestFreelistType is used as a env variable for test to indicate the backend type -const TestFreelistType = "TEST_FREELIST_TYPE" - -// Ensure that a page is added to a transaction's freelist. -func TestFreelist_free(t *testing.T) { - f := newTestFreelist() - f.free(100, common.NewPage(12, 0, 0, 0)) - if !reflect.DeepEqual([]common.Pgid{12}, f.pending[100].ids) { - t.Fatalf("exp=%v; got=%v", []common.Pgid{12}, f.pending[100].ids) - } -} - -// Ensure that a page and its overflow is added to a transaction's freelist. -func TestFreelist_free_overflow(t *testing.T) { - f := newTestFreelist() - f.free(100, common.NewPage(12, 0, 0, 3)) - if exp := []common.Pgid{12, 13, 14, 15}; !reflect.DeepEqual(exp, f.pending[100].ids) { - t.Fatalf("exp=%v; got=%v", exp, f.pending[100].ids) - } -} - -// Ensure that a transaction's free pages can be released. 
-func TestFreelist_release(t *testing.T) { - f := newTestFreelist() - f.free(100, common.NewPage(12, 0, 0, 1)) - f.free(100, common.NewPage(9, 0, 0, 0)) - f.free(102, common.NewPage(39, 0, 0, 0)) - f.release(100) - f.release(101) - if exp := []common.Pgid{9, 12, 13}; !reflect.DeepEqual(exp, f.getFreePageIDs()) { - t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs()) - } - - f.release(102) - if exp := []common.Pgid{9, 12, 13, 39}; !reflect.DeepEqual(exp, f.getFreePageIDs()) { - t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs()) - } -} - -// Ensure that releaseRange handles boundary conditions correctly -func TestFreelist_releaseRange(t *testing.T) { - type testRange struct { - begin, end common.Txid - } - - type testPage struct { - id common.Pgid - n int - allocTxn common.Txid - freeTxn common.Txid - } - - var releaseRangeTests = []struct { - title string - pagesIn []testPage - releaseRanges []testRange - wantFree []common.Pgid - }{ - { - title: "Single pending in range", - pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, - releaseRanges: []testRange{{1, 300}}, - wantFree: []common.Pgid{3}, - }, - { - title: "Single pending with minimum end range", - pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, - releaseRanges: []testRange{{1, 200}}, - wantFree: []common.Pgid{3}, - }, - { - title: "Single pending outsize minimum end range", - pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, - releaseRanges: []testRange{{1, 199}}, - wantFree: nil, - }, - { - title: "Single pending with minimum begin range", - pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, - releaseRanges: []testRange{{100, 300}}, - wantFree: []common.Pgid{3}, - }, - { - title: "Single pending outside minimum begin range", - pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, - releaseRanges: []testRange{{101, 300}}, - wantFree: nil, - }, - { - title: "Single pending in minimum range", - pagesIn: []testPage{{id: 3, n: 1, allocTxn: 199, freeTxn: 200}}, - releaseRanges: []testRange{{199, 200}}, - wantFree: []common.Pgid{3}, - }, - { - title: "Single pending and read transaction at 199", - pagesIn: []testPage{{id: 3, n: 1, allocTxn: 199, freeTxn: 200}}, - releaseRanges: []testRange{{100, 198}, {200, 300}}, - wantFree: nil, - }, - { - title: "Adjacent pending and read transactions at 199, 200", - pagesIn: []testPage{ - {id: 3, n: 1, allocTxn: 199, freeTxn: 200}, - {id: 4, n: 1, allocTxn: 200, freeTxn: 201}, - }, - releaseRanges: []testRange{ - {100, 198}, - {200, 199}, // Simulate the ranges db.freePages might produce. 
- {201, 300}, - }, - wantFree: nil, - }, - { - title: "Out of order ranges", - pagesIn: []testPage{ - {id: 3, n: 1, allocTxn: 199, freeTxn: 200}, - {id: 4, n: 1, allocTxn: 200, freeTxn: 201}, - }, - releaseRanges: []testRange{ - {201, 199}, - {201, 200}, - {200, 200}, - }, - wantFree: nil, - }, - { - title: "Multiple pending, read transaction at 150", - pagesIn: []testPage{ - {id: 3, n: 1, allocTxn: 100, freeTxn: 200}, - {id: 4, n: 1, allocTxn: 100, freeTxn: 125}, - {id: 5, n: 1, allocTxn: 125, freeTxn: 150}, - {id: 6, n: 1, allocTxn: 125, freeTxn: 175}, - {id: 7, n: 2, allocTxn: 150, freeTxn: 175}, - {id: 9, n: 2, allocTxn: 175, freeTxn: 200}, - }, - releaseRanges: []testRange{{50, 149}, {151, 300}}, - wantFree: []common.Pgid{4, 9, 10}, - }, - } - - for _, c := range releaseRangeTests { - f := newTestFreelist() - var ids []common.Pgid - for _, p := range c.pagesIn { - for i := uint64(0); i < uint64(p.n); i++ { - ids = append(ids, common.Pgid(uint64(p.id)+i)) - } - } - f.readIDs(ids) - for _, p := range c.pagesIn { - f.allocate(p.allocTxn, p.n) - } - - for _, p := range c.pagesIn { - f.free(p.freeTxn, common.NewPage(p.id, 0, 0, uint32(p.n-1))) - } - - for _, r := range c.releaseRanges { - f.releaseRange(r.begin, r.end) - } - - if exp := c.wantFree; !reflect.DeepEqual(exp, f.getFreePageIDs()) { - t.Errorf("exp=%v; got=%v for %s", exp, f.getFreePageIDs(), c.title) - } - } -} - -func TestFreelistHashmap_allocate(t *testing.T) { - f := newTestFreelist() - if f.freelistType != FreelistMapType { - t.Skip() - } - - ids := []common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18} - f.readIDs(ids) - - f.allocate(1, 3) - if x := f.free_count(); x != 6 { - t.Fatalf("exp=6; got=%v", x) - } - - f.allocate(1, 2) - if x := f.free_count(); x != 4 { - t.Fatalf("exp=4; got=%v", x) - } - f.allocate(1, 1) - if x := f.free_count(); x != 3 { - t.Fatalf("exp=3; got=%v", x) - } - - f.allocate(1, 0) - if x := f.free_count(); x != 3 { - t.Fatalf("exp=3; got=%v", x) - } -} - -// Ensure that a freelist can find contiguous blocks of pages. -func TestFreelistArray_allocate(t *testing.T) { - f := newTestFreelist() - if f.freelistType != FreelistArrayType { - t.Skip() - } - ids := []common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18} - f.readIDs(ids) - if id := int(f.allocate(1, 3)); id != 3 { - t.Fatalf("exp=3; got=%v", id) - } - if id := int(f.allocate(1, 1)); id != 6 { - t.Fatalf("exp=6; got=%v", id) - } - if id := int(f.allocate(1, 3)); id != 0 { - t.Fatalf("exp=0; got=%v", id) - } - if id := int(f.allocate(1, 2)); id != 12 { - t.Fatalf("exp=12; got=%v", id) - } - if id := int(f.allocate(1, 1)); id != 7 { - t.Fatalf("exp=7; got=%v", id) - } - if id := int(f.allocate(1, 0)); id != 0 { - t.Fatalf("exp=0; got=%v", id) - } - if id := int(f.allocate(1, 0)); id != 0 { - t.Fatalf("exp=0; got=%v", id) - } - if exp := []common.Pgid{9, 18}; !reflect.DeepEqual(exp, f.getFreePageIDs()) { - t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs()) - } - - if id := int(f.allocate(1, 1)); id != 9 { - t.Fatalf("exp=9; got=%v", id) - } - if id := int(f.allocate(1, 1)); id != 18 { - t.Fatalf("exp=18; got=%v", id) - } - if id := int(f.allocate(1, 1)); id != 0 { - t.Fatalf("exp=0; got=%v", id) - } - if exp := []common.Pgid{}; !reflect.DeepEqual(exp, f.getFreePageIDs()) { - t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs()) - } -} - -// Ensure that a freelist can deserialize from a freelist page. -func TestFreelist_read(t *testing.T) { - // Create a page. 
- var buf [4096]byte - page := (*common.Page)(unsafe.Pointer(&buf[0])) - page.SetFlags(common.FreelistPageFlag) - page.SetCount(2) - - // Insert 2 page ids. - ids := (*[3]common.Pgid)(unsafe.Pointer(uintptr(unsafe.Pointer(page)) + unsafe.Sizeof(*page))) - ids[0] = 23 - ids[1] = 50 - - // Deserialize page into a freelist. - f := newTestFreelist() - f.read(page) - - // Ensure that there are two page ids in the freelist. - if exp := []common.Pgid{23, 50}; !reflect.DeepEqual(exp, f.getFreePageIDs()) { - t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs()) - } -} - -// Ensure that a freelist can serialize into a freelist page. -func TestFreelist_write(t *testing.T) { - // Create a freelist and write it to a page. - var buf [4096]byte - f := newTestFreelist() - - f.readIDs([]common.Pgid{12, 39}) - f.pending[100] = &txPending{ids: []common.Pgid{28, 11}} - f.pending[101] = &txPending{ids: []common.Pgid{3}} - p := (*common.Page)(unsafe.Pointer(&buf[0])) - if err := f.write(p); err != nil { - t.Fatal(err) - } - - // Read the page back out. - f2 := newTestFreelist() - f2.read(p) - - // Ensure that the freelist is correct. - // All pages should be present and in reverse order. - if exp := []common.Pgid{3, 11, 12, 28, 39}; !reflect.DeepEqual(exp, f2.getFreePageIDs()) { - t.Fatalf("exp=%v; got=%v", exp, f2.getFreePageIDs()) - } -} - -func Benchmark_FreelistRelease10K(b *testing.B) { benchmark_FreelistRelease(b, 10000) } -func Benchmark_FreelistRelease100K(b *testing.B) { benchmark_FreelistRelease(b, 100000) } -func Benchmark_FreelistRelease1000K(b *testing.B) { benchmark_FreelistRelease(b, 1000000) } -func Benchmark_FreelistRelease10000K(b *testing.B) { benchmark_FreelistRelease(b, 10000000) } - -func benchmark_FreelistRelease(b *testing.B, size int) { - ids := randomPgids(size) - pending := randomPgids(len(ids) / 400) - b.ResetTimer() - for i := 0; i < b.N; i++ { - txp := &txPending{ids: pending} - f := newTestFreelist() - f.pending = map[common.Txid]*txPending{1: txp} - f.readIDs(ids) - f.release(1) - } -} - -func randomPgids(n int) []common.Pgid { - pgids := make(common.Pgids, n) - for i := range pgids { - pgids[i] = common.Pgid(rand.Int63()) - } - sort.Sort(pgids) - return pgids -} - -func Test_freelist_ReadIDs_and_getFreePageIDs(t *testing.T) { - f := newTestFreelist() - exp := []common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18} - - f.readIDs(exp) - - if got := f.getFreePageIDs(); !reflect.DeepEqual(exp, got) { - t.Fatalf("exp=%v; got=%v", exp, got) - } - - f2 := newTestFreelist() - var exp2 []common.Pgid - f2.readIDs(exp2) - - if got2 := f2.getFreePageIDs(); !reflect.DeepEqual(got2, exp2) { - t.Fatalf("exp2=%#v; got2=%#v", exp2, got2) - } - -} - -func Test_freelist_mergeWithExist(t *testing.T) { - bm1 := pidSet{1: struct{}{}} - - bm2 := pidSet{5: struct{}{}} - tests := []struct { - name string - ids []common.Pgid - pgid common.Pgid - want []common.Pgid - wantForwardmap map[common.Pgid]uint64 - wantBackwardmap map[common.Pgid]uint64 - wantfreemap map[uint64]pidSet - }{ - { - name: "test1", - ids: []common.Pgid{1, 2, 4, 5, 6}, - pgid: 3, - want: []common.Pgid{1, 2, 3, 4, 5, 6}, - wantForwardmap: map[common.Pgid]uint64{1: 6}, - wantBackwardmap: map[common.Pgid]uint64{6: 6}, - wantfreemap: map[uint64]pidSet{6: bm1}, - }, - { - name: "test2", - ids: []common.Pgid{1, 2, 5, 6}, - pgid: 3, - want: []common.Pgid{1, 2, 3, 5, 6}, - wantForwardmap: map[common.Pgid]uint64{1: 3, 5: 2}, - wantBackwardmap: map[common.Pgid]uint64{6: 2, 3: 3}, - wantfreemap: map[uint64]pidSet{3: bm1, 2: bm2}, - }, - { - name: "test3", - 
ids: []common.Pgid{1, 2}, - pgid: 3, - want: []common.Pgid{1, 2, 3}, - wantForwardmap: map[common.Pgid]uint64{1: 3}, - wantBackwardmap: map[common.Pgid]uint64{3: 3}, - wantfreemap: map[uint64]pidSet{3: bm1}, - }, - { - name: "test4", - ids: []common.Pgid{2, 3}, - pgid: 1, - want: []common.Pgid{1, 2, 3}, - wantForwardmap: map[common.Pgid]uint64{1: 3}, - wantBackwardmap: map[common.Pgid]uint64{3: 3}, - wantfreemap: map[uint64]pidSet{3: bm1}, - }, - } - for _, tt := range tests { - f := newTestFreelist() - if f.freelistType == FreelistArrayType { - t.Skip() - } - f.readIDs(tt.ids) - - f.mergeWithExistingSpan(tt.pgid) - - if got := f.getFreePageIDs(); !reflect.DeepEqual(tt.want, got) { - t.Fatalf("name %s; exp=%v; got=%v", tt.name, tt.want, got) - } - if got := f.forwardMap; !reflect.DeepEqual(tt.wantForwardmap, got) { - t.Fatalf("name %s; exp=%v; got=%v", tt.name, tt.wantForwardmap, got) - } - if got := f.backwardMap; !reflect.DeepEqual(tt.wantBackwardmap, got) { - t.Fatalf("name %s; exp=%v; got=%v", tt.name, tt.wantBackwardmap, got) - } - if got := f.freemaps; !reflect.DeepEqual(tt.wantfreemap, got) { - t.Fatalf("name %s; exp=%v; got=%v", tt.name, tt.wantfreemap, got) - } - } -} - -// newTestFreelist get the freelist type from env and initial the freelist -func newTestFreelist() *freelist { - freelistType := FreelistArrayType - if env := os.Getenv(TestFreelistType); env == string(FreelistMapType) { - freelistType = FreelistMapType - } - - return newFreelist(freelistType) -} - -func Test_freelist_hashmapGetFreePageIDs(t *testing.T) { - f := newTestFreelist() - if f.freelistType == FreelistArrayType { - t.Skip() - } - - N := int32(100000) - fm := make(map[common.Pgid]uint64) - i := int32(0) - val := int32(0) - for i = 0; i < N; { - val = rand.Int31n(1000) - fm[common.Pgid(i)] = uint64(val) - i += val - f.freePagesCount += uint64(val) - } - - f.forwardMap = fm - res := f.hashmapGetFreePageIDs() - - if !sort.SliceIsSorted(res, func(i, j int) bool { return res[i] < res[j] }) { - t.Fatalf("pgids not sorted") - } -} - -func Benchmark_freelist_hashmapGetFreePageIDs(b *testing.B) { - f := newTestFreelist() - if f.freelistType == FreelistArrayType { - b.Skip() - } - - N := int32(100000) - fm := make(map[common.Pgid]uint64) - i := int32(0) - val := int32(0) - for i = 0; i < N; { - val = rand.Int31n(1000) - fm[common.Pgid(i)] = uint64(val) - i += val - } - - f.forwardMap = fm - - b.ReportAllocs() - b.ResetTimer() - for n := 0; n < b.N; n++ { - f.hashmapGetFreePageIDs() - } -} diff --git a/internal/btesting/btesting.go b/internal/btesting/btesting.go index c83369f09..b7eef8781 100644 --- a/internal/btesting/btesting.go +++ b/internal/btesting/btesting.go @@ -10,6 +10,8 @@ import ( "testing" "time" + "go.etcd.io/bbolt/internal/freelist" + "github.com/stretchr/testify/require" bolt "go.etcd.io/bbolt" @@ -49,9 +51,9 @@ func MustOpenDBWithOption(t testing.TB, f string, o *bolt.Options) *DB { o = bolt.DefaultOptions } - freelistType := bolt.FreelistArrayType - if env := os.Getenv(TestFreelistType); env == string(bolt.FreelistMapType) { - freelistType = bolt.FreelistMapType + freelistType := freelist.FreelistArrayType + if env := os.Getenv(TestFreelistType); env == string(freelist.FreelistMapType) { + freelistType = freelist.FreelistMapType } o.FreelistType = freelistType diff --git a/internal/freelist/array.go b/internal/freelist/array.go new file mode 100644 index 000000000..20c4355b3 --- /dev/null +++ b/internal/freelist/array.go @@ -0,0 +1,116 @@ +package freelist + +import ( + "fmt" + "sort" + + 
"go.etcd.io/bbolt/internal/common" +) + +type Array struct { + shared + + ids []common.Pgid // all free and available free page ids. +} + +func (f *Array) Init(ids common.Pgids) { + f.ids = ids + f.reindex(f.FreePageIds(), f.PendingPageIds()) +} + +func (f *Array) Allocate(txid common.Txid, n int) common.Pgid { + if len(f.ids) == 0 { + return 0 + } + + var initial, previd common.Pgid + for i, id := range f.ids { + if id <= 1 { + panic(fmt.Sprintf("invalid page allocation: %d", id)) + } + + // Reset initial page if this is not contiguous. + if previd == 0 || id-previd != 1 { + initial = id + } + + // If we found a contiguous block then remove it and return it. + if (id-initial)+1 == common.Pgid(n) { + // If we're allocating off the beginning then take the fast path + // and just adjust the existing slice. This will use extra memory + // temporarily but the append() in free() will realloc the slice + // as is necessary. + if (i + 1) == n { + f.ids = f.ids[i+1:] + } else { + copy(f.ids[i-n+1:], f.ids[i+1:]) + f.ids = f.ids[:len(f.ids)-n] + } + + // Remove from the free cache. + for i := common.Pgid(0); i < common.Pgid(n); i++ { + delete(f.cache, initial+i) + } + f.allocs[initial] = txid + return initial + } + + previd = id + } + return 0 +} + +func (f *Array) Count() int { + return f.FreeCount() + f.PendingCount() +} + +func (f *Array) FreeCount() int { + return len(f.ids) +} + +func (f *Array) FreePageIds() common.Pgids { + return f.ids +} + +func (f *Array) mergeSpans(ids common.Pgids) { + sort.Sort(ids) + common.Verify(func() { + idsIdx := make(map[common.Pgid]struct{}) + for _, id := range f.ids { + // The existing f.ids shouldn't have duplicated free ID. + if _, ok := idsIdx[id]; ok { + panic(fmt.Sprintf("detected duplicated free page ID: %d in existing f.ids: %v", id, f.ids)) + } + idsIdx[id] = struct{}{} + } + + prev := common.Pgid(0) + for _, id := range ids { + // The ids shouldn't have duplicated free ID. Note page 0 and 1 + // are reserved for meta pages, so they can never be free page IDs. + if prev == id { + panic(fmt.Sprintf("detected duplicated free ID: %d in ids: %v", id, ids)) + } + prev = id + + // The ids shouldn't have any overlap with the existing f.ids. + if _, ok := idsIdx[id]; ok { + panic(fmt.Sprintf("detected overlapped free page ID: %d between ids: %v and existing f.ids: %v", id, ids, f.ids)) + } + } + }) + f.ids = common.Pgids(f.ids).Merge(ids) +} + +func NewArray() *Array { + a := &Array{ + shared: shared{ + pending: make(map[common.Txid]*txPending), + allocs: make(map[common.Pgid]common.Txid), + cache: make(map[common.Pgid]struct{}), + }, + } + // this loopy reference allows us to share the span merging via interfaces + a.shared.spanMerger = a + return a +} diff --git a/internal/freelist/array_test.go b/internal/freelist/array_test.go new file mode 100644 index 000000000..84990f059 --- /dev/null +++ b/internal/freelist/array_test.go @@ -0,0 +1,52 @@ +package freelist + +import ( + "reflect" + "testing" + + "go.etcd.io/bbolt/internal/common" +) + +// Ensure that a freelist can find contiguous blocks of pages. 
+func TestFreelistArray_allocate(t *testing.T) { + f := NewArray() + ids := []common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18} + f.Init(ids) + if id := int(f.Allocate(1, 3)); id != 3 { + t.Fatalf("exp=3; got=%v", id) + } + if id := int(f.Allocate(1, 1)); id != 6 { + t.Fatalf("exp=6; got=%v", id) + } + if id := int(f.Allocate(1, 3)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if id := int(f.Allocate(1, 2)); id != 12 { + t.Fatalf("exp=12; got=%v", id) + } + if id := int(f.Allocate(1, 1)); id != 7 { + t.Fatalf("exp=7; got=%v", id) + } + if id := int(f.Allocate(1, 0)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if id := int(f.Allocate(1, 0)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if exp := common.Pgids([]common.Pgid{9, 18}); !reflect.DeepEqual(exp, f.FreePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f.FreePageIds()) + } + + if id := int(f.Allocate(1, 1)); id != 9 { + t.Fatalf("exp=9; got=%v", id) + } + if id := int(f.Allocate(1, 1)); id != 18 { + t.Fatalf("exp=18; got=%v", id) + } + if id := int(f.Allocate(1, 1)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if exp := common.Pgids([]common.Pgid{}); !reflect.DeepEqual(exp, f.FreePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f.FreePageIds()) + } +} diff --git a/internal/freelist/freelist.go b/internal/freelist/freelist.go new file mode 100644 index 000000000..6a7131aba --- /dev/null +++ b/internal/freelist/freelist.go @@ -0,0 +1,108 @@ +package freelist + +import ( + "sort" + + "go.etcd.io/bbolt/internal/common" +) + +// FreelistType is the type of the freelist backend +type FreelistType string + +// TODO(ahrtr): eventually we should (step by step) +// 1. default to `FreelistMapType`; +// 2. remove the `FreelistArrayType`, do not export `FreelistMapType` +// and remove field `FreelistType' from both `DB` and `Options`; +const ( + // FreelistArrayType indicates backend freelist type is array + FreelistArrayType = FreelistType("array") + // FreelistMapType indicates backend freelist type is hashmap + FreelistMapType = FreelistType("hashmap") +) + +type Freelist interface { + // Init initializes this freelist with the given list of pages. + Init(ids common.Pgids) + + // Allocate returns the starting page id of a contiguous block of a given size in number of pages. + // If a contiguous block cannot be found then 0 is returned. + Allocate(txid common.Txid, numPages int) common.Pgid + + // Count returns the number of free and pending pages. + Count() int + + // FreeCount returns the number of free pages. + FreeCount() int + + // FreePageIds returns all free pages. + FreePageIds() common.Pgids + + // PendingCount returns the number of pending pages. + PendingCount() int + + // PendingPageIds returns all pending pages by transaction id. + PendingPageIds() map[common.Txid]*txPending + + // Release moves all page ids for a transaction id (or older) to the freelist. + Release(txId common.Txid) + + // ReleaseRange moves pending pages allocated within an extent [begin,end] to the free list. + ReleaseRange(begin, end common.Txid) + + // Free releases a page and its overflow for a given transaction id. + // If the page is already free then a panic will occur. + Free(txId common.Txid, p *common.Page) + + // Freed returns whether a given page is in the free list. + Freed(pgId common.Pgid) bool + + // Rollback removes the pages from a given pending tx. + Rollback(txId common.Txid) +} + +// Copyall copies a list of all free ids and all pending ids in one sorted list. +// f.count returns the minimum length required for dst. 
+func Copyall(f Freelist, dst []common.Pgid) { + m := make(common.Pgids, 0, f.PendingCount()) + for _, txp := range f.PendingPageIds() { + m = append(m, txp.ids...) + } + sort.Sort(m) + common.Mergepgids(dst, f.FreePageIds(), m) +} + +// Reload reads the freelist from a page and filters out pending items. +func Reload(s Serializable, f Freelist, p *common.Page) { + s.Read(f, p) + NoSyncReload(f, p.FreelistPageIds()) +} + +// NoSyncReload reads the freelist from Pgids and filters out pending items. +func NoSyncReload(f Freelist, pgIds common.Pgids) { + // Build a cache of only pending pages. + pcache := make(map[common.Pgid]bool) + for _, txp := range f.PendingPageIds() { + for _, pendingID := range txp.ids { + pcache[pendingID] = true + } + } + + // Check each page in the freelist and build a new available freelist + // with any pages not in the pending lists. + var a []common.Pgid + for _, id := range f.FreePageIds() { + if !pcache[id] { + a = append(a, id) + } + } + + f.Init(a) +} + +// NewFreelist returns an empty, initialized freelist. +func NewFreelist(freelistType FreelistType) Freelist { + if freelistType == FreelistMapType { + return NewHashMap() + } + return NewArray() +} diff --git a/internal/freelist/freelist_test.go b/internal/freelist/freelist_test.go new file mode 100644 index 000000000..8ec20a7c1 --- /dev/null +++ b/internal/freelist/freelist_test.go @@ -0,0 +1,236 @@ +package freelist + +import ( + "math/rand" + "os" + "reflect" + "sort" + "testing" + + "go.etcd.io/bbolt/internal/common" +) + +// TestFreelistType is used as a env variable for test to indicate the backend type +const TestFreelistType = "TEST_FREELIST_TYPE" + +// Ensure that a page is added to a transaction's freelist. +func TestFreelist_free(t *testing.T) { + f := newTestFreelist() + f.Free(100, common.NewPage(12, 0, 0, 0)) + if !reflect.DeepEqual([]common.Pgid{12}, f.PendingPageIds()[100].ids) { + t.Fatalf("exp=%v; got=%v", []common.Pgid{12}, f.PendingPageIds()[100].ids) + } +} + +// Ensure that a page and its overflow is added to a transaction's freelist. +func TestFreelist_free_overflow(t *testing.T) { + f := newTestFreelist() + f.Free(100, common.NewPage(12, 0, 0, 3)) + if exp := []common.Pgid{12, 13, 14, 15}; !reflect.DeepEqual(exp, f.PendingPageIds()[100].ids) { + t.Fatalf("exp=%v; got=%v", exp, f.PendingPageIds()[100].ids) + } +} + +// Ensure that a transaction's free pages can be released. 
+func TestFreelist_release(t *testing.T) { + f := newTestFreelist() + f.Free(100, common.NewPage(12, 0, 0, 1)) + f.Free(100, common.NewPage(9, 0, 0, 0)) + f.Free(102, common.NewPage(39, 0, 0, 0)) + f.Release(100) + f.Release(101) + if exp := common.Pgids([]common.Pgid{9, 12, 13}); !reflect.DeepEqual(exp, f.FreePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f.FreePageIds()) + } + + f.Release(102) + if exp := common.Pgids([]common.Pgid{9, 12, 13, 39}); !reflect.DeepEqual(exp, f.FreePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f.FreePageIds()) + } +} + +// Ensure that releaseRange handles boundary conditions correctly +func TestFreelist_releaseRange(t *testing.T) { + type testRange struct { + begin, end common.Txid + } + + type testPage struct { + id common.Pgid + n int + allocTxn common.Txid + freeTxn common.Txid + } + + var releaseRangeTests = []struct { + title string + pagesIn []testPage + releaseRanges []testRange + wantFree []common.Pgid + }{ + { + title: "Single pending in range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, + releaseRanges: []testRange{{1, 300}}, + wantFree: []common.Pgid{3}, + }, + { + title: "Single pending with minimum end range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, + releaseRanges: []testRange{{1, 200}}, + wantFree: []common.Pgid{3}, + }, + { + title: "Single pending outsize minimum end range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, + releaseRanges: []testRange{{1, 199}}, + wantFree: nil, + }, + { + title: "Single pending with minimum begin range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, + releaseRanges: []testRange{{100, 300}}, + wantFree: []common.Pgid{3}, + }, + { + title: "Single pending outside minimum begin range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, + releaseRanges: []testRange{{101, 300}}, + wantFree: nil, + }, + { + title: "Single pending in minimum range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 199, freeTxn: 200}}, + releaseRanges: []testRange{{199, 200}}, + wantFree: []common.Pgid{3}, + }, + { + title: "Single pending and read transaction at 199", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 199, freeTxn: 200}}, + releaseRanges: []testRange{{100, 198}, {200, 300}}, + wantFree: nil, + }, + { + title: "Adjacent pending and read transactions at 199, 200", + pagesIn: []testPage{ + {id: 3, n: 1, allocTxn: 199, freeTxn: 200}, + {id: 4, n: 1, allocTxn: 200, freeTxn: 201}, + }, + releaseRanges: []testRange{ + {100, 198}, + {200, 199}, // Simulate the ranges db.freePages might produce. 
+ {201, 300}, + }, + wantFree: nil, + }, + { + title: "Out of order ranges", + pagesIn: []testPage{ + {id: 3, n: 1, allocTxn: 199, freeTxn: 200}, + {id: 4, n: 1, allocTxn: 200, freeTxn: 201}, + }, + releaseRanges: []testRange{ + {201, 199}, + {201, 200}, + {200, 200}, + }, + wantFree: nil, + }, + { + title: "Multiple pending, read transaction at 150", + pagesIn: []testPage{ + {id: 3, n: 1, allocTxn: 100, freeTxn: 200}, + {id: 4, n: 1, allocTxn: 100, freeTxn: 125}, + {id: 5, n: 1, allocTxn: 125, freeTxn: 150}, + {id: 6, n: 1, allocTxn: 125, freeTxn: 175}, + {id: 7, n: 2, allocTxn: 150, freeTxn: 175}, + {id: 9, n: 2, allocTxn: 175, freeTxn: 200}, + }, + releaseRanges: []testRange{{50, 149}, {151, 300}}, + wantFree: []common.Pgid{4, 9, 10}, + }, + } + + for _, c := range releaseRangeTests { + f := newTestFreelist() + var ids []common.Pgid + for _, p := range c.pagesIn { + for i := uint64(0); i < uint64(p.n); i++ { + ids = append(ids, common.Pgid(uint64(p.id)+i)) + } + } + f.Init(ids) + for _, p := range c.pagesIn { + f.Allocate(p.allocTxn, p.n) + } + + for _, p := range c.pagesIn { + f.Free(p.freeTxn, common.NewPage(p.id, 0, 0, uint32(p.n-1))) + } + + for _, r := range c.releaseRanges { + f.ReleaseRange(r.begin, r.end) + } + + if exp := common.Pgids(c.wantFree); !reflect.DeepEqual(exp, f.FreePageIds()) { + t.Errorf("exp=%v; got=%v for %s", exp, f.FreePageIds(), c.title) + } + } +} + +func Benchmark_FreelistRelease10K(b *testing.B) { benchmark_FreelistRelease(b, 10000) } +func Benchmark_FreelistRelease100K(b *testing.B) { benchmark_FreelistRelease(b, 100000) } +func Benchmark_FreelistRelease1000K(b *testing.B) { benchmark_FreelistRelease(b, 1000000) } +func Benchmark_FreelistRelease10000K(b *testing.B) { benchmark_FreelistRelease(b, 10000000) } + +func benchmark_FreelistRelease(b *testing.B, size int) { + ids := randomPgids(size) + pending := randomPgids(len(ids) / 400) + b.ResetTimer() + for i := 0; i < b.N; i++ { + txp := &txPending{ids: pending} + f := newTestFreelist() + f.PendingPageIds()[1] = txp + f.Init(ids) + f.Release(1) + } +} + +func randomPgids(n int) []common.Pgid { + pgids := make(common.Pgids, n) + for i := range pgids { + pgids[i] = common.Pgid(rand.Int63()) + } + sort.Sort(pgids) + return pgids +} + +func Test_freelist_ReadIDs_and_getFreePageIDs(t *testing.T) { + f := newTestFreelist() + exp := common.Pgids([]common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18}) + + f.Init(exp) + + if got := f.FreePageIds(); !reflect.DeepEqual(exp, got) { + t.Fatalf("exp=%v; got=%v", exp, got) + } + + f2 := newTestFreelist() + var exp2 []common.Pgid + f2.Init(exp2) + + if got2 := f2.FreePageIds(); !reflect.DeepEqual(got2, common.Pgids(exp2)) { + t.Fatalf("exp2=%#v; got2=%#v", exp2, got2) + } + +} + +// newTestFreelist get the freelist type from env and initial the freelist +func newTestFreelist() Freelist { + freelistType := FreelistArrayType + if env := os.Getenv(TestFreelistType); env == string(FreelistMapType) { + freelistType = FreelistMapType + } + + return NewFreelist(freelistType) +} diff --git a/freelist_hmap.go b/internal/freelist/hashmap.go similarity index 75% rename from freelist_hmap.go rename to internal/freelist/hashmap.go index c5c09f55e..21be9517e 100644 --- a/freelist_hmap.go +++ b/internal/freelist/hashmap.go @@ -1,4 +1,4 @@ -package bbolt +package freelist import ( "fmt" @@ -8,26 +8,57 @@ import ( "go.etcd.io/bbolt/internal/common" ) -// hashmapFreeCount returns count of free pages(hashmap version) -func (f *freelist) hashmapFreeCount() int { - common.Verify(func() { - 
expectedFreePageCount := f.hashmapFreeCountSlow() - common.Assert(int(f.freePagesCount) == expectedFreePageCount, - "freePagesCount (%d) is out of sync with free pages map (%d)", f.freePagesCount, expectedFreePageCount) - }) - return int(f.freePagesCount) +// pidSet holds the set of starting pgids which have the same span size +type pidSet map[common.Pgid]struct{} + +type HashMap struct { + shared + + freePagesCount uint64 // count of free pages(hashmap version) + freemaps map[uint64]pidSet // key is the size of continuous pages(span), value is a set which contains the starting pgids of same size + forwardMap map[common.Pgid]uint64 // key is start pgid, value is its span size + backwardMap map[common.Pgid]uint64 // key is end pgid, value is its span size } -func (f *freelist) hashmapFreeCountSlow() int { - count := 0 - for _, size := range f.forwardMap { - count += int(size) +func (f *HashMap) Init(pgids common.Pgids) { + if len(pgids) == 0 { + return } - return count + + size := uint64(1) + start := pgids[0] + // reset the counter when freelist init + f.freePagesCount = 0 + + if !sort.SliceIsSorted([]common.Pgid(pgids), func(i, j int) bool { return pgids[i] < pgids[j] }) { + panic("pgids not sorted") + } + + f.freemaps = make(map[uint64]pidSet) + f.forwardMap = make(map[common.Pgid]uint64) + f.backwardMap = make(map[common.Pgid]uint64) + + for i := 1; i < len(pgids); i++ { + // continuous page + if pgids[i] == pgids[i-1]+1 { + size++ + } else { + f.addSpan(start, size) + + size = 1 + start = pgids[i] + } + } + + // init the tail + if size != 0 && start != 0 { + f.addSpan(start, size) + } + + f.reindex(f.FreePageIds(), f.PendingPageIds()) } -// hashmapAllocate serves the same purpose as arrayAllocate, but use hashmap as backend -func (f *freelist) hashmapAllocate(txid common.Txid, n int) common.Pgid { +func (f *HashMap) Allocate(txid common.Txid, n int) common.Pgid { if n == 0 { return 0 } @@ -74,17 +105,21 @@ func (f *freelist) hashmapAllocate(txid common.Txid, n int) common.Pgid { return 0 } -// hashmapReadIDs reads pgids as input an initial the freelist(hashmap version) -func (f *freelist) hashmapReadIDs(pgids []common.Pgid) { - f.init(pgids) +func (f *HashMap) Count() int { + return f.FreeCount() + f.PendingCount() +} - // Rebuild the page cache. 
- f.reindex() +func (f *HashMap) FreeCount() int { + common.Verify(func() { + expectedFreePageCount := f.hashmapFreeCountSlow() + common.Assert(int(f.freePagesCount) == expectedFreePageCount, + "freePagesCount (%d) is out of sync with free pages map (%d)", f.freePagesCount, expectedFreePageCount) + }) + return int(f.freePagesCount) } -// hashmapGetFreePageIDs returns the sorted free page ids -func (f *freelist) hashmapGetFreePageIDs() []common.Pgid { - count := f.free_count() +func (f *HashMap) FreePageIds() common.Pgids { + count := f.FreeCount() if count == 0 { return nil } @@ -108,8 +143,36 @@ func (f *freelist) hashmapGetFreePageIDs() []common.Pgid { return m } -// hashmapMergeSpans try to merge list of pages(represented by pgids) with existing spans -func (f *freelist) hashmapMergeSpans(ids common.Pgids) { +func (f *HashMap) hashmapFreeCountSlow() int { + count := 0 + for _, size := range f.forwardMap { + count += int(size) + } + return count +} + +func (f *HashMap) addSpan(start common.Pgid, size uint64) { + f.backwardMap[start-1+common.Pgid(size)] = size + f.forwardMap[start] = size + if _, ok := f.freemaps[size]; !ok { + f.freemaps[size] = make(map[common.Pgid]struct{}) + } + + f.freemaps[size][start] = struct{}{} + f.freePagesCount += size +} + +func (f *HashMap) delSpan(start common.Pgid, size uint64) { + delete(f.forwardMap, start) + delete(f.backwardMap, start+common.Pgid(size-1)) + delete(f.freemaps[size], start) + if len(f.freemaps[size]) == 0 { + delete(f.freemaps, size) + } + f.freePagesCount -= size +} + +func (f *HashMap) mergeSpans(ids common.Pgids) { common.Verify(func() { ids1Freemap := f.idsFromFreemaps() ids2Forward := f.idsFromForwardMap() @@ -144,7 +207,7 @@ func (f *freelist) hashmapMergeSpans(ids common.Pgids) { } // mergeWithExistingSpan merges pid to the existing free spans, try to merge it backward and forward -func (f *freelist) mergeWithExistingSpan(pid common.Pgid) { +func (f *HashMap) mergeWithExistingSpan(pid common.Pgid) { prev := pid - 1 next := pid + 1 @@ -171,68 +234,9 @@ func (f *freelist) mergeWithExistingSpan(pid common.Pgid) { f.addSpan(newStart, newSize) } -func (f *freelist) addSpan(start common.Pgid, size uint64) { - f.backwardMap[start-1+common.Pgid(size)] = size - f.forwardMap[start] = size - if _, ok := f.freemaps[size]; !ok { - f.freemaps[size] = make(map[common.Pgid]struct{}) - } - - f.freemaps[size][start] = struct{}{} - f.freePagesCount += size -} - -func (f *freelist) delSpan(start common.Pgid, size uint64) { - delete(f.forwardMap, start) - delete(f.backwardMap, start+common.Pgid(size-1)) - delete(f.freemaps[size], start) - if len(f.freemaps[size]) == 0 { - delete(f.freemaps, size) - } - f.freePagesCount -= size -} - -// initial from pgids using when use hashmap version -// pgids must be sorted -func (f *freelist) init(pgids []common.Pgid) { - if len(pgids) == 0 { - return - } - - size := uint64(1) - start := pgids[0] - // reset the counter when freelist init - f.freePagesCount = 0 - - if !sort.SliceIsSorted([]common.Pgid(pgids), func(i, j int) bool { return pgids[i] < pgids[j] }) { - panic("pgids not sorted") - } - - f.freemaps = make(map[uint64]pidSet) - f.forwardMap = make(map[common.Pgid]uint64) - f.backwardMap = make(map[common.Pgid]uint64) - - for i := 1; i < len(pgids); i++ { - // continuous page - if pgids[i] == pgids[i-1]+1 { - size++ - } else { - f.addSpan(start, size) - - size = 1 - start = pgids[i] - } - } - - // init the tail - if size != 0 && start != 0 { - f.addSpan(start, size) - } -} - // idsFromFreemaps get all free 
page IDs from f.freemaps.
 // used by test only.
-func (f *freelist) idsFromFreemaps() map[common.Pgid]struct{} {
+func (f *HashMap) idsFromFreemaps() map[common.Pgid]struct{} {
 	ids := make(map[common.Pgid]struct{})
 	for size, idSet := range f.freemaps {
 		for start := range idSet {
@@ -250,7 +254,7 @@ func (f *freelist) idsFromFreemaps() map[common.Pgid]struct{} {
 
 // idsFromForwardMap get all free page IDs from f.forwardMap.
 // used by test only.
-func (f *freelist) idsFromForwardMap() map[common.Pgid]struct{} {
+func (f *HashMap) idsFromForwardMap() map[common.Pgid]struct{} {
 	ids := make(map[common.Pgid]struct{})
 	for start, size := range f.forwardMap {
 		for i := 0; i < int(size); i++ {
@@ -266,7 +270,7 @@ func (f *freelist) idsFromForwardMap() map[common.Pgid]struct{} {
 
 // idsFromBackwardMap get all free page IDs from f.backwardMap.
 // used by test only.
-func (f *freelist) idsFromBackwardMap() map[common.Pgid]struct{} {
+func (f *HashMap) idsFromBackwardMap() map[common.Pgid]struct{} {
 	ids := make(map[common.Pgid]struct{})
 	for end, size := range f.backwardMap {
 		for i := 0; i < int(size); i++ {
@@ -279,3 +283,19 @@ func (f *freelist) idsFromBackwardMap() map[common.Pgid]struct{} {
 	}
 	return ids
 }
+
+func NewHashMap() *HashMap {
+	hm := &HashMap{
+		shared: shared{
+			allocs:  make(map[common.Pgid]common.Txid),
+			cache:   make(map[common.Pgid]struct{}),
+			pending: make(map[common.Txid]*txPending),
+		},
+		freemaps:    make(map[uint64]pidSet),
+		forwardMap:  make(map[common.Pgid]uint64),
+		backwardMap: make(map[common.Pgid]uint64),
+	}
+	// this circular reference allows us to share the span merging logic via the spanMerger interface
+	hm.shared.spanMerger = hm
+	return hm
+}
diff --git a/internal/freelist/hashmap_test.go b/internal/freelist/hashmap_test.go
new file mode 100644
index 000000000..457980ab3
--- /dev/null
+++ b/internal/freelist/hashmap_test.go
@@ -0,0 +1,33 @@
+package freelist
+
+import (
+	"testing"
+
+	"go.etcd.io/bbolt/internal/common"
+)
+
+func TestFreelistHashmap_allocate(t *testing.T) {
+	f := NewHashMap()
+
+	ids := []common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18}
+	f.Init(ids)
+
+	f.Allocate(1, 3)
+	if x := f.FreeCount(); x != 6 {
+		t.Fatalf("exp=6; got=%v", x)
+	}
+
+	f.Allocate(1, 2)
+	if x := f.FreeCount(); x != 4 {
+		t.Fatalf("exp=4; got=%v", x)
+	}
+	f.Allocate(1, 1)
+	if x := f.FreeCount(); x != 3 {
+		t.Fatalf("exp=3; got=%v", x)
+	}
+
+	f.Allocate(1, 0)
+	if x := f.FreeCount(); x != 3 {
+		t.Fatalf("exp=3; got=%v", x)
+	}
+}
diff --git a/internal/freelist/serde.go b/internal/freelist/serde.go
new file mode 100644
index 000000000..b42cc3f22
--- /dev/null
+++ b/internal/freelist/serde.go
@@ -0,0 +1,79 @@
+package freelist
+
+import (
+	"fmt"
+	"sort"
+	"unsafe"
+
+	"go.etcd.io/bbolt/internal/common"
+)
+
+type Serializable interface {
+	// Read calls Init with the page ids stored in the given page.
+	Read(f Freelist, page *common.Page)
+
+	// Write writes the freelist into the given page.
+	Write(f Freelist, page *common.Page)
+
+	// EstimatedWritePageSize returns the size of the page after serialization in Write.
+	// This should never underestimate the size.
+	EstimatedWritePageSize(f Freelist) int
+}
+
+type Serializer struct {
+}
+
+func (s Serializer) Read(f Freelist, p *common.Page) {
+	if !p.IsFreelistPage() {
+		panic(fmt.Sprintf("invalid freelist page: %d, page type is %s", p.Id(), p.Typ()))
+	}
+
+	ids := p.FreelistPageIds()
+
+	// Copy the list of page ids from the freelist.
+	if len(ids) == 0 {
+		f.Init(nil)
+	} else {
+		// copy the ids, so we don't modify the freelist page directly
+		idsCopy := make([]common.Pgid, len(ids))
+		copy(idsCopy, ids)
+		// Make sure they're sorted.
+		sort.Sort(common.Pgids(idsCopy))
+
+		f.Init(idsCopy)
+	}
+}
+
+func (s Serializer) EstimatedWritePageSize(f Freelist) int {
+	n := f.Count()
+	if n >= 0xFFFF {
+		// The first element will be used to store the count. See Serializer.Write.
+		n++
+	}
+	return int(common.PageHeaderSize) + (int(unsafe.Sizeof(common.Pgid(0))) * n)
+}
+
+func (s Serializer) Write(f Freelist, p *common.Page) {
+	// Combine the old free pgids and pgids waiting on an open transaction.
+
+	// Update the header flag.
+	p.SetFlags(common.FreelistPageFlag)
+
+	// The page.count can only hold up to 64k elements so if we overflow that
+	// number then we handle it by putting the size in the first element.
+	l := f.Count()
+	if l == 0 {
+		p.SetCount(uint16(l))
+	} else if l < 0xFFFF {
+		p.SetCount(uint16(l))
+		data := common.UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
+		ids := unsafe.Slice((*common.Pgid)(data), l)
+		Copyall(f, ids)
+	} else {
+		p.SetCount(0xFFFF)
+		data := common.UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
+		ids := unsafe.Slice((*common.Pgid)(data), l+1)
+		ids[0] = common.Pgid(l)
+		Copyall(f, ids[1:])
+	}
+}
diff --git a/internal/freelist/serde_test.go b/internal/freelist/serde_test.go
new file mode 100644
index 000000000..8bc763f2a
--- /dev/null
+++ b/internal/freelist/serde_test.go
@@ -0,0 +1,55 @@
+package freelist
+
+import (
+	"reflect"
+	"testing"
+	"unsafe"
+
+	"go.etcd.io/bbolt/internal/common"
+)
+
+// Ensure that a freelist can deserialize from a freelist page.
+func TestFreelist_read(t *testing.T) {
+	// Create a page.
+	var buf [4096]byte
+	page := (*common.Page)(unsafe.Pointer(&buf[0]))
+	page.SetFlags(common.FreelistPageFlag)
+	page.SetCount(2)
+
+	// Insert 2 page ids.
+	ids := (*[3]common.Pgid)(unsafe.Pointer(uintptr(unsafe.Pointer(page)) + unsafe.Sizeof(*page)))
+	ids[0] = 23
+	ids[1] = 50
+
+	// Deserialize page into a freelist.
+	f := newTestFreelist()
+	Serializer{}.Read(f, page)
+
+	// Ensure that there are two page ids in the freelist.
+	if exp := common.Pgids([]common.Pgid{23, 50}); !reflect.DeepEqual(exp, f.FreePageIds()) {
+		t.Fatalf("exp=%v; got=%v", exp, f.FreePageIds())
+	}
+}
+
+// Ensure that a freelist can serialize into a freelist page.
+func TestFreelist_write(t *testing.T) {
+	// Create a freelist and write it to a page.
+	var buf [4096]byte
+	f := newTestFreelist()
+
+	f.Init([]common.Pgid{12, 39})
+	f.PendingPageIds()[100] = &txPending{ids: []common.Pgid{28, 11}}
+	f.PendingPageIds()[101] = &txPending{ids: []common.Pgid{3}}
+	p := (*common.Page)(unsafe.Pointer(&buf[0]))
+	Serializer{}.Write(f, p)
+
+	// Read the page back out.
+	f2 := newTestFreelist()
+	Serializer{}.Read(f2, p)
+
+	// Ensure that the freelist is correct.
+	// All pages should be present and sorted in ascending order.
+	if exp := common.Pgids([]common.Pgid{3, 11, 12, 28, 39}); !reflect.DeepEqual(exp, f2.FreePageIds()) {
+		t.Fatalf("exp=%v; got=%v", exp, f2.FreePageIds())
+	}
+}
diff --git a/internal/freelist/shared.go b/internal/freelist/shared.go
new file mode 100644
index 000000000..d2e58d7b9
--- /dev/null
+++ b/internal/freelist/shared.go
@@ -0,0 +1,158 @@
+package freelist
+
+import (
+	"fmt"
+
+	"go.etcd.io/bbolt/internal/common"
+)
+
+type spanMerger interface {
+	// mergeSpans merges the given page ids into the freelist
+	mergeSpans(ids common.Pgids)
+}
+
+type txPending struct {
+	ids              []common.Pgid
+	alloctx          []common.Txid // txids allocating the ids
+	lastReleaseBegin common.Txid   // beginning txid of last matching releaseRange
+}
+
+type shared struct {
+	spanMerger
+
+	allocs  map[common.Pgid]common.Txid // mapping of pgid to the Txid that allocated it.
+	cache   map[common.Pgid]struct{}    // fast lookup of all free and pending page ids.
+	pending map[common.Txid]*txPending  // mapping of soon-to-be free page ids by tx.
+}
+
+func (t *shared) PendingPageIds() map[common.Txid]*txPending {
+	return t.pending
+}
+
+func (t *shared) PendingCount() int {
+	var count int
+	for _, txp := range t.pending {
+		count += len(txp.ids)
+	}
+	return count
+}
+
+func (t *shared) Freed(pgId common.Pgid) bool {
+	_, ok := t.cache[pgId]
+	return ok
+}
+
+func (t *shared) Free(txid common.Txid, p *common.Page) {
+	if p.Id() <= 1 {
+		panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.Id()))
+	}
+
+	// Free page and all its overflow pages.
+	txp := t.pending[txid]
+	if txp == nil {
+		txp = &txPending{}
+		t.pending[txid] = txp
+	}
+	allocTxid, ok := t.allocs[p.Id()]
+	if ok {
+		delete(t.allocs, p.Id())
+	} else if p.IsFreelistPage() {
+		// Freelist is always allocated by the prior tx.
+		allocTxid = txid - 1
+	}
+
+	for id := p.Id(); id <= p.Id()+common.Pgid(p.Overflow()); id++ {
+		// Verify that page is not already free.
+		if _, ok := t.cache[id]; ok {
+			panic(fmt.Sprintf("page %d already freed", id))
+		}
+		// Add to the freelist and cache.
+		txp.ids = append(txp.ids, id)
+		txp.alloctx = append(txp.alloctx, allocTxid)
+		t.cache[id] = struct{}{}
+	}
+}
+
+func (t *shared) Rollback(txid common.Txid) {
+	// Remove page ids from cache.
+	txp := t.pending[txid]
+	if txp == nil {
+		return
+	}
+	var m common.Pgids
+	for i, pgid := range txp.ids {
+		delete(t.cache, pgid)
+		tx := txp.alloctx[i]
+		if tx == 0 {
+			continue
+		}
+		if tx != txid {
+			// Pending free aborted; restore page back to alloc list.
+			t.allocs[pgid] = tx
+		} else {
+			// Freed page was allocated by this txn; OK to throw away.
+			m = append(m, pgid)
+		}
+	}
+	// Remove pages from pending list and mark as free if allocated by txid.
+	delete(t.pending, txid)
+	t.spanMerger.mergeSpans(m)
+}
+
+func (t *shared) Release(txid common.Txid) {
+	m := make(common.Pgids, 0)
+	for tid, txp := range t.pending {
+		if tid <= txid {
+			// Move transaction's pending pages to the available freelist.
+			// Don't remove from the cache since the page is still free.
+			m = append(m, txp.ids...)
+			delete(t.pending, tid)
+		}
+	}
+	t.mergeSpans(m)
+}
+
+func (t *shared) ReleaseRange(begin, end common.Txid) {
+	if begin > end {
+		return
+	}
+	var m common.Pgids
+	for tid, txp := range t.pending {
+		if tid < begin || tid > end {
+			continue
+		}
+		// Don't recompute freed pages if ranges haven't updated.
+ if txp.lastReleaseBegin == begin { + continue + } + for i := 0; i < len(txp.ids); i++ { + if atx := txp.alloctx[i]; atx < begin || atx > end { + continue + } + m = append(m, txp.ids[i]) + txp.ids[i] = txp.ids[len(txp.ids)-1] + txp.ids = txp.ids[:len(txp.ids)-1] + txp.alloctx[i] = txp.alloctx[len(txp.alloctx)-1] + txp.alloctx = txp.alloctx[:len(txp.alloctx)-1] + i-- + } + txp.lastReleaseBegin = begin + if len(txp.ids) == 0 { + delete(t.pending, tid) + } + } + t.mergeSpans(m) +} + +// reindex rebuilds the free cache based on available and pending free lists. +func (t *shared) reindex(free common.Pgids, pending map[common.Txid]*txPending) { + t.cache = make(map[common.Pgid]struct{}, len(free)) + for _, id := range free { + t.cache[id] = struct{}{} + } + for _, txp := range pending { + for _, pendingID := range txp.ids { + t.cache[pendingID] = struct{}{} + } + } +} diff --git a/node.go b/node.go index fe67c3c89..022b1001e 100644 --- a/node.go +++ b/node.go @@ -316,7 +316,7 @@ func (n *node) spill() error { for _, node := range nodes { // Add node's page to the freelist if it's not new. if node.pgid > 0 { - tx.db.freelist.free(tx.meta.Txid(), tx.page(node.pgid)) + tx.db.freelist.Free(tx.meta.Txid(), tx.page(node.pgid)) node.pgid = 0 } @@ -493,7 +493,7 @@ func (n *node) dereference() { // free adds the node's underlying page to the freelist. func (n *node) free() { if n.pgid != 0 { - n.bucket.tx.db.freelist.free(n.bucket.tx.meta.Txid(), n.bucket.tx.page(n.pgid)) + n.bucket.tx.db.freelist.Free(n.bucket.tx.meta.Txid(), n.bucket.tx.page(n.pgid)) n.pgid = 0 } } diff --git a/tx.go b/tx.go index 011e2c382..27c38753c 100644 --- a/tx.go +++ b/tx.go @@ -12,6 +12,8 @@ import ( "time" "unsafe" + fl "go.etcd.io/bbolt/internal/freelist" + berrors "go.etcd.io/bbolt/errors" "go.etcd.io/bbolt/internal/common" ) @@ -213,7 +215,7 @@ func (tx *Tx) Commit() (err error) { // Free the old freelist because commit writes out a fresh freelist. if tx.meta.Freelist() != common.PgidNoFreelist { - tx.db.freelist.free(tx.meta.Txid(), tx.db.page(tx.meta.Freelist())) + tx.db.freelist.Free(tx.meta.Txid(), tx.db.page(tx.meta.Freelist())) } if !tx.db.NoFreelistSync { @@ -285,15 +287,13 @@ func (tx *Tx) Commit() (err error) { func (tx *Tx) commitFreelist() error { // Allocate new pages for the new free list. This will overestimate // the size of the freelist but not underestimate the size (which would be bad). - p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1) + p, err := tx.allocate((tx.db.freelistSerializer.EstimatedWritePageSize(tx.db.freelist) / tx.db.pageSize) + 1) if err != nil { tx.rollback() return err } - if err := tx.db.freelist.write(p); err != nil { - tx.rollback() - return err - } + + tx.db.freelistSerializer.Write(tx.db.freelist, p) tx.meta.SetFreelist(p.Id()) return nil @@ -316,7 +316,7 @@ func (tx *Tx) nonPhysicalRollback() { return } if tx.writable { - tx.db.freelist.rollback(tx.meta.Txid()) + tx.db.freelist.Rollback(tx.meta.Txid()) } tx.close() } @@ -327,17 +327,17 @@ func (tx *Tx) rollback() { return } if tx.writable { - tx.db.freelist.rollback(tx.meta.Txid()) + tx.db.freelist.Rollback(tx.meta.Txid()) // When mmap fails, the `data`, `dataref` and `datasz` may be reset to // zero values, and there is no way to reload free page IDs in this case. if tx.db.data != nil { if !tx.db.hasSyncedFreelist() { // Reconstruct free page list by scanning the DB to get the whole free page list. - // Note: scaning the whole db is heavy if your db size is large in NoSyncFreeList mode. 
- tx.db.freelist.noSyncReload(tx.db.freepages()) + // Note: scanning the whole db is heavy if your db size is large in NoSyncFreeList mode. + fl.NoSyncReload(tx.db.freelist, tx.db.freepages()) } else { // Read free page list from freelist page. - tx.db.freelist.reload(tx.db.page(tx.db.meta().Freelist())) + fl.Reload(tx.db.freelistSerializer, tx.db.freelist, tx.db.page(tx.db.meta().Freelist())) } } } @@ -350,9 +350,9 @@ func (tx *Tx) close() { } if tx.writable { // Grab freelist stats. - var freelistFreeN = tx.db.freelist.free_count() - var freelistPendingN = tx.db.freelist.pending_count() - var freelistAlloc = tx.db.freelist.size() + var freelistFreeN = tx.db.freelist.FreeCount() + var freelistPendingN = tx.db.freelist.PendingCount() + var freelistAlloc = tx.db.freelistSerializer.EstimatedWritePageSize(tx.db.freelist) // Remove transaction ref & writer lock. tx.db.rwtx = nil @@ -639,7 +639,7 @@ func (tx *Tx) Page(id int) (*common.PageInfo, error) { } // Determine the type (or if it's free). - if tx.db.freelist.freed(common.Pgid(id)) { + if tx.db.freelist.Freed(common.Pgid(id)) { info.Type = "free" } else { info.Type = p.Typ() diff --git a/tx_check.go b/tx_check.go index 4e3c41ae4..3c5d3cec6 100644 --- a/tx_check.go +++ b/tx_check.go @@ -4,6 +4,8 @@ import ( "encoding/hex" "fmt" + "go.etcd.io/bbolt/internal/freelist" + "go.etcd.io/bbolt/internal/common" ) @@ -41,8 +43,8 @@ func (tx *Tx) check(cfg checkConfig, ch chan error) { // Check if any pages are double freed. freed := make(map[common.Pgid]bool) - all := make([]common.Pgid, tx.db.freelist.count()) - tx.db.freelist.copyall(all) + all := make([]common.Pgid, tx.db.freelist.Count()) + freelist.Copyall(tx.db.freelist, all) for _, id := range all { if freed[id] { ch <- fmt.Errorf("page %d: already freed", id)
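For context when reviewing, here is a minimal usage sketch of the interface introduced above; it is not part of the patch. It assumes the snippet compiles inside the internal freelist package (the package is internal, so it is not importable from outside the module), and the helper name demoHashMapRoundTrip is invented for illustration. It only exercises calls that appear in this diff: NewHashMap, Init, Allocate, FreeCount, the Serializer round trip, and FreePageIds.

package freelist

import (
	"fmt"
	"unsafe"

	"go.etcd.io/bbolt/internal/common"
)

// demoHashMapRoundTrip builds a hashmap-backed freelist from sorted free page
// ids, allocates a contiguous span, and round-trips the remaining free pages
// through an in-memory freelist page via the Serializer.
func demoHashMapRoundTrip() {
	f := NewHashMap()
	// Free spans here are [3..7], [9] and [12..13]; ids must be sorted.
	f.Init([]common.Pgid{3, 4, 5, 6, 7, 9, 12, 13})

	// Allocate a contiguous run of 3 pages on behalf of txid 1.
	start := f.Allocate(1, 3)
	fmt.Println("allocated at", start, "free pages left:", f.FreeCount())

	// Serialize the remaining free pages into a page-sized buffer...
	var buf [4096]byte
	p := (*common.Page)(unsafe.Pointer(&buf[0]))
	Serializer{}.Write(f, p)

	// ...and read them back into a fresh freelist instance.
	f2 := NewHashMap()
	Serializer{}.Read(f2, p)
	fmt.Println("free page ids after round trip:", f2.FreePageIds())
}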