Skip to content

Commit

Permalink
Don't evict and re-add during initial filecache scan
Browse files Browse the repository at this point in the history
  • Loading branch information
bduffany committed Nov 20, 2024
1 parent 80f7e09 commit a113d30
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 79 deletions.
3 changes: 1 addition & 2 deletions enterprise/server/remote_execution/filecache/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,15 @@ go_test(
deps = [
":filecache",
"//proto:remote_execution_go_proto",
"//server/interfaces",
"//server/metrics",
"//server/remote_cache/digest",
"//server/testutil/testfs",
"//server/testutil/testmetrics",
"//server/util/claims",
"//server/util/hash",
"//server/util/log",
"//server/util/status",
"//server/util/testing/flags",
"@com_github_google_uuid//:uuid",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
"@org_golang_x_sync//errgroup",
Expand Down
46 changes: 24 additions & 22 deletions enterprise/server/remote_execution/filecache/filecache.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,7 @@ func (c *fileCache) nodeFromPathAndSize(fullPath string, sizeBytes int64) (strin
func (c *fileCache) scanDir() {
dirCount := 0
fileCount := 0
errCount := 0
scanStart := time.Now()
var addErr error
walkFn := func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
Expand All @@ -185,16 +183,11 @@ func (c *fileCache) scanDir() {
return err
}
if err := c.addFileToGroup(groupID, node, path); err != nil {
// Ignore NotFound - it's fine if files disappear while we're
// scanning.
if status.IsNotFoundError(err) {
return nil
}
// Record other errors to be logged at the end - but continue the
// scan.
errCount++
addErr = err
return nil
// Any errors here are unexpected - this addFileToGroup call should
// just be updating LRU state. There is a chance that it will
// trigger an eviction, but any error from the associated unlink
// operation is just logged and not returned.
return status.WrapError(err, "add file from initial scan")
}
return nil
}
Expand Down Expand Up @@ -223,9 +216,6 @@ func (c *fileCache) scanDir() {
c.lock.Unlock()

log.Infof("filecache(%q) scanned %d dirs, %d files in %s. Total tracked bytes: %d", c.rootDir, dirCount, fileCount, time.Since(scanStart), lruSize)
if addErr != nil {
log.Errorf("filecache(%q) failed to add %d files. Last error: %s", c.rootDir, errCount, addErr)
}
close(c.dirScanDone)
}

Expand Down Expand Up @@ -298,15 +288,27 @@ func (c *fileCache) addFileToGroup(groupID string, node *repb.FileNode, existing
c.lock.Lock()
defer c.lock.Unlock()

c.l.Remove(k)

fp := c.filecachePath(k)
if err := disk.EnsureDirectoryExists(filepath.Dir(fp)); err != nil {
return err
}
if err := cloneOrLink(groupID, existingFilePath, fp); err != nil {
return err

// During the initial directory scan (and only during the initial directory
// scan), the source and dest paths will be the same. In this case, make
// sure we don't evict an LRU entry that was added by some action before we
// got to this file in the scan. We'd wind up losing the cache entry in this
// case, because the eviction will unlink the destination file path (via the
// eviction callback), which is also the source file in this case.

if fp != existingFilePath {
c.l.Remove(k)
if err := disk.EnsureDirectoryExists(filepath.Dir(fp)); err != nil {
return err
}
if err := cloneOrLink(groupID, existingFilePath, fp); err != nil {
return err
}
} else if c.l.Contains(k) {
return nil
}

e := &entry{
addedAtUsec: time.Now().UnixMicro(),
sizeBytes: sizeOnDisk,
Expand Down
76 changes: 21 additions & 55 deletions enterprise/server/remote_execution/filecache/filecache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,15 @@ import (
"os"
"path/filepath"
"testing"
"time"

"github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/filecache"
"github.com/buildbuddy-io/buildbuddy/server/interfaces"
"github.com/buildbuddy-io/buildbuddy/server/metrics"
"github.com/buildbuddy-io/buildbuddy/server/remote_cache/digest"
"github.com/buildbuddy-io/buildbuddy/server/testutil/testfs"
"github.com/buildbuddy-io/buildbuddy/server/testutil/testmetrics"
"github.com/buildbuddy-io/buildbuddy/server/util/claims"
"github.com/buildbuddy-io/buildbuddy/server/util/hash"
"github.com/buildbuddy-io/buildbuddy/server/util/log"
"github.com/buildbuddy-io/buildbuddy/server/util/status"
"github.com/buildbuddy-io/buildbuddy/server/util/testing/flags"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -315,75 +312,44 @@ func TestFileCacheEvictionAfterStartupScan(t *testing.T) {
}
}

func TestScanWithConcurrentRemove(t *testing.T) {
for i := 0; i < 100; i++ {
func TestScanWithConcurrentAdd(t *testing.T) {
for trial := 0; trial < 100; trial++ {
ctx := context.Background()
filecacheRoot := testfs.MakeTempDir(t)

const n = 100
var nodes [n]*repb.FileNode
var nodeContents [n]string
for i := 0; i < n; i++ {
name := fmt.Sprint(i)
nodes[i] = nodeFromString(name, false)
writeFileContent(t, filecacheRoot, "ANON/"+nodes[i].GetDigest().GetHash(), name, false)
nodeContents[i] = name
}

i := rand.Intn(n)
pathToDelete := filepath.Join(filecacheRoot, "ANON/"+nodes[i].GetDigest().GetHash())

var fc interfaces.FileCache
var eg errgroup.Group
eg.Go(func() error {
time.Sleep(time.Duration(rand.Intn(100)) * time.Microsecond)
var err error
fc, err = filecache.NewFileCache(filecacheRoot, 10_000_000, false)
if err != nil {
return err
}
fc.WaitForDirectoryScanToComplete()
return nil
})
// While the directory scan is in progress, delete a random file.
eg.Go(func() error {
time.Sleep(time.Duration(rand.Intn(100)) * time.Microsecond)
return os.Remove(pathToDelete)
})
err := eg.Wait()
nodeToReAdd := nodes[i]

nodeToReAddPath := filepath.Join(testfs.MakeTempDir(t), "action.output")
err := os.WriteFile(nodeToReAddPath, []byte(nodeContents[i]), 0644)
require.NoError(t, err)

// The directory scan should be resilient to this deletion - linking any
// other random file should work.
j := (i + 1 + rand.Intn(n-1)) % n
ok := fc.FastLinkFile(ctx, nodes[j], filepath.Join(fc.TempDir(), "out"))
require.True(t, ok)
}
}
fc, err := filecache.NewFileCache(filecacheRoot, 10_000_000, false)
require.NoError(t, err)

func TestAddWithConcurrentRemove(t *testing.T) {
ctx := context.Background()
filecacheRoot := testfs.MakeTempDir(t)
fc, err := filecache.NewFileCache(filecacheRoot, 10_000_000, false)
require.NoError(t, err)
fc.WaitForDirectoryScanToComplete()
tempDir := fc.TempDir()
// While the directory scan is in progress, re-add a random file
// to trigger a race.
err = fc.AddFile(ctx, nodeToReAdd, nodeToReAddPath)
require.NoError(t, err)

for i := 0; i < 100; i++ {
name := fmt.Sprint(i)
node := nodeFromString(name, false)
path := filepath.Join(tempDir, name)
writeFileContent(t, tempDir, name, name, false)
fc.WaitForDirectoryScanToComplete()

var eg errgroup.Group
eg.Go(func() error {
time.Sleep(time.Duration(rand.Intn(5)) * time.Microsecond)
return os.Remove(path)
})
eg.Go(func() error {
time.Sleep(time.Duration(rand.Intn(5)) * time.Microsecond)
return fc.AddFile(ctx, node, path)
})
err := eg.Wait()
require.True(t, err == nil || status.IsNotFoundError(err), "expected NotFound or nil error, got %v", err)
// The directory scan should be resilient to this race condition -
// linking any file should work.
for i := 0; i < n; i++ {
ok := fc.FastLinkFile(ctx, nodes[i], filepath.Join(fc.TempDir(), fmt.Sprintf("out-%d", i)))
require.True(t, ok, "link node %d (test trial %d)", i, trial)
}
}
}

Expand Down

0 comments on commit a113d30

Please sign in to comment.