diff --git a/enterprise/server/remote_execution/filecache/BUILD b/enterprise/server/remote_execution/filecache/BUILD
index 4893fb411cb..9a22a598825 100644
--- a/enterprise/server/remote_execution/filecache/BUILD
+++ b/enterprise/server/remote_execution/filecache/BUILD
@@ -32,7 +32,6 @@ go_test(
     deps = [
         ":filecache",
         "//proto:remote_execution_go_proto",
-        "//server/interfaces",
         "//server/metrics",
         "//server/remote_cache/digest",
         "//server/testutil/testfs",
@@ -40,8 +39,8 @@ go_test(
         "//server/util/claims",
         "//server/util/hash",
         "//server/util/log",
-        "//server/util/status",
         "//server/util/testing/flags",
+        "@com_github_google_uuid//:uuid",
         "@com_github_stretchr_testify//assert",
         "@com_github_stretchr_testify//require",
         "@org_golang_x_sync//errgroup",
diff --git a/enterprise/server/remote_execution/filecache/filecache.go b/enterprise/server/remote_execution/filecache/filecache.go
index 265d2c373cc..b333d6c3a78 100644
--- a/enterprise/server/remote_execution/filecache/filecache.go
+++ b/enterprise/server/remote_execution/filecache/filecache.go
@@ -166,9 +166,7 @@ func (c *fileCache) nodeFromPathAndSize(fullPath string, sizeBytes int64) (strin
 func (c *fileCache) scanDir() {
     dirCount := 0
     fileCount := 0
-    errCount := 0
     scanStart := time.Now()
-    var addErr error
     walkFn := func(path string, d fs.DirEntry, err error) error {
         if err != nil {
             return err
@@ -185,16 +183,11 @@ func (c *fileCache) scanDir() {
             return err
         }
         if err := c.addFileToGroup(groupID, node, path); err != nil {
-            // Ignore NotFound - it's fine if files disappear while we're
-            // scanning.
-            if status.IsNotFoundError(err) {
-                return nil
-            }
-            // Record other errors to be logged at the end - but continue the
-            // scan.
-            errCount++
-            addErr = err
-            return nil
+            // Any errors here are unexpected - this addFileToGroup call should
+            // just be updating LRU state. There is a chance that it will
+            // trigger an eviction, but any error from the associated unlink
+            // should just be logged.
+            return status.WrapError(err, "add file from initial scan")
         }
         return nil
     }
@@ -223,9 +216,6 @@ func (c *fileCache) scanDir() {
     c.lock.Unlock()
     log.Infof("filecache(%q) scanned %d dirs, %d files in %s. Total tracked bytes: %d", c.rootDir, dirCount, fileCount, time.Since(scanStart), lruSize)
-    if addErr != nil {
-        log.Errorf("filecache(%q) failed to add %d files. Last error: %s", c.rootDir, errCount, addErr)
-    }
     close(c.dirScanDone)
 }
@@ -298,15 +288,27 @@ func (c *fileCache) addFileToGroup(groupID string, node *repb.FileNode, existingFilePath string) error {
     c.lock.Lock()
     defer c.lock.Unlock()
-    c.l.Remove(k)
-    fp := c.filecachePath(k)
-    if err := disk.EnsureDirectoryExists(filepath.Dir(fp)); err != nil {
-        return err
-    }
-    if err := cloneOrLink(groupID, existingFilePath, fp); err != nil {
-        return err
+
+    // During the initial directory scan (and only during the initial directory
+    // scan), the source and dest paths will be the same. In this case, make
+    // sure we don't evict an LRU entry that was added by some action before we
+    // got to this file in the scan. We'd wind up losing the cache entry in this
+    // case, because the eviction will unlink the destination file path (via the
+    // eviction callback), which is also the source file in this case.
+    fp := c.filecachePath(k)
+    if fp != existingFilePath {
+        c.l.Remove(k)
+        if err := disk.EnsureDirectoryExists(filepath.Dir(fp)); err != nil {
+            return err
+        }
+        if err := cloneOrLink(groupID, existingFilePath, fp); err != nil {
+            return err
+        }
+    } else if c.l.Contains(k) {
+        return nil
     }
+
     e := &entry{
         addedAtUsec: time.Now().UnixMicro(),
         sizeBytes:   sizeOnDisk,
diff --git a/enterprise/server/remote_execution/filecache/filecache_test.go b/enterprise/server/remote_execution/filecache/filecache_test.go
index 6ee00f3de44..56eecdf357c 100644
--- a/enterprise/server/remote_execution/filecache/filecache_test.go
+++ b/enterprise/server/remote_execution/filecache/filecache_test.go
@@ -8,10 +8,8 @@ import (
     "os"
     "path/filepath"
     "testing"
-    "time"
 
     "github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/filecache"
-    "github.com/buildbuddy-io/buildbuddy/server/interfaces"
     "github.com/buildbuddy-io/buildbuddy/server/metrics"
     "github.com/buildbuddy-io/buildbuddy/server/remote_cache/digest"
     "github.com/buildbuddy-io/buildbuddy/server/testutil/testfs"
@@ -19,7 +17,6 @@ import (
     "github.com/buildbuddy-io/buildbuddy/server/util/claims"
     "github.com/buildbuddy-io/buildbuddy/server/util/hash"
     "github.com/buildbuddy-io/buildbuddy/server/util/log"
-    "github.com/buildbuddy-io/buildbuddy/server/util/status"
     "github.com/buildbuddy-io/buildbuddy/server/util/testing/flags"
     "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"
@@ -315,75 +312,44 @@ func TestFileCacheEvictionAfterStartupScan(t *testing.T) {
     }
 }
 
-func TestScanWithConcurrentRemove(t *testing.T) {
-    for i := 0; i < 100; i++ {
+func TestScanWithConcurrentAdd(t *testing.T) {
+    for trial := 0; trial < 100; trial++ {
         ctx := context.Background()
         filecacheRoot := testfs.MakeTempDir(t)
         const n = 100
         var nodes [n]*repb.FileNode
+        var nodeContents [n]string
         for i := 0; i < n; i++ {
             name := fmt.Sprint(i)
             nodes[i] = nodeFromString(name, false)
             writeFileContent(t, filecacheRoot, "ANON/"+nodes[i].GetDigest().GetHash(), name, false)
+            nodeContents[i] = name
         }
         i := rand.Intn(n)
-        pathToDelete := filepath.Join(filecacheRoot, "ANON/"+nodes[i].GetDigest().GetHash())
-
-        var fc interfaces.FileCache
-        var eg errgroup.Group
-        eg.Go(func() error {
-            time.Sleep(time.Duration(rand.Intn(100)) * time.Microsecond)
-            var err error
-            fc, err = filecache.NewFileCache(filecacheRoot, 10_000_000, false)
-            if err != nil {
-                return err
-            }
-            fc.WaitForDirectoryScanToComplete()
-            return nil
-        })
-        // While the directory scan is in progress, delete a random file.
-        eg.Go(func() error {
-            time.Sleep(time.Duration(rand.Intn(100)) * time.Microsecond)
-            return os.Remove(pathToDelete)
-        })
-        err := eg.Wait()
+        nodeToReAdd := nodes[i]
+
+        nodeToReAddPath := filepath.Join(testfs.MakeTempDir(t), "action.output")
+        err := os.WriteFile(nodeToReAddPath, []byte(nodeContents[i]), 0644)
         require.NoError(t, err)
-        // The directory scan should be resilient to this deletion - linking any
-        // other random file should work.
-        j := (i + 1 + rand.Intn(n-1)) % n
-        ok := fc.FastLinkFile(ctx, nodes[j], filepath.Join(fc.TempDir(), "out"))
-        require.True(t, ok)
-    }
-}
+        fc, err := filecache.NewFileCache(filecacheRoot, 10_000_000, false)
+        require.NoError(t, err)
 
-func TestAddWithConcurrentRemove(t *testing.T) {
-    ctx := context.Background()
-    filecacheRoot := testfs.MakeTempDir(t)
-    fc, err := filecache.NewFileCache(filecacheRoot, 10_000_000, false)
-    require.NoError(t, err)
-    fc.WaitForDirectoryScanToComplete()
-    tempDir := fc.TempDir()
+        // While the directory scan is in progress, re-add a random file
+        // to trigger a race.
+        err = fc.AddFile(ctx, nodeToReAdd, nodeToReAddPath)
+        require.NoError(t, err)
 
-    for i := 0; i < 100; i++ {
-        name := fmt.Sprint(i)
-        node := nodeFromString(name, false)
-        path := filepath.Join(tempDir, name)
-        writeFileContent(t, tempDir, name, name, false)
+        fc.WaitForDirectoryScanToComplete()
 
-        var eg errgroup.Group
-        eg.Go(func() error {
-            time.Sleep(time.Duration(rand.Intn(5)) * time.Microsecond)
-            return os.Remove(path)
-        })
-        eg.Go(func() error {
-            time.Sleep(time.Duration(rand.Intn(5)) * time.Microsecond)
-            return fc.AddFile(ctx, node, path)
-        })
-        err := eg.Wait()
-        require.True(t, err == nil || status.IsNotFoundError(err), "expected NotFound or nil error, got %v", err)
+        // The directory scan should be resilient to this race condition -
+        // linking any file should work.
+        for i := 0; i < n; i++ {
+            ok := fc.FastLinkFile(ctx, nodes[i], filepath.Join(fc.TempDir(), fmt.Sprintf("out-%d", i)))
+            require.True(t, ok, "link node %d (test trial %d)", i, trial)
+        }
     }
 }
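A minimal, self-contained sketch of the invariant the new `fp != existingFilePath` guard protects: when the LRU eviction callback unlinks the evicted path, evicting an existing entry whose cache path equals the source path would delete the very file being added. The evictingMap type and addFile helper below are hypothetical stand-ins for illustration only, not the BuildBuddy filecache internals.

// Sketch only: models the startup-scan re-add case under the assumptions above.
package main

import (
    "fmt"
    "os"
    "path/filepath"
)

// evictingMap models an LRU whose eviction callback unlinks the evicted file,
// the way the filecache eviction callback removes files from disk.
type evictingMap struct {
    paths   map[string]string
    onEvict func(path string)
}

func (m *evictingMap) Remove(key string) {
    if p, ok := m.paths[key]; ok {
        delete(m.paths, key)
        m.onEvict(p)
    }
}

func (m *evictingMap) Contains(key string) bool {
    _, ok := m.paths[key]
    return ok
}

func (m *evictingMap) Add(key, path string) {
    m.paths[key] = path
}

// addFile mirrors the patched branch: only evict and re-link when the source
// differs from the cache path. When they are equal (the startup-scan case),
// evicting first would unlink the file we are trying to add.
func addFile(m *evictingMap, key, cachePath, srcPath string) error {
    if cachePath != srcPath {
        m.Remove(key)
        if err := os.Link(srcPath, cachePath); err != nil {
            return err
        }
    } else if m.Contains(key) {
        return nil // already tracked; leave the entry and its file alone
    }
    m.Add(key, cachePath)
    return nil
}

func main() {
    dir, err := os.MkdirTemp("", "filecache-sketch-*")
    if err != nil {
        panic(err)
    }
    defer os.RemoveAll(dir)

    cachePath := filepath.Join(dir, "abc123")
    if err := os.WriteFile(cachePath, []byte("cached"), 0644); err != nil {
        panic(err)
    }

    m := &evictingMap{
        paths:   map[string]string{},
        onEvict: func(p string) { os.Remove(p) },
    }

    // An action adds the file first; the directory scan then encounters the
    // same on-disk file, so source == dest for the second call.
    if err := addFile(m, "abc123", cachePath, cachePath); err != nil {
        panic(err)
    }
    if err := addFile(m, "abc123", cachePath, cachePath); err != nil {
        panic(err)
    }

    if _, err := os.Stat(cachePath); err != nil {
        fmt.Println("BUG: cache file was unlinked:", err)
    } else {
        fmt.Println("cache file survived the re-add:", cachePath)
    }
}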