diff --git a/changelog/21.0/21.0.0/release_notes.md b/changelog/21.0/21.0.0/release_notes.md
index 956d5c3ee1c..6fe295fbb3b 100644
--- a/changelog/21.0/21.0.0/release_notes.md
+++ b/changelog/21.0/21.0.0/release_notes.md
@@ -3,8 +3,8 @@
### Table of Contents
-- **[Known Issue](#known-issues)**
- - **[Backup reports itself as successful despite failures](#backup-reports-as-successful)**
+- **[Known Issues](#known-issues)**
+ - **[Backup reports itself as successful despite failures](#backup-reports-as-successful)**
- **[Major Changes](#major-changes)**
- **[Deprecations and Deletions](#deprecations-and-deletions)**
- [Deprecated VTTablet Flags](#vttablet-flags)
@@ -28,14 +28,14 @@
- **[vtctldclient ChangeTabletTags](#vtctldclient-changetablettags)**
- **[Support for specifying expected primary in reparents](#reparents-expectedprimary)**
-## Known Issue
+## Known Issues
### Backup reports itself as successful despite failures
-In this release, we identified an issue where a backup may succeed even if a file fails to be backed up.
-Leading to a successful backup, even if some errors occurred.
-This only happen with the Builtin Backup Engine, and when all files have already been initiated in the backup process.
-For more details, please refer to the related GitHub Issue https://github.com/vitessio/vitess/issues/17063.
+In this release, we have identified an issue where a backup may succeed even if one of the underlying files fails to be backed up.
+The underlying errors are ignored and the backup action reports success.
+This issue exists only with the `builtin` backup engine, and it can occur only when the engine has already started backing up all files.
+Please refer to https://github.com/vitessio/vitess/issues/17063 for more details.
## Major Changes
diff --git a/changelog/21.0/21.0.0/summary.md b/changelog/21.0/21.0.0/summary.md
index fcac56180a9..512aa45a12f 100644
--- a/changelog/21.0/21.0.0/summary.md
+++ b/changelog/21.0/21.0.0/summary.md
@@ -2,7 +2,7 @@
### Table of Contents
-- **[Known Issue](#known-issues)**
+- **[Known Issues](#known-issues)**
- **[Backup reports itself as successful despite failures](#backup-reports-as-successful)**
- **[Major Changes](#major-changes)**
- **[Deprecations and Deletions](#deprecations-and-deletions)**
@@ -27,14 +27,14 @@
- **[vtctldclient ChangeTabletTags](#vtctldclient-changetablettags)**
- **[Support for specifying expected primary in reparents](#reparents-expectedprimary)**
-## Known Issue
+## Known Issues
### Backup reports itself as successful despite failures
-In this release, we identified an issue where a backup may succeed even if a file fails to be backed up.
-Leading to a successful backup, even if some errors occurred.
-This only happen with the Builtin Backup Engine, and when all files have already been initiated in the backup process.
-For more details, please refer to the related GitHub Issue https://github.com/vitessio/vitess/issues/17063.
+In this release, we have identified an issue where a backup may succeed even if one of the underlying files fails to be backed up.
+The underlying errors are ignored and the backup action reports success.
+This issue exists only with the `builtin` backup engine, and it can occur only when the engine has already started backing up all files.
+Please refer to https://github.com/vitessio/vitess/issues/17063 for more details.
## Major Changes
diff --git a/go/vt/mysqlctl/backup_blackbox_race_test.go b/go/vt/mysqlctl/backup_blackbox_race_test.go
new file mode 100644
index 00000000000..5414ebc5fa6
--- /dev/null
+++ b/go/vt/mysqlctl/backup_blackbox_race_test.go
@@ -0,0 +1,152 @@
+//go:build !race
+
+/*
+Copyright 2024 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package mysqlctl_test is the blackbox tests for package mysqlctl.
+package mysqlctl_test
+
+import (
+ "fmt"
+ "os"
+ "path"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+
+ "vitess.io/vitess/go/mysql"
+ "vitess.io/vitess/go/mysql/fakesqldb"
+ "vitess.io/vitess/go/test/utils"
+ "vitess.io/vitess/go/vt/logutil"
+ "vitess.io/vitess/go/vt/mysqlctl"
+ "vitess.io/vitess/go/vt/mysqlctl/backupstats"
+ "vitess.io/vitess/go/vt/mysqlctl/filebackupstorage"
+ "vitess.io/vitess/go/vt/proto/topodata"
+ "vitess.io/vitess/go/vt/proto/vttime"
+ "vitess.io/vitess/go/vt/topo"
+ "vitess.io/vitess/go/vt/topo/memorytopo"
+)
+
+// This test triggers a certain code path that only happens when a backup file fails to be backed up,
+// only and only if, all the other backup files have either started or finished. When we reach
+// this scenario, files no longer try to acquire the semaphore and thus the backup cannot fail
+// because of context deadline when acquiring it. At this point, the only place where the backup
+// can fail, is if the return of be.backupFiles fails, and we record the error correctly.
+// This test specifically test this scenario and arose because of issue https://github.com/vitessio/vitess/issues/17063
+// The test does:
+// 1. Create the backup and data directory
+// 2. Create a keyspace and shard
+// 3. Already create the last backup file that would be created
+// 4. Remove all permissions on this file
+// 5. Execute the restore
+// 6. The restore must fail due to an error on file number 3 ("cannot add file: 3")
+//
+// This test is extracted into its own file that won't be run if we do 'go test -race' as this test
+// exposes an old race condition that will be fixed after https://github.com/vitessio/vitess/pull/17062
+// Link to the race condition issue: https://github.com/vitessio/vitess/issues/17065
+func TestExecuteBackupWithFailureOnLastFile(t *testing.T) {
+ ctx := utils.LeakCheckContext(t)
+
+ // Set up local backup directory
+ id := fmt.Sprintf("%d", time.Now().UnixNano())
+ backupRoot := fmt.Sprintf("testdata/builtinbackup_test_%s", id)
+ filebackupstorage.FileBackupStorageRoot = backupRoot
+ require.NoError(t, createBackupDir(backupRoot, "innodb", "log", "datadir"))
+ dataDir := path.Join(backupRoot, "datadir")
+ // Add some files under data directory to force backup to execute semaphore acquire inside
+ // backupFiles() method (https://github.com/vitessio/vitess/blob/main/go/vt/mysqlctl/builtinbackupengine.go#L483).
+ require.NoError(t, createBackupDir(dataDir, "test1"))
+ require.NoError(t, createBackupDir(dataDir, "test2"))
+ require.NoError(t, createBackupFiles(path.Join(dataDir, "test1"), 2, "ibd"))
+ require.NoError(t, createBackupFiles(path.Join(dataDir, "test2"), 2, "ibd"))
+ defer os.RemoveAll(backupRoot)
+
+ needIt, err := needInnoDBRedoLogSubdir()
+ require.NoError(t, err)
+ if needIt {
+ fpath := path.Join("log", mysql.DynamicRedoLogSubdir)
+ if err := createBackupDir(backupRoot, fpath); err != nil {
+ require.Failf(t, err.Error(), "failed to create directory: %s", fpath)
+ }
+ }
+
+ // Set up topo
+ keyspace, shard := "mykeyspace", "-"
+ ts := memorytopo.NewServer(ctx, "cell1")
+ defer ts.Close()
+
+ require.NoError(t, ts.CreateKeyspace(ctx, keyspace, &topodata.Keyspace{}))
+ require.NoError(t, ts.CreateShard(ctx, keyspace, shard))
+
+ tablet := topo.NewTablet(100, "cell1", "mykeyspace-00-80-0100")
+ tablet.Keyspace = keyspace
+ tablet.Shard = shard
+
+ require.NoError(t, ts.CreateTablet(ctx, tablet))
+
+ _, err = ts.UpdateShardFields(ctx, keyspace, shard, func(si *topo.ShardInfo) error {
+ si.PrimaryAlias = &topodata.TabletAlias{Uid: 100, Cell: "cell1"}
+
+ now := time.Now()
+ si.PrimaryTermStartTime = &vttime.Time{Seconds: int64(now.Second()), Nanoseconds: int32(now.Nanosecond())}
+
+ return nil
+ })
+
+ require.NoError(t, err)
+
+ be := &mysqlctl.BuiltinBackupEngine{}
+ bh := filebackupstorage.NewBackupHandle(nil, "", "", false)
+ // Spin up a fake daemon to be used in backups. It needs to be allowed to receive:
+ // "STOP REPLICA", "START REPLICA", in that order.
+ fakedb := fakesqldb.New(t)
+ defer fakedb.Close()
+ mysqld := mysqlctl.NewFakeMysqlDaemon(fakedb)
+ defer mysqld.Close()
+ mysqld.ExpectedExecuteSuperQueryList = []string{"STOP REPLICA", "START REPLICA"}
+
+ // With this setup, 4 backup files will be created (0, 1, 2, 3). For the last file (3), we create
+ // it in advance and remove all permission on the file so that the backup be.ExecuteBackup will not
+ // be able to override the file and thus will fail. Triggering the error mechanism after calling be.backupFile.
+ lastBackupFile := path.Join(backupRoot, "3")
+ f, err := os.Create(lastBackupFile)
+ require.NoError(t, err)
+ _, err = f.Write(make([]byte, 1024))
+ require.NoError(t, err)
+ require.NoError(t, f.Chmod(0444))
+ require.NoError(t, f.Close())
+
+ backupResult, err := be.ExecuteBackup(ctx, mysqlctl.BackupParams{
+ Logger: logutil.NewConsoleLogger(),
+ Mysqld: mysqld,
+ Cnf: &mysqlctl.Mycnf{
+ InnodbDataHomeDir: path.Join(backupRoot, "innodb"),
+ InnodbLogGroupHomeDir: path.Join(backupRoot, "log"),
+ DataDir: path.Join(backupRoot, "datadir"),
+ },
+ Stats: backupstats.NewFakeStats(),
+ Concurrency: 4,
+ HookExtraEnv: map[string]string{},
+ TopoServer: ts,
+ Keyspace: keyspace,
+ Shard: shard,
+ MysqlShutdownTimeout: mysqlShutdownTimeout,
+ }, bh)
+
+ require.ErrorContains(t, err, "cannot add file: 3")
+ require.Equal(t, mysqlctl.BackupUnusable, backupResult)
+}
diff --git a/go/vt/mysqlctl/builtinbackupengine.go b/go/vt/mysqlctl/builtinbackupengine.go
index 9c9403c948c..f3cbe5364a0 100644
--- a/go/vt/mysqlctl/builtinbackupengine.go
+++ b/go/vt/mysqlctl/builtinbackupengine.go
@@ -643,9 +643,8 @@ func (be *BuiltinBackupEngine) backupFiles(
// Backup the individual file.
name := fmt.Sprintf("%v", i)
- err := be.backupFile(ctxCancel, params, bh, fe, name)
- if err != nil {
- bh.RecordError(acqErr)
+ if err := be.backupFile(ctxCancel, params, bh, fe, name); err != nil {
+ bh.RecordError(err)
cancel()
}
}(i)