diff --git a/tests/framework/e2e/etcd_process.go b/tests/framework/e2e/etcd_process.go index d409d9eb5ccd..25d214c13beb 100644 --- a/tests/framework/e2e/etcd_process.go +++ b/tests/framework/e2e/etcd_process.go @@ -17,6 +17,7 @@ package e2e import ( "bytes" "context" + "errors" "fmt" "io" "net/http" @@ -338,7 +339,15 @@ type BinaryFailpoints struct { availableCache map[string]struct{} } -func (f *BinaryFailpoints) Setup(ctx context.Context, failpoint, payload string) error { +func (f *BinaryFailpoints) SetupEnv(failpoint, payload string) error { + if f.member.IsRunning() { + return errors.New("cannot setup environment variable while process is running") + } + f.member.Config().EnvVars["GOFAIL_FAILPOINTS"] = fmt.Sprintf("%s=%s", failpoint, payload) + return nil +} + +func (f *BinaryFailpoints) SetupHTTP(ctx context.Context, failpoint, payload string) error { host := fmt.Sprintf("127.0.0.1:%d", f.member.Config().GoFailPort) failpointUrl := url.URL{ Scheme: "http", diff --git a/tests/robustness/failpoints.go b/tests/robustness/failpoints.go index 48529dca5444..0546d185c5e5 100644 --- a/tests/robustness/failpoints.go +++ b/tests/robustness/failpoints.go @@ -66,6 +66,7 @@ var ( RaftAfterWALReleasePanic Failpoint = goPanicFailpoint{"raftAfterWALRelease", triggerBlackhole{waitTillSnapshot: true}, Follower} RaftBeforeSaveSnapPanic Failpoint = goPanicFailpoint{"raftBeforeSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower} RaftAfterSaveSnapPanic Failpoint = goPanicFailpoint{"raftAfterSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower} + RaftBeforeSaveSleep Failpoint = killAndGofailSleep{"raftBeforeSave", time.Millisecond * 200} allFailpoints = []Failpoint{ KillFailpoint, BeforeCommitPanic, AfterCommitPanic, RaftBeforeSavePanic, RaftAfterSavePanic, DefragBeforeCopyPanic, DefragBeforeRenamePanic, BackendBeforePreCommitHookPanic, BackendAfterPreCommitHookPanic, @@ -75,6 +76,7 @@ var ( CompactAfterCommitBatchPanic, RaftBeforeLeaderSendPanic, BlackholePeerNetwork, DelayPeerNetwork, RaftBeforeFollowerSendPanic, RaftBeforeApplySnapPanic, RaftAfterApplySnapPanic, RaftAfterWALReleasePanic, RaftBeforeSaveSnapPanic, RaftAfterSaveSnapPanic, BlackholeUntilSnapshot, + RaftBeforeSaveSleep, } ) @@ -252,7 +254,7 @@ func (f goPanicFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logg default: } lg.Info("Setting up gofailpoint", zap.String("failpoint", f.Name())) - err := member.Failpoints().Setup(ctx, f.failpoint, "panic") + err := member.Failpoints().SetupHTTP(ctx, f.failpoint, "panic") if err != nil { lg.Info("goFailpoint setup failed", zap.String("failpoint", f.Name()), zap.Error(err)) continue @@ -318,7 +320,7 @@ func (f goPanicFailpoint) Available(config e2e.EtcdProcessClusterConfig, member } func (f goPanicFailpoint) Name() string { - return f.failpoint + return fmt.Sprintf("%s=panic()", f.failpoint) } type triggerDefrag struct{} @@ -527,3 +529,48 @@ func (f delayPeerNetworkFailpoint) Name() string { func (f delayPeerNetworkFailpoint) Available(config e2e.EtcdProcessClusterConfig, clus e2e.EtcdProcess) bool { return config.ClusterSize > 1 && clus.PeerProxy() != nil } + +type killAndGofailSleep struct { + failpoint string + time time.Duration +} + +func (f killAndGofailSleep) Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error { + member := clus.Procs[rand.Int()%len(clus.Procs)] + for member.IsRunning() { + err := member.Kill() + if err != nil { + lg.Info("Sending kill signal failed", zap.Error(err)) + } + err = member.Wait(ctx) + if err != nil && !strings.Contains(err.Error(), "unexpected exit code") { + lg.Info("Failed to kill the process", zap.Error(err)) + return fmt.Errorf("failed to kill the process within %s, err: %w", triggerTimeout, err) + } + } + lg.Info("Setting up goFailpoint", zap.String("failpoint", f.Name())) + err := member.Failpoints().SetupEnv(f.failpoint, fmt.Sprintf(`sleep(%q)`, f.time)) + if err != nil { + return err + } + err = member.Start(ctx) + if err != nil { + return err + } + // TODO: Check gofail status (https://github.com/etcd-io/gofail/pull/47) and wait for sleep to beis executed at least once. + return nil +} + +func (f killAndGofailSleep) Name() string { + return fmt.Sprintf("%s=sleep(%s)", f.failpoint, f.time) +} + +func (f killAndGofailSleep) Available(config e2e.EtcdProcessClusterConfig, member e2e.EtcdProcess) bool { + memberFailpoints := member.Failpoints() + if memberFailpoints == nil { + return false + } + available := memberFailpoints.Available() + _, found := available[f.failpoint] + return found +}