diff --git a/server/etcdserver/api/membership/cluster.go b/server/etcdserver/api/membership/cluster.go index 6becdfd62ed..1ec42c38b51 100644 --- a/server/etcdserver/api/membership/cluster.go +++ b/server/etcdserver/api/membership/cluster.go @@ -817,8 +817,10 @@ func (c *RaftCluster) SetDowngradeInfo(d *serverversion.DowngradeInfo, shouldApp // IsMemberExist returns if the member with the given id exists in cluster. func (c *RaftCluster) IsMemberExist(id types.ID) bool { c.Lock() - defer c.Unlock() _, ok := c.members[id] + c.Unlock() + + // gofail: var afterIsMemberExist struct{} return ok } diff --git a/server/etcdserver/server.go b/server/etcdserver/server.go index ba3a3f3ffe1..0615d20e8b3 100644 --- a/server/etcdserver/server.go +++ b/server/etcdserver/server.go @@ -1224,7 +1224,8 @@ func (s *EtcdServer) isLeader() bool { // MoveLeader transfers the leader to the given transferee. func (s *EtcdServer) MoveLeader(ctx context.Context, lead, transferee uint64) error { - if !s.cluster.IsMemberExist(types.ID(transferee)) || s.cluster.Member(types.ID(transferee)).IsLearner { + member := s.cluster.Member(types.ID(transferee)) + if member == nil || member.IsLearner { return errors.ErrBadLeaderTransferee } @@ -1593,9 +1594,9 @@ func (s *EtcdServer) mayRemoveMember(id types.ID) error { } lg := s.Logger() - isLearner := s.cluster.IsMemberExist(id) && s.cluster.Member(id).IsLearner + member := s.cluster.Member(id) // no need to check quorum when removing non-voting member - if isLearner { + if member != nil && member.IsLearner { return nil } diff --git a/tests/integration/cluster_test.go b/tests/integration/cluster_test.go index 29f8ae8dd5d..74bf3098582 100644 --- a/tests/integration/cluster_test.go +++ b/tests/integration/cluster_test.go @@ -518,3 +518,51 @@ func TestSpeedyTerminate(t *testing.T) { case <-donec: } } + +// TestConcurrentRemoveMember demonstrated a panic in mayRemoveMember with +// concurrent calls to MemberRemove. To reliably reproduce the panic, a delay +// needed to be injected in IsMemberExist, which is done using a failpoint. +// After fixing the bug, IsMemberExist is no longer called by mayRemoveMember. +func TestConcurrentRemoveMember(t *testing.T) { + integration.BeforeTest(t, integration.WithFailpoint("afterIsMemberExist", `sleep("1s")`)) + c := integration.NewCluster(t, &integration.ClusterConfig{Size: 1}) + defer c.Terminate(t) + + addResp, err := c.Members[0].Client.MemberAddAsLearner(context.Background(), []string{"http://localhost:123"}) + if err != nil { + t.Fatal(err) + } + removeID := addResp.Member.ID + done := make(chan struct{}) + go func() { + time.Sleep(time.Second / 2) + c.Members[0].Client.MemberRemove(context.Background(), removeID) + close(done) + }() + if _, err := c.Members[0].Client.MemberRemove(context.Background(), removeID); err != nil { + t.Fatal(err) + } + <-done +} + +func TestConcurrentMoveLeader(t *testing.T) { + integration.BeforeTest(t, integration.WithFailpoint("afterIsMemberExist", `sleep("1s")`)) + c := integration.NewCluster(t, &integration.ClusterConfig{Size: 1}) + defer c.Terminate(t) + + addResp, err := c.Members[0].Client.MemberAddAsLearner(context.Background(), []string{"http://localhost:123"}) + if err != nil { + t.Fatal(err) + } + removeID := addResp.Member.ID + done := make(chan struct{}) + go func() { + time.Sleep(time.Second / 2) + c.Members[0].Client.MoveLeader(context.Background(), removeID) + close(done) + }() + if _, err := c.Members[0].Client.MemberRemove(context.Background(), removeID); err != nil { + t.Fatal(err) + } + <-done +} diff --git a/tests/robustness/makefile.mk b/tests/robustness/makefile.mk index 8c40436cd2f..09d10108fb6 100644 --- a/tests/robustness/makefile.mk +++ b/tests/robustness/makefile.mk @@ -54,7 +54,7 @@ install-gofail: $(GOPATH)/bin/gofail .PHONY: gofail-enable gofail-enable: $(GOPATH)/bin/gofail - $(GOPATH)/bin/gofail enable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ + $(GOPATH)/bin/gofail enable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ server/etcdserver/api/membership/ cd ./server && go get go.etcd.io/gofail@${GOFAIL_VERSION} cd ./etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION} cd ./etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION} @@ -62,7 +62,7 @@ gofail-enable: $(GOPATH)/bin/gofail .PHONY: gofail-disable gofail-disable: $(GOPATH)/bin/gofail - $(GOPATH)/bin/gofail disable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ + $(GOPATH)/bin/gofail disable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ server/etcdserver/api/membership/ cd ./server && go mod tidy cd ./etcdutl && go mod tidy cd ./etcdctl && go mod tidy