Skip to content

Commit

Permalink
etcdserver: fix panic when checking IsLearner of removed member
Browse files Browse the repository at this point in the history
There was a concurrency bug when accessing the IsLearner
property of a member, which will panic with a nil pointer access error
if the member is removed between the IsMemberExist() and Member() calls.

Signed-off-by: Jan Schär <[email protected]>
  • Loading branch information
jscissr committed Sep 30, 2024
1 parent 5704c61 commit 3374e27
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 6 deletions.
4 changes: 3 additions & 1 deletion server/etcdserver/api/membership/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -817,8 +817,10 @@ func (c *RaftCluster) SetDowngradeInfo(d *serverversion.DowngradeInfo, shouldApp
// IsMemberExist returns if the member with the given id exists in cluster.
func (c *RaftCluster) IsMemberExist(id types.ID) bool {
c.Lock()
defer c.Unlock()
_, ok := c.members[id]
c.Unlock()

// gofail: var afterIsMemberExist struct{}
return ok
}

Expand Down
7 changes: 4 additions & 3 deletions server/etcdserver/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -1224,7 +1224,8 @@ func (s *EtcdServer) isLeader() bool {

// MoveLeader transfers the leader to the given transferee.
func (s *EtcdServer) MoveLeader(ctx context.Context, lead, transferee uint64) error {
if !s.cluster.IsMemberExist(types.ID(transferee)) || s.cluster.Member(types.ID(transferee)).IsLearner {
member := s.cluster.Member(types.ID(transferee))
if member == nil || member.IsLearner {
return errors.ErrBadLeaderTransferee
}

Expand Down Expand Up @@ -1593,9 +1594,9 @@ func (s *EtcdServer) mayRemoveMember(id types.ID) error {
}

lg := s.Logger()
isLearner := s.cluster.IsMemberExist(id) && s.cluster.Member(id).IsLearner
member := s.cluster.Member(id)
// no need to check quorum when removing non-voting member
if isLearner {
if member != nil && member.IsLearner {
return nil
}

Expand Down
48 changes: 48 additions & 0 deletions tests/integration/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -518,3 +518,51 @@ func TestSpeedyTerminate(t *testing.T) {
case <-donec:
}
}

// TestConcurrentRemoveMember demonstrated a panic in mayRemoveMember with
// concurrent calls to MemberRemove. To reliably reproduce the panic, a delay
// needed to be injected in IsMemberExist, which is done using a failpoint.
// After fixing the bug, IsMemberExist is no longer called by mayRemoveMember.
func TestConcurrentRemoveMember(t *testing.T) {
integration.BeforeTest(t, integration.WithFailpoint("afterIsMemberExist", `sleep("1s")`))
c := integration.NewCluster(t, &integration.ClusterConfig{Size: 1})
defer c.Terminate(t)

addResp, err := c.Members[0].Client.MemberAddAsLearner(context.Background(), []string{"http://localhost:123"})
if err != nil {
t.Fatal(err)
}
removeID := addResp.Member.ID
done := make(chan struct{})
go func() {
time.Sleep(time.Second / 2)
c.Members[0].Client.MemberRemove(context.Background(), removeID)
close(done)
}()
if _, err := c.Members[0].Client.MemberRemove(context.Background(), removeID); err != nil {
t.Fatal(err)
}
<-done
}

func TestConcurrentMoveLeader(t *testing.T) {
integration.BeforeTest(t, integration.WithFailpoint("afterIsMemberExist", `sleep("1s")`))
c := integration.NewCluster(t, &integration.ClusterConfig{Size: 1})
defer c.Terminate(t)

addResp, err := c.Members[0].Client.MemberAddAsLearner(context.Background(), []string{"http://localhost:123"})
if err != nil {
t.Fatal(err)
}
removeID := addResp.Member.ID
done := make(chan struct{})
go func() {
time.Sleep(time.Second / 2)
c.Members[0].Client.MoveLeader(context.Background(), removeID)
close(done)
}()
if _, err := c.Members[0].Client.MemberRemove(context.Background(), removeID); err != nil {
t.Fatal(err)
}
<-done
}
4 changes: 2 additions & 2 deletions tests/robustness/makefile.mk
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,15 @@ install-gofail: $(GOPATH)/bin/gofail

.PHONY: gofail-enable
gofail-enable: $(GOPATH)/bin/gofail
$(GOPATH)/bin/gofail enable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/
$(GOPATH)/bin/gofail enable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ server/etcdserver/api/membership/
cd ./server && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./tests && go get go.etcd.io/gofail@${GOFAIL_VERSION}

.PHONY: gofail-disable
gofail-disable: $(GOPATH)/bin/gofail
$(GOPATH)/bin/gofail disable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/
$(GOPATH)/bin/gofail disable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ server/etcdserver/api/membership/
cd ./server && go mod tidy
cd ./etcdutl && go mod tidy
cd ./etcdctl && go mod tidy
Expand Down

0 comments on commit 3374e27

Please sign in to comment.