Skip to content

Commit

Permalink
etcdserver: fix panic when checking IsLearner of removed member
Browse files Browse the repository at this point in the history
Previously, calling s.IsLearner() when the local node is no longer a
member panics. There was an attempt to fix this by first checking
IsMemberExist(), but this is not a correct fix because the member could
be removed between the two calls. Instead of panicking when the member
was removed, IsLearner() should return false. A node which is not a
member is also not a learner.

There was a similar concurrency bug when accessing the IsLearner
property of a member, which will panic with a nil pointer access error
if the member is removed between the IsMemberExist() and Member() calls.

Signed-off-by: Jan Schär <[email protected]>
  • Loading branch information
jscissr committed Sep 25, 2024
1 parent 5704c61 commit 605abca
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 12 deletions.
9 changes: 4 additions & 5 deletions server/etcdserver/api/membership/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -783,11 +783,7 @@ func (c *RaftCluster) IsLocalMemberLearner() bool {
defer c.Unlock()
localMember, ok := c.members[c.localID]
if !ok {
c.lg.Panic(
"failed to find local ID in cluster members",
zap.String("cluster-id", c.cid.String()),
zap.String("local-member-id", c.localID.String()),
)
return false
}
return localMember.IsLearner
}
Expand Down Expand Up @@ -816,6 +812,9 @@ func (c *RaftCluster) SetDowngradeInfo(d *serverversion.DowngradeInfo, shouldApp

// IsMemberExist returns if the member with the given id exists in cluster.
func (c *RaftCluster) IsMemberExist(id types.ID) bool {
// gofail: var sleepAfterIsMemberExist struct{}
// defer time.Sleep(time.Second)

c.Lock()
defer c.Unlock()
_, ok := c.members[id]
Expand Down
4 changes: 2 additions & 2 deletions server/etcdserver/api/v3rpc/interceptor.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func newUnaryInterceptor(s *etcdserver.EtcdServer) grpc.UnaryServerInterceptor {
return nil, rpctypes.ErrGRPCNotCapable
}

if s.IsMemberExist(s.MemberID()) && s.IsLearner() && !isRPCSupportedForLearner(req) {
if s.IsLearner() && !isRPCSupportedForLearner(req) {
return nil, rpctypes.ErrGRPCNotSupportedForLearner
}

Expand Down Expand Up @@ -218,7 +218,7 @@ func newStreamInterceptor(s *etcdserver.EtcdServer) grpc.StreamServerInterceptor
return rpctypes.ErrGRPCNotCapable
}

if s.IsMemberExist(s.MemberID()) && s.IsLearner() && info.FullMethod != snapshotMethod { // learner does not support stream RPC except Snapshot
if s.IsLearner() && info.FullMethod != snapshotMethod { // learner does not support stream RPC except Snapshot
return rpctypes.ErrGRPCNotSupportedForLearner
}

Expand Down
7 changes: 4 additions & 3 deletions server/etcdserver/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -1224,7 +1224,8 @@ func (s *EtcdServer) isLeader() bool {

// MoveLeader transfers the leader to the given transferee.
func (s *EtcdServer) MoveLeader(ctx context.Context, lead, transferee uint64) error {
if !s.cluster.IsMemberExist(types.ID(transferee)) || s.cluster.Member(types.ID(transferee)).IsLearner {
member := s.cluster.Member(types.ID(transferee))
if member == nil || member.IsLearner {
return errors.ErrBadLeaderTransferee
}

Expand Down Expand Up @@ -1593,9 +1594,9 @@ func (s *EtcdServer) mayRemoveMember(id types.ID) error {
}

lg := s.Logger()
isLearner := s.cluster.IsMemberExist(id) && s.cluster.Member(id).IsLearner
member := s.cluster.Member(id)
// no need to check quorum when removing non-voting member
if isLearner {
if member != nil && member.IsLearner {
return nil
}

Expand Down
48 changes: 48 additions & 0 deletions tests/integration/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -518,3 +518,51 @@ func TestSpeedyTerminate(t *testing.T) {
case <-donec:
}
}

// TestConcurrentRemoveMember demonstrated a panic in mayRemoveMember with
// concurrent calls to MemberRemove. To reliably reproduce the panic, a delay
// needed to be injected in IsMemberExist, which is done using a failpoint.
// After fixing the bug, IsMemberExist is no longer called by mayRemoveMember.
func TestConcurrentRemoveMember(t *testing.T) {
integration.BeforeTest(t, integration.WithFailpoint("sleepAfterIsMemberExist", `return`))
c := integration.NewCluster(t, &integration.ClusterConfig{Size: 1})
defer c.Terminate(t)

addResp, err := c.Members[0].Client.MemberAddAsLearner(context.Background(), []string{"http://localhost:123"})
if err != nil {
t.Fatal(err)
}
removeID := addResp.Member.ID
done := make(chan struct{})
go func() {
time.Sleep(time.Second / 2)
c.Members[0].Client.MemberRemove(context.Background(), removeID)
close(done)
}()
if _, err := c.Members[0].Client.MemberRemove(context.Background(), removeID); err != nil {
t.Fatal(err)
}
<-done
}

func TestConcurrentMoveLeader(t *testing.T) {
integration.BeforeTest(t, integration.WithFailpoint("sleepAfterIsMemberExist", `return`))
c := integration.NewCluster(t, &integration.ClusterConfig{Size: 1})
defer c.Terminate(t)

addResp, err := c.Members[0].Client.MemberAddAsLearner(context.Background(), []string{"http://localhost:123"})
if err != nil {
t.Fatal(err)
}
removeID := addResp.Member.ID
done := make(chan struct{})
go func() {
time.Sleep(time.Second / 2)
c.Members[0].Client.MoveLeader(context.Background(), removeID)
close(done)
}()
if _, err := c.Members[0].Client.MemberRemove(context.Background(), removeID); err != nil {
t.Fatal(err)
}
<-done
}
4 changes: 2 additions & 2 deletions tests/robustness/makefile.mk
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,15 @@ install-gofail: $(GOPATH)/bin/gofail

.PHONY: gofail-enable
gofail-enable: $(GOPATH)/bin/gofail
$(GOPATH)/bin/gofail enable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/
$(GOPATH)/bin/gofail enable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ server/etcdserver/api/membership/
cd ./server && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./tests && go get go.etcd.io/gofail@${GOFAIL_VERSION}

.PHONY: gofail-disable
gofail-disable: $(GOPATH)/bin/gofail
$(GOPATH)/bin/gofail disable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/
$(GOPATH)/bin/gofail disable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ server/etcdserver/api/membership/
cd ./server && go mod tidy
cd ./etcdutl && go mod tidy
cd ./etcdctl && go mod tidy
Expand Down

0 comments on commit 605abca

Please sign in to comment.