Revert "etcd server shouldn't wait for the ready notification infinitely on startup" #16677

Merged 1 commit on Oct 4, 2023

1 change: 0 additions & 1 deletion CHANGELOG/CHANGELOG-3.6.md
@@ -65,7 +65,6 @@ See [code changes](https://github.com/etcd-io/etcd/compare/v3.5.0...v3.6.0).
- Add [`etcd --experimental-max-learners`](https://github.com/etcd-io/etcd/pull/13377) flag to allow configuration of learner max membership.
- Add [`etcd --experimental-enable-lease-checkpoint-persist`](https://github.com/etcd-io/etcd/pull/13508) flag to handle upgrade from v3.5.2 clusters with this feature enabled.
- Add [`etcdctl make-mirror --rev`](https://github.com/etcd-io/etcd/pull/13519) flag to support incremental mirror.
-- Add [`etcd --experimental-wait-cluster-ready-timeout`](https://github.com/etcd-io/etcd/pull/13525) flag to wait for cluster to be ready before serving client requests.
- Add [v3 discovery](https://github.com/etcd-io/etcd/pull/13635) to bootstrap a new etcd cluster.
- Add [field `storage`](https://github.com/etcd-io/etcd/pull/13772) into the response body of endpoint `/version`.
- Add [`etcd --max-concurrent-streams`](https://github.com/etcd-io/etcd/pull/14169) flag to configure the max concurrent streams each client can open at a time, and defaults to math.MaxUint32.
4 changes: 0 additions & 4 deletions server/config/config.go
@@ -83,10 +83,6 @@ type ServerConfig struct {
TickMs uint
ElectionTicks int

-// WaitClusterReadyTimeout is the maximum time to wait for the
-// cluster to be ready on startup before serving client requests.
-WaitClusterReadyTimeout time.Duration
-
// InitialElectionTickAdvance is true, then local member fast-forwards
// election ticks to speed up "initial" leader election trigger. This
// benefits the case of larger election ticks. For instance, cross
14 changes: 5 additions & 9 deletions server/embed/config.go
@@ -68,7 +68,6 @@ const (
DefaultGRPCKeepAliveInterval = 2 * time.Hour
DefaultGRPCKeepAliveTimeout = 20 * time.Second
DefaultDowngradeCheckTime = 5 * time.Second
-DefaultWaitClusterReadyTimeout = 5 * time.Second
DefaultAutoCompactionMode = "periodic"

DefaultDiscoveryDialTimeout = 2 * time.Second
@@ -242,10 +241,9 @@ type Config struct {
Durl string `json:"discovery"`
DiscoveryCfg v3discovery.DiscoveryConfig `json:"discovery-config"`

-InitialCluster string `json:"initial-cluster"`
-InitialClusterToken string `json:"initial-cluster-token"`
-StrictReconfigCheck bool `json:"strict-reconfig-check"`
-ExperimentalWaitClusterReadyTimeout time.Duration `json:"wait-cluster-ready-timeout"`
+InitialCluster string `json:"initial-cluster"`
+InitialClusterToken string `json:"initial-cluster-token"`
+StrictReconfigCheck bool `json:"strict-reconfig-check"`

// AutoCompactionMode is either 'periodic' or 'revision'.
AutoCompactionMode string `json:"auto-compaction-mode"`
@@ -502,9 +500,8 @@ func NewConfig() *Config {
AdvertisePeerUrls: []url.URL{*apurl},
AdvertiseClientUrls: []url.URL{*acurl},

-ClusterState: ClusterStateFlagNew,
-InitialClusterToken: "etcd-cluster",
-ExperimentalWaitClusterReadyTimeout: DefaultWaitClusterReadyTimeout,
+ClusterState: ClusterStateFlagNew,
+InitialClusterToken: "etcd-cluster",

StrictReconfigCheck: DefaultStrictReconfigCheck,
Metrics: "basic",
@@ -728,7 +725,6 @@ func (cfg *Config) AddFlags(fs *flag.FlagSet) {
fs.BoolVar(&cfg.ExperimentalTxnModeWriteWithSharedBuffer, "experimental-txn-mode-write-with-shared-buffer", true, "Enable the write transaction to use a shared buffer in its readonly check operations.")
fs.UintVar(&cfg.ExperimentalBootstrapDefragThresholdMegabytes, "experimental-bootstrap-defrag-threshold-megabytes", 0, "Enable the defrag during etcd server bootstrap on condition that it will free at least the provided threshold of disk space. Needs to be set to non-zero value to take effect.")
fs.IntVar(&cfg.ExperimentalMaxLearners, "experimental-max-learners", membership.DefaultMaxLearners, "Sets the maximum number of learners that can be available in the cluster membership.")
-fs.DurationVar(&cfg.ExperimentalWaitClusterReadyTimeout, "experimental-wait-cluster-ready-timeout", cfg.ExperimentalWaitClusterReadyTimeout, "Maximum duration to wait for the cluster to be ready.")
fs.Uint64Var(&cfg.SnapshotCatchUpEntries, "experimental-snapshot-catchup-entries", cfg.SnapshotCatchUpEntries, "Number of entries for a slow follower to catch up after compacting the raft storage entries.")

// unsafe
2 changes: 0 additions & 2 deletions server/embed/etcd.go
@@ -185,7 +185,6 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) {
PeerTLSInfo: cfg.PeerTLSInfo,
TickMs: cfg.TickMs,
ElectionTicks: cfg.ElectionTicks(),
-WaitClusterReadyTimeout: cfg.ExperimentalWaitClusterReadyTimeout,
InitialElectionTickAdvance: cfg.InitialElectionTickAdvance,
AutoCompactionRetention: autoCompactionRetention,
AutoCompactionMode: cfg.AutoCompactionMode,
@@ -328,7 +327,6 @@ func print(lg *zap.Logger, ec Config, sc config.ServerConfig, memberInitialized
zap.Bool("force-new-cluster", sc.ForceNewCluster),
zap.String("heartbeat-interval", fmt.Sprintf("%v", time.Duration(sc.TickMs)*time.Millisecond)),
zap.String("election-timeout", fmt.Sprintf("%v", time.Duration(sc.ElectionTicks*int(sc.TickMs))*time.Millisecond)),
zap.String("wait-cluster-ready-timeout", sc.WaitClusterReadyTimeout.String()),
zap.Bool("initial-election-tick-advance", sc.InitialElectionTickAdvance),
zap.Uint64("snapshot-count", sc.SnapshotCount),
zap.Uint("max-wals", sc.MaxWALFiles),
11 changes: 1 addition & 10 deletions server/embed/serve.go
@@ -22,7 +22,6 @@ import (
"net"
"net/http"
"strings"
"time"

gw "github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
"github.com/soheilhy/cmux"
@@ -100,15 +99,7 @@ func (sctx *serveCtx) serve(
splitHttp bool,
gopts ...grpc.ServerOption) (err error) {
logger := defaultLog.New(io.Discard, "etcdhttp", 0)

-// When the quorum isn't satisfied, then etcd server will be blocked
-// on <-s.ReadyNotify(). Set a timeout here so that the etcd server
-// can continue to serve serializable read request.
-select {
-case <-time.After(s.Cfg.WaitClusterReadyTimeout):
-sctx.lg.Warn("timed out waiting for the ready notification")
-case <-s.ReadyNotify():
-}
+<-s.ReadyNotify()

sctx.lg.Info("ready to serve client requests")

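The serve.go hunk above is the heart of the revert: the timeout-bounded wait, which let the server fall through and serve serializable reads without quorum, becomes an unconditional block on ReadyNotify(). Below is a minimal runnable sketch of the two strategies; `ready` and the 5-second timeout are illustrative stand-ins for etcdserver's ReadyNotify() channel and the removed DefaultWaitClusterReadyTimeout, not etcd's actual wiring.

```go
package main

import (
	"fmt"
	"time"
)

// waitWithTimeout models the pre-revert behavior: give up after the
// timeout so the server can still answer serializable (local) reads
// even when quorum never forms.
func waitWithTimeout(ready <-chan struct{}, timeout time.Duration) {
	select {
	case <-time.After(timeout):
		fmt.Println("timed out waiting for the ready notification")
	case <-ready:
		fmt.Println("ready")
	}
}

// waitBlocking models the post-revert behavior: block until the
// cluster reports ready, however long that takes.
func waitBlocking(ready <-chan struct{}) {
	<-ready
	fmt.Println("ready")
}

func main() {
	ready := make(chan struct{})
	go func() {
		time.Sleep(100 * time.Millisecond) // cluster becomes ready
		close(ready)
	}()
	waitWithTimeout(ready, 5*time.Second)
	waitBlocking(ready) // a closed channel unblocks immediately
}
```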
3 changes: 0 additions & 3 deletions server/etcdmain/etcd.go
@@ -19,7 +19,6 @@ import (
"os"
"runtime"
"strings"
"time"

"go.uber.org/zap"
"google.golang.org/grpc"
@@ -211,8 +210,6 @@ func startEtcd(cfg *embed.Config) (<-chan struct{}, <-chan error, error) {
select {
case <-e.Server.ReadyNotify(): // wait for e.Server to join the cluster
case <-e.Server.StopNotify(): // publish aborted from 'ErrStopped'
-case <-time.After(cfg.ExperimentalWaitClusterReadyTimeout):
-e.GetLogger().Warn("startEtcd: timed out waiting for the ready notification")
}
return e.Server.StopNotify(), e.Err(), nil
}
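Unlike serve.go's plain receive, startEtcd keeps a two-way select after the revert: the timeout escape hatch is gone, but the wait must still observe StopNotify so an aborted publish (ErrStopped) does not hang startup. A small sketch of racing two lifecycle channels; `ready` and `stopped` are illustrative stand-ins for e.Server.ReadyNotify() and e.Server.StopNotify().

```go
package main

import (
	"fmt"
	"time"
)

// awaitStartup returns on whichever lifecycle event happens first.
func awaitStartup(ready, stopped <-chan struct{}) {
	select {
	case <-ready:
		fmt.Println("server joined the cluster")
	case <-stopped:
		fmt.Println("startup aborted; server stopped")
	}
}

func main() {
	ready := make(chan struct{})
	stopped := make(chan struct{})
	go func() {
		time.Sleep(50 * time.Millisecond)
		close(stopped) // model a publish aborted with ErrStopped
	}()
	awaitStartup(ready, stopped)
}
```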
2 changes: 0 additions & 2 deletions server/etcdmain/help.go
@@ -293,8 +293,6 @@ Experimental feature:
Set time duration after which a warning is generated if a unary request takes more than this duration. It's deprecated, and will be decommissioned in v3.7. Use --warning-unary-request-duration instead.
--experimental-max-learners '1'
Set the max number of learner members allowed in the cluster membership.
---experimental-wait-cluster-ready-timeout '5s'
-Set the maximum time duration to wait for the cluster to be ready.
--experimental-snapshot-catch-up-entries '5000'
Number of entries for a slow follower to catch up after compacting the raft storage entries.
--experimental-compaction-sleep-interval
23 changes: 17 additions & 6 deletions tests/e2e/corrupt_test.go
@@ -12,11 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.

+//go:build !cluster_proxy
+
package e2e

import (
"context"
"fmt"
"sync"
"testing"
"time"

@@ -155,22 +158,30 @@ func TestInPlaceRecovery(t *testing.T) {
assert.NoError(t, err)

// Rolling recovery of the servers.
+wg := sync.WaitGroup{}
t.Log("rolling updating servers in place...")
-for i, newProc := range epcNew.Procs {
+for i := range epcNew.Procs {
oldProc := epcOld.Procs[i]
err = oldProc.Close()
if err != nil {
t.Fatalf("could not stop etcd process (%v)", err)
}
t.Logf("old cluster server %d: %s stopped.", i, oldProc.Config().Name)
-err = newProc.Start(ctx)
-if err != nil {
-t.Fatalf("could not start etcd process (%v)", err)
-}
-t.Logf("new cluster server %d: %s started in-place with blank db.", i, newProc.Config().Name)
+wg.Add(1)
+// Start servers in background to avoid blocking on server start.
+// EtcdProcess.Start waits until etcd becomes healthy, which will not happen here until we restart at least 2 members.
+go func(proc e2e.EtcdProcess) {
+defer wg.Done()
+err = proc.Start(ctx)
+if err != nil {
+t.Errorf("could not start etcd process (%v)", err)
+}
+t.Logf("new cluster server: %s started in-place with blank db.", proc.Config().Name)
+}(epcNew.Procs[i])
t.Log("sleeping 5 sec to let nodes do periodical check...")
time.Sleep(5 * time.Second)
}
+wg.Wait()
t.Log("new cluster started.")

alarmResponse, err := newCc.AlarmList(ctx)
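The rewritten test starts the replacement members concurrently because, with the timeout gone, EtcdProcess.Start blocks until the member reports healthy, and no member can become healthy until a quorum of the replaced cluster is up; a sequential loop would deadlock on the first Start. A hedged sketch of the same start-in-background pattern follows; `proc` and `quorum` are illustrative stand-ins, not the e2e framework's types.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

type proc struct{ name string }

// Start blocks until the cluster has quorum, like EtcdProcess.Start
// does once the startup timeout is gone.
func (p proc) Start(quorum <-chan struct{}) error {
	<-quorum
	return nil
}

func main() {
	quorum := make(chan struct{})
	procs := []proc{{"m1"}, {"m2"}, {"m3"}}

	var wg sync.WaitGroup
	for _, p := range procs {
		wg.Add(1)
		// Each Start runs in a goroutine: a sequential loop would
		// deadlock, since the first Start would wait forever for
		// peers that are never launched.
		go func(p proc) {
			defer wg.Done()
			if err := p.Start(quorum); err != nil {
				fmt.Printf("could not start %s: %v\n", p.name, err)
				return
			}
			fmt.Printf("%s started\n", p.name)
		}(p)
	}

	// Model quorum forming once a majority of members is launching.
	time.Sleep(100 * time.Millisecond)
	close(quorum)
	wg.Wait()
}
```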
1 change: 0 additions & 1 deletion tests/e2e/ctl_v3_member_test.go
@@ -69,7 +69,6 @@ func TestCtlV3ConsistentMemberList(t *testing.T) {

epc, err := e2e.NewEtcdProcessCluster(ctx, t,
e2e.WithClusterSize(1),
-e2e.WithWaitClusterReadyTimeout(1*time.Nanosecond),
e2e.WithEnvVars(map[string]string{"GOFAIL_FAILPOINTS": `beforeApplyOneConfChange=sleep("2s")`}),
)
require.NoError(t, err, "failed to start etcd cluster: %v", err)
45 changes: 0 additions & 45 deletions tests/e2e/no_quorum_ready_test.go

This file was deleted.

7 changes: 0 additions & 7 deletions tests/framework/e2e/cluster.go
@@ -354,10 +354,6 @@ func WithWatchProcessNotifyInterval(interval time.Duration) EPClusterOption {
return func(c *EtcdProcessClusterConfig) { c.ServerConfig.ExperimentalWatchProgressNotifyInterval = interval }
}

-func WithWaitClusterReadyTimeout(readyTimeout time.Duration) EPClusterOption {
-return func(c *EtcdProcessClusterConfig) { c.ServerConfig.ExperimentalWaitClusterReadyTimeout = readyTimeout }
-}
-
func WithEnvVars(ev map[string]string) EPClusterOption {
return func(c *EtcdProcessClusterConfig) { c.EnvVars = ev }
}
@@ -601,9 +597,6 @@ func (cfg *EtcdProcessClusterConfig) EtcdServerProcessConfig(tb testing.TB, i in
if cfg.ServerConfig.ExperimentalWatchProgressNotifyInterval != 0 {
args = append(args, "--experimental-watch-progress-notify-interval", cfg.ServerConfig.ExperimentalWatchProgressNotifyInterval.String())
}
-if cfg.ServerConfig.ExperimentalWaitClusterReadyTimeout != 0 {
-args = append(args, "--experimental-wait-cluster-ready-timeout", cfg.ServerConfig.ExperimentalWaitClusterReadyTimeout.String())
-}
if cfg.ServerConfig.SnapshotCatchUpEntries != etcdserver.DefaultSnapshotCatchUpEntries {
if cfg.Version == CurrentVersion || (cfg.Version == MinorityLastVersion && i <= cfg.ClusterSize/2) || (cfg.Version == QuorumLastVersion && i > cfg.ClusterSize/2) {
args = append(args, "--experimental-snapshot-catchup-entries", fmt.Sprintf("%d", cfg.ServerConfig.SnapshotCatchUpEntries))
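WithWaitClusterReadyTimeout followed the test framework's functional-option pattern (an EPClusterOption is a function that mutates the EtcdProcessClusterConfig), so the revert drops both the option constructor and the branch that translated it into a CLI flag. A minimal sketch of that pattern under illustrative names; `clusterConfig` and `clusterOption` stand in for the framework's actual types.

```go
package main

import (
	"fmt"
	"time"
)

// clusterConfig stands in for EtcdProcessClusterConfig.
type clusterConfig struct {
	clusterSize            int
	watchProgressNotifyIvl time.Duration
}

// clusterOption mutates the config; options compose left to right.
type clusterOption func(*clusterConfig)

func withClusterSize(n int) clusterOption {
	return func(c *clusterConfig) { c.clusterSize = n }
}

func withWatchProgressNotifyInterval(d time.Duration) clusterOption {
	return func(c *clusterConfig) { c.watchProgressNotifyIvl = d }
}

func newClusterConfig(opts ...clusterOption) *clusterConfig {
	c := &clusterConfig{clusterSize: 3} // defaults first, then overrides
	for _, opt := range opts {
		opt(c)
	}
	return c
}

func main() {
	c := newClusterConfig(
		withClusterSize(1),
		withWatchProgressNotifyInterval(5*time.Second),
	)
	fmt.Printf("%+v\n", *c)
}
```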